diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index dea212db9df3..7710d4022b19 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -244,7 +244,7 @@ Description: What: /sys/block//queue/zoned Date: September 2016 -Contact: Damien Le Moal +Contact: Damien Le Moal Description: zoned indicates if the device is a zoned block device and the zone model of the device if it is indeed zoned. @@ -259,6 +259,14 @@ Description: zone commands, they will be treated as regular block devices and zoned will report "none". +What: /sys/block//queue/nr_zones +Date: November 2018 +Contact: Damien Le Moal +Description: + nr_zones indicates the total number of zones of a zoned block + device ("host-aware" or "host-managed" zone model). For regular + block devices, the value is always 0. + What: /sys/block//queue/chunk_sectors Date: September 2016 Contact: Hannes Reinecke @@ -268,6 +276,6 @@ Description: indicates the size in 512B sectors of the RAID volume stripe segment. For a zoned block device, either host-aware or host-managed, chunk_sectors indicates the - size of 512B sectors of the zones of the device, with + size in 512B sectors of the zones of the device, with the eventual exception of the last zone of the device which may be smaller. diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 476722b7b636..baf19bf28385 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1879,8 +1879,10 @@ following two functions. wbc_init_bio(@wbc, @bio) Should be called for each bio carrying writeback data and - associates the bio with the inode's owner cgroup. Can be - called anytime between bio allocation and submission. + associates the bio with the inode's owner cgroup and the + corresponding request queue. This must be called after + a queue (device) has been associated with the bio and + before submission. wbc_account_io(@wbc, @page, @bytes) Should be called for each data segment being written out. @@ -1899,7 +1901,7 @@ the configuration, the bio may be executed at a lower priority and if the writeback session is holding shared resources, e.g. a journal entry, may lead to priority inversion. There is no one easy solution for the problem. Filesystems can try to work around specific problem -cases by skipping wbc_init_bio() or using bio_associate_blkcg() +cases by skipping wbc_init_bio() and using bio_associate_blkg() directly. diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 207eca58efaa..ac18b488cb5e 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -65,7 +65,6 @@ Description of Contents: 3.2.3 I/O completion 3.2.4 Implications for drivers that do not interpret bios (don't handle multiple segments) - 3.2.5 Request command tagging 3.3 I/O submission 4. The I/O scheduler 5. Scalability related changes @@ -708,93 +707,6 @@ is crossed on completion of a transfer. (The end*request* functions should be used if only if the request has come down from block/bio path, not for direct access requests which only specify rq->buffer without a valid rq->bio) -3.2.5 Generic request command tagging - -3.2.5.1 Tag helpers - -Block now offers some simple generic functionality to help support command -queueing (typically known as tagged command queueing), ie manage more than -one outstanding command on a queue at any given time. - - blk_queue_init_tags(struct request_queue *q, int depth) - - Initialize internal command tagging structures for a maximum - depth of 'depth'. - - blk_queue_free_tags((struct request_queue *q) - - Teardown tag info associated with the queue. This will be done - automatically by block if blk_queue_cleanup() is called on a queue - that is using tagging. - -The above are initialization and exit management, the main helpers during -normal operations are: - - blk_queue_start_tag(struct request_queue *q, struct request *rq) - - Start tagged operation for this request. A free tag number between - 0 and 'depth' is assigned to the request (rq->tag holds this number), - and 'rq' is added to the internal tag management. If the maximum depth - for this queue is already achieved (or if the tag wasn't started for - some other reason), 1 is returned. Otherwise 0 is returned. - - blk_queue_end_tag(struct request_queue *q, struct request *rq) - - End tagged operation on this request. 'rq' is removed from the internal - book keeping structures. - -To minimize struct request and queue overhead, the tag helpers utilize some -of the same request members that are used for normal request queue management. -This means that a request cannot both be an active tag and be on the queue -list at the same time. blk_queue_start_tag() will remove the request, but -the driver must remember to call blk_queue_end_tag() before signalling -completion of the request to the block layer. This means ending tag -operations before calling end_that_request_last()! For an example of a user -of these helpers, see the IDE tagged command queueing support. - -3.2.5.2 Tag info - -Some block functions exist to query current tag status or to go from a -tag number to the associated request. These are, in no particular order: - - blk_queue_tagged(q) - - Returns 1 if the queue 'q' is using tagging, 0 if not. - - blk_queue_tag_request(q, tag) - - Returns a pointer to the request associated with tag 'tag'. - - blk_queue_tag_depth(q) - - Return current queue depth. - - blk_queue_tag_queue(q) - - Returns 1 if the queue can accept a new queued command, 0 if we are - at the maximum depth already. - - blk_queue_rq_tagged(rq) - - Returns 1 if the request 'rq' is tagged. - -3.2.5.2 Internal structure - -Internally, block manages tags in the blk_queue_tag structure: - - struct blk_queue_tag { - struct request **tag_index; /* array or pointers to rq */ - unsigned long *tag_map; /* bitmap of free tags */ - struct list_head busy_list; /* fifo list of busy tags */ - int busy; /* queue depth */ - int max_depth; /* max queue depth */ - }; - -Most of the above is simple and straight forward, however busy_list may need -a bit of explaining. Normally we don't care too much about request ordering, -but in the event of any barrier requests in the tag queue we need to ensure -that requests are restarted in the order they were queue. - 3.3 I/O Submission The routine submit_bio() is used to submit a single io. Higher level i/o diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt deleted file mode 100644 index 895bd3813115..000000000000 --- a/Documentation/block/cfq-iosched.txt +++ /dev/null @@ -1,291 +0,0 @@ -CFQ (Complete Fairness Queueing) -=============================== - -The main aim of CFQ scheduler is to provide a fair allocation of the disk -I/O bandwidth for all the processes which requests an I/O operation. - -CFQ maintains the per process queue for the processes which request I/O -operation(synchronous requests). In case of asynchronous requests, all the -requests from all the processes are batched together according to their -process's I/O priority. - -CFQ ioscheduler tunables -======================== - -slice_idle ----------- -This specifies how long CFQ should idle for next request on certain cfq queues -(for sequential workloads) and service trees (for random workloads) before -queue is expired and CFQ selects next queue to dispatch from. - -By default slice_idle is a non-zero value. That means by default we idle on -queues/service trees. This can be very helpful on highly seeky media like -single spindle SATA/SAS disks where we can cut down on overall number of -seeks and see improved throughput. - -Setting slice_idle to 0 will remove all the idling on queues/service tree -level and one should see an overall improved throughput on faster storage -devices like multiple SATA/SAS disks in hardware RAID configuration. The down -side is that isolation provided from WRITES also goes down and notion of -IO priority becomes weaker. - -So depending on storage and workload, it might be useful to set slice_idle=0. -In general I think for SATA/SAS disks and software RAID of SATA/SAS disks -keeping slice_idle enabled should be useful. For any configurations where -there are multiple spindles behind single LUN (Host based hardware RAID -controller or for storage arrays), setting slice_idle=0 might end up in better -throughput and acceptable latencies. - -back_seek_max -------------- -This specifies, given in Kbytes, the maximum "distance" for backward seeking. -The distance is the amount of space from the current head location to the -sectors that are backward in terms of distance. - -This parameter allows the scheduler to anticipate requests in the "backward" -direction and consider them as being the "next" if they are within this -distance from the current head location. - -back_seek_penalty ------------------ -This parameter is used to compute the cost of backward seeking. If the -backward distance of request is just 1/back_seek_penalty from a "front" -request, then the seeking cost of two requests is considered equivalent. - -So scheduler will not bias toward one or the other request (otherwise scheduler -will bias toward front request). Default value of back_seek_penalty is 2. - -fifo_expire_async ------------------ -This parameter is used to set the timeout of asynchronous requests. Default -value of this is 248ms. - -fifo_expire_sync ----------------- -This parameter is used to set the timeout of synchronous requests. Default -value of this is 124ms. In case to favor synchronous requests over asynchronous -one, this value should be decreased relative to fifo_expire_async. - -group_idle ------------ -This parameter forces idling at the CFQ group level instead of CFQ -queue level. This was introduced after a bottleneck was observed -in higher end storage due to idle on sequential queue and allow dispatch -from a single queue. The idea with this parameter is that it can be run with -slice_idle=0 and group_idle=8, so that idling does not happen on individual -queues in the group but happens overall on the group and thus still keeps the -IO controller working. -Not idling on individual queues in the group will dispatch requests from -multiple queues in the group at the same time and achieve higher throughput -on higher end storage. - -Default value for this parameter is 8ms. - -low_latency ------------ -This parameter is used to enable/disable the low latency mode of the CFQ -scheduler. If enabled, CFQ tries to recompute the slice time for each process -based on the target_latency set for the system. This favors fairness over -throughput. Disabling low latency (setting it to 0) ignores target latency, -allowing each process in the system to get a full time slice. - -By default low latency mode is enabled. - -target_latency --------------- -This parameter is used to calculate the time slice for a process if cfq's -latency mode is enabled. It will ensure that sync requests have an estimated -latency. But if sequential workload is higher(e.g. sequential read), -then to meet the latency constraints, throughput may decrease because of less -time for each process to issue I/O request before the cfq queue is switched. - -Though this can be overcome by disabling the latency_mode, it may increase -the read latency for some applications. This parameter allows for changing -target_latency through the sysfs interface which can provide the balanced -throughput and read latency. - -Default value for target_latency is 300ms. - -slice_async ------------ -This parameter is same as of slice_sync but for asynchronous queue. The -default value is 40ms. - -slice_async_rq --------------- -This parameter is used to limit the dispatching of asynchronous request to -device request queue in queue's slice time. The maximum number of request that -are allowed to be dispatched also depends upon the io priority. Default value -for this is 2. - -slice_sync ----------- -When a queue is selected for execution, the queues IO requests are only -executed for a certain amount of time(time_slice) before switching to another -queue. This parameter is used to calculate the time slice of synchronous -queue. - -time_slice is computed using the below equation:- -time_slice = slice_sync + (slice_sync/5 * (4 - prio)). To increase the -time_slice of synchronous queue, increase the value of slice_sync. Default -value is 100ms. - -quantum -------- -This specifies the number of request dispatched to the device queue. In a -queue's time slice, a request will not be dispatched if the number of request -in the device exceeds this parameter. This parameter is used for synchronous -request. - -In case of storage with several disk, this setting can limit the parallel -processing of request. Therefore, increasing the value can improve the -performance although this can cause the latency of some I/O to increase due -to more number of requests. - -CFQ Group scheduling -==================== - -CFQ supports blkio cgroup and has "blkio." prefixed files in each -blkio cgroup directory. It is weight-based and there are four knobs -for configuration - weight[_device] and leaf_weight[_device]. -Internal cgroup nodes (the ones with children) can also have tasks in -them, so the former two configure how much proportion the cgroup as a -whole is entitled to at its parent's level while the latter two -configure how much proportion the tasks in the cgroup have compared to -its direct children. - -Another way to think about it is assuming that each internal node has -an implicit leaf child node which hosts all the tasks whose weight is -configured by leaf_weight[_device]. Let's assume a blkio hierarchy -composed of five cgroups - root, A, B, AA and AB - with the following -weights where the names represent the hierarchy. - - weight leaf_weight - root : 125 125 - A : 500 750 - B : 250 500 - AA : 500 500 - AB : 1000 500 - -root never has a parent making its weight is meaningless. For backward -compatibility, weight is always kept in sync with leaf_weight. B, AA -and AB have no child and thus its tasks have no children cgroup to -compete with. They always get 100% of what the cgroup won at the -parent level. Considering only the weights which matter, the hierarchy -looks like the following. - - root - / | \ - A B leaf - 500 250 125 - / | \ - AA AB leaf - 500 1000 750 - -If all cgroups have active IOs and competing with each other, disk -time will be distributed like the following. - -Distribution below root. The total active weight at this level is -A:500 + B:250 + C:125 = 875. - - root-leaf : 125 / 875 =~ 14% - A : 500 / 875 =~ 57% - B(-leaf) : 250 / 875 =~ 28% - -A has children and further distributes its 57% among the children and -the implicit leaf node. The total active weight at this level is -AA:500 + AB:1000 + A-leaf:750 = 2250. - - A-leaf : ( 750 / 2250) * A =~ 19% - AA(-leaf) : ( 500 / 2250) * A =~ 12% - AB(-leaf) : (1000 / 2250) * A =~ 25% - -CFQ IOPS Mode for group scheduling -=================================== -Basic CFQ design is to provide priority based time slices. Higher priority -process gets bigger time slice and lower priority process gets smaller time -slice. Measuring time becomes harder if storage is fast and supports NCQ and -it would be better to dispatch multiple requests from multiple cfq queues in -request queue at a time. In such scenario, it is not possible to measure time -consumed by single queue accurately. - -What is possible though is to measure number of requests dispatched from a -single queue and also allow dispatch from multiple cfq queue at the same time. -This effectively becomes the fairness in terms of IOPS (IO operations per -second). - -If one sets slice_idle=0 and if storage supports NCQ, CFQ internally switches -to IOPS mode and starts providing fairness in terms of number of requests -dispatched. Note that this mode switching takes effect only for group -scheduling. For non-cgroup users nothing should change. - -CFQ IO scheduler Idling Theory -=============================== -Idling on a queue is primarily about waiting for the next request to come -on same queue after completion of a request. In this process CFQ will not -dispatch requests from other cfq queues even if requests are pending there. - -The rationale behind idling is that it can cut down on number of seeks -on rotational media. For example, if a process is doing dependent -sequential reads (next read will come on only after completion of previous -one), then not dispatching request from other queue should help as we -did not move the disk head and kept on dispatching sequential IO from -one queue. - -CFQ has following service trees and various queues are put on these trees. - - sync-idle sync-noidle async - -All cfq queues doing synchronous sequential IO go on to sync-idle tree. -On this tree we idle on each queue individually. - -All synchronous non-sequential queues go on sync-noidle tree. Also any -synchronous write request which is not marked with REQ_IDLE goes on this -service tree. On this tree we do not idle on individual queues instead idle -on the whole group of queues or the tree. So if there are 4 queues waiting -for IO to dispatch we will idle only once last queue has dispatched the IO -and there is no more IO on this service tree. - -All async writes go on async service tree. There is no idling on async -queues. - -CFQ has some optimizations for SSDs and if it detects a non-rotational -media which can support higher queue depth (multiple requests at in -flight at a time), then it cuts down on idling of individual queues and -all the queues move to sync-noidle tree and only tree idle remains. This -tree idling provides isolation with buffered write queues on async tree. - -FAQ -=== -Q1. Why to idle at all on queues not marked with REQ_IDLE. - -A1. We only do tree idle (all queues on sync-noidle tree) on queues not marked - with REQ_IDLE. This helps in providing isolation with all the sync-idle - queues. Otherwise in presence of many sequential readers, other - synchronous IO might not get fair share of disk. - - For example, if there are 10 sequential readers doing IO and they get - 100ms each. If a !REQ_IDLE request comes in, it will be scheduled - roughly after 1 second. If after completion of !REQ_IDLE request we - do not idle, and after a couple of milli seconds a another !REQ_IDLE - request comes in, again it will be scheduled after 1second. Repeat it - and notice how a workload can lose its disk share and suffer due to - multiple sequential readers. - - fsync can generate dependent IO where bunch of data is written in the - context of fsync, and later some journaling data is written. Journaling - data comes in only after fsync has finished its IO (atleast for ext4 - that seemed to be the case). Now if one decides not to idle on fsync - thread due to !REQ_IDLE, then next journaling write will not get - scheduled for another second. A process doing small fsync, will suffer - badly in presence of multiple sequential readers. - - Hence doing tree idling on threads using !REQ_IDLE flag on requests - provides isolation from multiple sequential readers and at the same - time we do not idle on individual threads. - -Q2. When to specify REQ_IDLE -A2. I would think whenever one is doing synchronous write and expecting - more writes to be dispatched from same context soon, should be able - to specify REQ_IDLE on writes and that probably should work well for - most of the cases. diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index 2c1e67058fd3..39e286d7afc9 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -64,7 +64,7 @@ guess, the kernel will put the process issuing IO to sleep for an amount of time, before entering a classic poll loop. This mode might be a little slower than pure classic polling, but it will be more efficient. If set to a value larger than 0, the kernel will put the process issuing -IO to sleep for this amont of microseconds before entering classic +IO to sleep for this amount of microseconds before entering classic polling. iostats (RW) @@ -194,4 +194,31 @@ blk-throttle makes decision based on the samplings. Lower time means cgroups have more smooth throughput, but higher CPU overhead. This exists only when CONFIG_BLK_DEV_THROTTLING_LOW is enabled. +zoned (RO) +---------- +This indicates if the device is a zoned block device and the zone model of the +device if it is indeed zoned. The possible values indicated by zoned are +"none" for regular block devices and "host-aware" or "host-managed" for zoned +block devices. The characteristics of host-aware and host-managed zoned block +devices are described in the ZBC (Zoned Block Commands) and ZAC +(Zoned Device ATA Command Set) standards. These standards also define the +"drive-managed" zone model. However, since drive-managed zoned block devices +do not support zone commands, they will be treated as regular block devices +and zoned will report "none". + +nr_zones (RO) +------------- +For zoned block devices (zoned attribute indicating "host-managed" or +"host-aware"), this indicates the total number of zones of the device. +This is always 0 for regular block devices. + +chunk_sectors (RO) +------------------ +This has different meaning depending on the type of the block device. +For a RAID device (dm-raid), chunk_sectors indicates the size in 512B sectors +of the RAID volume stripe segment. For a zoned block device, either host-aware +or host-managed, chunk_sectors indicates the size in 512B sectors of the zones +of the device, with the eventual exception of the last zone of the device which +may be smaller. + Jens Axboe , February 2009 diff --git a/Documentation/scsi/scsi-parameters.txt b/Documentation/scsi/scsi-parameters.txt index 92999d4e0cb8..25a4b4cf04a6 100644 --- a/Documentation/scsi/scsi-parameters.txt +++ b/Documentation/scsi/scsi-parameters.txt @@ -97,11 +97,6 @@ parameters may be changed at runtime by the command allowing boot to proceed. none ignores them, expecting user space to do the scan. - scsi_mod.use_blk_mq= - [SCSI] use blk-mq I/O path by default - See SCSI_MQ_DEFAULT in drivers/scsi/Kconfig. - Format: - sim710= [SCSI,HW] See header of drivers/scsi/sim710.c. diff --git a/block/Kconfig b/block/Kconfig index f7045aa47edb..8044452a4fd3 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -155,12 +155,6 @@ config BLK_CGROUP_IOLATENCY Note, this is an experimental interface and could be changed someday. -config BLK_WBT_SQ - bool "Single queue writeback throttling" - depends on BLK_WBT - ---help--- - Enable writeback throttling by default on legacy single queue devices - config BLK_WBT_MQ bool "Multiqueue writeback throttling" default y diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index f95a48b0d7b2..4626b88b2d5a 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -3,67 +3,6 @@ if BLOCK menu "IO Schedulers" -config IOSCHED_NOOP - bool - default y - ---help--- - The no-op I/O scheduler is a minimal scheduler that does basic merging - and sorting. Its main uses include non-disk based block devices like - memory devices, and specialised software or hardware environments - that do their own scheduling and require only minimal assistance from - the kernel. - -config IOSCHED_DEADLINE - tristate "Deadline I/O scheduler" - default y - ---help--- - The deadline I/O scheduler is simple and compact. It will provide - CSCAN service with FIFO expiration of requests, switching to - a new point in the service tree and doing a batch of IO from there - in case of expiry. - -config IOSCHED_CFQ - tristate "CFQ I/O scheduler" - default y - ---help--- - The CFQ I/O scheduler tries to distribute bandwidth equally - among all processes in the system. It should provide a fair - and low latency working environment, suitable for both desktop - and server systems. - - This is the default I/O scheduler. - -config CFQ_GROUP_IOSCHED - bool "CFQ Group Scheduling support" - depends on IOSCHED_CFQ && BLK_CGROUP - ---help--- - Enable group IO scheduling in CFQ. - -choice - - prompt "Default I/O scheduler" - default DEFAULT_CFQ - help - Select the I/O scheduler which will be used by default for all - block devices. - - config DEFAULT_DEADLINE - bool "Deadline" if IOSCHED_DEADLINE=y - - config DEFAULT_CFQ - bool "CFQ" if IOSCHED_CFQ=y - - config DEFAULT_NOOP - bool "No-op" - -endchoice - -config DEFAULT_IOSCHED - string - default "deadline" if DEFAULT_DEADLINE - default "cfq" if DEFAULT_CFQ - default "noop" if DEFAULT_NOOP - config MQ_IOSCHED_DEADLINE tristate "MQ deadline I/O scheduler" default y diff --git a/block/Makefile b/block/Makefile index 27eac600474f..eee1b4ceecf9 100644 --- a/block/Makefile +++ b/block/Makefile @@ -3,7 +3,7 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ +obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ @@ -18,9 +18,6 @@ obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o -obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o -obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o -obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 9fe5952d117d..c6113af31960 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -334,7 +334,7 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) parent = bfqg_parent(bfqg); - lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); + lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->queue_lock); if (unlikely(!parent)) return; @@ -642,7 +642,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) uint64_t serial_nr; rcu_read_lock(); - serial_nr = bio_blkcg(bio)->css.serial_nr; + serial_nr = __bio_blkcg(bio)->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger @@ -651,7 +651,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) goto out; - bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); + bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio)); /* * Update blkg_path for bfq_log_* functions. We cache this * path, and update it here, for the following diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 97337214bec4..cd307767a134 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -399,9 +399,9 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, unsigned long flags; struct bfq_io_cq *icq; - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(&q->queue_lock, flags); icq = icq_to_bic(ioc_lookup_icq(ioc, q)); - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(&q->queue_lock, flags); return icq; } @@ -4066,7 +4066,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q, * In addition, the following queue lock guarantees that * bfqq_group(bfqq) exists as well. */ - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); if (idle_timer_disabled) /* * Since the idle timer has been disabled, @@ -4085,7 +4085,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q, bfqg_stats_set_start_empty_time(bfqg); bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); } - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); } #else static inline void bfq_update_dispatch_stats(struct request_queue *q, @@ -4416,7 +4416,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, rcu_read_lock(); - bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); + bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio)); if (!bfqg) { bfqq = &bfqd->oom_bfqq; goto out; @@ -4669,11 +4669,11 @@ static void bfq_update_insert_stats(struct request_queue *q, * In addition, the following queue lock guarantees that * bfqq_group(bfqq) exists as well. */ - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); if (idle_timer_disabled) bfqg_stats_update_idle_time(bfqq_group(bfqq)); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); } #else static inline void bfq_update_insert_stats(struct request_queue *q, @@ -5414,9 +5414,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) } eq->elevator_data = bfqd; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); q->elevator = eq; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); /* * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. @@ -5756,7 +5756,7 @@ static struct elv_fs_entry bfq_attrs[] = { }; static struct elevator_type iosched_bfq_mq = { - .ops.mq = { + .ops = { .limit_depth = bfq_limit_depth, .prepare_request = bfq_prepare_request, .requeue_request = bfq_finish_requeue_request, @@ -5777,7 +5777,6 @@ static struct elevator_type iosched_bfq_mq = { .exit_sched = bfq_exit_queue, }, - .uses_mq = true, .icq_size = sizeof(struct bfq_io_cq), .icq_align = __alignof__(struct bfq_io_cq), .elevator_attrs = bfq_attrs, diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 290af497997b..1b633a3526d4 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -390,7 +390,6 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) bip->bip_iter.bi_sector += bytes_done >> 9; bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); } -EXPORT_SYMBOL(bio_integrity_advance); /** * bio_integrity_trim - Trim integrity vector @@ -460,7 +459,6 @@ void bioset_integrity_free(struct bio_set *bs) mempool_exit(&bs->bio_integrity_pool); mempool_exit(&bs->bvec_integrity_pool); } -EXPORT_SYMBOL(bioset_integrity_free); void __init bio_integrity_init(void) { diff --git a/block/bio.c b/block/bio.c index 4d86e90654b2..8281bfcbc265 100644 --- a/block/bio.c +++ b/block/bio.c @@ -244,7 +244,7 @@ fallback: void bio_uninit(struct bio *bio) { - bio_disassociate_task(bio); + bio_disassociate_blkg(bio); } EXPORT_SYMBOL(bio_uninit); @@ -571,14 +571,13 @@ void bio_put(struct bio *bio) } EXPORT_SYMBOL(bio_put); -inline int bio_phys_segments(struct request_queue *q, struct bio *bio) +int bio_phys_segments(struct request_queue *q, struct bio *bio) { if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) blk_recount_segments(q, bio); return bio->bi_phys_segments; } -EXPORT_SYMBOL(bio_phys_segments); /** * __bio_clone_fast - clone a bio that shares the original bio's biovec @@ -610,7 +609,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; - bio_clone_blkcg_association(bio, bio_src); + bio_clone_blkg_association(bio, bio_src); + blkcg_bio_issue_init(bio); } EXPORT_SYMBOL(__bio_clone_fast); @@ -901,7 +901,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) return 0; } -EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); static void submit_bio_wait_endio(struct bio *bio) { @@ -1592,7 +1591,6 @@ void bio_set_pages_dirty(struct bio *bio) set_page_dirty_lock(bvec->bv_page); } } -EXPORT_SYMBOL_GPL(bio_set_pages_dirty); static void bio_release_pages(struct bio *bio) { @@ -1662,17 +1660,33 @@ defer: spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } -EXPORT_SYMBOL_GPL(bio_check_pages_dirty); + +void update_io_ticks(struct hd_struct *part, unsigned long now) +{ + unsigned long stamp; +again: + stamp = READ_ONCE(part->stamp); + if (unlikely(stamp != now)) { + if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) { + __part_stat_add(part, io_ticks, 1); + } + } + if (part->partno) { + part = &part_to_disk(part)->part0; + goto again; + } +} void generic_start_io_acct(struct request_queue *q, int op, unsigned long sectors, struct hd_struct *part) { const int sgrp = op_stat_group(op); - int cpu = part_stat_lock(); - part_round_stats(q, cpu, part); - part_stat_inc(cpu, part, ios[sgrp]); - part_stat_add(cpu, part, sectors[sgrp], sectors); + part_stat_lock(); + + update_io_ticks(part, jiffies); + part_stat_inc(part, ios[sgrp]); + part_stat_add(part, sectors[sgrp], sectors); part_inc_in_flight(q, part, op_is_write(op)); part_stat_unlock(); @@ -1682,12 +1696,15 @@ EXPORT_SYMBOL(generic_start_io_acct); void generic_end_io_acct(struct request_queue *q, int req_op, struct hd_struct *part, unsigned long start_time) { - unsigned long duration = jiffies - start_time; + unsigned long now = jiffies; + unsigned long duration = now - start_time; const int sgrp = op_stat_group(req_op); - int cpu = part_stat_lock(); - part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_round_stats(q, cpu, part); + part_stat_lock(); + + update_io_ticks(part, now); + part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); + part_stat_add(part, time_in_queue, duration); part_dec_in_flight(q, part, op_is_write(req_op)); part_stat_unlock(); @@ -1957,102 +1974,133 @@ EXPORT_SYMBOL(bioset_init_from_src); #ifdef CONFIG_BLK_CGROUP -#ifdef CONFIG_MEMCG /** - * bio_associate_blkcg_from_page - associate a bio with the page's blkcg + * bio_disassociate_blkg - puts back the blkg reference if associated * @bio: target bio - * @page: the page to lookup the blkcg from * - * Associate @bio with the blkcg from @page's owning memcg. This works like - * every other associate function wrt references. + * Helper to disassociate the blkg from @bio if a blkg is associated. */ -int bio_associate_blkcg_from_page(struct bio *bio, struct page *page) +void bio_disassociate_blkg(struct bio *bio) { - struct cgroup_subsys_state *blkcg_css; - - if (unlikely(bio->bi_css)) - return -EBUSY; - if (!page->mem_cgroup) - return 0; - blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, - &io_cgrp_subsys); - bio->bi_css = blkcg_css; - return 0; -} -#endif /* CONFIG_MEMCG */ - -/** - * bio_associate_blkcg - associate a bio with the specified blkcg - * @bio: target bio - * @blkcg_css: css of the blkcg to associate - * - * Associate @bio with the blkcg specified by @blkcg_css. Block layer will - * treat @bio as if it were issued by a task which belongs to the blkcg. - * - * This function takes an extra reference of @blkcg_css which will be put - * when @bio is released. The caller must own @bio and is responsible for - * synchronizing calls to this function. - */ -int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) -{ - if (unlikely(bio->bi_css)) - return -EBUSY; - css_get(blkcg_css); - bio->bi_css = blkcg_css; - return 0; -} -EXPORT_SYMBOL_GPL(bio_associate_blkcg); - -/** - * bio_associate_blkg - associate a bio with the specified blkg - * @bio: target bio - * @blkg: the blkg to associate - * - * Associate @bio with the blkg specified by @blkg. This is the queue specific - * blkcg information associated with the @bio, a reference will be taken on the - * @blkg and will be freed when the bio is freed. - */ -int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) -{ - if (unlikely(bio->bi_blkg)) - return -EBUSY; - if (!blkg_try_get(blkg)) - return -ENODEV; - bio->bi_blkg = blkg; - return 0; -} - -/** - * bio_disassociate_task - undo bio_associate_current() - * @bio: target bio - */ -void bio_disassociate_task(struct bio *bio) -{ - if (bio->bi_ioc) { - put_io_context(bio->bi_ioc); - bio->bi_ioc = NULL; - } - if (bio->bi_css) { - css_put(bio->bi_css); - bio->bi_css = NULL; - } if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } } +EXPORT_SYMBOL_GPL(bio_disassociate_blkg); /** - * bio_clone_blkcg_association - clone blkcg association from src to dst bio + * __bio_associate_blkg - associate a bio with the a blkg + * @bio: target bio + * @blkg: the blkg to associate + * + * This tries to associate @bio with the specified @blkg. Association failure + * is handled by walking up the blkg tree. Therefore, the blkg associated can + * be anything between @blkg and the root_blkg. This situation only happens + * when a cgroup is dying and then the remaining bios will spill to the closest + * alive blkg. + * + * A reference will be taken on the @blkg and will be released when @bio is + * freed. + */ +static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) +{ + bio_disassociate_blkg(bio); + + bio->bi_blkg = blkg_tryget_closest(blkg); +} + +/** + * bio_associate_blkg_from_css - associate a bio with a specified css + * @bio: target bio + * @css: target css + * + * Associate @bio with the blkg found by combining the css's blkg and the + * request_queue of the @bio. This falls back to the queue's root_blkg if + * the association fails with the css. + */ +void bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css) +{ + struct request_queue *q = bio->bi_disk->queue; + struct blkcg_gq *blkg; + + rcu_read_lock(); + + if (!css || !css->parent) + blkg = q->root_blkg; + else + blkg = blkg_lookup_create(css_to_blkcg(css), q); + + __bio_associate_blkg(bio, blkg); + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); + +#ifdef CONFIG_MEMCG +/** + * bio_associate_blkg_from_page - associate a bio with the page's blkg + * @bio: target bio + * @page: the page to lookup the blkcg from + * + * Associate @bio with the blkg from @page's owning memcg and the respective + * request_queue. If cgroup_e_css returns %NULL, fall back to the queue's + * root_blkg. + */ +void bio_associate_blkg_from_page(struct bio *bio, struct page *page) +{ + struct cgroup_subsys_state *css; + + if (!page->mem_cgroup) + return; + + rcu_read_lock(); + + css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); + bio_associate_blkg_from_css(bio, css); + + rcu_read_unlock(); +} +#endif /* CONFIG_MEMCG */ + +/** + * bio_associate_blkg - associate a bio with a blkg + * @bio: target bio + * + * Associate @bio with the blkg found from the bio's css and request_queue. + * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is + * already associated, the css is reused and association redone as the + * request_queue may have changed. + */ +void bio_associate_blkg(struct bio *bio) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + if (bio->bi_blkg) + css = &bio_blkcg(bio)->css; + else + css = blkcg_css(); + + bio_associate_blkg_from_css(bio, css); + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(bio_associate_blkg); + +/** + * bio_clone_blkg_association - clone blkg association from src to dst bio * @dst: destination bio * @src: source bio */ -void bio_clone_blkcg_association(struct bio *dst, struct bio *src) +void bio_clone_blkg_association(struct bio *dst, struct bio *src) { - if (src->bi_css) - WARN_ON(bio_associate_blkcg(dst, src->bi_css)); + if (src->bi_blkg) + __bio_associate_blkg(dst, src->bi_blkg); } -EXPORT_SYMBOL_GPL(bio_clone_blkcg_association); +EXPORT_SYMBOL_GPL(bio_clone_blkg_association); #endif /* CONFIG_BLK_CGROUP */ static void __init biovec_init_slabs(void) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c630e02836a8..c8cc1cbb6370 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -76,14 +76,42 @@ static void blkg_free(struct blkcg_gq *blkg) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); - if (blkg->blkcg != &blkcg_root) - blk_exit_rl(blkg->q, &blkg->rl); - blkg_rwstat_exit(&blkg->stat_ios); blkg_rwstat_exit(&blkg->stat_bytes); kfree(blkg); } +static void __blkg_release(struct rcu_head *rcu) +{ + struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); + + percpu_ref_exit(&blkg->refcnt); + + /* release the blkcg and parent blkg refs this blkg has been holding */ + css_put(&blkg->blkcg->css); + if (blkg->parent) + blkg_put(blkg->parent); + + wb_congested_put(blkg->wb_congested); + + blkg_free(blkg); +} + +/* + * A group is RCU protected, but having an rcu lock does not mean that one + * can access all the fields of blkg and assume these are valid. For + * example, don't try to follow throtl_data and request queue links. + * + * Having a reference to blkg under an rcu allows accesses to only values + * local to groups like group stats and group rate limits. + */ +static void blkg_release(struct percpu_ref *ref) +{ + struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt); + + call_rcu(&blkg->rcu_head, __blkg_release); +} + /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with @@ -110,14 +138,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; - atomic_set(&blkg->refcnt, 1); - - /* root blkg uses @q->root_rl, init rl only for !root blkgs */ - if (blkcg != &blkcg_root) { - if (blk_init_rl(&blkg->rl, q, gfp_mask)) - goto err_free; - blkg->rl.blkg = blkg; - } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -157,7 +177,7 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); if (blkg && blkg->q == q) { if (update_hint) { - lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&q->queue_lock); rcu_assign_pointer(blkcg->blkg_hint, blkg); } return blkg; @@ -180,7 +200,13 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&q->queue_lock); + + /* request_queue is dying, do not create/recreate a blkg */ + if (blk_queue_dying(q)) { + ret = -ENODEV; + goto err_free_blkg; + } /* blkg holds a reference to blkcg */ if (!css_tryget_online(&blkcg->css)) { @@ -217,6 +243,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_get(blkg->parent); } + ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0, + GFP_NOWAIT | __GFP_NOWARN); + if (ret) + goto err_cancel_ref; + /* invoke per-policy init */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -249,6 +280,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_put(blkg); return ERR_PTR(ret); +err_cancel_ref: + percpu_ref_exit(&blkg->refcnt); err_put_congested: wb_congested_put(wb_congested); err_put_css: @@ -259,7 +292,7 @@ err_free_blkg: } /** - * blkg_lookup_create - lookup blkg, try to create one if not there + * __blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @q: request_queue of interest * @@ -268,24 +301,16 @@ err_free_blkg: * that all non-root blkg's have access to the parent blkg. This function * should be called under RCU read lock and @q->queue_lock. * - * Returns pointer to the looked up or created blkg on success, ERR_PTR() - * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not - * dead and bypassing, returns ERR_PTR(-EBUSY). + * Returns the blkg or the closest blkg if blkg_create() fails as it walks + * down from root. */ -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) +struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) { struct blkcg_gq *blkg; WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(q->queue_lock); - - /* - * This could be the first entry point of blkcg implementation and - * we shouldn't allow anything to go through for a bypassing queue. - */ - if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); + lockdep_assert_held(&q->queue_lock); blkg = __blkg_lookup(blkcg, q, true); if (blkg) @@ -293,30 +318,64 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, /* * Create blkgs walking down from blkcg_root to @blkcg, so that all - * non-root blkgs have access to their parents. + * non-root blkgs have access to their parents. Returns the closest + * blkg to the intended blkg should blkg_create() fail. */ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent = blkcg_parent(blkcg); + struct blkcg_gq *ret_blkg = q->root_blkg; - while (parent && !__blkg_lookup(parent, q, false)) { + while (parent) { + blkg = __blkg_lookup(parent, q, false); + if (blkg) { + /* remember closest blkg */ + ret_blkg = blkg; + break; + } pos = parent; parent = blkcg_parent(parent); } blkg = blkg_create(pos, q, NULL); - if (pos == blkcg || IS_ERR(blkg)) + if (IS_ERR(blkg)) + return ret_blkg; + if (pos == blkcg) return blkg; } } +/** + * blkg_lookup_create - find or create a blkg + * @blkcg: target block cgroup + * @q: target request_queue + * + * This looks up or creates the blkg representing the unique pair + * of the blkcg and the request_queue. + */ +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) +{ + struct blkcg_gq *blkg = blkg_lookup(blkcg, q); + + if (unlikely(!blkg)) { + unsigned long flags; + + spin_lock_irqsave(&q->queue_lock, flags); + blkg = __blkg_lookup_create(blkcg, q); + spin_unlock_irqrestore(&q->queue_lock, flags); + } + + return blkg; +} + static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; struct blkcg_gq *parent = blkg->parent; int i; - lockdep_assert_held(blkg->q->queue_lock); + lockdep_assert_held(&blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); /* Something wrong if we are trying to remove same group twice */ @@ -353,7 +412,7 @@ static void blkg_destroy(struct blkcg_gq *blkg) * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ - blkg_put(blkg); + percpu_ref_kill(&blkg->refcnt); } /** @@ -366,8 +425,7 @@ static void blkg_destroy_all(struct request_queue *q) { struct blkcg_gq *blkg, *n; - lockdep_assert_held(q->queue_lock); - + spin_lock_irq(&q->queue_lock); list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; @@ -377,7 +435,7 @@ static void blkg_destroy_all(struct request_queue *q) } q->root_blkg = NULL; - q->root_rl.blkg = NULL; + spin_unlock_irq(&q->queue_lock); } /* @@ -403,41 +461,6 @@ void __blkg_release_rcu(struct rcu_head *rcu_head) } EXPORT_SYMBOL_GPL(__blkg_release_rcu); -/* - * The next function used by blk_queue_for_each_rl(). It's a bit tricky - * because the root blkg uses @q->root_rl instead of its own rl. - */ -struct request_list *__blk_queue_next_rl(struct request_list *rl, - struct request_queue *q) -{ - struct list_head *ent; - struct blkcg_gq *blkg; - - /* - * Determine the current blkg list_head. The first entry is - * root_rl which is off @q->blkg_list and mapped to the head. - */ - if (rl == &q->root_rl) { - ent = &q->blkg_list; - /* There are no more block groups, hence no request lists */ - if (list_empty(ent)) - return NULL; - } else { - blkg = container_of(rl, struct blkcg_gq, rl); - ent = &blkg->q_node; - } - - /* walk to the next list_head, skip root blkcg */ - ent = ent->next; - if (ent == &q->root_blkg->q_node) - ent = ent->next; - if (ent == &q->blkg_list) - return NULL; - - blkg = container_of(ent, struct blkcg_gq, q_node); - return &blkg->rl; -} - static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) { @@ -477,7 +500,6 @@ const char *blkg_dev_name(struct blkcg_gq *blkg) return dev_name(blkg->q->backing_dev_info->dev); return NULL; } -EXPORT_SYMBOL_GPL(blkg_dev_name); /** * blkcg_print_blkgs - helper for printing per-blkg data @@ -508,10 +530,10 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - spin_lock_irq(blkg->q->queue_lock); + spin_lock_irq(&blkg->q->queue_lock); if (blkcg_policy_enabled(blkg->q, pol)) total += prfill(sf, blkg->pd[pol->plid], data); - spin_unlock_irq(blkg->q->queue_lock); + spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); @@ -709,7 +731,7 @@ u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, struct cgroup_subsys_state *pos_css; u64 sum = 0; - lockdep_assert_held(blkg->q->queue_lock); + lockdep_assert_held(&blkg->q->queue_lock); rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { @@ -752,7 +774,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkg_rwstat sum = { }; int i; - lockdep_assert_held(blkg->q->queue_lock); + lockdep_assert_held(&blkg->q->queue_lock); rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { @@ -783,18 +805,10 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, struct request_queue *q) { WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) return ERR_PTR(-EOPNOTSUPP); - - /* - * This could be the first entry point of blkcg implementation and - * we shouldn't allow anything to go through for a bypassing queue. - */ - if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); - return __blkg_lookup(blkcg, q, true /* update_hint */); } @@ -812,7 +826,7 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx) - __acquires(rcu) __acquires(disk->queue->queue_lock) + __acquires(rcu) __acquires(&disk->queue->queue_lock) { struct gendisk *disk; struct request_queue *q; @@ -840,7 +854,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, q = disk->queue; rcu_read_lock(); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); blkg = blkg_lookup_check(blkcg, pol, q); if (IS_ERR(blkg)) { @@ -867,7 +881,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, } /* Drop locks to do new blkg allocation with GFP_KERNEL. */ - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); new_blkg = blkg_alloc(pos, q, GFP_KERNEL); @@ -877,7 +891,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, } rcu_read_lock(); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); blkg = blkg_lookup_check(pos, pol, q); if (IS_ERR(blkg)) { @@ -905,7 +919,7 @@ success: return 0; fail_unlock: - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); fail: put_disk_and_module(disk); @@ -921,7 +935,6 @@ fail: } return ret; } -EXPORT_SYMBOL_GPL(blkg_conf_prep); /** * blkg_conf_finish - finish up per-blkg config update @@ -931,13 +944,12 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); * with blkg_conf_prep(). */ void blkg_conf_finish(struct blkg_conf_ctx *ctx) - __releases(ctx->disk->queue->queue_lock) __releases(rcu) + __releases(&ctx->disk->queue->queue_lock) __releases(rcu) { - spin_unlock_irq(ctx->disk->queue->queue_lock); + spin_unlock_irq(&ctx->disk->queue->queue_lock); rcu_read_unlock(); put_disk_and_module(ctx->disk); } -EXPORT_SYMBOL_GPL(blkg_conf_finish); static int blkcg_print_stat(struct seq_file *sf, void *v) { @@ -967,7 +979,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) */ off += scnprintf(buf+off, size-off, "%s ", dname); - spin_lock_irq(blkg->q->queue_lock); + spin_lock_irq(&blkg->q->queue_lock); rwstat = blkg_rwstat_recursive_sum(blkg, NULL, offsetof(struct blkcg_gq, stat_bytes)); @@ -981,7 +993,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); - spin_unlock_irq(blkg->q->queue_lock); + spin_unlock_irq(&blkg->q->queue_lock); if (rbytes || wbytes || rios || wios) { has_stats = true; @@ -1102,9 +1114,9 @@ void blkcg_destroy_blkgs(struct blkcg *blkcg) struct blkcg_gq, blkcg_node); struct request_queue *q = blkg->q; - if (spin_trylock(q->queue_lock)) { + if (spin_trylock(&q->queue_lock)) { blkg_destroy(blkg); - spin_unlock(q->queue_lock); + spin_unlock(&q->queue_lock); } else { spin_unlock_irq(&blkcg->lock); cpu_relax(); @@ -1225,36 +1237,31 @@ int blkcg_init_queue(struct request_queue *q) /* Make sure the root blkg exists. */ rcu_read_lock(); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); blkg = blkg_create(&blkcg_root, q, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; - q->root_rl.blkg = blkg; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); ret = blk_iolatency_init(q); - if (ret) { - spin_lock_irq(q->queue_lock); - blkg_destroy_all(q); - spin_unlock_irq(q->queue_lock); - return ret; - } + if (ret) + goto err_destroy_all; ret = blk_throtl_init(q); - if (ret) { - spin_lock_irq(q->queue_lock); - blkg_destroy_all(q); - spin_unlock_irq(q->queue_lock); - } - return ret; + if (ret) + goto err_destroy_all; + return 0; +err_destroy_all: + blkg_destroy_all(q); + return ret; err_unlock: - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); @@ -1269,7 +1276,7 @@ err_unlock: */ void blkcg_drain_queue(struct request_queue *q) { - lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&q->queue_lock); /* * @q could be exiting and already have destroyed all blkgs as @@ -1289,10 +1296,7 @@ void blkcg_drain_queue(struct request_queue *q) */ void blkcg_exit_queue(struct request_queue *q) { - spin_lock_irq(q->queue_lock); blkg_destroy_all(q); - spin_unlock_irq(q->queue_lock); - blk_throtl_exit(q); } @@ -1396,10 +1400,8 @@ int blkcg_activate_policy(struct request_queue *q, if (blkcg_policy_enabled(q, pol)) return 0; - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_freeze_queue(q); - else - blk_queue_bypass_start(q); pd_prealloc: if (!pd_prealloc) { pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); @@ -1409,7 +1411,7 @@ pd_prealloc: } } - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; @@ -1421,7 +1423,7 @@ pd_prealloc: if (!pd) swap(pd, pd_prealloc); if (!pd) { - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); goto pd_prealloc; } @@ -1435,12 +1437,10 @@ pd_prealloc: __set_bit(pol->plid, q->blkcg_pols); ret = 0; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); out_bypass_end: - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_unfreeze_queue(q); - else - blk_queue_bypass_end(q); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); return ret; @@ -1463,12 +1463,10 @@ void blkcg_deactivate_policy(struct request_queue *q, if (!blkcg_policy_enabled(q, pol)) return; - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_freeze_queue(q); - else - blk_queue_bypass_start(q); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); @@ -1481,12 +1479,10 @@ void blkcg_deactivate_policy(struct request_queue *q, } } - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_unfreeze_queue(q); - else - blk_queue_bypass_end(q); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); @@ -1748,8 +1744,7 @@ void blkcg_maybe_throttle_current(void) blkg = blkg_lookup(blkcg, q); if (!blkg) goto out; - blkg = blkg_try_get(blkg); - if (!blkg) + if (!blkg_tryget(blkg)) goto out; rcu_read_unlock(); @@ -1761,7 +1756,6 @@ out: rcu_read_unlock(); blk_put_queue(q); } -EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current); /** * blkcg_schedule_throttle - this task needs to check for throttling @@ -1795,7 +1789,6 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) current->use_memdelay = use_memdelay; set_notify_resume(current); } -EXPORT_SYMBOL_GPL(blkcg_schedule_throttle); /** * blkcg_add_delay - add delay to this blkg @@ -1810,7 +1803,6 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) blkcg_scale_delay(blkg, now); atomic64_add(delta, &blkg->delay_nsec); } -EXPORT_SYMBOL_GPL(blkcg_add_delay); module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); diff --git a/block/blk-core.c b/block/blk-core.c index deb56932f8c4..c78042975737 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -57,11 +57,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); -/* - * For the allocated request tables - */ -struct kmem_cache *request_cachep; - /* * For queue allocation */ @@ -79,11 +74,7 @@ static struct workqueue_struct *kblockd_workqueue; */ void blk_queue_flag_set(unsigned int flag, struct request_queue *q) { - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - queue_flag_set(flag, q); - spin_unlock_irqrestore(q->queue_lock, flags); + set_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_set); @@ -94,11 +85,7 @@ EXPORT_SYMBOL(blk_queue_flag_set); */ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) { - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - queue_flag_clear(flag, q); - spin_unlock_irqrestore(q->queue_lock, flags); + clear_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_clear); @@ -112,85 +99,15 @@ EXPORT_SYMBOL(blk_queue_flag_clear); */ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) { - unsigned long flags; - bool res; - - spin_lock_irqsave(q->queue_lock, flags); - res = queue_flag_test_and_set(flag, q); - spin_unlock_irqrestore(q->queue_lock, flags); - - return res; + return test_and_set_bit(flag, &q->queue_flags); } EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); -/** - * blk_queue_flag_test_and_clear - atomically test and clear a queue flag - * @flag: flag to be cleared - * @q: request queue - * - * Returns the previous value of @flag - 0 if the flag was not set and 1 if - * the flag was set. - */ -bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q) -{ - unsigned long flags; - bool res; - - spin_lock_irqsave(q->queue_lock, flags); - res = queue_flag_test_and_clear(flag, q); - spin_unlock_irqrestore(q->queue_lock, flags); - - return res; -} -EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_clear); - -static void blk_clear_congested(struct request_list *rl, int sync) -{ -#ifdef CONFIG_CGROUP_WRITEBACK - clear_wb_congested(rl->blkg->wb_congested, sync); -#else - /* - * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't - * flip its congestion state for events on other blkcgs. - */ - if (rl == &rl->q->root_rl) - clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync); -#endif -} - -static void blk_set_congested(struct request_list *rl, int sync) -{ -#ifdef CONFIG_CGROUP_WRITEBACK - set_wb_congested(rl->blkg->wb_congested, sync); -#else - /* see blk_clear_congested() */ - if (rl == &rl->q->root_rl) - set_wb_congested(rl->q->backing_dev_info->wb.congested, sync); -#endif -} - -void blk_queue_congestion_threshold(struct request_queue *q) -{ - int nr; - - nr = q->nr_requests - (q->nr_requests / 8) + 1; - if (nr > q->nr_requests) - nr = q->nr_requests; - q->nr_congestion_on = nr; - - nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; - if (nr < 1) - nr = 1; - q->nr_congestion_off = nr; -} - void blk_rq_init(struct request_queue *q, struct request *rq) { memset(rq, 0, sizeof(*rq)); INIT_LIST_HEAD(&rq->queuelist); - INIT_LIST_HEAD(&rq->timeout_list); - rq->cpu = -1; rq->q = q; rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); @@ -256,10 +173,11 @@ static void print_req_error(struct request *req, blk_status_t status) if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return; - printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n", - __func__, blk_errors[idx].name, req->rq_disk ? - req->rq_disk->disk_name : "?", - (unsigned long long)blk_rq_pos(req)); + printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu flags %x\n", + __func__, blk_errors[idx].name, + req->rq_disk ? req->rq_disk->disk_name : "?", + (unsigned long long)blk_rq_pos(req), + req->cmd_flags); } static void req_bio_endio(struct request *rq, struct bio *bio, @@ -292,99 +210,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg) } EXPORT_SYMBOL(blk_dump_rq_flags); -static void blk_delay_work(struct work_struct *work) -{ - struct request_queue *q; - - q = container_of(work, struct request_queue, delay_work.work); - spin_lock_irq(q->queue_lock); - __blk_run_queue(q); - spin_unlock_irq(q->queue_lock); -} - -/** - * blk_delay_queue - restart queueing after defined interval - * @q: The &struct request_queue in question - * @msecs: Delay in msecs - * - * Description: - * Sometimes queueing needs to be postponed for a little while, to allow - * resources to come back. This function will make sure that queueing is - * restarted around the specified time. - */ -void blk_delay_queue(struct request_queue *q, unsigned long msecs) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - if (likely(!blk_queue_dead(q))) - queue_delayed_work(kblockd_workqueue, &q->delay_work, - msecs_to_jiffies(msecs)); -} -EXPORT_SYMBOL(blk_delay_queue); - -/** - * blk_start_queue_async - asynchronously restart a previously stopped queue - * @q: The &struct request_queue in question - * - * Description: - * blk_start_queue_async() will clear the stop flag on the queue, and - * ensure that the request_fn for the queue is run from an async - * context. - **/ -void blk_start_queue_async(struct request_queue *q) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - queue_flag_clear(QUEUE_FLAG_STOPPED, q); - blk_run_queue_async(q); -} -EXPORT_SYMBOL(blk_start_queue_async); - -/** - * blk_start_queue - restart a previously stopped queue - * @q: The &struct request_queue in question - * - * Description: - * blk_start_queue() will clear the stop flag on the queue, and call - * the request_fn for the queue if it was in a stopped state when - * entered. Also see blk_stop_queue(). - **/ -void blk_start_queue(struct request_queue *q) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - queue_flag_clear(QUEUE_FLAG_STOPPED, q); - __blk_run_queue(q); -} -EXPORT_SYMBOL(blk_start_queue); - -/** - * blk_stop_queue - stop a queue - * @q: The &struct request_queue in question - * - * Description: - * The Linux block layer assumes that a block driver will consume all - * entries on the request queue when the request_fn strategy is called. - * Often this will not happen, because of hardware limitations (queue - * depth settings). If a device driver gets a 'queue full' response, - * or if it simply chooses not to queue more I/O at one point, it can - * call this function to prevent the request_fn from being called until - * the driver has signalled it's ready to go again. This happens by calling - * blk_start_queue() to restart queue operations. - **/ -void blk_stop_queue(struct request_queue *q) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - cancel_delayed_work(&q->delay_work); - queue_flag_set(QUEUE_FLAG_STOPPED, q); -} -EXPORT_SYMBOL(blk_stop_queue); - /** * blk_sync_queue - cancel any pending callbacks on a queue * @q: the queue @@ -408,15 +233,13 @@ void blk_sync_queue(struct request_queue *q) del_timer_sync(&q->timeout); cancel_work_sync(&q->timeout_work); - if (q->mq_ops) { + if (queue_is_mq(q)) { struct blk_mq_hw_ctx *hctx; int i; cancel_delayed_work_sync(&q->requeue_work); queue_for_each_hw_ctx(q, hctx, i) cancel_delayed_work_sync(&hctx->run_work); - } else { - cancel_delayed_work_sync(&q->delay_work); } } EXPORT_SYMBOL(blk_sync_queue); @@ -442,250 +265,12 @@ void blk_clear_pm_only(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_clear_pm_only); -/** - * __blk_run_queue_uncond - run a queue whether or not it has been stopped - * @q: The queue to run - * - * Description: - * Invoke request handling on a queue if there are any pending requests. - * May be used to restart request handling after a request has completed. - * This variant runs the queue whether or not the queue has been - * stopped. Must be called with the queue lock held and interrupts - * disabled. See also @blk_run_queue. - */ -inline void __blk_run_queue_uncond(struct request_queue *q) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - if (unlikely(blk_queue_dead(q))) - return; - - /* - * Some request_fn implementations, e.g. scsi_request_fn(), unlock - * the queue lock internally. As a result multiple threads may be - * running such a request function concurrently. Keep track of the - * number of active request_fn invocations such that blk_drain_queue() - * can wait until all these request_fn calls have finished. - */ - q->request_fn_active++; - q->request_fn(q); - q->request_fn_active--; -} -EXPORT_SYMBOL_GPL(__blk_run_queue_uncond); - -/** - * __blk_run_queue - run a single device queue - * @q: The queue to run - * - * Description: - * See @blk_run_queue. - */ -void __blk_run_queue(struct request_queue *q) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - if (unlikely(blk_queue_stopped(q))) - return; - - __blk_run_queue_uncond(q); -} -EXPORT_SYMBOL(__blk_run_queue); - -/** - * blk_run_queue_async - run a single device queue in workqueue context - * @q: The queue to run - * - * Description: - * Tells kblockd to perform the equivalent of @blk_run_queue on behalf - * of us. - * - * Note: - * Since it is not allowed to run q->delay_work after blk_cleanup_queue() - * has canceled q->delay_work, callers must hold the queue lock to avoid - * race conditions between blk_cleanup_queue() and blk_run_queue_async(). - */ -void blk_run_queue_async(struct request_queue *q) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q))) - mod_delayed_work(kblockd_workqueue, &q->delay_work, 0); -} -EXPORT_SYMBOL(blk_run_queue_async); - -/** - * blk_run_queue - run a single device queue - * @q: The queue to run - * - * Description: - * Invoke request handling on this queue, if it has pending work to do. - * May be used to restart queueing when a request has completed. - */ -void blk_run_queue(struct request_queue *q) -{ - unsigned long flags; - - WARN_ON_ONCE(q->mq_ops); - - spin_lock_irqsave(q->queue_lock, flags); - __blk_run_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} -EXPORT_SYMBOL(blk_run_queue); - void blk_put_queue(struct request_queue *q) { kobject_put(&q->kobj); } EXPORT_SYMBOL(blk_put_queue); -/** - * __blk_drain_queue - drain requests from request_queue - * @q: queue to drain - * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV - * - * Drain requests from @q. If @drain_all is set, all requests are drained. - * If not, only ELVPRIV requests are drained. The caller is responsible - * for ensuring that no new requests which need to be drained are queued. - */ -static void __blk_drain_queue(struct request_queue *q, bool drain_all) - __releases(q->queue_lock) - __acquires(q->queue_lock) -{ - int i; - - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - while (true) { - bool drain = false; - - /* - * The caller might be trying to drain @q before its - * elevator is initialized. - */ - if (q->elevator) - elv_drain_elevator(q); - - blkcg_drain_queue(q); - - /* - * This function might be called on a queue which failed - * driver init after queue creation or is not yet fully - * active yet. Some drivers (e.g. fd and loop) get unhappy - * in such cases. Kick queue iff dispatch queue has - * something on it and @q has request_fn set. - */ - if (!list_empty(&q->queue_head) && q->request_fn) - __blk_run_queue(q); - - drain |= q->nr_rqs_elvpriv; - drain |= q->request_fn_active; - - /* - * Unfortunately, requests are queued at and tracked from - * multiple places and there's no single counter which can - * be drained. Check all the queues and counters. - */ - if (drain_all) { - struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); - drain |= !list_empty(&q->queue_head); - for (i = 0; i < 2; i++) { - drain |= q->nr_rqs[i]; - drain |= q->in_flight[i]; - if (fq) - drain |= !list_empty(&fq->flush_queue[i]); - } - } - - if (!drain) - break; - - spin_unlock_irq(q->queue_lock); - - msleep(10); - - spin_lock_irq(q->queue_lock); - } - - /* - * With queue marked dead, any woken up waiter will fail the - * allocation path, so the wakeup chaining is lost and we're - * left with hung waiters. We need to wake up those waiters. - */ - if (q->request_fn) { - struct request_list *rl; - - blk_queue_for_each_rl(rl, q) - for (i = 0; i < ARRAY_SIZE(rl->wait); i++) - wake_up_all(&rl->wait[i]); - } -} - -void blk_drain_queue(struct request_queue *q) -{ - spin_lock_irq(q->queue_lock); - __blk_drain_queue(q, true); - spin_unlock_irq(q->queue_lock); -} - -/** - * blk_queue_bypass_start - enter queue bypass mode - * @q: queue of interest - * - * In bypass mode, only the dispatch FIFO queue of @q is used. This - * function makes @q enter bypass mode and drains all requests which were - * throttled or issued before. On return, it's guaranteed that no request - * is being throttled or has ELVPRIV set and blk_queue_bypass() %true - * inside queue or RCU read lock. - */ -void blk_queue_bypass_start(struct request_queue *q) -{ - WARN_ON_ONCE(q->mq_ops); - - spin_lock_irq(q->queue_lock); - q->bypass_depth++; - queue_flag_set(QUEUE_FLAG_BYPASS, q); - spin_unlock_irq(q->queue_lock); - - /* - * Queues start drained. Skip actual draining till init is - * complete. This avoids lenghty delays during queue init which - * can happen many times during boot. - */ - if (blk_queue_init_done(q)) { - spin_lock_irq(q->queue_lock); - __blk_drain_queue(q, false); - spin_unlock_irq(q->queue_lock); - - /* ensure blk_queue_bypass() is %true inside RCU read lock */ - synchronize_rcu(); - } -} -EXPORT_SYMBOL_GPL(blk_queue_bypass_start); - -/** - * blk_queue_bypass_end - leave queue bypass mode - * @q: queue of interest - * - * Leave bypass mode and restore the normal queueing behavior. - * - * Note: although blk_queue_bypass_start() is only called for blk-sq queues, - * this function is called for both blk-sq and blk-mq queues. - */ -void blk_queue_bypass_end(struct request_queue *q) -{ - spin_lock_irq(q->queue_lock); - if (!--q->bypass_depth) - queue_flag_clear(QUEUE_FLAG_BYPASS, q); - WARN_ON_ONCE(q->bypass_depth < 0); - spin_unlock_irq(q->queue_lock); -} -EXPORT_SYMBOL_GPL(blk_queue_bypass_end); - void blk_set_queue_dying(struct request_queue *q) { blk_queue_flag_set(QUEUE_FLAG_DYING, q); @@ -697,20 +282,8 @@ void blk_set_queue_dying(struct request_queue *q) */ blk_freeze_queue_start(q); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_wake_waiters(q); - else { - struct request_list *rl; - - spin_lock_irq(q->queue_lock); - blk_queue_for_each_rl(rl, q) { - if (rl->rq_pool) { - wake_up_all(&rl->wait[BLK_RW_SYNC]); - wake_up_all(&rl->wait[BLK_RW_ASYNC]); - } - } - spin_unlock_irq(q->queue_lock); - } /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); @@ -755,29 +328,13 @@ void blk_exit_queue(struct request_queue *q) */ void blk_cleanup_queue(struct request_queue *q) { - spinlock_t *lock = q->queue_lock; - /* mark @q DYING, no new request or merges will be allowed afterwards */ mutex_lock(&q->sysfs_lock); blk_set_queue_dying(q); - spin_lock_irq(lock); - /* - * A dying queue is permanently in bypass mode till released. Note - * that, unlike blk_queue_bypass_start(), we aren't performing - * synchronize_rcu() after entering bypass mode to avoid the delay - * as some drivers create and destroy a lot of queues while - * probing. This is still safe because blk_release_queue() will be - * called only after the queue refcnt drops to zero and nothing, - * RCU or not, would be traversing the queue by then. - */ - q->bypass_depth++; - queue_flag_set(QUEUE_FLAG_BYPASS, q); - - queue_flag_set(QUEUE_FLAG_NOMERGES, q); - queue_flag_set(QUEUE_FLAG_NOXMERGES, q); - queue_flag_set(QUEUE_FLAG_DYING, q); - spin_unlock_irq(lock); + blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); + blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); + blk_queue_flag_set(QUEUE_FLAG_DYING, q); mutex_unlock(&q->sysfs_lock); /* @@ -788,9 +345,7 @@ void blk_cleanup_queue(struct request_queue *q) rq_qos_exit(q); - spin_lock_irq(lock); - queue_flag_set(QUEUE_FLAG_DEAD, q); - spin_unlock_irq(lock); + blk_queue_flag_set(QUEUE_FLAG_DEAD, q); /* * make sure all in-progress dispatch are completed because @@ -801,7 +356,7 @@ void blk_cleanup_queue(struct request_queue *q) * We rely on driver to deal with the race in case that queue * initialization isn't done. */ - if (q->mq_ops && blk_queue_init_done(q)) + if (queue_is_mq(q) && blk_queue_init_done(q)) blk_mq_quiesce_queue(q); /* for synchronous bio-based driver finish in-flight integrity i/o */ @@ -819,98 +374,19 @@ void blk_cleanup_queue(struct request_queue *q) blk_exit_queue(q); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_free_queue(q); - percpu_ref_exit(&q->q_usage_counter); - spin_lock_irq(lock); - if (q->queue_lock != &q->__queue_lock) - q->queue_lock = &q->__queue_lock; - spin_unlock_irq(lock); + percpu_ref_exit(&q->q_usage_counter); /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); -/* Allocate memory local to the request queue */ -static void *alloc_request_simple(gfp_t gfp_mask, void *data) -{ - struct request_queue *q = data; - - return kmem_cache_alloc_node(request_cachep, gfp_mask, q->node); -} - -static void free_request_simple(void *element, void *data) -{ - kmem_cache_free(request_cachep, element); -} - -static void *alloc_request_size(gfp_t gfp_mask, void *data) -{ - struct request_queue *q = data; - struct request *rq; - - rq = kmalloc_node(sizeof(struct request) + q->cmd_size, gfp_mask, - q->node); - if (rq && q->init_rq_fn && q->init_rq_fn(q, rq, gfp_mask) < 0) { - kfree(rq); - rq = NULL; - } - return rq; -} - -static void free_request_size(void *element, void *data) -{ - struct request_queue *q = data; - - if (q->exit_rq_fn) - q->exit_rq_fn(q, element); - kfree(element); -} - -int blk_init_rl(struct request_list *rl, struct request_queue *q, - gfp_t gfp_mask) -{ - if (unlikely(rl->rq_pool) || q->mq_ops) - return 0; - - rl->q = q; - rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; - rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; - init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); - init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); - - if (q->cmd_size) { - rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, - alloc_request_size, free_request_size, - q, gfp_mask, q->node); - } else { - rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, - alloc_request_simple, free_request_simple, - q, gfp_mask, q->node); - } - if (!rl->rq_pool) - return -ENOMEM; - - if (rl != &q->root_rl) - WARN_ON_ONCE(!blk_get_queue(q)); - - return 0; -} - -void blk_exit_rl(struct request_queue *q, struct request_list *rl) -{ - if (rl->rq_pool) { - mempool_destroy(rl->rq_pool); - if (rl != &q->root_rl) - blk_put_queue(q); - } -} - struct request_queue *blk_alloc_queue(gfp_t gfp_mask) { - return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE, NULL); + return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE); } EXPORT_SYMBOL(blk_alloc_queue); @@ -990,17 +466,8 @@ static void blk_rq_timed_out_timer(struct timer_list *t) * blk_alloc_queue_node - allocate a request queue * @gfp_mask: memory allocation flags * @node_id: NUMA node to allocate memory from - * @lock: For legacy queues, pointer to a spinlock that will be used to e.g. - * serialize calls to the legacy .request_fn() callback. Ignored for - * blk-mq request queues. - * - * Note: pass the queue lock as the third argument to this function instead of - * setting the queue lock pointer explicitly to avoid triggering a sporadic - * crash in the blkcg code. This function namely calls blkcg_init_queue() and - * the queue lock pointer must be set before blkcg_init_queue() is called. */ -struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, - spinlock_t *lock) +struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; int ret; @@ -1012,8 +479,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; - q->end_sector = 0; - q->boundary_rq = NULL; q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) @@ -1041,12 +506,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, laptop_mode_timer_fn, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, NULL); - INIT_LIST_HEAD(&q->timeout_list); INIT_LIST_HEAD(&q->icq_list); #ifdef CONFIG_BLK_CGROUP INIT_LIST_HEAD(&q->blkg_list); #endif - INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); kobject_init(&q->kobj, &blk_queue_ktype); @@ -1054,18 +517,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, mutex_init(&q->blk_trace_mutex); #endif mutex_init(&q->sysfs_lock); - spin_lock_init(&q->__queue_lock); - - q->queue_lock = lock ? : &q->__queue_lock; - - /* - * A queue starts its life with bypass turned on to avoid - * unnecessary bypass on/off overhead and nasty surprises during - * init. The initial bypass will be finished when the queue is - * registered by blk_register_queue(). - */ - q->bypass_depth = 1; - queue_flag_set_unlocked(QUEUE_FLAG_BYPASS, q); + spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); @@ -1099,105 +551,6 @@ fail_q: } EXPORT_SYMBOL(blk_alloc_queue_node); -/** - * blk_init_queue - prepare a request queue for use with a block device - * @rfn: The function to be called to process requests that have been - * placed on the queue. - * @lock: Request queue spin lock - * - * Description: - * If a block device wishes to use the standard request handling procedures, - * which sorts requests and coalesces adjacent requests, then it must - * call blk_init_queue(). The function @rfn will be called when there - * are requests on the queue that need to be processed. If the device - * supports plugging, then @rfn may not be called immediately when requests - * are available on the queue, but may be called at some time later instead. - * Plugged queues are generally unplugged when a buffer belonging to one - * of the requests on the queue is needed, or due to memory pressure. - * - * @rfn is not required, or even expected, to remove all requests off the - * queue, but only as many as it can handle at a time. If it does leave - * requests on the queue, it is responsible for arranging that the requests - * get dealt with eventually. - * - * The queue spin lock must be held while manipulating the requests on the - * request queue; this lock will be taken also from interrupt context, so irq - * disabling is needed for it. - * - * Function returns a pointer to the initialized request queue, or %NULL if - * it didn't succeed. - * - * Note: - * blk_init_queue() must be paired with a blk_cleanup_queue() call - * when the block device is deactivated (such as at module unload). - **/ - -struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) -{ - return blk_init_queue_node(rfn, lock, NUMA_NO_NODE); -} -EXPORT_SYMBOL(blk_init_queue); - -struct request_queue * -blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) -{ - struct request_queue *q; - - q = blk_alloc_queue_node(GFP_KERNEL, node_id, lock); - if (!q) - return NULL; - - q->request_fn = rfn; - if (blk_init_allocated_queue(q) < 0) { - blk_cleanup_queue(q); - return NULL; - } - - return q; -} -EXPORT_SYMBOL(blk_init_queue_node); - -static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); - - -int blk_init_allocated_queue(struct request_queue *q) -{ - WARN_ON_ONCE(q->mq_ops); - - q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size, GFP_KERNEL); - if (!q->fq) - return -ENOMEM; - - if (q->init_rq_fn && q->init_rq_fn(q, q->fq->flush_rq, GFP_KERNEL)) - goto out_free_flush_queue; - - if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) - goto out_exit_flush_rq; - - INIT_WORK(&q->timeout_work, blk_timeout_work); - q->queue_flags |= QUEUE_FLAG_DEFAULT; - - /* - * This also sets hw/phys segments, boundary and size - */ - blk_queue_make_request(q, blk_queue_bio); - - q->sg_reserved_size = INT_MAX; - - if (elevator_init(q)) - goto out_exit_flush_rq; - return 0; - -out_exit_flush_rq: - if (q->exit_rq_fn) - q->exit_rq_fn(q, q->fq->flush_rq); -out_free_flush_queue: - blk_free_flush_queue(q->fq); - q->fq = NULL; - return -ENOMEM; -} -EXPORT_SYMBOL(blk_init_allocated_queue); - bool blk_get_queue(struct request_queue *q) { if (likely(!blk_queue_dying(q))) { @@ -1209,406 +562,6 @@ bool blk_get_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_get_queue); -static inline void blk_free_request(struct request_list *rl, struct request *rq) -{ - if (rq->rq_flags & RQF_ELVPRIV) { - elv_put_request(rl->q, rq); - if (rq->elv.icq) - put_io_context(rq->elv.icq->ioc); - } - - mempool_free(rq, rl->rq_pool); -} - -/* - * ioc_batching returns true if the ioc is a valid batching request and - * should be given priority access to a request. - */ -static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) -{ - if (!ioc) - return 0; - - /* - * Make sure the process is able to allocate at least 1 request - * even if the batch times out, otherwise we could theoretically - * lose wakeups. - */ - return ioc->nr_batch_requests == q->nr_batching || - (ioc->nr_batch_requests > 0 - && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); -} - -/* - * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This - * will cause the process to be a "batcher" on all queues in the system. This - * is the behaviour we want though - once it gets a wakeup it should be given - * a nice run. - */ -static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) -{ - if (!ioc || ioc_batching(q, ioc)) - return; - - ioc->nr_batch_requests = q->nr_batching; - ioc->last_waited = jiffies; -} - -static void __freed_request(struct request_list *rl, int sync) -{ - struct request_queue *q = rl->q; - - if (rl->count[sync] < queue_congestion_off_threshold(q)) - blk_clear_congested(rl, sync); - - if (rl->count[sync] + 1 <= q->nr_requests) { - if (waitqueue_active(&rl->wait[sync])) - wake_up(&rl->wait[sync]); - - blk_clear_rl_full(rl, sync); - } -} - -/* - * A request has just been released. Account for it, update the full and - * congestion status, wake up any waiters. Called under q->queue_lock. - */ -static void freed_request(struct request_list *rl, bool sync, - req_flags_t rq_flags) -{ - struct request_queue *q = rl->q; - - q->nr_rqs[sync]--; - rl->count[sync]--; - if (rq_flags & RQF_ELVPRIV) - q->nr_rqs_elvpriv--; - - __freed_request(rl, sync); - - if (unlikely(rl->starved[sync ^ 1])) - __freed_request(rl, sync ^ 1); -} - -int blk_update_nr_requests(struct request_queue *q, unsigned int nr) -{ - struct request_list *rl; - int on_thresh, off_thresh; - - WARN_ON_ONCE(q->mq_ops); - - spin_lock_irq(q->queue_lock); - q->nr_requests = nr; - blk_queue_congestion_threshold(q); - on_thresh = queue_congestion_on_threshold(q); - off_thresh = queue_congestion_off_threshold(q); - - blk_queue_for_each_rl(rl, q) { - if (rl->count[BLK_RW_SYNC] >= on_thresh) - blk_set_congested(rl, BLK_RW_SYNC); - else if (rl->count[BLK_RW_SYNC] < off_thresh) - blk_clear_congested(rl, BLK_RW_SYNC); - - if (rl->count[BLK_RW_ASYNC] >= on_thresh) - blk_set_congested(rl, BLK_RW_ASYNC); - else if (rl->count[BLK_RW_ASYNC] < off_thresh) - blk_clear_congested(rl, BLK_RW_ASYNC); - - if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { - blk_set_rl_full(rl, BLK_RW_SYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_SYNC); - wake_up(&rl->wait[BLK_RW_SYNC]); - } - - if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { - blk_set_rl_full(rl, BLK_RW_ASYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_ASYNC); - wake_up(&rl->wait[BLK_RW_ASYNC]); - } - } - - spin_unlock_irq(q->queue_lock); - return 0; -} - -/** - * __get_request - get a free request - * @rl: request list to allocate from - * @op: operation and flags - * @bio: bio to allocate request for (can be %NULL) - * @flags: BLQ_MQ_REQ_* flags - * @gfp_mask: allocator flags - * - * Get a free request from @q. This function may fail under memory - * pressure or if @q is dead. - * - * Must be called with @q->queue_lock held and, - * Returns ERR_PTR on failure, with @q->queue_lock held. - * Returns request pointer on success, with @q->queue_lock *not held*. - */ -static struct request *__get_request(struct request_list *rl, unsigned int op, - struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp_mask) -{ - struct request_queue *q = rl->q; - struct request *rq; - struct elevator_type *et = q->elevator->type; - struct io_context *ioc = rq_ioc(bio); - struct io_cq *icq = NULL; - const bool is_sync = op_is_sync(op); - int may_queue; - req_flags_t rq_flags = RQF_ALLOCED; - - lockdep_assert_held(q->queue_lock); - - if (unlikely(blk_queue_dying(q))) - return ERR_PTR(-ENODEV); - - may_queue = elv_may_queue(q, op); - if (may_queue == ELV_MQUEUE_NO) - goto rq_starved; - - if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { - if (rl->count[is_sync]+1 >= q->nr_requests) { - /* - * The queue will fill after this allocation, so set - * it as full, and mark this process as "batching". - * This process will be allowed to complete a batch of - * requests, others will be blocked. - */ - if (!blk_rl_full(rl, is_sync)) { - ioc_set_batching(q, ioc); - blk_set_rl_full(rl, is_sync); - } else { - if (may_queue != ELV_MQUEUE_MUST - && !ioc_batching(q, ioc)) { - /* - * The queue is full and the allocating - * process is not a "batcher", and not - * exempted by the IO scheduler - */ - return ERR_PTR(-ENOMEM); - } - } - } - blk_set_congested(rl, is_sync); - } - - /* - * Only allow batching queuers to allocate up to 50% over the defined - * limit of requests, otherwise we could have thousands of requests - * allocated with any setting of ->nr_requests - */ - if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) - return ERR_PTR(-ENOMEM); - - q->nr_rqs[is_sync]++; - rl->count[is_sync]++; - rl->starved[is_sync] = 0; - - /* - * Decide whether the new request will be managed by elevator. If - * so, mark @rq_flags and increment elvpriv. Non-zero elvpriv will - * prevent the current elevator from being destroyed until the new - * request is freed. This guarantees icq's won't be destroyed and - * makes creating new ones safe. - * - * Flush requests do not use the elevator so skip initialization. - * This allows a request to share the flush and elevator data. - * - * Also, lookup icq while holding queue_lock. If it doesn't exist, - * it will be created after releasing queue_lock. - */ - if (!op_is_flush(op) && !blk_queue_bypass(q)) { - rq_flags |= RQF_ELVPRIV; - q->nr_rqs_elvpriv++; - if (et->icq_cache && ioc) - icq = ioc_lookup_icq(ioc, q); - } - - if (blk_queue_io_stat(q)) - rq_flags |= RQF_IO_STAT; - spin_unlock_irq(q->queue_lock); - - /* allocate and init request */ - rq = mempool_alloc(rl->rq_pool, gfp_mask); - if (!rq) - goto fail_alloc; - - blk_rq_init(q, rq); - blk_rq_set_rl(rq, rl); - rq->cmd_flags = op; - rq->rq_flags = rq_flags; - if (flags & BLK_MQ_REQ_PREEMPT) - rq->rq_flags |= RQF_PREEMPT; - - /* init elvpriv */ - if (rq_flags & RQF_ELVPRIV) { - if (unlikely(et->icq_cache && !icq)) { - if (ioc) - icq = ioc_create_icq(ioc, q, gfp_mask); - if (!icq) - goto fail_elvpriv; - } - - rq->elv.icq = icq; - if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) - goto fail_elvpriv; - - /* @rq->elv.icq holds io_context until @rq is freed */ - if (icq) - get_io_context(icq->ioc); - } -out: - /* - * ioc may be NULL here, and ioc_batching will be false. That's - * OK, if the queue is under the request limit then requests need - * not count toward the nr_batch_requests limit. There will always - * be some limit enforced by BLK_BATCH_TIME. - */ - if (ioc_batching(q, ioc)) - ioc->nr_batch_requests--; - - trace_block_getrq(q, bio, op); - return rq; - -fail_elvpriv: - /* - * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed - * and may fail indefinitely under memory pressure and thus - * shouldn't stall IO. Treat this request as !elvpriv. This will - * disturb iosched and blkcg but weird is bettern than dead. - */ - printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n", - __func__, dev_name(q->backing_dev_info->dev)); - - rq->rq_flags &= ~RQF_ELVPRIV; - rq->elv.icq = NULL; - - spin_lock_irq(q->queue_lock); - q->nr_rqs_elvpriv--; - spin_unlock_irq(q->queue_lock); - goto out; - -fail_alloc: - /* - * Allocation failed presumably due to memory. Undo anything we - * might have messed up. - * - * Allocating task should really be put onto the front of the wait - * queue, but this is pretty rare. - */ - spin_lock_irq(q->queue_lock); - freed_request(rl, is_sync, rq_flags); - - /* - * in the very unlikely event that allocation failed and no - * requests for this direction was pending, mark us starved so that - * freeing of a request in the other direction will notice - * us. another possible fix would be to split the rq mempool into - * READ and WRITE - */ -rq_starved: - if (unlikely(rl->count[is_sync] == 0)) - rl->starved[is_sync] = 1; - return ERR_PTR(-ENOMEM); -} - -/** - * get_request - get a free request - * @q: request_queue to allocate request from - * @op: operation and flags - * @bio: bio to allocate request for (can be %NULL) - * @flags: BLK_MQ_REQ_* flags. - * @gfp: allocator flags - * - * Get a free request from @q. If %BLK_MQ_REQ_NOWAIT is set in @flags, - * this function keeps retrying under memory pressure and fails iff @q is dead. - * - * Must be called with @q->queue_lock held and, - * Returns ERR_PTR on failure, with @q->queue_lock held. - * Returns request pointer on success, with @q->queue_lock *not held*. - */ -static struct request *get_request(struct request_queue *q, unsigned int op, - struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp) -{ - const bool is_sync = op_is_sync(op); - DEFINE_WAIT(wait); - struct request_list *rl; - struct request *rq; - - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - rl = blk_get_rl(q, bio); /* transferred to @rq on success */ -retry: - rq = __get_request(rl, op, bio, flags, gfp); - if (!IS_ERR(rq)) - return rq; - - if (op & REQ_NOWAIT) { - blk_put_rl(rl); - return ERR_PTR(-EAGAIN); - } - - if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) { - blk_put_rl(rl); - return rq; - } - - /* wait on @rl and retry */ - prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, - TASK_UNINTERRUPTIBLE); - - trace_block_sleeprq(q, bio, op); - - spin_unlock_irq(q->queue_lock); - io_schedule(); - - /* - * After sleeping, we become a "batching" process and will be able - * to allocate at least one request, and up to a big batch of them - * for a small period time. See ioc_batching, ioc_set_batching - */ - ioc_set_batching(q, current->io_context); - - spin_lock_irq(q->queue_lock); - finish_wait(&rl->wait[is_sync], &wait); - - goto retry; -} - -/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */ -static struct request *blk_old_get_request(struct request_queue *q, - unsigned int op, blk_mq_req_flags_t flags) -{ - struct request *rq; - gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : GFP_NOIO; - int ret = 0; - - WARN_ON_ONCE(q->mq_ops); - - /* create ioc upfront */ - create_io_context(gfp_mask, q->node); - - ret = blk_queue_enter(q, flags); - if (ret) - return ERR_PTR(ret); - spin_lock_irq(q->queue_lock); - rq = get_request(q, op, NULL, flags, gfp_mask); - if (IS_ERR(rq)) { - spin_unlock_irq(q->queue_lock); - blk_queue_exit(q); - return rq; - } - - /* q->queue_lock is unlocked at this point */ - rq->__data_len = 0; - rq->__sector = (sector_t) -1; - rq->bio = rq->biotail = NULL; - return rq; -} - /** * blk_get_request - allocate a request * @q: request queue to allocate a request for @@ -1623,170 +576,17 @@ struct request *blk_get_request(struct request_queue *q, unsigned int op, WARN_ON_ONCE(op & REQ_NOWAIT); WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); - if (q->mq_ops) { - req = blk_mq_alloc_request(q, op, flags); - if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) - q->mq_ops->initialize_rq_fn(req); - } else { - req = blk_old_get_request(q, op, flags); - if (!IS_ERR(req) && q->initialize_rq_fn) - q->initialize_rq_fn(req); - } + req = blk_mq_alloc_request(q, op, flags); + if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) + q->mq_ops->initialize_rq_fn(req); return req; } EXPORT_SYMBOL(blk_get_request); -/** - * blk_requeue_request - put a request back on queue - * @q: request queue where request should be inserted - * @rq: request to be inserted - * - * Description: - * Drivers often keep queueing requests until the hardware cannot accept - * more, when that condition happens we need to put the request back - * on the queue. Must be called with queue lock held. - */ -void blk_requeue_request(struct request_queue *q, struct request *rq) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - blk_delete_timer(rq); - blk_clear_rq_complete(rq); - trace_block_rq_requeue(q, rq); - rq_qos_requeue(q, rq); - - if (rq->rq_flags & RQF_QUEUED) - blk_queue_end_tag(q, rq); - - BUG_ON(blk_queued_rq(rq)); - - elv_requeue_request(q, rq); -} -EXPORT_SYMBOL(blk_requeue_request); - -static void add_acct_request(struct request_queue *q, struct request *rq, - int where) -{ - blk_account_io_start(rq, true); - __elv_add_request(q, rq, where); -} - -static void part_round_stats_single(struct request_queue *q, int cpu, - struct hd_struct *part, unsigned long now, - unsigned int inflight) -{ - if (inflight) { - __part_stat_add(cpu, part, time_in_queue, - inflight * (now - part->stamp)); - __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); - } - part->stamp = now; -} - -/** - * part_round_stats() - Round off the performance stats on a struct disk_stats. - * @q: target block queue - * @cpu: cpu number for stats access - * @part: target partition - * - * The average IO queue length and utilisation statistics are maintained - * by observing the current state of the queue length and the amount of - * time it has been in this state for. - * - * Normally, that accounting is done on IO completion, but that can result - * in more than a second's worth of IO being accounted for within any one - * second, leading to >100% utilisation. To deal with that, we call this - * function to do a round-off before returning the results when reading - * /proc/diskstats. This accounts immediately for all queue usage up to - * the current jiffies and restarts the counters again. - */ -void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part) -{ - struct hd_struct *part2 = NULL; - unsigned long now = jiffies; - unsigned int inflight[2]; - int stats = 0; - - if (part->stamp != now) - stats |= 1; - - if (part->partno) { - part2 = &part_to_disk(part)->part0; - if (part2->stamp != now) - stats |= 2; - } - - if (!stats) - return; - - part_in_flight(q, part, inflight); - - if (stats & 2) - part_round_stats_single(q, cpu, part2, now, inflight[1]); - if (stats & 1) - part_round_stats_single(q, cpu, part, now, inflight[0]); -} -EXPORT_SYMBOL_GPL(part_round_stats); - -void __blk_put_request(struct request_queue *q, struct request *req) -{ - req_flags_t rq_flags = req->rq_flags; - - if (unlikely(!q)) - return; - - if (q->mq_ops) { - blk_mq_free_request(req); - return; - } - - lockdep_assert_held(q->queue_lock); - - blk_req_zone_write_unlock(req); - blk_pm_put_request(req); - blk_pm_mark_last_busy(req); - - elv_completed_request(q, req); - - /* this is a bio leak */ - WARN_ON(req->bio != NULL); - - rq_qos_done(q, req); - - /* - * Request may not have originated from ll_rw_blk. if not, - * it didn't come out of our reserved rq pools - */ - if (rq_flags & RQF_ALLOCED) { - struct request_list *rl = blk_rq_rl(req); - bool sync = op_is_sync(req->cmd_flags); - - BUG_ON(!list_empty(&req->queuelist)); - BUG_ON(ELV_ON_HASH(req)); - - blk_free_request(rl, req); - freed_request(rl, sync, rq_flags); - blk_put_rl(rl); - blk_queue_exit(q); - } -} -EXPORT_SYMBOL_GPL(__blk_put_request); - void blk_put_request(struct request *req) { - struct request_queue *q = req->q; - - if (q->mq_ops) - blk_mq_free_request(req); - else { - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - __blk_put_request(q, req); - spin_unlock_irqrestore(q->queue_lock, flags); - } + blk_mq_free_request(req); } EXPORT_SYMBOL(blk_put_request); @@ -1806,7 +606,6 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req, req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; - req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); blk_account_io_start(req, false); return true; @@ -1830,7 +629,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, req->__sector = bio->bi_iter.bi_sector; req->__data_len += bio->bi_iter.bi_size; - req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); blk_account_io_start(req, false); return true; @@ -1850,7 +648,6 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; - req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); req->nr_phys_segments = segments + 1; blk_account_io_start(req, false); @@ -1883,7 +680,6 @@ no_merge: * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count, struct request **same_queue_rq) { struct blk_plug *plug; @@ -1893,25 +689,19 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, plug = current->plug; if (!plug) return false; - *request_count = 0; - if (q->mq_ops) - plug_list = &plug->mq_list; - else - plug_list = &plug->list; + plug_list = &plug->mq_list; list_for_each_entry_reverse(rq, plug_list, queuelist) { bool merged = false; - if (rq->q == q) { - (*request_count)++; + if (rq->q == q && same_queue_rq) { /* * Only blk-mq multiple hardware queues case checks the * rq in the same queue, there should be only one such * rq in a queue **/ - if (same_queue_rq) - *same_queue_rq = rq; + *same_queue_rq = rq; } if (rq->q != q || !blk_rq_merge_ok(rq, bio)) @@ -1938,176 +728,18 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, return false; } -unsigned int blk_plug_queued_count(struct request_queue *q) -{ - struct blk_plug *plug; - struct request *rq; - struct list_head *plug_list; - unsigned int ret = 0; - - plug = current->plug; - if (!plug) - goto out; - - if (q->mq_ops) - plug_list = &plug->mq_list; - else - plug_list = &plug->list; - - list_for_each_entry(rq, plug_list, queuelist) { - if (rq->q == q) - ret++; - } -out: - return ret; -} - void blk_init_request_from_bio(struct request *req, struct bio *bio) { - struct io_context *ioc = rq_ioc(bio); - if (bio->bi_opf & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; req->__sector = bio->bi_iter.bi_sector; - if (ioprio_valid(bio_prio(bio))) - req->ioprio = bio_prio(bio); - else if (ioc) - req->ioprio = ioc->ioprio; - else - req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + req->ioprio = bio_prio(bio); req->write_hint = bio->bi_write_hint; blk_rq_bio_prep(req->q, req, bio); } EXPORT_SYMBOL_GPL(blk_init_request_from_bio); -static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) -{ - struct blk_plug *plug; - int where = ELEVATOR_INSERT_SORT; - struct request *req, *free; - unsigned int request_count = 0; - - /* - * low level driver can indicate that it wants pages above a - * certain limit bounced to low memory (ie for highmem, or even - * ISA dma in theory) - */ - blk_queue_bounce(q, &bio); - - blk_queue_split(q, &bio); - - if (!bio_integrity_prep(bio)) - return BLK_QC_T_NONE; - - if (op_is_flush(bio->bi_opf)) { - spin_lock_irq(q->queue_lock); - where = ELEVATOR_INSERT_FLUSH; - goto get_rq; - } - - /* - * Check if we can merge with the plugged list before grabbing - * any locks. - */ - if (!blk_queue_nomerges(q)) { - if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) - return BLK_QC_T_NONE; - } else - request_count = blk_plug_queued_count(q); - - spin_lock_irq(q->queue_lock); - - switch (elv_merge(q, &req, bio)) { - case ELEVATOR_BACK_MERGE: - if (!bio_attempt_back_merge(q, req, bio)) - break; - elv_bio_merged(q, req, bio); - free = attempt_back_merge(q, req); - if (free) - __blk_put_request(q, free); - else - elv_merged_request(q, req, ELEVATOR_BACK_MERGE); - goto out_unlock; - case ELEVATOR_FRONT_MERGE: - if (!bio_attempt_front_merge(q, req, bio)) - break; - elv_bio_merged(q, req, bio); - free = attempt_front_merge(q, req); - if (free) - __blk_put_request(q, free); - else - elv_merged_request(q, req, ELEVATOR_FRONT_MERGE); - goto out_unlock; - default: - break; - } - -get_rq: - rq_qos_throttle(q, bio, q->queue_lock); - - /* - * Grab a free request. This is might sleep but can not fail. - * Returns with the queue unlocked. - */ - blk_queue_enter_live(q); - req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO); - if (IS_ERR(req)) { - blk_queue_exit(q); - rq_qos_cleanup(q, bio); - if (PTR_ERR(req) == -ENOMEM) - bio->bi_status = BLK_STS_RESOURCE; - else - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - goto out_unlock; - } - - rq_qos_track(q, req, bio); - - /* - * After dropping the lock and possibly sleeping here, our request - * may now be mergeable after it had proven unmergeable (above). - * We don't worry about that case for efficiency. It won't happen - * often, and the elevators are able to handle it. - */ - blk_init_request_from_bio(req, bio); - - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) - req->cpu = raw_smp_processor_id(); - - plug = current->plug; - if (plug) { - /* - * If this is the first request added after a plug, fire - * of a plug trace. - * - * @request_count may become stale because of schedule - * out, so check plug list again. - */ - if (!request_count || list_empty(&plug->list)) - trace_block_plug(q); - else { - struct request *last = list_entry_rq(plug->list.prev); - if (request_count >= BLK_MAX_REQUEST_COUNT || - blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) { - blk_flush_plug_list(plug, false); - trace_block_plug(q); - } - } - list_add_tail(&req->queuelist, &plug->list); - blk_account_io_start(req, true); - } else { - spin_lock_irq(q->queue_lock); - add_acct_request(q, req, where); - __blk_run_queue(q); -out_unlock: - spin_unlock_irq(q->queue_lock); - } - - return BLK_QC_T_NONE; -} - static void handle_bad_sector(struct bio *bio, sector_t maxsector) { char b[BDEVNAME_SIZE]; @@ -2259,7 +891,7 @@ generic_make_request_checks(struct bio *bio) * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue is not a request based queue. */ - if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) + if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q)) goto not_supported; if (should_fail_bio(bio)) @@ -2289,6 +921,9 @@ generic_make_request_checks(struct bio *bio) } } + if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + bio->bi_opf &= ~REQ_HIPRI; + switch (bio_op(bio)) { case REQ_OP_DISCARD: if (!blk_queue_discard(q)) @@ -2561,17 +1196,6 @@ blk_qc_t submit_bio(struct bio *bio) } EXPORT_SYMBOL(submit_bio); -bool blk_poll(struct request_queue *q, blk_qc_t cookie) -{ - if (!q->poll_fn || !blk_qc_t_valid(cookie)) - return false; - - if (current->plug) - blk_flush_plug_list(current->plug, false); - return q->poll_fn(q, cookie); -} -EXPORT_SYMBOL_GPL(blk_poll); - /** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for new the queue limits @@ -2619,8 +1243,7 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, */ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) { - unsigned long flags; - int where = ELEVATOR_INSERT_BACK; + blk_qc_t unused; if (blk_cloned_rq_check_limits(q, rq)) return BLK_STS_IOERR; @@ -2629,38 +1252,15 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; - if (q->mq_ops) { - if (blk_queue_io_stat(q)) - blk_account_io_start(rq, true); - /* - * Since we have a scheduler attached on the top device, - * bypass a potential scheduler on the bottom device for - * insert. - */ - return blk_mq_request_issue_directly(rq); - } - - spin_lock_irqsave(q->queue_lock, flags); - if (unlikely(blk_queue_dying(q))) { - spin_unlock_irqrestore(q->queue_lock, flags); - return BLK_STS_IOERR; - } + if (blk_queue_io_stat(q)) + blk_account_io_start(rq, true); /* - * Submitting request must be dequeued before calling this function - * because it will be linked to another request_queue + * Since we have a scheduler attached on the top device, + * bypass a potential scheduler on the bottom device for + * insert. */ - BUG_ON(blk_queued_rq(rq)); - - if (op_is_flush(rq->cmd_flags)) - where = ELEVATOR_INSERT_FLUSH; - - add_acct_request(q, rq, where); - if (where == ELEVATOR_INSERT_FLUSH) - __blk_run_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); - - return BLK_STS_OK; + return blk_mq_try_issue_directly(rq->mq_hctx, rq, &unused, true, true); } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); @@ -2710,11 +1310,10 @@ void blk_account_io_completion(struct request *req, unsigned int bytes) if (blk_do_io_stat(req)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; - int cpu; - cpu = part_stat_lock(); + part_stat_lock(); part = req->part; - part_stat_add(cpu, part, sectors[sgrp], bytes >> 9); + part_stat_add(part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } @@ -2729,14 +1328,14 @@ void blk_account_io_done(struct request *req, u64 now) if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; - int cpu; - cpu = part_stat_lock(); + part_stat_lock(); part = req->part; - part_stat_inc(cpu, part, ios[sgrp]); - part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns); - part_round_stats(req->q, cpu, part); + update_io_ticks(part, jiffies); + part_stat_inc(part, ios[sgrp]); + part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); + part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns)); part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); @@ -2748,16 +1347,15 @@ void blk_account_io_start(struct request *rq, bool new_io) { struct hd_struct *part; int rw = rq_data_dir(rq); - int cpu; if (!blk_do_io_stat(rq)) return; - cpu = part_stat_lock(); + part_stat_lock(); if (!new_io) { part = rq->part; - part_stat_inc(cpu, part, merges[rw]); + part_stat_inc(part, merges[rw]); } else { part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); if (!hd_struct_try_get(part)) { @@ -2772,233 +1370,15 @@ void blk_account_io_start(struct request *rq, bool new_io) part = &rq->rq_disk->part0; hd_struct_get(part); } - part_round_stats(rq->q, cpu, part); part_inc_in_flight(rq->q, part, rw); rq->part = part; } + update_io_ticks(part, jiffies); + part_stat_unlock(); } -static struct request *elv_next_request(struct request_queue *q) -{ - struct request *rq; - struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); - - WARN_ON_ONCE(q->mq_ops); - - while (1) { - list_for_each_entry(rq, &q->queue_head, queuelist) { -#ifdef CONFIG_PM - /* - * If a request gets queued in state RPM_SUSPENDED - * then that's a kernel bug. - */ - WARN_ON_ONCE(q->rpm_status == RPM_SUSPENDED); -#endif - return rq; - } - - /* - * Flush request is running and flush request isn't queueable - * in the drive, we can hold the queue till flush request is - * finished. Even we don't do this, driver can't dispatch next - * requests and will requeue them. And this can improve - * throughput too. For example, we have request flush1, write1, - * flush 2. flush1 is dispatched, then queue is hold, write1 - * isn't inserted to queue. After flush1 is finished, flush2 - * will be dispatched. Since disk cache is already clean, - * flush2 will be finished very soon, so looks like flush2 is - * folded to flush1. - * Since the queue is hold, a flag is set to indicate the queue - * should be restarted later. Please see flush_end_io() for - * details. - */ - if (fq->flush_pending_idx != fq->flush_running_idx && - !queue_flush_queueable(q)) { - fq->flush_queue_delayed = 1; - return NULL; - } - if (unlikely(blk_queue_bypass(q)) || - !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) - return NULL; - } -} - -/** - * blk_peek_request - peek at the top of a request queue - * @q: request queue to peek at - * - * Description: - * Return the request at the top of @q. The returned request - * should be started using blk_start_request() before LLD starts - * processing it. - * - * Return: - * Pointer to the request at the top of @q if available. Null - * otherwise. - */ -struct request *blk_peek_request(struct request_queue *q) -{ - struct request *rq; - int ret; - - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - while ((rq = elv_next_request(q)) != NULL) { - if (!(rq->rq_flags & RQF_STARTED)) { - /* - * This is the first time the device driver - * sees this request (possibly after - * requeueing). Notify IO scheduler. - */ - if (rq->rq_flags & RQF_SORTED) - elv_activate_rq(q, rq); - - /* - * just mark as started even if we don't start - * it, a request that has been delayed should - * not be passed by new incoming requests - */ - rq->rq_flags |= RQF_STARTED; - trace_block_rq_issue(q, rq); - } - - if (!q->boundary_rq || q->boundary_rq == rq) { - q->end_sector = rq_end_sector(rq); - q->boundary_rq = NULL; - } - - if (rq->rq_flags & RQF_DONTPREP) - break; - - if (q->dma_drain_size && blk_rq_bytes(rq)) { - /* - * make sure space for the drain appears we - * know we can do this because max_hw_segments - * has been adjusted to be one fewer than the - * device can handle - */ - rq->nr_phys_segments++; - } - - if (!q->prep_rq_fn) - break; - - ret = q->prep_rq_fn(q, rq); - if (ret == BLKPREP_OK) { - break; - } else if (ret == BLKPREP_DEFER) { - /* - * the request may have been (partially) prepped. - * we need to keep this request in the front to - * avoid resource deadlock. RQF_STARTED will - * prevent other fs requests from passing this one. - */ - if (q->dma_drain_size && blk_rq_bytes(rq) && - !(rq->rq_flags & RQF_DONTPREP)) { - /* - * remove the space for the drain we added - * so that we don't add it again - */ - --rq->nr_phys_segments; - } - - rq = NULL; - break; - } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) { - rq->rq_flags |= RQF_QUIET; - /* - * Mark this request as started so we don't trigger - * any debug logic in the end I/O path. - */ - blk_start_request(rq); - __blk_end_request_all(rq, ret == BLKPREP_INVALID ? - BLK_STS_TARGET : BLK_STS_IOERR); - } else { - printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); - break; - } - } - - return rq; -} -EXPORT_SYMBOL(blk_peek_request); - -static void blk_dequeue_request(struct request *rq) -{ - struct request_queue *q = rq->q; - - BUG_ON(list_empty(&rq->queuelist)); - BUG_ON(ELV_ON_HASH(rq)); - - list_del_init(&rq->queuelist); - - /* - * the time frame between a request being removed from the lists - * and to it is freed is accounted as io that is in progress at - * the driver side. - */ - if (blk_account_rq(rq)) - q->in_flight[rq_is_sync(rq)]++; -} - -/** - * blk_start_request - start request processing on the driver - * @req: request to dequeue - * - * Description: - * Dequeue @req and start timeout timer on it. This hands off the - * request to the driver. - */ -void blk_start_request(struct request *req) -{ - lockdep_assert_held(req->q->queue_lock); - WARN_ON_ONCE(req->q->mq_ops); - - blk_dequeue_request(req); - - if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { - req->io_start_time_ns = ktime_get_ns(); -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - req->throtl_size = blk_rq_sectors(req); -#endif - req->rq_flags |= RQF_STATS; - rq_qos_issue(req->q, req); - } - - BUG_ON(blk_rq_is_complete(req)); - blk_add_timer(req); -} -EXPORT_SYMBOL(blk_start_request); - -/** - * blk_fetch_request - fetch a request from a request queue - * @q: request queue to fetch a request from - * - * Description: - * Return the request at the top of @q. The request is started on - * return and LLD can start processing it immediately. - * - * Return: - * Pointer to the request at the top of @q if available. Null - * otherwise. - */ -struct request *blk_fetch_request(struct request_queue *q) -{ - struct request *rq; - - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - rq = blk_peek_request(q); - if (rq) - blk_start_request(rq); - return rq; -} -EXPORT_SYMBOL(blk_fetch_request); - /* * Steal bios from a request and add them to a bio list. * The request must not have been partially completed before. @@ -3124,255 +1504,6 @@ bool blk_update_request(struct request *req, blk_status_t error, } EXPORT_SYMBOL_GPL(blk_update_request); -static bool blk_update_bidi_request(struct request *rq, blk_status_t error, - unsigned int nr_bytes, - unsigned int bidi_bytes) -{ - if (blk_update_request(rq, error, nr_bytes)) - return true; - - /* Bidi request must be completed as a whole */ - if (unlikely(blk_bidi_rq(rq)) && - blk_update_request(rq->next_rq, error, bidi_bytes)) - return true; - - if (blk_queue_add_random(rq->q)) - add_disk_randomness(rq->rq_disk); - - return false; -} - -/** - * blk_unprep_request - unprepare a request - * @req: the request - * - * This function makes a request ready for complete resubmission (or - * completion). It happens only after all error handling is complete, - * so represents the appropriate moment to deallocate any resources - * that were allocated to the request in the prep_rq_fn. The queue - * lock is held when calling this. - */ -void blk_unprep_request(struct request *req) -{ - struct request_queue *q = req->q; - - req->rq_flags &= ~RQF_DONTPREP; - if (q->unprep_rq_fn) - q->unprep_rq_fn(q, req); -} -EXPORT_SYMBOL_GPL(blk_unprep_request); - -void blk_finish_request(struct request *req, blk_status_t error) -{ - struct request_queue *q = req->q; - u64 now = ktime_get_ns(); - - lockdep_assert_held(req->q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - if (req->rq_flags & RQF_STATS) - blk_stat_add(req, now); - - if (req->rq_flags & RQF_QUEUED) - blk_queue_end_tag(q, req); - - BUG_ON(blk_queued_rq(req)); - - if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req)) - laptop_io_completion(req->q->backing_dev_info); - - blk_delete_timer(req); - - if (req->rq_flags & RQF_DONTPREP) - blk_unprep_request(req); - - blk_account_io_done(req, now); - - if (req->end_io) { - rq_qos_done(q, req); - req->end_io(req, error); - } else { - if (blk_bidi_rq(req)) - __blk_put_request(req->next_rq->q, req->next_rq); - - __blk_put_request(q, req); - } -} -EXPORT_SYMBOL(blk_finish_request); - -/** - * blk_end_bidi_request - Complete a bidi request - * @rq: the request to complete - * @error: block status code - * @nr_bytes: number of bytes to complete @rq - * @bidi_bytes: number of bytes to complete @rq->next_rq - * - * Description: - * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. - * Drivers that supports bidi can safely call this member for any - * type of request, bidi or uni. In the later case @bidi_bytes is - * just ignored. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - **/ -static bool blk_end_bidi_request(struct request *rq, blk_status_t error, - unsigned int nr_bytes, unsigned int bidi_bytes) -{ - struct request_queue *q = rq->q; - unsigned long flags; - - WARN_ON_ONCE(q->mq_ops); - - if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) - return true; - - spin_lock_irqsave(q->queue_lock, flags); - blk_finish_request(rq, error); - spin_unlock_irqrestore(q->queue_lock, flags); - - return false; -} - -/** - * __blk_end_bidi_request - Complete a bidi request with queue lock held - * @rq: the request to complete - * @error: block status code - * @nr_bytes: number of bytes to complete @rq - * @bidi_bytes: number of bytes to complete @rq->next_rq - * - * Description: - * Identical to blk_end_bidi_request() except that queue lock is - * assumed to be locked on entry and remains so on return. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - **/ -static bool __blk_end_bidi_request(struct request *rq, blk_status_t error, - unsigned int nr_bytes, unsigned int bidi_bytes) -{ - lockdep_assert_held(rq->q->queue_lock); - WARN_ON_ONCE(rq->q->mq_ops); - - if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) - return true; - - blk_finish_request(rq, error); - - return false; -} - -/** - * blk_end_request - Helper function for drivers to complete the request. - * @rq: the request being processed - * @error: block status code - * @nr_bytes: number of bytes to complete - * - * Description: - * Ends I/O on a number of bytes attached to @rq. - * If @rq has leftover, sets it up for the next range of segments. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - **/ -bool blk_end_request(struct request *rq, blk_status_t error, - unsigned int nr_bytes) -{ - WARN_ON_ONCE(rq->q->mq_ops); - return blk_end_bidi_request(rq, error, nr_bytes, 0); -} -EXPORT_SYMBOL(blk_end_request); - -/** - * blk_end_request_all - Helper function for drives to finish the request. - * @rq: the request to finish - * @error: block status code - * - * Description: - * Completely finish @rq. - */ -void blk_end_request_all(struct request *rq, blk_status_t error) -{ - bool pending; - unsigned int bidi_bytes = 0; - - if (unlikely(blk_bidi_rq(rq))) - bidi_bytes = blk_rq_bytes(rq->next_rq); - - pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); - BUG_ON(pending); -} -EXPORT_SYMBOL(blk_end_request_all); - -/** - * __blk_end_request - Helper function for drivers to complete the request. - * @rq: the request being processed - * @error: block status code - * @nr_bytes: number of bytes to complete - * - * Description: - * Must be called with queue lock held unlike blk_end_request(). - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - **/ -bool __blk_end_request(struct request *rq, blk_status_t error, - unsigned int nr_bytes) -{ - lockdep_assert_held(rq->q->queue_lock); - WARN_ON_ONCE(rq->q->mq_ops); - - return __blk_end_bidi_request(rq, error, nr_bytes, 0); -} -EXPORT_SYMBOL(__blk_end_request); - -/** - * __blk_end_request_all - Helper function for drives to finish the request. - * @rq: the request to finish - * @error: block status code - * - * Description: - * Completely finish @rq. Must be called with queue lock held. - */ -void __blk_end_request_all(struct request *rq, blk_status_t error) -{ - bool pending; - unsigned int bidi_bytes = 0; - - lockdep_assert_held(rq->q->queue_lock); - WARN_ON_ONCE(rq->q->mq_ops); - - if (unlikely(blk_bidi_rq(rq))) - bidi_bytes = blk_rq_bytes(rq->next_rq); - - pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); - BUG_ON(pending); -} -EXPORT_SYMBOL(__blk_end_request_all); - -/** - * __blk_end_request_cur - Helper function to finish the current request chunk. - * @rq: the request to finish the current chunk for - * @error: block status code - * - * Description: - * Complete the current consecutively mapped chunk from @rq. Must - * be called with queue lock held. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - */ -bool __blk_end_request_cur(struct request *rq, blk_status_t error) -{ - return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); -} -EXPORT_SYMBOL(__blk_end_request_cur); - void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -3428,8 +1559,8 @@ EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); */ int blk_lld_busy(struct request_queue *q) { - if (q->lld_busy_fn) - return q->lld_busy_fn(q); + if (queue_is_mq(q) && q->mq_ops->busy) + return q->mq_ops->busy(q); return 0; } @@ -3460,7 +1591,6 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); */ static void __blk_rq_prep_clone(struct request *dst, struct request *src) { - dst->cpu = src->cpu; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); if (src->rq_flags & RQF_SPECIAL_PAYLOAD) { @@ -3572,9 +1702,11 @@ void blk_start_plug(struct blk_plug *plug) if (tsk->plug) return; - INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); + plug->rq_count = 0; + plug->multiple_queues = false; + /* * Store ordering should not be needed here, since a potential * preempt will imply a full memory barrier @@ -3583,36 +1715,6 @@ void blk_start_plug(struct blk_plug *plug) } EXPORT_SYMBOL(blk_start_plug); -static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) -{ - struct request *rqa = container_of(a, struct request, queuelist); - struct request *rqb = container_of(b, struct request, queuelist); - - return !(rqa->q < rqb->q || - (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb))); -} - -/* - * If 'from_schedule' is true, then postpone the dispatch of requests - * until a safe kblockd context. We due this to avoid accidental big - * additional stack usage in driver dispatch, in places where the originally - * plugger did not intend it. - */ -static void queue_unplugged(struct request_queue *q, unsigned int depth, - bool from_schedule) - __releases(q->queue_lock) -{ - lockdep_assert_held(q->queue_lock); - - trace_block_unplug(q, depth, !from_schedule); - - if (from_schedule) - blk_run_queue_async(q); - else - __blk_run_queue(q); - spin_unlock_irq(q->queue_lock); -} - static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) { LIST_HEAD(callbacks); @@ -3657,65 +1759,10 @@ EXPORT_SYMBOL(blk_check_plugged); void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { - struct request_queue *q; - struct request *rq; - LIST_HEAD(list); - unsigned int depth; - flush_plug_callbacks(plug, from_schedule); if (!list_empty(&plug->mq_list)) blk_mq_flush_plug_list(plug, from_schedule); - - if (list_empty(&plug->list)) - return; - - list_splice_init(&plug->list, &list); - - list_sort(NULL, &list, plug_rq_cmp); - - q = NULL; - depth = 0; - - while (!list_empty(&list)) { - rq = list_entry_rq(list.next); - list_del_init(&rq->queuelist); - BUG_ON(!rq->q); - if (rq->q != q) { - /* - * This drops the queue lock - */ - if (q) - queue_unplugged(q, depth, from_schedule); - q = rq->q; - depth = 0; - spin_lock_irq(q->queue_lock); - } - - /* - * Short-circuit if @q is dead - */ - if (unlikely(blk_queue_dying(q))) { - __blk_end_request_all(rq, BLK_STS_IOERR); - continue; - } - - /* - * rq is already accounted, so use raw insert - */ - if (op_is_flush(rq->cmd_flags)) - __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); - else - __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); - - depth++; - } - - /* - * This drops the queue lock - */ - if (q) - queue_unplugged(q, depth, from_schedule); } void blk_finish_plug(struct blk_plug *plug) @@ -3742,9 +1789,6 @@ int __init blk_dev_init(void) if (!kblockd_workqueue) panic("Failed to create kblockd\n"); - request_cachep = kmem_cache_create("blkdev_requests", - sizeof(struct request), 0, SLAB_PANIC, NULL); - blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); diff --git a/block/blk-exec.c b/block/blk-exec.c index f7b292f12449..a34b7d918742 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -48,8 +48,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, struct request *rq, int at_head, rq_end_io_fn *done) { - int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); @@ -60,23 +58,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * don't check dying flag for MQ because the request won't * be reused after dying flag is set */ - if (q->mq_ops) { - blk_mq_sched_insert_request(rq, at_head, true, false); - return; - } - - spin_lock_irq(q->queue_lock); - - if (unlikely(blk_queue_dying(q))) { - rq->rq_flags |= RQF_QUIET; - __blk_end_request_all(rq, BLK_STS_IOERR); - spin_unlock_irq(q->queue_lock); - return; - } - - __elv_add_request(q, rq, where); - __blk_run_queue(q); - spin_unlock_irq(q->queue_lock); + blk_mq_sched_insert_request(rq, at_head, true, false); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); diff --git a/block/blk-flush.c b/block/blk-flush.c index 8b44b86779da..a3fc7191c694 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -93,7 +93,7 @@ enum { FLUSH_PENDING_TIMEOUT = 5 * HZ, }; -static bool blk_kick_flush(struct request_queue *q, +static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, unsigned int flags); static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) @@ -132,18 +132,9 @@ static void blk_flush_restore_request(struct request *rq) rq->end_io = rq->flush.saved_end_io; } -static bool blk_flush_queue_rq(struct request *rq, bool add_front) +static void blk_flush_queue_rq(struct request *rq, bool add_front) { - if (rq->q->mq_ops) { - blk_mq_add_to_requeue_list(rq, add_front, true); - return false; - } else { - if (add_front) - list_add(&rq->queuelist, &rq->q->queue_head); - else - list_add_tail(&rq->queuelist, &rq->q->queue_head); - return true; - } + blk_mq_add_to_requeue_list(rq, add_front, true); } /** @@ -157,18 +148,17 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front) * completion and trigger the next step. * * CONTEXT: - * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) + * spin_lock_irq(fq->mq_flush_lock) * * RETURNS: * %true if requests were added to the dispatch queue, %false otherwise. */ -static bool blk_flush_complete_seq(struct request *rq, +static void blk_flush_complete_seq(struct request *rq, struct blk_flush_queue *fq, unsigned int seq, blk_status_t error) { struct request_queue *q = rq->q; struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; - bool queued = false, kicked; unsigned int cmd_flags; BUG_ON(rq->flush.seq & seq); @@ -191,7 +181,7 @@ static bool blk_flush_complete_seq(struct request *rq, case REQ_FSEQ_DATA: list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); - queued = blk_flush_queue_rq(rq, true); + blk_flush_queue_rq(rq, true); break; case REQ_FSEQ_DONE: @@ -204,42 +194,34 @@ static bool blk_flush_complete_seq(struct request *rq, BUG_ON(!list_empty(&rq->queuelist)); list_del_init(&rq->flush.list); blk_flush_restore_request(rq); - if (q->mq_ops) - blk_mq_end_request(rq, error); - else - __blk_end_request_all(rq, error); + blk_mq_end_request(rq, error); break; default: BUG(); } - kicked = blk_kick_flush(q, fq, cmd_flags); - return kicked | queued; + blk_kick_flush(q, fq, cmd_flags); } static void flush_end_io(struct request *flush_rq, blk_status_t error) { struct request_queue *q = flush_rq->q; struct list_head *running; - bool queued = false; struct request *rq, *n; unsigned long flags = 0; struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); + struct blk_mq_hw_ctx *hctx; - if (q->mq_ops) { - struct blk_mq_hw_ctx *hctx; - - /* release the tag's ownership to the req cloned from */ - spin_lock_irqsave(&fq->mq_flush_lock, flags); - hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); - if (!q->elevator) { - blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); - flush_rq->tag = -1; - } else { - blk_mq_put_driver_tag_hctx(hctx, flush_rq); - flush_rq->internal_tag = -1; - } + /* release the tag's ownership to the req cloned from */ + spin_lock_irqsave(&fq->mq_flush_lock, flags); + hctx = flush_rq->mq_hctx; + if (!q->elevator) { + blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); + flush_rq->tag = -1; + } else { + blk_mq_put_driver_tag_hctx(hctx, flush_rq); + flush_rq->internal_tag = -1; } running = &fq->flush_queue[fq->flush_running_idx]; @@ -248,35 +230,16 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) /* account completion of the flush request */ fq->flush_running_idx ^= 1; - if (!q->mq_ops) - elv_completed_request(q, flush_rq); - /* and push the waiting requests to the next stage */ list_for_each_entry_safe(rq, n, running, flush.list) { unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); - queued |= blk_flush_complete_seq(rq, fq, seq, error); + blk_flush_complete_seq(rq, fq, seq, error); } - /* - * Kick the queue to avoid stall for two cases: - * 1. Moving a request silently to empty queue_head may stall the - * queue. - * 2. When flush request is running in non-queueable queue, the - * queue is hold. Restart the queue after flush request is finished - * to avoid stall. - * This function is called from request completion path and calling - * directly into request_fn may confuse the driver. Always use - * kblockd. - */ - if (queued || fq->flush_queue_delayed) { - WARN_ON(q->mq_ops); - blk_run_queue_async(q); - } fq->flush_queue_delayed = 0; - if (q->mq_ops) - spin_unlock_irqrestore(&fq->mq_flush_lock, flags); + spin_unlock_irqrestore(&fq->mq_flush_lock, flags); } /** @@ -289,12 +252,10 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) * Please read the comment at the top of this file for more info. * * CONTEXT: - * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) + * spin_lock_irq(fq->mq_flush_lock) * - * RETURNS: - * %true if flush was issued, %false otherwise. */ -static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, +static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, unsigned int flags) { struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; @@ -304,7 +265,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, /* C1 described at the top of this file */ if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) - return false; + return; /* C2 and C3 * @@ -312,11 +273,10 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, * assigned to empty flushes, and we deadlock if we are expecting * other requests to make progress. Don't defer for that case. */ - if (!list_empty(&fq->flush_data_in_flight) && - !(q->mq_ops && q->elevator) && + if (!list_empty(&fq->flush_data_in_flight) && q->elevator && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) - return false; + return; /* * Issue flush and toggle pending_idx. This makes pending_idx @@ -334,19 +294,15 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, * In case of IO scheduler, flush rq need to borrow scheduler tag * just for cheating put/get driver tag. */ - if (q->mq_ops) { - struct blk_mq_hw_ctx *hctx; + flush_rq->mq_ctx = first_rq->mq_ctx; + flush_rq->mq_hctx = first_rq->mq_hctx; - flush_rq->mq_ctx = first_rq->mq_ctx; - - if (!q->elevator) { - fq->orig_rq = first_rq; - flush_rq->tag = first_rq->tag; - hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); - blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); - } else { - flush_rq->internal_tag = first_rq->internal_tag; - } + if (!q->elevator) { + fq->orig_rq = first_rq; + flush_rq->tag = first_rq->tag; + blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq); + } else { + flush_rq->internal_tag = first_rq->internal_tag; } flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; @@ -355,62 +311,17 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; - return blk_flush_queue_rq(flush_rq, false); -} - -static void flush_data_end_io(struct request *rq, blk_status_t error) -{ - struct request_queue *q = rq->q; - struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); - - lockdep_assert_held(q->queue_lock); - - /* - * Updating q->in_flight[] here for making this tag usable - * early. Because in blk_queue_start_tag(), - * q->in_flight[BLK_RW_ASYNC] is used to limit async I/O and - * reserve tags for sync I/O. - * - * More importantly this way can avoid the following I/O - * deadlock: - * - * - suppose there are 40 fua requests comming to flush queue - * and queue depth is 31 - * - 30 rqs are scheduled then blk_queue_start_tag() can't alloc - * tag for async I/O any more - * - all the 30 rqs are completed before FLUSH_PENDING_TIMEOUT - * and flush_data_end_io() is called - * - the other rqs still can't go ahead if not updating - * q->in_flight[BLK_RW_ASYNC] here, meantime these rqs - * are held in flush data queue and make no progress of - * handling post flush rq - * - only after the post flush rq is handled, all these rqs - * can be completed - */ - - elv_completed_request(q, rq); - - /* for avoiding double accounting */ - rq->rq_flags &= ~RQF_STARTED; - - /* - * After populating an empty queue, kick it to avoid stall. Read - * the comment in flush_end_io(). - */ - if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error)) - blk_run_queue_async(q); + blk_flush_queue_rq(flush_rq, false); } static void mq_flush_data_end_io(struct request *rq, blk_status_t error) { struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_mq_ctx *ctx = rq->mq_ctx; unsigned long flags; struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); - hctx = blk_mq_map_queue(q, ctx->cpu); - if (q->elevator) { WARN_ON(rq->tag < 0); blk_mq_put_driver_tag_hctx(hctx, rq); @@ -443,9 +354,6 @@ void blk_insert_flush(struct request *rq) unsigned int policy = blk_flush_policy(fflags, rq); struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); - if (!q->mq_ops) - lockdep_assert_held(q->queue_lock); - /* * @policy now records what operations need to be done. Adjust * REQ_PREFLUSH and FUA for the driver. @@ -468,10 +376,7 @@ void blk_insert_flush(struct request *rq) * complete the request. */ if (!policy) { - if (q->mq_ops) - blk_mq_end_request(rq, 0); - else - __blk_end_request(rq, 0, 0); + blk_mq_end_request(rq, 0); return; } @@ -484,10 +389,7 @@ void blk_insert_flush(struct request *rq) */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - if (q->mq_ops) - blk_mq_request_bypass_insert(rq, false); - else - list_add_tail(&rq->queuelist, &q->queue_head); + blk_mq_request_bypass_insert(rq, false); return; } @@ -499,17 +401,12 @@ void blk_insert_flush(struct request *rq) INIT_LIST_HEAD(&rq->flush.list); rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ - if (q->mq_ops) { - rq->end_io = mq_flush_data_end_io; - spin_lock_irq(&fq->mq_flush_lock); - blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); - spin_unlock_irq(&fq->mq_flush_lock); - return; - } - rq->end_io = flush_data_end_io; + rq->end_io = mq_flush_data_end_io; + spin_lock_irq(&fq->mq_flush_lock); blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); + spin_unlock_irq(&fq->mq_flush_lock); } /** @@ -575,8 +472,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, if (!fq) goto fail; - if (q->mq_ops) - spin_lock_init(&fq->mq_flush_lock); + spin_lock_init(&fq->mq_flush_lock); rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); fq->flush_rq = kzalloc_node(rq_sz, flags, node); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 01580f88fcb3..5ed59ac6ae58 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -28,7 +28,6 @@ void get_io_context(struct io_context *ioc) BUG_ON(atomic_long_read(&ioc->refcount) <= 0); atomic_long_inc(&ioc->refcount); } -EXPORT_SYMBOL(get_io_context); static void icq_free_icq_rcu(struct rcu_head *head) { @@ -48,10 +47,8 @@ static void ioc_exit_icq(struct io_cq *icq) if (icq->flags & ICQ_EXITED) return; - if (et->uses_mq && et->ops.mq.exit_icq) - et->ops.mq.exit_icq(icq); - else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn) - et->ops.sq.elevator_exit_icq_fn(icq); + if (et->ops.exit_icq) + et->ops.exit_icq(icq); icq->flags |= ICQ_EXITED; } @@ -113,9 +110,9 @@ static void ioc_release_fn(struct work_struct *work) struct io_cq, ioc_node); struct request_queue *q = icq->q; - if (spin_trylock(q->queue_lock)) { + if (spin_trylock(&q->queue_lock)) { ioc_destroy_icq(icq); - spin_unlock(q->queue_lock); + spin_unlock(&q->queue_lock); } else { spin_unlock_irqrestore(&ioc->lock, flags); cpu_relax(); @@ -162,7 +159,6 @@ void put_io_context(struct io_context *ioc) if (free_ioc) kmem_cache_free(iocontext_cachep, ioc); } -EXPORT_SYMBOL(put_io_context); /** * put_io_context_active - put active reference on ioc @@ -173,7 +169,6 @@ EXPORT_SYMBOL(put_io_context); */ void put_io_context_active(struct io_context *ioc) { - struct elevator_type *et; unsigned long flags; struct io_cq *icq; @@ -187,25 +182,12 @@ void put_io_context_active(struct io_context *ioc) * reverse double locking. Read comment in ioc_release_fn() for * explanation on the nested locking annotation. */ -retry: spin_lock_irqsave_nested(&ioc->lock, flags, 1); hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { if (icq->flags & ICQ_EXITED) continue; - et = icq->q->elevator->type; - if (et->uses_mq) { - ioc_exit_icq(icq); - } else { - if (spin_trylock(icq->q->queue_lock)) { - ioc_exit_icq(icq); - spin_unlock(icq->q->queue_lock); - } else { - spin_unlock_irqrestore(&ioc->lock, flags); - cpu_relax(); - goto retry; - } - } + ioc_exit_icq(icq); } spin_unlock_irqrestore(&ioc->lock, flags); @@ -232,7 +214,7 @@ static void __ioc_clear_queue(struct list_head *icq_list) while (!list_empty(icq_list)) { struct io_cq *icq = list_entry(icq_list->next, - struct io_cq, q_node); + struct io_cq, q_node); struct io_context *ioc = icq->ioc; spin_lock_irqsave(&ioc->lock, flags); @@ -251,16 +233,11 @@ void ioc_clear_queue(struct request_queue *q) { LIST_HEAD(icq_list); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); list_splice_init(&q->icq_list, &icq_list); + spin_unlock_irq(&q->queue_lock); - if (q->mq_ops) { - spin_unlock_irq(q->queue_lock); - __ioc_clear_queue(&icq_list); - } else { - __ioc_clear_queue(&icq_list); - spin_unlock_irq(q->queue_lock); - } + __ioc_clear_queue(&icq_list); } int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) @@ -336,7 +313,6 @@ struct io_context *get_task_io_context(struct task_struct *task, return NULL; } -EXPORT_SYMBOL(get_task_io_context); /** * ioc_lookup_icq - lookup io_cq from ioc @@ -350,7 +326,7 @@ struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) { struct io_cq *icq; - lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&q->queue_lock); /* * icq's are indexed from @ioc using radix tree and hint pointer, @@ -409,16 +385,14 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, INIT_HLIST_NODE(&icq->ioc_node); /* lock both q and ioc and try to link @icq */ - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); spin_lock(&ioc->lock); if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { hlist_add_head(&icq->ioc_node, &ioc->icq_list); list_add(&icq->q_node, &q->icq_list); - if (et->uses_mq && et->ops.mq.init_icq) - et->ops.mq.init_icq(icq); - else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn) - et->ops.sq.elevator_init_icq_fn(icq); + if (et->ops.init_icq) + et->ops.init_icq(icq); } else { kmem_cache_free(et->icq_cache, icq); icq = ioc_lookup_icq(ioc, q); @@ -427,7 +401,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, } spin_unlock(&ioc->lock); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); radix_tree_preload_end(); return icq; } diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 38c35c32aff2..fc714ef402a6 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -262,29 +262,25 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, stat->rqs.mean); } -static inline bool iolatency_may_queue(struct iolatency_grp *iolat, - wait_queue_entry_t *wait, - bool first_block) +static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data) { - struct rq_wait *rqw = &iolat->rq_wait; + atomic_dec(&rqw->inflight); + wake_up(&rqw->wait); +} - if (first_block && waitqueue_active(&rqw->wait) && - rqw->wait.head.next != &wait->entry) - return false; +static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data) +{ + struct iolatency_grp *iolat = private_data; return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth); } static void __blkcg_iolatency_throttle(struct rq_qos *rqos, struct iolatency_grp *iolat, - spinlock_t *lock, bool issue_as_root, + bool issue_as_root, bool use_memdelay) - __releases(lock) - __acquires(lock) { struct rq_wait *rqw = &iolat->rq_wait; unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); - DEFINE_WAIT(wait); - bool first_block = true; if (use_delay) blkcg_schedule_throttle(rqos->q, use_memdelay); @@ -301,27 +297,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos, return; } - if (iolatency_may_queue(iolat, &wait, first_block)) - return; - - do { - prepare_to_wait_exclusive(&rqw->wait, &wait, - TASK_UNINTERRUPTIBLE); - - if (iolatency_may_queue(iolat, &wait, first_block)) - break; - first_block = false; - - if (lock) { - spin_unlock_irq(lock); - io_schedule(); - spin_lock_irq(lock); - } else { - io_schedule(); - } - } while (1); - - finish_wait(&rqw->wait, &wait); + rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb); } #define SCALE_DOWN_FACTOR 2 @@ -478,38 +454,15 @@ static void check_scale_change(struct iolatency_grp *iolat) scale_change(iolat, direction > 0); } -static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio, - spinlock_t *lock) +static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) { struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - struct blkcg *blkcg; - struct blkcg_gq *blkg; - struct request_queue *q = rqos->q; + struct blkcg_gq *blkg = bio->bi_blkg; bool issue_as_root = bio_issue_as_root_blkg(bio); if (!blk_iolatency_enabled(blkiolat)) return; - rcu_read_lock(); - blkcg = bio_blkcg(bio); - bio_associate_blkcg(bio, &blkcg->css); - blkg = blkg_lookup(blkcg, q); - if (unlikely(!blkg)) { - if (!lock) - spin_lock_irq(q->queue_lock); - blkg = blkg_lookup_create(blkcg, q); - if (IS_ERR(blkg)) - blkg = NULL; - if (!lock) - spin_unlock_irq(q->queue_lock); - } - if (!blkg) - goto out; - - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); - bio_associate_blkg(bio, blkg); -out: - rcu_read_unlock(); while (blkg && blkg->parent) { struct iolatency_grp *iolat = blkg_to_lat(blkg); if (!iolat) { @@ -518,7 +471,7 @@ out: } check_scale_change(iolat); - __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root, + __blkcg_iolatency_throttle(rqos, iolat, issue_as_root, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); blkg = blkg->parent; } @@ -640,7 +593,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) bool enabled = false; blkg = bio->bi_blkg; - if (!blkg) + if (!blkg || !bio_flagged(bio, BIO_TRACKED)) return; iolat = blkg_to_lat(bio->bi_blkg); @@ -730,7 +683,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) * We could be exiting, don't access the pd unless we have a * ref on the blkg. */ - if (!blkg_try_get(blkg)) + if (!blkg_tryget(blkg)) continue; iolat = blkg_to_lat(blkg); diff --git a/block/blk-merge.c b/block/blk-merge.c index 7695034f4b87..e7f1c6cf0167 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -389,7 +389,6 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio) bio_set_flag(bio, BIO_SEG_VALID); } -EXPORT_SYMBOL(blk_recount_segments); static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, struct bio *nxt) @@ -596,17 +595,6 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, return ll_new_hw_segment(q, req, bio); } -/* - * blk-mq uses req->special to carry normal driver per-request payload, it - * does not indicate a prepared command that we cannot merge with. - */ -static bool req_no_special_merge(struct request *req) -{ - struct request_queue *q = req->q; - - return !q->mq_ops && req->special; -} - static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, struct request *next) { @@ -632,13 +620,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, unsigned int seg_size = req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size; - /* - * First check if the either of the requests are re-queued - * requests. Can't merge them if they are. - */ - if (req_no_special_merge(req) || req_no_special_merge(next)) - return 0; - if (req_gap_back_merge(req, next->bio)) return 0; @@ -703,12 +684,10 @@ static void blk_account_io_merge(struct request *req) { if (blk_do_io_stat(req)) { struct hd_struct *part; - int cpu; - cpu = part_stat_lock(); + part_stat_lock(); part = req->part; - part_round_stats(req->q, cpu, part); part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); @@ -731,7 +710,8 @@ static inline bool blk_discard_mergable(struct request *req) return false; } -enum elv_merge blk_try_req_merge(struct request *req, struct request *next) +static enum elv_merge blk_try_req_merge(struct request *req, + struct request *next) { if (blk_discard_mergable(req)) return ELEVATOR_DISCARD_MERGE; @@ -748,9 +728,6 @@ enum elv_merge blk_try_req_merge(struct request *req, struct request *next) static struct request *attempt_merge(struct request_queue *q, struct request *req, struct request *next) { - if (!q->mq_ops) - lockdep_assert_held(q->queue_lock); - if (!rq_mergeable(req) || !rq_mergeable(next)) return NULL; @@ -758,8 +735,7 @@ static struct request *attempt_merge(struct request_queue *q, return NULL; if (rq_data_dir(req) != rq_data_dir(next) - || req->rq_disk != next->rq_disk - || req_no_special_merge(next)) + || req->rq_disk != next->rq_disk) return NULL; if (req_op(req) == REQ_OP_WRITE_SAME && @@ -773,6 +749,9 @@ static struct request *attempt_merge(struct request_queue *q, if (req->write_hint != next->write_hint) return NULL; + if (req->ioprio != next->ioprio) + return NULL; + /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn @@ -828,10 +807,6 @@ static struct request *attempt_merge(struct request_queue *q, */ blk_account_io_merge(next); - req->ioprio = ioprio_best(req->ioprio, next->ioprio); - if (blk_rq_cpu_valid(next)) - req->cpu = next->cpu; - /* * ownership of bio passed from next to req, return 'next' for * the caller to free @@ -863,16 +838,11 @@ struct request *attempt_front_merge(struct request_queue *q, struct request *rq) int blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next) { - struct elevator_queue *e = q->elevator; struct request *free; - if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn) - if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next)) - return 0; - free = attempt_merge(q, rq, next); if (free) { - __blk_put_request(q, free); + blk_put_request(free); return 1; } @@ -891,8 +861,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (bio_data_dir(bio) != rq_data_dir(rq)) return false; - /* must be same device and not a special request */ - if (rq->rq_disk != bio->bi_disk || req_no_special_merge(rq)) + /* must be same device */ + if (rq->rq_disk != bio->bi_disk) return false; /* only merge integrity protected bio into ditto rq */ @@ -911,6 +881,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (rq->write_hint != bio->bi_write_hint) return false; + if (rq->ioprio != bio_prio(bio)) + return false; + return true; } diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 3eb169f15842..03a534820271 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -14,9 +14,10 @@ #include "blk.h" #include "blk-mq.h" -static int cpu_to_queue_index(unsigned int nr_queues, const int cpu) +static int cpu_to_queue_index(struct blk_mq_queue_map *qmap, + unsigned int nr_queues, const int cpu) { - return cpu % nr_queues; + return qmap->queue_offset + (cpu % nr_queues); } static int get_first_sibling(unsigned int cpu) @@ -30,10 +31,10 @@ static int get_first_sibling(unsigned int cpu) return cpu; } -int blk_mq_map_queues(struct blk_mq_tag_set *set) +int blk_mq_map_queues(struct blk_mq_queue_map *qmap) { - unsigned int *map = set->mq_map; - unsigned int nr_queues = set->nr_hw_queues; + unsigned int *map = qmap->mq_map; + unsigned int nr_queues = qmap->nr_queues; unsigned int cpu, first_sibling; for_each_possible_cpu(cpu) { @@ -44,11 +45,11 @@ int blk_mq_map_queues(struct blk_mq_tag_set *set) * performace optimizations. */ if (cpu < nr_queues) { - map[cpu] = cpu_to_queue_index(nr_queues, cpu); + map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu); } else { first_sibling = get_first_sibling(cpu); if (first_sibling == cpu) - map[cpu] = cpu_to_queue_index(nr_queues, cpu); + map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu); else map[cpu] = map[first_sibling]; } @@ -62,12 +63,12 @@ EXPORT_SYMBOL_GPL(blk_mq_map_queues); * We have no quick way of doing reverse lookups. This is only used at * queue init time, so runtime isn't important. */ -int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) +int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index) { int i; for_each_possible_cpu(i) { - if (index == mq_map[i]) + if (index == qmap->mq_map[i]) return local_memory_node(cpu_to_node(i)); } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 10b284a1f18d..90d68760af08 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -23,6 +23,7 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" +#include "blk-rq-qos.h" static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) { @@ -112,10 +113,8 @@ static int queue_pm_only_show(void *data, struct seq_file *m) #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name static const char *const blk_queue_flag_name[] = { - QUEUE_FLAG_NAME(QUEUED), QUEUE_FLAG_NAME(STOPPED), QUEUE_FLAG_NAME(DYING), - QUEUE_FLAG_NAME(BYPASS), QUEUE_FLAG_NAME(BIDI), QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), @@ -318,7 +317,6 @@ static const char *const cmd_flag_name[] = { static const char *const rqf_name[] = { RQF_NAME(SORTED), RQF_NAME(STARTED), - RQF_NAME(QUEUED), RQF_NAME(SOFTBARRIER), RQF_NAME(FLUSH_SEQ), RQF_NAME(MIXED_MERGE), @@ -424,15 +422,18 @@ struct show_busy_params { /* * Note: the state of a request may change while this function is in progress, - * e.g. due to a concurrent blk_mq_finish_request() call. + * e.g. due to a concurrent blk_mq_finish_request() call. Returns true to + * keep iterating requests. */ -static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved) +static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved) { const struct show_busy_params *params = data; - if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx) + if (rq->mq_hctx == params->hctx) __blk_mq_debugfs_rq_show(params->m, list_entry_rq(&rq->queuelist)); + + return true; } static int hctx_busy_show(void *data, struct seq_file *m) @@ -446,6 +447,21 @@ static int hctx_busy_show(void *data, struct seq_file *m) return 0; } +static const char *const hctx_types[] = { + [HCTX_TYPE_DEFAULT] = "default", + [HCTX_TYPE_READ] = "read", + [HCTX_TYPE_POLL] = "poll", +}; + +static int hctx_type_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + + BUILD_BUG_ON(ARRAY_SIZE(hctx_types) != HCTX_MAX_TYPES); + seq_printf(m, "%s\n", hctx_types[hctx->type]); + return 0; +} + static int hctx_ctx_map_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; @@ -636,36 +652,43 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m) return 0; } -static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) - __acquires(&ctx->lock) -{ - struct blk_mq_ctx *ctx = m->private; - - spin_lock(&ctx->lock); - return seq_list_start(&ctx->rq_list, *pos); +#define CTX_RQ_SEQ_OPS(name, type) \ +static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \ + __acquires(&ctx->lock) \ +{ \ + struct blk_mq_ctx *ctx = m->private; \ + \ + spin_lock(&ctx->lock); \ + return seq_list_start(&ctx->rq_lists[type], *pos); \ +} \ + \ +static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v, \ + loff_t *pos) \ +{ \ + struct blk_mq_ctx *ctx = m->private; \ + \ + return seq_list_next(v, &ctx->rq_lists[type], pos); \ +} \ + \ +static void ctx_##name##_rq_list_stop(struct seq_file *m, void *v) \ + __releases(&ctx->lock) \ +{ \ + struct blk_mq_ctx *ctx = m->private; \ + \ + spin_unlock(&ctx->lock); \ +} \ + \ +static const struct seq_operations ctx_##name##_rq_list_seq_ops = { \ + .start = ctx_##name##_rq_list_start, \ + .next = ctx_##name##_rq_list_next, \ + .stop = ctx_##name##_rq_list_stop, \ + .show = blk_mq_debugfs_rq_show, \ } -static void *ctx_rq_list_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct blk_mq_ctx *ctx = m->private; +CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT); +CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ); +CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL); - return seq_list_next(v, &ctx->rq_list, pos); -} - -static void ctx_rq_list_stop(struct seq_file *m, void *v) - __releases(&ctx->lock) -{ - struct blk_mq_ctx *ctx = m->private; - - spin_unlock(&ctx->lock); -} - -static const struct seq_operations ctx_rq_list_seq_ops = { - .start = ctx_rq_list_start, - .next = ctx_rq_list_next, - .stop = ctx_rq_list_stop, - .show = blk_mq_debugfs_rq_show, -}; static int ctx_dispatched_show(void *data, struct seq_file *m) { struct blk_mq_ctx *ctx = data; @@ -798,11 +821,14 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"run", 0600, hctx_run_show, hctx_run_write}, {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, + {"type", 0400, hctx_type_show}, {}, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { - {"rq_list", 0400, .seq_ops = &ctx_rq_list_seq_ops}, + {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops}, + {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops}, + {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops}, {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write}, {"merged", 0600, ctx_merged_show, ctx_merged_write}, {"completed", 0600, ctx_completed_show, ctx_completed_write}, @@ -856,6 +882,15 @@ int blk_mq_debugfs_register(struct request_queue *q) goto err; } + if (q->rq_qos) { + struct rq_qos *rqos = q->rq_qos; + + while (rqos) { + blk_mq_debugfs_register_rqos(rqos); + rqos = rqos->next; + } + } + return 0; err: @@ -978,6 +1013,50 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q) q->sched_debugfs_dir = NULL; } +void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) +{ + debugfs_remove_recursive(rqos->debugfs_dir); + rqos->debugfs_dir = NULL; +} + +int blk_mq_debugfs_register_rqos(struct rq_qos *rqos) +{ + struct request_queue *q = rqos->q; + const char *dir_name = rq_qos_id_to_name(rqos->id); + + if (!q->debugfs_dir) + return -ENOENT; + + if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs) + return 0; + + if (!q->rqos_debugfs_dir) { + q->rqos_debugfs_dir = debugfs_create_dir("rqos", + q->debugfs_dir); + if (!q->rqos_debugfs_dir) + return -ENOMEM; + } + + rqos->debugfs_dir = debugfs_create_dir(dir_name, + rqos->q->rqos_debugfs_dir); + if (!rqos->debugfs_dir) + return -ENOMEM; + + if (!debugfs_create_files(rqos->debugfs_dir, rqos, + rqos->ops->debugfs_attrs)) + goto err; + return 0; + err: + blk_mq_debugfs_unregister_rqos(rqos); + return -ENOMEM; +} + +void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) +{ + debugfs_remove_recursive(q->rqos_debugfs_dir); + q->rqos_debugfs_dir = NULL; +} + int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index a9160be12be0..8c9012a578c1 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -31,6 +31,10 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q); int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); + +int blk_mq_debugfs_register_rqos(struct rq_qos *rqos); +void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos); +void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q); #else static inline int blk_mq_debugfs_register(struct request_queue *q) { @@ -78,6 +82,19 @@ static inline int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) { } + +static inline int blk_mq_debugfs_register_rqos(struct rq_qos *rqos) +{ + return 0; +} + +static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) +{ +} + +static inline void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) +{ +} #endif #ifdef CONFIG_BLK_DEBUG_FS_ZONED diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index db644ec624f5..1dce18553984 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c @@ -31,26 +31,26 @@ * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. */ -int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev, +int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, int offset) { const struct cpumask *mask; unsigned int queue, cpu; - for (queue = 0; queue < set->nr_hw_queues; queue++) { + for (queue = 0; queue < qmap->nr_queues; queue++) { mask = pci_irq_get_affinity(pdev, queue + offset); if (!mask) goto fallback; for_each_cpu(cpu, mask) - set->mq_map[cpu] = queue; + qmap->mq_map[cpu] = qmap->queue_offset + queue; } return 0; fallback: - WARN_ON_ONCE(set->nr_hw_queues > 1); - blk_mq_clear_mq_map(set); + WARN_ON_ONCE(qmap->nr_queues > 1); + blk_mq_clear_mq_map(qmap); return 0; } EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c index 996167f1de18..45030a81a1ed 100644 --- a/block/blk-mq-rdma.c +++ b/block/blk-mq-rdma.c @@ -29,24 +29,24 @@ * @set->nr_hw_queues, or @dev does not provide an affinity mask for a * vector, we fallback to the naive mapping. */ -int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, +int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, struct ib_device *dev, int first_vec) { const struct cpumask *mask; unsigned int queue, cpu; - for (queue = 0; queue < set->nr_hw_queues; queue++) { + for (queue = 0; queue < map->nr_queues; queue++) { mask = ib_get_vector_affinity(dev, first_vec + queue); if (!mask) goto fallback; for_each_cpu(cpu, mask) - set->mq_map[cpu] = queue; + map->mq_map[cpu] = map->queue_offset + queue; } return 0; fallback: - return blk_mq_map_queues(set); + return blk_mq_map_queues(map); } EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 29bfe8017a2d..140933e4a7d1 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -31,15 +31,22 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); -void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio) +void blk_mq_sched_assign_ioc(struct request *rq) { struct request_queue *q = rq->q; - struct io_context *ioc = rq_ioc(bio); + struct io_context *ioc; struct io_cq *icq; - spin_lock_irq(q->queue_lock); + /* + * May not have an IO context if it's a passthrough request + */ + ioc = current->io_context; + if (!ioc) + return; + + spin_lock_irq(&q->queue_lock); icq = ioc_lookup_icq(ioc, q); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); if (!icq) { icq = ioc_create_icq(ioc, q, GFP_ATOMIC); @@ -54,13 +61,14 @@ void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio) * Mark a hardware queue as needing a restart. For shared queues, maintain * a count of how many hardware queues are marked for restart. */ -static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) +void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) return; set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } +EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { @@ -85,14 +93,13 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) do { struct request *rq; - if (e->type->ops.mq.has_work && - !e->type->ops.mq.has_work(hctx)) + if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) break; if (!blk_mq_get_dispatch_budget(hctx)) break; - rq = e->type->ops.mq.dispatch_request(hctx); + rq = e->type->ops.dispatch_request(hctx); if (!rq) { blk_mq_put_dispatch_budget(hctx); break; @@ -110,7 +117,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - unsigned idx = ctx->index_hw; + unsigned short idx = ctx->index_hw[hctx->type]; if (++idx == hctx->nr_ctx) idx = 0; @@ -163,7 +170,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; - const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; + const bool has_sched_dispatch = e && e->type->ops.dispatch_request; LIST_HEAD(rq_list); /* RCU or SRCU read lock is needed before checking quiesced flag */ @@ -295,11 +302,14 @@ EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge); * too much time checking for merges. */ static bool blk_mq_attempt_merge(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct bio *bio) { + enum hctx_type type = hctx->type; + lockdep_assert_held(&ctx->lock); - if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) { + if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio)) { ctx->rq_merged++; return true; } @@ -311,19 +321,21 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) { struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu); bool ret = false; + enum hctx_type type; - if (e && e->type->ops.mq.bio_merge) { + if (e && e->type->ops.bio_merge) { blk_mq_put_ctx(ctx); - return e->type->ops.mq.bio_merge(hctx, bio); + return e->type->ops.bio_merge(hctx, bio); } + type = hctx->type; if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - !list_empty_careful(&ctx->rq_list)) { + !list_empty_careful(&ctx->rq_lists[type])) { /* default per sw-queue merge */ spin_lock(&ctx->lock); - ret = blk_mq_attempt_merge(q, ctx, bio); + ret = blk_mq_attempt_merge(q, hctx, ctx, bio); spin_unlock(&ctx->lock); } @@ -367,7 +379,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; /* flush rq in flush machinery need to be dispatched directly */ if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { @@ -380,11 +392,11 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) goto run; - if (e && e->type->ops.mq.insert_requests) { + if (e && e->type->ops.insert_requests) { LIST_HEAD(list); list_add(&rq->queuelist, &list); - e->type->ops.mq.insert_requests(hctx, &list, at_head); + e->type->ops.insert_requests(hctx, &list, at_head); } else { spin_lock(&ctx->lock); __blk_mq_insert_request(hctx, rq, at_head); @@ -396,27 +408,25 @@ run: blk_mq_run_hw_queue(hctx, async); } -void blk_mq_sched_insert_requests(struct request_queue *q, +void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async) { - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - struct elevator_queue *e = hctx->queue->elevator; + struct elevator_queue *e; - if (e && e->type->ops.mq.insert_requests) - e->type->ops.mq.insert_requests(hctx, list, false); + e = hctx->queue->elevator; + if (e && e->type->ops.insert_requests) + e->type->ops.insert_requests(hctx, list, false); else { /* * try to issue requests directly if the hw queue isn't * busy in case of 'none' scheduler, and this way may save * us one extra enqueue & dequeue to sw queue. */ - if (!hctx->dispatch_busy && !e && !run_queue_async) { + if (!hctx->dispatch_busy && !e && !run_queue_async) blk_mq_try_issue_list_directly(hctx, list); - if (list_empty(list)) - return; - } - blk_mq_insert_requests(hctx, ctx, list); + else + blk_mq_insert_requests(hctx, ctx, list); } blk_mq_run_hw_queue(hctx, run_queue_async); @@ -489,15 +499,15 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) goto err; } - ret = e->ops.mq.init_sched(q, e); + ret = e->ops.init_sched(q, e); if (ret) goto err; blk_mq_debugfs_register_sched(q); queue_for_each_hw_ctx(q, hctx, i) { - if (e->ops.mq.init_hctx) { - ret = e->ops.mq.init_hctx(hctx, i); + if (e->ops.init_hctx) { + ret = e->ops.init_hctx(hctx, i); if (ret) { eq = q->elevator; blk_mq_exit_sched(q, eq); @@ -523,14 +533,14 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) queue_for_each_hw_ctx(q, hctx, i) { blk_mq_debugfs_unregister_sched_hctx(hctx); - if (e->type->ops.mq.exit_hctx && hctx->sched_data) { - e->type->ops.mq.exit_hctx(hctx, i); + if (e->type->ops.exit_hctx && hctx->sched_data) { + e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; } } blk_mq_debugfs_unregister_sched(q); - if (e->type->ops.mq.exit_sched) - e->type->ops.mq.exit_sched(e); + if (e->type->ops.exit_sched) + e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q); q->elevator = NULL; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 8a9544203173..c7bdb52367ac 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -8,18 +8,19 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q, void (*exit)(struct blk_mq_hw_ctx *)); -void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio); +void blk_mq_sched_assign_ioc(struct request *rq); void blk_mq_sched_request_inserted(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, struct request **merged_request); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async); -void blk_mq_sched_insert_requests(struct request_queue *q, +void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); @@ -43,8 +44,8 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; - if (e && e->type->ops.mq.allow_merge) - return e->type->ops.mq.allow_merge(q, rq, bio); + if (e && e->type->ops.allow_merge) + return e->type->ops.allow_merge(q, rq, bio); return true; } @@ -53,8 +54,8 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { struct elevator_queue *e = rq->q->elevator; - if (e && e->type->ops.mq.completed_request) - e->type->ops.mq.completed_request(rq, now); + if (e && e->type->ops.completed_request) + e->type->ops.completed_request(rq, now); } static inline void blk_mq_sched_started_request(struct request *rq) @@ -62,8 +63,8 @@ static inline void blk_mq_sched_started_request(struct request *rq) struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; - if (e && e->type->ops.mq.started_request) - e->type->ops.mq.started_request(rq); + if (e && e->type->ops.started_request) + e->type->ops.started_request(rq); } static inline void blk_mq_sched_requeue_request(struct request *rq) @@ -71,16 +72,16 @@ static inline void blk_mq_sched_requeue_request(struct request *rq) struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; - if (e && e->type->ops.mq.requeue_request) - e->type->ops.mq.requeue_request(rq); + if (e && e->type->ops.requeue_request) + e->type->ops.requeue_request(rq); } static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) { struct elevator_queue *e = hctx->queue->elevator; - if (e && e->type->ops.mq.has_work) - return e->type->ops.mq.has_work(hctx); + if (e && e->type->ops.has_work) + return e->type->ops.has_work(hctx); return false; } diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index aafb44224c89..3f9c3f4ac44c 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -15,6 +15,18 @@ static void blk_mq_sysfs_release(struct kobject *kobj) { + struct blk_mq_ctxs *ctxs = container_of(kobj, struct blk_mq_ctxs, kobj); + + free_percpu(ctxs->queue_ctx); + kfree(ctxs); +} + +static void blk_mq_ctx_sysfs_release(struct kobject *kobj) +{ + struct blk_mq_ctx *ctx = container_of(kobj, struct blk_mq_ctx, kobj); + + /* ctx->ctxs won't be released until all ctx are freed */ + kobject_put(&ctx->ctxs->kobj); } static void blk_mq_hw_sysfs_release(struct kobject *kobj) @@ -203,7 +215,7 @@ static struct kobj_type blk_mq_ktype = { static struct kobj_type blk_mq_ctx_ktype = { .sysfs_ops = &blk_mq_sysfs_ops, .default_attrs = default_ctx_attrs, - .release = blk_mq_sysfs_release, + .release = blk_mq_ctx_sysfs_release, }; static struct kobj_type blk_mq_hw_ktype = { @@ -235,7 +247,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) if (!hctx->nr_ctx) return 0; - ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num); + ret = kobject_add(&hctx->kobj, q->mq_kobj, "%u", hctx->queue_num); if (ret) return ret; @@ -258,8 +270,8 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); - kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); - kobject_del(&q->mq_kobj); + kobject_uevent(q->mq_kobj, KOBJ_REMOVE); + kobject_del(q->mq_kobj); kobject_put(&dev->kobj); q->mq_sysfs_init_done = false; @@ -279,7 +291,7 @@ void blk_mq_sysfs_deinit(struct request_queue *q) ctx = per_cpu_ptr(q->queue_ctx, cpu); kobject_put(&ctx->kobj); } - kobject_put(&q->mq_kobj); + kobject_put(q->mq_kobj); } void blk_mq_sysfs_init(struct request_queue *q) @@ -287,10 +299,12 @@ void blk_mq_sysfs_init(struct request_queue *q) struct blk_mq_ctx *ctx; int cpu; - kobject_init(&q->mq_kobj, &blk_mq_ktype); + kobject_init(q->mq_kobj, &blk_mq_ktype); for_each_possible_cpu(cpu) { ctx = per_cpu_ptr(q->queue_ctx, cpu); + + kobject_get(q->mq_kobj); kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); } } @@ -303,11 +317,11 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q) WARN_ON_ONCE(!q->kobj.parent); lockdep_assert_held(&q->sysfs_lock); - ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); + ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); if (ret < 0) goto out; - kobject_uevent(&q->mq_kobj, KOBJ_ADD); + kobject_uevent(q->mq_kobj, KOBJ_ADD); queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); @@ -324,8 +338,8 @@ unreg: while (--i >= 0) blk_mq_unregister_hctx(q->queue_hw_ctx[i]); - kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); - kobject_del(&q->mq_kobj); + kobject_uevent(q->mq_kobj, KOBJ_REMOVE); + kobject_del(q->mq_kobj); kobject_put(&dev->kobj); return ret; } @@ -340,7 +354,6 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q) return ret; } -EXPORT_SYMBOL_GPL(blk_mq_register_dev); void blk_mq_sysfs_unregister(struct request_queue *q) { diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index cfda95b85d34..2089c6c62f44 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -110,7 +110,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct sbitmap_queue *bt; struct sbq_wait_state *ws; - DEFINE_WAIT(wait); + DEFINE_SBQ_WAIT(wait); unsigned int tag_offset; bool drop_ctx; int tag; @@ -154,8 +154,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (tag != -1) break; - prepare_to_wait_exclusive(&ws->wait, &wait, - TASK_UNINTERRUPTIBLE); + sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE); tag = __blk_mq_get_tag(data, bt); if (tag != -1) @@ -167,16 +166,17 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) bt_prev = bt; io_schedule(); + sbitmap_finish_wait(bt, ws, &wait); + data->ctx = blk_mq_get_ctx(data->q); - data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu); + data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, + data->ctx->cpu); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) bt = &tags->breserved_tags; else bt = &tags->bitmap_tags; - finish_wait(&ws->wait, &wait); - /* * If destination hw queue is changed, fake wake up on * previous queue for compensating the wake up miss, so @@ -191,7 +191,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (drop_ctx && data->ctx) blk_mq_put_ctx(data->ctx); - finish_wait(&ws->wait, &wait); + sbitmap_finish_wait(bt, ws, &wait); found_tag: return tag + tag_offset; @@ -235,7 +235,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * test and set the bit before assigning ->rqs[]. */ if (rq && rq->q == hctx->queue) - iter_data->fn(hctx, rq, iter_data->data, reserved); + return iter_data->fn(hctx, rq, iter_data->data, reserved); return true; } @@ -247,7 +247,8 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * @fn: Pointer to the function that will be called for each request * associated with @hctx that has been assigned a driver tag. * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved) - * where rq is a pointer to a request. + * where rq is a pointer to a request. Return true to continue + * iterating tags, false to stop. * @data: Will be passed as third argument to @fn. * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. @@ -288,7 +289,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) */ rq = tags->rqs[bitnr]; if (rq && blk_mq_request_started(rq)) - iter_data->fn(rq, iter_data->data, reserved); + return iter_data->fn(rq, iter_data->data, reserved); return true; } @@ -300,7 +301,8 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @data, - * @reserved) where rq is a pointer to a request. + * @reserved) where rq is a pointer to a request. Return true + * to continue iterating tags, false to stop. * @data: Will be passed as second argument to @fn. * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. @@ -325,7 +327,8 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @priv, * reserved) where rq is a pointer to a request. 'reserved' - * indicates whether or not @rq is a reserved request. + * indicates whether or not @rq is a reserved request. Return + * true to continue iterating tags, false to stop. * @priv: Will be passed as second argument to @fn. */ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, @@ -342,7 +345,8 @@ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @priv, * reserved) where rq is a pointer to a request. 'reserved' - * indicates whether or not @rq is a reserved request. + * indicates whether or not @rq is a reserved request. Return + * true to continue iterating tags, false to stop. * @priv: Will be passed as second argument to @fn. */ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, @@ -526,16 +530,7 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, */ u32 blk_mq_unique_tag(struct request *rq) { - struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx; - int hwq = 0; - - if (q->mq_ops) { - hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); - hwq = hctx->queue_num; - } - - return (hwq << BLK_MQ_UNIQUE_TAG_BITS) | + return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) | (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); } EXPORT_SYMBOL(blk_mq_unique_tag); diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c index c3afbca11299..370827163835 100644 --- a/block/blk-mq-virtio.c +++ b/block/blk-mq-virtio.c @@ -29,7 +29,7 @@ * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. */ -int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set, +int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, struct virtio_device *vdev, int first_vec) { const struct cpumask *mask; @@ -38,17 +38,17 @@ int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set, if (!vdev->config->get_vq_affinity) goto fallback; - for (queue = 0; queue < set->nr_hw_queues; queue++) { + for (queue = 0; queue < qmap->nr_queues; queue++) { mask = vdev->config->get_vq_affinity(vdev, first_vec + queue); if (!mask) goto fallback; for_each_cpu(cpu, mask) - set->mq_map[cpu] = queue; + qmap->mq_map[cpu] = qmap->queue_offset + queue; } return 0; fallback: - return blk_mq_map_queues(set); + return blk_mq_map_queues(qmap); } EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues); diff --git a/block/blk-mq.c b/block/blk-mq.c index 6a7566244de3..3ba37b9e15e9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -38,7 +38,6 @@ #include "blk-mq-sched.h" #include "blk-rq-qos.h" -static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -75,14 +74,18 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) - sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); + const int bit = ctx->index_hw[hctx->type]; + + if (!sbitmap_test_bit(&hctx->ctx_map, bit)) + sbitmap_set_bit(&hctx->ctx_map, bit); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); + const int bit = ctx->index_hw[hctx->type]; + + sbitmap_clear_bit(&hctx->ctx_map, bit); } struct mq_inflight { @@ -90,33 +93,33 @@ struct mq_inflight { unsigned int *inflight; }; -static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, +static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct mq_inflight *mi = priv; /* - * index[0] counts the specific partition that was asked for. index[1] - * counts the ones that are active on the whole device, so increment - * that if mi->part is indeed a partition, and not a whole device. + * index[0] counts the specific partition that was asked for. */ if (rq->part == mi->part) mi->inflight[0]++; - if (mi->part->partno) - mi->inflight[1]++; + + return true; } -void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) { + unsigned inflight[2]; struct mq_inflight mi = { .part = part, .inflight = inflight, }; inflight[0] = inflight[1] = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); + + return inflight[0]; } -static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, +static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { @@ -124,6 +127,8 @@ static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, if (rq->part == mi->part) mi->inflight[rq_data_dir(rq)]++; + + return true; } void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, @@ -142,7 +147,7 @@ void blk_freeze_queue_start(struct request_queue *q) freeze_depth = atomic_inc_return(&q->mq_freeze_depth); if (freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); } } @@ -177,8 +182,6 @@ void blk_freeze_queue(struct request_queue *q) * exported to drivers as the only user for unfreeze is blk_mq. */ blk_freeze_queue_start(q); - if (!q->mq_ops) - blk_drain_queue(q); blk_mq_freeze_queue_wait(q); } @@ -275,6 +278,15 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) } EXPORT_SYMBOL(blk_mq_can_queue); +/* + * Only need start/end time stamping if we have stats enabled, or using + * an IO scheduler. + */ +static inline bool blk_mq_need_time_stamp(struct request *rq) +{ + return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator; +} + static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, unsigned int tag, unsigned int op) { @@ -298,8 +310,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, /* csd/requeue_work/fifo_time is initialized before use */ rq->q = data->q; rq->mq_ctx = data->ctx; + rq->mq_hctx = data->hctx; rq->rq_flags = rq_flags; - rq->cpu = -1; rq->cmd_flags = op; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; @@ -310,7 +322,10 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; - rq->start_time_ns = ktime_get_ns(); + if (blk_mq_need_time_stamp(rq)) + rq->start_time_ns = ktime_get_ns(); + else + rq->start_time_ns = 0; rq->io_start_time_ns = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) @@ -319,27 +334,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->special = NULL; /* tag was already set */ rq->extra_len = 0; - rq->__deadline = 0; + WRITE_ONCE(rq->deadline, 0); - INIT_LIST_HEAD(&rq->timeout_list); rq->timeout = 0; rq->end_io = NULL; rq->end_io_data = NULL; rq->next_rq = NULL; -#ifdef CONFIG_BLK_CGROUP - rq->rl = NULL; -#endif - data->ctx->rq_dispatched[op_is_sync(op)]++; refcount_set(&rq->ref, 1); return rq; } static struct request *blk_mq_get_request(struct request_queue *q, - struct bio *bio, unsigned int op, - struct blk_mq_alloc_data *data) + struct bio *bio, + struct blk_mq_alloc_data *data) { struct elevator_queue *e = q->elevator; struct request *rq; @@ -353,8 +363,9 @@ static struct request *blk_mq_get_request(struct request_queue *q, put_ctx_on_error = true; } if (likely(!data->hctx)) - data->hctx = blk_mq_map_queue(q, data->ctx->cpu); - if (op & REQ_NOWAIT) + data->hctx = blk_mq_map_queue(q, data->cmd_flags, + data->ctx->cpu); + if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; if (e) { @@ -365,9 +376,10 @@ static struct request *blk_mq_get_request(struct request_queue *q, * dispatch list. Don't include reserved tags in the * limiting, as it isn't useful. */ - if (!op_is_flush(op) && e->type->ops.mq.limit_depth && + if (!op_is_flush(data->cmd_flags) && + e->type->ops.limit_depth && !(data->flags & BLK_MQ_REQ_RESERVED)) - e->type->ops.mq.limit_depth(op, data); + e->type->ops.limit_depth(data->cmd_flags, data); } else { blk_mq_tag_busy(data->hctx); } @@ -382,14 +394,14 @@ static struct request *blk_mq_get_request(struct request_queue *q, return NULL; } - rq = blk_mq_rq_ctx_init(data, tag, op); - if (!op_is_flush(op)) { + rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags); + if (!op_is_flush(data->cmd_flags)) { rq->elv.icq = NULL; - if (e && e->type->ops.mq.prepare_request) { - if (e->type->icq_cache && rq_ioc(bio)) - blk_mq_sched_assign_ioc(rq, bio); + if (e && e->type->ops.prepare_request) { + if (e->type->icq_cache) + blk_mq_sched_assign_ioc(rq); - e->type->ops.mq.prepare_request(rq, bio); + e->type->ops.prepare_request(rq, bio); rq->rq_flags |= RQF_ELVPRIV; } } @@ -400,7 +412,7 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags) { - struct blk_mq_alloc_data alloc_data = { .flags = flags }; + struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op }; struct request *rq; int ret; @@ -408,7 +420,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, if (ret) return ERR_PTR(ret); - rq = blk_mq_get_request(q, NULL, op, &alloc_data); + rq = blk_mq_get_request(q, NULL, &alloc_data); blk_queue_exit(q); if (!rq) @@ -426,7 +438,7 @@ EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) { - struct blk_mq_alloc_data alloc_data = { .flags = flags }; + struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op }; struct request *rq; unsigned int cpu; int ret; @@ -459,7 +471,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); alloc_data.ctx = __blk_mq_get_ctx(q, cpu); - rq = blk_mq_get_request(q, NULL, op, &alloc_data); + rq = blk_mq_get_request(q, NULL, &alloc_data); blk_queue_exit(q); if (!rq) @@ -473,10 +485,11 @@ static void __blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag; blk_pm_mark_last_busy(rq); + rq->mq_hctx = NULL; if (rq->tag != -1) blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) @@ -490,11 +503,11 @@ void blk_mq_free_request(struct request *rq) struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (rq->rq_flags & RQF_ELVPRIV) { - if (e && e->type->ops.mq.finish_request) - e->type->ops.mq.finish_request(rq); + if (e && e->type->ops.finish_request) + e->type->ops.finish_request(rq); if (rq->elv.icq) { put_io_context(rq->elv.icq->ioc); rq->elv.icq = NULL; @@ -510,9 +523,6 @@ void blk_mq_free_request(struct request *rq) rq_qos_done(q, rq); - if (blk_rq_rl(rq)) - blk_put_rl(blk_rq_rl(rq)); - WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (refcount_dec_and_test(&rq->ref)) __blk_mq_free_request(rq); @@ -521,7 +531,10 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request); inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { - u64 now = ktime_get_ns(); + u64 now = 0; + + if (blk_mq_need_time_stamp(rq)) + now = ktime_get_ns(); if (rq->rq_flags & RQF_STATS) { blk_mq_poll_stats_start(rq->q); @@ -555,19 +568,19 @@ EXPORT_SYMBOL(blk_mq_end_request); static void __blk_mq_complete_request_remote(void *data) { struct request *rq = data; + struct request_queue *q = rq->q; - rq->q->softirq_done_fn(rq); + q->mq_ops->complete(rq); } static void __blk_mq_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; + struct request_queue *q = rq->q; bool shared = false; int cpu; - if (!blk_mq_mark_complete(rq)) - return; - + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* * Most of single queue controllers, there is only one irq vector * for handling IO completion, and the only irq's affinity is set @@ -577,18 +590,23 @@ static void __blk_mq_complete_request(struct request *rq) * So complete IO reqeust in softirq context in case of single queue * for not degrading IO performance by irqsoff latency. */ - if (rq->q->nr_hw_queues == 1) { + if (q->nr_hw_queues == 1) { __blk_complete_request(rq); return; } - if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { - rq->q->softirq_done_fn(rq); + /* + * For a polled request, always complete locallly, it's pointless + * to redirect the completion. + */ + if ((rq->cmd_flags & REQ_HIPRI) || + !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { + q->mq_ops->complete(rq); return; } cpu = get_cpu(); - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) shared = cpus_share_cache(cpu, ctx->cpu); if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { @@ -597,7 +615,7 @@ static void __blk_mq_complete_request(struct request *rq) rq->csd.flags = 0; smp_call_function_single_async(ctx->cpu, &rq->csd); } else { - rq->q->softirq_done_fn(rq); + q->mq_ops->complete(rq); } put_cpu(); } @@ -630,11 +648,12 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) * Ends all I/O on a request. It does not handle partial completions. * The actual completion happens out-of-order, through a IPI handler. **/ -void blk_mq_complete_request(struct request *rq) +bool blk_mq_complete_request(struct request *rq) { if (unlikely(blk_should_fake_timeout(rq->q))) - return; + return false; __blk_mq_complete_request(rq); + return true; } EXPORT_SYMBOL(blk_mq_complete_request); @@ -701,7 +720,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); - BUG_ON(blk_queued_rq(rq)); + BUG_ON(!list_empty(&rq->queuelist)); blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); } EXPORT_SYMBOL(blk_mq_requeue_request); @@ -786,6 +805,32 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) } EXPORT_SYMBOL(blk_mq_tag_to_rq); +static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, + void *priv, bool reserved) +{ + /* + * If we find a request that is inflight and the queue matches, + * we know the queue is busy. Return false to stop the iteration. + */ + if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) { + bool *busy = priv; + + *busy = true; + return false; + } + + return true; +} + +bool blk_mq_queue_inflight(struct request_queue *q) +{ + bool busy = false; + + blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); + return busy; +} +EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); + static void blk_mq_rq_timed_out(struct request *req, bool reserved) { req->rq_flags |= RQF_TIMED_OUT; @@ -810,7 +855,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) if (rq->rq_flags & RQF_TIMED_OUT) return false; - deadline = blk_rq_deadline(rq); + deadline = READ_ONCE(rq->deadline); if (time_after_eq(jiffies, deadline)) return true; @@ -821,7 +866,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) return false; } -static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, +static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { unsigned long *next = priv; @@ -831,7 +876,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * so we're not unnecessarilly synchronizing across CPUs. */ if (!blk_mq_req_expired(rq, next)) - return; + return true; /* * We have reason to believe the request may be expired. Take a @@ -843,7 +888,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * timeout handler to posting a natural completion. */ if (!refcount_inc_not_zero(&rq->ref)) - return; + return true; /* * The request is now locked and cannot be reallocated underneath the @@ -855,6 +900,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, blk_mq_rq_timed_out(rq, reserved); if (refcount_dec_and_test(&rq->ref)) __blk_mq_free_request(rq); + + return true; } static void blk_mq_timeout_work(struct work_struct *work) @@ -911,9 +958,10 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) struct flush_busy_ctx_data *flush_data = data; struct blk_mq_hw_ctx *hctx = flush_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + enum hctx_type type = hctx->type; spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, flush_data->list); + list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); sbitmap_clear_bit(sb, bitnr); spin_unlock(&ctx->lock); return true; @@ -945,12 +993,13 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, struct dispatch_rq_data *dispatch_data = data; struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + enum hctx_type type = hctx->type; spin_lock(&ctx->lock); - if (!list_empty(&ctx->rq_list)) { - dispatch_data->rq = list_entry_rq(ctx->rq_list.next); + if (!list_empty(&ctx->rq_lists[type])) { + dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); list_del_init(&dispatch_data->rq->queuelist); - if (list_empty(&ctx->rq_list)) + if (list_empty(&ctx->rq_lists[type])) sbitmap_clear_bit(sb, bitnr); } spin_unlock(&ctx->lock); @@ -961,7 +1010,7 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start) { - unsigned off = start ? start->index_hw : 0; + unsigned off = start ? start->index_hw[hctx->type] : 0; struct dispatch_rq_data data = { .hctx = hctx, .rq = NULL, @@ -985,8 +1034,9 @@ bool blk_mq_get_driver_tag(struct request *rq) { struct blk_mq_alloc_data data = { .q = rq->q, - .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), + .hctx = rq->mq_hctx, .flags = BLK_MQ_REQ_NOWAIT, + .cmd_flags = rq->cmd_flags, }; bool shared; @@ -1150,7 +1200,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); - hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); + hctx = rq->mq_hctx; if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) break; @@ -1223,6 +1273,14 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!list_empty(list)) { bool needs_restart; + /* + * If we didn't flush the entire list, we could have told + * the driver there was more coming, but that turned out to + * be a lie. + */ + if (q->mq_ops->commit_rqs) + q->mq_ops->commit_rqs(hctx); + spin_lock(&hctx->lock); list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); @@ -1552,15 +1610,16 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, bool at_head) { struct blk_mq_ctx *ctx = rq->mq_ctx; + enum hctx_type type = hctx->type; lockdep_assert_held(&ctx->lock); trace_block_rq_insert(hctx->queue, rq); if (at_head) - list_add(&rq->queuelist, &ctx->rq_list); + list_add(&rq->queuelist, &ctx->rq_lists[type]); else - list_add_tail(&rq->queuelist, &ctx->rq_list); + list_add_tail(&rq->queuelist, &ctx->rq_lists[type]); } void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, @@ -1580,8 +1639,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, */ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) { - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; spin_lock(&hctx->lock); list_add_tail(&rq->queuelist, &hctx->dispatch); @@ -1596,6 +1654,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, { struct request *rq; + enum hctx_type type = hctx->type; /* * preemption doesn't flush plug list, so it's possible ctx->cpu is @@ -1607,35 +1666,46 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, } spin_lock(&ctx->lock); - list_splice_tail_init(list, &ctx->rq_list); + list_splice_tail_init(list, &ctx->rq_lists[type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); } -static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) +static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) { struct request *rqa = container_of(a, struct request, queuelist); struct request *rqb = container_of(b, struct request, queuelist); - return !(rqa->mq_ctx < rqb->mq_ctx || - (rqa->mq_ctx == rqb->mq_ctx && - blk_rq_pos(rqa) < blk_rq_pos(rqb))); + if (rqa->mq_ctx < rqb->mq_ctx) + return -1; + else if (rqa->mq_ctx > rqb->mq_ctx) + return 1; + else if (rqa->mq_hctx < rqb->mq_hctx) + return -1; + else if (rqa->mq_hctx > rqb->mq_hctx) + return 1; + + return blk_rq_pos(rqa) > blk_rq_pos(rqb); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { + struct blk_mq_hw_ctx *this_hctx; struct blk_mq_ctx *this_ctx; struct request_queue *this_q; struct request *rq; LIST_HEAD(list); - LIST_HEAD(ctx_list); + LIST_HEAD(rq_list); unsigned int depth; list_splice_init(&plug->mq_list, &list); + plug->rq_count = 0; - list_sort(NULL, &list, plug_ctx_cmp); + if (plug->rq_count > 2 && plug->multiple_queues) + list_sort(NULL, &list, plug_rq_cmp); this_q = NULL; + this_hctx = NULL; this_ctx = NULL; depth = 0; @@ -1643,30 +1713,31 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) rq = list_entry_rq(list.next); list_del_init(&rq->queuelist); BUG_ON(!rq->q); - if (rq->mq_ctx != this_ctx) { - if (this_ctx) { + if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) { + if (this_hctx) { trace_block_unplug(this_q, depth, !from_schedule); - blk_mq_sched_insert_requests(this_q, this_ctx, - &ctx_list, + blk_mq_sched_insert_requests(this_hctx, this_ctx, + &rq_list, from_schedule); } - this_ctx = rq->mq_ctx; this_q = rq->q; + this_ctx = rq->mq_ctx; + this_hctx = rq->mq_hctx; depth = 0; } depth++; - list_add_tail(&rq->queuelist, &ctx_list); + list_add_tail(&rq->queuelist, &rq_list); } /* - * If 'this_ctx' is set, we know we have entries to complete - * on 'ctx_list'. Do those. + * If 'this_hctx' is set, we know we have entries to complete + * on 'rq_list'. Do those. */ - if (this_ctx) { + if (this_hctx) { trace_block_unplug(this_q, depth, !from_schedule); - blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, + blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, from_schedule); } } @@ -1675,27 +1746,17 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { blk_init_request_from_bio(rq, bio); - blk_rq_set_rl(rq, blk_get_rl(rq->q, bio)); - blk_account_io_start(rq, true); } -static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) -{ - if (rq->tag != -1) - return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false); - - return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); -} - static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, - blk_qc_t *cookie) + blk_qc_t *cookie, bool last) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, - .last = true, + .last = last, }; blk_qc_t new_cookie; blk_status_t ret; @@ -1727,77 +1788,74 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, return ret; } -static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, +blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie, - bool bypass_insert) + bool bypass, bool last) { struct request_queue *q = rq->q; bool run_queue = true; + blk_status_t ret = BLK_STS_RESOURCE; + int srcu_idx; + bool force = false; + hctx_lock(hctx, &srcu_idx); /* - * RCU or SRCU read lock is needed before checking quiesced flag. + * hctx_lock is needed before checking quiesced flag. * - * When queue is stopped or quiesced, ignore 'bypass_insert' from - * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, - * and avoid driver to try to dispatch again. + * When queue is stopped or quiesced, ignore 'bypass', insert + * and return BLK_STS_OK to caller, and avoid driver to try to + * dispatch again. */ - if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { + if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) { run_queue = false; - bypass_insert = false; - goto insert; + bypass = false; + goto out_unlock; } - if (q->elevator && !bypass_insert) - goto insert; + if (unlikely(q->elevator && !bypass)) + goto out_unlock; if (!blk_mq_get_dispatch_budget(hctx)) - goto insert; + goto out_unlock; if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(hctx); - goto insert; + goto out_unlock; } - return __blk_mq_issue_directly(hctx, rq, cookie); -insert: - if (bypass_insert) - return BLK_STS_RESOURCE; - - blk_mq_request_bypass_insert(rq, run_queue); - return BLK_STS_OK; -} - -static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, blk_qc_t *cookie) -{ - blk_status_t ret; - int srcu_idx; - - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - - hctx_lock(hctx, &srcu_idx); - - ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); - if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) - blk_mq_request_bypass_insert(rq, true); - else if (ret != BLK_STS_OK) - blk_mq_end_request(rq, ret); - - hctx_unlock(hctx, srcu_idx); -} - -blk_status_t blk_mq_request_issue_directly(struct request *rq) -{ - blk_status_t ret; - int srcu_idx; - blk_qc_t unused_cookie; - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); - - hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true); + /* + * Always add a request that has been through + *.queue_rq() to the hardware dispatch list. + */ + force = true; + ret = __blk_mq_issue_directly(hctx, rq, cookie, last); +out_unlock: hctx_unlock(hctx, srcu_idx); + switch (ret) { + case BLK_STS_OK: + break; + case BLK_STS_DEV_RESOURCE: + case BLK_STS_RESOURCE: + if (force) { + blk_mq_request_bypass_insert(rq, run_queue); + /* + * We have to return BLK_STS_OK for the DM + * to avoid livelock. Otherwise, we return + * the real result to indicate whether the + * request is direct-issued successfully. + */ + ret = bypass ? BLK_STS_OK : ret; + } else if (!bypass) { + blk_mq_sched_insert_request(rq, false, + run_queue, false); + } + break; + default: + if (!bypass) + blk_mq_end_request(rq, ret); + break; + } return ret; } @@ -1805,22 +1863,42 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq) void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { + blk_qc_t unused; + blk_status_t ret = BLK_STS_OK; + while (!list_empty(list)) { - blk_status_t ret; struct request *rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); - ret = blk_mq_request_issue_directly(rq); - if (ret != BLK_STS_OK) { - if (ret == BLK_STS_RESOURCE || - ret == BLK_STS_DEV_RESOURCE) { - blk_mq_request_bypass_insert(rq, + if (ret == BLK_STS_OK) + ret = blk_mq_try_issue_directly(hctx, rq, &unused, + false, list_empty(list)); - break; - } - blk_mq_end_request(rq, ret); - } + else + blk_mq_sched_insert_request(rq, false, true, false); + } + + /* + * If we didn't flush the entire list, we could have told + * the driver there was more coming, but that turned out to + * be a lie. + */ + if (ret != BLK_STS_OK && hctx->queue->mq_ops->commit_rqs) + hctx->queue->mq_ops->commit_rqs(hctx); +} + +static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) +{ + list_add_tail(&rq->queuelist, &plug->mq_list); + plug->rq_count++; + if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) { + struct request *tmp; + + tmp = list_first_entry(&plug->mq_list, struct request, + queuelist); + if (tmp->q != rq->q) + plug->multiple_queues = true; } } @@ -1828,9 +1906,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); - struct blk_mq_alloc_data data = { .flags = 0 }; + struct blk_mq_alloc_data data = { .flags = 0, .cmd_flags = bio->bi_opf }; struct request *rq; - unsigned int request_count = 0; struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; @@ -1843,15 +1920,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) + blk_attempt_plug_merge(q, bio, &same_queue_rq)) return BLK_QC_T_NONE; if (blk_mq_sched_bio_merge(q, bio)) return BLK_QC_T_NONE; - rq_qos_throttle(q, bio, NULL); + rq_qos_throttle(q, bio); - rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); + rq = blk_mq_get_request(q, bio, &data); if (unlikely(!rq)) { rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) @@ -1873,21 +1950,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) /* bypass scheduler for flush rq */ blk_insert_flush(rq); blk_mq_run_hw_queue(data.hctx, true); - } else if (plug && q->nr_hw_queues == 1) { + } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) { + /* + * Use plugging if we have a ->commit_rqs() hook as well, as + * we know the driver uses bd->last in a smart fashion. + */ + unsigned int request_count = plug->rq_count; struct request *last = NULL; blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - /* - * @request_count may become stale because of schedule - * out, so check the list again. - */ - if (list_empty(&plug->mq_list)) - request_count = 0; - else if (blk_queue_nomerges(q)) - request_count = blk_plug_queued_count(q); - if (!request_count) trace_block_plug(q); else @@ -1899,7 +1972,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) trace_block_plug(q); } - list_add_tail(&rq->queuelist, &plug->mq_list); + blk_add_rq_to_plug(plug, rq); } else if (plug && !blk_queue_nomerges(q)) { blk_mq_bio_to_request(rq, bio); @@ -1912,23 +1985,24 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) */ if (list_empty(&plug->mq_list)) same_queue_rq = NULL; - if (same_queue_rq) + if (same_queue_rq) { list_del_init(&same_queue_rq->queuelist); - list_add_tail(&rq->queuelist, &plug->mq_list); + plug->rq_count--; + } + blk_add_rq_to_plug(plug, rq); blk_mq_put_ctx(data.ctx); if (same_queue_rq) { - data.hctx = blk_mq_map_queue(q, - same_queue_rq->mq_ctx->cpu); + data.hctx = same_queue_rq->mq_hctx; blk_mq_try_issue_directly(data.hctx, same_queue_rq, - &cookie); + &cookie, false, true); } } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && !data.hctx->dispatch_busy)) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - blk_mq_try_issue_directly(data.hctx, rq, &cookie); + blk_mq_try_issue_directly(data.hctx, rq, &cookie, false, true); } else { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); @@ -1986,7 +2060,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags; int node; - node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -2042,7 +2116,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, size_t rq_size, left; int node; - node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -2122,13 +2196,15 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); + enum hctx_type type; hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); ctx = __blk_mq_get_ctx(hctx->queue, cpu); + type = hctx->type; spin_lock(&ctx->lock); - if (!list_empty(&ctx->rq_list)) { - list_splice_init(&ctx->rq_list, &tmp); + if (!list_empty(&ctx->rq_lists[type])) { + list_splice_init(&ctx->rq_lists[type], &tmp); blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); @@ -2259,24 +2335,30 @@ static int blk_mq_init_hctx(struct request_queue *q, static void blk_mq_init_cpu_queues(struct request_queue *q, unsigned int nr_hw_queues) { - unsigned int i; + struct blk_mq_tag_set *set = q->tag_set; + unsigned int i, j; for_each_possible_cpu(i) { struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); struct blk_mq_hw_ctx *hctx; + int k; __ctx->cpu = i; spin_lock_init(&__ctx->lock); - INIT_LIST_HEAD(&__ctx->rq_list); + for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) + INIT_LIST_HEAD(&__ctx->rq_lists[k]); + __ctx->queue = q; /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device */ - hctx = blk_mq_map_queue(q, i); - if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) - hctx->numa_node = local_memory_node(cpu_to_node(i)); + for (j = 0; j < set->nr_maps; j++) { + hctx = blk_mq_map_queue_type(q, j, i); + if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) + hctx->numa_node = local_memory_node(cpu_to_node(i)); + } } } @@ -2302,7 +2384,7 @@ static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx) static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, unsigned int hctx_idx) { - if (set->tags[hctx_idx]) { + if (set->tags && set->tags[hctx_idx]) { blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); blk_mq_free_rq_map(set->tags[hctx_idx]); set->tags[hctx_idx] = NULL; @@ -2311,7 +2393,7 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, static void blk_mq_map_swqueue(struct request_queue *q) { - unsigned int i, hctx_idx; + unsigned int i, j, hctx_idx; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; @@ -2333,7 +2415,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) * If the cpu isn't present, the cpu is mapped to first hctx. */ for_each_possible_cpu(i) { - hctx_idx = q->mq_map[i]; + hctx_idx = set->map[0].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && !__blk_mq_alloc_rq_map(set, hctx_idx)) { @@ -2343,15 +2425,35 @@ static void blk_mq_map_swqueue(struct request_queue *q) * case, remap the current ctx to hctx[0] which * is guaranteed to always have tags allocated */ - q->mq_map[i] = 0; + set->map[0].mq_map[i] = 0; } ctx = per_cpu_ptr(q->queue_ctx, i); - hctx = blk_mq_map_queue(q, i); + for (j = 0; j < set->nr_maps; j++) { + if (!set->map[j].nr_queues) + continue; - cpumask_set_cpu(i, hctx->cpumask); - ctx->index_hw = hctx->nr_ctx; - hctx->ctxs[hctx->nr_ctx++] = ctx; + hctx = blk_mq_map_queue_type(q, j, i); + + /* + * If the CPU is already set in the mask, then we've + * mapped this one already. This can happen if + * devices share queues across queue maps. + */ + if (cpumask_test_cpu(i, hctx->cpumask)) + continue; + + cpumask_set_cpu(i, hctx->cpumask); + hctx->type = j; + ctx->index_hw[hctx->type] = hctx->nr_ctx; + hctx->ctxs[hctx->nr_ctx++] = ctx; + + /* + * If the nr_ctx type overflows, we have exceeded the + * amount of sw queues we can support. + */ + BUG_ON(!hctx->nr_ctx); + } } mutex_unlock(&q->sysfs_lock); @@ -2441,8 +2543,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, struct request_queue *q) { - q->tag_set = set; - mutex_lock(&set->tag_list_lock); /* @@ -2461,6 +2561,34 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, mutex_unlock(&set->tag_list_lock); } +/* All allocations will be freed in release handler of q->mq_kobj */ +static int blk_mq_alloc_ctxs(struct request_queue *q) +{ + struct blk_mq_ctxs *ctxs; + int cpu; + + ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); + if (!ctxs) + return -ENOMEM; + + ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); + if (!ctxs->queue_ctx) + goto fail; + + for_each_possible_cpu(cpu) { + struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); + ctx->ctxs = ctxs; + } + + q->mq_kobj = &ctxs->kobj; + q->queue_ctx = ctxs->queue_ctx; + + return 0; + fail: + kfree(ctxs); + return -ENOMEM; +} + /* * It is the actual release handler for mq, but we do it from * request queue's release handler for avoiding use-after-free @@ -2479,8 +2607,6 @@ void blk_mq_release(struct request_queue *q) kobject_put(&hctx->kobj); } - q->mq_map = NULL; - kfree(q->queue_hw_ctx); /* @@ -2488,15 +2614,13 @@ void blk_mq_release(struct request_queue *q) * both share lifetime with request queue. */ blk_mq_sysfs_deinit(q); - - free_percpu(q->queue_ctx); } struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) { struct request_queue *uninit_q, *q; - uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL); + uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); if (!uninit_q) return ERR_PTR(-ENOMEM); @@ -2523,6 +2647,7 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, memset(set, 0, sizeof(*set)); set->ops = ops; set->nr_hw_queues = 1; + set->nr_maps = 1; set->queue_depth = queue_depth; set->numa_node = NUMA_NO_NODE; set->flags = set_flags; @@ -2600,7 +2725,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, int node; struct blk_mq_hw_ctx *hctx; - node = blk_mq_hw_queue_to_node(q->mq_map, i); + node = blk_mq_hw_queue_to_node(&set->map[0], i); /* * If the hw queue has been mapped to another numa node, * we need to realloc the hctx. If allocation fails, fallback @@ -2653,6 +2778,19 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, mutex_unlock(&q->sysfs_lock); } +/* + * Maximum number of hardware queues we support. For single sets, we'll never + * have more than the CPUs (software queues). For multiple sets, the tag_set + * user may have set ->nr_hw_queues larger. + */ +static unsigned int nr_hw_queues(struct blk_mq_tag_set *set) +{ + if (set->nr_maps == 1) + return nr_cpu_ids; + + return max(set->nr_hw_queues, nr_cpu_ids); +} + struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { @@ -2665,19 +2803,17 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, if (!q->poll_cb) goto err_exit; - q->queue_ctx = alloc_percpu(struct blk_mq_ctx); - if (!q->queue_ctx) + if (blk_mq_alloc_ctxs(q)) goto err_exit; /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); - q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)), + q->nr_queues = nr_hw_queues(set); + q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)), GFP_KERNEL, set->numa_node); if (!q->queue_hw_ctx) - goto err_percpu; - - q->mq_map = set->mq_map; + goto err_sys_init; blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) @@ -2686,12 +2822,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); - q->nr_queues = nr_cpu_ids; + q->tag_set = set; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; + if (set->nr_maps > HCTX_TYPE_POLL && + set->map[HCTX_TYPE_POLL].nr_queues) + blk_queue_flag_set(QUEUE_FLAG_POLL, q); if (!(set->flags & BLK_MQ_F_SG_MERGE)) - queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); + blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q); q->sg_reserved_size = INT_MAX; @@ -2700,8 +2839,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, spin_lock_init(&q->requeue_lock); blk_queue_make_request(q, blk_mq_make_request); - if (q->mq_ops->poll) - q->poll_fn = blk_mq_poll; /* * Do this after blk_queue_make_request() overrides it... @@ -2713,9 +2850,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, */ q->poll_nsec = -1; - if (set->ops->complete) - blk_queue_softirq_done(q, set->ops->complete); - blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); @@ -2732,8 +2866,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, err_hctxs: kfree(q->queue_hw_ctx); -err_percpu: - free_percpu(q->queue_ctx); +err_sys_init: + blk_mq_sysfs_deinit(q); err_exit: q->mq_ops = NULL; return ERR_PTR(-ENOMEM); @@ -2802,7 +2936,9 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) { - if (set->ops->map_queues) { + if (set->ops->map_queues && !is_kdump_kernel()) { + int i; + /* * transport .map_queues is usually done in the following * way: @@ -2810,18 +2946,21 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) * for (queue = 0; queue < set->nr_hw_queues; queue++) { * mask = get_cpu_mask(queue) * for_each_cpu(cpu, mask) - * set->mq_map[cpu] = queue; + * set->map[x].mq_map[cpu] = queue; * } * * When we need to remap, the table has to be cleared for * killing stale mapping since one CPU may not be mapped * to any hw queue. */ - blk_mq_clear_mq_map(set); + for (i = 0; i < set->nr_maps; i++) + blk_mq_clear_mq_map(&set->map[i]); return set->ops->map_queues(set); - } else - return blk_mq_map_queues(set); + } else { + BUG_ON(set->nr_maps > 1); + return blk_mq_map_queues(&set->map[0]); + } } /* @@ -2832,7 +2971,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { - int ret; + int i, ret; BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); @@ -2855,6 +2994,11 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) set->queue_depth = BLK_MQ_MAX_DEPTH; } + if (!set->nr_maps) + set->nr_maps = 1; + else if (set->nr_maps > HCTX_MAX_TYPES) + return -EINVAL; + /* * If a crashdump is active, then we are potentially in a very * memory constrained environment. Limit us to 1 queue and @@ -2862,24 +3006,30 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) */ if (is_kdump_kernel()) { set->nr_hw_queues = 1; + set->nr_maps = 1; set->queue_depth = min(64U, set->queue_depth); } /* - * There is no use for more h/w queues than cpus. + * There is no use for more h/w queues than cpus if we just have + * a single map */ - if (set->nr_hw_queues > nr_cpu_ids) + if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *), + set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) return -ENOMEM; ret = -ENOMEM; - set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map), - GFP_KERNEL, set->numa_node); - if (!set->mq_map) - goto out_free_tags; + for (i = 0; i < set->nr_maps; i++) { + set->map[i].mq_map = kcalloc_node(nr_cpu_ids, + sizeof(set->map[i].mq_map[0]), + GFP_KERNEL, set->numa_node); + if (!set->map[i].mq_map) + goto out_free_mq_map; + set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; + } ret = blk_mq_update_queue_map(set); if (ret) @@ -2895,9 +3045,10 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) return 0; out_free_mq_map: - kfree(set->mq_map); - set->mq_map = NULL; -out_free_tags: + for (i = 0; i < set->nr_maps; i++) { + kfree(set->map[i].mq_map); + set->map[i].mq_map = NULL; + } kfree(set->tags); set->tags = NULL; return ret; @@ -2906,13 +3057,15 @@ EXPORT_SYMBOL(blk_mq_alloc_tag_set); void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { - int i; + int i, j; - for (i = 0; i < nr_cpu_ids; i++) + for (i = 0; i < nr_hw_queues(set); i++) blk_mq_free_map_and_requests(set, i); - kfree(set->mq_map); - set->mq_map = NULL; + for (j = 0; j < set->nr_maps; j++) { + kfree(set->map[j].mq_map); + set->map[j].mq_map = NULL; + } kfree(set->tags); set->tags = NULL; @@ -3038,7 +3191,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, lockdep_assert_held(&set->tag_list_lock); - if (nr_hw_queues > nr_cpu_ids) + if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) nr_hw_queues = nr_cpu_ids; if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) return; @@ -3073,7 +3226,7 @@ fallback: pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); set->nr_hw_queues = prev_nr_hw_queues; - blk_mq_map_queues(set); + blk_mq_map_queues(&set->map[0]); goto fallback; } blk_mq_map_swqueue(q); @@ -3180,15 +3333,12 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, return false; /* - * poll_nsec can be: + * If we get here, hybrid polling is enabled. Hence poll_nsec can be: * - * -1: don't ever hybrid sleep * 0: use half of prev avg * >0: use this specific value */ - if (q->poll_nsec == -1) - return false; - else if (q->poll_nsec > 0) + if (q->poll_nsec > 0) nsecs = q->poll_nsec; else nsecs = blk_mq_poll_nsecs(q, hctx, rq); @@ -3225,59 +3375,14 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, return true; } -static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) +static bool blk_mq_poll_hybrid(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, blk_qc_t cookie) { - struct request_queue *q = hctx->queue; - long state; - - /* - * If we sleep, have the caller restart the poll loop to reset - * the state. Like for the other success return cases, the - * caller is responsible for checking if the IO completed. If - * the IO isn't complete, we'll get called again and will go - * straight to the busy poll loop. - */ - if (blk_mq_poll_hybrid_sleep(q, hctx, rq)) - return true; - - hctx->poll_considered++; - - state = current->state; - while (!need_resched()) { - int ret; - - hctx->poll_invoked++; - - ret = q->mq_ops->poll(hctx, rq->tag); - if (ret > 0) { - hctx->poll_success++; - set_current_state(TASK_RUNNING); - return true; - } - - if (signal_pending_state(state, current)) - set_current_state(TASK_RUNNING); - - if (current->state == TASK_RUNNING) - return true; - if (ret < 0) - break; - cpu_relax(); - } - - __set_current_state(TASK_RUNNING); - return false; -} - -static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) -{ - struct blk_mq_hw_ctx *hctx; struct request *rq; - if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (q->poll_nsec == -1) return false; - hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; if (!blk_qc_t_is_internal(cookie)) rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); else { @@ -3292,9 +3397,81 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) return false; } - return __blk_mq_poll(hctx, rq); + return blk_mq_poll_hybrid_sleep(q, hctx, rq); } +/** + * blk_poll - poll for IO completions + * @q: the queue + * @cookie: cookie passed back at IO submission time + * @spin: whether to spin for completions + * + * Description: + * Poll for completions on the passed in queue. Returns number of + * completed entries found. If @spin is true, then blk_poll will continue + * looping until at least one completion is found, unless the task is + * otherwise marked running (or we need to reschedule). + */ +int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) +{ + struct blk_mq_hw_ctx *hctx; + long state; + + if (!blk_qc_t_valid(cookie) || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return 0; + + if (current->plug) + blk_flush_plug_list(current->plug, false); + + hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; + + /* + * If we sleep, have the caller restart the poll loop to reset + * the state. Like for the other success return cases, the + * caller is responsible for checking if the IO completed. If + * the IO isn't complete, we'll get called again and will go + * straight to the busy poll loop. + */ + if (blk_mq_poll_hybrid(q, hctx, cookie)) + return 1; + + hctx->poll_considered++; + + state = current->state; + do { + int ret; + + hctx->poll_invoked++; + + ret = q->mq_ops->poll(hctx); + if (ret > 0) { + hctx->poll_success++; + __set_current_state(TASK_RUNNING); + return ret; + } + + if (signal_pending_state(state, current)) + __set_current_state(TASK_RUNNING); + + if (current->state == TASK_RUNNING) + return 1; + if (ret < 0 || !spin) + break; + cpu_relax(); + } while (!need_resched()); + + __set_current_state(TASK_RUNNING); + return 0; +} +EXPORT_SYMBOL_GPL(blk_poll); + +unsigned int blk_mq_rq_cpu(struct request *rq) +{ + return rq->mq_ctx->cpu; +} +EXPORT_SYMBOL(blk_mq_rq_cpu); + static int __init blk_mq_init(void) { cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, diff --git a/block/blk-mq.h b/block/blk-mq.h index 9497b47e2526..d943d46b0785 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -7,17 +7,22 @@ struct blk_mq_tag_set; +struct blk_mq_ctxs { + struct kobject kobj; + struct blk_mq_ctx __percpu *queue_ctx; +}; + /** * struct blk_mq_ctx - State for a software queue facing the submitting CPUs */ struct blk_mq_ctx { struct { spinlock_t lock; - struct list_head rq_list; - } ____cacheline_aligned_in_smp; + struct list_head rq_lists[HCTX_MAX_TYPES]; + } ____cacheline_aligned_in_smp; unsigned int cpu; - unsigned int index_hw; + unsigned short index_hw[HCTX_MAX_TYPES]; /* incremented at dispatch time */ unsigned long rq_dispatched[2]; @@ -27,6 +32,7 @@ struct blk_mq_ctx { unsigned long ____cacheline_aligned_in_smp rq_completed[2]; struct request_queue *queue; + struct blk_mq_ctxs *ctxs; struct kobject kobj; } ____cacheline_aligned_in_smp; @@ -62,20 +68,55 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); -/* Used by blk_insert_cloned_request() to issue request directly */ -blk_status_t blk_mq_request_issue_directly(struct request *rq); +blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie, + bool bypass, bool last); void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list); /* * CPU -> queue mappings */ -extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); +extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int); -static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, - int cpu) +/* + * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue + * @q: request queue + * @type: the hctx type index + * @cpu: CPU + */ +static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q, + enum hctx_type type, + unsigned int cpu) { - return q->queue_hw_ctx[q->mq_map[cpu]]; + return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]]; +} + +/* + * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue + * @q: request queue + * @flags: request command flags + * @cpu: CPU + */ +static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, + unsigned int flags, + unsigned int cpu) +{ + enum hctx_type type = HCTX_TYPE_DEFAULT; + + if ((flags & REQ_HIPRI) && + q->tag_set->nr_maps > HCTX_TYPE_POLL && + q->tag_set->map[HCTX_TYPE_POLL].nr_queues && + test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + type = HCTX_TYPE_POLL; + + else if (((flags & REQ_OP_MASK) == REQ_OP_READ) && + q->tag_set->nr_maps > HCTX_TYPE_READ && + q->tag_set->map[HCTX_TYPE_READ].nr_queues) + type = HCTX_TYPE_READ; + + return blk_mq_map_queue_type(q, type, cpu); } /* @@ -126,6 +167,7 @@ struct blk_mq_alloc_data { struct request_queue *q; blk_mq_req_flags_t flags; unsigned int shallow_depth; + unsigned int cmd_flags; /* input & output parameter */ struct blk_mq_ctx *ctx; @@ -150,8 +192,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) return hctx->nr_ctx && hctx->tags; } -void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]); +unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); @@ -195,21 +236,18 @@ static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, static inline void blk_mq_put_driver_tag(struct request *rq) { - struct blk_mq_hw_ctx *hctx; - if (rq->tag == -1 || rq->internal_tag == -1) return; - hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); - __blk_mq_put_driver_tag(hctx, rq); + __blk_mq_put_driver_tag(rq->mq_hctx, rq); } -static inline void blk_mq_clear_mq_map(struct blk_mq_tag_set *set) +static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; for_each_possible_cpu(cpu) - set->mq_map[cpu] = 0; + qmap->mq_map[cpu] = 0; } #endif diff --git a/block/blk-pm.c b/block/blk-pm.c index f8fdae01bea2..0a028c189897 100644 --- a/block/blk-pm.c +++ b/block/blk-pm.c @@ -89,12 +89,12 @@ int blk_pre_runtime_suspend(struct request_queue *q) /* Switch q_usage_counter back to per-cpu mode. */ blk_mq_unfreeze_queue(q); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); if (ret < 0) pm_runtime_mark_last_busy(q->dev); else q->rpm_status = RPM_SUSPENDING; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); if (ret) blk_clear_pm_only(q); @@ -121,14 +121,14 @@ void blk_post_runtime_suspend(struct request_queue *q, int err) if (!q->dev) return; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); if (!err) { q->rpm_status = RPM_SUSPENDED; } else { q->rpm_status = RPM_ACTIVE; pm_runtime_mark_last_busy(q->dev); } - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); if (err) blk_clear_pm_only(q); @@ -151,9 +151,9 @@ void blk_pre_runtime_resume(struct request_queue *q) if (!q->dev) return; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); q->rpm_status = RPM_RESUMING; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); } EXPORT_SYMBOL(blk_pre_runtime_resume); @@ -176,7 +176,7 @@ void blk_post_runtime_resume(struct request_queue *q, int err) if (!q->dev) return; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); if (!err) { q->rpm_status = RPM_ACTIVE; pm_runtime_mark_last_busy(q->dev); @@ -184,7 +184,7 @@ void blk_post_runtime_resume(struct request_queue *q, int err) } else { q->rpm_status = RPM_SUSPENDED; } - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); if (!err) blk_clear_pm_only(q); @@ -207,10 +207,10 @@ EXPORT_SYMBOL(blk_post_runtime_resume); */ void blk_set_runtime_active(struct request_queue *q) { - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); q->rpm_status = RPM_ACTIVE; pm_runtime_mark_last_busy(q->dev); pm_request_autosuspend(q->dev); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); } EXPORT_SYMBOL(blk_set_runtime_active); diff --git a/block/blk-pm.h b/block/blk-pm.h index a8564ea72a41..ea5507d23e75 100644 --- a/block/blk-pm.h +++ b/block/blk-pm.h @@ -21,7 +21,7 @@ static inline void blk_pm_mark_last_busy(struct request *rq) static inline void blk_pm_requeue_request(struct request *rq) { - lockdep_assert_held(rq->q->queue_lock); + lockdep_assert_held(&rq->q->queue_lock); if (rq->q->dev && !(rq->rq_flags & RQF_PM)) rq->q->nr_pending--; @@ -30,7 +30,7 @@ static inline void blk_pm_requeue_request(struct request *rq) static inline void blk_pm_add_request(struct request_queue *q, struct request *rq) { - lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&q->queue_lock); if (q->dev && !(rq->rq_flags & RQF_PM)) q->nr_pending++; @@ -38,7 +38,7 @@ static inline void blk_pm_add_request(struct request_queue *q, static inline void blk_pm_put_request(struct request *rq) { - lockdep_assert_held(rq->q->queue_lock); + lockdep_assert_held(&rq->q->queue_lock); if (rq->q->dev && !(rq->rq_flags & RQF_PM)) --rq->q->nr_pending; diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 0005dfd568dd..d169d7188fa6 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -27,75 +27,67 @@ bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit) return atomic_inc_below(&rq_wait->inflight, limit); } -void rq_qos_cleanup(struct request_queue *q, struct bio *bio) +void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio) { - struct rq_qos *rqos; - - for (rqos = q->rq_qos; rqos; rqos = rqos->next) { + do { if (rqos->ops->cleanup) rqos->ops->cleanup(rqos, bio); - } + rqos = rqos->next; + } while (rqos); } -void rq_qos_done(struct request_queue *q, struct request *rq) +void __rq_qos_done(struct rq_qos *rqos, struct request *rq) { - struct rq_qos *rqos; - - for (rqos = q->rq_qos; rqos; rqos = rqos->next) { + do { if (rqos->ops->done) rqos->ops->done(rqos, rq); - } + rqos = rqos->next; + } while (rqos); } -void rq_qos_issue(struct request_queue *q, struct request *rq) +void __rq_qos_issue(struct rq_qos *rqos, struct request *rq) { - struct rq_qos *rqos; - - for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + do { if (rqos->ops->issue) rqos->ops->issue(rqos, rq); - } + rqos = rqos->next; + } while (rqos); } -void rq_qos_requeue(struct request_queue *q, struct request *rq) +void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq) { - struct rq_qos *rqos; - - for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + do { if (rqos->ops->requeue) rqos->ops->requeue(rqos, rq); - } + rqos = rqos->next; + } while (rqos); } -void rq_qos_throttle(struct request_queue *q, struct bio *bio, - spinlock_t *lock) +void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio) { - struct rq_qos *rqos; - - for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + do { if (rqos->ops->throttle) - rqos->ops->throttle(rqos, bio, lock); - } + rqos->ops->throttle(rqos, bio); + rqos = rqos->next; + } while (rqos); } -void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) +void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) { - struct rq_qos *rqos; - - for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + do { if (rqos->ops->track) rqos->ops->track(rqos, rq, bio); - } + rqos = rqos->next; + } while (rqos); } -void rq_qos_done_bio(struct request_queue *q, struct bio *bio) +void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio) { - struct rq_qos *rqos; - - for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + do { if (rqos->ops->done_bio) rqos->ops->done_bio(rqos, bio); - } + rqos = rqos->next; + } while (rqos); } /* @@ -184,8 +176,96 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) rq_depth_calc_max_depth(rqd); } +struct rq_qos_wait_data { + struct wait_queue_entry wq; + struct task_struct *task; + struct rq_wait *rqw; + acquire_inflight_cb_t *cb; + void *private_data; + bool got_token; +}; + +static int rq_qos_wake_function(struct wait_queue_entry *curr, + unsigned int mode, int wake_flags, void *key) +{ + struct rq_qos_wait_data *data = container_of(curr, + struct rq_qos_wait_data, + wq); + + /* + * If we fail to get a budget, return -1 to interrupt the wake up loop + * in __wake_up_common. + */ + if (!data->cb(data->rqw, data->private_data)) + return -1; + + data->got_token = true; + list_del_init(&curr->entry); + wake_up_process(data->task); + return 1; +} + +/** + * rq_qos_wait - throttle on a rqw if we need to + * @private_data - caller provided specific data + * @acquire_inflight_cb - inc the rqw->inflight counter if we can + * @cleanup_cb - the callback to cleanup in case we race with a waker + * + * This provides a uniform place for the rq_qos users to do their throttling. + * Since you can end up with a lot of things sleeping at once, this manages the + * waking up based on the resources available. The acquire_inflight_cb should + * inc the rqw->inflight if we have the ability to do so, or return false if not + * and then we will sleep until the room becomes available. + * + * cleanup_cb is in case that we race with a waker and need to cleanup the + * inflight count accordingly. + */ +void rq_qos_wait(struct rq_wait *rqw, void *private_data, + acquire_inflight_cb_t *acquire_inflight_cb, + cleanup_cb_t *cleanup_cb) +{ + struct rq_qos_wait_data data = { + .wq = { + .func = rq_qos_wake_function, + .entry = LIST_HEAD_INIT(data.wq.entry), + }, + .task = current, + .rqw = rqw, + .cb = acquire_inflight_cb, + .private_data = private_data, + }; + bool has_sleeper; + + has_sleeper = wq_has_sleeper(&rqw->wait); + if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) + return; + + prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); + do { + if (data.got_token) + break; + if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { + finish_wait(&rqw->wait, &data.wq); + + /* + * We raced with wbt_wake_function() getting a token, + * which means we now have two. Put our local token + * and wake anyone else potentially waiting for one. + */ + if (data.got_token) + cleanup_cb(rqw, private_data); + break; + } + io_schedule(); + has_sleeper = false; + } while (1); + finish_wait(&rqw->wait, &data.wq); +} + void rq_qos_exit(struct request_queue *q) { + blk_mq_debugfs_unregister_queue_rqos(q); + while (q->rq_qos) { struct rq_qos *rqos = q->rq_qos; q->rq_qos = rqos->next; diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 32b02efbfa66..564851889550 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -7,6 +7,10 @@ #include #include +#include "blk-mq-debugfs.h" + +struct blk_mq_debugfs_attr; + enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_CGROUP, @@ -22,10 +26,13 @@ struct rq_qos { struct request_queue *q; enum rq_qos_id id; struct rq_qos *next; +#ifdef CONFIG_BLK_DEBUG_FS + struct dentry *debugfs_dir; +#endif }; struct rq_qos_ops { - void (*throttle)(struct rq_qos *, struct bio *, spinlock_t *); + void (*throttle)(struct rq_qos *, struct bio *); void (*track)(struct rq_qos *, struct request *, struct bio *); void (*issue)(struct rq_qos *, struct request *); void (*requeue)(struct rq_qos *, struct request *); @@ -33,6 +40,7 @@ struct rq_qos_ops { void (*done_bio)(struct rq_qos *, struct bio *); void (*cleanup)(struct rq_qos *, struct bio *); void (*exit)(struct rq_qos *); + const struct blk_mq_debugfs_attr *debugfs_attrs; }; struct rq_depth { @@ -66,6 +74,17 @@ static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) return rq_qos_id(q, RQ_QOS_CGROUP); } +static inline const char *rq_qos_id_to_name(enum rq_qos_id id) +{ + switch (id) { + case RQ_QOS_WBT: + return "wbt"; + case RQ_QOS_CGROUP: + return "cgroup"; + } + return "unknown"; +} + static inline void rq_wait_init(struct rq_wait *rq_wait) { atomic_set(&rq_wait->inflight, 0); @@ -76,6 +95,9 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) { rqos->next = q->rq_qos; q->rq_qos = rqos; + + if (rqos->ops->debugfs_attrs) + blk_mq_debugfs_register_rqos(rqos); } static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) @@ -91,19 +113,77 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) } prev = cur; } + + blk_mq_debugfs_unregister_rqos(rqos); } +typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); +typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data); + +void rq_qos_wait(struct rq_wait *rqw, void *private_data, + acquire_inflight_cb_t *acquire_inflight_cb, + cleanup_cb_t *cleanup_cb); bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); void rq_depth_scale_up(struct rq_depth *rqd); void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); bool rq_depth_calc_max_depth(struct rq_depth *rqd); -void rq_qos_cleanup(struct request_queue *, struct bio *); -void rq_qos_done(struct request_queue *, struct request *); -void rq_qos_issue(struct request_queue *, struct request *); -void rq_qos_requeue(struct request_queue *, struct request *); -void rq_qos_done_bio(struct request_queue *q, struct bio *bio); -void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *); -void rq_qos_track(struct request_queue *q, struct request *, struct bio *); +void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio); +void __rq_qos_done(struct rq_qos *rqos, struct request *rq); +void __rq_qos_issue(struct rq_qos *rqos, struct request *rq); +void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq); +void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio); +void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio); +void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio); + +static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) +{ + if (q->rq_qos) + __rq_qos_cleanup(q->rq_qos, bio); +} + +static inline void rq_qos_done(struct request_queue *q, struct request *rq) +{ + if (q->rq_qos) + __rq_qos_done(q->rq_qos, rq); +} + +static inline void rq_qos_issue(struct request_queue *q, struct request *rq) +{ + if (q->rq_qos) + __rq_qos_issue(q->rq_qos, rq); +} + +static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) +{ + if (q->rq_qos) + __rq_qos_requeue(q->rq_qos, rq); +} + +static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio) +{ + if (q->rq_qos) + __rq_qos_done_bio(q->rq_qos, bio); +} + +static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) +{ + /* + * BIO_TRACKED lets controllers know that a bio went through the + * normal rq_qos path. + */ + bio_set_flag(bio, BIO_TRACKED); + if (q->rq_qos) + __rq_qos_throttle(q->rq_qos, bio); +} + +static inline void rq_qos_track(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + if (q->rq_qos) + __rq_qos_track(q->rq_qos, rq, bio); +} + void rq_qos_exit(struct request_queue *); + #endif diff --git a/block/blk-settings.c b/block/blk-settings.c index 696c04c1ab6c..3abe831e92c8 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -20,65 +20,12 @@ EXPORT_SYMBOL(blk_max_low_pfn); unsigned long blk_max_pfn; -/** - * blk_queue_prep_rq - set a prepare_request function for queue - * @q: queue - * @pfn: prepare_request function - * - * It's possible for a queue to register a prepare_request callback which - * is invoked before the request is handed to the request_fn. The goal of - * the function is to prepare a request for I/O, it can be used to build a - * cdb from the request data for instance. - * - */ -void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) -{ - q->prep_rq_fn = pfn; -} -EXPORT_SYMBOL(blk_queue_prep_rq); - -/** - * blk_queue_unprep_rq - set an unprepare_request function for queue - * @q: queue - * @ufn: unprepare_request function - * - * It's possible for a queue to register an unprepare_request callback - * which is invoked before the request is finally completed. The goal - * of the function is to deallocate any data that was allocated in the - * prepare_request callback. - * - */ -void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn) -{ - q->unprep_rq_fn = ufn; -} -EXPORT_SYMBOL(blk_queue_unprep_rq); - -void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) -{ - q->softirq_done_fn = fn; -} -EXPORT_SYMBOL(blk_queue_softirq_done); - void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) { q->rq_timeout = timeout; } EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); -void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn) -{ - WARN_ON_ONCE(q->mq_ops); - q->rq_timed_out_fn = fn; -} -EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out); - -void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn) -{ - q->lld_busy_fn = fn; -} -EXPORT_SYMBOL_GPL(blk_queue_lld_busy); - /** * blk_set_default_limits - reset limits to default values * @lim: the queue_limits structure to reset @@ -169,8 +116,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) q->make_request_fn = mfn; blk_queue_dma_alignment(q, 511); - blk_queue_congestion_threshold(q); - q->nr_batching = BLK_BATCH_REQ; blk_set_default_limits(&q->limits); } @@ -889,16 +834,14 @@ EXPORT_SYMBOL(blk_set_queue_depth); */ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) { - spin_lock_irq(q->queue_lock); if (wc) - queue_flag_set(QUEUE_FLAG_WC, q); + blk_queue_flag_set(QUEUE_FLAG_WC, q); else - queue_flag_clear(QUEUE_FLAG_WC, q); + blk_queue_flag_clear(QUEUE_FLAG_WC, q); if (fua) - queue_flag_set(QUEUE_FLAG_FUA, q); + blk_queue_flag_set(QUEUE_FLAG_FUA, q); else - queue_flag_clear(QUEUE_FLAG_FUA, q); - spin_unlock_irq(q->queue_lock); + blk_queue_flag_clear(QUEUE_FLAG_FUA, q); wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); } diff --git a/block/blk-softirq.c b/block/blk-softirq.c index e47a2f751884..457d9ba3eb20 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -34,7 +34,7 @@ static __latent_entropy void blk_done_softirq(struct softirq_action *h) rq = list_entry(local_list.next, struct request, ipi_list); list_del_init(&rq->ipi_list); - rq->q->softirq_done_fn(rq); + rq->q->mq_ops->complete(rq); } } @@ -98,11 +98,11 @@ static int blk_softirq_cpu_dead(unsigned int cpu) void __blk_complete_request(struct request *req) { struct request_queue *q = req->q; - int cpu, ccpu = q->mq_ops ? req->mq_ctx->cpu : req->cpu; + int cpu, ccpu = req->mq_ctx->cpu; unsigned long flags; bool shared = false; - BUG_ON(!q->softirq_done_fn); + BUG_ON(!q->mq_ops->complete); local_irq_save(flags); cpu = smp_processor_id(); @@ -143,27 +143,6 @@ do_local: local_irq_restore(flags); } -EXPORT_SYMBOL(__blk_complete_request); - -/** - * blk_complete_request - end I/O on a request - * @req: the request being processed - * - * Description: - * Ends all I/O on a request. It does not handle partial completions, - * unless the driver actually implements this in its completion callback - * through requeueing. The actual completion happens out-of-order, - * through a softirq handler. The user must have registered a completion - * callback through blk_queue_softirq_done(). - **/ -void blk_complete_request(struct request *req) -{ - if (unlikely(blk_should_fake_timeout(req->q))) - return; - if (!blk_mark_rq_complete(req)) - __blk_complete_request(req); -} -EXPORT_SYMBOL(blk_complete_request); static __init int blk_softirq_init(void) { diff --git a/block/blk-stat.c b/block/blk-stat.c index 90561af85a62..696a04176e4d 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -130,7 +130,6 @@ blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), return cb; } -EXPORT_SYMBOL_GPL(blk_stat_alloc_callback); void blk_stat_add_callback(struct request_queue *q, struct blk_stat_callback *cb) @@ -151,7 +150,6 @@ void blk_stat_add_callback(struct request_queue *q, blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock(&q->stats->lock); } -EXPORT_SYMBOL_GPL(blk_stat_add_callback); void blk_stat_remove_callback(struct request_queue *q, struct blk_stat_callback *cb) @@ -164,7 +162,6 @@ void blk_stat_remove_callback(struct request_queue *q, del_timer_sync(&cb->timer); } -EXPORT_SYMBOL_GPL(blk_stat_remove_callback); static void blk_stat_free_callback_rcu(struct rcu_head *head) { @@ -181,7 +178,6 @@ void blk_stat_free_callback(struct blk_stat_callback *cb) if (cb) call_rcu(&cb->rcu, blk_stat_free_callback_rcu); } -EXPORT_SYMBOL_GPL(blk_stat_free_callback); void blk_stat_enable_accounting(struct request_queue *q) { diff --git a/block/blk-stat.h b/block/blk-stat.h index f4a1568e81a4..17b47a86eefb 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -145,6 +145,11 @@ static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb, mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs)); } +static inline void blk_stat_deactivate(struct blk_stat_callback *cb) +{ + del_timer_sync(&cb->timer); +} + /** * blk_stat_activate_msecs() - Gather block statistics during a time window in * milliseconds. diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 844a454a7b3a..0619c8922893 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -68,7 +68,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) unsigned long nr; int ret, err; - if (!q->request_fn && !q->mq_ops) + if (!queue_is_mq(q)) return -EINVAL; ret = queue_var_store(&nr, page, count); @@ -78,11 +78,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; - if (q->request_fn) - err = blk_update_nr_requests(q, nr); - else - err = blk_mq_update_nr_requests(q, nr); - + err = blk_mq_update_nr_requests(q, nr); if (err) return err; @@ -242,10 +238,10 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) return -EINVAL; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); return ret; } @@ -320,14 +316,12 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, if (ret < 0) return ret; - spin_lock_irq(q->queue_lock); - queue_flag_clear(QUEUE_FLAG_NOMERGES, q); - queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); + blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); if (nm == 2) - queue_flag_set(QUEUE_FLAG_NOMERGES, q); + blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); else if (nm) - queue_flag_set(QUEUE_FLAG_NOXMERGES, q); - spin_unlock_irq(q->queue_lock); + blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); return ret; } @@ -351,18 +345,16 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) if (ret < 0) return ret; - spin_lock_irq(q->queue_lock); if (val == 2) { - queue_flag_set(QUEUE_FLAG_SAME_COMP, q); - queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); + blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); + blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); } else if (val == 1) { - queue_flag_set(QUEUE_FLAG_SAME_COMP, q); - queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); + blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); + blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } else if (val == 0) { - queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); - queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); + blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); + blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } - spin_unlock_irq(q->queue_lock); #endif return ret; } @@ -410,7 +402,8 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, unsigned long poll_on; ssize_t ret; - if (!q->mq_ops || !q->mq_ops->poll) + if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL || + !q->tag_set->map[HCTX_TYPE_POLL].nr_queues) return -EINVAL; ret = queue_var_store(&poll_on, page, count); @@ -425,6 +418,26 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, return ret; } +static ssize_t queue_io_timeout_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%u\n", jiffies_to_msecs(q->rq_timeout)); +} + +static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned int val; + int err; + + err = kstrtou32(page, 10, &val); + if (err || val == 0) + return -EINVAL; + + blk_queue_rq_timeout(q, msecs_to_jiffies(val)); + + return count; +} + static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) { if (!wbt_rq_qos(q)) @@ -463,20 +476,14 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, * ends up either enabling or disabling wbt completely. We can't * have IO inflight if that happens. */ - if (q->mq_ops) { - blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - } else - blk_queue_bypass_start(q); + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); wbt_set_min_lat(q, val); wbt_update_limits(q); - if (q->mq_ops) { - blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); - } else - blk_queue_bypass_end(q); + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); return count; } @@ -699,6 +706,12 @@ static struct queue_sysfs_entry queue_dax_entry = { .show = queue_dax_show, }; +static struct queue_sysfs_entry queue_io_timeout_entry = { + .attr = {.name = "io_timeout", .mode = 0644 }, + .show = queue_io_timeout_show, + .store = queue_io_timeout_store, +}; + static struct queue_sysfs_entry queue_wb_lat_entry = { .attr = {.name = "wbt_lat_usec", .mode = 0644 }, .show = queue_wb_lat_show, @@ -748,6 +761,7 @@ static struct attribute *default_attrs[] = { &queue_dax_entry.attr, &queue_wb_lat_entry.attr, &queue_poll_delay_entry.attr, + &queue_io_timeout_entry.attr, #ifdef CONFIG_BLK_DEV_THROTTLING_LOW &throtl_sample_time_entry.attr, #endif @@ -847,24 +861,14 @@ static void __blk_release_queue(struct work_struct *work) blk_free_queue_stats(q->stats); - blk_exit_rl(q, &q->root_rl); - - if (q->queue_tags) - __blk_queue_free_tags(q); - blk_queue_free_zone_bitmaps(q); - if (!q->mq_ops) { - if (q->exit_rq_fn) - q->exit_rq_fn(q, q->fq->flush_rq); - blk_free_flush_queue(q->fq); - } else { + if (queue_is_mq(q)) blk_mq_release(q); - } blk_trace_shutdown(q); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_debugfs_unregister(q); bioset_exit(&q->bio_split); @@ -909,7 +913,7 @@ int blk_register_queue(struct gendisk *disk) WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags), "%s is registering an already registered queue\n", kobject_name(&dev->kobj)); - queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q); + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); /* * SCSI probing may synchronously create and destroy a lot of @@ -921,9 +925,8 @@ int blk_register_queue(struct gendisk *disk) * request_queues for non-existent devices never get registered. */ if (!blk_queue_init_done(q)) { - queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); + blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); percpu_ref_switch_to_percpu(&q->q_usage_counter); - blk_queue_bypass_end(q); } ret = blk_trace_init_sysfs(dev); @@ -939,7 +942,7 @@ int blk_register_queue(struct gendisk *disk) goto unlock; } - if (q->mq_ops) { + if (queue_is_mq(q)) { __blk_mq_register_dev(dev, q); blk_mq_debugfs_register(q); } @@ -950,7 +953,7 @@ int blk_register_queue(struct gendisk *disk) blk_throtl_register_queue(q); - if (q->request_fn || (q->mq_ops && q->elevator)) { + if (q->elevator) { ret = elv_register_queue(q); if (ret) { mutex_unlock(&q->sysfs_lock); @@ -999,7 +1002,7 @@ void blk_unregister_queue(struct gendisk *disk) * Remove the sysfs attributes before unregistering the queue data * structures that can be modified through sysfs. */ - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_unregister_dev(disk_to_dev(disk), q); mutex_unlock(&q->sysfs_lock); @@ -1008,7 +1011,7 @@ void blk_unregister_queue(struct gendisk *disk) blk_trace_remove_sysfs(disk_to_dev(disk)); mutex_lock(&q->sysfs_lock); - if (q->request_fn || (q->mq_ops && q->elevator)) + if (q->elevator) elv_unregister_queue(q); mutex_unlock(&q->sysfs_lock); diff --git a/block/blk-tag.c b/block/blk-tag.c deleted file mode 100644 index fbc153aef166..000000000000 --- a/block/blk-tag.c +++ /dev/null @@ -1,378 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Functions related to tagged command queuing - */ -#include -#include -#include -#include -#include - -#include "blk.h" - -/** - * blk_queue_find_tag - find a request by its tag and queue - * @q: The request queue for the device - * @tag: The tag of the request - * - * Notes: - * Should be used when a device returns a tag and you want to match - * it with a request. - * - * no locks need be held. - **/ -struct request *blk_queue_find_tag(struct request_queue *q, int tag) -{ - return blk_map_queue_find_tag(q->queue_tags, tag); -} -EXPORT_SYMBOL(blk_queue_find_tag); - -/** - * blk_free_tags - release a given set of tag maintenance info - * @bqt: the tag map to free - * - * Drop the reference count on @bqt and frees it when the last reference - * is dropped. - */ -void blk_free_tags(struct blk_queue_tag *bqt) -{ - if (atomic_dec_and_test(&bqt->refcnt)) { - BUG_ON(find_first_bit(bqt->tag_map, bqt->max_depth) < - bqt->max_depth); - - kfree(bqt->tag_index); - bqt->tag_index = NULL; - - kfree(bqt->tag_map); - bqt->tag_map = NULL; - - kfree(bqt); - } -} -EXPORT_SYMBOL(blk_free_tags); - -/** - * __blk_queue_free_tags - release tag maintenance info - * @q: the request queue for the device - * - * Notes: - * blk_cleanup_queue() will take care of calling this function, if tagging - * has been used. So there's no need to call this directly. - **/ -void __blk_queue_free_tags(struct request_queue *q) -{ - struct blk_queue_tag *bqt = q->queue_tags; - - if (!bqt) - return; - - blk_free_tags(bqt); - - q->queue_tags = NULL; - queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q); -} - -/** - * blk_queue_free_tags - release tag maintenance info - * @q: the request queue for the device - * - * Notes: - * This is used to disable tagged queuing to a device, yet leave - * queue in function. - **/ -void blk_queue_free_tags(struct request_queue *q) -{ - queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q); -} -EXPORT_SYMBOL(blk_queue_free_tags); - -static int -init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth) -{ - struct request **tag_index; - unsigned long *tag_map; - int nr_ulongs; - - if (q && depth > q->nr_requests * 2) { - depth = q->nr_requests * 2; - printk(KERN_ERR "%s: adjusted depth to %d\n", - __func__, depth); - } - - tag_index = kcalloc(depth, sizeof(struct request *), GFP_ATOMIC); - if (!tag_index) - goto fail; - - nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; - tag_map = kcalloc(nr_ulongs, sizeof(unsigned long), GFP_ATOMIC); - if (!tag_map) - goto fail; - - tags->real_max_depth = depth; - tags->max_depth = depth; - tags->tag_index = tag_index; - tags->tag_map = tag_map; - - return 0; -fail: - kfree(tag_index); - return -ENOMEM; -} - -static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, - int depth, int alloc_policy) -{ - struct blk_queue_tag *tags; - - tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); - if (!tags) - goto fail; - - if (init_tag_map(q, tags, depth)) - goto fail; - - atomic_set(&tags->refcnt, 1); - tags->alloc_policy = alloc_policy; - tags->next_tag = 0; - return tags; -fail: - kfree(tags); - return NULL; -} - -/** - * blk_init_tags - initialize the tag info for an external tag map - * @depth: the maximum queue depth supported - * @alloc_policy: tag allocation policy - **/ -struct blk_queue_tag *blk_init_tags(int depth, int alloc_policy) -{ - return __blk_queue_init_tags(NULL, depth, alloc_policy); -} -EXPORT_SYMBOL(blk_init_tags); - -/** - * blk_queue_init_tags - initialize the queue tag info - * @q: the request queue for the device - * @depth: the maximum queue depth supported - * @tags: the tag to use - * @alloc_policy: tag allocation policy - * - * Queue lock must be held here if the function is called to resize an - * existing map. - **/ -int blk_queue_init_tags(struct request_queue *q, int depth, - struct blk_queue_tag *tags, int alloc_policy) -{ - int rc; - - BUG_ON(tags && q->queue_tags && tags != q->queue_tags); - - if (!tags && !q->queue_tags) { - tags = __blk_queue_init_tags(q, depth, alloc_policy); - - if (!tags) - return -ENOMEM; - - } else if (q->queue_tags) { - rc = blk_queue_resize_tags(q, depth); - if (rc) - return rc; - queue_flag_set(QUEUE_FLAG_QUEUED, q); - return 0; - } else - atomic_inc(&tags->refcnt); - - /* - * assign it, all done - */ - q->queue_tags = tags; - queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q); - return 0; -} -EXPORT_SYMBOL(blk_queue_init_tags); - -/** - * blk_queue_resize_tags - change the queueing depth - * @q: the request queue for the device - * @new_depth: the new max command queueing depth - * - * Notes: - * Must be called with the queue lock held. - **/ -int blk_queue_resize_tags(struct request_queue *q, int new_depth) -{ - struct blk_queue_tag *bqt = q->queue_tags; - struct request **tag_index; - unsigned long *tag_map; - int max_depth, nr_ulongs; - - if (!bqt) - return -ENXIO; - - /* - * if we already have large enough real_max_depth. just - * adjust max_depth. *NOTE* as requests with tag value - * between new_depth and real_max_depth can be in-flight, tag - * map can not be shrunk blindly here. - */ - if (new_depth <= bqt->real_max_depth) { - bqt->max_depth = new_depth; - return 0; - } - - /* - * Currently cannot replace a shared tag map with a new - * one, so error out if this is the case - */ - if (atomic_read(&bqt->refcnt) != 1) - return -EBUSY; - - /* - * save the old state info, so we can copy it back - */ - tag_index = bqt->tag_index; - tag_map = bqt->tag_map; - max_depth = bqt->real_max_depth; - - if (init_tag_map(q, bqt, new_depth)) - return -ENOMEM; - - memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); - nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; - memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); - - kfree(tag_index); - kfree(tag_map); - return 0; -} -EXPORT_SYMBOL(blk_queue_resize_tags); - -/** - * blk_queue_end_tag - end tag operations for a request - * @q: the request queue for the device - * @rq: the request that has completed - * - * Description: - * Typically called when end_that_request_first() returns %0, meaning - * all transfers have been done for a request. It's important to call - * this function before end_that_request_last(), as that will put the - * request back on the free list thus corrupting the internal tag list. - **/ -void blk_queue_end_tag(struct request_queue *q, struct request *rq) -{ - struct blk_queue_tag *bqt = q->queue_tags; - unsigned tag = rq->tag; /* negative tags invalid */ - - lockdep_assert_held(q->queue_lock); - - BUG_ON(tag >= bqt->real_max_depth); - - list_del_init(&rq->queuelist); - rq->rq_flags &= ~RQF_QUEUED; - rq->tag = -1; - rq->internal_tag = -1; - - if (unlikely(bqt->tag_index[tag] == NULL)) - printk(KERN_ERR "%s: tag %d is missing\n", - __func__, tag); - - bqt->tag_index[tag] = NULL; - - if (unlikely(!test_bit(tag, bqt->tag_map))) { - printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", - __func__, tag); - return; - } - /* - * The tag_map bit acts as a lock for tag_index[bit], so we need - * unlock memory barrier semantics. - */ - clear_bit_unlock(tag, bqt->tag_map); -} - -/** - * blk_queue_start_tag - find a free tag and assign it - * @q: the request queue for the device - * @rq: the block request that needs tagging - * - * Description: - * This can either be used as a stand-alone helper, or possibly be - * assigned as the queue &prep_rq_fn (in which case &struct request - * automagically gets a tag assigned). Note that this function - * assumes that any type of request can be queued! if this is not - * true for your device, you must check the request type before - * calling this function. The request will also be removed from - * the request queue, so it's the drivers responsibility to readd - * it if it should need to be restarted for some reason. - **/ -int blk_queue_start_tag(struct request_queue *q, struct request *rq) -{ - struct blk_queue_tag *bqt = q->queue_tags; - unsigned max_depth; - int tag; - - lockdep_assert_held(q->queue_lock); - - if (unlikely((rq->rq_flags & RQF_QUEUED))) { - printk(KERN_ERR - "%s: request %p for device [%s] already tagged %d", - __func__, rq, - rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); - BUG(); - } - - /* - * Protect against shared tag maps, as we may not have exclusive - * access to the tag map. - * - * We reserve a few tags just for sync IO, since we don't want - * to starve sync IO on behalf of flooding async IO. - */ - max_depth = bqt->max_depth; - if (!rq_is_sync(rq) && max_depth > 1) { - switch (max_depth) { - case 2: - max_depth = 1; - break; - case 3: - max_depth = 2; - break; - default: - max_depth -= 2; - } - if (q->in_flight[BLK_RW_ASYNC] > max_depth) - return 1; - } - - do { - if (bqt->alloc_policy == BLK_TAG_ALLOC_FIFO) { - tag = find_first_zero_bit(bqt->tag_map, max_depth); - if (tag >= max_depth) - return 1; - } else { - int start = bqt->next_tag; - int size = min_t(int, bqt->max_depth, max_depth + start); - tag = find_next_zero_bit(bqt->tag_map, size, start); - if (tag >= size && start + size > bqt->max_depth) { - size = start + size - bqt->max_depth; - tag = find_first_zero_bit(bqt->tag_map, size); - } - if (tag >= size) - return 1; - } - - } while (test_and_set_bit_lock(tag, bqt->tag_map)); - /* - * We need lock ordering semantics given by test_and_set_bit_lock. - * See blk_queue_end_tag for details. - */ - - bqt->next_tag = (tag + 1) % bqt->max_depth; - rq->rq_flags |= RQF_QUEUED; - rq->tag = tag; - bqt->tag_index[tag] = rq; - blk_start_request(rq); - return 0; -} -EXPORT_SYMBOL(blk_queue_start_tag); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index db1a3a2ae006..1b97a73d2fb1 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1243,7 +1243,7 @@ static void throtl_pending_timer_fn(struct timer_list *t) bool dispatched; int ret; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); if (throtl_can_upgrade(td, NULL)) throtl_upgrade_state(td); @@ -1266,9 +1266,9 @@ again: break; /* this dispatch windows is still open, relax and repeat */ - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); cpu_relax(); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); } if (!dispatched) @@ -1290,7 +1290,7 @@ again: queue_work(kthrotld_workqueue, &td->dispatch_work); } out_unlock: - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); } /** @@ -1314,11 +1314,11 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) bio_list_init(&bio_list_on_stack); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); for (rw = READ; rw <= WRITE; rw++) while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) bio_list_add(&bio_list_on_stack, bio); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); @@ -2115,16 +2115,6 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif -static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) -{ -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - /* fallback to root_blkg if we fail to get a blkg ref */ - if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV)) - bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg); - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); -#endif -} - bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct bio *bio) { @@ -2141,14 +2131,10 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw]) goto out; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); throtl_update_latency_buckets(td); - if (unlikely(blk_queue_bypass(q))) - goto out_unlock; - - blk_throtl_assoc_bio(tg, bio); blk_throtl_update_idletime(tg); sq = &tg->service_queue; @@ -2227,7 +2213,7 @@ again: } out_unlock: - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); out: bio_set_flag(bio, BIO_THROTTLED); @@ -2348,7 +2334,7 @@ static void tg_drain_bios(struct throtl_service_queue *parent_sq) * Dispatch all currently throttled bios on @q through ->make_request_fn(). */ void blk_throtl_drain(struct request_queue *q) - __releases(q->queue_lock) __acquires(q->queue_lock) + __releases(&q->queue_lock) __acquires(&q->queue_lock) { struct throtl_data *td = q->td; struct blkcg_gq *blkg; @@ -2356,7 +2342,6 @@ void blk_throtl_drain(struct request_queue *q) struct bio *bio; int rw; - queue_lockdep_assert_held(q); rcu_read_lock(); /* @@ -2372,7 +2357,7 @@ void blk_throtl_drain(struct request_queue *q) tg_drain_bios(&td->service_queue); rcu_read_unlock(); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); /* all bios now should be in td->service_queue, issue them */ for (rw = READ; rw <= WRITE; rw++) @@ -2380,7 +2365,7 @@ void blk_throtl_drain(struct request_queue *q) NULL))) generic_make_request(bio); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); } int blk_throtl_init(struct request_queue *q) @@ -2460,7 +2445,7 @@ void blk_throtl_register_queue(struct request_queue *q) td->throtl_slice = DFL_THROTL_SLICE_HD; #endif - td->track_bio_latency = !queue_is_rq_based(q); + td->track_bio_latency = !queue_is_mq(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index f2cfd56e1606..124c26128bf6 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -68,80 +68,6 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, #endif /* CONFIG_FAIL_IO_TIMEOUT */ -/* - * blk_delete_timer - Delete/cancel timer for a given function. - * @req: request that we are canceling timer for - * - */ -void blk_delete_timer(struct request *req) -{ - list_del_init(&req->timeout_list); -} - -static void blk_rq_timed_out(struct request *req) -{ - struct request_queue *q = req->q; - enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; - - if (q->rq_timed_out_fn) - ret = q->rq_timed_out_fn(req); - switch (ret) { - case BLK_EH_RESET_TIMER: - blk_add_timer(req); - blk_clear_rq_complete(req); - break; - case BLK_EH_DONE: - /* - * LLD handles this for now but in the future - * we can send a request msg to abort the command - * and we can move more of the generic scsi eh code to - * the blk layer. - */ - break; - default: - printk(KERN_ERR "block: bad eh return: %d\n", ret); - break; - } -} - -static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, - unsigned int *next_set) -{ - const unsigned long deadline = blk_rq_deadline(rq); - - if (time_after_eq(jiffies, deadline)) { - list_del_init(&rq->timeout_list); - - /* - * Check if we raced with end io completion - */ - if (!blk_mark_rq_complete(rq)) - blk_rq_timed_out(rq); - } else if (!*next_set || time_after(*next_timeout, deadline)) { - *next_timeout = deadline; - *next_set = 1; - } -} - -void blk_timeout_work(struct work_struct *work) -{ - struct request_queue *q = - container_of(work, struct request_queue, timeout_work); - unsigned long flags, next = 0; - struct request *rq, *tmp; - int next_set = 0; - - spin_lock_irqsave(q->queue_lock, flags); - - list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) - blk_rq_check_expired(rq, &next, &next_set); - - if (next_set) - mod_timer(&q->timeout, round_jiffies_up(next)); - - spin_unlock_irqrestore(q->queue_lock, flags); -} - /** * blk_abort_request -- Request request recovery for the specified command * @req: pointer to the request of interest @@ -149,24 +75,17 @@ void blk_timeout_work(struct work_struct *work) * This function requests that the block layer start recovery for the * request by deleting the timer and calling the q's timeout function. * LLDDs who implement their own error recovery MAY ignore the timeout - * event if they generated blk_abort_req. Must hold queue lock. + * event if they generated blk_abort_request. */ void blk_abort_request(struct request *req) { - if (req->q->mq_ops) { - /* - * All we need to ensure is that timeout scan takes place - * immediately and that scan sees the new timeout value. - * No need for fancy synchronizations. - */ - blk_rq_set_deadline(req, jiffies); - kblockd_schedule_work(&req->q->timeout_work); - } else { - if (blk_mark_rq_complete(req)) - return; - blk_delete_timer(req); - blk_rq_timed_out(req); - } + /* + * All we need to ensure is that timeout scan takes place + * immediately and that scan sees the new timeout value. + * No need for fancy synchronizations. + */ + WRITE_ONCE(req->deadline, jiffies); + kblockd_schedule_work(&req->q->timeout_work); } EXPORT_SYMBOL_GPL(blk_abort_request); @@ -194,15 +113,6 @@ void blk_add_timer(struct request *req) struct request_queue *q = req->q; unsigned long expiry; - if (!q->mq_ops) - lockdep_assert_held(q->queue_lock); - - /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */ - if (!q->mq_ops && !q->rq_timed_out_fn) - return; - - BUG_ON(!list_empty(&req->timeout_list)); - /* * Some LLDs, like scsi, peek at the timeout to prevent a * command from being retried forever. @@ -211,21 +121,16 @@ void blk_add_timer(struct request *req) req->timeout = q->rq_timeout; req->rq_flags &= ~RQF_TIMED_OUT; - blk_rq_set_deadline(req, jiffies + req->timeout); - /* - * Only the non-mq case needs to add the request to a protected list. - * For the mq case we simply scan the tag map. - */ - if (!q->mq_ops) - list_add_tail(&req->timeout_list, &req->q->timeout_list); + expiry = jiffies + req->timeout; + WRITE_ONCE(req->deadline, expiry); /* * If the timer isn't already pending or this timeout is earlier * than an existing one, modify the timer. Round up to next nearest * second. */ - expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req))); + expiry = blk_rq_timeout(round_jiffies_up(expiry)); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 8ac93fcbaa2e..f0c56649775f 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -489,31 +489,21 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) } struct wbt_wait_data { - struct wait_queue_entry wq; - struct task_struct *task; struct rq_wb *rwb; - struct rq_wait *rqw; + enum wbt_flags wb_acct; unsigned long rw; - bool got_token; }; -static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode, - int wake_flags, void *key) +static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data) { - struct wbt_wait_data *data = container_of(curr, struct wbt_wait_data, - wq); + struct wbt_wait_data *data = private_data; + return rq_wait_inc_below(rqw, get_limit(data->rwb, data->rw)); +} - /* - * If we fail to get a budget, return -1 to interrupt the wake up - * loop in __wake_up_common. - */ - if (!rq_wait_inc_below(data->rqw, get_limit(data->rwb, data->rw))) - return -1; - - data->got_token = true; - list_del_init(&curr->entry); - wake_up_process(data->task); - return 1; +static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data) +{ + struct wbt_wait_data *data = private_data; + wbt_rqw_done(data->rwb, rqw, data->wb_acct); } /* @@ -521,57 +511,16 @@ static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode, * the timer to kick off queuing again. */ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, - unsigned long rw, spinlock_t *lock) - __releases(lock) - __acquires(lock) + unsigned long rw) { struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); struct wbt_wait_data data = { - .wq = { - .func = wbt_wake_function, - .entry = LIST_HEAD_INIT(data.wq.entry), - }, - .task = current, .rwb = rwb, - .rqw = rqw, + .wb_acct = wb_acct, .rw = rw, }; - bool has_sleeper; - has_sleeper = wq_has_sleeper(&rqw->wait); - if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw))) - return; - - prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); - do { - if (data.got_token) - break; - - if (!has_sleeper && - rq_wait_inc_below(rqw, get_limit(rwb, rw))) { - finish_wait(&rqw->wait, &data.wq); - - /* - * We raced with wbt_wake_function() getting a token, - * which means we now have two. Put our local token - * and wake anyone else potentially waiting for one. - */ - if (data.got_token) - wbt_rqw_done(rwb, rqw, wb_acct); - break; - } - - if (lock) { - spin_unlock_irq(lock); - io_schedule(); - spin_lock_irq(lock); - } else - io_schedule(); - - has_sleeper = false; - } while (1); - - finish_wait(&rqw->wait, &data.wq); + rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb); } static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) @@ -624,7 +573,7 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) * in an irq held spinlock, if it holds one when calling this function. * If we do sleep, we'll release and re-grab it. */ -static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) +static void wbt_wait(struct rq_qos *rqos, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); enum wbt_flags flags; @@ -636,7 +585,7 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) return; } - __wbt_wait(rwb, flags, bio->bi_opf, lock); + __wbt_wait(rwb, flags, bio->bi_opf); if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); @@ -709,8 +658,7 @@ void wbt_enable_default(struct request_queue *q) if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) return; - if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) || - (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ))) + if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ)) wbt_init(q); } EXPORT_SYMBOL_GPL(wbt_enable_default); @@ -760,11 +708,100 @@ void wbt_disable_default(struct request_queue *q) if (!rqos) return; rwb = RQWB(rqos); - if (rwb->enable_state == WBT_STATE_ON_DEFAULT) + if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { + blk_stat_deactivate(rwb->cb); rwb->wb_normal = 0; + } } EXPORT_SYMBOL_GPL(wbt_disable_default); +#ifdef CONFIG_BLK_DEBUG_FS +static int wbt_curr_win_nsec_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%llu\n", rwb->cur_win_nsec); + return 0; +} + +static int wbt_enabled_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%d\n", rwb->enable_state); + return 0; +} + +static int wbt_id_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + + seq_printf(m, "%u\n", rqos->id); + return 0; +} + +static int wbt_inflight_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + int i; + + for (i = 0; i < WBT_NUM_RWQ; i++) + seq_printf(m, "%d: inflight %d\n", i, + atomic_read(&rwb->rq_wait[i].inflight)); + return 0; +} + +static int wbt_min_lat_nsec_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%lu\n", rwb->min_lat_nsec); + return 0; +} + +static int wbt_unknown_cnt_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%u\n", rwb->unknown_cnt); + return 0; +} + +static int wbt_normal_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%u\n", rwb->wb_normal); + return 0; +} + +static int wbt_background_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%u\n", rwb->wb_background); + return 0; +} + +static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = { + {"curr_win_nsec", 0400, wbt_curr_win_nsec_show}, + {"enabled", 0400, wbt_enabled_show}, + {"id", 0400, wbt_id_show}, + {"inflight", 0400, wbt_inflight_show}, + {"min_lat_nsec", 0400, wbt_min_lat_nsec_show}, + {"unknown_cnt", 0400, wbt_unknown_cnt_show}, + {"wb_normal", 0400, wbt_normal_show}, + {"wb_background", 0400, wbt_background_show}, + {}, +}; +#endif static struct rq_qos_ops wbt_rqos_ops = { .throttle = wbt_wait, @@ -774,6 +811,9 @@ static struct rq_qos_ops wbt_rqos_ops = { .done = wbt_done, .cleanup = wbt_cleanup, .exit = wbt_exit, +#ifdef CONFIG_BLK_DEBUG_FS + .debugfs_attrs = wbt_debugfs_attrs, +#endif }; int wbt_init(struct request_queue *q) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index a327bef07642..2d98803faec2 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -421,7 +421,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) * BIO based queues do not use a scheduler so only q->nr_zones * needs to be updated so that the sysfs exposed value is correct. */ - if (!queue_is_rq_based(q)) { + if (!queue_is_mq(q)) { q->nr_zones = nr_zones; return 0; } diff --git a/block/blk.h b/block/blk.h index 0089fefdf771..848278c52030 100644 --- a/block/blk.h +++ b/block/blk.h @@ -7,12 +7,6 @@ #include #include "blk-mq.h" -/* Amount of time in which a process may batch requests */ -#define BLK_BATCH_TIME (HZ/50UL) - -/* Number of requests a "batching" process may submit */ -#define BLK_BATCH_REQ 32 - /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) @@ -38,85 +32,13 @@ struct blk_flush_queue { }; extern struct kmem_cache *blk_requestq_cachep; -extern struct kmem_cache *request_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; -/* - * @q->queue_lock is set while a queue is being initialized. Since we know - * that no other threads access the queue object before @q->queue_lock has - * been set, it is safe to manipulate queue flags without holding the - * queue_lock if @q->queue_lock == NULL. See also blk_alloc_queue_node() and - * blk_init_allocated_queue(). - */ -static inline void queue_lockdep_assert_held(struct request_queue *q) +static inline struct blk_flush_queue * +blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) { - if (q->queue_lock) - lockdep_assert_held(q->queue_lock); -} - -static inline void queue_flag_set_unlocked(unsigned int flag, - struct request_queue *q) -{ - if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) && - kref_read(&q->kobj.kref)) - lockdep_assert_held(q->queue_lock); - __set_bit(flag, &q->queue_flags); -} - -static inline void queue_flag_clear_unlocked(unsigned int flag, - struct request_queue *q) -{ - if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) && - kref_read(&q->kobj.kref)) - lockdep_assert_held(q->queue_lock); - __clear_bit(flag, &q->queue_flags); -} - -static inline int queue_flag_test_and_clear(unsigned int flag, - struct request_queue *q) -{ - queue_lockdep_assert_held(q); - - if (test_bit(flag, &q->queue_flags)) { - __clear_bit(flag, &q->queue_flags); - return 1; - } - - return 0; -} - -static inline int queue_flag_test_and_set(unsigned int flag, - struct request_queue *q) -{ - queue_lockdep_assert_held(q); - - if (!test_bit(flag, &q->queue_flags)) { - __set_bit(flag, &q->queue_flags); - return 0; - } - - return 1; -} - -static inline void queue_flag_set(unsigned int flag, struct request_queue *q) -{ - queue_lockdep_assert_held(q); - __set_bit(flag, &q->queue_flags); -} - -static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) -{ - queue_lockdep_assert_held(q); - __clear_bit(flag, &q->queue_flags); -} - -static inline struct blk_flush_queue *blk_get_flush_queue( - struct request_queue *q, struct blk_mq_ctx *ctx) -{ - if (q->mq_ops) - return blk_mq_map_queue(q, ctx->cpu)->fq; - return q->fq; + return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx->cpu)->fq; } static inline void __blk_get_queue(struct request_queue *q) @@ -128,15 +50,9 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, int node, int cmd_size, gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); -int blk_init_rl(struct request_list *rl, struct request_queue *q, - gfp_t gfp_mask); -void blk_exit_rl(struct request_queue *q, struct request_list *rl); void blk_exit_queue(struct request_queue *q); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); -void blk_queue_bypass_start(struct request_queue *q); -void blk_queue_bypass_end(struct request_queue *q); -void __blk_queue_free_tags(struct request_queue *q); void blk_freeze_queue(struct request_queue *q); static inline void blk_queue_enter_live(struct request_queue *q) @@ -235,11 +151,8 @@ static inline bool bio_integrity_endio(struct bio *bio) } #endif /* CONFIG_BLK_DEV_INTEGRITY */ -void blk_timeout_work(struct work_struct *work); unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); -void blk_delete_timer(struct request *); - bool bio_attempt_front_merge(struct request_queue *q, struct request *req, struct bio *bio); @@ -248,34 +161,12 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req, bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count, struct request **same_queue_rq); -unsigned int blk_plug_queued_count(struct request_queue *q); void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); void blk_account_io_done(struct request *req, u64 now); -/* - * EH timer and IO completion will both attempt to 'grab' the request, make - * sure that only one of them succeeds. Steal the bottom bit of the - * __deadline field for this. - */ -static inline int blk_mark_rq_complete(struct request *rq) -{ - return test_and_set_bit(0, &rq->__deadline); -} - -static inline void blk_clear_rq_complete(struct request *rq) -{ - clear_bit(0, &rq->__deadline); -} - -static inline bool blk_rq_is_complete(struct request *rq) -{ - return test_bit(0, &rq->__deadline); -} - /* * Internal elevator interface */ @@ -283,23 +174,6 @@ static inline bool blk_rq_is_complete(struct request *rq) void blk_insert_flush(struct request *rq); -static inline void elv_activate_rq(struct request_queue *q, struct request *rq) -{ - struct elevator_queue *e = q->elevator; - - if (e->type->ops.sq.elevator_activate_req_fn) - e->type->ops.sq.elevator_activate_req_fn(q, rq); -} - -static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq) -{ - struct elevator_queue *e = q->elevator; - - if (e->type->ops.sq.elevator_deactivate_req_fn) - e->type->ops.sq.elevator_deactivate_req_fn(q, rq); -} - -int elevator_init(struct request_queue *); int elevator_init_mq(struct request_queue *q); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); @@ -334,31 +208,8 @@ void blk_rq_set_mixed_merge(struct request *rq); bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); -void blk_queue_congestion_threshold(struct request_queue *q); - int blk_dev_init(void); - -/* - * Return the threshold (number of used requests) at which the queue is - * considered to be congested. It include a little hysteresis to keep the - * context switch rate down. - */ -static inline int queue_congestion_on_threshold(struct request_queue *q) -{ - return q->nr_congestion_on; -} - -/* - * The threshold at which a queue is considered to be uncongested - */ -static inline int queue_congestion_off_threshold(struct request_queue *q) -{ - return q->nr_congestion_off; -} - -extern int blk_update_nr_requests(struct request_queue *, unsigned int); - /* * Contribute to IO statistics IFF: * @@ -380,21 +231,6 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req) q->last_merge = NULL; } -/* - * Steal a bit from this field for legacy IO path atomic IO marking. Note that - * setting the deadline clears the bottom bit, potentially clearing the - * completed bit. The user has to be OK with this (current ones are fine). - */ -static inline void blk_rq_set_deadline(struct request *rq, unsigned long time) -{ - rq->__deadline = time & ~0x1UL; -} - -static inline unsigned long blk_rq_deadline(struct request *rq) -{ - return rq->__deadline & ~0x1UL; -} - /* * The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size * is defined as 'unsigned int', meantime it has to aligned to with logical @@ -416,22 +252,6 @@ void ioc_clear_queue(struct request_queue *q); int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); -/** - * rq_ioc - determine io_context for request allocation - * @bio: request being allocated is for this bio (can be %NULL) - * - * Determine io_context to use for request allocation for @bio. May return - * %NULL if %current->io_context doesn't exist. - */ -static inline struct io_context *rq_ioc(struct bio *bio) -{ -#ifdef CONFIG_BLK_CGROUP - if (bio && bio->bi_ioc) - return bio->bi_ioc; -#endif - return current->io_context; -} - /** * create_io_context - try to create task->io_context * @gfp_mask: allocation mask @@ -490,8 +310,6 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) } #endif /* CONFIG_BOUNCE */ -extern void blk_drain_queue(struct request_queue *q); - #ifdef CONFIG_BLK_CGROUP_IOLATENCY extern int blk_iolatency_init(struct request_queue *q); #else diff --git a/block/bounce.c b/block/bounce.c index 559c55bda040..ffb9e9ecfa7e 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -277,7 +277,8 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, } } - bio_clone_blkcg_association(bio, bio_src); + bio_clone_blkg_association(bio, bio_src); + blkcg_bio_issue_init(bio); return bio; } diff --git a/block/bsg-lib.c b/block/bsg-lib.c index f3501cdaf1a6..192129856342 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -21,7 +21,7 @@ * */ #include -#include +#include #include #include #include @@ -31,6 +31,12 @@ #define uptr64(val) ((void __user *)(uintptr_t)(val)) +struct bsg_set { + struct blk_mq_tag_set tag_set; + bsg_job_fn *job_fn; + bsg_timeout_fn *timeout_fn; +}; + static int bsg_transport_check_proto(struct sg_io_v4 *hdr) { if (hdr->protocol != BSG_PROTOCOL_SCSI || @@ -129,7 +135,7 @@ static void bsg_teardown_job(struct kref *kref) kfree(job->request_payload.sg_list); kfree(job->reply_payload.sg_list); - blk_end_request_all(rq, BLK_STS_OK); + blk_mq_end_request(rq, BLK_STS_OK); } void bsg_job_put(struct bsg_job *job) @@ -157,15 +163,15 @@ void bsg_job_done(struct bsg_job *job, int result, { job->result = result; job->reply_payload_rcv_len = reply_payload_rcv_len; - blk_complete_request(blk_mq_rq_from_pdu(job)); + blk_mq_complete_request(blk_mq_rq_from_pdu(job)); } EXPORT_SYMBOL_GPL(bsg_job_done); /** - * bsg_softirq_done - softirq done routine for destroying the bsg requests + * bsg_complete - softirq done routine for destroying the bsg requests * @rq: BSG request that holds the job to be destroyed */ -static void bsg_softirq_done(struct request *rq) +static void bsg_complete(struct request *rq) { struct bsg_job *job = blk_mq_rq_to_pdu(rq); @@ -224,54 +230,48 @@ failjob_rls_job: } /** - * bsg_request_fn - generic handler for bsg requests - * @q: request queue to manage + * bsg_queue_rq - generic handler for bsg requests + * @hctx: hardware queue + * @bd: queue data * * On error the create_bsg_job function should return a -Exyz error value * that will be set to ->result. * * Drivers/subsys should pass this to the queue init function. */ -static void bsg_request_fn(struct request_queue *q) - __releases(q->queue_lock) - __acquires(q->queue_lock) +static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { + struct request_queue *q = hctx->queue; struct device *dev = q->queuedata; - struct request *req; + struct request *req = bd->rq; + struct bsg_set *bset = + container_of(q->tag_set, struct bsg_set, tag_set); int ret; + blk_mq_start_request(req); + if (!get_device(dev)) - return; + return BLK_STS_IOERR; - while (1) { - req = blk_fetch_request(q); - if (!req) - break; - spin_unlock_irq(q->queue_lock); + if (!bsg_prepare_job(dev, req)) + return BLK_STS_IOERR; - if (!bsg_prepare_job(dev, req)) { - blk_end_request_all(req, BLK_STS_OK); - spin_lock_irq(q->queue_lock); - continue; - } + ret = bset->job_fn(blk_mq_rq_to_pdu(req)); + if (ret) + return BLK_STS_IOERR; - ret = q->bsg_job_fn(blk_mq_rq_to_pdu(req)); - spin_lock_irq(q->queue_lock); - if (ret) - break; - } - - spin_unlock_irq(q->queue_lock); put_device(dev); - spin_lock_irq(q->queue_lock); + return BLK_STS_OK; } /* called right after the request is allocated for the request_queue */ -static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp) +static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req, + unsigned int hctx_idx, unsigned int numa_node) { struct bsg_job *job = blk_mq_rq_to_pdu(req); - job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp); + job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL); if (!job->reply) return -ENOMEM; return 0; @@ -289,13 +289,47 @@ static void bsg_initialize_rq(struct request *req) job->dd_data = job + 1; } -static void bsg_exit_rq(struct request_queue *q, struct request *req) +static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req, + unsigned int hctx_idx) { struct bsg_job *job = blk_mq_rq_to_pdu(req); kfree(job->reply); } +void bsg_remove_queue(struct request_queue *q) +{ + if (q) { + struct bsg_set *bset = + container_of(q->tag_set, struct bsg_set, tag_set); + + bsg_unregister_queue(q); + blk_cleanup_queue(q); + blk_mq_free_tag_set(&bset->tag_set); + kfree(bset); + } +} +EXPORT_SYMBOL_GPL(bsg_remove_queue); + +static enum blk_eh_timer_return bsg_timeout(struct request *rq, bool reserved) +{ + struct bsg_set *bset = + container_of(rq->q->tag_set, struct bsg_set, tag_set); + + if (!bset->timeout_fn) + return BLK_EH_DONE; + return bset->timeout_fn(rq); +} + +static const struct blk_mq_ops bsg_mq_ops = { + .queue_rq = bsg_queue_rq, + .init_request = bsg_init_rq, + .exit_request = bsg_exit_rq, + .initialize_rq_fn = bsg_initialize_rq, + .complete = bsg_complete, + .timeout = bsg_timeout, +}; + /** * bsg_setup_queue - Create and add the bsg hooks so we can receive requests * @dev: device to attach bsg device to @@ -304,28 +338,38 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req) * @dd_job_size: size of LLD data needed for each job */ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, - bsg_job_fn *job_fn, int dd_job_size) + bsg_job_fn *job_fn, bsg_timeout_fn *timeout, int dd_job_size) { + struct bsg_set *bset; + struct blk_mq_tag_set *set; struct request_queue *q; - int ret; + int ret = -ENOMEM; - q = blk_alloc_queue(GFP_KERNEL); - if (!q) + bset = kzalloc(sizeof(*bset), GFP_KERNEL); + if (!bset) return ERR_PTR(-ENOMEM); - q->cmd_size = sizeof(struct bsg_job) + dd_job_size; - q->init_rq_fn = bsg_init_rq; - q->exit_rq_fn = bsg_exit_rq; - q->initialize_rq_fn = bsg_initialize_rq; - q->request_fn = bsg_request_fn; - ret = blk_init_allocated_queue(q); - if (ret) - goto out_cleanup_queue; + bset->job_fn = job_fn; + bset->timeout_fn = timeout; + + set = &bset->tag_set; + set->ops = &bsg_mq_ops, + set->nr_hw_queues = 1; + set->queue_depth = 128; + set->numa_node = NUMA_NO_NODE; + set->cmd_size = sizeof(struct bsg_job) + dd_job_size; + set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_BLOCKING; + if (blk_mq_alloc_tag_set(set)) + goto out_tag_set; + + q = blk_mq_init_queue(set); + if (IS_ERR(q)) { + ret = PTR_ERR(q); + goto out_queue; + } q->queuedata = dev; - q->bsg_job_fn = job_fn; blk_queue_flag_set(QUEUE_FLAG_BIDI, q); - blk_queue_softirq_done(q, bsg_softirq_done); blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); ret = bsg_register_queue(q, dev, name, &bsg_transport_ops); @@ -338,6 +382,10 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, return q; out_cleanup_queue: blk_cleanup_queue(q); +out_queue: + blk_mq_free_tag_set(set); +out_tag_set: + kfree(bset); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(bsg_setup_queue); diff --git a/block/bsg.c b/block/bsg.c index 9a442c23a715..44f6028b9567 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -471,7 +471,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent, /* * we need a proper transport to send commands, not a stacked device */ - if (!queue_is_rq_based(q)) + if (!queue_is_mq(q)) return 0; bcd = &q->bsg_dev; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c deleted file mode 100644 index ed41aa978c4a..000000000000 --- a/block/cfq-iosched.c +++ /dev/null @@ -1,4916 +0,0 @@ -/* - * CFQ, or complete fairness queueing, disk scheduler. - * - * Based on ideas from a previously unfinished io - * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. - * - * Copyright (C) 2003 Jens Axboe - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "blk.h" -#include "blk-wbt.h" - -/* - * tunables - */ -/* max queue in one round of service */ -static const int cfq_quantum = 8; -static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -/* maximum backwards seek, in KiB */ -static const int cfq_back_max = 16 * 1024; -/* penalty of a backwards seek */ -static const int cfq_back_penalty = 2; -static const u64 cfq_slice_sync = NSEC_PER_SEC / 10; -static u64 cfq_slice_async = NSEC_PER_SEC / 25; -static const int cfq_slice_async_rq = 2; -static u64 cfq_slice_idle = NSEC_PER_SEC / 125; -static u64 cfq_group_idle = NSEC_PER_SEC / 125; -static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */ -static const int cfq_hist_divisor = 4; - -/* - * offset from end of queue service tree for idle class - */ -#define CFQ_IDLE_DELAY (NSEC_PER_SEC / 5) -/* offset from end of group service tree under time slice mode */ -#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5) -/* offset from end of group service under IOPS mode */ -#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5) - -/* - * below this threshold, we consider thinktime immediate - */ -#define CFQ_MIN_TT (2 * NSEC_PER_SEC / HZ) - -#define CFQ_SLICE_SCALE (5) -#define CFQ_HW_QUEUE_MIN (5) -#define CFQ_SERVICE_SHIFT 12 - -#define CFQQ_SEEK_THR (sector_t)(8 * 100) -#define CFQQ_CLOSE_THR (sector_t)(8 * 1024) -#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) - -#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq) -#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0]) -#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1]) - -static struct kmem_cache *cfq_pool; - -#define CFQ_PRIO_LISTS IOPRIO_BE_NR -#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) - -#define sample_valid(samples) ((samples) > 80) -#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) - -/* blkio-related constants */ -#define CFQ_WEIGHT_LEGACY_MIN 10 -#define CFQ_WEIGHT_LEGACY_DFL 500 -#define CFQ_WEIGHT_LEGACY_MAX 1000 - -struct cfq_ttime { - u64 last_end_request; - - u64 ttime_total; - u64 ttime_mean; - unsigned long ttime_samples; -}; - -/* - * Most of our rbtree usage is for sorting with min extraction, so - * if we cache the leftmost node we don't have to walk down the tree - * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should - * move this into the elevator for the rq sorting as well. - */ -struct cfq_rb_root { - struct rb_root_cached rb; - struct rb_node *rb_rightmost; - unsigned count; - u64 min_vdisktime; - struct cfq_ttime ttime; -}; -#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \ - .rb_rightmost = NULL, \ - .ttime = {.last_end_request = ktime_get_ns(),},} - -/* - * Per process-grouping structure - */ -struct cfq_queue { - /* reference count */ - int ref; - /* various state flags, see below */ - unsigned int flags; - /* parent cfq_data */ - struct cfq_data *cfqd; - /* service_tree member */ - struct rb_node rb_node; - /* service_tree key */ - u64 rb_key; - /* prio tree member */ - struct rb_node p_node; - /* prio tree root we belong to, if any */ - struct rb_root *p_root; - /* sorted list of pending requests */ - struct rb_root sort_list; - /* if fifo isn't expired, next request to serve */ - struct request *next_rq; - /* requests queued in sort_list */ - int queued[2]; - /* currently allocated requests */ - int allocated[2]; - /* fifo list of requests in sort_list */ - struct list_head fifo; - - /* time when queue got scheduled in to dispatch first request. */ - u64 dispatch_start; - u64 allocated_slice; - u64 slice_dispatch; - /* time when first request from queue completed and slice started. */ - u64 slice_start; - u64 slice_end; - s64 slice_resid; - - /* pending priority requests */ - int prio_pending; - /* number of requests that are on the dispatch list or inside driver */ - int dispatched; - - /* io prio of this group */ - unsigned short ioprio, org_ioprio; - unsigned short ioprio_class, org_ioprio_class; - - pid_t pid; - - u32 seek_history; - sector_t last_request_pos; - - struct cfq_rb_root *service_tree; - struct cfq_queue *new_cfqq; - struct cfq_group *cfqg; - /* Number of sectors dispatched from queue in single dispatch round */ - unsigned long nr_sectors; -}; - -/* - * First index in the service_trees. - * IDLE is handled separately, so it has negative index - */ -enum wl_class_t { - BE_WORKLOAD = 0, - RT_WORKLOAD = 1, - IDLE_WORKLOAD = 2, - CFQ_PRIO_NR, -}; - -/* - * Second index in the service_trees. - */ -enum wl_type_t { - ASYNC_WORKLOAD = 0, - SYNC_NOIDLE_WORKLOAD = 1, - SYNC_WORKLOAD = 2 -}; - -struct cfqg_stats { -#ifdef CONFIG_CFQ_GROUP_IOSCHED - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ - struct blkg_rwstat service_time; - /* total time spent waiting in scheduler queue in ns */ - struct blkg_rwstat wait_time; - /* number of IOs queued up */ - struct blkg_rwstat queued; - /* total disk time and nr sectors dispatched by this group */ - struct blkg_stat time; -#ifdef CONFIG_DEBUG_BLK_CGROUP - /* time not charged to this cgroup */ - struct blkg_stat unaccounted_time; - /* sum of number of ios queued across all samples */ - struct blkg_stat avg_queue_size_sum; - /* count of samples taken for average */ - struct blkg_stat avg_queue_size_samples; - /* how many times this group has been removed from service tree */ - struct blkg_stat dequeue; - /* total time spent waiting for it to be assigned a timeslice. */ - struct blkg_stat group_wait_time; - /* time spent idling for this blkcg_gq */ - struct blkg_stat idle_time; - /* total time with empty current active q with other requests queued */ - struct blkg_stat empty_time; - /* fields after this shouldn't be cleared on stat reset */ - u64 start_group_wait_time; - u64 start_idle_time; - u64 start_empty_time; - uint16_t flags; -#endif /* CONFIG_DEBUG_BLK_CGROUP */ -#endif /* CONFIG_CFQ_GROUP_IOSCHED */ -}; - -/* Per-cgroup data */ -struct cfq_group_data { - /* must be the first member */ - struct blkcg_policy_data cpd; - - unsigned int weight; - unsigned int leaf_weight; -}; - -/* This is per cgroup per device grouping structure */ -struct cfq_group { - /* must be the first member */ - struct blkg_policy_data pd; - - /* group service_tree member */ - struct rb_node rb_node; - - /* group service_tree key */ - u64 vdisktime; - - /* - * The number of active cfqgs and sum of their weights under this - * cfqg. This covers this cfqg's leaf_weight and all children's - * weights, but does not cover weights of further descendants. - * - * If a cfqg is on the service tree, it's active. An active cfqg - * also activates its parent and contributes to the children_weight - * of the parent. - */ - int nr_active; - unsigned int children_weight; - - /* - * vfraction is the fraction of vdisktime that the tasks in this - * cfqg are entitled to. This is determined by compounding the - * ratios walking up from this cfqg to the root. - * - * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all - * vfractions on a service tree is approximately 1. The sum may - * deviate a bit due to rounding errors and fluctuations caused by - * cfqgs entering and leaving the service tree. - */ - unsigned int vfraction; - - /* - * There are two weights - (internal) weight is the weight of this - * cfqg against the sibling cfqgs. leaf_weight is the wight of - * this cfqg against the child cfqgs. For the root cfqg, both - * weights are kept in sync for backward compatibility. - */ - unsigned int weight; - unsigned int new_weight; - unsigned int dev_weight; - - unsigned int leaf_weight; - unsigned int new_leaf_weight; - unsigned int dev_leaf_weight; - - /* number of cfqq currently on this group */ - int nr_cfqq; - - /* - * Per group busy queues average. Useful for workload slice calc. We - * create the array for each prio class but at run time it is used - * only for RT and BE class and slot for IDLE class remains unused. - * This is primarily done to avoid confusion and a gcc warning. - */ - unsigned int busy_queues_avg[CFQ_PRIO_NR]; - /* - * rr lists of queues with requests. We maintain service trees for - * RT and BE classes. These trees are subdivided in subclasses - * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE - * class there is no subclassification and all the cfq queues go on - * a single tree service_tree_idle. - * Counts are embedded in the cfq_rb_root - */ - struct cfq_rb_root service_trees[2][3]; - struct cfq_rb_root service_tree_idle; - - u64 saved_wl_slice; - enum wl_type_t saved_wl_type; - enum wl_class_t saved_wl_class; - - /* number of requests that are on the dispatch list or inside driver */ - int dispatched; - struct cfq_ttime ttime; - struct cfqg_stats stats; /* stats for this cfqg */ - - /* async queue for each priority case */ - struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; - struct cfq_queue *async_idle_cfqq; - -}; - -struct cfq_io_cq { - struct io_cq icq; /* must be the first member */ - struct cfq_queue *cfqq[2]; - struct cfq_ttime ttime; - int ioprio; /* the current ioprio */ -#ifdef CONFIG_CFQ_GROUP_IOSCHED - uint64_t blkcg_serial_nr; /* the current blkcg serial */ -#endif -}; - -/* - * Per block device queue structure - */ -struct cfq_data { - struct request_queue *queue; - /* Root service tree for cfq_groups */ - struct cfq_rb_root grp_service_tree; - struct cfq_group *root_group; - - /* - * The priority currently being served - */ - enum wl_class_t serving_wl_class; - enum wl_type_t serving_wl_type; - u64 workload_expires; - struct cfq_group *serving_group; - - /* - * Each priority tree is sorted by next_request position. These - * trees are used when determining if two or more queues are - * interleaving requests (see cfq_close_cooperator). - */ - struct rb_root prio_trees[CFQ_PRIO_LISTS]; - - unsigned int busy_queues; - unsigned int busy_sync_queues; - - int rq_in_driver; - int rq_in_flight[2]; - - /* - * queue-depth detection - */ - int rq_queued; - int hw_tag; - /* - * hw_tag can be - * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) - * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) - * 0 => no NCQ - */ - int hw_tag_est_depth; - unsigned int hw_tag_samples; - - /* - * idle window management - */ - struct hrtimer idle_slice_timer; - struct work_struct unplug_work; - - struct cfq_queue *active_queue; - struct cfq_io_cq *active_cic; - - sector_t last_position; - - /* - * tunables, see top of file - */ - unsigned int cfq_quantum; - unsigned int cfq_back_penalty; - unsigned int cfq_back_max; - unsigned int cfq_slice_async_rq; - unsigned int cfq_latency; - u64 cfq_fifo_expire[2]; - u64 cfq_slice[2]; - u64 cfq_slice_idle; - u64 cfq_group_idle; - u64 cfq_target_latency; - - /* - * Fallback dummy cfqq for extreme OOM conditions - */ - struct cfq_queue oom_cfqq; - - u64 last_delayed_sync; -}; - -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); -static void cfq_put_queue(struct cfq_queue *cfqq); - -static struct cfq_rb_root *st_for(struct cfq_group *cfqg, - enum wl_class_t class, - enum wl_type_t type) -{ - if (!cfqg) - return NULL; - - if (class == IDLE_WORKLOAD) - return &cfqg->service_tree_idle; - - return &cfqg->service_trees[class][type]; -} - -enum cfqq_state_flags { - CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ - CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ - CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ - CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ - CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ - CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ - CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ - CFQ_CFQQ_FLAG_sync, /* synchronous queue */ - CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ - CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ - CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ - CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ -}; - -#define CFQ_CFQQ_FNS(name) \ -static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ -{ \ - (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ -} \ -static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ -{ \ - (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ -} \ -static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ -{ \ - return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ -} - -CFQ_CFQQ_FNS(on_rr); -CFQ_CFQQ_FNS(wait_request); -CFQ_CFQQ_FNS(must_dispatch); -CFQ_CFQQ_FNS(must_alloc_slice); -CFQ_CFQQ_FNS(fifo_expire); -CFQ_CFQQ_FNS(idle_window); -CFQ_CFQQ_FNS(prio_changed); -CFQ_CFQQ_FNS(slice_new); -CFQ_CFQQ_FNS(sync); -CFQ_CFQQ_FNS(coop); -CFQ_CFQQ_FNS(split_coop); -CFQ_CFQQ_FNS(deep); -CFQ_CFQQ_FNS(wait_busy); -#undef CFQ_CFQQ_FNS - -#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - -/* cfqg stats flags */ -enum cfqg_stats_flags { - CFQG_stats_waiting = 0, - CFQG_stats_idling, - CFQG_stats_empty, -}; - -#define CFQG_FLAG_FNS(name) \ -static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \ -{ \ - stats->flags |= (1 << CFQG_stats_##name); \ -} \ -static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \ -{ \ - stats->flags &= ~(1 << CFQG_stats_##name); \ -} \ -static inline int cfqg_stats_##name(struct cfqg_stats *stats) \ -{ \ - return (stats->flags & (1 << CFQG_stats_##name)) != 0; \ -} \ - -CFQG_FLAG_FNS(waiting) -CFQG_FLAG_FNS(idling) -CFQG_FLAG_FNS(empty) -#undef CFQG_FLAG_FNS - -/* This should be called with the queue_lock held. */ -static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats) -{ - u64 now; - - if (!cfqg_stats_waiting(stats)) - return; - - now = ktime_get_ns(); - if (now > stats->start_group_wait_time) - blkg_stat_add(&stats->group_wait_time, - now - stats->start_group_wait_time); - cfqg_stats_clear_waiting(stats); -} - -/* This should be called with the queue_lock held. */ -static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg) -{ - struct cfqg_stats *stats = &cfqg->stats; - - if (cfqg_stats_waiting(stats)) - return; - if (cfqg == curr_cfqg) - return; - stats->start_group_wait_time = ktime_get_ns(); - cfqg_stats_mark_waiting(stats); -} - -/* This should be called with the queue_lock held. */ -static void cfqg_stats_end_empty_time(struct cfqg_stats *stats) -{ - u64 now; - - if (!cfqg_stats_empty(stats)) - return; - - now = ktime_get_ns(); - if (now > stats->start_empty_time) - blkg_stat_add(&stats->empty_time, - now - stats->start_empty_time); - cfqg_stats_clear_empty(stats); -} - -static void cfqg_stats_update_dequeue(struct cfq_group *cfqg) -{ - blkg_stat_add(&cfqg->stats.dequeue, 1); -} - -static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) -{ - struct cfqg_stats *stats = &cfqg->stats; - - if (blkg_rwstat_total(&stats->queued)) - return; - - /* - * group is already marked empty. This can happen if cfqq got new - * request in parent group and moved to this group while being added - * to service tree. Just ignore the event and move on. - */ - if (cfqg_stats_empty(stats)) - return; - - stats->start_empty_time = ktime_get_ns(); - cfqg_stats_mark_empty(stats); -} - -static void cfqg_stats_update_idle_time(struct cfq_group *cfqg) -{ - struct cfqg_stats *stats = &cfqg->stats; - - if (cfqg_stats_idling(stats)) { - u64 now = ktime_get_ns(); - - if (now > stats->start_idle_time) - blkg_stat_add(&stats->idle_time, - now - stats->start_idle_time); - cfqg_stats_clear_idling(stats); - } -} - -static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) -{ - struct cfqg_stats *stats = &cfqg->stats; - - BUG_ON(cfqg_stats_idling(stats)); - - stats->start_idle_time = ktime_get_ns(); - cfqg_stats_mark_idling(stats); -} - -static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) -{ - struct cfqg_stats *stats = &cfqg->stats; - - blkg_stat_add(&stats->avg_queue_size_sum, - blkg_rwstat_total(&stats->queued)); - blkg_stat_add(&stats->avg_queue_size_samples, 1); - cfqg_stats_update_group_wait_time(stats); -} - -#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ - -static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { } -static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { } -static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { } -static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { } -static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { } -static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { } -static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } - -#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ - -#ifdef CONFIG_CFQ_GROUP_IOSCHED - -static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) -{ - return pd ? container_of(pd, struct cfq_group, pd) : NULL; -} - -static struct cfq_group_data -*cpd_to_cfqgd(struct blkcg_policy_data *cpd) -{ - return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL; -} - -static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) -{ - return pd_to_blkg(&cfqg->pd); -} - -static struct blkcg_policy blkcg_policy_cfq; - -static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) -{ - return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); -} - -static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg) -{ - return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq)); -} - -static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) -{ - struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; - - return pblkg ? blkg_to_cfqg(pblkg) : NULL; -} - -static inline bool cfqg_is_descendant(struct cfq_group *cfqg, - struct cfq_group *ancestor) -{ - return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup, - cfqg_to_blkg(ancestor)->blkcg->css.cgroup); -} - -static inline void cfqg_get(struct cfq_group *cfqg) -{ - return blkg_get(cfqg_to_blkg(cfqg)); -} - -static inline void cfqg_put(struct cfq_group *cfqg) -{ - return blkg_put(cfqg_to_blkg(cfqg)); -} - -#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \ - blk_add_cgroup_trace_msg((cfqd)->queue, \ - cfqg_to_blkg((cfqq)->cfqg)->blkcg, \ - "cfq%d%c%c " fmt, (cfqq)->pid, \ - cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ - cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ - ##args); \ -} while (0) - -#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \ - blk_add_cgroup_trace_msg((cfqd)->queue, \ - cfqg_to_blkg(cfqg)->blkcg, fmt, ##args); \ -} while (0) - -static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, - unsigned int op) -{ - blkg_rwstat_add(&cfqg->stats.queued, op, 1); - cfqg_stats_end_empty_time(&cfqg->stats); - cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); -} - -static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, - uint64_t time, unsigned long unaccounted_time) -{ - blkg_stat_add(&cfqg->stats.time, time); -#ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time); -#endif -} - -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, - unsigned int op) -{ - blkg_rwstat_add(&cfqg->stats.queued, op, -1); -} - -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, - unsigned int op) -{ - blkg_rwstat_add(&cfqg->stats.merged, op, 1); -} - -static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - u64 start_time_ns, - u64 io_start_time_ns, - unsigned int op) -{ - struct cfqg_stats *stats = &cfqg->stats; - u64 now = ktime_get_ns(); - - if (now > io_start_time_ns) - blkg_rwstat_add(&stats->service_time, op, - now - io_start_time_ns); - if (io_start_time_ns > start_time_ns) - blkg_rwstat_add(&stats->wait_time, op, - io_start_time_ns - start_time_ns); -} - -/* @stats = 0 */ -static void cfqg_stats_reset(struct cfqg_stats *stats) -{ - /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->merged); - blkg_rwstat_reset(&stats->service_time); - blkg_rwstat_reset(&stats->wait_time); - blkg_stat_reset(&stats->time); -#ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_reset(&stats->unaccounted_time); - blkg_stat_reset(&stats->avg_queue_size_sum); - blkg_stat_reset(&stats->avg_queue_size_samples); - blkg_stat_reset(&stats->dequeue); - blkg_stat_reset(&stats->group_wait_time); - blkg_stat_reset(&stats->idle_time); - blkg_stat_reset(&stats->empty_time); -#endif -} - -/* @to += @from */ -static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) -{ - /* queued stats shouldn't be cleared */ - blkg_rwstat_add_aux(&to->merged, &from->merged); - blkg_rwstat_add_aux(&to->service_time, &from->service_time); - blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); - blkg_stat_add_aux(&from->time, &from->time); -#ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); - blkg_stat_add_aux(&to->dequeue, &from->dequeue); - blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); - blkg_stat_add_aux(&to->idle_time, &from->idle_time); - blkg_stat_add_aux(&to->empty_time, &from->empty_time); -#endif -} - -/* - * Transfer @cfqg's stats to its parent's aux counts so that the ancestors' - * recursive stats can still account for the amount used by this cfqg after - * it's gone. - */ -static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) -{ - struct cfq_group *parent = cfqg_parent(cfqg); - - lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock); - - if (unlikely(!parent)) - return; - - cfqg_stats_add_aux(&parent->stats, &cfqg->stats); - cfqg_stats_reset(&cfqg->stats); -} - -#else /* CONFIG_CFQ_GROUP_IOSCHED */ - -static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } -static inline bool cfqg_is_descendant(struct cfq_group *cfqg, - struct cfq_group *ancestor) -{ - return true; -} -static inline void cfqg_get(struct cfq_group *cfqg) { } -static inline void cfqg_put(struct cfq_group *cfqg) { } - -#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ - blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \ - cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ - cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ - ##args) -#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) - -static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, unsigned int op) { } -static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, - uint64_t time, unsigned long unaccounted_time) { } -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, - unsigned int op) { } -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, - unsigned int op) { } -static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - u64 start_time_ns, - u64 io_start_time_ns, - unsigned int op) { } - -#endif /* CONFIG_CFQ_GROUP_IOSCHED */ - -#define cfq_log(cfqd, fmt, args...) \ - blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) - -/* Traverses through cfq group service trees */ -#define for_each_cfqg_st(cfqg, i, j, st) \ - for (i = 0; i <= IDLE_WORKLOAD; i++) \ - for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ - : &cfqg->service_tree_idle; \ - (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ - (i == IDLE_WORKLOAD && j == 0); \ - j++, st = i < IDLE_WORKLOAD ? \ - &cfqg->service_trees[i][j]: NULL) \ - -static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd, - struct cfq_ttime *ttime, bool group_idle) -{ - u64 slice; - if (!sample_valid(ttime->ttime_samples)) - return false; - if (group_idle) - slice = cfqd->cfq_group_idle; - else - slice = cfqd->cfq_slice_idle; - return ttime->ttime_mean > slice; -} - -static inline bool iops_mode(struct cfq_data *cfqd) -{ - /* - * If we are not idling on queues and it is a NCQ drive, parallel - * execution of requests is on and measuring time is not possible - * in most of the cases until and unless we drive shallower queue - * depths and that becomes a performance bottleneck. In such cases - * switch to start providing fairness in terms of number of IOs. - */ - if (!cfqd->cfq_slice_idle && cfqd->hw_tag) - return true; - else - return false; -} - -static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq) -{ - if (cfq_class_idle(cfqq)) - return IDLE_WORKLOAD; - if (cfq_class_rt(cfqq)) - return RT_WORKLOAD; - return BE_WORKLOAD; -} - - -static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) -{ - if (!cfq_cfqq_sync(cfqq)) - return ASYNC_WORKLOAD; - if (!cfq_cfqq_idle_window(cfqq)) - return SYNC_NOIDLE_WORKLOAD; - return SYNC_WORKLOAD; -} - -static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class, - struct cfq_data *cfqd, - struct cfq_group *cfqg) -{ - if (wl_class == IDLE_WORKLOAD) - return cfqg->service_tree_idle.count; - - return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count + - cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count + - cfqg->service_trees[wl_class][SYNC_WORKLOAD].count; -} - -static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, - struct cfq_group *cfqg) -{ - return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count + - cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; -} - -static void cfq_dispatch_insert(struct request_queue *, struct request *); -static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, - struct cfq_io_cq *cic, struct bio *bio); - -static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) -{ - /* cic->icq is the first member, %NULL will convert to %NULL */ - return container_of(icq, struct cfq_io_cq, icq); -} - -static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd, - struct io_context *ioc) -{ - if (ioc) - return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue)); - return NULL; -} - -static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync) -{ - return cic->cfqq[is_sync]; -} - -static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq, - bool is_sync) -{ - cic->cfqq[is_sync] = cfqq; -} - -static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic) -{ - return cic->icq.q->elevator->elevator_data; -} - -/* - * scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing - */ -static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) -{ - if (cfqd->busy_queues) { - cfq_log(cfqd, "schedule dispatch"); - kblockd_schedule_work(&cfqd->unplug_work); - } -} - -/* - * Scale schedule slice based on io priority. Use the sync time slice only - * if a queue is marked sync and has sync io queued. A sync queue with async - * io only, should not get full sync slice length. - */ -static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync, - unsigned short prio) -{ - u64 base_slice = cfqd->cfq_slice[sync]; - u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE); - - WARN_ON(prio >= IOPRIO_BE_NR); - - return base_slice + (slice * (4 - prio)); -} - -static inline u64 -cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); -} - -/** - * cfqg_scale_charge - scale disk time charge according to cfqg weight - * @charge: disk time being charged - * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT - * - * Scale @charge according to @vfraction, which is in range (0, 1]. The - * scaling is inversely proportional. - * - * scaled = charge / vfraction - * - * The result is also in fixed point w/ CFQ_SERVICE_SHIFT. - */ -static inline u64 cfqg_scale_charge(u64 charge, - unsigned int vfraction) -{ - u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */ - - /* charge / vfraction */ - c <<= CFQ_SERVICE_SHIFT; - return div_u64(c, vfraction); -} - -static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) -{ - s64 delta = (s64)(vdisktime - min_vdisktime); - if (delta > 0) - min_vdisktime = vdisktime; - - return min_vdisktime; -} - -static void update_min_vdisktime(struct cfq_rb_root *st) -{ - if (!RB_EMPTY_ROOT(&st->rb.rb_root)) { - struct cfq_group *cfqg = rb_entry_cfqg(st->rb.rb_leftmost); - - st->min_vdisktime = max_vdisktime(st->min_vdisktime, - cfqg->vdisktime); - } -} - -/* - * get averaged number of queues of RT/BE priority. - * average is updated, with a formula that gives more weight to higher numbers, - * to quickly follows sudden increases and decrease slowly - */ - -static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, - struct cfq_group *cfqg, bool rt) -{ - unsigned min_q, max_q; - unsigned mult = cfq_hist_divisor - 1; - unsigned round = cfq_hist_divisor / 2; - unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); - - min_q = min(cfqg->busy_queues_avg[rt], busy); - max_q = max(cfqg->busy_queues_avg[rt], busy); - cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / - cfq_hist_divisor; - return cfqg->busy_queues_avg[rt]; -} - -static inline u64 -cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) -{ - return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT; -} - -static inline u64 -cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - u64 slice = cfq_prio_to_slice(cfqd, cfqq); - if (cfqd->cfq_latency) { - /* - * interested queues (we consider only the ones with the same - * priority class in the cfq group) - */ - unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, - cfq_class_rt(cfqq)); - u64 sync_slice = cfqd->cfq_slice[1]; - u64 expect_latency = sync_slice * iq; - u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg); - - if (expect_latency > group_slice) { - u64 base_low_slice = 2 * cfqd->cfq_slice_idle; - u64 low_slice; - - /* scale low_slice according to IO priority - * and sync vs async */ - low_slice = div64_u64(base_low_slice*slice, sync_slice); - low_slice = min(slice, low_slice); - /* the adapted slice value is scaled to fit all iqs - * into the target latency */ - slice = div64_u64(slice*group_slice, expect_latency); - slice = max(slice, low_slice); - } - } - return slice; -} - -static inline void -cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq); - u64 now = ktime_get_ns(); - - cfqq->slice_start = now; - cfqq->slice_end = now + slice; - cfqq->allocated_slice = slice; - cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now); -} - -/* - * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end - * isn't valid until the first request from the dispatch is activated - * and the slice time set. - */ -static inline bool cfq_slice_used(struct cfq_queue *cfqq) -{ - if (cfq_cfqq_slice_new(cfqq)) - return false; - if (ktime_get_ns() < cfqq->slice_end) - return false; - - return true; -} - -/* - * Lifted from AS - choose which of rq1 and rq2 that is best served now. - * We choose the request that is closest to the head right now. Distance - * behind the head is penalized and only allowed to a certain extent. - */ -static struct request * -cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) -{ - sector_t s1, s2, d1 = 0, d2 = 0; - unsigned long back_max; -#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ - unsigned wrap = 0; /* bit mask: requests behind the disk head? */ - - if (rq1 == NULL || rq1 == rq2) - return rq2; - if (rq2 == NULL) - return rq1; - - if (rq_is_sync(rq1) != rq_is_sync(rq2)) - return rq_is_sync(rq1) ? rq1 : rq2; - - if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO) - return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2; - - s1 = blk_rq_pos(rq1); - s2 = blk_rq_pos(rq2); - - /* - * by definition, 1KiB is 2 sectors - */ - back_max = cfqd->cfq_back_max * 2; - - /* - * Strict one way elevator _except_ in the case where we allow - * short backward seeks which are biased as twice the cost of a - * similar forward seek. - */ - if (s1 >= last) - d1 = s1 - last; - else if (s1 + back_max >= last) - d1 = (last - s1) * cfqd->cfq_back_penalty; - else - wrap |= CFQ_RQ1_WRAP; - - if (s2 >= last) - d2 = s2 - last; - else if (s2 + back_max >= last) - d2 = (last - s2) * cfqd->cfq_back_penalty; - else - wrap |= CFQ_RQ2_WRAP; - - /* Found required data */ - - /* - * By doing switch() on the bit mask "wrap" we avoid having to - * check two variables for all permutations: --> faster! - */ - switch (wrap) { - case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ - if (d1 < d2) - return rq1; - else if (d2 < d1) - return rq2; - else { - if (s1 >= s2) - return rq1; - else - return rq2; - } - - case CFQ_RQ2_WRAP: - return rq1; - case CFQ_RQ1_WRAP: - return rq2; - case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ - default: - /* - * Since both rqs are wrapped, - * start with the one that's further behind head - * (--> only *one* back seek required), - * since back seek takes more time than forward. - */ - if (s1 <= s2) - return rq1; - else - return rq2; - } -} - -static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) -{ - /* Service tree is empty */ - if (!root->count) - return NULL; - - return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node); -} - -static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) -{ - return rb_entry_cfqg(rb_first_cached(&root->rb)); -} - -static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) -{ - if (root->rb_rightmost == n) - root->rb_rightmost = rb_prev(n); - - rb_erase_cached(n, &root->rb); - RB_CLEAR_NODE(n); - - --root->count; -} - -/* - * would be nice to take fifo expire time into account as well - */ -static struct request * -cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *last) -{ - struct rb_node *rbnext = rb_next(&last->rb_node); - struct rb_node *rbprev = rb_prev(&last->rb_node); - struct request *next = NULL, *prev = NULL; - - BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - - if (rbprev) - prev = rb_entry_rq(rbprev); - - if (rbnext) - next = rb_entry_rq(rbnext); - else { - rbnext = rb_first(&cfqq->sort_list); - if (rbnext && rbnext != &last->rb_node) - next = rb_entry_rq(rbnext); - } - - return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); -} - -static u64 cfq_slice_offset(struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - /* - * just an approximation, should be ok. - */ - return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); -} - -static inline s64 -cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) -{ - return cfqg->vdisktime - st->min_vdisktime; -} - -static void -__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) -{ - struct rb_node **node = &st->rb.rb_root.rb_node; - struct rb_node *parent = NULL; - struct cfq_group *__cfqg; - s64 key = cfqg_key(st, cfqg); - bool leftmost = true, rightmost = true; - - while (*node != NULL) { - parent = *node; - __cfqg = rb_entry_cfqg(parent); - - if (key < cfqg_key(st, __cfqg)) { - node = &parent->rb_left; - rightmost = false; - } else { - node = &parent->rb_right; - leftmost = false; - } - } - - if (rightmost) - st->rb_rightmost = &cfqg->rb_node; - - rb_link_node(&cfqg->rb_node, parent, node); - rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost); -} - -/* - * This has to be called only on activation of cfqg - */ -static void -cfq_update_group_weight(struct cfq_group *cfqg) -{ - if (cfqg->new_weight) { - cfqg->weight = cfqg->new_weight; - cfqg->new_weight = 0; - } -} - -static void -cfq_update_group_leaf_weight(struct cfq_group *cfqg) -{ - BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); - - if (cfqg->new_leaf_weight) { - cfqg->leaf_weight = cfqg->new_leaf_weight; - cfqg->new_leaf_weight = 0; - } -} - -static void -cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) -{ - unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */ - struct cfq_group *pos = cfqg; - struct cfq_group *parent; - bool propagate; - - /* add to the service tree */ - BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); - - /* - * Update leaf_weight. We cannot update weight at this point - * because cfqg might already have been activated and is - * contributing its current weight to the parent's child_weight. - */ - cfq_update_group_leaf_weight(cfqg); - __cfq_group_service_tree_add(st, cfqg); - - /* - * Activate @cfqg and calculate the portion of vfraction @cfqg is - * entitled to. vfraction is calculated by walking the tree - * towards the root calculating the fraction it has at each level. - * The compounded ratio is how much vfraction @cfqg owns. - * - * Start with the proportion tasks in this cfqg has against active - * children cfqgs - its leaf_weight against children_weight. - */ - propagate = !pos->nr_active++; - pos->children_weight += pos->leaf_weight; - vfr = vfr * pos->leaf_weight / pos->children_weight; - - /* - * Compound ->weight walking up the tree. Both activation and - * vfraction calculation are done in the same loop. Propagation - * stops once an already activated node is met. vfraction - * calculation should always continue to the root. - */ - while ((parent = cfqg_parent(pos))) { - if (propagate) { - cfq_update_group_weight(pos); - propagate = !parent->nr_active++; - parent->children_weight += pos->weight; - } - vfr = vfr * pos->weight / parent->children_weight; - pos = parent; - } - - cfqg->vfraction = max_t(unsigned, vfr, 1); -} - -static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd) -{ - if (!iops_mode(cfqd)) - return CFQ_SLICE_MODE_GROUP_DELAY; - else - return CFQ_IOPS_MODE_GROUP_DELAY; -} - -static void -cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) -{ - struct cfq_rb_root *st = &cfqd->grp_service_tree; - struct cfq_group *__cfqg; - struct rb_node *n; - - cfqg->nr_cfqq++; - if (!RB_EMPTY_NODE(&cfqg->rb_node)) - return; - - /* - * Currently put the group at the end. Later implement something - * so that groups get lesser vtime based on their weights, so that - * if group does not loose all if it was not continuously backlogged. - */ - n = st->rb_rightmost; - if (n) { - __cfqg = rb_entry_cfqg(n); - cfqg->vdisktime = __cfqg->vdisktime + - cfq_get_cfqg_vdisktime_delay(cfqd); - } else - cfqg->vdisktime = st->min_vdisktime; - cfq_group_service_tree_add(st, cfqg); -} - -static void -cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) -{ - struct cfq_group *pos = cfqg; - bool propagate; - - /* - * Undo activation from cfq_group_service_tree_add(). Deactivate - * @cfqg and propagate deactivation upwards. - */ - propagate = !--pos->nr_active; - pos->children_weight -= pos->leaf_weight; - - while (propagate) { - struct cfq_group *parent = cfqg_parent(pos); - - /* @pos has 0 nr_active at this point */ - WARN_ON_ONCE(pos->children_weight); - pos->vfraction = 0; - - if (!parent) - break; - - propagate = !--parent->nr_active; - parent->children_weight -= pos->weight; - pos = parent; - } - - /* remove from the service tree */ - if (!RB_EMPTY_NODE(&cfqg->rb_node)) - cfq_rb_erase(&cfqg->rb_node, st); -} - -static void -cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) -{ - struct cfq_rb_root *st = &cfqd->grp_service_tree; - - BUG_ON(cfqg->nr_cfqq < 1); - cfqg->nr_cfqq--; - - /* If there are other cfq queues under this group, don't delete it */ - if (cfqg->nr_cfqq) - return; - - cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); - cfq_group_service_tree_del(st, cfqg); - cfqg->saved_wl_slice = 0; - cfqg_stats_update_dequeue(cfqg); -} - -static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq, - u64 *unaccounted_time) -{ - u64 slice_used; - u64 now = ktime_get_ns(); - - /* - * Queue got expired before even a single request completed or - * got expired immediately after first request completion. - */ - if (!cfqq->slice_start || cfqq->slice_start == now) { - /* - * Also charge the seek time incurred to the group, otherwise - * if there are mutiple queues in the group, each can dispatch - * a single request on seeky media and cause lots of seek time - * and group will never know it. - */ - slice_used = max_t(u64, (now - cfqq->dispatch_start), - jiffies_to_nsecs(1)); - } else { - slice_used = now - cfqq->slice_start; - if (slice_used > cfqq->allocated_slice) { - *unaccounted_time = slice_used - cfqq->allocated_slice; - slice_used = cfqq->allocated_slice; - } - if (cfqq->slice_start > cfqq->dispatch_start) - *unaccounted_time += cfqq->slice_start - - cfqq->dispatch_start; - } - - return slice_used; -} - -static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, - struct cfq_queue *cfqq) -{ - struct cfq_rb_root *st = &cfqd->grp_service_tree; - u64 used_sl, charge, unaccounted_sl = 0; - int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) - - cfqg->service_tree_idle.count; - unsigned int vfr; - u64 now = ktime_get_ns(); - - BUG_ON(nr_sync < 0); - used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); - - if (iops_mode(cfqd)) - charge = cfqq->slice_dispatch; - else if (!cfq_cfqq_sync(cfqq) && !nr_sync) - charge = cfqq->allocated_slice; - - /* - * Can't update vdisktime while on service tree and cfqg->vfraction - * is valid only while on it. Cache vfr, leave the service tree, - * update vdisktime and go back on. The re-addition to the tree - * will also update the weights as necessary. - */ - vfr = cfqg->vfraction; - cfq_group_service_tree_del(st, cfqg); - cfqg->vdisktime += cfqg_scale_charge(charge, vfr); - cfq_group_service_tree_add(st, cfqg); - - /* This group is being expired. Save the context */ - if (cfqd->workload_expires > now) { - cfqg->saved_wl_slice = cfqd->workload_expires - now; - cfqg->saved_wl_type = cfqd->serving_wl_type; - cfqg->saved_wl_class = cfqd->serving_wl_class; - } else - cfqg->saved_wl_slice = 0; - - cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, - st->min_vdisktime); - cfq_log_cfqq(cfqq->cfqd, cfqq, - "sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu", - used_sl, cfqq->slice_dispatch, charge, - iops_mode(cfqd), cfqq->nr_sectors); - cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl); - cfqg_stats_set_start_empty_time(cfqg); -} - -/** - * cfq_init_cfqg_base - initialize base part of a cfq_group - * @cfqg: cfq_group to initialize - * - * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED - * is enabled or not. - */ -static void cfq_init_cfqg_base(struct cfq_group *cfqg) -{ - struct cfq_rb_root *st; - int i, j; - - for_each_cfqg_st(cfqg, i, j, st) - *st = CFQ_RB_ROOT; - RB_CLEAR_NODE(&cfqg->rb_node); - - cfqg->ttime.last_end_request = ktime_get_ns(); -} - -#ifdef CONFIG_CFQ_GROUP_IOSCHED -static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, - bool on_dfl, bool reset_dev, bool is_leaf_weight); - -static void cfqg_stats_exit(struct cfqg_stats *stats) -{ - blkg_rwstat_exit(&stats->merged); - blkg_rwstat_exit(&stats->service_time); - blkg_rwstat_exit(&stats->wait_time); - blkg_rwstat_exit(&stats->queued); - blkg_stat_exit(&stats->time); -#ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_exit(&stats->unaccounted_time); - blkg_stat_exit(&stats->avg_queue_size_sum); - blkg_stat_exit(&stats->avg_queue_size_samples); - blkg_stat_exit(&stats->dequeue); - blkg_stat_exit(&stats->group_wait_time); - blkg_stat_exit(&stats->idle_time); - blkg_stat_exit(&stats->empty_time); -#endif -} - -static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) -{ - if (blkg_rwstat_init(&stats->merged, gfp) || - blkg_rwstat_init(&stats->service_time, gfp) || - blkg_rwstat_init(&stats->wait_time, gfp) || - blkg_rwstat_init(&stats->queued, gfp) || - blkg_stat_init(&stats->time, gfp)) - goto err; - -#ifdef CONFIG_DEBUG_BLK_CGROUP - if (blkg_stat_init(&stats->unaccounted_time, gfp) || - blkg_stat_init(&stats->avg_queue_size_sum, gfp) || - blkg_stat_init(&stats->avg_queue_size_samples, gfp) || - blkg_stat_init(&stats->dequeue, gfp) || - blkg_stat_init(&stats->group_wait_time, gfp) || - blkg_stat_init(&stats->idle_time, gfp) || - blkg_stat_init(&stats->empty_time, gfp)) - goto err; -#endif - return 0; -err: - cfqg_stats_exit(stats); - return -ENOMEM; -} - -static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) -{ - struct cfq_group_data *cgd; - - cgd = kzalloc(sizeof(*cgd), gfp); - if (!cgd) - return NULL; - return &cgd->cpd; -} - -static void cfq_cpd_init(struct blkcg_policy_data *cpd) -{ - struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); - unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? - CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; - - if (cpd_to_blkcg(cpd) == &blkcg_root) - weight *= 2; - - cgd->weight = weight; - cgd->leaf_weight = weight; -} - -static void cfq_cpd_free(struct blkcg_policy_data *cpd) -{ - kfree(cpd_to_cfqgd(cpd)); -} - -static void cfq_cpd_bind(struct blkcg_policy_data *cpd) -{ - struct blkcg *blkcg = cpd_to_blkcg(cpd); - bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys); - unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; - - if (blkcg == &blkcg_root) - weight *= 2; - - WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false)); - WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true)); -} - -static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) -{ - struct cfq_group *cfqg; - - cfqg = kzalloc_node(sizeof(*cfqg), gfp, node); - if (!cfqg) - return NULL; - - cfq_init_cfqg_base(cfqg); - if (cfqg_stats_init(&cfqg->stats, gfp)) { - kfree(cfqg); - return NULL; - } - - return &cfqg->pd; -} - -static void cfq_pd_init(struct blkg_policy_data *pd) -{ - struct cfq_group *cfqg = pd_to_cfqg(pd); - struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg); - - cfqg->weight = cgd->weight; - cfqg->leaf_weight = cgd->leaf_weight; -} - -static void cfq_pd_offline(struct blkg_policy_data *pd) -{ - struct cfq_group *cfqg = pd_to_cfqg(pd); - int i; - - for (i = 0; i < IOPRIO_BE_NR; i++) { - if (cfqg->async_cfqq[0][i]) { - cfq_put_queue(cfqg->async_cfqq[0][i]); - cfqg->async_cfqq[0][i] = NULL; - } - if (cfqg->async_cfqq[1][i]) { - cfq_put_queue(cfqg->async_cfqq[1][i]); - cfqg->async_cfqq[1][i] = NULL; - } - } - - if (cfqg->async_idle_cfqq) { - cfq_put_queue(cfqg->async_idle_cfqq); - cfqg->async_idle_cfqq = NULL; - } - - /* - * @blkg is going offline and will be ignored by - * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so - * that they don't get lost. If IOs complete after this point, the - * stats for them will be lost. Oh well... - */ - cfqg_stats_xfer_dead(cfqg); -} - -static void cfq_pd_free(struct blkg_policy_data *pd) -{ - struct cfq_group *cfqg = pd_to_cfqg(pd); - - cfqg_stats_exit(&cfqg->stats); - return kfree(cfqg); -} - -static void cfq_pd_reset_stats(struct blkg_policy_data *pd) -{ - struct cfq_group *cfqg = pd_to_cfqg(pd); - - cfqg_stats_reset(&cfqg->stats); -} - -static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) -{ - struct blkcg_gq *blkg; - - blkg = blkg_lookup(blkcg, cfqd->queue); - if (likely(blkg)) - return blkg_to_cfqg(blkg); - return NULL; -} - -static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) -{ - cfqq->cfqg = cfqg; - /* cfqq reference on cfqg */ - cfqg_get(cfqg); -} - -static u64 cfqg_prfill_weight_device(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct cfq_group *cfqg = pd_to_cfqg(pd); - - if (!cfqg->dev_weight) - return 0; - return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); -} - -static int cfqg_print_weight_device(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - cfqg_prfill_weight_device, &blkcg_policy_cfq, - 0, false); - return 0; -} - -static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct cfq_group *cfqg = pd_to_cfqg(pd); - - if (!cfqg->dev_leaf_weight) - return 0; - return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); -} - -static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, - 0, false); - return 0; -} - -static int cfq_print_weight(struct seq_file *sf, void *v) -{ - struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); - struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); - unsigned int val = 0; - - if (cgd) - val = cgd->weight; - - seq_printf(sf, "%u\n", val); - return 0; -} - -static int cfq_print_leaf_weight(struct seq_file *sf, void *v) -{ - struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); - struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); - unsigned int val = 0; - - if (cgd) - val = cgd->leaf_weight; - - seq_printf(sf, "%u\n", val); - return 0; -} - -static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off, - bool on_dfl, bool is_leaf_weight) -{ - unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; - unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; - struct blkcg *blkcg = css_to_blkcg(of_css(of)); - struct blkg_conf_ctx ctx; - struct cfq_group *cfqg; - struct cfq_group_data *cfqgd; - int ret; - u64 v; - - ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); - if (ret) - return ret; - - if (sscanf(ctx.body, "%llu", &v) == 1) { - /* require "default" on dfl */ - ret = -ERANGE; - if (!v && on_dfl) - goto out_finish; - } else if (!strcmp(strim(ctx.body), "default")) { - v = 0; - } else { - ret = -EINVAL; - goto out_finish; - } - - cfqg = blkg_to_cfqg(ctx.blkg); - cfqgd = blkcg_to_cfqgd(blkcg); - - ret = -ERANGE; - if (!v || (v >= min && v <= max)) { - if (!is_leaf_weight) { - cfqg->dev_weight = v; - cfqg->new_weight = v ?: cfqgd->weight; - } else { - cfqg->dev_leaf_weight = v; - cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight; - } - ret = 0; - } -out_finish: - blkg_conf_finish(&ctx); - return ret ?: nbytes; -} - -static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - return __cfqg_set_weight_device(of, buf, nbytes, off, false, false); -} - -static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - return __cfqg_set_weight_device(of, buf, nbytes, off, false, true); -} - -static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, - bool on_dfl, bool reset_dev, bool is_leaf_weight) -{ - unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; - unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; - struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg_gq *blkg; - struct cfq_group_data *cfqgd; - int ret = 0; - - if (val < min || val > max) - return -ERANGE; - - spin_lock_irq(&blkcg->lock); - cfqgd = blkcg_to_cfqgd(blkcg); - if (!cfqgd) { - ret = -EINVAL; - goto out; - } - - if (!is_leaf_weight) - cfqgd->weight = val; - else - cfqgd->leaf_weight = val; - - hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); - - if (!cfqg) - continue; - - if (!is_leaf_weight) { - if (reset_dev) - cfqg->dev_weight = 0; - if (!cfqg->dev_weight) - cfqg->new_weight = cfqgd->weight; - } else { - if (reset_dev) - cfqg->dev_leaf_weight = 0; - if (!cfqg->dev_leaf_weight) - cfqg->new_leaf_weight = cfqgd->leaf_weight; - } - } - -out: - spin_unlock_irq(&blkcg->lock); - return ret; -} - -static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val) -{ - return __cfq_set_weight(css, val, false, false, false); -} - -static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - return __cfq_set_weight(css, val, false, false, true); -} - -static int cfqg_print_stat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, - &blkcg_policy_cfq, seq_cft(sf)->private, false); - return 0; -} - -static int cfqg_print_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, - &blkcg_policy_cfq, seq_cft(sf)->private, true); - return 0; -} - -static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), - &blkcg_policy_cfq, off); - return __blkg_prfill_u64(sf, pd, sum); -} - -static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), - &blkcg_policy_cfq, off); - return __blkg_prfill_rwstat(sf, pd, &sum); -} - -static int cfqg_print_stat_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - cfqg_prfill_stat_recursive, &blkcg_policy_cfq, - seq_cft(sf)->private, false); - return 0; -} - -static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, - seq_cft(sf)->private, true); - return 0; -} - -static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, - int off) -{ - u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); - - return __blkg_prfill_u64(sf, pd, sum >> 9); -} - -static int cfqg_print_stat_sectors(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false); - return 0; -} - -static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, - offsetof(struct blkcg_gq, stat_bytes)); - u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + - atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); - - return __blkg_prfill_u64(sf, pd, sum >> 9); -} - -static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0, - false); - return 0; -} - -#ifdef CONFIG_DEBUG_BLK_CGROUP -static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct cfq_group *cfqg = pd_to_cfqg(pd); - u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples); - u64 v = 0; - - if (samples) { - v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); - v = div64_u64(v, samples); - } - __blkg_prfill_u64(sf, pd, v); - return 0; -} - -/* print avg_queue_size */ -static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, - 0, false); - return 0; -} -#endif /* CONFIG_DEBUG_BLK_CGROUP */ - -static struct cftype cfq_blkcg_legacy_files[] = { - /* on root, weight is mapped to leaf_weight */ - { - .name = "weight_device", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cfqg_print_leaf_weight_device, - .write = cfqg_set_leaf_weight_device, - }, - { - .name = "weight", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cfq_print_leaf_weight, - .write_u64 = cfq_set_leaf_weight, - }, - - /* no such mapping necessary for !roots */ - { - .name = "weight_device", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cfqg_print_weight_device, - .write = cfqg_set_weight_device, - }, - { - .name = "weight", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cfq_print_weight, - .write_u64 = cfq_set_weight, - }, - - { - .name = "leaf_weight_device", - .seq_show = cfqg_print_leaf_weight_device, - .write = cfqg_set_leaf_weight_device, - }, - { - .name = "leaf_weight", - .seq_show = cfq_print_leaf_weight, - .write_u64 = cfq_set_leaf_weight, - }, - - /* statistics, covers only the tasks in the cfqg */ - { - .name = "time", - .private = offsetof(struct cfq_group, stats.time), - .seq_show = cfqg_print_stat, - }, - { - .name = "sectors", - .seq_show = cfqg_print_stat_sectors, - }, - { - .name = "io_service_bytes", - .private = (unsigned long)&blkcg_policy_cfq, - .seq_show = blkg_print_stat_bytes, - }, - { - .name = "io_serviced", - .private = (unsigned long)&blkcg_policy_cfq, - .seq_show = blkg_print_stat_ios, - }, - { - .name = "io_service_time", - .private = offsetof(struct cfq_group, stats.service_time), - .seq_show = cfqg_print_rwstat, - }, - { - .name = "io_wait_time", - .private = offsetof(struct cfq_group, stats.wait_time), - .seq_show = cfqg_print_rwstat, - }, - { - .name = "io_merged", - .private = offsetof(struct cfq_group, stats.merged), - .seq_show = cfqg_print_rwstat, - }, - { - .name = "io_queued", - .private = offsetof(struct cfq_group, stats.queued), - .seq_show = cfqg_print_rwstat, - }, - - /* the same statictics which cover the cfqg and its descendants */ - { - .name = "time_recursive", - .private = offsetof(struct cfq_group, stats.time), - .seq_show = cfqg_print_stat_recursive, - }, - { - .name = "sectors_recursive", - .seq_show = cfqg_print_stat_sectors_recursive, - }, - { - .name = "io_service_bytes_recursive", - .private = (unsigned long)&blkcg_policy_cfq, - .seq_show = blkg_print_stat_bytes_recursive, - }, - { - .name = "io_serviced_recursive", - .private = (unsigned long)&blkcg_policy_cfq, - .seq_show = blkg_print_stat_ios_recursive, - }, - { - .name = "io_service_time_recursive", - .private = offsetof(struct cfq_group, stats.service_time), - .seq_show = cfqg_print_rwstat_recursive, - }, - { - .name = "io_wait_time_recursive", - .private = offsetof(struct cfq_group, stats.wait_time), - .seq_show = cfqg_print_rwstat_recursive, - }, - { - .name = "io_merged_recursive", - .private = offsetof(struct cfq_group, stats.merged), - .seq_show = cfqg_print_rwstat_recursive, - }, - { - .name = "io_queued_recursive", - .private = offsetof(struct cfq_group, stats.queued), - .seq_show = cfqg_print_rwstat_recursive, - }, -#ifdef CONFIG_DEBUG_BLK_CGROUP - { - .name = "avg_queue_size", - .seq_show = cfqg_print_avg_queue_size, - }, - { - .name = "group_wait_time", - .private = offsetof(struct cfq_group, stats.group_wait_time), - .seq_show = cfqg_print_stat, - }, - { - .name = "idle_time", - .private = offsetof(struct cfq_group, stats.idle_time), - .seq_show = cfqg_print_stat, - }, - { - .name = "empty_time", - .private = offsetof(struct cfq_group, stats.empty_time), - .seq_show = cfqg_print_stat, - }, - { - .name = "dequeue", - .private = offsetof(struct cfq_group, stats.dequeue), - .seq_show = cfqg_print_stat, - }, - { - .name = "unaccounted_time", - .private = offsetof(struct cfq_group, stats.unaccounted_time), - .seq_show = cfqg_print_stat, - }, -#endif /* CONFIG_DEBUG_BLK_CGROUP */ - { } /* terminate */ -}; - -static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v) -{ - struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); - struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); - - seq_printf(sf, "default %u\n", cgd->weight); - blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device, - &blkcg_policy_cfq, 0, false); - return 0; -} - -static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - char *endp; - int ret; - u64 v; - - buf = strim(buf); - - /* "WEIGHT" or "default WEIGHT" sets the default weight */ - v = simple_strtoull(buf, &endp, 0); - if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { - ret = __cfq_set_weight(of_css(of), v, true, false, false); - return ret ?: nbytes; - } - - /* "MAJ:MIN WEIGHT" */ - return __cfqg_set_weight_device(of, buf, nbytes, off, true, false); -} - -static struct cftype cfq_blkcg_files[] = { - { - .name = "weight", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cfq_print_weight_on_dfl, - .write = cfq_set_weight_on_dfl, - }, - { } /* terminate */ -}; - -#else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) -{ - return cfqd->root_group; -} - -static inline void -cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { - cfqq->cfqg = cfqg; -} - -#endif /* GROUP_IOSCHED */ - -/* - * The cfqd->service_trees holds all pending cfq_queue's that have - * requests waiting to be processed. It is sorted in the order that - * we will service the queues. - */ -static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, - bool add_front) -{ - struct rb_node **p, *parent; - struct cfq_queue *__cfqq; - u64 rb_key; - struct cfq_rb_root *st; - bool leftmost = true; - int new_cfqq = 1; - u64 now = ktime_get_ns(); - - st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq)); - if (cfq_class_idle(cfqq)) { - rb_key = CFQ_IDLE_DELAY; - parent = st->rb_rightmost; - if (parent && parent != &cfqq->rb_node) { - __cfqq = rb_entry(parent, struct cfq_queue, rb_node); - rb_key += __cfqq->rb_key; - } else - rb_key += now; - } else if (!add_front) { - /* - * Get our rb key offset. Subtract any residual slice - * value carried from last service. A negative resid - * count indicates slice overrun, and this should position - * the next service time further away in the tree. - */ - rb_key = cfq_slice_offset(cfqd, cfqq) + now; - rb_key -= cfqq->slice_resid; - cfqq->slice_resid = 0; - } else { - rb_key = -NSEC_PER_SEC; - __cfqq = cfq_rb_first(st); - rb_key += __cfqq ? __cfqq->rb_key : now; - } - - if (!RB_EMPTY_NODE(&cfqq->rb_node)) { - new_cfqq = 0; - /* - * same position, nothing more to do - */ - if (rb_key == cfqq->rb_key && cfqq->service_tree == st) - return; - - cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); - cfqq->service_tree = NULL; - } - - parent = NULL; - cfqq->service_tree = st; - p = &st->rb.rb_root.rb_node; - while (*p) { - parent = *p; - __cfqq = rb_entry(parent, struct cfq_queue, rb_node); - - /* - * sort by key, that represents service time. - */ - if (rb_key < __cfqq->rb_key) - p = &parent->rb_left; - else { - p = &parent->rb_right; - leftmost = false; - } - } - - cfqq->rb_key = rb_key; - rb_link_node(&cfqq->rb_node, parent, p); - rb_insert_color_cached(&cfqq->rb_node, &st->rb, leftmost); - st->count++; - if (add_front || !new_cfqq) - return; - cfq_group_notify_queue_add(cfqd, cfqq->cfqg); -} - -static struct cfq_queue * -cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, - sector_t sector, struct rb_node **ret_parent, - struct rb_node ***rb_link) -{ - struct rb_node **p, *parent; - struct cfq_queue *cfqq = NULL; - - parent = NULL; - p = &root->rb_node; - while (*p) { - struct rb_node **n; - - parent = *p; - cfqq = rb_entry(parent, struct cfq_queue, p_node); - - /* - * Sort strictly based on sector. Smallest to the left, - * largest to the right. - */ - if (sector > blk_rq_pos(cfqq->next_rq)) - n = &(*p)->rb_right; - else if (sector < blk_rq_pos(cfqq->next_rq)) - n = &(*p)->rb_left; - else - break; - p = n; - cfqq = NULL; - } - - *ret_parent = parent; - if (rb_link) - *rb_link = p; - return cfqq; -} - -static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - struct rb_node **p, *parent; - struct cfq_queue *__cfqq; - - if (cfqq->p_root) { - rb_erase(&cfqq->p_node, cfqq->p_root); - cfqq->p_root = NULL; - } - - if (cfq_class_idle(cfqq)) - return; - if (!cfqq->next_rq) - return; - - cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; - __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, - blk_rq_pos(cfqq->next_rq), &parent, &p); - if (!__cfqq) { - rb_link_node(&cfqq->p_node, parent, p); - rb_insert_color(&cfqq->p_node, cfqq->p_root); - } else - cfqq->p_root = NULL; -} - -/* - * Update cfqq's position in the service tree. - */ -static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - /* - * Resorting requires the cfqq to be on the RR list already. - */ - if (cfq_cfqq_on_rr(cfqq)) { - cfq_service_tree_add(cfqd, cfqq, 0); - cfq_prio_tree_add(cfqd, cfqq); - } -} - -/* - * add to busy list of queues for service, trying to be fair in ordering - * the pending list according to last request service - */ -static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); - BUG_ON(cfq_cfqq_on_rr(cfqq)); - cfq_mark_cfqq_on_rr(cfqq); - cfqd->busy_queues++; - if (cfq_cfqq_sync(cfqq)) - cfqd->busy_sync_queues++; - - cfq_resort_rr_list(cfqd, cfqq); -} - -/* - * Called when the cfqq no longer has requests pending, remove it from - * the service tree. - */ -static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); - BUG_ON(!cfq_cfqq_on_rr(cfqq)); - cfq_clear_cfqq_on_rr(cfqq); - - if (!RB_EMPTY_NODE(&cfqq->rb_node)) { - cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); - cfqq->service_tree = NULL; - } - if (cfqq->p_root) { - rb_erase(&cfqq->p_node, cfqq->p_root); - cfqq->p_root = NULL; - } - - cfq_group_notify_queue_del(cfqd, cfqq->cfqg); - BUG_ON(!cfqd->busy_queues); - cfqd->busy_queues--; - if (cfq_cfqq_sync(cfqq)) - cfqd->busy_sync_queues--; -} - -/* - * rb tree support functions - */ -static void cfq_del_rq_rb(struct request *rq) -{ - struct cfq_queue *cfqq = RQ_CFQQ(rq); - const int sync = rq_is_sync(rq); - - BUG_ON(!cfqq->queued[sync]); - cfqq->queued[sync]--; - - elv_rb_del(&cfqq->sort_list, rq); - - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { - /* - * Queue will be deleted from service tree when we actually - * expire it later. Right now just remove it from prio tree - * as it is empty. - */ - if (cfqq->p_root) { - rb_erase(&cfqq->p_node, cfqq->p_root); - cfqq->p_root = NULL; - } - } -} - -static void cfq_add_rq_rb(struct request *rq) -{ - struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = cfqq->cfqd; - struct request *prev; - - cfqq->queued[rq_is_sync(rq)]++; - - elv_rb_add(&cfqq->sort_list, rq); - - if (!cfq_cfqq_on_rr(cfqq)) - cfq_add_cfqq_rr(cfqd, cfqq); - - /* - * check if this request is a better next-serve candidate - */ - prev = cfqq->next_rq; - cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); - - /* - * adjust priority tree position, if ->next_rq changes - */ - if (prev != cfqq->next_rq) - cfq_prio_tree_add(cfqd, cfqq); - - BUG_ON(!cfqq->next_rq); -} - -static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) -{ - elv_rb_del(&cfqq->sort_list, rq); - cfqq->queued[rq_is_sync(rq)]--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); - cfq_add_rq_rb(rq); - cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, - rq->cmd_flags); -} - -static struct request * -cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) -{ - struct task_struct *tsk = current; - struct cfq_io_cq *cic; - struct cfq_queue *cfqq; - - cic = cfq_cic_lookup(cfqd, tsk->io_context); - if (!cic) - return NULL; - - cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf)); - if (cfqq) - return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio)); - - return NULL; -} - -static void cfq_activate_request(struct request_queue *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - - cfqd->rq_in_driver++; - cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", - cfqd->rq_in_driver); - - cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -} - -static void cfq_deactivate_request(struct request_queue *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - - WARN_ON(!cfqd->rq_in_driver); - cfqd->rq_in_driver--; - cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", - cfqd->rq_in_driver); -} - -static void cfq_remove_request(struct request *rq) -{ - struct cfq_queue *cfqq = RQ_CFQQ(rq); - - if (cfqq->next_rq == rq) - cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); - - list_del_init(&rq->queuelist); - cfq_del_rq_rb(rq); - - cfqq->cfqd->rq_queued--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); - if (rq->cmd_flags & REQ_PRIO) { - WARN_ON(!cfqq->prio_pending); - cfqq->prio_pending--; - } -} - -static enum elv_merge cfq_merge(struct request_queue *q, struct request **req, - struct bio *bio) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - struct request *__rq; - - __rq = cfq_find_rq_fmerge(cfqd, bio); - if (__rq && elv_bio_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; - } - - return ELEVATOR_NO_MERGE; -} - -static void cfq_merged_request(struct request_queue *q, struct request *req, - enum elv_merge type) -{ - if (type == ELEVATOR_FRONT_MERGE) { - struct cfq_queue *cfqq = RQ_CFQQ(req); - - cfq_reposition_rq_rb(cfqq, req); - } -} - -static void cfq_bio_merged(struct request_queue *q, struct request *req, - struct bio *bio) -{ - cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf); -} - -static void -cfq_merged_requests(struct request_queue *q, struct request *rq, - struct request *next) -{ - struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = q->elevator->elevator_data; - - /* - * reposition in fifo if next is older than rq - */ - if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && - next->fifo_time < rq->fifo_time && - cfqq == RQ_CFQQ(next)) { - list_move(&rq->queuelist, &next->queuelist); - rq->fifo_time = next->fifo_time; - } - - if (cfqq->next_rq == next) - cfqq->next_rq = rq; - cfq_remove_request(next); - cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); - - cfqq = RQ_CFQQ(next); - /* - * all requests of this queue are merged to other queues, delete it - * from the service tree. If it's the active_queue, - * cfq_dispatch_requests() will choose to expire it or do idle - */ - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) && - cfqq != cfqd->active_queue) - cfq_del_cfqq_rr(cfqd, cfqq); -} - -static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq, - struct bio *bio) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - bool is_sync = op_is_sync(bio->bi_opf); - struct cfq_io_cq *cic; - struct cfq_queue *cfqq; - - /* - * Disallow merge of a sync bio into an async request. - */ - if (is_sync && !rq_is_sync(rq)) - return false; - - /* - * Lookup the cfqq that this bio will be queued with and allow - * merge only if rq is queued there. - */ - cic = cfq_cic_lookup(cfqd, current->io_context); - if (!cic) - return false; - - cfqq = cic_to_cfqq(cic, is_sync); - return cfqq == RQ_CFQQ(rq); -} - -static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq, - struct request *next) -{ - return RQ_CFQQ(rq) == RQ_CFQQ(next); -} - -static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - hrtimer_try_to_cancel(&cfqd->idle_slice_timer); - cfqg_stats_update_idle_time(cfqq->cfqg); -} - -static void __cfq_set_active_queue(struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - if (cfqq) { - cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d", - cfqd->serving_wl_class, cfqd->serving_wl_type); - cfqg_stats_update_avg_queue_size(cfqq->cfqg); - cfqq->slice_start = 0; - cfqq->dispatch_start = ktime_get_ns(); - cfqq->allocated_slice = 0; - cfqq->slice_end = 0; - cfqq->slice_dispatch = 0; - cfqq->nr_sectors = 0; - - cfq_clear_cfqq_wait_request(cfqq); - cfq_clear_cfqq_must_dispatch(cfqq); - cfq_clear_cfqq_must_alloc_slice(cfqq); - cfq_clear_cfqq_fifo_expire(cfqq); - cfq_mark_cfqq_slice_new(cfqq); - - cfq_del_timer(cfqd, cfqq); - } - - cfqd->active_queue = cfqq; -} - -/* - * current cfqq expired its slice (or was too idle), select new one - */ -static void -__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, - bool timed_out) -{ - cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); - - if (cfq_cfqq_wait_request(cfqq)) - cfq_del_timer(cfqd, cfqq); - - cfq_clear_cfqq_wait_request(cfqq); - cfq_clear_cfqq_wait_busy(cfqq); - - /* - * If this cfqq is shared between multiple processes, check to - * make sure that those processes are still issuing I/Os within - * the mean seek distance. If not, it may be time to break the - * queues apart again. - */ - if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq)) - cfq_mark_cfqq_split_coop(cfqq); - - /* - * store what was left of this slice, if the queue idled/timed out - */ - if (timed_out) { - if (cfq_cfqq_slice_new(cfqq)) - cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq); - else - cfqq->slice_resid = cfqq->slice_end - ktime_get_ns(); - cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid); - } - - cfq_group_served(cfqd, cfqq->cfqg, cfqq); - - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) - cfq_del_cfqq_rr(cfqd, cfqq); - - cfq_resort_rr_list(cfqd, cfqq); - - if (cfqq == cfqd->active_queue) - cfqd->active_queue = NULL; - - if (cfqd->active_cic) { - put_io_context(cfqd->active_cic->icq.ioc); - cfqd->active_cic = NULL; - } -} - -static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) -{ - struct cfq_queue *cfqq = cfqd->active_queue; - - if (cfqq) - __cfq_slice_expired(cfqd, cfqq, timed_out); -} - -/* - * Get next queue for service. Unless we have a queue preemption, - * we'll simply select the first cfqq in the service tree. - */ -static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) -{ - struct cfq_rb_root *st = st_for(cfqd->serving_group, - cfqd->serving_wl_class, cfqd->serving_wl_type); - - if (!cfqd->rq_queued) - return NULL; - - /* There is nothing to dispatch */ - if (!st) - return NULL; - if (RB_EMPTY_ROOT(&st->rb.rb_root)) - return NULL; - return cfq_rb_first(st); -} - -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) -{ - struct cfq_group *cfqg; - struct cfq_queue *cfqq; - int i, j; - struct cfq_rb_root *st; - - if (!cfqd->rq_queued) - return NULL; - - cfqg = cfq_get_next_cfqg(cfqd); - if (!cfqg) - return NULL; - - for_each_cfqg_st(cfqg, i, j, st) { - cfqq = cfq_rb_first(st); - if (cfqq) - return cfqq; - } - return NULL; -} - -/* - * Get and set a new active queue for service. - */ -static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - if (!cfqq) - cfqq = cfq_get_next_queue(cfqd); - - __cfq_set_active_queue(cfqd, cfqq); - return cfqq; -} - -static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, - struct request *rq) -{ - if (blk_rq_pos(rq) >= cfqd->last_position) - return blk_rq_pos(rq) - cfqd->last_position; - else - return cfqd->last_position - blk_rq_pos(rq); -} - -static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *rq) -{ - return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR; -} - -static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, - struct cfq_queue *cur_cfqq) -{ - struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio]; - struct rb_node *parent, *node; - struct cfq_queue *__cfqq; - sector_t sector = cfqd->last_position; - - if (RB_EMPTY_ROOT(root)) - return NULL; - - /* - * First, if we find a request starting at the end of the last - * request, choose it. - */ - __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL); - if (__cfqq) - return __cfqq; - - /* - * If the exact sector wasn't found, the parent of the NULL leaf - * will contain the closest sector. - */ - __cfqq = rb_entry(parent, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) - return __cfqq; - - if (blk_rq_pos(__cfqq->next_rq) < sector) - node = rb_next(&__cfqq->p_node); - else - node = rb_prev(&__cfqq->p_node); - if (!node) - return NULL; - - __cfqq = rb_entry(node, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) - return __cfqq; - - return NULL; -} - -/* - * cfqd - obvious - * cur_cfqq - passed in so that we don't decide that the current queue is - * closely cooperating with itself. - * - * So, basically we're assuming that that cur_cfqq has dispatched at least - * one request, and that cfqd->last_position reflects a position on the disk - * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid - * assumption. - */ -static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, - struct cfq_queue *cur_cfqq) -{ - struct cfq_queue *cfqq; - - if (cfq_class_idle(cur_cfqq)) - return NULL; - if (!cfq_cfqq_sync(cur_cfqq)) - return NULL; - if (CFQQ_SEEKY(cur_cfqq)) - return NULL; - - /* - * Don't search priority tree if it's the only queue in the group. - */ - if (cur_cfqq->cfqg->nr_cfqq == 1) - return NULL; - - /* - * We should notice if some of the queues are cooperating, eg - * working closely on the same area of the disk. In that case, - * we can group them together and don't waste time idling. - */ - cfqq = cfqq_close(cfqd, cur_cfqq); - if (!cfqq) - return NULL; - - /* If new queue belongs to different cfq_group, don't choose it */ - if (cur_cfqq->cfqg != cfqq->cfqg) - return NULL; - - /* - * It only makes sense to merge sync queues. - */ - if (!cfq_cfqq_sync(cfqq)) - return NULL; - if (CFQQ_SEEKY(cfqq)) - return NULL; - - /* - * Do not merge queues of different priority classes - */ - if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) - return NULL; - - return cfqq; -} - -/* - * Determine whether we should enforce idle window for this queue. - */ - -static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - enum wl_class_t wl_class = cfqq_class(cfqq); - struct cfq_rb_root *st = cfqq->service_tree; - - BUG_ON(!st); - BUG_ON(!st->count); - - if (!cfqd->cfq_slice_idle) - return false; - - /* We never do for idle class queues. */ - if (wl_class == IDLE_WORKLOAD) - return false; - - /* We do for queues that were marked with idle window flag. */ - if (cfq_cfqq_idle_window(cfqq) && - !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) - return true; - - /* - * Otherwise, we do only if they are the last ones - * in their service tree. - */ - if (st->count == 1 && cfq_cfqq_sync(cfqq) && - !cfq_io_thinktime_big(cfqd, &st->ttime, false)) - return true; - cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count); - return false; -} - -static void cfq_arm_slice_timer(struct cfq_data *cfqd) -{ - struct cfq_queue *cfqq = cfqd->active_queue; - struct cfq_rb_root *st = cfqq->service_tree; - struct cfq_io_cq *cic; - u64 sl, group_idle = 0; - u64 now = ktime_get_ns(); - - /* - * SSD device without seek penalty, disable idling. But only do so - * for devices that support queuing, otherwise we still have a problem - * with sync vs async workloads. - */ - if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag && - !cfqd->cfq_group_idle) - return; - - WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); - WARN_ON(cfq_cfqq_slice_new(cfqq)); - - /* - * idle is disabled, either manually or by past process history - */ - if (!cfq_should_idle(cfqd, cfqq)) { - /* no queue idling. Check for group idling */ - if (cfqd->cfq_group_idle) - group_idle = cfqd->cfq_group_idle; - else - return; - } - - /* - * still active requests from this queue, don't idle - */ - if (cfqq->dispatched) - return; - - /* - * task has exited, don't wait - */ - cic = cfqd->active_cic; - if (!cic || !atomic_read(&cic->icq.ioc->active_ref)) - return; - - /* - * If our average think time is larger than the remaining time - * slice, then don't idle. This avoids overrunning the allotted - * time slice. - */ - if (sample_valid(cic->ttime.ttime_samples) && - (cfqq->slice_end - now < cic->ttime.ttime_mean)) { - cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu", - cic->ttime.ttime_mean); - return; - } - - /* - * There are other queues in the group or this is the only group and - * it has too big thinktime, don't do group idle. - */ - if (group_idle && - (cfqq->cfqg->nr_cfqq > 1 || - cfq_io_thinktime_big(cfqd, &st->ttime, true))) - return; - - cfq_mark_cfqq_wait_request(cfqq); - - if (group_idle) - sl = cfqd->cfq_group_idle; - else - sl = cfqd->cfq_slice_idle; - - hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl), - HRTIMER_MODE_REL); - cfqg_stats_set_start_idle_time(cfqq->cfqg); - cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl, - group_idle ? 1 : 0); -} - -/* - * Move request from internal lists to the request queue dispatch list. - */ -static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_queue *cfqq = RQ_CFQQ(rq); - - cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); - - cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); - cfq_remove_request(rq); - cfqq->dispatched++; - (RQ_CFQG(rq))->dispatched++; - elv_dispatch_sort(q, rq); - - cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; - cfqq->nr_sectors += blk_rq_sectors(rq); -} - -/* - * return expired entry, or NULL to just start from scratch in rbtree - */ -static struct request *cfq_check_fifo(struct cfq_queue *cfqq) -{ - struct request *rq = NULL; - - if (cfq_cfqq_fifo_expire(cfqq)) - return NULL; - - cfq_mark_cfqq_fifo_expire(cfqq); - - if (list_empty(&cfqq->fifo)) - return NULL; - - rq = rq_entry_fifo(cfqq->fifo.next); - if (ktime_get_ns() < rq->fifo_time) - rq = NULL; - - return rq; -} - -static inline int -cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - const int base_rq = cfqd->cfq_slice_async_rq; - - WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); - - return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio); -} - -/* - * Must be called with the queue_lock held. - */ -static int cfqq_process_refs(struct cfq_queue *cfqq) -{ - int process_refs, io_refs; - - io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; - process_refs = cfqq->ref - io_refs; - BUG_ON(process_refs < 0); - return process_refs; -} - -static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) -{ - int process_refs, new_process_refs; - struct cfq_queue *__cfqq; - - /* - * If there are no process references on the new_cfqq, then it is - * unsafe to follow the ->new_cfqq chain as other cfqq's in the - * chain may have dropped their last reference (not just their - * last process reference). - */ - if (!cfqq_process_refs(new_cfqq)) - return; - - /* Avoid a circular list and skip interim queue merges */ - while ((__cfqq = new_cfqq->new_cfqq)) { - if (__cfqq == cfqq) - return; - new_cfqq = __cfqq; - } - - process_refs = cfqq_process_refs(cfqq); - new_process_refs = cfqq_process_refs(new_cfqq); - /* - * If the process for the cfqq has gone away, there is no - * sense in merging the queues. - */ - if (process_refs == 0 || new_process_refs == 0) - return; - - /* - * Merge in the direction of the lesser amount of work. - */ - if (new_process_refs >= process_refs) { - cfqq->new_cfqq = new_cfqq; - new_cfqq->ref += process_refs; - } else { - new_cfqq->new_cfqq = cfqq; - cfqq->ref += new_process_refs; - } -} - -static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd, - struct cfq_group *cfqg, enum wl_class_t wl_class) -{ - struct cfq_queue *queue; - int i; - bool key_valid = false; - u64 lowest_key = 0; - enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; - - for (i = 0; i <= SYNC_WORKLOAD; ++i) { - /* select the one with lowest rb_key */ - queue = cfq_rb_first(st_for(cfqg, wl_class, i)); - if (queue && - (!key_valid || queue->rb_key < lowest_key)) { - lowest_key = queue->rb_key; - cur_best = i; - key_valid = true; - } - } - - return cur_best; -} - -static void -choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg) -{ - u64 slice; - unsigned count; - struct cfq_rb_root *st; - u64 group_slice; - enum wl_class_t original_class = cfqd->serving_wl_class; - u64 now = ktime_get_ns(); - - /* Choose next priority. RT > BE > IDLE */ - if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) - cfqd->serving_wl_class = RT_WORKLOAD; - else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) - cfqd->serving_wl_class = BE_WORKLOAD; - else { - cfqd->serving_wl_class = IDLE_WORKLOAD; - cfqd->workload_expires = now + jiffies_to_nsecs(1); - return; - } - - if (original_class != cfqd->serving_wl_class) - goto new_workload; - - /* - * For RT and BE, we have to choose also the type - * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload - * expiration time - */ - st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); - count = st->count; - - /* - * check workload expiration, and that we still have other queues ready - */ - if (count && !(now > cfqd->workload_expires)) - return; - -new_workload: - /* otherwise select new workload type */ - cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg, - cfqd->serving_wl_class); - st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); - count = st->count; - - /* - * the workload slice is computed as a fraction of target latency - * proportional to the number of queues in that workload, over - * all the queues in the same priority class - */ - group_slice = cfq_group_slice(cfqd, cfqg); - - slice = div_u64(group_slice * count, - max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class], - cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd, - cfqg))); - - if (cfqd->serving_wl_type == ASYNC_WORKLOAD) { - u64 tmp; - - /* - * Async queues are currently system wide. Just taking - * proportion of queues with-in same group will lead to higher - * async ratio system wide as generally root group is going - * to have higher weight. A more accurate thing would be to - * calculate system wide asnc/sync ratio. - */ - tmp = cfqd->cfq_target_latency * - cfqg_busy_async_queues(cfqd, cfqg); - tmp = div_u64(tmp, cfqd->busy_queues); - slice = min_t(u64, slice, tmp); - - /* async workload slice is scaled down according to - * the sync/async slice ratio. */ - slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]); - } else - /* sync workload slice is at least 2 * cfq_slice_idle */ - slice = max(slice, 2 * cfqd->cfq_slice_idle); - - slice = max_t(u64, slice, CFQ_MIN_TT); - cfq_log(cfqd, "workload slice:%llu", slice); - cfqd->workload_expires = now + slice; -} - -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) -{ - struct cfq_rb_root *st = &cfqd->grp_service_tree; - struct cfq_group *cfqg; - - if (RB_EMPTY_ROOT(&st->rb.rb_root)) - return NULL; - cfqg = cfq_rb_first_group(st); - update_min_vdisktime(st); - return cfqg; -} - -static void cfq_choose_cfqg(struct cfq_data *cfqd) -{ - struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); - u64 now = ktime_get_ns(); - - cfqd->serving_group = cfqg; - - /* Restore the workload type data */ - if (cfqg->saved_wl_slice) { - cfqd->workload_expires = now + cfqg->saved_wl_slice; - cfqd->serving_wl_type = cfqg->saved_wl_type; - cfqd->serving_wl_class = cfqg->saved_wl_class; - } else - cfqd->workload_expires = now - 1; - - choose_wl_class_and_type(cfqd, cfqg); -} - -/* - * Select a queue for service. If we have a current active queue, - * check whether to continue servicing it, or retrieve and set a new one. - */ -static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) -{ - struct cfq_queue *cfqq, *new_cfqq = NULL; - u64 now = ktime_get_ns(); - - cfqq = cfqd->active_queue; - if (!cfqq) - goto new_queue; - - if (!cfqd->rq_queued) - return NULL; - - /* - * We were waiting for group to get backlogged. Expire the queue - */ - if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) - goto expire; - - /* - * The active queue has run out of time, expire it and select new. - */ - if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) { - /* - * If slice had not expired at the completion of last request - * we might not have turned on wait_busy flag. Don't expire - * the queue yet. Allow the group to get backlogged. - * - * The very fact that we have used the slice, that means we - * have been idling all along on this queue and it should be - * ok to wait for this request to complete. - */ - if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list) - && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { - cfqq = NULL; - goto keep_queue; - } else - goto check_group_idle; - } - - /* - * The active queue has requests and isn't expired, allow it to - * dispatch. - */ - if (!RB_EMPTY_ROOT(&cfqq->sort_list)) - goto keep_queue; - - /* - * If another queue has a request waiting within our mean seek - * distance, let it run. The expire code will check for close - * cooperators and put the close queue at the front of the service - * tree. If possible, merge the expiring queue with the new cfqq. - */ - new_cfqq = cfq_close_cooperator(cfqd, cfqq); - if (new_cfqq) { - if (!cfqq->new_cfqq) - cfq_setup_merge(cfqq, new_cfqq); - goto expire; - } - - /* - * No requests pending. If the active queue still has requests in - * flight or is idling for a new request, allow either of these - * conditions to happen (or time out) before selecting a new queue. - */ - if (hrtimer_active(&cfqd->idle_slice_timer)) { - cfqq = NULL; - goto keep_queue; - } - - /* - * This is a deep seek queue, but the device is much faster than - * the queue can deliver, don't idle - **/ - if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) && - (cfq_cfqq_slice_new(cfqq) || - (cfqq->slice_end - now > now - cfqq->slice_start))) { - cfq_clear_cfqq_deep(cfqq); - cfq_clear_cfqq_idle_window(cfqq); - } - - if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { - cfqq = NULL; - goto keep_queue; - } - - /* - * If group idle is enabled and there are requests dispatched from - * this group, wait for requests to complete. - */ -check_group_idle: - if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 && - cfqq->cfqg->dispatched && - !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) { - cfqq = NULL; - goto keep_queue; - } - -expire: - cfq_slice_expired(cfqd, 0); -new_queue: - /* - * Current queue expired. Check if we have to switch to a new - * service tree - */ - if (!new_cfqq) - cfq_choose_cfqg(cfqd); - - cfqq = cfq_set_active_queue(cfqd, new_cfqq); -keep_queue: - return cfqq; -} - -static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) -{ - int dispatched = 0; - - while (cfqq->next_rq) { - cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); - dispatched++; - } - - BUG_ON(!list_empty(&cfqq->fifo)); - - /* By default cfqq is not expired if it is empty. Do it explicitly */ - __cfq_slice_expired(cfqq->cfqd, cfqq, 0); - return dispatched; -} - -/* - * Drain our current requests. Used for barriers and when switching - * io schedulers on-the-fly. - */ -static int cfq_forced_dispatch(struct cfq_data *cfqd) -{ - struct cfq_queue *cfqq; - int dispatched = 0; - - /* Expire the timeslice of the current active queue first */ - cfq_slice_expired(cfqd, 0); - while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { - __cfq_set_active_queue(cfqd, cfqq); - dispatched += __cfq_forced_dispatch_cfqq(cfqq); - } - - BUG_ON(cfqd->busy_queues); - - cfq_log(cfqd, "forced_dispatch=%d", dispatched); - return dispatched; -} - -static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - u64 now = ktime_get_ns(); - - /* the queue hasn't finished any request, can't estimate */ - if (cfq_cfqq_slice_new(cfqq)) - return true; - if (now + cfqd->cfq_slice_idle * cfqq->dispatched > cfqq->slice_end) - return true; - - return false; -} - -static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - unsigned int max_dispatch; - - if (cfq_cfqq_must_dispatch(cfqq)) - return true; - - /* - * Drain async requests before we start sync IO - */ - if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC]) - return false; - - /* - * If this is an async queue and we have sync IO in flight, let it wait - */ - if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq)) - return false; - - max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1); - if (cfq_class_idle(cfqq)) - max_dispatch = 1; - - /* - * Does this cfqq already have too much IO in flight? - */ - if (cfqq->dispatched >= max_dispatch) { - bool promote_sync = false; - /* - * idle queue must always only have a single IO in flight - */ - if (cfq_class_idle(cfqq)) - return false; - - /* - * If there is only one sync queue - * we can ignore async queue here and give the sync - * queue no dispatch limit. The reason is a sync queue can - * preempt async queue, limiting the sync queue doesn't make - * sense. This is useful for aiostress test. - */ - if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) - promote_sync = true; - - /* - * We have other queues, don't allow more IO from this one - */ - if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) && - !promote_sync) - return false; - - /* - * Sole queue user, no limit - */ - if (cfqd->busy_queues == 1 || promote_sync) - max_dispatch = -1; - else - /* - * Normally we start throttling cfqq when cfq_quantum/2 - * requests have been dispatched. But we can drive - * deeper queue depths at the beginning of slice - * subjected to upper limit of cfq_quantum. - * */ - max_dispatch = cfqd->cfq_quantum; - } - - /* - * Async queues must wait a bit before being allowed dispatch. - * We also ramp up the dispatch depth gradually for async IO, - * based on the last sync IO we serviced - */ - if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { - u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync; - unsigned int depth; - - depth = div64_u64(last_sync, cfqd->cfq_slice[1]); - if (!depth && !cfqq->dispatched) - depth = 1; - if (depth < max_dispatch) - max_dispatch = depth; - } - - /* - * If we're below the current max, allow a dispatch - */ - return cfqq->dispatched < max_dispatch; -} - -/* - * Dispatch a request from cfqq, moving them to the request queue - * dispatch list. - */ -static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - struct request *rq; - - BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); - - rq = cfq_check_fifo(cfqq); - if (rq) - cfq_mark_cfqq_must_dispatch(cfqq); - - if (!cfq_may_dispatch(cfqd, cfqq)) - return false; - - /* - * follow expired path, else get first next available - */ - if (!rq) - rq = cfqq->next_rq; - else - cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); - - /* - * insert request into driver dispatch list - */ - cfq_dispatch_insert(cfqd->queue, rq); - - if (!cfqd->active_cic) { - struct cfq_io_cq *cic = RQ_CIC(rq); - - atomic_long_inc(&cic->icq.ioc->refcount); - cfqd->active_cic = cic; - } - - return true; -} - -/* - * Find the cfqq that we need to service and move a request from that to the - * dispatch list - */ -static int cfq_dispatch_requests(struct request_queue *q, int force) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_queue *cfqq; - - if (!cfqd->busy_queues) - return 0; - - if (unlikely(force)) - return cfq_forced_dispatch(cfqd); - - cfqq = cfq_select_queue(cfqd); - if (!cfqq) - return 0; - - /* - * Dispatch a request from this cfqq, if it is allowed - */ - if (!cfq_dispatch_request(cfqd, cfqq)) - return 0; - - cfqq->slice_dispatch++; - cfq_clear_cfqq_must_dispatch(cfqq); - - /* - * expire an async queue immediately if it has used up its slice. idle - * queue always expire after 1 dispatch round. - */ - if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && - cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || - cfq_class_idle(cfqq))) { - cfqq->slice_end = ktime_get_ns() + 1; - cfq_slice_expired(cfqd, 0); - } - - cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); - return 1; -} - -/* - * task holds one reference to the queue, dropped when task exits. each rq - * in-flight on this queue also holds a reference, dropped when rq is freed. - * - * Each cfq queue took a reference on the parent group. Drop it now. - * queue lock must be held here. - */ -static void cfq_put_queue(struct cfq_queue *cfqq) -{ - struct cfq_data *cfqd = cfqq->cfqd; - struct cfq_group *cfqg; - - BUG_ON(cfqq->ref <= 0); - - cfqq->ref--; - if (cfqq->ref) - return; - - cfq_log_cfqq(cfqd, cfqq, "put_queue"); - BUG_ON(rb_first(&cfqq->sort_list)); - BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); - cfqg = cfqq->cfqg; - - if (unlikely(cfqd->active_queue == cfqq)) { - __cfq_slice_expired(cfqd, cfqq, 0); - cfq_schedule_dispatch(cfqd); - } - - BUG_ON(cfq_cfqq_on_rr(cfqq)); - kmem_cache_free(cfq_pool, cfqq); - cfqg_put(cfqg); -} - -static void cfq_put_cooperator(struct cfq_queue *cfqq) -{ - struct cfq_queue *__cfqq, *next; - - /* - * If this queue was scheduled to merge with another queue, be - * sure to drop the reference taken on that queue (and others in - * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. - */ - __cfqq = cfqq->new_cfqq; - while (__cfqq) { - if (__cfqq == cfqq) { - WARN(1, "cfqq->new_cfqq loop detected\n"); - break; - } - next = __cfqq->new_cfqq; - cfq_put_queue(__cfqq); - __cfqq = next; - } -} - -static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - if (unlikely(cfqq == cfqd->active_queue)) { - __cfq_slice_expired(cfqd, cfqq, 0); - cfq_schedule_dispatch(cfqd); - } - - cfq_put_cooperator(cfqq); - - cfq_put_queue(cfqq); -} - -static void cfq_init_icq(struct io_cq *icq) -{ - struct cfq_io_cq *cic = icq_to_cic(icq); - - cic->ttime.last_end_request = ktime_get_ns(); -} - -static void cfq_exit_icq(struct io_cq *icq) -{ - struct cfq_io_cq *cic = icq_to_cic(icq); - struct cfq_data *cfqd = cic_to_cfqd(cic); - - if (cic_to_cfqq(cic, false)) { - cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false)); - cic_set_cfqq(cic, NULL, false); - } - - if (cic_to_cfqq(cic, true)) { - cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true)); - cic_set_cfqq(cic, NULL, true); - } -} - -static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) -{ - struct task_struct *tsk = current; - int ioprio_class; - - if (!cfq_cfqq_prio_changed(cfqq)) - return; - - ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); - switch (ioprio_class) { - default: - printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); - /* fall through */ - case IOPRIO_CLASS_NONE: - /* - * no prio set, inherit CPU scheduling settings - */ - cfqq->ioprio = task_nice_ioprio(tsk); - cfqq->ioprio_class = task_nice_ioclass(tsk); - break; - case IOPRIO_CLASS_RT: - cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); - cfqq->ioprio_class = IOPRIO_CLASS_RT; - break; - case IOPRIO_CLASS_BE: - cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); - cfqq->ioprio_class = IOPRIO_CLASS_BE; - break; - case IOPRIO_CLASS_IDLE: - cfqq->ioprio_class = IOPRIO_CLASS_IDLE; - cfqq->ioprio = 7; - cfq_clear_cfqq_idle_window(cfqq); - break; - } - - /* - * keep track of original prio settings in case we have to temporarily - * elevate the priority of this queue - */ - cfqq->org_ioprio = cfqq->ioprio; - cfqq->org_ioprio_class = cfqq->ioprio_class; - cfq_clear_cfqq_prio_changed(cfqq); -} - -static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) -{ - int ioprio = cic->icq.ioc->ioprio; - struct cfq_data *cfqd = cic_to_cfqd(cic); - struct cfq_queue *cfqq; - - /* - * Check whether ioprio has changed. The condition may trigger - * spuriously on a newly created cic but there's no harm. - */ - if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) - return; - - cfqq = cic_to_cfqq(cic, false); - if (cfqq) { - cfq_put_queue(cfqq); - cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio); - cic_set_cfqq(cic, cfqq, false); - } - - cfqq = cic_to_cfqq(cic, true); - if (cfqq) - cfq_mark_cfqq_prio_changed(cfqq); - - cic->ioprio = ioprio; -} - -static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, - pid_t pid, bool is_sync) -{ - RB_CLEAR_NODE(&cfqq->rb_node); - RB_CLEAR_NODE(&cfqq->p_node); - INIT_LIST_HEAD(&cfqq->fifo); - - cfqq->ref = 0; - cfqq->cfqd = cfqd; - - cfq_mark_cfqq_prio_changed(cfqq); - - if (is_sync) { - if (!cfq_class_idle(cfqq)) - cfq_mark_cfqq_idle_window(cfqq); - cfq_mark_cfqq_sync(cfqq); - } - cfqq->pid = pid; -} - -#ifdef CONFIG_CFQ_GROUP_IOSCHED -static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) -{ - struct cfq_data *cfqd = cic_to_cfqd(cic); - struct cfq_queue *cfqq; - uint64_t serial_nr; - - rcu_read_lock(); - serial_nr = bio_blkcg(bio)->css.serial_nr; - rcu_read_unlock(); - - /* - * Check whether blkcg has changed. The condition may trigger - * spuriously on a newly created cic but there's no harm. - */ - if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) - return; - - /* - * Drop reference to queues. New queues will be assigned in new - * group upon arrival of fresh requests. - */ - cfqq = cic_to_cfqq(cic, false); - if (cfqq) { - cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); - cic_set_cfqq(cic, NULL, false); - cfq_put_queue(cfqq); - } - - cfqq = cic_to_cfqq(cic, true); - if (cfqq) { - cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); - cic_set_cfqq(cic, NULL, true); - cfq_put_queue(cfqq); - } - - cic->blkcg_serial_nr = serial_nr; -} -#else -static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) -{ -} -#endif /* CONFIG_CFQ_GROUP_IOSCHED */ - -static struct cfq_queue ** -cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio) -{ - switch (ioprio_class) { - case IOPRIO_CLASS_RT: - return &cfqg->async_cfqq[0][ioprio]; - case IOPRIO_CLASS_NONE: - ioprio = IOPRIO_NORM; - /* fall through */ - case IOPRIO_CLASS_BE: - return &cfqg->async_cfqq[1][ioprio]; - case IOPRIO_CLASS_IDLE: - return &cfqg->async_idle_cfqq; - default: - BUG(); - } -} - -static struct cfq_queue * -cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio) -{ - int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); - int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); - struct cfq_queue **async_cfqq = NULL; - struct cfq_queue *cfqq; - struct cfq_group *cfqg; - - rcu_read_lock(); - cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); - if (!cfqg) { - cfqq = &cfqd->oom_cfqq; - goto out; - } - - if (!is_sync) { - if (!ioprio_valid(cic->ioprio)) { - struct task_struct *tsk = current; - ioprio = task_nice_ioprio(tsk); - ioprio_class = task_nice_ioclass(tsk); - } - async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio); - cfqq = *async_cfqq; - if (cfqq) - goto out; - } - - cfqq = kmem_cache_alloc_node(cfq_pool, - GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, - cfqd->queue->node); - if (!cfqq) { - cfqq = &cfqd->oom_cfqq; - goto out; - } - - /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */ - cfqq->ioprio_class = IOPRIO_CLASS_NONE; - cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); - cfq_init_prio_data(cfqq, cic); - cfq_link_cfqq_cfqg(cfqq, cfqg); - cfq_log_cfqq(cfqd, cfqq, "alloced"); - - if (async_cfqq) { - /* a new async queue is created, pin and remember */ - cfqq->ref++; - *async_cfqq = cfqq; - } -out: - cfqq->ref++; - rcu_read_unlock(); - return cfqq; -} - -static void -__cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle) -{ - u64 elapsed = ktime_get_ns() - ttime->last_end_request; - elapsed = min(elapsed, 2UL * slice_idle); - - ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; - ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); - ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, - ttime->ttime_samples); -} - -static void -cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct cfq_io_cq *cic) -{ - if (cfq_cfqq_sync(cfqq)) { - __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); - __cfq_update_io_thinktime(&cfqq->service_tree->ttime, - cfqd->cfq_slice_idle); - } -#ifdef CONFIG_CFQ_GROUP_IOSCHED - __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle); -#endif -} - -static void -cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *rq) -{ - sector_t sdist = 0; - sector_t n_sec = blk_rq_sectors(rq); - if (cfqq->last_request_pos) { - if (cfqq->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - cfqq->last_request_pos; - else - sdist = cfqq->last_request_pos - blk_rq_pos(rq); - } - - cfqq->seek_history <<= 1; - if (blk_queue_nonrot(cfqd->queue)) - cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT); - else - cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); -} - -static inline bool req_noidle(struct request *req) -{ - return req_op(req) == REQ_OP_WRITE && - (req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC; -} - -/* - * Disable idle window if the process thinks too long or seeks so much that - * it doesn't matter - */ -static void -cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct cfq_io_cq *cic) -{ - int old_idle, enable_idle; - - /* - * Don't idle for async or idle io prio class - */ - if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) - return; - - enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); - - if (cfqq->queued[0] + cfqq->queued[1] >= 4) - cfq_mark_cfqq_deep(cfqq); - - if (cfqq->next_rq && req_noidle(cfqq->next_rq)) - enable_idle = 0; - else if (!atomic_read(&cic->icq.ioc->active_ref) || - !cfqd->cfq_slice_idle || - (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) - enable_idle = 0; - else if (sample_valid(cic->ttime.ttime_samples)) { - if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) - enable_idle = 0; - else - enable_idle = 1; - } - - if (old_idle != enable_idle) { - cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); - if (enable_idle) - cfq_mark_cfqq_idle_window(cfqq); - else - cfq_clear_cfqq_idle_window(cfqq); - } -} - -/* - * Check if new_cfqq should preempt the currently active queue. Return 0 for - * no or if we aren't sure, a 1 will cause a preempt. - */ -static bool -cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, - struct request *rq) -{ - struct cfq_queue *cfqq; - - cfqq = cfqd->active_queue; - if (!cfqq) - return false; - - if (cfq_class_idle(new_cfqq)) - return false; - - if (cfq_class_idle(cfqq)) - return true; - - /* - * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice. - */ - if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq)) - return false; - - /* - * if the new request is sync, but the currently running queue is - * not, let the sync request have priority. - */ - if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) - return true; - - /* - * Treat ancestors of current cgroup the same way as current cgroup. - * For anybody else we disallow preemption to guarantee service - * fairness among cgroups. - */ - if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg)) - return false; - - if (cfq_slice_used(cfqq)) - return true; - - /* - * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. - */ - if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) - return true; - - WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class); - /* Allow preemption only if we are idling on sync-noidle tree */ - if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD && - cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && - RB_EMPTY_ROOT(&cfqq->sort_list)) - return true; - - /* - * So both queues are sync. Let the new request get disk time if - * it's a metadata request and the current queue is doing regular IO. - */ - if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending) - return true; - - /* An idle queue should not be idle now for some reason */ - if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) - return true; - - if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) - return false; - - /* - * if this request is as-good as one we would expect from the - * current cfqq, let it preempt - */ - if (cfq_rq_close(cfqd, cfqq, rq)) - return true; - - return false; -} - -/* - * cfqq preempts the active queue. if we allowed preempt with no slice left, - * let it have half of its nominal slice. - */ -static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - enum wl_type_t old_type = cfqq_type(cfqd->active_queue); - - cfq_log_cfqq(cfqd, cfqq, "preempt"); - cfq_slice_expired(cfqd, 1); - - /* - * workload type is changed, don't save slice, otherwise preempt - * doesn't happen - */ - if (old_type != cfqq_type(cfqq)) - cfqq->cfqg->saved_wl_slice = 0; - - /* - * Put the new queue at the front of the of the current list, - * so we know that it will be selected next. - */ - BUG_ON(!cfq_cfqq_on_rr(cfqq)); - - cfq_service_tree_add(cfqd, cfqq, 1); - - cfqq->slice_end = 0; - cfq_mark_cfqq_slice_new(cfqq); -} - -/* - * Called when a new fs request (rq) is added (to cfqq). Check if there's - * something we should do about it - */ -static void -cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *rq) -{ - struct cfq_io_cq *cic = RQ_CIC(rq); - - cfqd->rq_queued++; - if (rq->cmd_flags & REQ_PRIO) - cfqq->prio_pending++; - - cfq_update_io_thinktime(cfqd, cfqq, cic); - cfq_update_io_seektime(cfqd, cfqq, rq); - cfq_update_idle_window(cfqd, cfqq, cic); - - cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); - - if (cfqq == cfqd->active_queue) { - /* - * Remember that we saw a request from this process, but - * don't start queuing just yet. Otherwise we risk seeing lots - * of tiny requests, because we disrupt the normal plugging - * and merging. If the request is already larger than a single - * page, let it rip immediately. For that case we assume that - * merging is already done. Ditto for a busy system that - * has other work pending, don't risk delaying until the - * idle timer unplug to continue working. - */ - if (cfq_cfqq_wait_request(cfqq)) { - if (blk_rq_bytes(rq) > PAGE_SIZE || - cfqd->busy_queues > 1) { - cfq_del_timer(cfqd, cfqq); - cfq_clear_cfqq_wait_request(cfqq); - __blk_run_queue(cfqd->queue); - } else { - cfqg_stats_update_idle_time(cfqq->cfqg); - cfq_mark_cfqq_must_dispatch(cfqq); - } - } - } else if (cfq_should_preempt(cfqd, cfqq, rq)) { - /* - * not the active queue - expire current slice if it is - * idle and has expired it's mean thinktime or this new queue - * has some old slice time left and is of higher priority or - * this new queue is RT and the current one is BE - */ - cfq_preempt_queue(cfqd, cfqq); - __blk_run_queue(cfqd->queue); - } -} - -static void cfq_insert_request(struct request_queue *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_queue *cfqq = RQ_CFQQ(rq); - - cfq_log_cfqq(cfqd, cfqq, "insert_request"); - cfq_init_prio_data(cfqq, RQ_CIC(rq)); - - rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)]; - list_add_tail(&rq->queuelist, &cfqq->fifo); - cfq_add_rq_rb(rq); - cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, - rq->cmd_flags); - cfq_rq_enqueued(cfqd, cfqq, rq); -} - -/* - * Update hw_tag based on peak queue depth over 50 samples under - * sufficient load. - */ -static void cfq_update_hw_tag(struct cfq_data *cfqd) -{ - struct cfq_queue *cfqq = cfqd->active_queue; - - if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth) - cfqd->hw_tag_est_depth = cfqd->rq_in_driver; - - if (cfqd->hw_tag == 1) - return; - - if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && - cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) - return; - - /* - * If active queue hasn't enough requests and can idle, cfq might not - * dispatch sufficient requests to hardware. Don't zero hw_tag in this - * case - */ - if (cfqq && cfq_cfqq_idle_window(cfqq) && - cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < - CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN) - return; - - if (cfqd->hw_tag_samples++ < 50) - return; - - if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) - cfqd->hw_tag = 1; - else - cfqd->hw_tag = 0; -} - -static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - struct cfq_io_cq *cic = cfqd->active_cic; - u64 now = ktime_get_ns(); - - /* If the queue already has requests, don't wait */ - if (!RB_EMPTY_ROOT(&cfqq->sort_list)) - return false; - - /* If there are other queues in the group, don't wait */ - if (cfqq->cfqg->nr_cfqq > 1) - return false; - - /* the only queue in the group, but think time is big */ - if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) - return false; - - if (cfq_slice_used(cfqq)) - return true; - - /* if slice left is less than think time, wait busy */ - if (cic && sample_valid(cic->ttime.ttime_samples) - && (cfqq->slice_end - now < cic->ttime.ttime_mean)) - return true; - - /* - * If think times is less than a jiffy than ttime_mean=0 and above - * will not be true. It might happen that slice has not expired yet - * but will expire soon (4-5 ns) during select_queue(). To cover the - * case where think time is less than a jiffy, mark the queue wait - * busy if only 1 jiffy is left in the slice. - */ - if (cfqq->slice_end - now <= jiffies_to_nsecs(1)) - return true; - - return false; -} - -static void cfq_completed_request(struct request_queue *q, struct request *rq) -{ - struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = cfqq->cfqd; - const int sync = rq_is_sync(rq); - u64 now = ktime_get_ns(); - - cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq)); - - cfq_update_hw_tag(cfqd); - - WARN_ON(!cfqd->rq_in_driver); - WARN_ON(!cfqq->dispatched); - cfqd->rq_in_driver--; - cfqq->dispatched--; - (RQ_CFQG(rq))->dispatched--; - cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns, - rq->io_start_time_ns, rq->cmd_flags); - - cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; - - if (sync) { - struct cfq_rb_root *st; - - RQ_CIC(rq)->ttime.last_end_request = now; - - if (cfq_cfqq_on_rr(cfqq)) - st = cfqq->service_tree; - else - st = st_for(cfqq->cfqg, cfqq_class(cfqq), - cfqq_type(cfqq)); - - st->ttime.last_end_request = now; - if (rq->start_time_ns + cfqd->cfq_fifo_expire[1] <= now) - cfqd->last_delayed_sync = now; - } - -#ifdef CONFIG_CFQ_GROUP_IOSCHED - cfqq->cfqg->ttime.last_end_request = now; -#endif - - /* - * If this is the active queue, check if it needs to be expired, - * or if we want to idle in case it has no pending requests. - */ - if (cfqd->active_queue == cfqq) { - const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); - - if (cfq_cfqq_slice_new(cfqq)) { - cfq_set_prio_slice(cfqd, cfqq); - cfq_clear_cfqq_slice_new(cfqq); - } - - /* - * Should we wait for next request to come in before we expire - * the queue. - */ - if (cfq_should_wait_busy(cfqd, cfqq)) { - u64 extend_sl = cfqd->cfq_slice_idle; - if (!cfqd->cfq_slice_idle) - extend_sl = cfqd->cfq_group_idle; - cfqq->slice_end = now + extend_sl; - cfq_mark_cfqq_wait_busy(cfqq); - cfq_log_cfqq(cfqd, cfqq, "will busy wait"); - } - - /* - * Idling is not enabled on: - * - expired queues - * - idle-priority queues - * - async queues - * - queues with still some requests queued - * - when there is a close cooperator - */ - if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) - cfq_slice_expired(cfqd, 1); - else if (sync && cfqq_empty && - !cfq_close_cooperator(cfqd, cfqq)) { - cfq_arm_slice_timer(cfqd); - } - } - - if (!cfqd->rq_in_driver) - cfq_schedule_dispatch(cfqd); -} - -static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op) -{ - /* - * If REQ_PRIO is set, boost class and prio level, if it's below - * BE/NORM. If prio is not set, restore the potentially boosted - * class/prio level. - */ - if (!(op & REQ_PRIO)) { - cfqq->ioprio_class = cfqq->org_ioprio_class; - cfqq->ioprio = cfqq->org_ioprio; - } else { - if (cfq_class_idle(cfqq)) - cfqq->ioprio_class = IOPRIO_CLASS_BE; - if (cfqq->ioprio > IOPRIO_NORM) - cfqq->ioprio = IOPRIO_NORM; - } -} - -static inline int __cfq_may_queue(struct cfq_queue *cfqq) -{ - if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { - cfq_mark_cfqq_must_alloc_slice(cfqq); - return ELV_MQUEUE_MUST; - } - - return ELV_MQUEUE_MAY; -} - -static int cfq_may_queue(struct request_queue *q, unsigned int op) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - struct task_struct *tsk = current; - struct cfq_io_cq *cic; - struct cfq_queue *cfqq; - - /* - * don't force setup of a queue from here, as a call to may_queue - * does not necessarily imply that a request actually will be queued. - * so just lookup a possibly existing queue, or return 'may queue' - * if that fails - */ - cic = cfq_cic_lookup(cfqd, tsk->io_context); - if (!cic) - return ELV_MQUEUE_MAY; - - cfqq = cic_to_cfqq(cic, op_is_sync(op)); - if (cfqq) { - cfq_init_prio_data(cfqq, cic); - cfqq_boost_on_prio(cfqq, op); - - return __cfq_may_queue(cfqq); - } - - return ELV_MQUEUE_MAY; -} - -/* - * queue lock held here - */ -static void cfq_put_request(struct request *rq) -{ - struct cfq_queue *cfqq = RQ_CFQQ(rq); - - if (cfqq) { - const int rw = rq_data_dir(rq); - - BUG_ON(!cfqq->allocated[rw]); - cfqq->allocated[rw]--; - - /* Put down rq reference on cfqg */ - cfqg_put(RQ_CFQG(rq)); - rq->elv.priv[0] = NULL; - rq->elv.priv[1] = NULL; - - cfq_put_queue(cfqq); - } -} - -static struct cfq_queue * -cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic, - struct cfq_queue *cfqq) -{ - cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); - cic_set_cfqq(cic, cfqq->new_cfqq, 1); - cfq_mark_cfqq_coop(cfqq->new_cfqq); - cfq_put_queue(cfqq); - return cic_to_cfqq(cic, 1); -} - -/* - * Returns NULL if a new cfqq should be allocated, or the old cfqq if this - * was the last process referring to said cfqq. - */ -static struct cfq_queue * -split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) -{ - if (cfqq_process_refs(cfqq) == 1) { - cfqq->pid = current->pid; - cfq_clear_cfqq_coop(cfqq); - cfq_clear_cfqq_split_coop(cfqq); - return cfqq; - } - - cic_set_cfqq(cic, NULL, 1); - - cfq_put_cooperator(cfqq); - - cfq_put_queue(cfqq); - return NULL; -} -/* - * Allocate cfq data structures associated with this request. - */ -static int -cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, - gfp_t gfp_mask) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); - const int rw = rq_data_dir(rq); - const bool is_sync = rq_is_sync(rq); - struct cfq_queue *cfqq; - - spin_lock_irq(q->queue_lock); - - check_ioprio_changed(cic, bio); - check_blkcg_changed(cic, bio); -new_queue: - cfqq = cic_to_cfqq(cic, is_sync); - if (!cfqq || cfqq == &cfqd->oom_cfqq) { - if (cfqq) - cfq_put_queue(cfqq); - cfqq = cfq_get_queue(cfqd, is_sync, cic, bio); - cic_set_cfqq(cic, cfqq, is_sync); - } else { - /* - * If the queue was seeky for too long, break it apart. - */ - if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { - cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); - cfqq = split_cfqq(cic, cfqq); - if (!cfqq) - goto new_queue; - } - - /* - * Check to see if this queue is scheduled to merge with - * another, closely cooperating queue. The merging of - * queues happens here as it must be done in process context. - * The reference on new_cfqq was taken in merge_cfqqs. - */ - if (cfqq->new_cfqq) - cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); - } - - cfqq->allocated[rw]++; - - cfqq->ref++; - cfqg_get(cfqq->cfqg); - rq->elv.priv[0] = cfqq; - rq->elv.priv[1] = cfqq->cfqg; - spin_unlock_irq(q->queue_lock); - - return 0; -} - -static void cfq_kick_queue(struct work_struct *work) -{ - struct cfq_data *cfqd = - container_of(work, struct cfq_data, unplug_work); - struct request_queue *q = cfqd->queue; - - spin_lock_irq(q->queue_lock); - __blk_run_queue(cfqd->queue); - spin_unlock_irq(q->queue_lock); -} - -/* - * Timer running if the active_queue is currently idling inside its time slice - */ -static enum hrtimer_restart cfq_idle_slice_timer(struct hrtimer *timer) -{ - struct cfq_data *cfqd = container_of(timer, struct cfq_data, - idle_slice_timer); - struct cfq_queue *cfqq; - unsigned long flags; - int timed_out = 1; - - cfq_log(cfqd, "idle timer fired"); - - spin_lock_irqsave(cfqd->queue->queue_lock, flags); - - cfqq = cfqd->active_queue; - if (cfqq) { - timed_out = 0; - - /* - * We saw a request before the queue expired, let it through - */ - if (cfq_cfqq_must_dispatch(cfqq)) - goto out_kick; - - /* - * expired - */ - if (cfq_slice_used(cfqq)) - goto expire; - - /* - * only expire and reinvoke request handler, if there are - * other queues with pending requests - */ - if (!cfqd->busy_queues) - goto out_cont; - - /* - * not expired and it has a request pending, let it dispatch - */ - if (!RB_EMPTY_ROOT(&cfqq->sort_list)) - goto out_kick; - - /* - * Queue depth flag is reset only when the idle didn't succeed - */ - cfq_clear_cfqq_deep(cfqq); - } -expire: - cfq_slice_expired(cfqd, timed_out); -out_kick: - cfq_schedule_dispatch(cfqd); -out_cont: - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); - return HRTIMER_NORESTART; -} - -static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) -{ - hrtimer_cancel(&cfqd->idle_slice_timer); - cancel_work_sync(&cfqd->unplug_work); -} - -static void cfq_exit_queue(struct elevator_queue *e) -{ - struct cfq_data *cfqd = e->elevator_data; - struct request_queue *q = cfqd->queue; - - cfq_shutdown_timer_wq(cfqd); - - spin_lock_irq(q->queue_lock); - - if (cfqd->active_queue) - __cfq_slice_expired(cfqd, cfqd->active_queue, 0); - - spin_unlock_irq(q->queue_lock); - - cfq_shutdown_timer_wq(cfqd); - -#ifdef CONFIG_CFQ_GROUP_IOSCHED - blkcg_deactivate_policy(q, &blkcg_policy_cfq); -#else - kfree(cfqd->root_group); -#endif - kfree(cfqd); -} - -static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) -{ - struct cfq_data *cfqd; - struct blkcg_gq *blkg __maybe_unused; - int i, ret; - struct elevator_queue *eq; - - eq = elevator_alloc(q, e); - if (!eq) - return -ENOMEM; - - cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node); - if (!cfqd) { - kobject_put(&eq->kobj); - return -ENOMEM; - } - eq->elevator_data = cfqd; - - cfqd->queue = q; - spin_lock_irq(q->queue_lock); - q->elevator = eq; - spin_unlock_irq(q->queue_lock); - - /* Init root service tree */ - cfqd->grp_service_tree = CFQ_RB_ROOT; - - /* Init root group and prefer root group over other groups by default */ -#ifdef CONFIG_CFQ_GROUP_IOSCHED - ret = blkcg_activate_policy(q, &blkcg_policy_cfq); - if (ret) - goto out_free; - - cfqd->root_group = blkg_to_cfqg(q->root_blkg); -#else - ret = -ENOMEM; - cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group), - GFP_KERNEL, cfqd->queue->node); - if (!cfqd->root_group) - goto out_free; - - cfq_init_cfqg_base(cfqd->root_group); - cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; - cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; -#endif - - /* - * Not strictly needed (since RB_ROOT just clears the node and we - * zeroed cfqd on alloc), but better be safe in case someone decides - * to add magic to the rb code - */ - for (i = 0; i < CFQ_PRIO_LISTS; i++) - cfqd->prio_trees[i] = RB_ROOT; - - /* - * Our fallback cfqq if cfq_get_queue() runs into OOM issues. - * Grab a permanent reference to it, so that the normal code flow - * will not attempt to free it. oom_cfqq is linked to root_group - * but shouldn't hold a reference as it'll never be unlinked. Lose - * the reference from linking right away. - */ - cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); - cfqd->oom_cfqq.ref++; - - spin_lock_irq(q->queue_lock); - cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group); - cfqg_put(cfqd->root_group); - spin_unlock_irq(q->queue_lock); - - hrtimer_init(&cfqd->idle_slice_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - cfqd->idle_slice_timer.function = cfq_idle_slice_timer; - - INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); - - cfqd->cfq_quantum = cfq_quantum; - cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; - cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; - cfqd->cfq_back_max = cfq_back_max; - cfqd->cfq_back_penalty = cfq_back_penalty; - cfqd->cfq_slice[0] = cfq_slice_async; - cfqd->cfq_slice[1] = cfq_slice_sync; - cfqd->cfq_target_latency = cfq_target_latency; - cfqd->cfq_slice_async_rq = cfq_slice_async_rq; - cfqd->cfq_slice_idle = cfq_slice_idle; - cfqd->cfq_group_idle = cfq_group_idle; - cfqd->cfq_latency = 1; - cfqd->hw_tag = -1; - /* - * we optimistically start assuming sync ops weren't delayed in last - * second, in order to have larger depth for async operations. - */ - cfqd->last_delayed_sync = ktime_get_ns() - NSEC_PER_SEC; - return 0; - -out_free: - kfree(cfqd); - kobject_put(&eq->kobj); - return ret; -} - -static void cfq_registered_queue(struct request_queue *q) -{ - struct elevator_queue *e = q->elevator; - struct cfq_data *cfqd = e->elevator_data; - - /* - * Default to IOPS mode with no idling for SSDs - */ - if (blk_queue_nonrot(q)) - cfqd->cfq_slice_idle = 0; - wbt_disable_default(q); -} - -/* - * sysfs parts below --> - */ -static ssize_t -cfq_var_show(unsigned int var, char *page) -{ - return sprintf(page, "%u\n", var); -} - -static void -cfq_var_store(unsigned int *var, const char *page) -{ - char *p = (char *) page; - - *var = simple_strtoul(p, &p, 10); -} - -#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct cfq_data *cfqd = e->elevator_data; \ - u64 __data = __VAR; \ - if (__CONV) \ - __data = div_u64(__data, NSEC_PER_MSEC); \ - return cfq_var_show(__data, (page)); \ -} -SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); -SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); -SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); -SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); -SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); -SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); -SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1); -SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); -SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); -SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); -SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); -SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1); -#undef SHOW_FUNCTION - -#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct cfq_data *cfqd = e->elevator_data; \ - u64 __data = __VAR; \ - __data = div_u64(__data, NSEC_PER_USEC); \ - return cfq_var_show(__data, (page)); \ -} -USEC_SHOW_FUNCTION(cfq_slice_idle_us_show, cfqd->cfq_slice_idle); -USEC_SHOW_FUNCTION(cfq_group_idle_us_show, cfqd->cfq_group_idle); -USEC_SHOW_FUNCTION(cfq_slice_sync_us_show, cfqd->cfq_slice[1]); -USEC_SHOW_FUNCTION(cfq_slice_async_us_show, cfqd->cfq_slice[0]); -USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency); -#undef USEC_SHOW_FUNCTION - -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ -{ \ - struct cfq_data *cfqd = e->elevator_data; \ - unsigned int __data, __min = (MIN), __max = (MAX); \ - \ - cfq_var_store(&__data, (page)); \ - if (__data < __min) \ - __data = __min; \ - else if (__data > __max) \ - __data = __max; \ - if (__CONV) \ - *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ - else \ - *(__PTR) = __data; \ - return count; \ -} -STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); -STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, - UINT_MAX, 1); -STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, - UINT_MAX, 1); -STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); -STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, - UINT_MAX, 0); -STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); -STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1); -STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); -STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); -STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, - UINT_MAX, 0); -STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); -STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1); -#undef STORE_FUNCTION - -#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ -{ \ - struct cfq_data *cfqd = e->elevator_data; \ - unsigned int __data, __min = (MIN), __max = (MAX); \ - \ - cfq_var_store(&__data, (page)); \ - if (__data < __min) \ - __data = __min; \ - else if (__data > __max) \ - __data = __max; \ - *(__PTR) = (u64)__data * NSEC_PER_USEC; \ - return count; \ -} -USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX); -USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX); -USEC_STORE_FUNCTION(cfq_slice_sync_us_store, &cfqd->cfq_slice[1], 1, UINT_MAX); -USEC_STORE_FUNCTION(cfq_slice_async_us_store, &cfqd->cfq_slice[0], 1, UINT_MAX); -USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, UINT_MAX); -#undef USEC_STORE_FUNCTION - -#define CFQ_ATTR(name) \ - __ATTR(name, 0644, cfq_##name##_show, cfq_##name##_store) - -static struct elv_fs_entry cfq_attrs[] = { - CFQ_ATTR(quantum), - CFQ_ATTR(fifo_expire_sync), - CFQ_ATTR(fifo_expire_async), - CFQ_ATTR(back_seek_max), - CFQ_ATTR(back_seek_penalty), - CFQ_ATTR(slice_sync), - CFQ_ATTR(slice_sync_us), - CFQ_ATTR(slice_async), - CFQ_ATTR(slice_async_us), - CFQ_ATTR(slice_async_rq), - CFQ_ATTR(slice_idle), - CFQ_ATTR(slice_idle_us), - CFQ_ATTR(group_idle), - CFQ_ATTR(group_idle_us), - CFQ_ATTR(low_latency), - CFQ_ATTR(target_latency), - CFQ_ATTR(target_latency_us), - __ATTR_NULL -}; - -static struct elevator_type iosched_cfq = { - .ops.sq = { - .elevator_merge_fn = cfq_merge, - .elevator_merged_fn = cfq_merged_request, - .elevator_merge_req_fn = cfq_merged_requests, - .elevator_allow_bio_merge_fn = cfq_allow_bio_merge, - .elevator_allow_rq_merge_fn = cfq_allow_rq_merge, - .elevator_bio_merged_fn = cfq_bio_merged, - .elevator_dispatch_fn = cfq_dispatch_requests, - .elevator_add_req_fn = cfq_insert_request, - .elevator_activate_req_fn = cfq_activate_request, - .elevator_deactivate_req_fn = cfq_deactivate_request, - .elevator_completed_req_fn = cfq_completed_request, - .elevator_former_req_fn = elv_rb_former_request, - .elevator_latter_req_fn = elv_rb_latter_request, - .elevator_init_icq_fn = cfq_init_icq, - .elevator_exit_icq_fn = cfq_exit_icq, - .elevator_set_req_fn = cfq_set_request, - .elevator_put_req_fn = cfq_put_request, - .elevator_may_queue_fn = cfq_may_queue, - .elevator_init_fn = cfq_init_queue, - .elevator_exit_fn = cfq_exit_queue, - .elevator_registered_fn = cfq_registered_queue, - }, - .icq_size = sizeof(struct cfq_io_cq), - .icq_align = __alignof__(struct cfq_io_cq), - .elevator_attrs = cfq_attrs, - .elevator_name = "cfq", - .elevator_owner = THIS_MODULE, -}; - -#ifdef CONFIG_CFQ_GROUP_IOSCHED -static struct blkcg_policy blkcg_policy_cfq = { - .dfl_cftypes = cfq_blkcg_files, - .legacy_cftypes = cfq_blkcg_legacy_files, - - .cpd_alloc_fn = cfq_cpd_alloc, - .cpd_init_fn = cfq_cpd_init, - .cpd_free_fn = cfq_cpd_free, - .cpd_bind_fn = cfq_cpd_bind, - - .pd_alloc_fn = cfq_pd_alloc, - .pd_init_fn = cfq_pd_init, - .pd_offline_fn = cfq_pd_offline, - .pd_free_fn = cfq_pd_free, - .pd_reset_stats_fn = cfq_pd_reset_stats, -}; -#endif - -static int __init cfq_init(void) -{ - int ret; - -#ifdef CONFIG_CFQ_GROUP_IOSCHED - ret = blkcg_policy_register(&blkcg_policy_cfq); - if (ret) - return ret; -#else - cfq_group_idle = 0; -#endif - - ret = -ENOMEM; - cfq_pool = KMEM_CACHE(cfq_queue, 0); - if (!cfq_pool) - goto err_pol_unreg; - - ret = elv_register(&iosched_cfq); - if (ret) - goto err_free_pool; - - return 0; - -err_free_pool: - kmem_cache_destroy(cfq_pool); -err_pol_unreg: -#ifdef CONFIG_CFQ_GROUP_IOSCHED - blkcg_policy_unregister(&blkcg_policy_cfq); -#endif - return ret; -} - -static void __exit cfq_exit(void) -{ -#ifdef CONFIG_CFQ_GROUP_IOSCHED - blkcg_policy_unregister(&blkcg_policy_cfq); -#endif - elv_unregister(&iosched_cfq); - kmem_cache_destroy(cfq_pool); -} - -module_init(cfq_init); -module_exit(cfq_exit); - -MODULE_AUTHOR("Jens Axboe"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler"); diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c deleted file mode 100644 index ef2f1f09e9b3..000000000000 --- a/block/deadline-iosched.c +++ /dev/null @@ -1,560 +0,0 @@ -/* - * Deadline i/o scheduler. - * - * Copyright (C) 2002 Jens Axboe - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * See Documentation/block/deadline-iosched.txt - */ -static const int read_expire = HZ / 2; /* max time before a read is submitted. */ -static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ -static const int writes_starved = 2; /* max times reads can starve a write */ -static const int fifo_batch = 16; /* # of sequential requests treated as one - by the above parameters. For throughput. */ - -struct deadline_data { - /* - * run time data - */ - - /* - * requests (deadline_rq s) are present on both sort_list and fifo_list - */ - struct rb_root sort_list[2]; - struct list_head fifo_list[2]; - - /* - * next in sort order. read, write or both are NULL - */ - struct request *next_rq[2]; - unsigned int batching; /* number of sequential requests made */ - unsigned int starved; /* times reads have starved writes */ - - /* - * settings that change how the i/o scheduler behaves - */ - int fifo_expire[2]; - int fifo_batch; - int writes_starved; - int front_merges; -}; - -static inline struct rb_root * -deadline_rb_root(struct deadline_data *dd, struct request *rq) -{ - return &dd->sort_list[rq_data_dir(rq)]; -} - -/* - * get the request after `rq' in sector-sorted order - */ -static inline struct request * -deadline_latter_request(struct request *rq) -{ - struct rb_node *node = rb_next(&rq->rb_node); - - if (node) - return rb_entry_rq(node); - - return NULL; -} - -static void -deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) -{ - struct rb_root *root = deadline_rb_root(dd, rq); - - elv_rb_add(root, rq); -} - -static inline void -deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) -{ - const int data_dir = rq_data_dir(rq); - - if (dd->next_rq[data_dir] == rq) - dd->next_rq[data_dir] = deadline_latter_request(rq); - - elv_rb_del(deadline_rb_root(dd, rq), rq); -} - -/* - * add rq to rbtree and fifo - */ -static void -deadline_add_request(struct request_queue *q, struct request *rq) -{ - struct deadline_data *dd = q->elevator->elevator_data; - const int data_dir = rq_data_dir(rq); - - /* - * This may be a requeue of a write request that has locked its - * target zone. If it is the case, this releases the zone lock. - */ - blk_req_zone_write_unlock(rq); - - deadline_add_rq_rb(dd, rq); - - /* - * set expire time and add to fifo list - */ - rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; - list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); -} - -/* - * remove rq from rbtree and fifo. - */ -static void deadline_remove_request(struct request_queue *q, struct request *rq) -{ - struct deadline_data *dd = q->elevator->elevator_data; - - rq_fifo_clear(rq); - deadline_del_rq_rb(dd, rq); -} - -static enum elv_merge -deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) -{ - struct deadline_data *dd = q->elevator->elevator_data; - struct request *__rq; - - /* - * check for front merge - */ - if (dd->front_merges) { - sector_t sector = bio_end_sector(bio); - - __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); - if (__rq) { - BUG_ON(sector != blk_rq_pos(__rq)); - - if (elv_bio_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; - } - } - } - - return ELEVATOR_NO_MERGE; -} - -static void deadline_merged_request(struct request_queue *q, - struct request *req, enum elv_merge type) -{ - struct deadline_data *dd = q->elevator->elevator_data; - - /* - * if the merge was a front merge, we need to reposition request - */ - if (type == ELEVATOR_FRONT_MERGE) { - elv_rb_del(deadline_rb_root(dd, req), req); - deadline_add_rq_rb(dd, req); - } -} - -static void -deadline_merged_requests(struct request_queue *q, struct request *req, - struct request *next) -{ - /* - * if next expires before rq, assign its expire time to rq - * and move into next position (next will be deleted) in fifo - */ - if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { - if (time_before((unsigned long)next->fifo_time, - (unsigned long)req->fifo_time)) { - list_move(&req->queuelist, &next->queuelist); - req->fifo_time = next->fifo_time; - } - } - - /* - * kill knowledge of next, this one is a goner - */ - deadline_remove_request(q, next); -} - -/* - * move request from sort list to dispatch queue. - */ -static inline void -deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq) -{ - struct request_queue *q = rq->q; - - /* - * For a zoned block device, write requests must write lock their - * target zone. - */ - blk_req_zone_write_lock(rq); - - deadline_remove_request(q, rq); - elv_dispatch_add_tail(q, rq); -} - -/* - * move an entry to dispatch queue - */ -static void -deadline_move_request(struct deadline_data *dd, struct request *rq) -{ - const int data_dir = rq_data_dir(rq); - - dd->next_rq[READ] = NULL; - dd->next_rq[WRITE] = NULL; - dd->next_rq[data_dir] = deadline_latter_request(rq); - - /* - * take it off the sort and fifo list, move - * to dispatch queue - */ - deadline_move_to_dispatch(dd, rq); -} - -/* - * deadline_check_fifo returns 0 if there are no expired requests on the fifo, - * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) - */ -static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) -{ - struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); - - /* - * rq is expired! - */ - if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) - return 1; - - return 0; -} - -/* - * For the specified data direction, return the next request to dispatch using - * arrival ordered lists. - */ -static struct request * -deadline_fifo_request(struct deadline_data *dd, int data_dir) -{ - struct request *rq; - - if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) - return NULL; - - if (list_empty(&dd->fifo_list[data_dir])) - return NULL; - - rq = rq_entry_fifo(dd->fifo_list[data_dir].next); - if (data_dir == READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. - */ - list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { - if (blk_req_can_dispatch_to_zone(rq)) - return rq; - } - - return NULL; -} - -/* - * For the specified data direction, return the next request to dispatch using - * sector position sorted lists. - */ -static struct request * -deadline_next_request(struct deadline_data *dd, int data_dir) -{ - struct request *rq; - - if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) - return NULL; - - rq = dd->next_rq[data_dir]; - if (!rq) - return NULL; - - if (data_dir == READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. - */ - while (rq) { - if (blk_req_can_dispatch_to_zone(rq)) - return rq; - rq = deadline_latter_request(rq); - } - - return NULL; -} - -/* - * deadline_dispatch_requests selects the best request according to - * read/write expire, fifo_batch, etc - */ -static int deadline_dispatch_requests(struct request_queue *q, int force) -{ - struct deadline_data *dd = q->elevator->elevator_data; - const int reads = !list_empty(&dd->fifo_list[READ]); - const int writes = !list_empty(&dd->fifo_list[WRITE]); - struct request *rq, *next_rq; - int data_dir; - - /* - * batches are currently reads XOR writes - */ - rq = deadline_next_request(dd, WRITE); - if (!rq) - rq = deadline_next_request(dd, READ); - - if (rq && dd->batching < dd->fifo_batch) - /* we have a next request are still entitled to batch */ - goto dispatch_request; - - /* - * at this point we are not running a batch. select the appropriate - * data direction (read / write) - */ - - if (reads) { - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); - - if (deadline_fifo_request(dd, WRITE) && - (dd->starved++ >= dd->writes_starved)) - goto dispatch_writes; - - data_dir = READ; - - goto dispatch_find_request; - } - - /* - * there are either no reads or writes have been starved - */ - - if (writes) { -dispatch_writes: - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); - - dd->starved = 0; - - data_dir = WRITE; - - goto dispatch_find_request; - } - - return 0; - -dispatch_find_request: - /* - * we are not running a batch, find best request for selected data_dir - */ - next_rq = deadline_next_request(dd, data_dir); - if (deadline_check_fifo(dd, data_dir) || !next_rq) { - /* - * A deadline has expired, the last request was in the other - * direction, or we have run out of higher-sectored requests. - * Start again from the request with the earliest expiry time. - */ - rq = deadline_fifo_request(dd, data_dir); - } else { - /* - * The last req was the same dir and we have a next request in - * sort order. No expired requests so continue on from here. - */ - rq = next_rq; - } - - /* - * For a zoned block device, if we only have writes queued and none of - * them can be dispatched, rq will be NULL. - */ - if (!rq) - return 0; - - dd->batching = 0; - -dispatch_request: - /* - * rq is the selected appropriate request. - */ - dd->batching++; - deadline_move_request(dd, rq); - - return 1; -} - -/* - * For zoned block devices, write unlock the target zone of completed - * write requests. - */ -static void -deadline_completed_request(struct request_queue *q, struct request *rq) -{ - blk_req_zone_write_unlock(rq); -} - -static void deadline_exit_queue(struct elevator_queue *e) -{ - struct deadline_data *dd = e->elevator_data; - - BUG_ON(!list_empty(&dd->fifo_list[READ])); - BUG_ON(!list_empty(&dd->fifo_list[WRITE])); - - kfree(dd); -} - -/* - * initialize elevator private data (deadline_data). - */ -static int deadline_init_queue(struct request_queue *q, struct elevator_type *e) -{ - struct deadline_data *dd; - struct elevator_queue *eq; - - eq = elevator_alloc(q, e); - if (!eq) - return -ENOMEM; - - dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); - if (!dd) { - kobject_put(&eq->kobj); - return -ENOMEM; - } - eq->elevator_data = dd; - - INIT_LIST_HEAD(&dd->fifo_list[READ]); - INIT_LIST_HEAD(&dd->fifo_list[WRITE]); - dd->sort_list[READ] = RB_ROOT; - dd->sort_list[WRITE] = RB_ROOT; - dd->fifo_expire[READ] = read_expire; - dd->fifo_expire[WRITE] = write_expire; - dd->writes_starved = writes_starved; - dd->front_merges = 1; - dd->fifo_batch = fifo_batch; - - spin_lock_irq(q->queue_lock); - q->elevator = eq; - spin_unlock_irq(q->queue_lock); - return 0; -} - -/* - * sysfs parts below - */ - -static ssize_t -deadline_var_show(int var, char *page) -{ - return sprintf(page, "%d\n", var); -} - -static void -deadline_var_store(int *var, const char *page) -{ - char *p = (char *) page; - - *var = simple_strtol(p, &p, 10); -} - -#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct deadline_data *dd = e->elevator_data; \ - int __data = __VAR; \ - if (__CONV) \ - __data = jiffies_to_msecs(__data); \ - return deadline_var_show(__data, (page)); \ -} -SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1); -SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1); -SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); -SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); -SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); -#undef SHOW_FUNCTION - -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ -{ \ - struct deadline_data *dd = e->elevator_data; \ - int __data; \ - deadline_var_store(&__data, (page)); \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ - if (__CONV) \ - *(__PTR) = msecs_to_jiffies(__data); \ - else \ - *(__PTR) = __data; \ - return count; \ -} -STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); -STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); -STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); -#undef STORE_FUNCTION - -#define DD_ATTR(name) \ - __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) - -static struct elv_fs_entry deadline_attrs[] = { - DD_ATTR(read_expire), - DD_ATTR(write_expire), - DD_ATTR(writes_starved), - DD_ATTR(front_merges), - DD_ATTR(fifo_batch), - __ATTR_NULL -}; - -static struct elevator_type iosched_deadline = { - .ops.sq = { - .elevator_merge_fn = deadline_merge, - .elevator_merged_fn = deadline_merged_request, - .elevator_merge_req_fn = deadline_merged_requests, - .elevator_dispatch_fn = deadline_dispatch_requests, - .elevator_completed_req_fn = deadline_completed_request, - .elevator_add_req_fn = deadline_add_request, - .elevator_former_req_fn = elv_rb_former_request, - .elevator_latter_req_fn = elv_rb_latter_request, - .elevator_init_fn = deadline_init_queue, - .elevator_exit_fn = deadline_exit_queue, - }, - - .elevator_attrs = deadline_attrs, - .elevator_name = "deadline", - .elevator_owner = THIS_MODULE, -}; - -static int __init deadline_init(void) -{ - return elv_register(&iosched_deadline); -} - -static void __exit deadline_exit(void) -{ - elv_unregister(&iosched_deadline); -} - -module_init(deadline_init); -module_exit(deadline_exit); - -MODULE_AUTHOR("Jens Axboe"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("deadline IO scheduler"); diff --git a/block/elevator.c b/block/elevator.c index 8fdcd64ae12e..f05e90d4e695 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -61,10 +61,8 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; - if (e->uses_mq && e->type->ops.mq.allow_merge) - return e->type->ops.mq.allow_merge(q, rq, bio); - else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn) - return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio); + if (e->type->ops.allow_merge) + return e->type->ops.allow_merge(q, rq, bio); return 1; } @@ -95,14 +93,14 @@ static bool elevator_match(const struct elevator_type *e, const char *name) } /* - * Return scheduler with name 'name' and with matching 'mq capability + * Return scheduler with name 'name' */ -static struct elevator_type *elevator_find(const char *name, bool mq) +static struct elevator_type *elevator_find(const char *name) { struct elevator_type *e; list_for_each_entry(e, &elv_list, list) { - if (elevator_match(e, name) && (mq == e->uses_mq)) + if (elevator_match(e, name)) return e; } @@ -121,12 +119,12 @@ static struct elevator_type *elevator_get(struct request_queue *q, spin_lock(&elv_list_lock); - e = elevator_find(name, q->mq_ops != NULL); + e = elevator_find(name); if (!e && try_loading) { spin_unlock(&elv_list_lock); request_module("%s-iosched", name); spin_lock(&elv_list_lock); - e = elevator_find(name, q->mq_ops != NULL); + e = elevator_find(name); } if (e && !try_module_get(e->elevator_owner)) @@ -150,26 +148,6 @@ static int __init elevator_setup(char *str) __setup("elevator=", elevator_setup); -/* called during boot to load the elevator chosen by the elevator param */ -void __init load_default_elevator_module(void) -{ - struct elevator_type *e; - - if (!chosen_elevator[0]) - return; - - /* - * Boot parameter is deprecated, we haven't supported that for MQ. - * Only look for non-mq schedulers from here. - */ - spin_lock(&elv_list_lock); - e = elevator_find(chosen_elevator, false); - spin_unlock(&elv_list_lock); - - if (!e) - request_module("%s-iosched", chosen_elevator); -} - static struct kobj_type elv_ktype; struct elevator_queue *elevator_alloc(struct request_queue *q, @@ -185,7 +163,6 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); hash_init(eq->hash); - eq->uses_mq = e->uses_mq; return eq; } @@ -200,54 +177,11 @@ static void elevator_release(struct kobject *kobj) kfree(e); } -/* - * Use the default elevator specified by config boot param for non-mq devices, - * or by config option. Don't try to load modules as we could be running off - * async and request_module() isn't allowed from async. - */ -int elevator_init(struct request_queue *q) -{ - struct elevator_type *e = NULL; - int err = 0; - - /* - * q->sysfs_lock must be held to provide mutual exclusion between - * elevator_switch() and here. - */ - mutex_lock(&q->sysfs_lock); - if (unlikely(q->elevator)) - goto out_unlock; - - if (*chosen_elevator) { - e = elevator_get(q, chosen_elevator, false); - if (!e) - printk(KERN_ERR "I/O scheduler %s not found\n", - chosen_elevator); - } - - if (!e) - e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false); - if (!e) { - printk(KERN_ERR - "Default I/O scheduler not found. Using noop.\n"); - e = elevator_get(q, "noop", false); - } - - err = e->ops.sq.elevator_init_fn(q, e); - if (err) - elevator_put(e); -out_unlock: - mutex_unlock(&q->sysfs_lock); - return err; -} - void elevator_exit(struct request_queue *q, struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); - if (e->uses_mq && e->type->ops.mq.exit_sched) + if (e->type->ops.exit_sched) blk_mq_exit_sched(q, e); - else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn) - e->type->ops.sq.elevator_exit_fn(e); mutex_unlock(&e->sysfs_lock); kobject_put(&e->kobj); @@ -356,68 +290,6 @@ struct request *elv_rb_find(struct rb_root *root, sector_t sector) } EXPORT_SYMBOL(elv_rb_find); -/* - * Insert rq into dispatch queue of q. Queue lock must be held on - * entry. rq is sort instead into the dispatch queue. To be used by - * specific elevators. - */ -void elv_dispatch_sort(struct request_queue *q, struct request *rq) -{ - sector_t boundary; - struct list_head *entry; - - if (q->last_merge == rq) - q->last_merge = NULL; - - elv_rqhash_del(q, rq); - - q->nr_sorted--; - - boundary = q->end_sector; - list_for_each_prev(entry, &q->queue_head) { - struct request *pos = list_entry_rq(entry); - - if (req_op(rq) != req_op(pos)) - break; - if (rq_data_dir(rq) != rq_data_dir(pos)) - break; - if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER)) - break; - if (blk_rq_pos(rq) >= boundary) { - if (blk_rq_pos(pos) < boundary) - continue; - } else { - if (blk_rq_pos(pos) >= boundary) - break; - } - if (blk_rq_pos(rq) >= blk_rq_pos(pos)) - break; - } - - list_add(&rq->queuelist, entry); -} -EXPORT_SYMBOL(elv_dispatch_sort); - -/* - * Insert rq into dispatch queue of q. Queue lock must be held on - * entry. rq is added to the back of the dispatch queue. To be used by - * specific elevators. - */ -void elv_dispatch_add_tail(struct request_queue *q, struct request *rq) -{ - if (q->last_merge == rq) - q->last_merge = NULL; - - elv_rqhash_del(q, rq); - - q->nr_sorted--; - - q->end_sector = rq_end_sector(rq); - q->boundary_rq = rq; - list_add_tail(&rq->queuelist, &q->queue_head); -} -EXPORT_SYMBOL(elv_dispatch_add_tail); - enum elv_merge elv_merge(struct request_queue *q, struct request **req, struct bio *bio) { @@ -457,10 +329,8 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, return ELEVATOR_BACK_MERGE; } - if (e->uses_mq && e->type->ops.mq.request_merge) - return e->type->ops.mq.request_merge(q, req, bio); - else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn) - return e->type->ops.sq.elevator_merge_fn(q, req, bio); + if (e->type->ops.request_merge) + return e->type->ops.request_merge(q, req, bio); return ELEVATOR_NO_MERGE; } @@ -511,10 +381,8 @@ void elv_merged_request(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; - if (e->uses_mq && e->type->ops.mq.request_merged) - e->type->ops.mq.request_merged(q, rq, type); - else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn) - e->type->ops.sq.elevator_merged_fn(q, rq, type); + if (e->type->ops.request_merged) + e->type->ops.request_merged(q, rq, type); if (type == ELEVATOR_BACK_MERGE) elv_rqhash_reposition(q, rq); @@ -526,176 +394,20 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, struct request *next) { struct elevator_queue *e = q->elevator; - bool next_sorted = false; - if (e->uses_mq && e->type->ops.mq.requests_merged) - e->type->ops.mq.requests_merged(q, rq, next); - else if (e->type->ops.sq.elevator_merge_req_fn) { - next_sorted = (__force bool)(next->rq_flags & RQF_SORTED); - if (next_sorted) - e->type->ops.sq.elevator_merge_req_fn(q, rq, next); - } + if (e->type->ops.requests_merged) + e->type->ops.requests_merged(q, rq, next); elv_rqhash_reposition(q, rq); - - if (next_sorted) { - elv_rqhash_del(q, next); - q->nr_sorted--; - } - q->last_merge = rq; } -void elv_bio_merged(struct request_queue *q, struct request *rq, - struct bio *bio) -{ - struct elevator_queue *e = q->elevator; - - if (WARN_ON_ONCE(e->uses_mq)) - return; - - if (e->type->ops.sq.elevator_bio_merged_fn) - e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio); -} - -void elv_requeue_request(struct request_queue *q, struct request *rq) -{ - /* - * it already went through dequeue, we need to decrement the - * in_flight count again - */ - if (blk_account_rq(rq)) { - q->in_flight[rq_is_sync(rq)]--; - if (rq->rq_flags & RQF_SORTED) - elv_deactivate_rq(q, rq); - } - - rq->rq_flags &= ~RQF_STARTED; - - blk_pm_requeue_request(rq); - - __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE); -} - -void elv_drain_elevator(struct request_queue *q) -{ - struct elevator_queue *e = q->elevator; - static int printed; - - if (WARN_ON_ONCE(e->uses_mq)) - return; - - lockdep_assert_held(q->queue_lock); - - while (e->type->ops.sq.elevator_dispatch_fn(q, 1)) - ; - if (q->nr_sorted && !blk_queue_is_zoned(q) && printed++ < 10 ) { - printk(KERN_ERR "%s: forced dispatching is broken " - "(nr_sorted=%u), please report this\n", - q->elevator->type->elevator_name, q->nr_sorted); - } -} - -void __elv_add_request(struct request_queue *q, struct request *rq, int where) -{ - trace_block_rq_insert(q, rq); - - blk_pm_add_request(q, rq); - - rq->q = q; - - if (rq->rq_flags & RQF_SOFTBARRIER) { - /* barriers are scheduling boundary, update end_sector */ - if (!blk_rq_is_passthrough(rq)) { - q->end_sector = rq_end_sector(rq); - q->boundary_rq = rq; - } - } else if (!(rq->rq_flags & RQF_ELVPRIV) && - (where == ELEVATOR_INSERT_SORT || - where == ELEVATOR_INSERT_SORT_MERGE)) - where = ELEVATOR_INSERT_BACK; - - switch (where) { - case ELEVATOR_INSERT_REQUEUE: - case ELEVATOR_INSERT_FRONT: - rq->rq_flags |= RQF_SOFTBARRIER; - list_add(&rq->queuelist, &q->queue_head); - break; - - case ELEVATOR_INSERT_BACK: - rq->rq_flags |= RQF_SOFTBARRIER; - elv_drain_elevator(q); - list_add_tail(&rq->queuelist, &q->queue_head); - /* - * We kick the queue here for the following reasons. - * - The elevator might have returned NULL previously - * to delay requests and returned them now. As the - * queue wasn't empty before this request, ll_rw_blk - * won't run the queue on return, resulting in hang. - * - Usually, back inserted requests won't be merged - * with anything. There's no point in delaying queue - * processing. - */ - __blk_run_queue(q); - break; - - case ELEVATOR_INSERT_SORT_MERGE: - /* - * If we succeed in merging this request with one in the - * queue already, we are done - rq has now been freed, - * so no need to do anything further. - */ - if (elv_attempt_insert_merge(q, rq)) - break; - /* fall through */ - case ELEVATOR_INSERT_SORT: - BUG_ON(blk_rq_is_passthrough(rq)); - rq->rq_flags |= RQF_SORTED; - q->nr_sorted++; - if (rq_mergeable(rq)) { - elv_rqhash_add(q, rq); - if (!q->last_merge) - q->last_merge = rq; - } - - /* - * Some ioscheds (cfq) run q->request_fn directly, so - * rq cannot be accessed after calling - * elevator_add_req_fn. - */ - q->elevator->type->ops.sq.elevator_add_req_fn(q, rq); - break; - - case ELEVATOR_INSERT_FLUSH: - rq->rq_flags |= RQF_SOFTBARRIER; - blk_insert_flush(rq); - break; - default: - printk(KERN_ERR "%s: bad insertion point %d\n", - __func__, where); - BUG(); - } -} -EXPORT_SYMBOL(__elv_add_request); - -void elv_add_request(struct request_queue *q, struct request *rq, int where) -{ - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - __elv_add_request(q, rq, where); - spin_unlock_irqrestore(q->queue_lock, flags); -} -EXPORT_SYMBOL(elv_add_request); - struct request *elv_latter_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->uses_mq && e->type->ops.mq.next_request) - return e->type->ops.mq.next_request(q, rq); - else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn) - return e->type->ops.sq.elevator_latter_req_fn(q, rq); + if (e->type->ops.next_request) + return e->type->ops.next_request(q, rq); return NULL; } @@ -704,68 +416,12 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->uses_mq && e->type->ops.mq.former_request) - return e->type->ops.mq.former_request(q, rq); - if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn) - return e->type->ops.sq.elevator_former_req_fn(q, rq); + if (e->type->ops.former_request) + return e->type->ops.former_request(q, rq); + return NULL; } -int elv_set_request(struct request_queue *q, struct request *rq, - struct bio *bio, gfp_t gfp_mask) -{ - struct elevator_queue *e = q->elevator; - - if (WARN_ON_ONCE(e->uses_mq)) - return 0; - - if (e->type->ops.sq.elevator_set_req_fn) - return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask); - return 0; -} - -void elv_put_request(struct request_queue *q, struct request *rq) -{ - struct elevator_queue *e = q->elevator; - - if (WARN_ON_ONCE(e->uses_mq)) - return; - - if (e->type->ops.sq.elevator_put_req_fn) - e->type->ops.sq.elevator_put_req_fn(rq); -} - -int elv_may_queue(struct request_queue *q, unsigned int op) -{ - struct elevator_queue *e = q->elevator; - - if (WARN_ON_ONCE(e->uses_mq)) - return 0; - - if (e->type->ops.sq.elevator_may_queue_fn) - return e->type->ops.sq.elevator_may_queue_fn(q, op); - - return ELV_MQUEUE_MAY; -} - -void elv_completed_request(struct request_queue *q, struct request *rq) -{ - struct elevator_queue *e = q->elevator; - - if (WARN_ON_ONCE(e->uses_mq)) - return; - - /* - * request is released from the driver, io must be done - */ - if (blk_account_rq(rq)) { - q->in_flight[rq_is_sync(rq)]--; - if ((rq->rq_flags & RQF_SORTED) && - e->type->ops.sq.elevator_completed_req_fn) - e->type->ops.sq.elevator_completed_req_fn(q, rq); - } -} - #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) static ssize_t @@ -832,8 +488,6 @@ int elv_register_queue(struct request_queue *q) } kobject_uevent(&e->kobj, KOBJ_ADD); e->registered = 1; - if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn) - e->type->ops.sq.elevator_registered_fn(q); } return error; } @@ -873,7 +527,7 @@ int elv_register(struct elevator_type *e) /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); - if (elevator_find(e->elevator_name, e->uses_mq)) { + if (elevator_find(e->elevator_name)) { spin_unlock(&elv_list_lock); kmem_cache_destroy(e->icq_cache); return -EBUSY; @@ -881,12 +535,6 @@ int elv_register(struct elevator_type *e) list_add_tail(&e->list, &elv_list); spin_unlock(&elv_list_lock); - /* print pretty message */ - if (elevator_match(e, chosen_elevator) || - (!*chosen_elevator && - elevator_match(e, CONFIG_DEFAULT_IOSCHED))) - def = " (default)"; - printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, def); return 0; @@ -989,71 +637,17 @@ out_unlock: */ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { - struct elevator_queue *old = q->elevator; - bool old_registered = false; int err; lockdep_assert_held(&q->sysfs_lock); - if (q->mq_ops) { - blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); - err = elevator_switch_mq(q, new_e); + err = elevator_switch_mq(q, new_e); - blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); - - return err; - } - - /* - * Turn on BYPASS and drain all requests w/ elevator private data. - * Block layer doesn't call into a quiesced elevator - all requests - * are directly put on the dispatch list without elevator data - * using INSERT_BACK. All requests have SOFTBARRIER set and no - * merge happens either. - */ - if (old) { - old_registered = old->registered; - - blk_queue_bypass_start(q); - - /* unregister and clear all auxiliary data of the old elevator */ - if (old_registered) - elv_unregister_queue(q); - - ioc_clear_queue(q); - } - - /* allocate, init and register new elevator */ - err = new_e->ops.sq.elevator_init_fn(q, new_e); - if (err) - goto fail_init; - - err = elv_register_queue(q); - if (err) - goto fail_register; - - /* done, kill the old one and finish */ - if (old) { - elevator_exit(q, old); - blk_queue_bypass_end(q); - } - - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); - - return 0; - -fail_register: - elevator_exit(q, q->elevator); -fail_init: - /* switch failed, restore and re-register old elevator */ - if (old) { - q->elevator = old; - elv_register_queue(q); - blk_queue_bypass_end(q); - } + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); return err; } @@ -1073,7 +667,7 @@ static int __elevator_change(struct request_queue *q, const char *name) /* * Special case for mq, turn off scheduling */ - if (q->mq_ops && !strncmp(name, "none", 4)) + if (!strncmp(name, "none", 4)) return elevator_switch(q, NULL); strlcpy(elevator_name, name, sizeof(elevator_name)); @@ -1091,8 +685,7 @@ static int __elevator_change(struct request_queue *q, const char *name) static inline bool elv_support_iosched(struct request_queue *q) { - if (q->mq_ops && q->tag_set && (q->tag_set->flags & - BLK_MQ_F_NO_SCHED)) + if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)) return false; return true; } @@ -1102,7 +695,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, { int ret; - if (!(q->mq_ops || q->request_fn) || !elv_support_iosched(q)) + if (!queue_is_mq(q) || !elv_support_iosched(q)) return count; ret = __elevator_change(q, name); @@ -1117,10 +710,9 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_queue *e = q->elevator; struct elevator_type *elv = NULL; struct elevator_type *__e; - bool uses_mq = q->mq_ops != NULL; int len = 0; - if (!queue_is_rq_based(q)) + if (!queue_is_mq(q)) return sprintf(name, "none\n"); if (!q->elevator) @@ -1130,19 +722,16 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) spin_lock(&elv_list_lock); list_for_each_entry(__e, &elv_list, list) { - if (elv && elevator_match(elv, __e->elevator_name) && - (__e->uses_mq == uses_mq)) { + if (elv && elevator_match(elv, __e->elevator_name)) { len += sprintf(name+len, "[%s] ", elv->elevator_name); continue; } - if (__e->uses_mq && q->mq_ops && elv_support_iosched(q)) - len += sprintf(name+len, "%s ", __e->elevator_name); - else if (!__e->uses_mq && !q->mq_ops) + if (elv_support_iosched(q)) len += sprintf(name+len, "%s ", __e->elevator_name); } spin_unlock(&elv_list_lock); - if (q->mq_ops && q->elevator) + if (q->elevator) len += sprintf(name+len, "none"); len += sprintf(len+name, "\n"); diff --git a/block/genhd.c b/block/genhd.c index cff6bdf27226..1dd8fd6613b8 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -47,51 +47,64 @@ static void disk_release_events(struct gendisk *disk); void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { - if (q->mq_ops) + if (queue_is_mq(q)) return; - atomic_inc(&part->in_flight[rw]); + part_stat_local_inc(part, in_flight[rw]); if (part->partno) - atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); + part_stat_local_inc(&part_to_disk(part)->part0, in_flight[rw]); } void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { - if (q->mq_ops) + if (queue_is_mq(q)) return; - atomic_dec(&part->in_flight[rw]); + part_stat_local_dec(part, in_flight[rw]); if (part->partno) - atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); + part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]); } -void part_in_flight(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part) { - if (q->mq_ops) { - blk_mq_in_flight(q, part, inflight); - return; + int cpu; + unsigned int inflight; + + if (queue_is_mq(q)) { + return blk_mq_in_flight(q, part); } - inflight[0] = atomic_read(&part->in_flight[0]) + - atomic_read(&part->in_flight[1]); - if (part->partno) { - part = &part_to_disk(part)->part0; - inflight[1] = atomic_read(&part->in_flight[0]) + - atomic_read(&part->in_flight[1]); + inflight = 0; + for_each_possible_cpu(cpu) { + inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) + + part_stat_local_read_cpu(part, in_flight[1], cpu); } + if ((int)inflight < 0) + inflight = 0; + + return inflight; } void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) { - if (q->mq_ops) { + int cpu; + + if (queue_is_mq(q)) { blk_mq_in_flight_rw(q, part, inflight); return; } - inflight[0] = atomic_read(&part->in_flight[0]); - inflight[1] = atomic_read(&part->in_flight[1]); + inflight[0] = 0; + inflight[1] = 0; + for_each_possible_cpu(cpu) { + inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu); + inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu); + } + if ((int)inflight[0] < 0) + inflight[0] = 0; + if ((int)inflight[1] < 0) + inflight[1] = 0; } struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) @@ -1325,8 +1338,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) struct disk_part_iter piter; struct hd_struct *hd; char buf[BDEVNAME_SIZE]; - unsigned int inflight[2]; - int cpu; + unsigned int inflight; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) @@ -1338,10 +1350,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { - cpu = part_stat_lock(); - part_round_stats(gp->queue, cpu, hd); - part_stat_unlock(); - part_in_flight(gp->queue, hd, inflight); + inflight = part_in_flight(gp->queue, hd); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " "%lu %lu %lu %u " @@ -1357,7 +1366,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, merges[STAT_WRITE]), part_stat_read(hd, sectors[STAT_WRITE]), (unsigned int)part_stat_read_msecs(hd, STAT_WRITE), - inflight[0], + inflight, jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)), part_stat_read(hd, ios[STAT_DISCARD]), diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index eccac01a10b6..ec6a04e01bc1 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -195,7 +195,7 @@ struct kyber_hctx_data { unsigned int batching; struct kyber_ctx_queue *kcqs; struct sbitmap kcq_map[KYBER_NUM_DOMAINS]; - wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS]; + struct sbq_wait domain_wait[KYBER_NUM_DOMAINS]; struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS]; atomic_t wait_index[KYBER_NUM_DOMAINS]; }; @@ -501,10 +501,11 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) for (i = 0; i < KYBER_NUM_DOMAINS; i++) { INIT_LIST_HEAD(&khd->rqs[i]); - init_waitqueue_func_entry(&khd->domain_wait[i], + khd->domain_wait[i].sbq = NULL; + init_waitqueue_func_entry(&khd->domain_wait[i].wait, kyber_domain_wake); - khd->domain_wait[i].private = hctx; - INIT_LIST_HEAD(&khd->domain_wait[i].entry); + khd->domain_wait[i].wait.private = hctx; + INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry); atomic_set(&khd->wait_index[i], 0); } @@ -576,7 +577,7 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) { struct kyber_hctx_data *khd = hctx->sched_data; struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); - struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw]; + struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]]; unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); struct list_head *rq_list = &kcq->rq_list[sched_domain]; bool merged; @@ -602,7 +603,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, list_for_each_entry_safe(rq, next, rq_list, queuelist) { unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); - struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw]; + struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]]; struct list_head *head = &kcq->rq_list[sched_domain]; spin_lock(&kcq->lock); @@ -611,7 +612,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, else list_move_tail(&rq->queuelist, head); sbitmap_set_bit(&khd->kcq_map[sched_domain], - rq->mq_ctx->index_hw); + rq->mq_ctx->index_hw[hctx->type]); blk_mq_sched_request_inserted(rq); spin_unlock(&kcq->lock); } @@ -698,12 +699,13 @@ static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd, flush_busy_kcq, &data); } -static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, +static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags, void *key) { - struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private); + struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private); + struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait); - list_del_init(&wait->entry); + sbitmap_del_wait_queue(wait); blk_mq_run_hw_queue(hctx, true); return 1; } @@ -714,7 +716,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, { unsigned int sched_domain = khd->cur_domain; struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; - wait_queue_entry_t *wait = &khd->domain_wait[sched_domain]; + struct sbq_wait *wait = &khd->domain_wait[sched_domain]; struct sbq_wait_state *ws; int nr; @@ -725,11 +727,11 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, * run when one becomes available. Note that this is serialized on * khd->lock, but we still need to be careful about the waker. */ - if (nr < 0 && list_empty_careful(&wait->entry)) { + if (nr < 0 && list_empty_careful(&wait->wait.entry)) { ws = sbq_wait_ptr(domain_tokens, &khd->wait_index[sched_domain]); khd->domain_ws[sched_domain] = ws; - add_wait_queue(&ws->wait, wait); + sbitmap_add_wait_queue(domain_tokens, ws, wait); /* * Try again in case a token was freed before we got on the wait @@ -745,10 +747,10 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, * between the !list_empty_careful() check and us grabbing the lock, but * list_del_init() is okay with that. */ - if (nr >= 0 && !list_empty_careful(&wait->entry)) { + if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) { ws = khd->domain_ws[sched_domain]; spin_lock_irq(&ws->wait.lock); - list_del_init(&wait->entry); + sbitmap_del_wait_queue(wait); spin_unlock_irq(&ws->wait.lock); } @@ -951,7 +953,7 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \ { \ struct blk_mq_hw_ctx *hctx = data; \ struct kyber_hctx_data *khd = hctx->sched_data; \ - wait_queue_entry_t *wait = &khd->domain_wait[domain]; \ + wait_queue_entry_t *wait = &khd->domain_wait[domain].wait; \ \ seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \ return 0; \ @@ -1017,7 +1019,7 @@ static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = { #endif static struct elevator_type kyber_sched = { - .ops.mq = { + .ops = { .init_sched = kyber_init_sched, .exit_sched = kyber_exit_sched, .init_hctx = kyber_init_hctx, @@ -1032,7 +1034,6 @@ static struct elevator_type kyber_sched = { .dispatch_request = kyber_dispatch_request, .has_work = kyber_has_work, }, - .uses_mq = true, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = kyber_queue_debugfs_attrs, .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs, diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 099a9e05854c..14288f864e94 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -373,9 +373,16 @@ done: /* * One confusing aspect here is that we get called for a specific - * hardware queue, but we return a request that may not be for a + * hardware queue, but we may return a request that is for a * different hardware queue. This is because mq-deadline has shared * state for all hardware queues, in terms of sorting, FIFOs, etc. + * + * For a zoned block device, __dd_dispatch_request() may return NULL + * if all the queued write requests are directed at zones that are already + * locked due to on-going write requests. In this case, make sure to mark + * the queue as needing a restart to ensure that the queue is run again + * and the pending writes dispatched once the target zones for the ongoing + * write requests are unlocked in dd_finish_request(). */ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { @@ -384,6 +391,9 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) spin_lock(&dd->lock); rq = __dd_dispatch_request(dd); + if (!rq && blk_queue_is_zoned(hctx->queue) && + !list_empty(&dd->fifo_list[WRITE])) + blk_mq_sched_mark_restart_hctx(hctx); spin_unlock(&dd->lock); return rq; @@ -761,7 +771,7 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { #endif static struct elevator_type mq_deadline = { - .ops.mq = { + .ops = { .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, .prepare_request = dd_prepare_request, @@ -777,7 +787,6 @@ static struct elevator_type mq_deadline = { .exit_sched = dd_exit_queue, }, - .uses_mq = true, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = deadline_queue_debugfs_attrs, #endif diff --git a/block/noop-iosched.c b/block/noop-iosched.c deleted file mode 100644 index 2d1b15d89b45..000000000000 --- a/block/noop-iosched.c +++ /dev/null @@ -1,124 +0,0 @@ -/* - * elevator noop - */ -#include -#include -#include -#include -#include -#include - -struct noop_data { - struct list_head queue; -}; - -static void noop_merged_requests(struct request_queue *q, struct request *rq, - struct request *next) -{ - list_del_init(&next->queuelist); -} - -static int noop_dispatch(struct request_queue *q, int force) -{ - struct noop_data *nd = q->elevator->elevator_data; - struct request *rq; - - rq = list_first_entry_or_null(&nd->queue, struct request, queuelist); - if (rq) { - list_del_init(&rq->queuelist); - elv_dispatch_sort(q, rq); - return 1; - } - return 0; -} - -static void noop_add_request(struct request_queue *q, struct request *rq) -{ - struct noop_data *nd = q->elevator->elevator_data; - - list_add_tail(&rq->queuelist, &nd->queue); -} - -static struct request * -noop_former_request(struct request_queue *q, struct request *rq) -{ - struct noop_data *nd = q->elevator->elevator_data; - - if (rq->queuelist.prev == &nd->queue) - return NULL; - return list_prev_entry(rq, queuelist); -} - -static struct request * -noop_latter_request(struct request_queue *q, struct request *rq) -{ - struct noop_data *nd = q->elevator->elevator_data; - - if (rq->queuelist.next == &nd->queue) - return NULL; - return list_next_entry(rq, queuelist); -} - -static int noop_init_queue(struct request_queue *q, struct elevator_type *e) -{ - struct noop_data *nd; - struct elevator_queue *eq; - - eq = elevator_alloc(q, e); - if (!eq) - return -ENOMEM; - - nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); - if (!nd) { - kobject_put(&eq->kobj); - return -ENOMEM; - } - eq->elevator_data = nd; - - INIT_LIST_HEAD(&nd->queue); - - spin_lock_irq(q->queue_lock); - q->elevator = eq; - spin_unlock_irq(q->queue_lock); - return 0; -} - -static void noop_exit_queue(struct elevator_queue *e) -{ - struct noop_data *nd = e->elevator_data; - - BUG_ON(!list_empty(&nd->queue)); - kfree(nd); -} - -static struct elevator_type elevator_noop = { - .ops.sq = { - .elevator_merge_req_fn = noop_merged_requests, - .elevator_dispatch_fn = noop_dispatch, - .elevator_add_req_fn = noop_add_request, - .elevator_former_req_fn = noop_former_request, - .elevator_latter_req_fn = noop_latter_request, - .elevator_init_fn = noop_init_queue, - .elevator_exit_fn = noop_exit_queue, - }, - .elevator_name = "noop", - .elevator_owner = THIS_MODULE, -}; - -static int __init noop_init(void) -{ - return elv_register(&elevator_noop); -} - -static void __exit noop_exit(void) -{ - elv_unregister(&elevator_noop); -} - -module_init(noop_init); -module_exit(noop_exit); - - -MODULE_AUTHOR("Jens Axboe"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("No-op IO scheduler"); diff --git a/block/partition-generic.c b/block/partition-generic.c index d3d14e81fb12..8e596a8dff32 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -120,13 +120,9 @@ ssize_t part_stat_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); struct request_queue *q = part_to_disk(p)->queue; - unsigned int inflight[2]; - int cpu; + unsigned int inflight; - cpu = part_stat_lock(); - part_round_stats(q, cpu, p); - part_stat_unlock(); - part_in_flight(q, p, inflight); + inflight = part_in_flight(q, p); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -141,7 +137,7 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[STAT_WRITE]), (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), (unsigned int)part_stat_read_msecs(p, STAT_WRITE), - inflight[0], + inflight, jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue)), part_stat_read(p, ios[STAT_DISCARD]), @@ -249,9 +245,10 @@ struct device_type part_type = { .uevent = part_uevent, }; -static void delete_partition_rcu_cb(struct rcu_head *head) +static void delete_partition_work_fn(struct work_struct *work) { - struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); + struct hd_struct *part = container_of(to_rcu_work(work), struct hd_struct, + rcu_work); part->start_sect = 0; part->nr_sects = 0; @@ -262,7 +259,8 @@ static void delete_partition_rcu_cb(struct rcu_head *head) void __delete_partition(struct percpu_ref *ref) { struct hd_struct *part = container_of(ref, struct hd_struct, ref); - call_rcu(&part->rcu_head, delete_partition_rcu_cb); + INIT_RCU_WORK(&part->rcu_work, delete_partition_work_fn); + queue_rcu_work(system_wq, &part->rcu_work); } /* diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 01306c018398..938ed513b070 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -919,8 +919,6 @@ static void ata_eh_set_pending(struct ata_port *ap, int fastdrain) void ata_qc_schedule_eh(struct ata_queued_cmd *qc) { struct ata_port *ap = qc->ap; - struct request_queue *q = qc->scsicmd->device->request_queue; - unsigned long flags; WARN_ON(!ap->ops->error_handler); @@ -932,9 +930,7 @@ void ata_qc_schedule_eh(struct ata_queued_cmd *qc) * Note that ATA_QCFLAG_FAILED is unconditionally set after * this function completes. */ - spin_lock_irqsave(q->queue_lock, flags); blk_abort_request(qc->scsicmd->request); - spin_unlock_irqrestore(q->queue_lock, flags); } /** diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 7ca76ed2e71a..84d0fcebd6af 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -100,6 +100,10 @@ enum { MAX_TAINT = 1000, /* cap on aoetgt taint */ }; +struct aoe_req { + unsigned long nr_bios; +}; + struct buf { ulong nframesout; struct bio *bio; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index ed26b7287256..e2c6aae2d636 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -387,6 +387,7 @@ aoeblk_gdalloc(void *vp) set = &d->tag_set; set->ops = &aoeblk_mq_ops; + set->cmd_size = sizeof(struct aoe_req); set->nr_hw_queues = 1; set->queue_depth = 128; set->numa_node = NUMA_NO_NODE; diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index bb2fba651bd2..3cf9bc5d8d95 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -822,17 +822,6 @@ out: spin_unlock_irqrestore(&d->lock, flags); } -static unsigned long -rqbiocnt(struct request *r) -{ - struct bio *bio; - unsigned long n = 0; - - __rq_for_each_bio(bio, r) - n++; - return n; -} - static void bufinit(struct buf *buf, struct request *rq, struct bio *bio) { @@ -847,6 +836,7 @@ nextbuf(struct aoedev *d) { struct request *rq; struct request_queue *q; + struct aoe_req *req; struct buf *buf; struct bio *bio; @@ -865,7 +855,11 @@ nextbuf(struct aoedev *d) blk_mq_start_request(rq); d->ip.rq = rq; d->ip.nxbio = rq->bio; - rq->special = (void *) rqbiocnt(rq); + + req = blk_mq_rq_to_pdu(rq); + req->nr_bios = 0; + __rq_for_each_bio(bio, rq) + req->nr_bios++; } buf = mempool_alloc(d->bufpool, GFP_ATOMIC); if (buf == NULL) { @@ -1069,16 +1063,13 @@ aoe_end_request(struct aoedev *d, struct request *rq, int fastfail) static void aoe_end_buf(struct aoedev *d, struct buf *buf) { - struct request *rq; - unsigned long n; + struct request *rq = buf->rq; + struct aoe_req *req = blk_mq_rq_to_pdu(rq); if (buf == d->ip.buf) d->ip.buf = NULL; - rq = buf->rq; mempool_free(buf, d->bufpool); - n = (unsigned long) rq->special; - rq->special = (void *) --n; - if (n == 0) + if (--req->nr_bios == 0) aoe_end_request(d, rq, 0); } diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 9063f8efbd3b..5b49f1b33ebe 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -160,21 +160,22 @@ static void aoe_failip(struct aoedev *d) { struct request *rq; + struct aoe_req *req; struct bio *bio; - unsigned long n; aoe_failbuf(d, d->ip.buf); - rq = d->ip.rq; if (rq == NULL) return; + + req = blk_mq_rq_to_pdu(rq); while ((bio = d->ip.nxbio)) { bio->bi_status = BLK_STS_IOERR; d->ip.nxbio = bio->bi_next; - n = (unsigned long) rq->special; - rq->special = (void *) --n; + req->nr_bios--; } - if ((unsigned long) rq->special == 0) + + if (!req->nr_bios) aoe_end_request(d, rq, 0); } diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c index 251482066977..1e4e2971171c 100644 --- a/drivers/block/aoe/aoemain.c +++ b/drivers/block/aoe/aoemain.c @@ -24,7 +24,7 @@ static void discover_timer(struct timer_list *t) aoecmd_cfg(0xffff, 0xff); } -static void +static void __exit aoe_exit(void) { del_timer_sync(&timer); diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index f88b4c26d422..b0dbbdfeb33e 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1471,6 +1471,15 @@ static void setup_req_params( int drive ) ReqTrack, ReqSector, (unsigned long)ReqData )); } +static void ataflop_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + spin_lock_irq(&ataflop_lock); + atari_disable_irq(IRQ_MFP_FDC); + finish_fdc(); + atari_enable_irq(IRQ_MFP_FDC); + spin_unlock_irq(&ataflop_lock); +} + static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -1947,6 +1956,7 @@ static const struct block_device_operations floppy_fops = { static const struct blk_mq_ops ataflop_mq_ops = { .queue_rq = ataflop_queue_rq, + .commit_rqs = ataflop_commit_rqs, }; static struct kobject *floppy_find(dev_t dev, int *part, void *data) @@ -1982,6 +1992,7 @@ static int __init atari_floppy_init (void) &ataflop_mq_ops, 2, BLK_MQ_F_SHOULD_MERGE); if (IS_ERR(unit[i].disk->queue)) { + put_disk(unit[i].disk); ret = PTR_ERR(unit[i].disk->queue); unit[i].disk->queue = NULL; goto err; @@ -2033,18 +2044,13 @@ static int __init atari_floppy_init (void) return 0; err: - do { + while (--i >= 0) { struct gendisk *disk = unit[i].disk; - if (disk) { - if (disk->queue) { - blk_cleanup_queue(disk->queue); - disk->queue = NULL; - } - blk_mq_free_tag_set(&unit[i].tag_set); - put_disk(unit[i].disk); - } - } while (i--); + blk_cleanup_queue(disk->queue); + blk_mq_free_tag_set(&unit[i].tag_set); + put_disk(unit[i].disk); + } unregister_blkdev(FLOPPY_MAJOR, "fd"); return ret; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index fa8204214ac0..f973a2a845c8 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2792,7 +2792,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_init_set_defaults(device); - q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, &resource->req_lock); + q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); if (!q) goto out_no_q; device->rq_queue = q; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index fb23578e9a41..6f2856c6d0f2 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2231,7 +2231,6 @@ static void request_done(int uptodate) { struct request *req = current_req; struct request_queue *q; - unsigned long flags; int block; char msg[sizeof("request done ") + sizeof(int) * 3]; @@ -2254,10 +2253,7 @@ static void request_done(int uptodate) if (block > _floppy->sect) DRS->maxtrack = 1; - /* unlock chained buffers */ - spin_lock_irqsave(q->queue_lock, flags); floppy_end_request(req, 0); - spin_unlock_irqrestore(q->queue_lock, flags); } else { if (rq_data_dir(req) == WRITE) { /* record write error information */ @@ -2269,9 +2265,7 @@ static void request_done(int uptodate) DRWE->last_error_sector = blk_rq_pos(req); DRWE->last_error_generation = DRS->generation; } - spin_lock_irqsave(q->queue_lock, flags); floppy_end_request(req, BLK_STS_IOERR); - spin_unlock_irqrestore(q->queue_lock, flags); } } diff --git a/drivers/block/loop.c b/drivers/block/loop.c index cb0cc8685076..0939f36548c9 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,13 +77,14 @@ #include #include #include +#include #include "loop.h" #include static DEFINE_IDR(loop_index_idr); -static DEFINE_MUTEX(loop_index_mutex); +static DEFINE_MUTEX(loop_ctl_mutex); static int max_part; static int part_shift; @@ -630,18 +631,7 @@ static void loop_reread_partitions(struct loop_device *lo, { int rc; - /* - * bd_mutex has been held already in release path, so don't - * acquire it if this function is called in such case. - * - * If the reread partition isn't from release path, lo_refcnt - * must be at least one and it can only become zero when the - * current holder is released. - */ - if (!atomic_read(&lo->lo_refcnt)) - rc = __blkdev_reread_part(bdev); - else - rc = blkdev_reread_part(bdev); + rc = blkdev_reread_part(bdev); if (rc) pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", __func__, lo->lo_number, lo->lo_file_name, rc); @@ -688,26 +678,30 @@ static int loop_validate_file(struct file *file, struct block_device *bdev) static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, unsigned int arg) { - struct file *file, *old_file; + struct file *file = NULL, *old_file; int error; + bool partscan; + error = mutex_lock_killable(&loop_ctl_mutex); + if (error) + return error; error = -ENXIO; if (lo->lo_state != Lo_bound) - goto out; + goto out_err; /* the loop device has to be read-only */ error = -EINVAL; if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) - goto out; + goto out_err; error = -EBADF; file = fget(arg); if (!file) - goto out; + goto out_err; error = loop_validate_file(file, bdev); if (error) - goto out_putf; + goto out_err; old_file = lo->lo_backing_file; @@ -715,7 +709,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, /* size of the new backing store needs to be the same */ if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) - goto out_putf; + goto out_err; /* and ... switch */ blk_mq_freeze_queue(lo->lo_queue); @@ -726,15 +720,22 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue); - + partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; + mutex_unlock(&loop_ctl_mutex); + /* + * We must drop file reference outside of loop_ctl_mutex as dropping + * the file ref can take bd_mutex which creates circular locking + * dependency. + */ fput(old_file); - if (lo->lo_flags & LO_FLAGS_PARTSCAN) + if (partscan) loop_reread_partitions(lo, bdev); return 0; - out_putf: - fput(file); - out: +out_err: + mutex_unlock(&loop_ctl_mutex); + if (file) + fput(file); return error; } @@ -909,6 +910,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, int lo_flags = 0; int error; loff_t size; + bool partscan; /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); @@ -918,13 +920,17 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, if (!file) goto out; + error = mutex_lock_killable(&loop_ctl_mutex); + if (error) + goto out_putf; + error = -EBUSY; if (lo->lo_state != Lo_unbound) - goto out_putf; + goto out_unlock; error = loop_validate_file(file, bdev); if (error) - goto out_putf; + goto out_unlock; mapping = file->f_mapping; inode = mapping->host; @@ -936,10 +942,10 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, error = -EFBIG; size = get_loop_size(lo, file); if ((loff_t)(sector_t)size != size) - goto out_putf; + goto out_unlock; error = loop_prepare_queue(lo); if (error) - goto out_putf; + goto out_unlock; error = 0; @@ -971,18 +977,22 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->lo_state = Lo_bound; if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; - if (lo->lo_flags & LO_FLAGS_PARTSCAN) - loop_reread_partitions(lo, bdev); + partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; /* Grab the block_device to prevent its destruction after we - * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev). + * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev). */ bdgrab(bdev); + mutex_unlock(&loop_ctl_mutex); + if (partscan) + loop_reread_partitions(lo, bdev); return 0; - out_putf: +out_unlock: + mutex_unlock(&loop_ctl_mutex); +out_putf: fput(file); - out: +out: /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); return error; @@ -1025,39 +1035,31 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer, return err; } -static int loop_clr_fd(struct loop_device *lo) +static int __loop_clr_fd(struct loop_device *lo, bool release) { - struct file *filp = lo->lo_backing_file; + struct file *filp = NULL; gfp_t gfp = lo->old_gfp_mask; struct block_device *bdev = lo->lo_device; + int err = 0; + bool partscan = false; + int lo_number; - if (lo->lo_state != Lo_bound) - return -ENXIO; - - /* - * If we've explicitly asked to tear down the loop device, - * and it has an elevated reference count, set it for auto-teardown when - * the last reference goes away. This stops $!~#$@ udev from - * preventing teardown because it decided that it needs to run blkid on - * the loopback device whenever they appear. xfstests is notorious for - * failing tests because blkid via udev races with a losetup - * /do something like mkfs/losetup -d causing the losetup -d - * command to fail with EBUSY. - */ - if (atomic_read(&lo->lo_refcnt) > 1) { - lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - mutex_unlock(&lo->lo_ctl_mutex); - return 0; + mutex_lock(&loop_ctl_mutex); + if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) { + err = -ENXIO; + goto out_unlock; } - if (filp == NULL) - return -EINVAL; + filp = lo->lo_backing_file; + if (filp == NULL) { + err = -EINVAL; + goto out_unlock; + } /* freeze request queue during the transition */ blk_mq_freeze_queue(lo->lo_queue); spin_lock_irq(&lo->lo_lock); - lo->lo_state = Lo_rundown; lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); @@ -1093,21 +1095,73 @@ static int loop_clr_fd(struct loop_device *lo) module_put(THIS_MODULE); blk_mq_unfreeze_queue(lo->lo_queue); - if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) - loop_reread_partitions(lo, bdev); + partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev; + lo_number = lo->lo_number; lo->lo_flags = 0; if (!part_shift) lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; loop_unprepare_queue(lo); - mutex_unlock(&lo->lo_ctl_mutex); +out_unlock: + mutex_unlock(&loop_ctl_mutex); + if (partscan) { + /* + * bd_mutex has been held already in release path, so don't + * acquire it if this function is called in such case. + * + * If the reread partition isn't from release path, lo_refcnt + * must be at least one and it can only become zero when the + * current holder is released. + */ + if (release) + err = __blkdev_reread_part(bdev); + else + err = blkdev_reread_part(bdev); + pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", + __func__, lo_number, err); + /* Device is gone, no point in returning error */ + err = 0; + } /* - * Need not hold lo_ctl_mutex to fput backing file. - * Calling fput holding lo_ctl_mutex triggers a circular + * Need not hold loop_ctl_mutex to fput backing file. + * Calling fput holding loop_ctl_mutex triggers a circular * lock dependency possibility warning as fput can take - * bd_mutex which is usually taken before lo_ctl_mutex. + * bd_mutex which is usually taken before loop_ctl_mutex. */ - fput(filp); - return 0; + if (filp) + fput(filp); + return err; +} + +static int loop_clr_fd(struct loop_device *lo) +{ + int err; + + err = mutex_lock_killable(&loop_ctl_mutex); + if (err) + return err; + if (lo->lo_state != Lo_bound) { + mutex_unlock(&loop_ctl_mutex); + return -ENXIO; + } + /* + * If we've explicitly asked to tear down the loop device, + * and it has an elevated reference count, set it for auto-teardown when + * the last reference goes away. This stops $!~#$@ udev from + * preventing teardown because it decided that it needs to run blkid on + * the loopback device whenever they appear. xfstests is notorious for + * failing tests because blkid via udev races with a losetup + * /do something like mkfs/losetup -d causing the losetup -d + * command to fail with EBUSY. + */ + if (atomic_read(&lo->lo_refcnt) > 1) { + lo->lo_flags |= LO_FLAGS_AUTOCLEAR; + mutex_unlock(&loop_ctl_mutex); + return 0; + } + lo->lo_state = Lo_rundown; + mutex_unlock(&loop_ctl_mutex); + + return __loop_clr_fd(lo, false); } static int @@ -1116,47 +1170,58 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) int err; struct loop_func_table *xfer; kuid_t uid = current_uid(); + struct block_device *bdev; + bool partscan = false; + err = mutex_lock_killable(&loop_ctl_mutex); + if (err) + return err; if (lo->lo_encrypt_key_size && !uid_eq(lo->lo_key_owner, uid) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - if (lo->lo_state != Lo_bound) - return -ENXIO; - if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) - return -EINVAL; + !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out_unlock; + } + if (lo->lo_state != Lo_bound) { + err = -ENXIO; + goto out_unlock; + } + if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) { + err = -EINVAL; + goto out_unlock; + } /* I/O need to be drained during transfer transition */ blk_mq_freeze_queue(lo->lo_queue); err = loop_release_xfer(lo); if (err) - goto exit; + goto out_unfreeze; if (info->lo_encrypt_type) { unsigned int type = info->lo_encrypt_type; if (type >= MAX_LO_CRYPT) { err = -EINVAL; - goto exit; + goto out_unfreeze; } xfer = xfer_funcs[type]; if (xfer == NULL) { err = -EINVAL; - goto exit; + goto out_unfreeze; } } else xfer = NULL; err = loop_init_xfer(lo, xfer, info); if (err) - goto exit; + goto out_unfreeze; if (lo->lo_offset != info->lo_offset || lo->lo_sizelimit != info->lo_sizelimit) { if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) { err = -EFBIG; - goto exit; + goto out_unfreeze; } } @@ -1188,15 +1253,20 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) /* update dio if lo_offset or transfer is changed */ __loop_update_dio(lo, lo->use_dio); - exit: +out_unfreeze: blk_mq_unfreeze_queue(lo->lo_queue); if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) && !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { lo->lo_flags |= LO_FLAGS_PARTSCAN; lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; - loop_reread_partitions(lo, lo->lo_device); + bdev = lo->lo_device; + partscan = true; } +out_unlock: + mutex_unlock(&loop_ctl_mutex); + if (partscan) + loop_reread_partitions(lo, bdev); return err; } @@ -1204,12 +1274,15 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) static int loop_get_status(struct loop_device *lo, struct loop_info64 *info) { - struct file *file; + struct path path; struct kstat stat; int ret; + ret = mutex_lock_killable(&loop_ctl_mutex); + if (ret) + return ret; if (lo->lo_state != Lo_bound) { - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return -ENXIO; } @@ -1228,17 +1301,17 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info) lo->lo_encrypt_key_size); } - /* Drop lo_ctl_mutex while we call into the filesystem. */ - file = get_file(lo->lo_backing_file); - mutex_unlock(&lo->lo_ctl_mutex); - ret = vfs_getattr(&file->f_path, &stat, STATX_INO, - AT_STATX_SYNC_AS_STAT); + /* Drop loop_ctl_mutex while we call into the filesystem. */ + path = lo->lo_backing_file->f_path; + path_get(&path); + mutex_unlock(&loop_ctl_mutex); + ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); if (!ret) { info->lo_device = huge_encode_dev(stat.dev); info->lo_inode = stat.ino; info->lo_rdevice = huge_encode_dev(stat.rdev); } - fput(file); + path_put(&path); return ret; } @@ -1322,10 +1395,8 @@ loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) { struct loop_info64 info64; int err; - if (!arg) { - mutex_unlock(&lo->lo_ctl_mutex); + if (!arg) return -EINVAL; - } err = loop_get_status(lo, &info64); if (!err) err = loop_info64_to_old(&info64, &info); @@ -1340,10 +1411,8 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { struct loop_info64 info64; int err; - if (!arg) { - mutex_unlock(&lo->lo_ctl_mutex); + if (!arg) return -EINVAL; - } err = loop_get_status(lo, &info64); if (!err && copy_to_user(arg, &info64, sizeof(info64))) err = -EFAULT; @@ -1393,70 +1462,73 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) return 0; } +static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd, + unsigned long arg) +{ + int err; + + err = mutex_lock_killable(&loop_ctl_mutex); + if (err) + return err; + switch (cmd) { + case LOOP_SET_CAPACITY: + err = loop_set_capacity(lo); + break; + case LOOP_SET_DIRECT_IO: + err = loop_set_dio(lo, arg); + break; + case LOOP_SET_BLOCK_SIZE: + err = loop_set_block_size(lo, arg); + break; + default: + err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; + } + mutex_unlock(&loop_ctl_mutex); + return err; +} + static int lo_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct loop_device *lo = bdev->bd_disk->private_data; int err; - err = mutex_lock_killable_nested(&lo->lo_ctl_mutex, 1); - if (err) - goto out_unlocked; - switch (cmd) { case LOOP_SET_FD: - err = loop_set_fd(lo, mode, bdev, arg); - break; + return loop_set_fd(lo, mode, bdev, arg); case LOOP_CHANGE_FD: - err = loop_change_fd(lo, bdev, arg); - break; + return loop_change_fd(lo, bdev, arg); case LOOP_CLR_FD: - /* loop_clr_fd would have unlocked lo_ctl_mutex on success */ - err = loop_clr_fd(lo); - if (!err) - goto out_unlocked; - break; + return loop_clr_fd(lo); case LOOP_SET_STATUS: err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { err = loop_set_status_old(lo, (struct loop_info __user *)arg); + } break; case LOOP_GET_STATUS: - err = loop_get_status_old(lo, (struct loop_info __user *) arg); - /* loop_get_status() unlocks lo_ctl_mutex */ - goto out_unlocked; + return loop_get_status_old(lo, (struct loop_info __user *) arg); case LOOP_SET_STATUS64: err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { err = loop_set_status64(lo, (struct loop_info64 __user *) arg); + } break; case LOOP_GET_STATUS64: - err = loop_get_status64(lo, (struct loop_info64 __user *) arg); - /* loop_get_status() unlocks lo_ctl_mutex */ - goto out_unlocked; + return loop_get_status64(lo, (struct loop_info64 __user *) arg); case LOOP_SET_CAPACITY: - err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) - err = loop_set_capacity(lo); - break; case LOOP_SET_DIRECT_IO: - err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) - err = loop_set_dio(lo, arg); - break; case LOOP_SET_BLOCK_SIZE: - err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) - err = loop_set_block_size(lo, arg); - break; + if (!(mode & FMODE_WRITE) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* Fall through */ default: - err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; + err = lo_simple_ioctl(lo, cmd, arg); + break; } - mutex_unlock(&lo->lo_ctl_mutex); -out_unlocked: return err; } @@ -1570,10 +1642,8 @@ loop_get_status_compat(struct loop_device *lo, struct loop_info64 info64; int err; - if (!arg) { - mutex_unlock(&lo->lo_ctl_mutex); + if (!arg) return -EINVAL; - } err = loop_get_status(lo, &info64); if (!err) err = loop_info64_to_compat(&info64, arg); @@ -1588,20 +1658,12 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, switch(cmd) { case LOOP_SET_STATUS: - err = mutex_lock_killable(&lo->lo_ctl_mutex); - if (!err) { - err = loop_set_status_compat(lo, - (const struct compat_loop_info __user *)arg); - mutex_unlock(&lo->lo_ctl_mutex); - } + err = loop_set_status_compat(lo, + (const struct compat_loop_info __user *)arg); break; case LOOP_GET_STATUS: - err = mutex_lock_killable(&lo->lo_ctl_mutex); - if (!err) { - err = loop_get_status_compat(lo, - (struct compat_loop_info __user *)arg); - /* loop_get_status() unlocks lo_ctl_mutex */ - } + err = loop_get_status_compat(lo, + (struct compat_loop_info __user *)arg); break; case LOOP_SET_CAPACITY: case LOOP_CLR_FD: @@ -1625,9 +1687,11 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, static int lo_open(struct block_device *bdev, fmode_t mode) { struct loop_device *lo; - int err = 0; + int err; - mutex_lock(&loop_index_mutex); + err = mutex_lock_killable(&loop_ctl_mutex); + if (err) + return err; lo = bdev->bd_disk->private_data; if (!lo) { err = -ENXIO; @@ -1636,26 +1700,30 @@ static int lo_open(struct block_device *bdev, fmode_t mode) atomic_inc(&lo->lo_refcnt); out: - mutex_unlock(&loop_index_mutex); + mutex_unlock(&loop_ctl_mutex); return err; } -static void __lo_release(struct loop_device *lo) +static void lo_release(struct gendisk *disk, fmode_t mode) { - int err; + struct loop_device *lo; + mutex_lock(&loop_ctl_mutex); + lo = disk->private_data; if (atomic_dec_return(&lo->lo_refcnt)) - return; + goto out_unlock; - mutex_lock(&lo->lo_ctl_mutex); if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { + if (lo->lo_state != Lo_bound) + goto out_unlock; + lo->lo_state = Lo_rundown; + mutex_unlock(&loop_ctl_mutex); /* * In autoclear mode, stop the loop thread * and remove configuration after last close. */ - err = loop_clr_fd(lo); - if (!err) - return; + __loop_clr_fd(lo, true); + return; } else if (lo->lo_state == Lo_bound) { /* * Otherwise keep thread (if running) and config, @@ -1665,14 +1733,8 @@ static void __lo_release(struct loop_device *lo) blk_mq_unfreeze_queue(lo->lo_queue); } - mutex_unlock(&lo->lo_ctl_mutex); -} - -static void lo_release(struct gendisk *disk, fmode_t mode) -{ - mutex_lock(&loop_index_mutex); - __lo_release(disk->private_data); - mutex_unlock(&loop_index_mutex); +out_unlock: + mutex_unlock(&loop_ctl_mutex); } static const struct block_device_operations lo_fops = { @@ -1711,10 +1773,10 @@ static int unregister_transfer_cb(int id, void *ptr, void *data) struct loop_device *lo = ptr; struct loop_func_table *xfer = data; - mutex_lock(&lo->lo_ctl_mutex); + mutex_lock(&loop_ctl_mutex); if (lo->lo_encryption == xfer) loop_release_xfer(lo); - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return 0; } @@ -1759,8 +1821,8 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, /* always use the first bio's css */ #ifdef CONFIG_BLK_CGROUP - if (cmd->use_aio && rq->bio && rq->bio->bi_css) { - cmd->css = rq->bio->bi_css; + if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) { + cmd->css = &bio_blkcg(rq->bio)->css; css_get(cmd->css); } else #endif @@ -1853,7 +1915,7 @@ static int loop_add(struct loop_device **l, int i) goto out_free_idr; lo->lo_queue = blk_mq_init_queue(&lo->tag_set); - if (IS_ERR_OR_NULL(lo->lo_queue)) { + if (IS_ERR(lo->lo_queue)) { err = PTR_ERR(lo->lo_queue); goto out_cleanup_tags; } @@ -1895,7 +1957,6 @@ static int loop_add(struct loop_device **l, int i) if (!part_shift) disk->flags |= GENHD_FL_NO_PART_SCAN; disk->flags |= GENHD_FL_EXT_DEVT; - mutex_init(&lo->lo_ctl_mutex); atomic_set(&lo->lo_refcnt, 0); lo->lo_number = i; spin_lock_init(&lo->lo_lock); @@ -1974,7 +2035,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data) struct kobject *kobj; int err; - mutex_lock(&loop_index_mutex); + mutex_lock(&loop_ctl_mutex); err = loop_lookup(&lo, MINOR(dev) >> part_shift); if (err < 0) err = loop_add(&lo, MINOR(dev) >> part_shift); @@ -1982,7 +2043,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data) kobj = NULL; else kobj = get_disk_and_module(lo->lo_disk); - mutex_unlock(&loop_index_mutex); + mutex_unlock(&loop_ctl_mutex); *part = 0; return kobj; @@ -1992,9 +2053,13 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd, unsigned long parm) { struct loop_device *lo; - int ret = -ENOSYS; + int ret; - mutex_lock(&loop_index_mutex); + ret = mutex_lock_killable(&loop_ctl_mutex); + if (ret) + return ret; + + ret = -ENOSYS; switch (cmd) { case LOOP_CTL_ADD: ret = loop_lookup(&lo, parm); @@ -2008,21 +2073,15 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd, ret = loop_lookup(&lo, parm); if (ret < 0) break; - ret = mutex_lock_killable(&lo->lo_ctl_mutex); - if (ret) - break; if (lo->lo_state != Lo_unbound) { ret = -EBUSY; - mutex_unlock(&lo->lo_ctl_mutex); break; } if (atomic_read(&lo->lo_refcnt) > 0) { ret = -EBUSY; - mutex_unlock(&lo->lo_ctl_mutex); break; } lo->lo_disk->private_data = NULL; - mutex_unlock(&lo->lo_ctl_mutex); idr_remove(&loop_index_idr, lo->lo_number); loop_remove(lo); break; @@ -2032,7 +2091,7 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd, break; ret = loop_add(&lo, -1); } - mutex_unlock(&loop_index_mutex); + mutex_unlock(&loop_ctl_mutex); return ret; } @@ -2116,10 +2175,10 @@ static int __init loop_init(void) THIS_MODULE, loop_probe, NULL, NULL); /* pre-create number of devices given by config or max_loop */ - mutex_lock(&loop_index_mutex); + mutex_lock(&loop_ctl_mutex); for (i = 0; i < nr; i++) loop_add(&lo, i); - mutex_unlock(&loop_index_mutex); + mutex_unlock(&loop_ctl_mutex); printk(KERN_INFO "loop: module loaded\n"); return 0; diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 4d42c7af7de7..af75a5ee4094 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -54,7 +54,6 @@ struct loop_device { spinlock_t lo_lock; int lo_state; - struct mutex lo_ctl_mutex; struct kthread_worker worker; struct task_struct *worker_task; bool use_dio; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index a7daa8acbab3..88e8440e75c3 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -168,41 +168,6 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev) return false; /* device present */ } -/* we have to use runtime tag to setup command header */ -static void mtip_init_cmd_header(struct request *rq) -{ - struct driver_data *dd = rq->q->queuedata; - struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); - - /* Point the command headers at the command tables. */ - cmd->command_header = dd->port->command_list + - (sizeof(struct mtip_cmd_hdr) * rq->tag); - cmd->command_header_dma = dd->port->command_list_dma + - (sizeof(struct mtip_cmd_hdr) * rq->tag); - - if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags)) - cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16); - - cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF); -} - -static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) -{ - struct request *rq; - - if (mtip_check_surprise_removal(dd->pdev)) - return NULL; - - rq = blk_mq_alloc_request(dd->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_RESERVED); - if (IS_ERR(rq)) - return NULL; - - /* Internal cmd isn't submitted via .queue_rq */ - mtip_init_cmd_header(rq); - - return blk_mq_rq_to_pdu(rq); -} - static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd, unsigned int tag) { @@ -1023,13 +988,14 @@ static int mtip_exec_internal_command(struct mtip_port *port, return -EFAULT; } - int_cmd = mtip_get_int_command(dd); - if (!int_cmd) { + if (mtip_check_surprise_removal(dd->pdev)) + return -EFAULT; + + rq = blk_mq_alloc_request(dd->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_RESERVED); + if (IS_ERR(rq)) { dbg_printk(MTIP_DRV_NAME "Unable to allocate tag for PIO cmd\n"); return -EFAULT; } - rq = blk_mq_rq_from_pdu(int_cmd); - rq->special = &icmd; set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); @@ -1050,6 +1016,8 @@ static int mtip_exec_internal_command(struct mtip_port *port, } /* Copy the command to the command table */ + int_cmd = blk_mq_rq_to_pdu(rq); + int_cmd->icmd = &icmd; memcpy(int_cmd->command, fis, fis_len*4); rq->timeout = timeout; @@ -1423,23 +1391,19 @@ static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id, * @dd pointer to driver_data structure * @lba starting lba * @len # of 512b sectors to trim - * - * return value - * -ENOMEM Out of dma memory - * -EINVAL Invalid parameters passed in, trim not supported - * -EIO Error submitting trim request to hw */ -static int mtip_send_trim(struct driver_data *dd, unsigned int lba, - unsigned int len) +static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba, + unsigned int len) { - int i, rv = 0; u64 tlba, tlen, sect_left; struct mtip_trim_entry *buf; dma_addr_t dma_addr; struct host_to_dev_fis fis; + blk_status_t ret = BLK_STS_OK; + int i; if (!len || dd->trim_supp == false) - return -EINVAL; + return BLK_STS_IOERR; /* Trim request too big */ WARN_ON(len > (MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES)); @@ -1454,7 +1418,7 @@ static int mtip_send_trim(struct driver_data *dd, unsigned int lba, buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr, GFP_KERNEL); if (!buf) - return -ENOMEM; + return BLK_STS_RESOURCE; memset(buf, 0, ATA_SECT_SIZE); for (i = 0, sect_left = len, tlba = lba; @@ -1463,8 +1427,8 @@ static int mtip_send_trim(struct driver_data *dd, unsigned int lba, tlen = (sect_left >= MTIP_MAX_TRIM_ENTRY_LEN ? MTIP_MAX_TRIM_ENTRY_LEN : sect_left); - buf[i].lba = __force_bit2int cpu_to_le32(tlba); - buf[i].range = __force_bit2int cpu_to_le16(tlen); + buf[i].lba = cpu_to_le32(tlba); + buf[i].range = cpu_to_le16(tlen); tlba += tlen; sect_left -= tlen; } @@ -1486,10 +1450,10 @@ static int mtip_send_trim(struct driver_data *dd, unsigned int lba, ATA_SECT_SIZE, 0, MTIP_TRIM_TIMEOUT_MS) < 0) - rv = -EIO; + ret = BLK_STS_IOERR; dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr); - return rv; + return ret; } /* @@ -1585,23 +1549,20 @@ static inline void fill_command_sg(struct driver_data *dd, int n; unsigned int dma_len; struct mtip_cmd_sg *command_sg; - struct scatterlist *sg = command->sg; + struct scatterlist *sg; command_sg = command->command + AHCI_CMD_TBL_HDR_SZ; - for (n = 0; n < nents; n++) { + for_each_sg(command->sg, sg, nents, n) { dma_len = sg_dma_len(sg); if (dma_len > 0x400000) dev_err(&dd->pdev->dev, "DMA segment length truncated\n"); - command_sg->info = __force_bit2int - cpu_to_le32((dma_len-1) & 0x3FFFFF); - command_sg->dba = __force_bit2int - cpu_to_le32(sg_dma_address(sg)); - command_sg->dba_upper = __force_bit2int + command_sg->info = cpu_to_le32((dma_len-1) & 0x3FFFFF); + command_sg->dba = cpu_to_le32(sg_dma_address(sg)); + command_sg->dba_upper = cpu_to_le32((sg_dma_address(sg) >> 16) >> 16); command_sg++; - sg++; } } @@ -2171,7 +2132,6 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, * @dd Pointer to the driver data structure. * @start First sector to read. * @nsect Number of sectors to read. - * @nents Number of entries in scatter list for the read command. * @tag The tag of this read command. * @callback Pointer to the function that should be called * when the read completes. @@ -2183,16 +2143,20 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, * None */ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, - struct mtip_cmd *command, int nents, + struct mtip_cmd *command, struct blk_mq_hw_ctx *hctx) { + struct mtip_cmd_hdr *hdr = + dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag; struct host_to_dev_fis *fis; struct mtip_port *port = dd->port; int dma_dir = rq_data_dir(rq) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE; u64 start = blk_rq_pos(rq); unsigned int nsect = blk_rq_sectors(rq); + unsigned int nents; /* Map the scatter list for DMA access */ + nents = blk_rq_map_sg(hctx->queue, rq, command->sg); nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); prefetch(&port->flags); @@ -2233,10 +2197,11 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, fis->device |= 1 << 7; /* Populate the command header */ - command->command_header->opts = - __force_bit2int cpu_to_le32( - (nents << 16) | 5 | AHCI_CMD_PREFETCH); - command->command_header->byte_count = 0; + hdr->ctba = cpu_to_le32(command->command_dma & 0xFFFFFFFF); + if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags)) + hdr->ctbau = cpu_to_le32((command->command_dma >> 16) >> 16); + hdr->opts = cpu_to_le32((nents << 16) | 5 | AHCI_CMD_PREFETCH); + hdr->byte_count = 0; command->direction = dma_dir; @@ -2715,12 +2680,12 @@ static void mtip_softirq_done_fn(struct request *rq) cmd->direction); if (unlikely(cmd->unaligned)) - up(&dd->port->cmd_slot_unal); + atomic_inc(&dd->port->cmd_slot_unal); blk_mq_end_request(rq, cmd->status); } -static void mtip_abort_cmd(struct request *req, void *data, bool reserved) +static bool mtip_abort_cmd(struct request *req, void *data, bool reserved) { struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req); struct driver_data *dd = data; @@ -2730,14 +2695,16 @@ static void mtip_abort_cmd(struct request *req, void *data, bool reserved) clear_bit(req->tag, dd->port->cmds_to_issue); cmd->status = BLK_STS_IOERR; mtip_softirq_done_fn(req); + return true; } -static void mtip_queue_cmd(struct request *req, void *data, bool reserved) +static bool mtip_queue_cmd(struct request *req, void *data, bool reserved) { struct driver_data *dd = data; set_bit(req->tag, dd->port->cmds_to_issue); blk_abort_request(req); + return true; } /* @@ -2803,10 +2770,7 @@ restart_eh: blk_mq_quiesce_queue(dd->queue); - spin_lock(dd->queue->queue_lock); - blk_mq_tagset_busy_iter(&dd->tags, - mtip_queue_cmd, dd); - spin_unlock(dd->queue->queue_lock); + blk_mq_tagset_busy_iter(&dd->tags, mtip_queue_cmd, dd); set_bit(MTIP_PF_ISSUE_CMDS_BIT, &dd->port->flags); @@ -3026,7 +2990,7 @@ static int mtip_hw_init(struct driver_data *dd) else dd->unal_qdepth = 0; - sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth); + atomic_set(&dd->port->cmd_slot_unal, dd->unal_qdepth); /* Spinlock to prevent concurrent issue */ for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) @@ -3531,58 +3495,24 @@ static inline bool is_se_active(struct driver_data *dd) return false; } -/* - * Block layer make request function. - * - * This function is called by the kernel to process a BIO for - * the P320 device. - * - * @queue Pointer to the request queue. Unused other than to obtain - * the driver data structure. - * @rq Pointer to the request. - * - */ -static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) +static inline bool is_stopped(struct driver_data *dd, struct request *rq) { - struct driver_data *dd = hctx->queue->queuedata; - struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); - unsigned int nents; + if (likely(!(dd->dd_flag & MTIP_DDF_STOP_IO))) + return false; - if (is_se_active(dd)) - return -ENODATA; + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) + return true; + if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag)) + return true; + if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag) && + rq_data_dir(rq)) + return true; + if (test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)) + return true; + if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) + return true; - if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { - if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, - &dd->dd_flag))) { - return -ENXIO; - } - if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) { - return -ENODATA; - } - if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT, - &dd->dd_flag) && - rq_data_dir(rq))) { - return -ENODATA; - } - if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag) || - test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))) - return -ENODATA; - } - - if (req_op(rq) == REQ_OP_DISCARD) { - int err; - - err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)); - blk_mq_end_request(rq, err ? BLK_STS_IOERR : BLK_STS_OK); - return 0; - } - - /* Create the scatter list for this request. */ - nents = blk_rq_map_sg(hctx->queue, rq, cmd->sg); - - /* Issue the read/write. */ - mtip_hw_submit_io(dd, rq, cmd, nents, hctx); - return 0; + return false; } static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, @@ -3603,7 +3533,7 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, cmd->unaligned = 1; } - if (cmd->unaligned && down_trylock(&dd->port->cmd_slot_unal)) + if (cmd->unaligned && atomic_dec_if_positive(&dd->port->cmd_slot_unal) >= 0) return true; return false; @@ -3613,32 +3543,33 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct driver_data *dd = hctx->queue->queuedata; - struct mtip_int_cmd *icmd = rq->special; struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct mtip_int_cmd *icmd = cmd->icmd; + struct mtip_cmd_hdr *hdr = + dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag; struct mtip_cmd_sg *command_sg; if (mtip_commands_active(dd->port)) - return BLK_STS_RESOURCE; + return BLK_STS_DEV_RESOURCE; + hdr->ctba = cpu_to_le32(cmd->command_dma & 0xFFFFFFFF); + if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags)) + hdr->ctbau = cpu_to_le32((cmd->command_dma >> 16) >> 16); /* Populate the SG list */ - cmd->command_header->opts = - __force_bit2int cpu_to_le32(icmd->opts | icmd->fis_len); + hdr->opts = cpu_to_le32(icmd->opts | icmd->fis_len); if (icmd->buf_len) { command_sg = cmd->command + AHCI_CMD_TBL_HDR_SZ; - command_sg->info = - __force_bit2int cpu_to_le32((icmd->buf_len-1) & 0x3FFFFF); - command_sg->dba = - __force_bit2int cpu_to_le32(icmd->buffer & 0xFFFFFFFF); + command_sg->info = cpu_to_le32((icmd->buf_len-1) & 0x3FFFFF); + command_sg->dba = cpu_to_le32(icmd->buffer & 0xFFFFFFFF); command_sg->dba_upper = - __force_bit2int cpu_to_le32((icmd->buffer >> 16) >> 16); + cpu_to_le32((icmd->buffer >> 16) >> 16); - cmd->command_header->opts |= - __force_bit2int cpu_to_le32((1 << 16)); + hdr->opts |= cpu_to_le32((1 << 16)); } /* Populate the command header */ - cmd->command_header->byte_count = 0; + hdr->byte_count = 0; blk_mq_start_request(rq); mtip_issue_non_ncq_command(dd->port, rq->tag); @@ -3648,23 +3579,25 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { + struct driver_data *dd = hctx->queue->queuedata; struct request *rq = bd->rq; - int ret; - - mtip_init_cmd_header(rq); + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); if (blk_rq_is_passthrough(rq)) return mtip_issue_reserved_cmd(hctx, rq); if (unlikely(mtip_check_unal_depth(hctx, rq))) - return BLK_STS_RESOURCE; + return BLK_STS_DEV_RESOURCE; + + if (is_se_active(dd) || is_stopped(dd, rq)) + return BLK_STS_IOERR; blk_mq_start_request(rq); - ret = mtip_submit_request(hctx, rq); - if (likely(!ret)) - return BLK_STS_OK; - return BLK_STS_IOERR; + if (req_op(rq) == REQ_OP_DISCARD) + return mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)); + mtip_hw_submit_io(dd, rq, cmd, hctx); + return BLK_STS_OK; } static void mtip_free_cmd(struct blk_mq_tag_set *set, struct request *rq, @@ -3920,12 +3853,13 @@ protocol_init_error: return rv; } -static void mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv) +static bool mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv) { struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); cmd->status = BLK_STS_IOERR; blk_mq_complete_request(rq); + return true; } /* diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index e20e55dab443..abce25f27f57 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -126,8 +126,6 @@ #define MTIP_DFS_MAX_BUF_SIZE 1024 -#define __force_bit2int (unsigned int __force) - enum { /* below are bit numbers in 'flags' defined in mtip_port */ MTIP_PF_IC_ACTIVE_BIT = 0, /* pio/ioctl */ @@ -174,10 +172,10 @@ enum { struct smart_attr { u8 attr_id; - u16 flags; + __le16 flags; u8 cur; u8 worst; - u32 data; + __le32 data; u8 res[3]; } __packed; @@ -200,9 +198,9 @@ struct mtip_work { #define MTIP_MAX_TRIM_ENTRY_LEN 0xfff8 struct mtip_trim_entry { - u32 lba; /* starting lba of region */ - u16 rsvd; /* unused */ - u16 range; /* # of 512b blocks to trim */ + __le32 lba; /* starting lba of region */ + __le16 rsvd; /* unused */ + __le16 range; /* # of 512b blocks to trim */ } __packed; struct mtip_trim { @@ -278,24 +276,24 @@ struct mtip_cmd_hdr { * - Bit 5 Unused in this implementation. * - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes). */ - unsigned int opts; + __le32 opts; /* This field is unsed when using NCQ. */ union { - unsigned int byte_count; - unsigned int status; + __le32 byte_count; + __le32 status; }; /* * Lower 32 bits of the command table address associated with this * header. The command table addresses must be 128 byte aligned. */ - unsigned int ctba; + __le32 ctba; /* * If 64 bit addressing is used this field is the upper 32 bits * of the command table address associated with this command. */ - unsigned int ctbau; + __le32 ctbau; /* Reserved and unused. */ - unsigned int res[4]; + u32 res[4]; }; /* Command scatter gather structure (PRD). */ @@ -305,31 +303,28 @@ struct mtip_cmd_sg { * address must be 8 byte aligned signified by bits 2:0 being * set to 0. */ - unsigned int dba; + __le32 dba; /* * When 64 bit addressing is used this field is the upper * 32 bits of the data buffer address. */ - unsigned int dba_upper; + __le32 dba_upper; /* Unused. */ - unsigned int reserved; + __le32 reserved; /* * Bit 31: interrupt when this data block has been transferred. * Bits 30..22: reserved * Bits 21..0: byte count (minus 1). For P320 the byte count must be * 8 byte aligned signified by bits 2:0 being set to 1. */ - unsigned int info; + __le32 info; }; struct mtip_port; +struct mtip_int_cmd; + /* Structure used to describe a command. */ struct mtip_cmd { - - struct mtip_cmd_hdr *command_header; /* ptr to command header entry */ - - dma_addr_t command_header_dma; /* corresponding physical address */ - void *command; /* ptr to command table entry */ dma_addr_t command_dma; /* corresponding physical address */ @@ -338,7 +333,10 @@ struct mtip_cmd { int unaligned; /* command is unaligned on 4k boundary */ - struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */ + union { + struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */ + struct mtip_int_cmd *icmd; + }; int retries; /* The number of retries left for this command. */ @@ -435,8 +433,8 @@ struct mtip_port { */ unsigned long ic_pause_timer; - /* Semaphore to control queue depth of unaligned IOs */ - struct semaphore cmd_slot_unal; + /* Counter to control queue depth of unaligned IOs */ + atomic_t cmd_slot_unal; /* Spinlock for working around command-issue bug. */ spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS]; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4d4d6129ff66..08696f5f00bb 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -734,12 +734,13 @@ static void recv_work(struct work_struct *work) kfree(args); } -static void nbd_clear_req(struct request *req, void *data, bool reserved) +static bool nbd_clear_req(struct request *req, void *data, bool reserved) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); cmd->status = BLK_STS_IOERR; blk_mq_complete_request(req); + return true; } static void nbd_clear_que(struct nbd_device *nbd) diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 7685df43f1ef..b3df2793e7cd 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -49,6 +49,7 @@ struct nullb_device { unsigned long completion_nsec; /* time in ns to complete a request */ unsigned long cache_size; /* disk cache size in MB */ unsigned long zone_size; /* zone size in MB if device is zoned */ + unsigned int zone_nr_conv; /* number of conventional zones */ unsigned int submit_queues; /* number of submission queues */ unsigned int home_node; /* home node for the device */ unsigned int queue_mode; /* block interface */ diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 09339203dfba..62c9654b9ce8 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -188,6 +188,10 @@ static unsigned long g_zone_size = 256; module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); +static unsigned int g_zone_nr_conv; +module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); +MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); + static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); @@ -293,6 +297,7 @@ NULLB_DEVICE_ATTR(mbps, uint); NULLB_DEVICE_ATTR(cache_size, ulong); NULLB_DEVICE_ATTR(zoned, bool); NULLB_DEVICE_ATTR(zone_size, ulong); +NULLB_DEVICE_ATTR(zone_nr_conv, uint); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -407,6 +412,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_badblocks, &nullb_device_attr_zoned, &nullb_device_attr_zone_size, + &nullb_device_attr_zone_nr_conv, NULL, }; @@ -520,6 +526,7 @@ static struct nullb_device *null_alloc_dev(void) dev->use_per_node_hctx = g_use_per_node_hctx; dev->zoned = g_zoned; dev->zone_size = g_zone_size; + dev->zone_nr_conv = g_zone_nr_conv; return dev; } @@ -635,14 +642,9 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd) hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); } -static void null_softirq_done_fn(struct request *rq) +static void null_complete_rq(struct request *rq) { - struct nullb *nullb = rq->q->queuedata; - - if (nullb->dev->queue_mode == NULL_Q_MQ) - end_cmd(blk_mq_rq_to_pdu(rq)); - else - end_cmd(rq->special); + end_cmd(blk_mq_rq_to_pdu(rq)); } static struct nullb_page *null_alloc_page(gfp_t gfp_flags) @@ -1350,7 +1352,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, static const struct blk_mq_ops null_mq_ops = { .queue_rq = null_queue_rq, - .complete = null_softirq_done_fn, + .complete = null_complete_rq, .timeout = null_timeout_rq, }; @@ -1657,8 +1659,7 @@ static int null_add_dev(struct nullb_device *dev) } null_init_queues(nullb); } else if (dev->queue_mode == NULL_Q_BIO) { - nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node, - NULL); + nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node); if (!nullb->q) { rv = -ENOMEM; goto out_cleanup_queues; diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index c0b0e4a3fa8f..5d1c261a2cfd 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -29,7 +29,25 @@ int null_zone_init(struct nullb_device *dev) if (!dev->zones) return -ENOMEM; - for (i = 0; i < dev->nr_zones; i++) { + if (dev->zone_nr_conv >= dev->nr_zones) { + dev->zone_nr_conv = dev->nr_zones - 1; + pr_info("null_blk: changed the number of conventional zones to %u", + dev->zone_nr_conv); + } + + for (i = 0; i < dev->zone_nr_conv; i++) { + struct blk_zone *zone = &dev->zones[i]; + + zone->start = sector; + zone->len = dev->zone_size_sects; + zone->wp = zone->start + zone->len; + zone->type = BLK_ZONE_TYPE_CONVENTIONAL; + zone->cond = BLK_ZONE_COND_NOT_WP; + + sector += dev->zone_size_sects; + } + + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { struct blk_zone *zone = &dev->zones[i]; zone->start = zone->wp = sector; @@ -98,6 +116,8 @@ void null_zone_write(struct nullb_cmd *cmd, sector_t sector, if (zone->wp == zone->start + zone->len) zone->cond = BLK_ZONE_COND_FULL; break; + case BLK_ZONE_COND_NOT_WP: + break; default: /* Invalid zone condition */ cmd->error = BLK_STS_IOERR; @@ -111,6 +131,11 @@ void null_zone_reset(struct nullb_cmd *cmd, sector_t sector) unsigned int zno = null_zone_no(dev, sector); struct blk_zone *zone = &dev->zones[zno]; + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) { + cmd->error = BLK_STS_IOERR; + return; + } + zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; } diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index ae4971e5d9a8..0ff9b12d0e35 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -242,6 +242,11 @@ struct pd_unit { static struct pd_unit pd[PD_UNITS]; +struct pd_req { + /* for REQ_OP_DRV_IN: */ + enum action (*func)(struct pd_unit *disk); +}; + static char pd_scratch[512]; /* scratch block buffer */ static char *pd_errs[17] = { "ERR", "INDEX", "ECC", "DRQ", "SEEK", "WRERR", @@ -502,8 +507,9 @@ static enum action do_pd_io_start(void) static enum action pd_special(void) { - enum action (*func)(struct pd_unit *) = pd_req->special; - return func(pd_current); + struct pd_req *req = blk_mq_rq_to_pdu(pd_req); + + return req->func(pd_current); } static int pd_next_buf(void) @@ -767,12 +773,14 @@ static int pd_special_command(struct pd_unit *disk, enum action (*func)(struct pd_unit *disk)) { struct request *rq; + struct pd_req *req; rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); + req = blk_mq_rq_to_pdu(rq); - rq->special = func; + req->func = func; blk_execute_rq(disk->gd->queue, disk->gd, rq, 0); blk_put_request(rq); return 0; @@ -892,9 +900,21 @@ static void pd_probe_drive(struct pd_unit *disk) disk->gd = p; p->private_data = disk; - p->queue = blk_mq_init_sq_queue(&disk->tag_set, &pd_mq_ops, 2, - BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING); + memset(&disk->tag_set, 0, sizeof(disk->tag_set)); + disk->tag_set.ops = &pd_mq_ops; + disk->tag_set.cmd_size = sizeof(struct pd_req); + disk->tag_set.nr_hw_queues = 1; + disk->tag_set.nr_maps = 1; + disk->tag_set.queue_depth = 2; + disk->tag_set.numa_node = NUMA_NO_NODE; + disk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; + + if (blk_mq_alloc_tag_set(&disk->tag_set)) + return; + + p->queue = blk_mq_init_queue(&disk->tag_set); if (IS_ERR(p->queue)) { + blk_mq_free_tag_set(&disk->tag_set); p->queue = NULL; return; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 9381f4e3b221..f5a71023f76c 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2203,9 +2203,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) * Some CDRW drives can not handle writes larger than one packet, * even if the size is a multiple of the packet size. */ - spin_lock_irq(q->queue_lock); blk_queue_max_hw_sectors(q, pd->settings.size); - spin_unlock_irq(q->queue_lock); set_bit(PACKET_WRITABLE, &pd->flags); } else { pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 2459dcc04b1c..a10d5736d8f7 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -181,6 +181,7 @@ struct skd_request_context { struct fit_completion_entry_v1 completion; struct fit_comp_error_info err_info; + int retries; blk_status_t status; }; @@ -382,11 +383,12 @@ static void skd_log_skreq(struct skd_device *skdev, * READ/WRITE REQUESTS ***************************************************************************** */ -static void skd_inc_in_flight(struct request *rq, void *data, bool reserved) +static bool skd_inc_in_flight(struct request *rq, void *data, bool reserved) { int *count = data; count++; + return true; } static int skd_in_flight(struct skd_device *skdev) @@ -494,6 +496,11 @@ static blk_status_t skd_mq_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(skdev->state != SKD_DRVR_STATE_ONLINE)) return skd_fail_all(q) ? BLK_STS_IOERR : BLK_STS_RESOURCE; + if (!(req->rq_flags & RQF_DONTPREP)) { + skreq->retries = 0; + req->rq_flags |= RQF_DONTPREP; + } + blk_mq_start_request(req); WARN_ONCE(tag >= skd_max_queue_depth, "%#x > %#x (nr_requests = %lu)\n", @@ -1425,7 +1432,7 @@ static void skd_resolve_req_exception(struct skd_device *skdev, break; case SKD_CHECK_STATUS_REQUEUE_REQUEST: - if ((unsigned long) ++req->special < SKD_MAX_RETRIES) { + if (++skreq->retries < SKD_MAX_RETRIES) { skd_log_skreq(skdev, skreq, "retry"); blk_mq_requeue_request(req, true); break; @@ -1887,13 +1894,13 @@ static void skd_isr_fwstate(struct skd_device *skdev) skd_skdev_state_to_str(skdev->state), skdev->state); } -static void skd_recover_request(struct request *req, void *data, bool reserved) +static bool skd_recover_request(struct request *req, void *data, bool reserved) { struct skd_device *const skdev = data; struct skd_request_context *skreq = blk_mq_rq_to_pdu(req); if (skreq->state != SKD_REQ_STATE_BUSY) - return; + return true; skd_log_skreq(skdev, skreq, "recover"); @@ -1904,6 +1911,7 @@ static void skd_recover_request(struct request *req, void *data, bool reserved) skreq->state = SKD_REQ_STATE_IDLE; skreq->status = BLK_STS_IOERR; blk_mq_complete_request(req); + return true; } static void skd_recover_requests(struct skd_device *skdev) diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index b54fa6726303..9c0553dd13e7 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include @@ -45,6 +45,8 @@ MODULE_VERSION(DRV_MODULE_VERSION); #define WAITING_FOR_GEN_CMD 0x04 #define WAITING_FOR_ANY -1 +#define VDC_MAX_RETRIES 10 + static struct workqueue_struct *sunvdc_wq; struct vdc_req_entry { @@ -66,9 +68,10 @@ struct vdc_port { u64 max_xfer_size; u32 vdisk_block_size; + u32 drain; u64 ldc_timeout; - struct timer_list ldc_reset_timer; + struct delayed_work ldc_reset_timer_work; struct work_struct ldc_reset_work; /* The server fills these in for us in the disk attribute @@ -80,12 +83,14 @@ struct vdc_port { u8 vdisk_mtype; u32 vdisk_phys_blksz; + struct blk_mq_tag_set tag_set; + char disk_name[32]; }; static void vdc_ldc_reset(struct vdc_port *port); static void vdc_ldc_reset_work(struct work_struct *work); -static void vdc_ldc_reset_timer(struct timer_list *t); +static void vdc_ldc_reset_timer_work(struct work_struct *work); static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio) { @@ -175,11 +180,8 @@ static void vdc_blk_queue_start(struct vdc_port *port) * handshake completes, so check for initial handshake before we've * allocated a disk. */ - if (port->disk && blk_queue_stopped(port->disk->queue) && - vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50) { - blk_start_queue(port->disk->queue); - } - + if (port->disk && vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50) + blk_mq_start_hw_queues(port->disk->queue); } static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for) @@ -197,7 +199,7 @@ static void vdc_handshake_complete(struct vio_driver_state *vio) { struct vdc_port *port = to_vdc_port(vio); - del_timer(&port->ldc_reset_timer); + cancel_delayed_work(&port->ldc_reset_timer_work); vdc_finish(vio, 0, WAITING_FOR_LINK_UP); vdc_blk_queue_start(port); } @@ -320,7 +322,7 @@ static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr, rqe->req = NULL; - __blk_end_request(req, (desc->status ? BLK_STS_IOERR : 0), desc->size); + blk_mq_end_request(req, desc->status ? BLK_STS_IOERR : 0); vdc_blk_queue_start(port); } @@ -431,6 +433,7 @@ static int __vdc_tx_trigger(struct vdc_port *port) .end_idx = dr->prod, }; int err, delay; + int retries = 0; hdr.seq = dr->snd_nxt; delay = 1; @@ -443,6 +446,8 @@ static int __vdc_tx_trigger(struct vdc_port *port) udelay(delay); if ((delay <<= 1) > 128) delay = 128; + if (retries++ > VDC_MAX_RETRIES) + break; } while (err == -EAGAIN); if (err == -ENOTCONN) @@ -525,29 +530,40 @@ static int __send_request(struct request *req) return err; } -static void do_vdc_request(struct request_queue *rq) +static blk_status_t vdc_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct request *req; + struct vdc_port *port = hctx->queue->queuedata; + struct vio_dring_state *dr; + unsigned long flags; - while ((req = blk_peek_request(rq)) != NULL) { - struct vdc_port *port; - struct vio_dring_state *dr; + dr = &port->vio.drings[VIO_DRIVER_TX_RING]; - port = req->rq_disk->private_data; - dr = &port->vio.drings[VIO_DRIVER_TX_RING]; - if (unlikely(vdc_tx_dring_avail(dr) < 1)) - goto wait; + blk_mq_start_request(bd->rq); - blk_start_request(req); + spin_lock_irqsave(&port->vio.lock, flags); - if (__send_request(req) < 0) { - blk_requeue_request(rq, req); -wait: - /* Avoid pointless unplugs. */ - blk_stop_queue(rq); - break; - } + /* + * Doing drain, just end the request in error + */ + if (unlikely(port->drain)) { + spin_unlock_irqrestore(&port->vio.lock, flags); + return BLK_STS_IOERR; } + + if (unlikely(vdc_tx_dring_avail(dr) < 1)) { + spin_unlock_irqrestore(&port->vio.lock, flags); + blk_mq_stop_hw_queue(hctx); + return BLK_STS_DEV_RESOURCE; + } + + if (__send_request(bd->rq) < 0) { + spin_unlock_irqrestore(&port->vio.lock, flags); + return BLK_STS_IOERR; + } + + spin_unlock_irqrestore(&port->vio.lock, flags); + return BLK_STS_OK; } static int generic_request(struct vdc_port *port, u8 op, void *buf, int len) @@ -759,6 +775,31 @@ static void vdc_port_down(struct vdc_port *port) vio_ldc_free(&port->vio); } +static const struct blk_mq_ops vdc_mq_ops = { + .queue_rq = vdc_queue_rq, +}; + +static void cleanup_queue(struct request_queue *q) +{ + struct vdc_port *port = q->queuedata; + + blk_cleanup_queue(q); + blk_mq_free_tag_set(&port->tag_set); +} + +static struct request_queue *init_queue(struct vdc_port *port) +{ + struct request_queue *q; + + q = blk_mq_init_sq_queue(&port->tag_set, &vdc_mq_ops, VDC_TX_RING_SIZE, + BLK_MQ_F_SHOULD_MERGE); + if (IS_ERR(q)) + return q; + + q->queuedata = port; + return q; +} + static int probe_disk(struct vdc_port *port) { struct request_queue *q; @@ -796,17 +837,17 @@ static int probe_disk(struct vdc_port *port) (u64)geom.num_sec); } - q = blk_init_queue(do_vdc_request, &port->vio.lock); - if (!q) { + q = init_queue(port); + if (IS_ERR(q)) { printk(KERN_ERR PFX "%s: Could not allocate queue.\n", port->vio.name); - return -ENOMEM; + return PTR_ERR(q); } g = alloc_disk(1 << PARTITION_SHIFT); if (!g) { printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n", port->vio.name); - blk_cleanup_queue(q); + cleanup_queue(q); return -ENOMEM; } @@ -981,7 +1022,7 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) */ ldc_timeout = mdesc_get_property(hp, vdev->mp, "vdc-timeout", NULL); port->ldc_timeout = ldc_timeout ? *ldc_timeout : 0; - timer_setup(&port->ldc_reset_timer, vdc_ldc_reset_timer, 0); + INIT_DELAYED_WORK(&port->ldc_reset_timer_work, vdc_ldc_reset_timer_work); INIT_WORK(&port->ldc_reset_work, vdc_ldc_reset_work); err = vio_driver_init(&port->vio, vdev, VDEV_DISK, @@ -1034,18 +1075,14 @@ static int vdc_port_remove(struct vio_dev *vdev) struct vdc_port *port = dev_get_drvdata(&vdev->dev); if (port) { - unsigned long flags; - - spin_lock_irqsave(&port->vio.lock, flags); - blk_stop_queue(port->disk->queue); - spin_unlock_irqrestore(&port->vio.lock, flags); + blk_mq_stop_hw_queues(port->disk->queue); flush_work(&port->ldc_reset_work); - del_timer_sync(&port->ldc_reset_timer); + cancel_delayed_work_sync(&port->ldc_reset_timer_work); del_timer_sync(&port->vio.timer); del_gendisk(port->disk); - blk_cleanup_queue(port->disk->queue); + cleanup_queue(port->disk->queue); put_disk(port->disk); port->disk = NULL; @@ -1080,32 +1117,46 @@ static void vdc_requeue_inflight(struct vdc_port *port) } rqe->req = NULL; - blk_requeue_request(port->disk->queue, req); + blk_mq_requeue_request(req, false); } } static void vdc_queue_drain(struct vdc_port *port) { - struct request *req; + struct request_queue *q = port->disk->queue; - while ((req = blk_fetch_request(port->disk->queue)) != NULL) - __blk_end_request_all(req, BLK_STS_IOERR); + /* + * Mark the queue as draining, then freeze/quiesce to ensure + * that all existing requests are seen in ->queue_rq() and killed + */ + port->drain = 1; + spin_unlock_irq(&port->vio.lock); + + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + spin_lock_irq(&port->vio.lock); + port->drain = 0; + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); } -static void vdc_ldc_reset_timer(struct timer_list *t) +static void vdc_ldc_reset_timer_work(struct work_struct *work) { - struct vdc_port *port = from_timer(port, t, ldc_reset_timer); - struct vio_driver_state *vio = &port->vio; - unsigned long flags; + struct vdc_port *port; + struct vio_driver_state *vio; - spin_lock_irqsave(&vio->lock, flags); + port = container_of(work, struct vdc_port, ldc_reset_timer_work.work); + vio = &port->vio; + + spin_lock_irq(&vio->lock); if (!(port->vio.hs_state & VIO_HS_COMPLETE)) { pr_warn(PFX "%s ldc down %llu seconds, draining queue\n", port->disk_name, port->ldc_timeout); vdc_queue_drain(port); vdc_blk_queue_start(port); } - spin_unlock_irqrestore(&vio->lock, flags); + spin_unlock_irq(&vio->lock); } static void vdc_ldc_reset_work(struct work_struct *work) @@ -1129,7 +1180,7 @@ static void vdc_ldc_reset(struct vdc_port *port) assert_spin_locked(&port->vio.lock); pr_warn(PFX "%s ldc link reset\n", port->disk_name); - blk_stop_queue(port->disk->queue); + blk_mq_stop_hw_queues(port->disk->queue); vdc_requeue_inflight(port); vdc_port_down(port); @@ -1146,7 +1197,7 @@ static void vdc_ldc_reset(struct vdc_port *port) } if (port->ldc_timeout) - mod_timer(&port->ldc_reset_timer, + mod_delayed_work(system_wq, &port->ldc_reset_timer_work, round_jiffies(jiffies + HZ * port->ldc_timeout)); mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ)); return; diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 064b8c5c7a32..4478eb7efee0 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -243,7 +243,6 @@ struct carm_port { unsigned int port_no; struct gendisk *disk; struct carm_host *host; - struct blk_mq_tag_set tag_set; /* attached device characteristics */ u64 capacity; @@ -254,13 +253,10 @@ struct carm_port { }; struct carm_request { - unsigned int tag; int n_elem; unsigned int msg_type; unsigned int msg_subtype; unsigned int msg_bucket; - struct request *rq; - struct carm_port *port; struct scatterlist sg[CARM_MAX_REQ_SG]; }; @@ -291,9 +287,6 @@ struct carm_host { unsigned int wait_q_cons; struct request_queue *wait_q[CARM_MAX_WAIT_Q]; - unsigned int n_msgs; - u64 msg_alloc; - struct carm_request req[CARM_MAX_REQ]; void *msg_base; dma_addr_t msg_dma; @@ -478,10 +471,10 @@ static inline dma_addr_t carm_ref_msg_dma(struct carm_host *host, } static int carm_send_msg(struct carm_host *host, - struct carm_request *crq) + struct carm_request *crq, unsigned tag) { void __iomem *mmio = host->mmio; - u32 msg = (u32) carm_ref_msg_dma(host, crq->tag); + u32 msg = (u32) carm_ref_msg_dma(host, tag); u32 cm_bucket = crq->msg_bucket; u32 tmp; int rc = 0; @@ -506,99 +499,24 @@ static int carm_send_msg(struct carm_host *host, return rc; } -static struct carm_request *carm_get_request(struct carm_host *host) -{ - unsigned int i; - - /* obey global hardware limit on S/G entries */ - if (host->hw_sg_used >= (CARM_MAX_HOST_SG - CARM_MAX_REQ_SG)) - return NULL; - - for (i = 0; i < max_queue; i++) - if ((host->msg_alloc & (1ULL << i)) == 0) { - struct carm_request *crq = &host->req[i]; - crq->port = NULL; - crq->n_elem = 0; - - host->msg_alloc |= (1ULL << i); - host->n_msgs++; - - assert(host->n_msgs <= CARM_MAX_REQ); - sg_init_table(crq->sg, CARM_MAX_REQ_SG); - return crq; - } - - DPRINTK("no request available, returning NULL\n"); - return NULL; -} - -static int carm_put_request(struct carm_host *host, struct carm_request *crq) -{ - assert(crq->tag < max_queue); - - if (unlikely((host->msg_alloc & (1ULL << crq->tag)) == 0)) - return -EINVAL; /* tried to clear a tag that was not active */ - - assert(host->hw_sg_used >= crq->n_elem); - - host->msg_alloc &= ~(1ULL << crq->tag); - host->hw_sg_used -= crq->n_elem; - host->n_msgs--; - - return 0; -} - -static struct carm_request *carm_get_special(struct carm_host *host) -{ - unsigned long flags; - struct carm_request *crq = NULL; - struct request *rq; - int tries = 5000; - - while (tries-- > 0) { - spin_lock_irqsave(&host->lock, flags); - crq = carm_get_request(host); - spin_unlock_irqrestore(&host->lock, flags); - - if (crq) - break; - msleep(10); - } - - if (!crq) - return NULL; - - rq = blk_get_request(host->oob_q, REQ_OP_DRV_OUT, 0); - if (IS_ERR(rq)) { - spin_lock_irqsave(&host->lock, flags); - carm_put_request(host, crq); - spin_unlock_irqrestore(&host->lock, flags); - return NULL; - } - - crq->rq = rq; - return crq; -} - static int carm_array_info (struct carm_host *host, unsigned int array_idx) { struct carm_msg_ioctl *ioc; - unsigned int idx; u32 msg_data; dma_addr_t msg_dma; struct carm_request *crq; + struct request *rq; int rc; - crq = carm_get_special(host); - if (!crq) { + rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0); + if (IS_ERR(rq)) { rc = -ENOMEM; goto err_out; } + crq = blk_mq_rq_to_pdu(rq); - idx = crq->tag; - - ioc = carm_ref_msg(host, idx); - msg_dma = carm_ref_msg_dma(host, idx); + ioc = carm_ref_msg(host, rq->tag); + msg_dma = carm_ref_msg_dma(host, rq->tag); msg_data = (u32) (msg_dma + sizeof(struct carm_array_info)); crq->msg_type = CARM_MSG_ARRAY; @@ -612,7 +530,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) ioc->type = CARM_MSG_ARRAY; ioc->subtype = CARM_ARRAY_INFO; ioc->array_id = (u8) array_idx; - ioc->handle = cpu_to_le32(TAG_ENCODE(idx)); + ioc->handle = cpu_to_le32(TAG_ENCODE(rq->tag)); ioc->data_addr = cpu_to_le32(msg_data); spin_lock_irq(&host->lock); @@ -620,9 +538,8 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) host->state == HST_DEV_SCAN); spin_unlock_irq(&host->lock); - DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); - crq->rq->special = crq; - blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); + DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); + blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL); return 0; @@ -637,21 +554,21 @@ typedef unsigned int (*carm_sspc_t)(struct carm_host *, unsigned int, void *); static int carm_send_special (struct carm_host *host, carm_sspc_t func) { + struct request *rq; struct carm_request *crq; struct carm_msg_ioctl *ioc; void *mem; - unsigned int idx, msg_size; + unsigned int msg_size; int rc; - crq = carm_get_special(host); - if (!crq) + rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0); + if (IS_ERR(rq)) return -ENOMEM; + crq = blk_mq_rq_to_pdu(rq); - idx = crq->tag; + mem = carm_ref_msg(host, rq->tag); - mem = carm_ref_msg(host, idx); - - msg_size = func(host, idx, mem); + msg_size = func(host, rq->tag, mem); ioc = mem; crq->msg_type = ioc->type; @@ -660,9 +577,8 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) BUG_ON(rc < 0); crq->msg_bucket = (u32) rc; - DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); - crq->rq->special = crq; - blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); + DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); + blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL); return 0; } @@ -744,19 +660,6 @@ static unsigned int carm_fill_get_fw_ver(struct carm_host *host, sizeof(struct carm_fw_ver); } -static inline void carm_end_request_queued(struct carm_host *host, - struct carm_request *crq, - blk_status_t error) -{ - struct request *req = crq->rq; - int rc; - - blk_mq_end_request(req, error); - - rc = carm_put_request(host, crq); - assert(rc == 0); -} - static inline void carm_push_q (struct carm_host *host, struct request_queue *q) { unsigned int idx = host->wait_q_prod % CARM_MAX_WAIT_Q; @@ -791,101 +694,50 @@ static inline void carm_round_robin(struct carm_host *host) } } -static inline void carm_end_rq(struct carm_host *host, struct carm_request *crq, - blk_status_t error) +static inline enum dma_data_direction carm_rq_dir(struct request *rq) { - carm_end_request_queued(host, crq, error); - if (max_queue == 1) - carm_round_robin(host); - else if ((host->n_msgs <= CARM_MSG_LOW_WATER) && - (host->hw_sg_used <= CARM_SG_LOW_WATER)) { - carm_round_robin(host); - } -} - -static blk_status_t carm_oob_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) -{ - struct request_queue *q = hctx->queue; - struct carm_host *host = q->queuedata; - struct carm_request *crq; - int rc; - - blk_mq_start_request(bd->rq); - - spin_lock_irq(&host->lock); - - crq = bd->rq->special; - assert(crq != NULL); - assert(crq->rq == bd->rq); - - crq->n_elem = 0; - - DPRINTK("send req\n"); - rc = carm_send_msg(host, crq); - if (rc) { - carm_push_q(host, q); - spin_unlock_irq(&host->lock); - return BLK_STS_DEV_RESOURCE; - } - - spin_unlock_irq(&host->lock); - return BLK_STS_OK; + return op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; } static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request_queue *q = hctx->queue; + struct request *rq = bd->rq; struct carm_port *port = q->queuedata; struct carm_host *host = port->host; + struct carm_request *crq = blk_mq_rq_to_pdu(rq); struct carm_msg_rw *msg; - struct carm_request *crq; - struct request *rq = bd->rq; struct scatterlist *sg; - int writing = 0, pci_dir, i, n_elem, rc; - u32 tmp; + int i, n_elem = 0, rc; unsigned int msg_size; + u32 tmp; + + crq->n_elem = 0; + sg_init_table(crq->sg, CARM_MAX_REQ_SG); blk_mq_start_request(rq); spin_lock_irq(&host->lock); - - crq = carm_get_request(host); - if (!crq) { - carm_push_q(host, q); - spin_unlock_irq(&host->lock); - return BLK_STS_DEV_RESOURCE; - } - crq->rq = rq; - - if (rq_data_dir(rq) == WRITE) { - writing = 1; - pci_dir = DMA_TO_DEVICE; - } else { - pci_dir = DMA_FROM_DEVICE; - } + if (req_op(rq) == REQ_OP_DRV_OUT) + goto send_msg; /* get scatterlist from block layer */ sg = &crq->sg[0]; n_elem = blk_rq_map_sg(q, rq, sg); - if (n_elem <= 0) { - /* request with no s/g entries? */ - carm_end_rq(host, crq, BLK_STS_IOERR); - spin_unlock_irq(&host->lock); - return BLK_STS_IOERR; - } + if (n_elem <= 0) + goto out_ioerr; /* map scatterlist to PCI bus addresses */ - n_elem = dma_map_sg(&host->pdev->dev, sg, n_elem, pci_dir); - if (n_elem <= 0) { - /* request with no s/g entries? */ - carm_end_rq(host, crq, BLK_STS_IOERR); - spin_unlock_irq(&host->lock); - return BLK_STS_IOERR; - } + n_elem = dma_map_sg(&host->pdev->dev, sg, n_elem, carm_rq_dir(rq)); + if (n_elem <= 0) + goto out_ioerr; + + /* obey global hardware limit on S/G entries */ + if (host->hw_sg_used >= CARM_MAX_HOST_SG - n_elem) + goto out_resource; + crq->n_elem = n_elem; - crq->port = port; host->hw_sg_used += n_elem; /* @@ -893,9 +745,9 @@ static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx, */ VPRINTK("build msg\n"); - msg = (struct carm_msg_rw *) carm_ref_msg(host, crq->tag); + msg = (struct carm_msg_rw *) carm_ref_msg(host, rq->tag); - if (writing) { + if (rq_data_dir(rq) == WRITE) { msg->type = CARM_MSG_WRITE; crq->msg_type = CARM_MSG_WRITE; } else { @@ -906,7 +758,7 @@ static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx, msg->id = port->port_no; msg->sg_count = n_elem; msg->sg_type = SGT_32BIT; - msg->handle = cpu_to_le32(TAG_ENCODE(crq->tag)); + msg->handle = cpu_to_le32(TAG_ENCODE(rq->tag)); msg->lba = cpu_to_le32(blk_rq_pos(rq) & 0xffffffff); tmp = (blk_rq_pos(rq) >> 16) >> 16; msg->lba_high = cpu_to_le16( (u16) tmp ); @@ -923,22 +775,28 @@ static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx, rc = carm_lookup_bucket(msg_size); BUG_ON(rc < 0); crq->msg_bucket = (u32) rc; - +send_msg: /* * queue read/write message to hardware */ - - VPRINTK("send msg, tag == %u\n", crq->tag); - rc = carm_send_msg(host, crq); + VPRINTK("send msg, tag == %u\n", rq->tag); + rc = carm_send_msg(host, crq, rq->tag); if (rc) { - carm_put_request(host, crq); - carm_push_q(host, q); - spin_unlock_irq(&host->lock); - return BLK_STS_DEV_RESOURCE; + host->hw_sg_used -= n_elem; + goto out_resource; } spin_unlock_irq(&host->lock); return BLK_STS_OK; +out_resource: + dma_unmap_sg(&host->pdev->dev, &crq->sg[0], n_elem, carm_rq_dir(rq)); + carm_push_q(host, q); + spin_unlock_irq(&host->lock); + return BLK_STS_DEV_RESOURCE; +out_ioerr: + carm_round_robin(host); + spin_unlock_irq(&host->lock); + return BLK_STS_IOERR; } static void carm_handle_array_info(struct carm_host *host, @@ -954,8 +812,6 @@ static void carm_handle_array_info(struct carm_host *host, DPRINTK("ENTER\n"); - carm_end_rq(host, crq, error); - if (error) goto out; if (le32_to_cpu(desc->array_status) & ARRAY_NO_EXIST) @@ -1011,8 +867,6 @@ static void carm_handle_scan_chan(struct carm_host *host, DPRINTK("ENTER\n"); - carm_end_rq(host, crq, error); - if (error) { new_state = HST_ERROR; goto out; @@ -1040,8 +894,6 @@ static void carm_handle_generic(struct carm_host *host, { DPRINTK("ENTER\n"); - carm_end_rq(host, crq, error); - assert(host->state == cur_state); if (error) host->state = HST_ERROR; @@ -1050,28 +902,12 @@ static void carm_handle_generic(struct carm_host *host, schedule_work(&host->fsm_task); } -static inline void carm_handle_rw(struct carm_host *host, - struct carm_request *crq, blk_status_t error) -{ - int pci_dir; - - VPRINTK("ENTER\n"); - - if (rq_data_dir(crq->rq) == WRITE) - pci_dir = DMA_TO_DEVICE; - else - pci_dir = DMA_FROM_DEVICE; - - dma_unmap_sg(&host->pdev->dev, &crq->sg[0], crq->n_elem, pci_dir); - - carm_end_rq(host, crq, error); -} - static inline void carm_handle_resp(struct carm_host *host, __le32 ret_handle_le, u32 status) { u32 handle = le32_to_cpu(ret_handle_le); unsigned int msg_idx; + struct request *rq; struct carm_request *crq; blk_status_t error = (status == RMSG_OK) ? 0 : BLK_STS_IOERR; u8 *mem; @@ -1087,13 +923,15 @@ static inline void carm_handle_resp(struct carm_host *host, msg_idx = TAG_DECODE(handle); VPRINTK("tag == %u\n", msg_idx); - crq = &host->req[msg_idx]; + rq = blk_mq_tag_to_rq(host->tag_set.tags[0], msg_idx); + crq = blk_mq_rq_to_pdu(rq); /* fast path */ if (likely(crq->msg_type == CARM_MSG_READ || crq->msg_type == CARM_MSG_WRITE)) { - carm_handle_rw(host, crq, error); - return; + dma_unmap_sg(&host->pdev->dev, &crq->sg[0], crq->n_elem, + carm_rq_dir(rq)); + goto done; } mem = carm_ref_msg(host, msg_idx); @@ -1103,7 +941,7 @@ static inline void carm_handle_resp(struct carm_host *host, switch (crq->msg_subtype) { case CARM_IOC_SCAN_CHAN: carm_handle_scan_chan(host, crq, mem, error); - break; + goto done; default: /* unknown / invalid response */ goto err_out; @@ -1116,11 +954,11 @@ static inline void carm_handle_resp(struct carm_host *host, case MISC_ALLOC_MEM: carm_handle_generic(host, crq, error, HST_ALLOC_BUF, HST_SYNC_TIME); - break; + goto done; case MISC_SET_TIME: carm_handle_generic(host, crq, error, HST_SYNC_TIME, HST_GET_FW_VER); - break; + goto done; case MISC_GET_FW_VER: { struct carm_fw_ver *ver = (struct carm_fw_ver *) (mem + sizeof(struct carm_msg_get_fw_ver)); @@ -1130,7 +968,7 @@ static inline void carm_handle_resp(struct carm_host *host, } carm_handle_generic(host, crq, error, HST_GET_FW_VER, HST_PORT_SCAN); - break; + goto done; } default: /* unknown / invalid response */ @@ -1161,7 +999,13 @@ static inline void carm_handle_resp(struct carm_host *host, err_out: printk(KERN_WARNING DRV_NAME "(%s): BUG: unhandled message type %d/%d\n", pci_name(host->pdev), crq->msg_type, crq->msg_subtype); - carm_end_rq(host, crq, BLK_STS_IOERR); + error = BLK_STS_IOERR; +done: + host->hw_sg_used -= crq->n_elem; + blk_mq_end_request(blk_mq_rq_from_pdu(crq), error); + + if (host->hw_sg_used <= CARM_SG_LOW_WATER) + carm_round_robin(host); } static inline void carm_handle_responses(struct carm_host *host) @@ -1491,78 +1335,56 @@ static int carm_init_host(struct carm_host *host) return 0; } -static const struct blk_mq_ops carm_oob_mq_ops = { - .queue_rq = carm_oob_queue_rq, -}; - static const struct blk_mq_ops carm_mq_ops = { .queue_rq = carm_queue_rq, }; -static int carm_init_disks(struct carm_host *host) +static int carm_init_disk(struct carm_host *host, unsigned int port_no) { - unsigned int i; - int rc = 0; + struct carm_port *port = &host->port[port_no]; + struct gendisk *disk; + struct request_queue *q; - for (i = 0; i < CARM_MAX_PORTS; i++) { - struct gendisk *disk; - struct request_queue *q; - struct carm_port *port; + port->host = host; + port->port_no = port_no; - port = &host->port[i]; - port->host = host; - port->port_no = i; + disk = alloc_disk(CARM_MINORS_PER_MAJOR); + if (!disk) + return -ENOMEM; - disk = alloc_disk(CARM_MINORS_PER_MAJOR); - if (!disk) { - rc = -ENOMEM; - break; - } + port->disk = disk; + sprintf(disk->disk_name, DRV_NAME "/%u", + (unsigned int)host->id * CARM_MAX_PORTS + port_no); + disk->major = host->major; + disk->first_minor = port_no * CARM_MINORS_PER_MAJOR; + disk->fops = &carm_bd_ops; + disk->private_data = port; - port->disk = disk; - sprintf(disk->disk_name, DRV_NAME "/%u", - (unsigned int) (host->id * CARM_MAX_PORTS) + i); - disk->major = host->major; - disk->first_minor = i * CARM_MINORS_PER_MAJOR; - disk->fops = &carm_bd_ops; - disk->private_data = port; + q = blk_mq_init_queue(&host->tag_set); + if (IS_ERR(q)) + return PTR_ERR(q); - q = blk_mq_init_sq_queue(&port->tag_set, &carm_mq_ops, - max_queue, BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(q)) { - rc = PTR_ERR(q); - break; - } - disk->queue = q; - blk_queue_max_segments(q, CARM_MAX_REQ_SG); - blk_queue_segment_boundary(q, CARM_SG_BOUNDARY); + blk_queue_max_segments(q, CARM_MAX_REQ_SG); + blk_queue_segment_boundary(q, CARM_SG_BOUNDARY); - q->queuedata = port; - } - - return rc; + q->queuedata = port; + disk->queue = q; + return 0; } -static void carm_free_disks(struct carm_host *host) +static void carm_free_disk(struct carm_host *host, unsigned int port_no) { - unsigned int i; + struct carm_port *port = &host->port[port_no]; + struct gendisk *disk = port->disk; - for (i = 0; i < CARM_MAX_PORTS; i++) { - struct carm_port *port = &host->port[i]; - struct gendisk *disk = port->disk; + if (!disk) + return; - if (disk) { - struct request_queue *q = disk->queue; - - if (disk->flags & GENHD_FL_UP) - del_gendisk(disk); - if (q) { - blk_mq_free_tag_set(&port->tag_set); - blk_cleanup_queue(q); - } - put_disk(disk); - } - } + if (disk->flags & GENHD_FL_UP) + del_gendisk(disk); + if (disk->queue) + blk_cleanup_queue(disk->queue); + put_disk(disk); } static int carm_init_shm(struct carm_host *host) @@ -1618,9 +1440,6 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) INIT_WORK(&host->fsm_task, carm_fsm_task); init_completion(&host->probe_comp); - for (i = 0; i < ARRAY_SIZE(host->req); i++) - host->req[i].tag = i; - host->mmio = ioremap(pci_resource_start(pdev, 0), pci_resource_len(pdev, 0)); if (!host->mmio) { @@ -1637,14 +1456,26 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) goto err_out_iounmap; } - q = blk_mq_init_sq_queue(&host->tag_set, &carm_oob_mq_ops, 1, - BLK_MQ_F_NO_SCHED); + memset(&host->tag_set, 0, sizeof(host->tag_set)); + host->tag_set.ops = &carm_mq_ops; + host->tag_set.cmd_size = sizeof(struct carm_request); + host->tag_set.nr_hw_queues = 1; + host->tag_set.nr_maps = 1; + host->tag_set.queue_depth = max_queue; + host->tag_set.numa_node = NUMA_NO_NODE; + host->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + + rc = blk_mq_alloc_tag_set(&host->tag_set); + if (rc) + goto err_out_dma_free; + + q = blk_mq_init_queue(&host->tag_set); if (IS_ERR(q)) { - printk(KERN_ERR DRV_NAME "(%s): OOB queue alloc failure\n", - pci_name(pdev)); rc = PTR_ERR(q); + blk_mq_free_tag_set(&host->tag_set); goto err_out_dma_free; } + host->oob_q = q; q->queuedata = host; @@ -1667,9 +1498,11 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) if (host->flags & FL_DYN_MAJOR) host->major = rc; - rc = carm_init_disks(host); - if (rc) - goto err_out_blkdev_disks; + for (i = 0; i < CARM_MAX_PORTS; i++) { + rc = carm_init_disk(host, i); + if (rc) + goto err_out_blkdev_disks; + } pci_set_master(pdev); @@ -1699,7 +1532,8 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) err_out_free_irq: free_irq(pdev->irq, host); err_out_blkdev_disks: - carm_free_disks(host); + for (i = 0; i < CARM_MAX_PORTS; i++) + carm_free_disk(host, i); unregister_blkdev(host->major, host->name); err_out_free_majors: if (host->major == 160) @@ -1724,6 +1558,7 @@ err_out: static void carm_remove_one (struct pci_dev *pdev) { struct carm_host *host = pci_get_drvdata(pdev); + unsigned int i; if (!host) { printk(KERN_ERR PFX "BUG: no host data for PCI(%s)\n", @@ -1732,7 +1567,8 @@ static void carm_remove_one (struct pci_dev *pdev) } free_irq(pdev->irq, host); - carm_free_disks(host); + for (i = 0; i < CARM_MAX_PORTS; i++) + carm_free_disk(host, i); unregister_blkdev(host->major, host->name); if (host->major == 160) clear_bit(0, &carm_major_alloc); diff --git a/drivers/block/umem.c b/drivers/block/umem.c index be3e3ab79950..aa035cf8a51d 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -888,8 +888,7 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) card->biotail = &card->bio; spin_lock_init(&card->lock); - card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, - &card->lock); + card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); if (!card->queue) goto failed_alloc; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 086c6bb12baa..912c4265e592 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -214,6 +214,20 @@ static void virtblk_done(struct virtqueue *vq) spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } +static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct virtio_blk *vblk = hctx->queue->queuedata; + struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num]; + bool kick; + + spin_lock_irq(&vq->lock); + kick = virtqueue_kick_prepare(vq->vq); + spin_unlock_irq(&vq->lock); + + if (kick) + virtqueue_notify(vq->vq); +} + static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -624,7 +638,7 @@ static int virtblk_map_queues(struct blk_mq_tag_set *set) { struct virtio_blk *vblk = set->driver_data; - return blk_mq_virtio_map_queues(set, vblk->vdev, 0); + return blk_mq_virtio_map_queues(&set->map[0], vblk->vdev, 0); } #ifdef CONFIG_VIRTIO_BLK_SCSI @@ -638,6 +652,7 @@ static void virtblk_initialize_rq(struct request *req) static const struct blk_mq_ops virtio_mq_ops = { .queue_rq = virtio_queue_rq, + .commit_rqs = virtio_commit_rqs, .complete = virtblk_request_done, .init_request = virtblk_init_request, #ifdef CONFIG_VIRTIO_BLK_SCSI diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 8b2b72b93885..da58020a144e 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -94,7 +94,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_MISC; - rq->special = (char *)pc; + ide_req(rq)->special = pc; if (buf && bufflen) { error = blk_rq_map_kern(drive->queue, rq, buf, bufflen, @@ -172,8 +172,8 @@ EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd); void ide_prep_sense(ide_drive_t *drive, struct request *rq) { struct request_sense *sense = &drive->sense_data; - struct request *sense_rq = drive->sense_rq; - struct scsi_request *req = scsi_req(sense_rq); + struct request *sense_rq; + struct scsi_request *req; unsigned int cmd_len, sense_len; int err; @@ -196,9 +196,16 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) if (ata_sense_request(rq) || drive->sense_rq_armed) return; + sense_rq = drive->sense_rq; + if (!sense_rq) { + sense_rq = blk_mq_alloc_request(drive->queue, REQ_OP_DRV_IN, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + drive->sense_rq = sense_rq; + } + req = scsi_req(sense_rq); + memset(sense, 0, sizeof(*sense)); - blk_rq_init(rq->q, sense_rq); scsi_req_init(req); err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len, @@ -207,6 +214,8 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) if (printk_ratelimit()) printk(KERN_WARNING PFX "%s: failed to map sense " "buffer\n", drive->name); + blk_mq_free_request(sense_rq); + drive->sense_rq = NULL; return; } @@ -226,6 +235,8 @@ EXPORT_SYMBOL_GPL(ide_prep_sense); int ide_queue_sense_rq(ide_drive_t *drive, void *special) { + struct request *sense_rq = drive->sense_rq; + /* deferred failure from ide_prep_sense() */ if (!drive->sense_rq_armed) { printk(KERN_WARNING PFX "%s: error queuing a sense request\n", @@ -233,12 +244,12 @@ int ide_queue_sense_rq(ide_drive_t *drive, void *special) return -ENOMEM; } - drive->sense_rq->special = special; + ide_req(sense_rq)->special = special; drive->sense_rq_armed = false; drive->hwif->rq = NULL; - elv_add_request(drive->queue, drive->sense_rq, ELEVATOR_INSERT_FRONT); + ide_insert_request_head(drive, sense_rq); return 0; } EXPORT_SYMBOL_GPL(ide_queue_sense_rq); @@ -270,10 +281,8 @@ void ide_retry_pc(ide_drive_t *drive) */ drive->hwif->rq = NULL; ide_requeue_and_plug(drive, failed_rq); - if (ide_queue_sense_rq(drive, pc)) { - blk_start_request(failed_rq); + if (ide_queue_sense_rq(drive, pc)) ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(failed_rq)); - } } EXPORT_SYMBOL_GPL(ide_retry_pc); diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index f9b59d41813f..1f03884a6808 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -211,12 +211,12 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive, static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) { /* - * For ATA_PRIV_SENSE, "rq->special" points to the original + * For ATA_PRIV_SENSE, "ide_req(rq)->special" points to the original * failed request. Also, the sense data should be read * directly from rq which might be different from the original * sense buffer if it got copied during mapping. */ - struct request *failed = (struct request *)rq->special; + struct request *failed = ide_req(rq)->special; void *sense = bio_data(rq->bio); if (failed) { @@ -258,11 +258,22 @@ static int ide_cd_breathe(ide_drive_t *drive, struct request *rq) /* * take a breather */ - blk_delay_queue(drive->queue, 1); + blk_mq_requeue_request(rq, false); + blk_mq_delay_kick_requeue_list(drive->queue, 1); return 1; } } +static void ide_cd_free_sense(ide_drive_t *drive) +{ + if (!drive->sense_rq) + return; + + blk_mq_free_request(drive->sense_rq); + drive->sense_rq = NULL; + drive->sense_rq_armed = false; +} + /** * Returns: * 0: if the request should be continued. @@ -516,6 +527,82 @@ static bool ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) return false; } +/* standard prep_rq that builds 10 byte cmds */ +static bool ide_cdrom_prep_fs(struct request_queue *q, struct request *rq) +{ + int hard_sect = queue_logical_block_size(q); + long block = (long)blk_rq_pos(rq) / (hard_sect >> 9); + unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9); + struct scsi_request *req = scsi_req(rq); + + if (rq_data_dir(rq) == READ) + req->cmd[0] = GPCMD_READ_10; + else + req->cmd[0] = GPCMD_WRITE_10; + + /* + * fill in lba + */ + req->cmd[2] = (block >> 24) & 0xff; + req->cmd[3] = (block >> 16) & 0xff; + req->cmd[4] = (block >> 8) & 0xff; + req->cmd[5] = block & 0xff; + + /* + * and transfer length + */ + req->cmd[7] = (blocks >> 8) & 0xff; + req->cmd[8] = blocks & 0xff; + req->cmd_len = 10; + return true; +} + +/* + * Most of the SCSI commands are supported directly by ATAPI devices. + * This transform handles the few exceptions. + */ +static bool ide_cdrom_prep_pc(struct request *rq) +{ + u8 *c = scsi_req(rq)->cmd; + + /* transform 6-byte read/write commands to the 10-byte version */ + if (c[0] == READ_6 || c[0] == WRITE_6) { + c[8] = c[4]; + c[5] = c[3]; + c[4] = c[2]; + c[3] = c[1] & 0x1f; + c[2] = 0; + c[1] &= 0xe0; + c[0] += (READ_10 - READ_6); + scsi_req(rq)->cmd_len = 10; + return true; + } + + /* + * it's silly to pretend we understand 6-byte sense commands, just + * reject with ILLEGAL_REQUEST and the caller should take the + * appropriate action + */ + if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) { + scsi_req(rq)->result = ILLEGAL_REQUEST; + return false; + } + + return true; +} + +static bool ide_cdrom_prep_rq(ide_drive_t *drive, struct request *rq) +{ + if (!blk_rq_is_passthrough(rq)) { + scsi_req_init(scsi_req(rq)); + + return ide_cdrom_prep_fs(drive->queue, rq); + } else if (blk_rq_is_scsi(rq)) + return ide_cdrom_prep_pc(rq); + + return true; +} + static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; @@ -675,7 +762,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) out_end: if (blk_rq_is_scsi(rq) && rc == 0) { scsi_req(rq)->resid_len = 0; - blk_end_request_all(rq, BLK_STS_OK); + blk_mq_end_request(rq, BLK_STS_OK); hwif->rq = NULL; } else { if (sense && uptodate) @@ -705,6 +792,8 @@ out_end: if (sense && rc == 2) ide_error(drive, "request sense failure", stat); } + + ide_cd_free_sense(drive); return ide_stopped; } @@ -729,7 +818,7 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) * We may be retrying this request after an error. Fix up any * weirdness which might be present in the request packet. */ - q->prep_rq_fn(q, rq); + ide_cdrom_prep_rq(drive, rq); } /* fs requests *must* be hardware frame aligned */ @@ -1323,82 +1412,6 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive) return nslots; } -/* standard prep_rq_fn that builds 10 byte cmds */ -static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq) -{ - int hard_sect = queue_logical_block_size(q); - long block = (long)blk_rq_pos(rq) / (hard_sect >> 9); - unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9); - struct scsi_request *req = scsi_req(rq); - - q->initialize_rq_fn(rq); - - if (rq_data_dir(rq) == READ) - req->cmd[0] = GPCMD_READ_10; - else - req->cmd[0] = GPCMD_WRITE_10; - - /* - * fill in lba - */ - req->cmd[2] = (block >> 24) & 0xff; - req->cmd[3] = (block >> 16) & 0xff; - req->cmd[4] = (block >> 8) & 0xff; - req->cmd[5] = block & 0xff; - - /* - * and transfer length - */ - req->cmd[7] = (blocks >> 8) & 0xff; - req->cmd[8] = blocks & 0xff; - req->cmd_len = 10; - return BLKPREP_OK; -} - -/* - * Most of the SCSI commands are supported directly by ATAPI devices. - * This transform handles the few exceptions. - */ -static int ide_cdrom_prep_pc(struct request *rq) -{ - u8 *c = scsi_req(rq)->cmd; - - /* transform 6-byte read/write commands to the 10-byte version */ - if (c[0] == READ_6 || c[0] == WRITE_6) { - c[8] = c[4]; - c[5] = c[3]; - c[4] = c[2]; - c[3] = c[1] & 0x1f; - c[2] = 0; - c[1] &= 0xe0; - c[0] += (READ_10 - READ_6); - scsi_req(rq)->cmd_len = 10; - return BLKPREP_OK; - } - - /* - * it's silly to pretend we understand 6-byte sense commands, just - * reject with ILLEGAL_REQUEST and the caller should take the - * appropriate action - */ - if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) { - scsi_req(rq)->result = ILLEGAL_REQUEST; - return BLKPREP_KILL; - } - - return BLKPREP_OK; -} - -static int ide_cdrom_prep_fn(struct request_queue *q, struct request *rq) -{ - if (!blk_rq_is_passthrough(rq)) - return ide_cdrom_prep_fs(q, rq); - else if (blk_rq_is_scsi(rq)) - return ide_cdrom_prep_pc(rq); - - return 0; -} - struct cd_list_entry { const char *id_model; const char *id_firmware; @@ -1508,7 +1521,7 @@ static int ide_cdrom_setup(ide_drive_t *drive) ide_debug_log(IDE_DBG_PROBE, "enter"); - blk_queue_prep_rq(q, ide_cdrom_prep_fn); + drive->prep_rq = ide_cdrom_prep_rq; blk_queue_dma_alignment(q, 31); blk_queue_update_dma_pad(q, 15); @@ -1569,7 +1582,7 @@ static void ide_cd_release(struct device *dev) if (devinfo->handle == drive) unregister_cdrom(devinfo); drive->driver_data = NULL; - blk_queue_prep_rq(drive->queue, NULL); + drive->prep_rq = NULL; g->private_data = NULL; put_disk(g); kfree(info); diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index f4f8afdf8bbe..f2f93ed40356 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -171,7 +171,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, scsi_req(rq)->cmd_len = 5; scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC; *(int *)&scsi_req(rq)->cmd[1] = arg; - rq->special = setting->set; + ide_req(rq)->special = setting->set; blk_execute_rq(q, NULL, rq, 0); ret = scsi_req(rq)->result; @@ -182,7 +182,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq) { - int err, (*setfunc)(ide_drive_t *, int) = rq->special; + int err, (*setfunc)(ide_drive_t *, int) = ide_req(rq)->special; err = setfunc(drive, *(int *)&scsi_req(rq)->cmd[1]); if (err) diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index e3b4e659082d..197912af5c2f 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -427,16 +427,15 @@ static void ide_disk_unlock_native_capacity(ide_drive_t *drive) drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */ } -static int idedisk_prep_fn(struct request_queue *q, struct request *rq) +static bool idedisk_prep_rq(ide_drive_t *drive, struct request *rq) { - ide_drive_t *drive = q->queuedata; struct ide_cmd *cmd; if (req_op(rq) != REQ_OP_FLUSH) - return BLKPREP_OK; + return true; - if (rq->special) { - cmd = rq->special; + if (ide_req(rq)->special) { + cmd = ide_req(rq)->special; memset(cmd, 0, sizeof(*cmd)); } else { cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC); @@ -456,10 +455,10 @@ static int idedisk_prep_fn(struct request_queue *q, struct request *rq) rq->cmd_flags &= ~REQ_OP_MASK; rq->cmd_flags |= REQ_OP_DRV_OUT; ide_req(rq)->type = ATA_PRIV_TASKFILE; - rq->special = cmd; + ide_req(rq)->special = cmd; cmd->rq = rq; - return BLKPREP_OK; + return true; } ide_devset_get(multcount, mult_count); @@ -548,7 +547,7 @@ static void update_flush(ide_drive_t *drive) if (barrier) { wc = true; - blk_queue_prep_rq(drive->queue, idedisk_prep_fn); + drive->prep_rq = idedisk_prep_rq; } } diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 47d5f3379748..e1323e058454 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -125,7 +125,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat) /* retry only "normal" I/O: */ if (blk_rq_is_passthrough(rq)) { if (ata_taskfile_request(rq)) { - struct ide_cmd *cmd = rq->special; + struct ide_cmd *cmd = ide_req(rq)->special; if (cmd) ide_complete_cmd(drive, cmd, stat, err); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index a8df300f949c..780d33ccc5d8 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -276,7 +276,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, switch (ide_req(rq)->type) { case ATA_PRIV_MISC: case ATA_PRIV_SENSE: - pc = (struct ide_atapi_pc *)rq->special; + pc = (struct ide_atapi_pc *)ide_req(rq)->special; break; default: BUG(); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 0d93e0cfbeaf..8445b484ae69 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -67,7 +67,15 @@ int ide_end_rq(ide_drive_t *drive, struct request *rq, blk_status_t error, ide_dma_on(drive); } - return blk_end_request(rq, error, nr_bytes); + if (!blk_update_request(rq, error, nr_bytes)) { + if (rq == drive->sense_rq) + drive->sense_rq = NULL; + + __blk_mq_end_request(rq, error); + return 0; + } + + return 1; } EXPORT_SYMBOL_GPL(ide_end_rq); @@ -103,7 +111,7 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) } if (rq && ata_taskfile_request(rq)) { - struct ide_cmd *orig_cmd = rq->special; + struct ide_cmd *orig_cmd = ide_req(rq)->special; if (cmd->tf_flags & IDE_TFLAG_DYN) kfree(orig_cmd); @@ -253,7 +261,7 @@ EXPORT_SYMBOL_GPL(ide_init_sg_cmd); static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, struct request *rq) { - struct ide_cmd *cmd = rq->special; + struct ide_cmd *cmd = ide_req(rq)->special; if (cmd) { if (cmd->protocol == ATA_PROT_PIO) { @@ -307,8 +315,6 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) { ide_startstop_t startstop; - BUG_ON(!(rq->rq_flags & RQF_STARTED)); - #ifdef DEBUG printk("%s: start_request: current=0x%08lx\n", drive->hwif->name, (unsigned long) rq); @@ -320,6 +326,9 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) goto kill_rq; } + if (drive->prep_rq && !drive->prep_rq(drive, rq)) + return ide_stopped; + if (ata_pm_request(rq)) ide_check_pm_state(drive, rq); @@ -343,7 +352,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) if (ata_taskfile_request(rq)) return execute_drive_cmd(drive, rq); else if (ata_pm_request(rq)) { - struct ide_pm_state *pm = rq->special; + struct ide_pm_state *pm = ide_req(rq)->special; #ifdef DEBUG_PM printk("%s: start_power_step(step: %d)\n", drive->name, pm->pm_step); @@ -430,44 +439,42 @@ static inline void ide_unlock_host(struct ide_host *host) } } -static void __ide_requeue_and_plug(struct request_queue *q, struct request *rq) -{ - if (rq) - blk_requeue_request(q, rq); - if (rq || blk_peek_request(q)) { - /* Use 3ms as that was the old plug delay */ - blk_delay_queue(q, 3); - } -} - void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq) { struct request_queue *q = drive->queue; - unsigned long flags; - spin_lock_irqsave(q->queue_lock, flags); - __ide_requeue_and_plug(q, rq); - spin_unlock_irqrestore(q->queue_lock, flags); + /* Use 3ms as that was the old plug delay */ + if (rq) { + blk_mq_requeue_request(rq, false); + blk_mq_delay_kick_requeue_list(q, 3); + } else + blk_mq_delay_run_hw_queue(q->queue_hw_ctx[0], 3); } /* * Issue a new request to a device. */ -void do_ide_request(struct request_queue *q) +blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - ide_drive_t *drive = q->queuedata; + ide_drive_t *drive = hctx->queue->queuedata; ide_hwif_t *hwif = drive->hwif; struct ide_host *host = hwif->host; - struct request *rq = NULL; + struct request *rq = bd->rq; ide_startstop_t startstop; - spin_unlock_irq(q->queue_lock); + if (!blk_rq_is_passthrough(rq) && !(rq->rq_flags & RQF_DONTPREP)) { + rq->rq_flags |= RQF_DONTPREP; + ide_req(rq)->special = NULL; + } /* HLD do_request() callback might sleep, make sure it's okay */ might_sleep(); if (ide_lock_host(host, hwif)) - goto plug_device_2; + return BLK_STS_DEV_RESOURCE; + + blk_mq_start_request(rq); spin_lock_irq(&hwif->lock); @@ -503,21 +510,16 @@ repeat: hwif->cur_dev = drive; drive->dev_flags &= ~(IDE_DFLAG_SLEEPING | IDE_DFLAG_PARKED); - spin_unlock_irq(&hwif->lock); - spin_lock_irq(q->queue_lock); /* * we know that the queue isn't empty, but this can happen - * if the q->prep_rq_fn() decides to kill a request + * if ->prep_rq() decides to kill a request */ - if (!rq) - rq = blk_fetch_request(drive->queue); - - spin_unlock_irq(q->queue_lock); - spin_lock_irq(&hwif->lock); - if (!rq) { - ide_unlock_port(hwif); - goto out; + rq = bd->rq; + if (!rq) { + ide_unlock_port(hwif); + goto out; + } } /* @@ -551,23 +553,24 @@ repeat: if (startstop == ide_stopped) { rq = hwif->rq; hwif->rq = NULL; - goto repeat; + if (rq) + goto repeat; + ide_unlock_port(hwif); + goto out; } - } else - goto plug_device; + } else { +plug_device: + spin_unlock_irq(&hwif->lock); + ide_unlock_host(host); + ide_requeue_and_plug(drive, rq); + return BLK_STS_OK; + } + out: spin_unlock_irq(&hwif->lock); if (rq == NULL) ide_unlock_host(host); - spin_lock_irq(q->queue_lock); - return; - -plug_device: - spin_unlock_irq(&hwif->lock); - ide_unlock_host(host); -plug_device_2: - spin_lock_irq(q->queue_lock); - __ide_requeue_and_plug(q, rq); + return BLK_STS_OK; } static int drive_is_ready(ide_drive_t *drive) @@ -887,3 +890,16 @@ void ide_pad_transfer(ide_drive_t *drive, int write, int len) } } EXPORT_SYMBOL_GPL(ide_pad_transfer); + +void ide_insert_request_head(ide_drive_t *drive, struct request *rq) +{ + ide_hwif_t *hwif = drive->hwif; + unsigned long flags; + + spin_lock_irqsave(&hwif->lock, flags); + list_add_tail(&rq->queuelist, &drive->rq_list); + spin_unlock_irqrestore(&hwif->lock, flags); + + kblockd_schedule_work(&drive->rq_work); +} +EXPORT_SYMBOL_GPL(ide_insert_request_head); diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index 622f0edb3945..102aa3bc3e7f 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -27,7 +27,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) spin_unlock_irq(&hwif->lock); if (start_queue) - blk_run_queue(q); + blk_mq_run_hw_queues(q, true); return; } spin_unlock_irq(&hwif->lock); @@ -36,7 +36,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) scsi_req(rq)->cmd[0] = REQ_PARK_HEADS; scsi_req(rq)->cmd_len = 1; ide_req(rq)->type = ATA_PRIV_MISC; - rq->special = &timeout; + ide_req(rq)->special = &timeout; blk_execute_rq(q, NULL, rq, 1); rc = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); @@ -54,7 +54,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) scsi_req(rq)->cmd[0] = REQ_UNPARK_HEADS; scsi_req(rq)->cmd_len = 1; ide_req(rq)->type = ATA_PRIV_MISC; - elv_add_request(q, rq, ELEVATOR_INSERT_FRONT); + ide_insert_request_head(drive, rq); out: return; @@ -67,7 +67,7 @@ ide_startstop_t ide_do_park_unpark(ide_drive_t *drive, struct request *rq) memset(&cmd, 0, sizeof(cmd)); if (scsi_req(rq)->cmd[0] == REQ_PARK_HEADS) { - drive->sleep = *(unsigned long *)rq->special; + drive->sleep = *(unsigned long *)ide_req(rq)->special; drive->dev_flags |= IDE_DFLAG_SLEEPING; tf->command = ATA_CMD_IDLEIMMEDIATE; tf->feature = 0x44; diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 59217aa1d1fb..192e6c65d34e 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -21,7 +21,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) memset(&rqpm, 0, sizeof(rqpm)); rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_PM_SUSPEND; - rq->special = &rqpm; + ide_req(rq)->special = &rqpm; rqpm.pm_step = IDE_PM_START_SUSPEND; if (mesg.event == PM_EVENT_PRETHAW) mesg.event = PM_EVENT_FREEZE; @@ -40,32 +40,17 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) return ret; } -static void ide_end_sync_rq(struct request *rq, blk_status_t error) -{ - complete(rq->end_io_data); -} - static int ide_pm_execute_rq(struct request *rq) { struct request_queue *q = rq->q; - DECLARE_COMPLETION_ONSTACK(wait); - rq->end_io_data = &wait; - rq->end_io = ide_end_sync_rq; - - spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { rq->rq_flags |= RQF_QUIET; scsi_req(rq)->result = -ENXIO; - __blk_end_request_all(rq, BLK_STS_OK); - spin_unlock_irq(q->queue_lock); + blk_mq_end_request(rq, BLK_STS_OK); return -ENXIO; } - __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT); - __blk_run_queue_uncond(q); - spin_unlock_irq(q->queue_lock); - - wait_for_completion_io(&wait); + blk_execute_rq(q, NULL, rq, true); return scsi_req(rq)->result ? -EIO : 0; } @@ -79,6 +64,8 @@ int generic_ide_resume(struct device *dev) struct ide_pm_state rqpm; int err; + blk_mq_start_stopped_hw_queues(drive->queue, true); + if (ide_port_acpi(hwif)) { /* call ACPI _PS0 / _STM only once */ if ((drive->dn & 1) == 0 || pair == NULL) { @@ -92,7 +79,7 @@ int generic_ide_resume(struct device *dev) memset(&rqpm, 0, sizeof(rqpm)); rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT); ide_req(rq)->type = ATA_PRIV_PM_RESUME; - rq->special = &rqpm; + ide_req(rq)->special = &rqpm; rqpm.pm_step = IDE_PM_START_RESUME; rqpm.pm_state = PM_EVENT_ON; @@ -111,7 +98,7 @@ int generic_ide_resume(struct device *dev) void ide_complete_power_step(ide_drive_t *drive, struct request *rq) { - struct ide_pm_state *pm = rq->special; + struct ide_pm_state *pm = ide_req(rq)->special; #ifdef DEBUG_PM printk(KERN_INFO "%s: complete_power_step(step: %d)\n", @@ -141,7 +128,7 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq) ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) { - struct ide_pm_state *pm = rq->special; + struct ide_pm_state *pm = ide_req(rq)->special; struct ide_cmd cmd = { }; switch (pm->pm_step) { @@ -213,8 +200,7 @@ out_do_tf: void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) { struct request_queue *q = drive->queue; - struct ide_pm_state *pm = rq->special; - unsigned long flags; + struct ide_pm_state *pm = ide_req(rq)->special; ide_complete_power_step(drive, rq); if (pm->pm_step != IDE_PM_COMPLETED) @@ -224,22 +210,19 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) printk("%s: completing PM request, %s\n", drive->name, (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND) ? "suspend" : "resume"); #endif - spin_lock_irqsave(q->queue_lock, flags); if (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND) - blk_stop_queue(q); + blk_mq_stop_hw_queues(q); else drive->dev_flags &= ~IDE_DFLAG_BLOCKED; - spin_unlock_irqrestore(q->queue_lock, flags); drive->hwif->rq = NULL; - if (blk_end_request(rq, BLK_STS_OK, 0)) - BUG(); + blk_mq_end_request(rq, BLK_STS_OK); } void ide_check_pm_state(ide_drive_t *drive, struct request *rq) { - struct ide_pm_state *pm = rq->special; + struct ide_pm_state *pm = ide_req(rq)->special; if (blk_rq_is_private(rq) && ide_req(rq)->type == ATA_PRIV_PM_SUSPEND && @@ -260,7 +243,6 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq) ide_hwif_t *hwif = drive->hwif; const struct ide_tp_ops *tp_ops = hwif->tp_ops; struct request_queue *q = drive->queue; - unsigned long flags; int rc; #ifdef DEBUG_PM printk("%s: Wakeup request inited, waiting for !BSY...\n", drive->name); @@ -274,8 +256,6 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq) if (rc) printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name); - spin_lock_irqsave(q->queue_lock, flags); - blk_start_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); + blk_mq_start_hw_queues(q); } } diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 3b75a7b7a284..63627be0811a 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -746,10 +746,16 @@ static void ide_initialize_rq(struct request *rq) { struct ide_request *req = blk_mq_rq_to_pdu(rq); + req->special = NULL; scsi_req_init(&req->sreq); req->sreq.sense = req->sense; } +static const struct blk_mq_ops ide_mq_ops = { + .queue_rq = ide_queue_rq, + .initialize_rq_fn = ide_initialize_rq, +}; + /* * init request queue */ @@ -759,6 +765,7 @@ static int ide_init_queue(ide_drive_t *drive) ide_hwif_t *hwif = drive->hwif; int max_sectors = 256; int max_sg_entries = PRD_ENTRIES; + struct blk_mq_tag_set *set; /* * Our default set up assumes the normal IDE case, @@ -767,19 +774,26 @@ static int ide_init_queue(ide_drive_t *drive) * limits and LBA48 we could raise it but as yet * do not. */ - q = blk_alloc_queue_node(GFP_KERNEL, hwif_to_node(hwif), NULL); - if (!q) + + set = &drive->tag_set; + set->ops = &ide_mq_ops; + set->nr_hw_queues = 1; + set->queue_depth = 32; + set->reserved_tags = 1; + set->cmd_size = sizeof(struct ide_request); + set->numa_node = hwif_to_node(hwif); + set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; + if (blk_mq_alloc_tag_set(set)) return 1; - q->request_fn = do_ide_request; - q->initialize_rq_fn = ide_initialize_rq; - q->cmd_size = sizeof(struct ide_request); - blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q); - if (blk_init_allocated_queue(q) < 0) { - blk_cleanup_queue(q); + q = blk_mq_init_queue(set); + if (IS_ERR(q)) { + blk_mq_free_tag_set(set); return 1; } + blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q); + q->queuedata = drive; blk_queue_segment_boundary(q, 0xffff); @@ -965,8 +979,12 @@ static void drive_release_dev (struct device *dev) ide_proc_unregister_device(drive); + if (drive->sense_rq) + blk_mq_free_request(drive->sense_rq); + blk_cleanup_queue(drive->queue); drive->queue = NULL; + blk_mq_free_tag_set(&drive->tag_set); drive->dev_flags &= ~IDE_DFLAG_PRESENT; @@ -1133,6 +1151,28 @@ static void ide_port_cable_detect(ide_hwif_t *hwif) } } +/* + * Deferred request list insertion handler + */ +static void drive_rq_insert_work(struct work_struct *work) +{ + ide_drive_t *drive = container_of(work, ide_drive_t, rq_work); + ide_hwif_t *hwif = drive->hwif; + struct request *rq; + LIST_HEAD(list); + + spin_lock_irq(&hwif->lock); + if (!list_empty(&drive->rq_list)) + list_splice_init(&drive->rq_list, &list); + spin_unlock_irq(&hwif->lock); + + while (!list_empty(&list)) { + rq = list_first_entry(&list, struct request, queuelist); + list_del_init(&rq->queuelist); + blk_execute_rq_nowait(drive->queue, rq->rq_disk, rq, true, NULL); + } +} + static const u8 ide_hwif_to_major[] = { IDE0_MAJOR, IDE1_MAJOR, IDE2_MAJOR, IDE3_MAJOR, IDE4_MAJOR, IDE5_MAJOR, IDE6_MAJOR, IDE7_MAJOR, IDE8_MAJOR, IDE9_MAJOR }; @@ -1145,12 +1185,10 @@ static void ide_port_init_devices_data(ide_hwif_t *hwif) ide_port_for_each_dev(i, drive, hwif) { u8 j = (hwif->index * MAX_DRIVES) + i; u16 *saved_id = drive->id; - struct request *saved_sense_rq = drive->sense_rq; memset(drive, 0, sizeof(*drive)); memset(saved_id, 0, SECTOR_SIZE); drive->id = saved_id; - drive->sense_rq = saved_sense_rq; drive->media = ide_disk; drive->select = (i << 4) | ATA_DEVICE_OBS; @@ -1166,6 +1204,9 @@ static void ide_port_init_devices_data(ide_hwif_t *hwif) INIT_LIST_HEAD(&drive->list); init_completion(&drive->gendev_rel_comp); + + INIT_WORK(&drive->rq_work, drive_rq_insert_work); + INIT_LIST_HEAD(&drive->rq_list); } } @@ -1255,7 +1296,6 @@ static void ide_port_free_devices(ide_hwif_t *hwif) int i; ide_port_for_each_dev(i, drive, hwif) { - kfree(drive->sense_rq); kfree(drive->id); kfree(drive); } @@ -1283,17 +1323,10 @@ static int ide_port_alloc_devices(ide_hwif_t *hwif, int node) if (drive->id == NULL) goto out_free_drive; - drive->sense_rq = kmalloc(sizeof(struct request) + - sizeof(struct ide_request), GFP_KERNEL); - if (!drive->sense_rq) - goto out_free_id; - hwif->devices[i] = drive; } return 0; -out_free_id: - kfree(drive->id); out_free_drive: kfree(drive); out_nomem: diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 34c1165226a4..db1a65f4b490 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -639,7 +639,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, goto out; } if (req->cmd[13] & REQ_IDETAPE_PC1) { - pc = (struct ide_atapi_pc *)rq->special; + pc = (struct ide_atapi_pc *)ide_req(rq)->special; req->cmd[13] &= ~(REQ_IDETAPE_PC1); req->cmd[13] |= REQ_IDETAPE_PC2; goto out; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index c21d5c50ae3a..17b2e379e872 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -440,7 +440,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, goto put_req; } - rq->special = cmd; + ide_req(rq)->special = cmd; cmd->rq = rq; blk_execute_rq(drive->queue, NULL, rq, 0); diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index efb976a863d2..5f82036fe322 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -389,7 +389,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) goto err_dev; } - tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node, NULL); + tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node); if (!tqueue) { ret = -ENOMEM; goto err_disk; @@ -974,7 +974,7 @@ static int nvm_get_bb_meta(struct nvm_dev *dev, sector_t slba, struct ppa_addr ppa; u8 *blks; int ch, lun, nr_blks; - int ret; + int ret = 0; ppa.ppa = slba; ppa = dev_to_generic_addr(dev, ppa); @@ -1140,20 +1140,26 @@ EXPORT_SYMBOL(nvm_alloc_dev); int nvm_register(struct nvm_dev *dev) { - int ret; + int ret, exp_pool_size; if (!dev->q || !dev->ops) return -EINVAL; - dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist"); - if (!dev->dma_pool) { - pr_err("nvm: could not create dma pool\n"); - return -ENOMEM; - } - ret = nvm_init(dev); if (ret) - goto err_init; + return ret; + + exp_pool_size = max_t(int, PAGE_SIZE, + (NVM_MAX_VLBA * (sizeof(u64) + dev->geo.sos))); + exp_pool_size = round_up(exp_pool_size, PAGE_SIZE); + + dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist", + exp_pool_size); + if (!dev->dma_pool) { + pr_err("nvm: could not create dma pool\n"); + nvm_free(dev); + return -ENOMEM; + } /* register device with a supported media manager */ down_write(&nvm_lock); @@ -1161,9 +1167,6 @@ int nvm_register(struct nvm_dev *dev) up_write(&nvm_lock); return 0; -err_init: - dev->ops->destroy_dma_pool(dev->dma_pool); - return ret; } EXPORT_SYMBOL(nvm_register); diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 6944aac43b01..1ff165351180 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -250,8 +250,8 @@ int pblk_alloc_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd) if (rqd->nr_ppas == 1) return 0; - rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; - rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; + rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size(pblk); + rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size(pblk); return 0; } @@ -376,7 +376,7 @@ void pblk_write_should_kick(struct pblk *pblk) { unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb); - if (secs_avail >= pblk->min_write_pgs) + if (secs_avail >= pblk->min_write_pgs_data) pblk_write_kick(pblk); } @@ -407,7 +407,9 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct list_head *move_list = NULL; - int vsc = le32_to_cpu(*line->vsc); + int packed_meta = (le32_to_cpu(*line->vsc) / pblk->min_write_pgs_data) + * (pblk->min_write_pgs - pblk->min_write_pgs_data); + int vsc = le32_to_cpu(*line->vsc) + packed_meta; lockdep_assert_held(&line->lock); @@ -531,7 +533,7 @@ void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd) if (caddr == 0) trace_pblk_chunk_state(pblk_disk_name(pblk), ppa, NVM_CHK_ST_OPEN); - else if (caddr == chunk->cnlb) + else if (caddr == (chunk->cnlb - 1)) trace_pblk_chunk_state(pblk_disk_name(pblk), ppa, NVM_CHK_ST_CLOSED); } @@ -620,12 +622,15 @@ out: } int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, - unsigned long secs_to_flush) + unsigned long secs_to_flush, bool skip_meta) { int max = pblk->sec_per_write; int min = pblk->min_write_pgs; int secs_to_sync = 0; + if (skip_meta && pblk->min_write_pgs_data != pblk->min_write_pgs) + min = max = pblk->min_write_pgs_data; + if (secs_avail >= max) secs_to_sync = max; else if (secs_avail >= min) @@ -796,10 +801,11 @@ static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line, rqd.is_seq = 1; for (i = 0; i < lm->smeta_sec; i++, paddr++) { - struct pblk_sec_meta *meta_list = rqd.meta_list; + struct pblk_sec_meta *meta = pblk_get_meta(pblk, + rqd.meta_list, i); rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - meta_list[i].lba = lba_list[paddr] = addr_empty; + meta->lba = lba_list[paddr] = addr_empty; } ret = pblk_submit_io_sync_sem(pblk, &rqd); @@ -845,13 +851,13 @@ int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line, if (!meta_list) return -ENOMEM; - ppa_list = meta_list + pblk_dma_meta_size; - dma_ppa_list = dma_meta_list + pblk_dma_meta_size; + ppa_list = meta_list + pblk_dma_meta_size(pblk); + dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk); next_rq: memset(&rqd, 0, sizeof(struct nvm_rq)); - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); rq_len = rq_ppas * geo->csecs; bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len, @@ -1276,6 +1282,7 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) return 0; } +/* Line allocations in the recovery path are always single threaded */ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -1295,15 +1302,22 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) ret = pblk_line_alloc_bitmaps(pblk, line); if (ret) - return ret; + goto fail; if (!pblk_line_init_bb(pblk, line, 0)) { - list_add(&line->list, &l_mg->free_list); - return -EINTR; + ret = -EINTR; + goto fail; } pblk_rl_free_lines_dec(&pblk->rl, line, true); return 0; + +fail: + spin_lock(&l_mg->free_lock); + list_add(&line->list, &l_mg->free_list); + spin_unlock(&l_mg->free_lock); + + return ret; } void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) @@ -2160,3 +2174,38 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, } spin_unlock(&pblk->trans_lock); } + +void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd) +{ + void *buffer; + + if (pblk_is_oob_meta_supported(pblk)) { + /* Just use OOB metadata buffer as always */ + buffer = rqd->meta_list; + } else { + /* We need to reuse last page of request (packed metadata) + * in similar way as traditional oob metadata + */ + buffer = page_to_virt( + rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page); + } + + return buffer; +} + +void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd) +{ + void *meta_list = rqd->meta_list; + void *page; + int i = 0; + + if (pblk_is_oob_meta_supported(pblk)) + return; + + page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page); + /* We need to fill oob meta buffer with data from packed metadata */ + for (; i < rqd->nr_ppas; i++) + memcpy(pblk_get_meta(pblk, meta_list, i), + page + (i * sizeof(struct pblk_sec_meta)), + sizeof(struct pblk_sec_meta)); +} diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 13822594647c..f9a3e47b6a93 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -207,9 +207,6 @@ static int pblk_rwb_init(struct pblk *pblk) return pblk_rb_init(&pblk->rwb, buffer_size, threshold, geo->csecs); } -/* Minimum pages needed within a lun */ -#define ADDR_POOL_SIZE 64 - static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo, struct nvm_addrf_12 *dst) { @@ -350,23 +347,19 @@ fail_destroy_ws: static int pblk_get_global_caches(void) { - int ret; + int ret = 0; mutex_lock(&pblk_caches.mutex); - if (kref_read(&pblk_caches.kref) > 0) { - kref_get(&pblk_caches.kref); - mutex_unlock(&pblk_caches.mutex); - return 0; - } + if (kref_get_unless_zero(&pblk_caches.kref)) + goto out; ret = pblk_create_global_caches(); - if (!ret) - kref_get(&pblk_caches.kref); + kref_init(&pblk_caches.kref); +out: mutex_unlock(&pblk_caches.mutex); - return ret; } @@ -406,12 +399,45 @@ static int pblk_core_init(struct pblk *pblk) pblk->nr_flush_rst = 0; pblk->min_write_pgs = geo->ws_opt; + pblk->min_write_pgs_data = pblk->min_write_pgs; max_write_ppas = pblk->min_write_pgs * geo->all_luns; pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); pblk->max_write_pgs = min_t(int, pblk->max_write_pgs, queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT)); pblk_set_sec_per_write(pblk, pblk->min_write_pgs); + pblk->oob_meta_size = geo->sos; + if (!pblk_is_oob_meta_supported(pblk)) { + /* For drives which does not have OOB metadata feature + * in order to support recovery feature we need to use + * so called packed metadata. Packed metada will store + * the same information as OOB metadata (l2p table mapping, + * but in the form of the single page at the end of + * every write request. + */ + if (pblk->min_write_pgs + * sizeof(struct pblk_sec_meta) > PAGE_SIZE) { + /* We want to keep all the packed metadata on single + * page per write requests. So we need to ensure that + * it will fit. + * + * This is more like sanity check, since there is + * no device with such a big minimal write size + * (above 1 metabytes). + */ + pblk_err(pblk, "Not supported min write size\n"); + return -EINVAL; + } + /* For packed meta approach we do some simplification. + * On read path we always issue requests which size + * equal to max_write_pgs, with all pages filled with + * user payload except of last one page which will be + * filled with packed metadata. + */ + pblk->max_write_pgs = pblk->min_write_pgs; + pblk->min_write_pgs_data = pblk->min_write_pgs - 1; + } + pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t), GFP_KERNEL); if (!pblk->pad_dist) @@ -635,40 +661,61 @@ static unsigned int calc_emeta_len(struct pblk *pblk) return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]); } -static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) +static int pblk_set_provision(struct pblk *pblk, int nr_free_chks) { struct nvm_tgt_dev *dev = pblk->dev; struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_meta *lm = &pblk->lm; struct nvm_geo *geo = &dev->geo; sector_t provisioned; - int sec_meta, blk_meta; + int sec_meta, blk_meta, clba; + int minimum; if (geo->op == NVM_TARGET_DEFAULT_OP) pblk->op = PBLK_DEFAULT_OP; else pblk->op = geo->op; - provisioned = nr_free_blks; + minimum = pblk_get_min_chks(pblk); + provisioned = nr_free_chks; provisioned *= (100 - pblk->op); sector_div(provisioned, 100); - pblk->op_blks = nr_free_blks - provisioned; + if ((nr_free_chks - provisioned) < minimum) { + if (geo->op != NVM_TARGET_DEFAULT_OP) { + pblk_err(pblk, "OP too small to create a sane instance\n"); + return -EINTR; + } + + /* If the user did not specify an OP value, and PBLK_DEFAULT_OP + * is not enough, calculate and set sane value + */ + + provisioned = nr_free_chks - minimum; + pblk->op = (100 * minimum) / nr_free_chks; + pblk_info(pblk, "Default OP insufficient, adjusting OP to %d\n", + pblk->op); + } + + pblk->op_blks = nr_free_chks - provisioned; /* Internally pblk manages all free blocks, but all calculations based * on user capacity consider only provisioned blocks */ - pblk->rl.total_blocks = nr_free_blks; - pblk->rl.nr_secs = nr_free_blks * geo->clba; + pblk->rl.total_blocks = nr_free_chks; + pblk->rl.nr_secs = nr_free_chks * geo->clba; /* Consider sectors used for metadata */ sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); - pblk->capacity = (provisioned - blk_meta) * geo->clba; + clba = (geo->clba / pblk->min_write_pgs) * pblk->min_write_pgs_data; + pblk->capacity = (provisioned - blk_meta) * clba; - atomic_set(&pblk->rl.free_blocks, nr_free_blks); - atomic_set(&pblk->rl.free_user_blocks, nr_free_blks); + atomic_set(&pblk->rl.free_blocks, nr_free_chks); + atomic_set(&pblk->rl.free_user_blocks, nr_free_chks); + + return 0; } static int pblk_setup_line_meta_chk(struct pblk *pblk, struct pblk_line *line, @@ -984,7 +1031,7 @@ static int pblk_lines_init(struct pblk *pblk) struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *line; void *chunk_meta; - long nr_free_chks = 0; + int nr_free_chks = 0; int i, ret; ret = pblk_line_meta_init(pblk); @@ -1031,7 +1078,9 @@ static int pblk_lines_init(struct pblk *pblk) goto fail_free_lines; } - pblk_set_provision(pblk, nr_free_chks); + ret = pblk_set_provision(pblk, nr_free_chks); + if (ret) + goto fail_free_lines; vfree(chunk_meta); return 0; @@ -1041,7 +1090,7 @@ fail_free_lines: pblk_line_meta_free(l_mg, &pblk->lines[i]); kfree(pblk->lines); fail_free_chunk_meta: - kfree(chunk_meta); + vfree(chunk_meta); fail_free_luns: kfree(pblk->luns); fail_free_meta: @@ -1154,6 +1203,12 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, return ERR_PTR(-EINVAL); } + if (geo->ext) { + pblk_err(pblk, "extended metadata not supported\n"); + kfree(pblk); + return ERR_PTR(-EINVAL); + } + spin_lock_init(&pblk->resubmit_lock); spin_lock_init(&pblk->trans_lock); spin_lock_init(&pblk->lock); diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c index 6dcbd44e3acb..79df583ea709 100644 --- a/drivers/lightnvm/pblk-map.c +++ b/drivers/lightnvm/pblk-map.c @@ -22,7 +22,7 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, struct ppa_addr *ppa_list, unsigned long *lun_bitmap, - struct pblk_sec_meta *meta_list, + void *meta_list, unsigned int valid_secs) { struct pblk_line *line = pblk_line_get_data(pblk); @@ -33,6 +33,9 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, int nr_secs = pblk->min_write_pgs; int i; + if (!line) + return -ENOSPC; + if (pblk_line_is_full(line)) { struct pblk_line *prev_line = line; @@ -42,8 +45,11 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, line = pblk_line_replace_data(pblk); pblk_line_close_meta(pblk, prev_line); - if (!line) - return -EINTR; + if (!line) { + pblk_pipeline_stop(pblk); + return -ENOSPC; + } + } emeta = line->emeta; @@ -52,6 +58,7 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, paddr = pblk_alloc_page(pblk, line, nr_secs); for (i = 0; i < nr_secs; i++, paddr++) { + struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); /* ppa to be sent to the device */ @@ -68,14 +75,15 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, kref_get(&line->ref); w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i); w_ctx->ppa = ppa_list[i]; - meta_list[i].lba = cpu_to_le64(w_ctx->lba); + meta->lba = cpu_to_le64(w_ctx->lba); lba_list[paddr] = cpu_to_le64(w_ctx->lba); if (lba_list[paddr] != addr_empty) line->nr_valid_lbas++; else atomic64_inc(&pblk->pad_wa); } else { - lba_list[paddr] = meta_list[i].lba = addr_empty; + lba_list[paddr] = addr_empty; + meta->lba = addr_empty; __pblk_map_invalidate(pblk, line, paddr); } } @@ -84,50 +92,57 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, return 0; } -void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, +int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, unsigned long *lun_bitmap, unsigned int valid_secs, unsigned int off) { - struct pblk_sec_meta *meta_list = rqd->meta_list; + void *meta_list = pblk_get_meta_for_writes(pblk, rqd); + void *meta_buffer; struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); unsigned int map_secs; int min = pblk->min_write_pgs; int i; + int ret; for (i = off; i < rqd->nr_ppas; i += min) { map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; - if (pblk_map_page_data(pblk, sentry + i, &ppa_list[i], - lun_bitmap, &meta_list[i], map_secs)) { - bio_put(rqd->bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - pblk_pipeline_stop(pblk); - } + meta_buffer = pblk_get_meta(pblk, meta_list, i); + + ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i], + lun_bitmap, meta_buffer, map_secs); + if (ret) + return ret; } + + return 0; } /* only if erase_ppa is set, acquire erase semaphore */ -void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, +int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, unsigned long *lun_bitmap, unsigned int valid_secs, struct ppa_addr *erase_ppa) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct pblk_line_meta *lm = &pblk->lm; - struct pblk_sec_meta *meta_list = rqd->meta_list; + void *meta_list = pblk_get_meta_for_writes(pblk, rqd); + void *meta_buffer; struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); struct pblk_line *e_line, *d_line; unsigned int map_secs; int min = pblk->min_write_pgs; int i, erase_lun; + int ret; + for (i = 0; i < rqd->nr_ppas; i += min) { map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; - if (pblk_map_page_data(pblk, sentry + i, &ppa_list[i], - lun_bitmap, &meta_list[i], map_secs)) { - bio_put(rqd->bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - pblk_pipeline_stop(pblk); - } + meta_buffer = pblk_get_meta(pblk, meta_list, i); + + ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i], + lun_bitmap, meta_buffer, map_secs); + if (ret) + return ret; erase_lun = pblk_ppa_to_pos(geo, ppa_list[i]); @@ -163,7 +178,7 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, */ e_line = pblk_line_get_erase(pblk); if (!e_line) - return; + return -ENOSPC; /* Erase blocks that are bad in this line but might not be in next */ if (unlikely(pblk_ppa_empty(*erase_ppa)) && @@ -174,7 +189,7 @@ retry: bit = find_next_bit(d_line->blk_bitmap, lm->blk_per_line, bit + 1); if (bit >= lm->blk_per_line) - return; + return 0; spin_lock(&e_line->lock); if (test_bit(bit, e_line->erase_bitmap)) { @@ -188,4 +203,6 @@ retry: *erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */ erase_ppa->a.blk = e_line->id; } + + return 0; } diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index b1f4b51783f4..d4ca8c64ee0f 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -147,7 +147,7 @@ int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold, /* * Initialize rate-limiter, which controls access to the write buffer - * but user and GC I/O + * by user and GC I/O */ pblk_rl_init(&pblk->rl, rb->nr_entries); @@ -552,6 +552,9 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, to_read = count; } + /* Add space for packed metadata if in use*/ + pad += (pblk->min_write_pgs - pblk->min_write_pgs_data); + c_ctx->sentry = pos; c_ctx->nr_valid = to_read; c_ctx->nr_padded = pad; diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 9fba614adeeb..3789185144da 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -43,7 +43,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, sector_t blba, unsigned long *read_bitmap) { - struct pblk_sec_meta *meta_list = rqd->meta_list; + void *meta_list = rqd->meta_list; struct ppa_addr ppas[NVM_MAX_VLBA]; int nr_secs = rqd->nr_ppas; bool advanced_bio = false; @@ -53,12 +53,15 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, for (i = 0; i < nr_secs; i++) { struct ppa_addr p = ppas[i]; + struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); sector_t lba = blba + i; retry: if (pblk_ppa_empty(p)) { + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + WARN_ON(test_and_set_bit(i, read_bitmap)); - meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); + meta->lba = addr_empty; if (unlikely(!advanced_bio)) { bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE); @@ -78,7 +81,7 @@ retry: goto retry; } WARN_ON(test_and_set_bit(i, read_bitmap)); - meta_list[i].lba = cpu_to_le64(lba); + meta->lba = cpu_to_le64(lba); advanced_bio = true; #ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_inc(&pblk->cache_reads); @@ -105,12 +108,16 @@ next: static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd, sector_t blba) { - struct pblk_sec_meta *meta_lba_list = rqd->meta_list; + void *meta_list = rqd->meta_list; int nr_lbas = rqd->nr_ppas; int i; + if (!pblk_is_oob_meta_supported(pblk)) + return; + for (i = 0; i < nr_lbas; i++) { - u64 lba = le64_to_cpu(meta_lba_list[i].lba); + struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); + u64 lba = le64_to_cpu(meta->lba); if (lba == ADDR_EMPTY) continue; @@ -134,17 +141,22 @@ static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd, static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd, u64 *lba_list, int nr_lbas) { - struct pblk_sec_meta *meta_lba_list = rqd->meta_list; + void *meta_lba_list = rqd->meta_list; int i, j; + if (!pblk_is_oob_meta_supported(pblk)) + return; + for (i = 0, j = 0; i < nr_lbas; i++) { + struct pblk_sec_meta *meta = pblk_get_meta(pblk, + meta_lba_list, j); u64 lba = lba_list[i]; u64 meta_lba; if (lba == ADDR_EMPTY) continue; - meta_lba = le64_to_cpu(meta_lba_list[j].lba); + meta_lba = le64_to_cpu(meta->lba); if (lba != meta_lba) { #ifdef CONFIG_NVM_PBLK_DEBUG @@ -216,15 +228,15 @@ static void pblk_end_partial_read(struct nvm_rq *rqd) struct pblk *pblk = rqd->private; struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); struct pblk_pr_ctx *pr_ctx = r_ctx->private; + struct pblk_sec_meta *meta; struct bio *new_bio = rqd->bio; struct bio *bio = pr_ctx->orig_bio; struct bio_vec src_bv, dst_bv; - struct pblk_sec_meta *meta_list = rqd->meta_list; + void *meta_list = rqd->meta_list; int bio_init_idx = pr_ctx->bio_init_idx; unsigned long *read_bitmap = pr_ctx->bitmap; int nr_secs = pr_ctx->orig_nr_secs; int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); - __le64 *lba_list_mem, *lba_list_media; void *src_p, *dst_p; int hole, i; @@ -237,13 +249,10 @@ static void pblk_end_partial_read(struct nvm_rq *rqd) rqd->ppa_list[0] = ppa; } - /* Re-use allocated memory for intermediate lbas */ - lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); - lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size); - for (i = 0; i < nr_secs; i++) { - lba_list_media[i] = meta_list[i].lba; - meta_list[i].lba = lba_list_mem[i]; + meta = pblk_get_meta(pblk, meta_list, i); + pr_ctx->lba_list_media[i] = le64_to_cpu(meta->lba); + meta->lba = cpu_to_le64(pr_ctx->lba_list_mem[i]); } /* Fill the holes in the original bio */ @@ -255,7 +264,8 @@ static void pblk_end_partial_read(struct nvm_rq *rqd) line = pblk_ppa_to_line(pblk, rqd->ppa_list[i]); kref_put(&line->ref, pblk_line_put); - meta_list[hole].lba = lba_list_media[i]; + meta = pblk_get_meta(pblk, meta_list, hole); + meta->lba = cpu_to_le64(pr_ctx->lba_list_media[i]); src_bv = new_bio->bi_io_vec[i++]; dst_bv = bio->bi_io_vec[bio_init_idx + hole]; @@ -291,17 +301,13 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd, unsigned long *read_bitmap, int nr_holes) { - struct pblk_sec_meta *meta_list = rqd->meta_list; + void *meta_list = rqd->meta_list; struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); struct pblk_pr_ctx *pr_ctx; struct bio *new_bio, *bio = r_ctx->private; - __le64 *lba_list_mem; int nr_secs = rqd->nr_ppas; int i; - /* Re-use allocated memory for intermediate lbas */ - lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); - new_bio = bio_alloc(GFP_KERNEL, nr_holes); if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) @@ -312,12 +318,15 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd, goto fail_free_pages; } - pr_ctx = kmalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL); + pr_ctx = kzalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL); if (!pr_ctx) goto fail_free_pages; - for (i = 0; i < nr_secs; i++) - lba_list_mem[i] = meta_list[i].lba; + for (i = 0; i < nr_secs; i++) { + struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); + + pr_ctx->lba_list_mem[i] = le64_to_cpu(meta->lba); + } new_bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(new_bio, REQ_OP_READ, 0); @@ -325,7 +334,6 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd, rqd->bio = new_bio; rqd->nr_ppas = nr_holes; - pr_ctx->ppa_ptr = NULL; pr_ctx->orig_bio = bio; bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA); pr_ctx->bio_init_idx = bio_init_idx; @@ -383,7 +391,7 @@ err: static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, sector_t lba, unsigned long *read_bitmap) { - struct pblk_sec_meta *meta_list = rqd->meta_list; + struct pblk_sec_meta *meta = pblk_get_meta(pblk, rqd->meta_list, 0); struct ppa_addr ppa; pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); @@ -394,8 +402,10 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, retry: if (pblk_ppa_empty(ppa)) { + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + WARN_ON(test_and_set_bit(0, read_bitmap)); - meta_list[0].lba = cpu_to_le64(ADDR_EMPTY); + meta->lba = addr_empty; return; } @@ -409,7 +419,7 @@ retry: } WARN_ON(test_and_set_bit(0, read_bitmap)); - meta_list[0].lba = cpu_to_le64(lba); + meta->lba = cpu_to_le64(lba); #ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_inc(&pblk->cache_reads); diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 5740b7509bd8..3fcf062d752c 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -13,6 +13,9 @@ * General Public License for more details. * * pblk-recovery.c - pblk's recovery path + * + * The L2P recovery path is single threaded as the L2P table is updated in order + * following the line sequence ID. */ #include "pblk.h" @@ -124,7 +127,7 @@ static u64 pblk_sec_in_open_line(struct pblk *pblk, struct pblk_line *line) struct pblk_recov_alloc { struct ppa_addr *ppa_list; - struct pblk_sec_meta *meta_list; + void *meta_list; struct nvm_rq *rqd; void *data; dma_addr_t dma_ppa_list; @@ -158,7 +161,7 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line, { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - struct pblk_sec_meta *meta_list; + void *meta_list; struct pblk_pad_rq *pad_rq; struct nvm_rq *rqd; struct bio *bio; @@ -188,7 +191,7 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line, kref_init(&pad_rq->ref); next_pad_rq: - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); if (rq_ppas < pblk->min_write_pgs) { pblk_err(pblk, "corrupted pad line %d\n", line->id); goto fail_free_pad; @@ -237,12 +240,15 @@ next_pad_rq: for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) { struct ppa_addr dev_ppa; + struct pblk_sec_meta *meta; __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); pblk_map_invalidate(pblk, dev_ppa); - lba_list[w_ptr] = meta_list[i].lba = addr_empty; + lba_list[w_ptr] = addr_empty; + meta = pblk_get_meta(pblk, meta_list, i); + meta->lba = addr_empty; rqd->ppa_list[i] = dev_ppa; } } @@ -334,20 +340,21 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, struct pblk_recov_alloc p) { struct nvm_tgt_dev *dev = pblk->dev; + struct pblk_line_meta *lm = &pblk->lm; struct nvm_geo *geo = &dev->geo; struct ppa_addr *ppa_list; - struct pblk_sec_meta *meta_list; + void *meta_list; struct nvm_rq *rqd; struct bio *bio; void *data; dma_addr_t dma_ppa_list, dma_meta_list; __le64 *lba_list; - u64 paddr = 0; + u64 paddr = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; bool padded = false; int rq_ppas, rq_len; int i, j; int ret; - u64 left_ppas = pblk_sec_in_open_line(pblk, line); + u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec; if (pblk_line_wp_is_unbalanced(pblk, line)) pblk_warn(pblk, "recovering unbalanced line (%d)\n", line->id); @@ -364,17 +371,19 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, next_rq: memset(rqd, 0, pblk_g_rq_size); - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); if (!rq_ppas) rq_ppas = pblk->min_write_pgs; rq_len = rq_ppas * geo->csecs; +retry_rq: bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL); if (IS_ERR(bio)) return PTR_ERR(bio); bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio_get(bio); rqd->bio = bio; rqd->opcode = NVM_OP_PREAD; @@ -387,7 +396,6 @@ next_rq: if (pblk_io_aligned(pblk, rq_ppas)) rqd->is_seq = 1; -retry_rq: for (i = 0; i < rqd->nr_ppas; ) { struct ppa_addr ppa; int pos; @@ -410,6 +418,7 @@ retry_rq: if (ret) { pblk_err(pblk, "I/O submission failed: %d\n", ret); bio_put(bio); + bio_put(bio); return ret; } @@ -421,20 +430,28 @@ retry_rq: if (padded) { pblk_log_read_err(pblk, rqd); + bio_put(bio); return -EINTR; } pad_distance = pblk_pad_distance(pblk, line); ret = pblk_recov_pad_line(pblk, line, pad_distance); - if (ret) + if (ret) { + bio_put(bio); return ret; + } padded = true; + bio_put(bio); goto retry_rq; } + pblk_get_packed_meta(pblk, rqd); + bio_put(bio); + for (i = 0; i < rqd->nr_ppas; i++) { - u64 lba = le64_to_cpu(meta_list[i].lba); + struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); + u64 lba = le64_to_cpu(meta->lba); lba_list[paddr++] = cpu_to_le64(lba); @@ -463,7 +480,7 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) struct nvm_geo *geo = &dev->geo; struct nvm_rq *rqd; struct ppa_addr *ppa_list; - struct pblk_sec_meta *meta_list; + void *meta_list; struct pblk_recov_alloc p; void *data; dma_addr_t dma_ppa_list, dma_meta_list; @@ -473,8 +490,8 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) if (!meta_list) return -ENOMEM; - ppa_list = (void *)(meta_list) + pblk_dma_meta_size; - dma_ppa_list = dma_meta_list + pblk_dma_meta_size; + ppa_list = (void *)(meta_list) + pblk_dma_meta_size(pblk); + dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk); data = kcalloc(pblk->max_write_pgs, geo->csecs, GFP_KERNEL); if (!data) { @@ -804,7 +821,6 @@ next: WARN_ON_ONCE(!test_and_clear_bit(meta_line, &l_mg->meta_bitmap)); spin_unlock(&l_mg->free_lock); - pblk_line_replace_data(pblk); } else { spin_lock(&l_mg->free_lock); /* Allocate next line for preparation */ diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index db55a1c89997..76116d5f78e4 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -214,11 +214,10 @@ void pblk_rl_init(struct pblk_rl *rl, int budget) struct nvm_geo *geo = &dev->geo; struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_meta *lm = &pblk->lm; - int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE; int sec_meta, blk_meta; - unsigned int rb_windows; + /* Consider sectors used for metadata */ sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); @@ -226,7 +225,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget) rl->high = pblk->op_blks - blk_meta - lm->blk_per_line; rl->high_pw = get_count_order(rl->high); - rl->rsv_blocks = min_blocks; + rl->rsv_blocks = pblk_get_min_chks(pblk); /* This will always be a power-of-2 */ rb_windows = budget / NVM_MAX_VLBA; diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c index 2d2818155aa8..7d8958df9472 100644 --- a/drivers/lightnvm/pblk-sysfs.c +++ b/drivers/lightnvm/pblk-sysfs.c @@ -479,6 +479,13 @@ static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk, if (kstrtouint(page, 0, &sec_per_write)) return -EINVAL; + if (!pblk_is_oob_meta_supported(pblk)) { + /* For packed metadata case it is + * not allowed to change sec_per_write. + */ + return -EINVAL; + } + if (sec_per_write < pblk->min_write_pgs || sec_per_write > pblk->max_write_pgs || sec_per_write % pblk->min_write_pgs != 0) diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index fa8726493b39..06d56deb645d 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -105,14 +105,20 @@ retry: } /* Map remaining sectors in chunk, starting from ppa */ -static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa) +static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa, + int rqd_ppas) { struct pblk_line *line; struct ppa_addr map_ppa = *ppa; + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + __le64 *lba_list; u64 paddr; int done = 0; + int n = 0; line = pblk_ppa_to_line(pblk, *ppa); + lba_list = emeta_to_lbas(pblk, line->emeta->buf); + spin_lock(&line->lock); while (!done) { @@ -121,10 +127,17 @@ static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa) if (!test_and_set_bit(paddr, line->map_bitmap)) line->left_msecs--; + if (n < rqd_ppas && lba_list[paddr] != addr_empty) + line->nr_valid_lbas--; + + lba_list[paddr] = addr_empty; + if (!test_and_set_bit(paddr, line->invalid_bitmap)) le32_add_cpu(line->vsc, -1); done = nvm_next_ppa_in_chk(pblk->dev, &map_ppa); + + n++; } line->w_err_gc->has_write_err = 1; @@ -148,9 +161,11 @@ static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry, w_ctx = &entry->w_ctx; /* Check if the lba has been overwritten */ - ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba); - if (!pblk_ppa_comp(ppa_l2p, entry->cacheline)) - w_ctx->lba = ADDR_EMPTY; + if (w_ctx->lba != ADDR_EMPTY) { + ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba); + if (!pblk_ppa_comp(ppa_l2p, entry->cacheline)) + w_ctx->lba = ADDR_EMPTY; + } /* Mark up the entry as submittable again */ flags = READ_ONCE(w_ctx->flags); @@ -200,7 +215,7 @@ static void pblk_submit_rec(struct work_struct *work) pblk_log_write_err(pblk, rqd); - pblk_map_remaining(pblk, ppa_list); + pblk_map_remaining(pblk, ppa_list, rqd->nr_ppas); pblk_queue_resubmit(pblk, c_ctx); pblk_up_rq(pblk, c_ctx->lun_bitmap); @@ -319,12 +334,13 @@ static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, } if (likely(!e_line || !atomic_read(&e_line->left_eblks))) - pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0); + ret = pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, + valid, 0); else - pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, + ret = pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, erase_ppa); - return 0; + return ret; } static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, @@ -332,7 +348,7 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, { int secs_to_sync; - secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush); + secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush, true); #ifdef CONFIG_NVM_PBLK_DEBUG if ((!secs_to_sync && secs_to_flush) @@ -548,15 +564,17 @@ static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd) c_ctx->nr_padded); } -static int pblk_submit_write(struct pblk *pblk) +static int pblk_submit_write(struct pblk *pblk, int *secs_left) { struct bio *bio; struct nvm_rq *rqd; unsigned int secs_avail, secs_to_sync, secs_to_com; - unsigned int secs_to_flush; + unsigned int secs_to_flush, packed_meta_pgs; unsigned long pos; unsigned int resubmit; + *secs_left = 0; + spin_lock(&pblk->resubmit_lock); resubmit = !list_empty(&pblk->resubmit_list); spin_unlock(&pblk->resubmit_lock); @@ -586,17 +604,17 @@ static int pblk_submit_write(struct pblk *pblk) */ secs_avail = pblk_rb_read_count(&pblk->rwb); if (!secs_avail) - return 1; + return 0; secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb); - if (!secs_to_flush && secs_avail < pblk->min_write_pgs) - return 1; + if (!secs_to_flush && secs_avail < pblk->min_write_pgs_data) + return 0; secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush); if (secs_to_sync > pblk->max_write_pgs) { pblk_err(pblk, "bad buffer sync calculation\n"); - return 1; + return 0; } secs_to_com = (secs_to_sync > secs_avail) ? @@ -604,7 +622,8 @@ static int pblk_submit_write(struct pblk *pblk) pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); } - bio = bio_alloc(GFP_KERNEL, secs_to_sync); + packed_meta_pgs = (pblk->min_write_pgs - pblk->min_write_pgs_data); + bio = bio_alloc(GFP_KERNEL, secs_to_sync + packed_meta_pgs); bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -625,6 +644,7 @@ static int pblk_submit_write(struct pblk *pblk) atomic_long_add(secs_to_sync, &pblk->sub_writes); #endif + *secs_left = 1; return 0; fail_free_bio: @@ -633,16 +653,22 @@ fail_put_bio: bio_put(bio); pblk_free_rqd(pblk, rqd, PBLK_WRITE); - return 1; + return -EINTR; } int pblk_write_ts(void *data) { struct pblk *pblk = data; + int secs_left; + int write_failure = 0; while (!kthread_should_stop()) { - if (!pblk_submit_write(pblk)) - continue; + if (!write_failure) { + write_failure = pblk_submit_write(pblk, &secs_left); + + if (secs_left) + continue; + } set_current_state(TASK_INTERRUPTIBLE); io_schedule(); } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 02bb2e98f8a9..85e38ed62f85 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -104,7 +104,6 @@ enum { PBLK_RL_LOW = 4 }; -#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * NVM_MAX_VLBA) #define pblk_dma_ppa_size (sizeof(u64) * NVM_MAX_VLBA) /* write buffer completion context */ @@ -132,6 +131,8 @@ struct pblk_pr_ctx { unsigned int bio_init_idx; void *ppa_ptr; dma_addr_t dma_ppa_list; + __le64 lba_list_mem[NVM_MAX_VLBA]; + __le64 lba_list_media[NVM_MAX_VLBA]; }; /* Pad context */ @@ -631,7 +632,9 @@ struct pblk { int state; /* pblk line state */ int min_write_pgs; /* Minimum amount of pages required by controller */ + int min_write_pgs_data; /* Minimum amount of payload pages */ int max_write_pgs; /* Maximum amount of pages supported by controller */ + int oob_meta_size; /* Size of OOB sector metadata */ sector_t capacity; /* Device capacity when bad blocks are subtracted */ @@ -836,7 +839,7 @@ void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, - unsigned long secs_to_flush); + unsigned long secs_to_flush, bool skip_meta); void pblk_down_rq(struct pblk *pblk, struct ppa_addr ppa, unsigned long *lun_bitmap); void pblk_down_chunk(struct pblk *pblk, struct ppa_addr ppa); @@ -860,6 +863,8 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, u64 *lba_list, int nr_secs); void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, sector_t blba, int nr_secs); +void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd); +void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd); /* * pblk user I/O write path @@ -871,10 +876,10 @@ int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq); /* * pblk map */ -void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, +int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, unsigned long *lun_bitmap, unsigned int valid_secs, struct ppa_addr *erase_ppa); -void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, +int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, unsigned long *lun_bitmap, unsigned int valid_secs, unsigned int off); @@ -905,7 +910,6 @@ int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta); #define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ #define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */ #define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ -#define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */ int pblk_gc_init(struct pblk *pblk); void pblk_gc_exit(struct pblk *pblk, bool graceful); @@ -1370,4 +1374,33 @@ static inline char *pblk_disk_name(struct pblk *pblk) return disk->disk_name; } + +static inline unsigned int pblk_get_min_chks(struct pblk *pblk) +{ + struct pblk_line_meta *lm = &pblk->lm; + /* In a worst-case scenario every line will have OP invalid sectors. + * We will then need a minimum of 1/OP lines to free up a single line + */ + + return DIV_ROUND_UP(100, pblk->op) * lm->blk_per_line; +} + +static inline struct pblk_sec_meta *pblk_get_meta(struct pblk *pblk, + void *meta, int index) +{ + return meta + + max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size) + * index; +} + +static inline int pblk_dma_meta_size(struct pblk *pblk) +{ + return max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size) + * NVM_MAX_VLBA; +} + +static inline int pblk_is_oob_meta_supported(struct pblk *pblk) +{ + return pblk->oob_meta_size >= sizeof(struct pblk_sec_meta); +} #endif /* PBLK_H_ */ diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index b61b83bbcfff..fdf75352e16a 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -626,6 +626,20 @@ struct cache_set { /* Where in the btree gc currently is */ struct bkey gc_done; + /* + * For automatical garbage collection after writeback completed, this + * varialbe is used as bit fields, + * - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback + * - 0000 0010b (BCH_DO_AUTO_GC): do gc after writeback + * This is an optimization for following write request after writeback + * finished, but read hit rate dropped due to clean data on cache is + * discarded. Unless user explicitly sets it via sysfs, it won't be + * enabled. + */ +#define BCH_ENABLE_AUTO_GC 1 +#define BCH_DO_AUTO_GC 2 + uint8_t gc_after_writeback; + /* * The allocation code needs gc_mark in struct bucket to be correct, but * it's not while a gc is in progress. Protected by bucket_lock. @@ -658,7 +672,11 @@ struct cache_set { /* * A btree node on disk could have too many bsets for an iterator to fit - * on the stack - have to dynamically allocate them + * on the stack - have to dynamically allocate them. + * bch_cache_set_alloc() will make sure the pool can allocate iterators + * equipped with enough room that can host + * (sb.bucket_size / sb.block_size) + * btree_iter_sets, which is more than static MAX_BSETS. */ mempool_t fill_iter; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 3f4211b5cd33..23cb1dc7296b 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -207,6 +207,11 @@ void bch_btree_node_read_done(struct btree *b) struct bset *i = btree_bset_first(b); struct btree_iter *iter; + /* + * c->fill_iter can allocate an iterator with more memory space + * than static MAX_BSETS. + * See the comment arount cache_set->fill_iter. + */ iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index a68d6c55783b..d1c72ef64edf 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -266,6 +266,24 @@ static inline void wake_up_gc(struct cache_set *c) wake_up(&c->gc_wait); } +static inline void force_wake_up_gc(struct cache_set *c) +{ + /* + * Garbage collection thread only works when sectors_to_gc < 0, + * calling wake_up_gc() won't start gc thread if sectors_to_gc is + * not a nagetive value. + * Therefore sectors_to_gc is set to -1 here, before waking up + * gc thread by calling wake_up_gc(). Then gc_should_run() will + * give a chance to permit gc thread to run. "Give a chance" means + * before going into gc_should_run(), there is still possibility + * that c->sectors_to_gc being set to other positive value. So + * this routine won't 100% make sure gc thread will be woken up + * to run. + */ + atomic_set(&c->sectors_to_gc, -1); + wake_up_gc(c); +} + #define MAP_DONE 0 #define MAP_CONTINUE 1 diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 8f448b9c96a1..8b123be05254 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -249,8 +249,7 @@ void bch_debug_init_cache_set(struct cache_set *c) void bch_debug_exit(void) { - if (!IS_ERR_OR_NULL(bcache_debug)) - debugfs_remove_recursive(bcache_debug); + debugfs_remove_recursive(bcache_debug); } void __init bch_debug_init(void) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 522c7426f3a0..b2fd412715b1 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -663,7 +663,7 @@ static void journal_write_unlocked(struct closure *cl) REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); bch_bio_map(bio, w->data); - trace_bcache_journal_write(bio); + trace_bcache_journal_write(bio, w->data->keys); bio_list_add(&list, bio); SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 3bf35914bb57..15070412a32e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -311,11 +311,11 @@ err: * data is written it calls bch_journal, and after the keys have been added to * the next journal write they're inserted into the btree. * - * It inserts the data in s->cache_bio; bi_sector is used for the key offset, + * It inserts the data in op->bio; bi_sector is used for the key offset, * and op->inode is used for the key inode. * - * If s->bypass is true, instead of inserting the data it invalidates the - * region of the cache represented by s->cache_bio and op->inode. + * If op->bypass is true, instead of inserting the data it invalidates the + * region of the cache represented by op->bio and op->inode. */ void bch_data_insert(struct closure *cl) { diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7bbd670a5a84..4dee119c3664 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -25,8 +25,8 @@ #include #include -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Kent Overstreet "); +unsigned int bch_cutoff_writeback; +unsigned int bch_cutoff_writeback_sync; static const char bcache_magic[] = { 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca, @@ -1510,8 +1510,7 @@ static void cache_set_free(struct closure *cl) struct cache *ca; unsigned int i; - if (!IS_ERR_OR_NULL(c->debug)) - debugfs_remove(c->debug); + debugfs_remove(c->debug); bch_open_buckets_free(c); bch_btree_cache_free(c); @@ -2424,6 +2423,32 @@ static void bcache_exit(void) mutex_destroy(&bch_register_lock); } +/* Check and fixup module parameters */ +static void check_module_parameters(void) +{ + if (bch_cutoff_writeback_sync == 0) + bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC; + else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) { + pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u", + bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX); + bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX; + } + + if (bch_cutoff_writeback == 0) + bch_cutoff_writeback = CUTOFF_WRITEBACK; + else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) { + pr_warn("set bch_cutoff_writeback (%u) to max value %u", + bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX); + bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX; + } + + if (bch_cutoff_writeback > bch_cutoff_writeback_sync) { + pr_warn("set bch_cutoff_writeback (%u) to %u", + bch_cutoff_writeback, bch_cutoff_writeback_sync); + bch_cutoff_writeback = bch_cutoff_writeback_sync; + } +} + static int __init bcache_init(void) { static const struct attribute *files[] = { @@ -2432,6 +2457,8 @@ static int __init bcache_init(void) NULL }; + check_module_parameters(); + mutex_init(&bch_register_lock); init_waitqueue_head(&unregister_wait); register_reboot_notifier(&reboot); @@ -2468,5 +2495,18 @@ err: return -ENOMEM; } +/* + * Module hooks + */ module_exit(bcache_exit); module_init(bcache_init); + +module_param(bch_cutoff_writeback, uint, 0); +MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback"); + +module_param(bch_cutoff_writeback_sync, uint, 0); +MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback"); + +MODULE_DESCRIPTION("Bcache: a Linux block layer cache"); +MODULE_AUTHOR("Kent Overstreet "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 26f035a0c5b9..557a8a3270a1 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -16,7 +16,7 @@ #include #include -/* Default is -1; we skip past it for struct cached_dev's cache mode */ +/* Default is 0 ("writethrough") */ static const char * const bch_cache_modes[] = { "writethrough", "writeback", @@ -25,7 +25,7 @@ static const char * const bch_cache_modes[] = { NULL }; -/* Default is -1; we skip past it for stop_when_cache_set_failed */ +/* Default is 0 ("auto") */ static const char * const bch_stop_on_failure_modes[] = { "auto", "always", @@ -88,6 +88,8 @@ read_attribute(writeback_keys_done); read_attribute(writeback_keys_failed); read_attribute(io_errors); read_attribute(congested); +read_attribute(cutoff_writeback); +read_attribute(cutoff_writeback_sync); rw_attribute(congested_read_threshold_us); rw_attribute(congested_write_threshold_us); @@ -128,6 +130,7 @@ rw_attribute(expensive_debug_checks); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); +rw_attribute(gc_after_writeback); rw_attribute(size); static ssize_t bch_snprint_string_list(char *buf, @@ -264,7 +267,8 @@ STORE(__cached_dev) d_strtoul(writeback_running); d_strtoul(writeback_delay); - sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); + sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, + 0, bch_cutoff_writeback); if (attr == &sysfs_writeback_rate) { ssize_t ret; @@ -384,8 +388,25 @@ STORE(bch_cached_dev) mutex_lock(&bch_register_lock); size = __cached_dev_store(kobj, attr, buf, size); - if (attr == &sysfs_writeback_running) - bch_writeback_queue(dc); + if (attr == &sysfs_writeback_running) { + /* dc->writeback_running changed in __cached_dev_store() */ + if (IS_ERR_OR_NULL(dc->writeback_thread)) { + /* + * reject setting it to 1 via sysfs if writeback + * kthread is not created yet. + */ + if (dc->writeback_running) { + dc->writeback_running = false; + pr_err("%s: failed to run non-existent writeback thread", + dc->disk.disk->disk_name); + } + } else + /* + * writeback kthread will check if dc->writeback_running + * is true or false. + */ + bch_writeback_queue(dc); + } if (attr == &sysfs_writeback_percent) if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) @@ -668,6 +689,9 @@ SHOW(__bch_cache_set) sysfs_print(congested_write_threshold_us, c->congested_write_threshold_us); + sysfs_print(cutoff_writeback, bch_cutoff_writeback); + sysfs_print(cutoff_writeback_sync, bch_cutoff_writeback_sync); + sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); sysfs_printf(verify, "%i", c->verify); sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); @@ -676,6 +700,7 @@ SHOW(__bch_cache_set) sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback); sysfs_printf(io_disable, "%i", test_bit(CACHE_SET_IO_DISABLE, &c->flags)); @@ -725,21 +750,8 @@ STORE(__bch_cache_set) bch_cache_accounting_clear(&c->accounting); } - if (attr == &sysfs_trigger_gc) { - /* - * Garbage collection thread only works when sectors_to_gc < 0, - * when users write to sysfs entry trigger_gc, most of time - * they want to forcibly triger gargage collection. Here -1 is - * set to c->sectors_to_gc, to make gc_should_run() give a - * chance to permit gc thread to run. "give a chance" means - * before going into gc_should_run(), there is still chance - * that c->sectors_to_gc being set to other positive value. So - * writing sysfs entry trigger_gc won't always make sure gc - * thread takes effect. - */ - atomic_set(&c->sectors_to_gc, -1); - wake_up_gc(c); - } + if (attr == &sysfs_trigger_gc) + force_wake_up_gc(c); if (attr == &sysfs_prune_cache) { struct shrink_control sc; @@ -789,6 +801,12 @@ STORE(__bch_cache_set) sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); + /* + * write gc_after_writeback here may overwrite an already set + * BCH_DO_AUTO_GC, it doesn't matter because this flag will be + * set in next chance. + */ + sysfs_strtoul_clamp(gc_after_writeback, c->gc_after_writeback, 0, 1); return size; } @@ -869,7 +887,10 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, &sysfs_copy_gc_enabled, + &sysfs_gc_after_writeback, &sysfs_io_disable, + &sysfs_cutoff_writeback, + &sysfs_cutoff_writeback_sync, NULL }; KTYPE(bch_cache_set_internal); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 08c3a9f9676c..73f0efac2b9f 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -17,6 +17,15 @@ #include #include +static void update_gc_after_writeback(struct cache_set *c) +{ + if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) || + c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD) + return; + + c->gc_after_writeback |= BCH_DO_AUTO_GC; +} + /* Rate limiting */ static uint64_t __calc_target_rate(struct cached_dev *dc) { @@ -191,6 +200,7 @@ static void update_writeback_rate(struct work_struct *work) if (!set_at_max_writeback_rate(c, dc)) { down_read(&dc->writeback_lock); __update_writeback_rate(dc); + update_gc_after_writeback(c); up_read(&dc->writeback_lock); } } @@ -689,6 +699,23 @@ static int bch_writeback_thread(void *arg) up_write(&dc->writeback_lock); break; } + + /* + * When dirty data rate is high (e.g. 50%+), there might + * be heavy buckets fragmentation after writeback + * finished, which hurts following write performance. + * If users really care about write performance they + * may set BCH_ENABLE_AUTO_GC via sysfs, then when + * BCH_DO_AUTO_GC is set, garbage collection thread + * will be wake up here. After moving gc, the shrunk + * btree and discarded free buckets SSD space may be + * helpful for following write requests. + */ + if (c->gc_after_writeback == + (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) { + c->gc_after_writeback &= ~BCH_DO_AUTO_GC; + force_wake_up_gc(c); + } } up_write(&dc->writeback_lock); @@ -777,7 +804,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) bch_keybuf_init(&dc->writeback_keys); dc->writeback_metadata = true; - dc->writeback_running = true; + dc->writeback_running = false; dc->writeback_percent = 10; dc->writeback_delay = 30; atomic_long_set(&dc->writeback_rate.rate, 1024); @@ -805,6 +832,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) cached_dev_put(dc); return PTR_ERR(dc->writeback_thread); } + dc->writeback_running = true; WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); schedule_delayed_work(&dc->writeback_rate_update, diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index d2b9fdbc8994..6a743d3bb338 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -5,12 +5,17 @@ #define CUTOFF_WRITEBACK 40 #define CUTOFF_WRITEBACK_SYNC 70 +#define CUTOFF_WRITEBACK_MAX 70 +#define CUTOFF_WRITEBACK_SYNC_MAX 90 + #define MAX_WRITEBACKS_IN_PASS 5 #define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ #define WRITEBACK_RATE_UPDATE_SECS_MAX 60 #define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5 +#define BCH_AUTO_GC_DIRTY_THRESHOLD 50 + /* * 14 (16384ths) is chosen here as something that each backing device * should be a reasonable fraction of the share, and not to blow up @@ -53,6 +58,9 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, } } +extern unsigned int bch_cutoff_writeback; +extern unsigned int bch_cutoff_writeback_sync; + static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, unsigned int cache_mode, bool would_skip) { @@ -60,7 +68,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, if (cache_mode != CACHE_MODE_WRITEBACK || test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || - in_use > CUTOFF_WRITEBACK_SYNC) + in_use > bch_cutoff_writeback_sync) return false; if (dc->partial_stripes_expensive && @@ -73,7 +81,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, return (op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO) || - in_use <= CUTOFF_WRITEBACK); + in_use <= bch_cutoff_writeback); } static inline void bch_writeback_queue(struct cached_dev *dc) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 224d44503a06..95c6d86ab5e8 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -65,7 +65,6 @@ struct mapped_device { */ struct work_struct work; wait_queue_head_t wait; - atomic_t pending[2]; spinlock_t deferred_lock; struct bio_list deferred; @@ -107,9 +106,6 @@ struct mapped_device { struct block_device *bdev; - /* zero-length flush that will be cloned and submitted to targets */ - struct bio flush_bio; - struct dm_stats stats; /* for blk-mq request-based DM support */ @@ -119,7 +115,6 @@ struct mapped_device { struct srcu_struct io_barrier; }; -int md_in_flight(struct mapped_device *md); void disable_write_same(struct mapped_device *md); void disable_write_zeroes(struct mapped_device *md); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 7cd36e4d1310..4e06be4f0a62 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -43,7 +43,7 @@ static unsigned dm_get_blk_mq_queue_depth(void) int dm_request_based(struct mapped_device *md) { - return queue_is_rq_based(md->queue); + return queue_is_mq(md->queue); } void dm_start_queue(struct request_queue *q) @@ -130,10 +130,8 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) */ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { - atomic_dec(&md->pending[rw]); - /* nudge anyone waiting on suspend queue */ - if (!md_in_flight(md)) + if (unlikely(waitqueue_active(&md->wait))) wake_up(&md->wait); /* @@ -436,7 +434,6 @@ ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, static void dm_start_request(struct mapped_device *md, struct request *orig) { blk_mq_start_request(orig); - atomic_inc(&md->pending[rq_data_dir(orig)]); if (unlikely(dm_stats_used(&md->stats))) { struct dm_rq_target_io *tio = tio_from_request(orig); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 9038c302d5c2..844f7d0f2ef8 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -919,12 +919,12 @@ static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev, struct request_queue *q = bdev_get_queue(dev->bdev); struct verify_rq_based_data *v = data; - if (q->mq_ops) + if (queue_is_mq(q)) v->mq_count++; else v->sq_count++; - return queue_is_rq_based(q); + return queue_is_mq(q); } static int dm_table_determine_type(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 63a7c416b224..a4a06982ed91 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -646,26 +646,38 @@ static void free_tio(struct dm_target_io *tio) bio_put(&tio->clone); } -int md_in_flight(struct mapped_device *md) +static bool md_in_flight_bios(struct mapped_device *md) { - return atomic_read(&md->pending[READ]) + - atomic_read(&md->pending[WRITE]); + int cpu; + struct hd_struct *part = &dm_disk(md)->part0; + long sum = 0; + + for_each_possible_cpu(cpu) { + sum += part_stat_local_read_cpu(part, in_flight[0], cpu); + sum += part_stat_local_read_cpu(part, in_flight[1], cpu); + } + + return sum != 0; +} + +static bool md_in_flight(struct mapped_device *md) +{ + if (queue_is_mq(md->queue)) + return blk_mq_queue_inflight(md->queue); + else + return md_in_flight_bios(md); } static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; struct bio *bio = io->orig_bio; - int rw = bio_data_dir(bio); io->start_time = jiffies; generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio), &dm_disk(md)->part0); - atomic_set(&dm_disk(md)->part0.in_flight[rw], - atomic_inc_return(&md->pending[rw])); - if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), bio->bi_iter.bi_sector, bio_sectors(bio), @@ -677,8 +689,6 @@ static void end_io_acct(struct dm_io *io) struct mapped_device *md = io->md; struct bio *bio = io->orig_bio; unsigned long duration = jiffies - io->start_time; - int pending; - int rw = bio_data_dir(bio); generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0, io->start_time); @@ -688,16 +698,8 @@ static void end_io_acct(struct dm_io *io) bio->bi_iter.bi_sector, bio_sectors(bio), true, duration, &io->stats_aux); - /* - * After this is decremented the bio must not be touched if it is - * a flush. - */ - pending = atomic_dec_return(&md->pending[rw]); - atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); - pending += atomic_read(&md->pending[rw^0x1]); - /* nudge anyone waiting on suspend queue */ - if (!pending) + if (unlikely(waitqueue_active(&md->wait))) wake_up(&md->wait); } @@ -1417,10 +1419,21 @@ static int __send_empty_flush(struct clone_info *ci) unsigned target_nr = 0; struct dm_target *ti; + /* + * Empty flush uses a statically initialized bio, as the base for + * cloning. However, blkg association requires that a bdev is + * associated with a gendisk, which doesn't happen until the bdev is + * opened. So, blkg association is done at issue time of the flush + * rather than when the device is created in alloc_dev(). + */ + bio_set_dev(ci->bio, ci->io->md->bdev); + BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + bio_disassociate_blkg(ci->bio); + return 0; } @@ -1598,7 +1611,16 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, init_clone_info(&ci, md, map, bio); if (bio->bi_opf & REQ_PREFLUSH) { - ci.bio = &ci.io->md->flush_bio; + struct bio flush_bio; + + /* + * Use an on-stack bio for this, it's safe since we don't + * need to reference it after submit. It's just used as + * the basis for the clone(s). + */ + bio_init(&flush_bio, NULL, 0); + flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + ci.bio = &flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ @@ -1654,7 +1676,16 @@ static blk_qc_t __process_bio(struct mapped_device *md, init_clone_info(&ci, md, map, bio); if (bio->bi_opf & REQ_PREFLUSH) { - ci.bio = &ci.io->md->flush_bio; + struct bio flush_bio; + + /* + * Use an on-stack bio for this, it's safe since we don't + * need to reference it after submit. It's just used as + * the basis for the clone(s). + */ + bio_init(&flush_bio, NULL, 0); + flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + ci.bio = &flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ @@ -1898,7 +1929,7 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL); + md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); if (!md->queue) goto bad; md->queue->queuedata = md; @@ -1908,8 +1939,6 @@ static struct mapped_device *alloc_dev(int minor) if (!md->disk) goto bad; - atomic_set(&md->pending[0], 0); - atomic_set(&md->pending[1], 0); init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); @@ -1940,10 +1969,6 @@ static struct mapped_device *alloc_dev(int minor) if (!md->bdev) goto bad; - bio_init(&md->flush_bio, NULL, 0); - bio_set_dev(&md->flush_bio, md->bdev); - md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; - dm_stats_init(&md->stats); /* Populate the mapping, nobody knows we exist yet */ diff --git a/drivers/md/md.c b/drivers/md/md.c index fc488cb30a94..9a0a1e0934d5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -334,7 +334,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) const int sgrp = op_stat_group(bio_op(bio)); struct mddev *mddev = q->queuedata; unsigned int sectors; - int cpu; blk_queue_split(q, &bio); @@ -359,9 +358,9 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) md_handle_request(mddev, bio); - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors); + part_stat_lock(); + part_stat_inc(&mddev->gendisk->part0, ios[sgrp]); + part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors); part_stat_unlock(); return BLK_QC_T_NONE; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ac1cffd2a09b..f3fb5bb8c82a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) !discard_bio) continue; bio_chain(discard_bio, bio); - bio_clone_blkcg_association(discard_bio, bio); + bio_clone_blkg_association(discard_bio, bio); if (mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rdev->bdev), discard_bio, disk_devt(mddev->gendisk), diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 8a02f11076f9..82daccc9ea62 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -15,7 +15,7 @@ #define pr_fmt(fmt) DRIVER_NAME ": " fmt #include -#include +#include #include #include #include @@ -1873,69 +1873,65 @@ static void msb_io_work(struct work_struct *work) struct msb_data *msb = container_of(work, struct msb_data, io_work); int page, error, len; sector_t lba; - unsigned long flags; struct scatterlist *sg = msb->prealloc_sg; + struct request *req; dbg_verbose("IO: work started"); while (1) { - spin_lock_irqsave(&msb->q_lock, flags); + spin_lock_irq(&msb->q_lock); if (msb->need_flush_cache) { msb->need_flush_cache = false; - spin_unlock_irqrestore(&msb->q_lock, flags); + spin_unlock_irq(&msb->q_lock); msb_cache_flush(msb); continue; } - if (!msb->req) { - msb->req = blk_fetch_request(msb->queue); - if (!msb->req) { - dbg_verbose("IO: no more requests exiting"); - spin_unlock_irqrestore(&msb->q_lock, flags); - return; - } + req = msb->req; + if (!req) { + dbg_verbose("IO: no more requests exiting"); + spin_unlock_irq(&msb->q_lock); + return; } - spin_unlock_irqrestore(&msb->q_lock, flags); - - /* If card was removed meanwhile */ - if (!msb->req) - return; + spin_unlock_irq(&msb->q_lock); /* process the request */ dbg_verbose("IO: processing new request"); - blk_rq_map_sg(msb->queue, msb->req, sg); + blk_rq_map_sg(msb->queue, req, sg); - lba = blk_rq_pos(msb->req); + lba = blk_rq_pos(req); sector_div(lba, msb->page_size / 512); page = sector_div(lba, msb->pages_in_block); if (rq_data_dir(msb->req) == READ) error = msb_do_read_request(msb, lba, page, sg, - blk_rq_bytes(msb->req), &len); + blk_rq_bytes(req), &len); else error = msb_do_write_request(msb, lba, page, sg, - blk_rq_bytes(msb->req), &len); + blk_rq_bytes(req), &len); - spin_lock_irqsave(&msb->q_lock, flags); - - if (len) - if (!__blk_end_request(msb->req, BLK_STS_OK, len)) - msb->req = NULL; + if (len && !blk_update_request(req, BLK_STS_OK, len)) { + __blk_mq_end_request(req, BLK_STS_OK); + spin_lock_irq(&msb->q_lock); + msb->req = NULL; + spin_unlock_irq(&msb->q_lock); + } if (error && msb->req) { blk_status_t ret = errno_to_blk_status(error); + dbg_verbose("IO: ending one sector of the request with error"); - if (!__blk_end_request(msb->req, ret, msb->page_size)) - msb->req = NULL; + blk_mq_end_request(req, ret); + spin_lock_irq(&msb->q_lock); + msb->req = NULL; + spin_unlock_irq(&msb->q_lock); } if (msb->req) dbg_verbose("IO: request still pending"); - - spin_unlock_irqrestore(&msb->q_lock, flags); } } @@ -2002,29 +1998,40 @@ static int msb_bd_getgeo(struct block_device *bdev, return 0; } -static void msb_submit_req(struct request_queue *q) +static blk_status_t msb_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct memstick_dev *card = q->queuedata; + struct memstick_dev *card = hctx->queue->queuedata; struct msb_data *msb = memstick_get_drvdata(card); - struct request *req = NULL; + struct request *req = bd->rq; dbg_verbose("Submit request"); + spin_lock_irq(&msb->q_lock); + if (msb->card_dead) { dbg("Refusing requests on removed card"); WARN_ON(!msb->io_queue_stopped); - while ((req = blk_fetch_request(q)) != NULL) - __blk_end_request_all(req, BLK_STS_IOERR); - return; + spin_unlock_irq(&msb->q_lock); + blk_mq_start_request(req); + return BLK_STS_IOERR; } - if (msb->req) - return; + if (msb->req) { + spin_unlock_irq(&msb->q_lock); + return BLK_STS_DEV_RESOURCE; + } + + blk_mq_start_request(req); + msb->req = req; if (!msb->io_queue_stopped) queue_work(msb->io_queue, &msb->io_work); + + spin_unlock_irq(&msb->q_lock); + return BLK_STS_OK; } static int msb_check_card(struct memstick_dev *card) @@ -2040,21 +2047,20 @@ static void msb_stop(struct memstick_dev *card) dbg("Stopping all msblock IO"); + blk_mq_stop_hw_queues(msb->queue); spin_lock_irqsave(&msb->q_lock, flags); - blk_stop_queue(msb->queue); msb->io_queue_stopped = true; spin_unlock_irqrestore(&msb->q_lock, flags); del_timer_sync(&msb->cache_flush_timer); flush_workqueue(msb->io_queue); + spin_lock_irqsave(&msb->q_lock, flags); if (msb->req) { - spin_lock_irqsave(&msb->q_lock, flags); - blk_requeue_request(msb->queue, msb->req); + blk_mq_requeue_request(msb->req, false); msb->req = NULL; - spin_unlock_irqrestore(&msb->q_lock, flags); } - + spin_unlock_irqrestore(&msb->q_lock, flags); } static void msb_start(struct memstick_dev *card) @@ -2077,9 +2083,7 @@ static void msb_start(struct memstick_dev *card) msb->need_flush_cache = true; msb->io_queue_stopped = false; - spin_lock_irqsave(&msb->q_lock, flags); - blk_start_queue(msb->queue); - spin_unlock_irqrestore(&msb->q_lock, flags); + blk_mq_start_hw_queues(msb->queue); queue_work(msb->io_queue, &msb->io_work); @@ -2092,6 +2096,10 @@ static const struct block_device_operations msb_bdops = { .owner = THIS_MODULE }; +static const struct blk_mq_ops msb_mq_ops = { + .queue_rq = msb_queue_rq, +}; + /* Registers the block device */ static int msb_init_disk(struct memstick_dev *card) { @@ -2112,9 +2120,11 @@ static int msb_init_disk(struct memstick_dev *card) goto out_release_id; } - msb->queue = blk_init_queue(msb_submit_req, &msb->q_lock); - if (!msb->queue) { - rc = -ENOMEM; + msb->queue = blk_mq_init_sq_queue(&msb->tag_set, &msb_mq_ops, 2, + BLK_MQ_F_SHOULD_MERGE); + if (IS_ERR(msb->queue)) { + rc = PTR_ERR(msb->queue); + msb->queue = NULL; goto out_put_disk; } @@ -2202,12 +2212,13 @@ static void msb_remove(struct memstick_dev *card) /* Take care of unhandled + new requests from now on */ spin_lock_irqsave(&msb->q_lock, flags); msb->card_dead = true; - blk_start_queue(msb->queue); spin_unlock_irqrestore(&msb->q_lock, flags); + blk_mq_start_hw_queues(msb->queue); /* Remove the disk */ del_gendisk(msb->disk); blk_cleanup_queue(msb->queue); + blk_mq_free_tag_set(&msb->tag_set); msb->queue = NULL; mutex_lock(&msb_disk_lock); diff --git a/drivers/memstick/core/ms_block.h b/drivers/memstick/core/ms_block.h index 53962c3b21df..9ba84e0ced63 100644 --- a/drivers/memstick/core/ms_block.h +++ b/drivers/memstick/core/ms_block.h @@ -152,6 +152,7 @@ struct msb_data { struct gendisk *disk; struct request_queue *queue; spinlock_t q_lock; + struct blk_mq_tag_set tag_set; struct hd_geometry geometry; struct attribute_group attr_group; struct request *req; diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 0cd30dcb6801..aba50ec98b4d 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -12,7 +12,7 @@ * */ -#include +#include #include #include #include @@ -142,6 +142,7 @@ struct mspro_block_data { struct gendisk *disk; struct request_queue *queue; struct request *block_req; + struct blk_mq_tag_set tag_set; spinlock_t q_lock; unsigned short page_size; @@ -152,7 +153,6 @@ struct mspro_block_data { unsigned char system; unsigned char read_only:1, eject:1, - has_request:1, data_dir:1, active:1; unsigned char transfer_cmd; @@ -694,13 +694,12 @@ static void h_mspro_block_setup_cmd(struct memstick_dev *card, u64 offset, /*** Data transfer ***/ -static int mspro_block_issue_req(struct memstick_dev *card, int chunk) +static int mspro_block_issue_req(struct memstick_dev *card, bool chunk) { struct mspro_block_data *msb = memstick_get_drvdata(card); u64 t_off; unsigned int count; -try_again: while (chunk) { msb->current_page = 0; msb->current_seg = 0; @@ -709,9 +708,17 @@ try_again: msb->req_sg); if (!msb->seg_count) { - chunk = __blk_end_request_cur(msb->block_req, - BLK_STS_RESOURCE); - continue; + unsigned int bytes = blk_rq_cur_bytes(msb->block_req); + + chunk = blk_update_request(msb->block_req, + BLK_STS_RESOURCE, + bytes); + if (chunk) + continue; + __blk_mq_end_request(msb->block_req, + BLK_STS_RESOURCE); + msb->block_req = NULL; + break; } t_off = blk_rq_pos(msb->block_req); @@ -729,30 +736,22 @@ try_again: return 0; } - dev_dbg(&card->dev, "blk_fetch\n"); - msb->block_req = blk_fetch_request(msb->queue); - if (!msb->block_req) { - dev_dbg(&card->dev, "issue end\n"); - return -EAGAIN; - } - - dev_dbg(&card->dev, "trying again\n"); - chunk = 1; - goto try_again; + return 1; } static int mspro_block_complete_req(struct memstick_dev *card, int error) { struct mspro_block_data *msb = memstick_get_drvdata(card); - int chunk, cnt; + int cnt; + bool chunk; unsigned int t_len = 0; unsigned long flags; spin_lock_irqsave(&msb->q_lock, flags); - dev_dbg(&card->dev, "complete %d, %d\n", msb->has_request ? 1 : 0, + dev_dbg(&card->dev, "complete %d, %d\n", msb->block_req ? 1 : 0, error); - if (msb->has_request) { + if (msb->block_req) { /* Nothing to do - not really an error */ if (error == -EAGAIN) error = 0; @@ -777,15 +776,17 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error) if (error && !t_len) t_len = blk_rq_cur_bytes(msb->block_req); - chunk = __blk_end_request(msb->block_req, + chunk = blk_update_request(msb->block_req, errno_to_blk_status(error), t_len); - - error = mspro_block_issue_req(card, chunk); - - if (!error) - goto out; - else - msb->has_request = 0; + if (chunk) { + error = mspro_block_issue_req(card, chunk); + if (!error) + goto out; + } else { + __blk_mq_end_request(msb->block_req, + errno_to_blk_status(error)); + msb->block_req = NULL; + } } else { if (!error) error = -EAGAIN; @@ -806,8 +807,8 @@ static void mspro_block_stop(struct memstick_dev *card) while (1) { spin_lock_irqsave(&msb->q_lock, flags); - if (!msb->has_request) { - blk_stop_queue(msb->queue); + if (!msb->block_req) { + blk_mq_stop_hw_queues(msb->queue); rc = 1; } spin_unlock_irqrestore(&msb->q_lock, flags); @@ -822,32 +823,37 @@ static void mspro_block_stop(struct memstick_dev *card) static void mspro_block_start(struct memstick_dev *card) { struct mspro_block_data *msb = memstick_get_drvdata(card); - unsigned long flags; - spin_lock_irqsave(&msb->q_lock, flags); - blk_start_queue(msb->queue); - spin_unlock_irqrestore(&msb->q_lock, flags); + blk_mq_start_hw_queues(msb->queue); } -static void mspro_block_submit_req(struct request_queue *q) +static blk_status_t mspro_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct memstick_dev *card = q->queuedata; + struct memstick_dev *card = hctx->queue->queuedata; struct mspro_block_data *msb = memstick_get_drvdata(card); - struct request *req = NULL; - if (msb->has_request) - return; + spin_lock_irq(&msb->q_lock); - if (msb->eject) { - while ((req = blk_fetch_request(q)) != NULL) - __blk_end_request_all(req, BLK_STS_IOERR); - - return; + if (msb->block_req) { + spin_unlock_irq(&msb->q_lock); + return BLK_STS_DEV_RESOURCE; } - msb->has_request = 1; - if (mspro_block_issue_req(card, 0)) - msb->has_request = 0; + if (msb->eject) { + spin_unlock_irq(&msb->q_lock); + blk_mq_start_request(bd->rq); + return BLK_STS_IOERR; + } + + msb->block_req = bd->rq; + blk_mq_start_request(bd->rq); + + if (mspro_block_issue_req(card, true)) + msb->block_req = NULL; + + spin_unlock_irq(&msb->q_lock); + return BLK_STS_OK; } /*** Initialization ***/ @@ -1167,6 +1173,10 @@ static int mspro_block_init_card(struct memstick_dev *card) } +static const struct blk_mq_ops mspro_mq_ops = { + .queue_rq = mspro_queue_rq, +}; + static int mspro_block_init_disk(struct memstick_dev *card) { struct mspro_block_data *msb = memstick_get_drvdata(card); @@ -1206,9 +1216,11 @@ static int mspro_block_init_disk(struct memstick_dev *card) goto out_release_id; } - msb->queue = blk_init_queue(mspro_block_submit_req, &msb->q_lock); - if (!msb->queue) { - rc = -ENOMEM; + msb->queue = blk_mq_init_sq_queue(&msb->tag_set, &mspro_mq_ops, 2, + BLK_MQ_F_SHOULD_MERGE); + if (IS_ERR(msb->queue)) { + rc = PTR_ERR(msb->queue); + msb->queue = NULL; goto out_put_disk; } @@ -1318,13 +1330,14 @@ static void mspro_block_remove(struct memstick_dev *card) spin_lock_irqsave(&msb->q_lock, flags); msb->eject = 1; - blk_start_queue(msb->queue); spin_unlock_irqrestore(&msb->q_lock, flags); + blk_mq_start_hw_queues(msb->queue); del_gendisk(msb->disk); dev_dbg(&card->dev, "mspro block remove\n"); blk_cleanup_queue(msb->queue); + blk_mq_free_tag_set(&msb->tag_set); msb->queue = NULL; sysfs_remove_group(&card->dev.kobj, &msb->attr_group); @@ -1344,8 +1357,9 @@ static int mspro_block_suspend(struct memstick_dev *card, pm_message_t state) struct mspro_block_data *msb = memstick_get_drvdata(card); unsigned long flags; + blk_mq_stop_hw_queues(msb->queue); + spin_lock_irqsave(&msb->q_lock, flags); - blk_stop_queue(msb->queue); msb->active = 0; spin_unlock_irqrestore(&msb->q_lock, flags); @@ -1355,7 +1369,6 @@ static int mspro_block_suspend(struct memstick_dev *card, pm_message_t state) static int mspro_block_resume(struct memstick_dev *card) { struct mspro_block_data *msb = memstick_get_drvdata(card); - unsigned long flags; int rc = 0; #ifdef CONFIG_MEMSTICK_UNSAFE_RESUME @@ -1401,9 +1414,7 @@ out_unlock: #endif /* CONFIG_MEMSTICK_UNSAFE_RESUME */ - spin_lock_irqsave(&msb->q_lock, flags); - blk_start_queue(msb->queue); - spin_unlock_irqrestore(&msb->q_lock, flags); + blk_mq_start_hw_queues(msb->queue); return rc; } diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 111934838da2..62e7619d5a4d 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -100,7 +100,6 @@ static DEFINE_IDA(mmc_rpmb_ida); * There is one mmc_blk_data per slot. */ struct mmc_blk_data { - spinlock_t lock; struct device *parent; struct gendisk *disk; struct mmc_queue queue; @@ -1488,7 +1487,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req) blk_mq_end_request(req, BLK_STS_OK); } - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(&mq->lock, flags); mq->in_flight[mmc_issue_type(mq, req)] -= 1; @@ -1496,7 +1495,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req) mmc_cqe_check_busy(mq); - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(&mq->lock, flags); if (!mq->cqe_busy) blk_mq_run_hw_queues(q, true); @@ -1993,17 +1992,16 @@ static void mmc_blk_mq_poll_completion(struct mmc_queue *mq, static void mmc_blk_mq_dec_in_flight(struct mmc_queue *mq, struct request *req) { - struct request_queue *q = req->q; unsigned long flags; bool put_card; - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(&mq->lock, flags); mq->in_flight[mmc_issue_type(mq, req)] -= 1; put_card = (mmc_tot_in_flight(mq) == 0); - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(&mq->lock, flags); if (put_card) mmc_put_card(mq->card, &mq->ctx); @@ -2099,11 +2097,11 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq) * request does not need to wait (although it does need to * complete complete_req first). */ - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(&mq->lock, flags); mq->complete_req = req; mq->rw_wait = false; waiting = mq->waiting; - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(&mq->lock, flags); /* * If 'waiting' then the waiting task will complete this @@ -2122,10 +2120,10 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq) /* Take the recovery path for errors or urgent background operations */ if (mmc_blk_rq_error(&mqrq->brq) || mmc_blk_urgent_bkops_needed(mq, mqrq)) { - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(&mq->lock, flags); mq->recovery_needed = true; mq->recovery_req = req; - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(&mq->lock, flags); wake_up(&mq->wait); schedule_work(&mq->recovery_work); return; @@ -2141,7 +2139,6 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq) static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err) { - struct request_queue *q = mq->queue; unsigned long flags; bool done; @@ -2149,7 +2146,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err) * Wait while there is another request in progress, but not if recovery * is needed. Also indicate whether there is a request waiting to start. */ - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(&mq->lock, flags); if (mq->recovery_needed) { *err = -EBUSY; done = true; @@ -2157,7 +2154,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err) done = !mq->rw_wait; } mq->waiting = !done; - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(&mq->lock, flags); return done; } @@ -2334,12 +2331,11 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, goto err_kfree; } - spin_lock_init(&md->lock); INIT_LIST_HEAD(&md->part); INIT_LIST_HEAD(&md->rpmbs); md->usage = 1; - ret = mmc_init_queue(&md->queue, card, &md->lock, subname); + ret = mmc_init_queue(&md->queue, card); if (ret) goto err_putdisk; diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 6edffeed9953..35cc138b096d 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -89,9 +89,9 @@ void mmc_cqe_recovery_notifier(struct mmc_request *mrq) struct mmc_queue *mq = q->queuedata; unsigned long flags; - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(&mq->lock, flags); __mmc_cqe_recovery_notifier(mq); - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(&mq->lock, flags); } static enum blk_eh_timer_return mmc_cqe_timed_out(struct request *req) @@ -128,14 +128,14 @@ static enum blk_eh_timer_return mmc_mq_timed_out(struct request *req, unsigned long flags; int ret; - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(&mq->lock, flags); if (mq->recovery_needed || !mq->use_cqe) ret = BLK_EH_RESET_TIMER; else ret = mmc_cqe_timed_out(req); - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(&mq->lock, flags); return ret; } @@ -157,9 +157,9 @@ static void mmc_mq_recovery_handler(struct work_struct *work) mq->in_recovery = false; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&mq->lock); mq->recovery_needed = false; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&mq->lock); mmc_put_card(mq->card, &mq->ctx); @@ -258,10 +258,10 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, issue_type = mmc_issue_type(mq, req); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&mq->lock); if (mq->recovery_needed || mq->busy) { - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&mq->lock); return BLK_STS_RESOURCE; } @@ -269,7 +269,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, case MMC_ISSUE_DCMD: if (mmc_cqe_dcmd_busy(mq)) { mq->cqe_busy |= MMC_CQE_DCMD_BUSY; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&mq->lock); return BLK_STS_RESOURCE; } break; @@ -294,7 +294,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, get_card = (mmc_tot_in_flight(mq) == 1); cqe_retune_ok = (mmc_cqe_qcnt(mq) == 1); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&mq->lock); if (!(req->rq_flags & RQF_DONTPREP)) { req_to_mmc_queue_req(req)->retries = 0; @@ -328,12 +328,12 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, if (issued != MMC_REQ_STARTED) { bool put_card = false; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&mq->lock); mq->in_flight[issue_type] -= 1; if (mmc_tot_in_flight(mq) == 0) put_card = true; mq->busy = false; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&mq->lock); if (put_card) mmc_put_card(card, &mq->ctx); } else { @@ -378,14 +378,37 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card) init_waitqueue_head(&mq->wait); } -static int mmc_mq_init_queue(struct mmc_queue *mq, int q_depth, - const struct blk_mq_ops *mq_ops, spinlock_t *lock) +/* Set queue depth to get a reasonable value for q->nr_requests */ +#define MMC_QUEUE_DEPTH 64 + +/** + * mmc_init_queue - initialise a queue structure. + * @mq: mmc queue + * @card: mmc card to attach this queue + * + * Initialise a MMC card request queue. + */ +int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card) { + struct mmc_host *host = card->host; int ret; + mq->card = card; + mq->use_cqe = host->cqe_enabled; + + spin_lock_init(&mq->lock); + memset(&mq->tag_set, 0, sizeof(mq->tag_set)); - mq->tag_set.ops = mq_ops; - mq->tag_set.queue_depth = q_depth; + mq->tag_set.ops = &mmc_mq_ops; + /* + * The queue depth for CQE must match the hardware because the request + * tag is used to index the hardware queue. + */ + if (mq->use_cqe) + mq->tag_set.queue_depth = + min_t(int, card->ext_csd.cmdq_depth, host->cqe_qdepth); + else + mq->tag_set.queue_depth = MMC_QUEUE_DEPTH; mq->tag_set.numa_node = NUMA_NO_NODE; mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING; @@ -403,68 +426,17 @@ static int mmc_mq_init_queue(struct mmc_queue *mq, int q_depth, goto free_tag_set; } - mq->queue->queue_lock = lock; mq->queue->queuedata = mq; + blk_queue_rq_timeout(mq->queue, 60 * HZ); + mmc_setup_queue(mq, card); return 0; free_tag_set: blk_mq_free_tag_set(&mq->tag_set); - return ret; } -/* Set queue depth to get a reasonable value for q->nr_requests */ -#define MMC_QUEUE_DEPTH 64 - -static int mmc_mq_init(struct mmc_queue *mq, struct mmc_card *card, - spinlock_t *lock) -{ - struct mmc_host *host = card->host; - int q_depth; - int ret; - - /* - * The queue depth for CQE must match the hardware because the request - * tag is used to index the hardware queue. - */ - if (mq->use_cqe) - q_depth = min_t(int, card->ext_csd.cmdq_depth, host->cqe_qdepth); - else - q_depth = MMC_QUEUE_DEPTH; - - ret = mmc_mq_init_queue(mq, q_depth, &mmc_mq_ops, lock); - if (ret) - return ret; - - blk_queue_rq_timeout(mq->queue, 60 * HZ); - - mmc_setup_queue(mq, card); - - return 0; -} - -/** - * mmc_init_queue - initialise a queue structure. - * @mq: mmc queue - * @card: mmc card to attach this queue - * @lock: queue lock - * @subname: partition subname - * - * Initialise a MMC card request queue. - */ -int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, - spinlock_t *lock, const char *subname) -{ - struct mmc_host *host = card->host; - - mq->card = card; - - mq->use_cqe = host->cqe_enabled; - - return mmc_mq_init(mq, card, lock); -} - void mmc_queue_suspend(struct mmc_queue *mq) { blk_mq_quiesce_queue(mq->queue); diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h index 9bf3c9245075..fd11491ced9f 100644 --- a/drivers/mmc/core/queue.h +++ b/drivers/mmc/core/queue.h @@ -77,6 +77,7 @@ struct mmc_queue { struct blk_mq_tag_set tag_set; struct mmc_blk_data *blkdata; struct request_queue *queue; + spinlock_t lock; int in_flight[MMC_ISSUE_MAX]; unsigned int cqe_busy; #define MMC_CQE_DCMD_BUSY BIT(0) @@ -95,8 +96,7 @@ struct mmc_queue { struct work_struct complete_work; }; -extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *, - const char *); +extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *); extern void mmc_cleanup_queue(struct mmc_queue *); extern void mmc_queue_suspend(struct mmc_queue *); extern void mmc_queue_resume(struct mmc_queue *); diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index 59dd50866932..5477a014e1fb 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -1322,7 +1322,7 @@ static int ath6kl_cfg80211_set_default_key(struct wiphy *wiphy, struct ath6kl_vif *vif = netdev_priv(ndev); struct ath6kl_key *key = NULL; u8 key_usage; - enum crypto_type key_type = NONE_CRYPT; + enum ath6kl_crypto_type key_type = NONE_CRYPT; ath6kl_dbg(ATH6KL_DBG_WLAN_CFG, "%s: index %d\n", __func__, key_index); diff --git a/drivers/net/wireless/ath/ath6kl/common.h b/drivers/net/wireless/ath/ath6kl/common.h index 4f82e8632d37..d6e5234f67a1 100644 --- a/drivers/net/wireless/ath/ath6kl/common.h +++ b/drivers/net/wireless/ath/ath6kl/common.h @@ -67,7 +67,7 @@ struct ath6kl_llc_snap_hdr { __be16 eth_type; } __packed; -enum crypto_type { +enum ath6kl_crypto_type { NONE_CRYPT = 0x01, WEP_CRYPT = 0x02, TKIP_CRYPT = 0x04, diff --git a/drivers/net/wireless/ath/ath6kl/wmi.c b/drivers/net/wireless/ath/ath6kl/wmi.c index 777acc564ac9..9d7ac1ab2d02 100644 --- a/drivers/net/wireless/ath/ath6kl/wmi.c +++ b/drivers/net/wireless/ath/ath6kl/wmi.c @@ -1849,9 +1849,9 @@ int ath6kl_wmi_connect_cmd(struct wmi *wmi, u8 if_idx, enum network_type nw_type, enum dot11_auth_mode dot11_auth_mode, enum auth_mode auth_mode, - enum crypto_type pairwise_crypto, + enum ath6kl_crypto_type pairwise_crypto, u8 pairwise_crypto_len, - enum crypto_type group_crypto, + enum ath6kl_crypto_type group_crypto, u8 group_crypto_len, int ssid_len, u8 *ssid, u8 *bssid, u16 channel, u32 ctrl_flags, u8 nw_subtype) @@ -2301,7 +2301,7 @@ int ath6kl_wmi_disctimeout_cmd(struct wmi *wmi, u8 if_idx, u8 timeout) } int ath6kl_wmi_addkey_cmd(struct wmi *wmi, u8 if_idx, u8 key_index, - enum crypto_type key_type, + enum ath6kl_crypto_type key_type, u8 key_usage, u8 key_len, u8 *key_rsc, unsigned int key_rsc_len, u8 *key_material, diff --git a/drivers/net/wireless/ath/ath6kl/wmi.h b/drivers/net/wireless/ath/ath6kl/wmi.h index a60bb49fe920..784940ba4c90 100644 --- a/drivers/net/wireless/ath/ath6kl/wmi.h +++ b/drivers/net/wireless/ath/ath6kl/wmi.h @@ -2556,9 +2556,9 @@ int ath6kl_wmi_connect_cmd(struct wmi *wmi, u8 if_idx, enum network_type nw_type, enum dot11_auth_mode dot11_auth_mode, enum auth_mode auth_mode, - enum crypto_type pairwise_crypto, + enum ath6kl_crypto_type pairwise_crypto, u8 pairwise_crypto_len, - enum crypto_type group_crypto, + enum ath6kl_crypto_type group_crypto, u8 group_crypto_len, int ssid_len, u8 *ssid, u8 *bssid, u16 channel, u32 ctrl_flags, u8 nw_subtype); @@ -2610,7 +2610,7 @@ int ath6kl_wmi_config_debug_module_cmd(struct wmi *wmi, u32 valid, u32 config); int ath6kl_wmi_get_stats_cmd(struct wmi *wmi, u8 if_idx); int ath6kl_wmi_addkey_cmd(struct wmi *wmi, u8 if_idx, u8 key_index, - enum crypto_type key_type, + enum ath6kl_crypto_type key_type, u8 key_usage, u8 key_len, u8 *key_rsc, unsigned int key_rsc_len, u8 *key_material, diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 0e39e3d1846f..f7019294740c 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -393,7 +393,7 @@ static int pmem_attach_disk(struct device *dev, return -EBUSY; } - q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev), NULL); + q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev)); if (!q) return -ENOMEM; diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 88a8b5916624..0f345e207675 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -57,3 +57,18 @@ config NVME_FC from https://github.com/linux-nvme/nvme-cli. If unsure, say N. + +config NVME_TCP + tristate "NVM Express over Fabrics TCP host driver" + depends on INET + depends on BLK_DEV_NVME + select NVME_FABRICS + help + This provides support for the NVMe over Fabrics protocol using + the TCP transport. This allows you to use remote block devices + exported using the NVMe protocol set. + + To configure a NVMe over Fabrics controller use the nvme-cli tool + from https://github.com/linux-nvme/nvme-cli. + + If unsure, say N. diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index aea459c65ae1..8a4b671c5f0c 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_BLK_DEV_NVME) += nvme.o obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o +obj-$(CONFIG_NVME_TCP) += nvme-tcp.o nvme-core-y := core.o nvme-core-$(CONFIG_TRACING) += trace.o @@ -21,3 +22,5 @@ nvme-fabrics-y += fabrics.o nvme-rdma-y += rdma.o nvme-fc-y += fc.o + +nvme-tcp-y += tcp.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 962012135b62..08f2c92602f4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -97,7 +97,6 @@ static dev_t nvme_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; -static void nvme_ns_remove(struct nvme_ns *ns); static int nvme_revalidate_disk(struct gendisk *disk); static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, @@ -245,12 +244,31 @@ static inline bool nvme_req_needs_retry(struct request *req) return true; } +static void nvme_retry_req(struct request *req) +{ + struct nvme_ns *ns = req->q->queuedata; + unsigned long delay = 0; + u16 crd; + + /* The mask and shift result must be <= 3 */ + crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11; + if (ns && crd) + delay = ns->ctrl->crdt[crd - 1] * 100; + + nvme_req(req)->retries++; + blk_mq_requeue_request(req, false); + blk_mq_delay_kick_requeue_list(req->q, delay); +} + void nvme_complete_rq(struct request *req) { blk_status_t status = nvme_error_status(req); trace_nvme_complete_rq(req); + if (nvme_req(req)->ctrl->kas) + nvme_req(req)->ctrl->comp_seen = true; + if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { if ((req->cmd_flags & REQ_NVME_MPATH) && blk_path_error(status)) { @@ -259,8 +277,7 @@ void nvme_complete_rq(struct request *req) } if (!blk_queue_dying(req->q)) { - nvme_req(req)->retries++; - blk_mq_requeue_request(req, true); + nvme_retry_req(req); return; } } @@ -268,14 +285,14 @@ void nvme_complete_rq(struct request *req) } EXPORT_SYMBOL_GPL(nvme_complete_rq); -void nvme_cancel_request(struct request *req, void *data, bool reserved) +bool nvme_cancel_request(struct request *req, void *data, bool reserved) { dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, "Cancelling I/O %d", req->tag); nvme_req(req)->status = NVME_SC_ABORT_REQ; blk_mq_complete_request(req); - + return true; } EXPORT_SYMBOL_GPL(nvme_cancel_request); @@ -536,7 +553,6 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, static inline void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { - memset(cmnd, 0, sizeof(*cmnd)); cmnd->common.opcode = nvme_cmd_flush; cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); } @@ -548,9 +564,19 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, struct nvme_dsm_range *range; struct bio *bio; - range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC); - if (!range) - return BLK_STS_RESOURCE; + range = kmalloc_array(segments, sizeof(*range), + GFP_ATOMIC | __GFP_NOWARN); + if (!range) { + /* + * If we fail allocation our range, fallback to the controller + * discard page. If that's also busy, it's safe to return + * busy, as we know we can make progress once that's freed. + */ + if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy)) + return BLK_STS_RESOURCE; + + range = page_address(ns->ctrl->discard_page); + } __rq_for_each_bio(bio, req) { u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector); @@ -565,11 +591,13 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, } if (WARN_ON_ONCE(n != segments)) { - kfree(range); + if (virt_to_page(range) == ns->ctrl->discard_page) + clear_bit_unlock(0, &ns->ctrl->discard_page_busy); + else + kfree(range); return BLK_STS_IOERR; } - memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); cmnd->dsm.nr = cpu_to_le32(segments - 1); @@ -598,7 +626,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - memset(cmnd, 0, sizeof(*cmnd)); cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); @@ -650,8 +677,13 @@ void nvme_cleanup_cmd(struct request *req) blk_rq_bytes(req) >> ns->lba_shift); } if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - kfree(page_address(req->special_vec.bv_page) + - req->special_vec.bv_offset); + struct nvme_ns *ns = req->rq_disk->private_data; + struct page *page = req->special_vec.bv_page; + + if (page == ns->ctrl->discard_page) + clear_bit_unlock(0, &ns->ctrl->discard_page_busy); + else + kfree(page_address(page) + req->special_vec.bv_offset); } } EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); @@ -663,6 +695,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, nvme_clear_nvme_request(req); + memset(cmd, 0, sizeof(*cmd)); switch (req_op(req)) { case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: @@ -691,6 +724,31 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, } EXPORT_SYMBOL_GPL(nvme_setup_cmd); +static void nvme_end_sync_rq(struct request *rq, blk_status_t error) +{ + struct completion *waiting = rq->end_io_data; + + rq->end_io_data = NULL; + complete(waiting); +} + +static void nvme_execute_rq_polled(struct request_queue *q, + struct gendisk *bd_disk, struct request *rq, int at_head) +{ + DECLARE_COMPLETION_ONSTACK(wait); + + WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)); + + rq->cmd_flags |= REQ_HIPRI; + rq->end_io_data = &wait; + blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq); + + while (!completion_done(&wait)) { + blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true); + cond_resched(); + } +} + /* * Returns 0 on success. If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code @@ -698,7 +756,7 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, unsigned timeout, int qid, int at_head, - blk_mq_req_flags_t flags) + blk_mq_req_flags_t flags, bool poll) { struct request *req; int ret; @@ -715,7 +773,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, goto out; } - blk_execute_rq(req->q, NULL, req, at_head); + if (poll) + nvme_execute_rq_polled(req->q, NULL, req, at_head); + else + blk_execute_rq(req->q, NULL, req, at_head); if (result) *result = nvme_req(req)->result; if (nvme_req(req)->flags & NVME_REQ_CANCELLED) @@ -732,7 +793,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buffer, unsigned bufflen) { return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, - NVME_QID_ANY, 0, 0); + NVME_QID_ANY, 0, 0, false); } EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); @@ -843,6 +904,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) return; } + ctrl->comp_seen = false; spin_lock_irqsave(&ctrl->lock, flags); if (ctrl->state == NVME_CTRL_LIVE || ctrl->state == NVME_CTRL_CONNECTING) @@ -873,6 +935,15 @@ static void nvme_keep_alive_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), struct nvme_ctrl, ka_work); + bool comp_seen = ctrl->comp_seen; + + if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) { + dev_dbg(ctrl->device, + "reschedule traffic based keep-alive timer\n"); + ctrl->comp_seen = false; + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); + return; + } if (nvme_keep_alive(ctrl)) { /* allocation failure, reset the controller */ @@ -1041,7 +1112,7 @@ static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword c.features.dword11 = cpu_to_le32(dword11); ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, - buffer, buflen, 0, NVME_QID_ANY, 0, 0); + buffer, buflen, 0, NVME_QID_ANY, 0, 0, false); if (ret >= 0 && result) *result = le32_to_cpu(res.u32); return ret; @@ -1240,12 +1311,12 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.nsid = cpu_to_le32(cmd.nsid); c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); - c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); - c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); - c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); - c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); - c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); + c.common.cdw10 = cpu_to_le32(cmd.cdw10); + c.common.cdw11 = cpu_to_le32(cmd.cdw11); + c.common.cdw12 = cpu_to_le32(cmd.cdw12); + c.common.cdw13 = cpu_to_le32(cmd.cdw13); + c.common.cdw14 = cpu_to_le32(cmd.cdw14); + c.common.cdw15 = cpu_to_le32(cmd.cdw15); if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); @@ -1524,8 +1595,6 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->noiob) nvme_set_chunk_size(ns); nvme_update_disk_info(disk, ns, id); - if (ns->ndev) - nvme_nvm_update_nvm_info(ns); #ifdef CONFIG_NVME_MULTIPATH if (ns->head->disk) { nvme_update_disk_info(ns->head->disk, ns, id); @@ -1608,7 +1677,7 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10, memset(&c, 0, sizeof(c)); c.common.opcode = op; c.common.nsid = cpu_to_le32(ns->head->ns_id); - c.common.cdw10[0] = cpu_to_le32(cdw10); + c.common.cdw10 = cpu_to_le32(cdw10); ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16); nvme_put_ns_from_disk(head, srcu_idx); @@ -1682,11 +1751,11 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, else cmd.common.opcode = nvme_admin_security_recv; cmd.common.nsid = 0; - cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); - cmd.common.cdw10[1] = cpu_to_le32(len); + cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); + cmd.common.cdw11 = cpu_to_le32(len); return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, - ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0); + ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false); } EXPORT_SYMBOL_GPL(nvme_sec_submit); #endif /* CONFIG_BLK_SED_OPAL */ @@ -1881,6 +1950,26 @@ static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) return ret; } +static int nvme_configure_acre(struct nvme_ctrl *ctrl) +{ + struct nvme_feat_host_behavior *host; + int ret; + + /* Don't bother enabling the feature if retry delay is not reported */ + if (!ctrl->crdt[0]) + return 0; + + host = kzalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return 0; + + host->acre = NVME_ENABLE_ACRE; + ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0, + host, sizeof(*host), NULL); + kfree(host); + return ret; +} + static int nvme_configure_apst(struct nvme_ctrl *ctrl) { /* @@ -2402,6 +2491,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS; } + ctrl->crdt[0] = le16_to_cpu(id->crdt1); + ctrl->crdt[1] = le16_to_cpu(id->crdt2); + ctrl->crdt[2] = le16_to_cpu(id->crdt3); + ctrl->oacs = le16_to_cpu(id->oacs); ctrl->oncs = le16_to_cpup(&id->oncs); ctrl->oaes = le32_to_cpu(id->oaes); @@ -2419,6 +2512,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->sgls = le32_to_cpu(id->sgls); ctrl->kas = le16_to_cpu(id->kas); ctrl->max_namespaces = le32_to_cpu(id->mnan); + ctrl->ctratt = le32_to_cpu(id->ctratt); if (id->rtd3e) { /* us -> s */ @@ -2501,6 +2595,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) if (ret < 0) return ret; + ret = nvme_configure_acre(ctrl); + if (ret < 0) + return ret; + ctrl->identified = true; return 0; @@ -2776,6 +2874,7 @@ static ssize_t field##_show(struct device *dev, \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); nvme_show_int_function(cntlid); +nvme_show_int_function(numa_node); static ssize_t nvme_sysfs_delete(struct device *dev, struct device_attribute *attr, const char *buf, @@ -2855,6 +2954,7 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_subsysnqn.attr, &dev_attr_address.attr, &dev_attr_state.attr, + &dev_attr_numa_node.attr, NULL }; @@ -3065,7 +3165,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) struct gendisk *disk; struct nvme_id_ns *id; char disk_name[DISK_NAME_LEN]; - int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT; + int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) @@ -3100,13 +3200,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_setup_streams_ns(ctrl, ns); nvme_set_disk_name(disk_name, ns, ctrl, &flags); - if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { - if (nvme_nvm_register(ns, disk_name, node)) { - dev_warn(ctrl->device, "LightNVM init failure\n"); - goto out_unlink_ns; - } - } - disk = alloc_disk_node(0, node); if (!disk) goto out_unlink_ns; @@ -3120,6 +3213,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) __nvme_revalidate_disk(disk, id); + if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { + if (nvme_nvm_register(ns, disk_name, node)) { + dev_warn(ctrl->device, "LightNVM init failure\n"); + goto out_put_disk; + } + } + down_write(&ctrl->namespaces_rwsem); list_add_tail(&ns->list, &ctrl->namespaces); up_write(&ctrl->namespaces_rwsem); @@ -3133,6 +3233,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) kfree(id); return; + out_put_disk: + put_disk(ns->disk); out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); @@ -3522,6 +3624,7 @@ static void nvme_free_ctrl(struct device *dev) ida_simple_remove(&nvme_instance_ida, ctrl->instance); kfree(ctrl->effects); nvme_mpath_uninit(ctrl); + __free_page(ctrl->discard_page); if (subsys) { mutex_lock(&subsys->lock); @@ -3562,6 +3665,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; + BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) > + PAGE_SIZE); + ctrl->discard_page = alloc_page(GFP_KERNEL); + if (!ctrl->discard_page) { + ret = -ENOMEM; + goto out; + } + ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); if (ret < 0) goto out; @@ -3599,6 +3710,8 @@ out_free_name: out_release_instance: ida_simple_remove(&nvme_instance_ida, ctrl->instance); out: + if (ctrl->discard_page) + __free_page(ctrl->discard_page); return ret; } EXPORT_SYMBOL_GPL(nvme_init_ctrl); @@ -3746,7 +3859,7 @@ out: return result; } -void nvme_core_exit(void) +void __exit nvme_core_exit(void) { ida_destroy(&nvme_subsystems_ida); class_destroy(nvme_subsys_class); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index bd0969db6225..b2ab213f43de 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -159,7 +159,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) cmd.prop_get.offset = cpu_to_le32(off); ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, - NVME_QID_ANY, 0, 0); + NVME_QID_ANY, 0, 0, false); if (ret >= 0) *val = le64_to_cpu(res.u64); @@ -206,7 +206,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) cmd.prop_get.offset = cpu_to_le32(off); ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, - NVME_QID_ANY, 0, 0); + NVME_QID_ANY, 0, 0, false); if (ret >= 0) *val = le64_to_cpu(res.u64); @@ -252,7 +252,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) cmd.prop_set.value = cpu_to_le64(val); ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0, - NVME_QID_ANY, 0, 0); + NVME_QID_ANY, 0, 0, false); if (unlikely(ret)) dev_err(ctrl->device, "Property Set error: %d, offset %#x\n", @@ -392,6 +392,9 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 : cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000); + if (ctrl->opts->disable_sqflow) + cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; @@ -403,7 +406,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, data, sizeof(*data), 0, NVME_QID_ANY, 1, - BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false); if (ret) { nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), &cmd, data); @@ -438,7 +441,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue); * > 0: NVMe error status code * < 0: Linux errno error code */ -int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid, bool poll) { struct nvme_command cmd; struct nvmf_connect_data *data; @@ -451,6 +454,9 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) cmd.connect.qid = cpu_to_le16(qid); cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize); + if (ctrl->opts->disable_sqflow) + cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; @@ -462,7 +468,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res, data, sizeof(*data), 0, qid, 1, - BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, poll); if (ret) { nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), &cmd, data); @@ -607,6 +613,11 @@ static const match_table_t opt_tokens = { { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, { NVMF_OPT_HOST_ID, "hostid=%s" }, { NVMF_OPT_DUP_CONNECT, "duplicate_connect" }, + { NVMF_OPT_DISABLE_SQFLOW, "disable_sqflow" }, + { NVMF_OPT_HDR_DIGEST, "hdr_digest" }, + { NVMF_OPT_DATA_DIGEST, "data_digest" }, + { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" }, + { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" }, { NVMF_OPT_ERR, NULL } }; @@ -626,6 +637,8 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; opts->kato = NVME_DEFAULT_KATO; opts->duplicate_connect = false; + opts->hdr_digest = false; + opts->data_digest = false; options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -817,6 +830,39 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, case NVMF_OPT_DUP_CONNECT: opts->duplicate_connect = true; break; + case NVMF_OPT_DISABLE_SQFLOW: + opts->disable_sqflow = true; + break; + case NVMF_OPT_HDR_DIGEST: + opts->hdr_digest = true; + break; + case NVMF_OPT_DATA_DIGEST: + opts->data_digest = true; + break; + case NVMF_OPT_NR_WRITE_QUEUES: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid nr_write_queues %d\n", token); + ret = -EINVAL; + goto out; + } + opts->nr_write_queues = token; + break; + case NVMF_OPT_NR_POLL_QUEUES: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid nr_poll_queues %d\n", token); + ret = -EINVAL; + goto out; + } + opts->nr_poll_queues = token; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -933,7 +979,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options); #define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) #define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ - NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT) + NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\ + NVMF_OPT_DISABLE_SQFLOW) static struct nvme_ctrl * nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 6ea6275f332a..478343b73e38 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -58,6 +58,11 @@ enum { NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, NVMF_OPT_HOST_ID = 1 << 12, NVMF_OPT_DUP_CONNECT = 1 << 13, + NVMF_OPT_DISABLE_SQFLOW = 1 << 14, + NVMF_OPT_HDR_DIGEST = 1 << 15, + NVMF_OPT_DATA_DIGEST = 1 << 16, + NVMF_OPT_NR_WRITE_QUEUES = 1 << 17, + NVMF_OPT_NR_POLL_QUEUES = 1 << 18, }; /** @@ -85,6 +90,11 @@ enum { * @max_reconnects: maximum number of allowed reconnect attempts before removing * the controller, (-1) means reconnect forever, zero means remove * immediately; + * @disable_sqflow: disable controller sq flow control + * @hdr_digest: generate/verify header digest (TCP) + * @data_digest: generate/verify data digest (TCP) + * @nr_write_queues: number of queues for write I/O + * @nr_poll_queues: number of queues for polling I/O */ struct nvmf_ctrl_options { unsigned mask; @@ -101,6 +111,11 @@ struct nvmf_ctrl_options { unsigned int kato; struct nvmf_host *host; int max_reconnects; + bool disable_sqflow; + bool hdr_digest; + bool data_digest; + unsigned int nr_write_queues; + unsigned int nr_poll_queues; }; /* @@ -156,7 +171,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl); -int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid); +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid, bool poll); int nvmf_register_transport(struct nvmf_transport_ops *ops); void nvmf_unregister_transport(struct nvmf_transport_ops *ops); void nvmf_free_options(struct nvmf_ctrl_options *opts); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index feb86b59170e..89accc76d71c 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1975,7 +1975,7 @@ nvme_fc_connect_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize) (qsize / 5)); if (ret) break; - ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + ret = nvmf_connect_io_queue(&ctrl->ctrl, i, false); if (ret) break; @@ -2326,38 +2326,6 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir); } -static struct blk_mq_tags * -nvme_fc_tagset(struct nvme_fc_queue *queue) -{ - if (queue->qnum == 0) - return queue->ctrl->admin_tag_set.tags[queue->qnum]; - - return queue->ctrl->tag_set.tags[queue->qnum - 1]; -} - -static int -nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) - -{ - struct nvme_fc_queue *queue = hctx->driver_data; - struct nvme_fc_ctrl *ctrl = queue->ctrl; - struct request *req; - struct nvme_fc_fcp_op *op; - - req = blk_mq_tag_to_rq(nvme_fc_tagset(queue), tag); - if (!req) - return 0; - - op = blk_mq_rq_to_pdu(req); - - if ((atomic_read(&op->state) == FCPOP_STATE_ACTIVE) && - (ctrl->lport->ops->poll_queue)) - ctrl->lport->ops->poll_queue(&ctrl->lport->localport, - queue->lldd_handle); - - return ((atomic_read(&op->state) != FCPOP_STATE_ACTIVE)); -} - static void nvme_fc_submit_async_event(struct nvme_ctrl *arg) { @@ -2410,7 +2378,7 @@ nvme_fc_complete_rq(struct request *rq) * status. The done path will return the io request back to the block * layer with an error status. */ -static void +static bool nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) { struct nvme_ctrl *nctrl = data; @@ -2418,6 +2386,7 @@ nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req); __nvme_fc_abort_op(ctrl, op); + return true; } @@ -2427,7 +2396,6 @@ static const struct blk_mq_ops nvme_fc_mq_ops = { .init_request = nvme_fc_init_request, .exit_request = nvme_fc_exit_request, .init_hctx = nvme_fc_init_hctx, - .poll = nvme_fc_poll, .timeout = nvme_fc_timeout, }; @@ -2457,7 +2425,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) ctrl->tag_set.ops = &nvme_fc_mq_ops; ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; ctrl->tag_set.reserved_tags = 1; /* fabric connect */ - ctrl->tag_set.numa_node = NUMA_NO_NODE; + ctrl->tag_set.numa_node = ctrl->ctrl.numa_node; ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ctrl->tag_set.cmd_size = struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, @@ -3050,6 +3018,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->ctrl.opts = opts; ctrl->ctrl.nr_reconnects = 0; + ctrl->ctrl.numa_node = dev_to_node(lport->dev); INIT_LIST_HEAD(&ctrl->ctrl_list); ctrl->lport = lport; ctrl->rport = rport; @@ -3090,7 +3059,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */ - ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; + ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node; ctrl->admin_tag_set.cmd_size = struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, ctrl->lport->ops->fcprqst_priv_sz); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index a4f3b263cd6c..b759c25c89c8 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -577,7 +577,8 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, struct ppa_addr ppa; size_t left = nchks * sizeof(struct nvme_nvm_chk_meta); size_t log_pos, offset, len; - int ret, i, max_len; + int i, max_len; + int ret = 0; /* * limit requests to maximum 256K to avoid issuing arbitrary large @@ -731,11 +732,12 @@ static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd) return ret; } -static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) +static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name, + int size) { struct nvme_ns *ns = nvmdev->q->queuedata; - return dma_pool_create(name, ns->ctrl->dev, PAGE_SIZE, PAGE_SIZE, 0); + return dma_pool_create(name, ns->ctrl->dev, size, PAGE_SIZE, 0); } static void nvme_nvm_destroy_dma_pool(void *pool) @@ -935,9 +937,9 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, /* cdw11-12 */ c.ph_rw.length = cpu_to_le16(vcmd.nppas); c.ph_rw.control = cpu_to_le16(vcmd.control); - c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13); - c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14); - c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15); + c.common.cdw13 = cpu_to_le32(vcmd.cdw13); + c.common.cdw14 = cpu_to_le32(vcmd.cdw14); + c.common.cdw15 = cpu_to_le32(vcmd.cdw15); if (vcmd.timeout_ms) timeout = msecs_to_jiffies(vcmd.timeout_ms); @@ -972,22 +974,11 @@ int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg) } } -void nvme_nvm_update_nvm_info(struct nvme_ns *ns) -{ - struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; - - if (geo->version == NVM_OCSSD_SPEC_12) - return; - - geo->csecs = 1 << ns->lba_shift; - geo->sos = ns->ms; -} - int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) { struct request_queue *q = ns->queue; struct nvm_dev *dev; + struct nvm_geo *geo; _nvme_nvm_check_size(); @@ -995,6 +986,12 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) if (!dev) return -ENOMEM; + /* Note that csecs and sos will be overridden if it is a 1.2 drive. */ + geo = &dev->geo; + geo->csecs = 1 << ns->lba_shift; + geo->sos = ns->ms; + geo->ext = ns->ext; + dev->q = q; memcpy(dev->name, disk_name, DISK_NAME_LEN); dev->ops = &nvme_nvm_dev_ops; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 9901afd804ce..183ec17ba067 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -141,7 +141,7 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) test_bit(NVME_NS_ANA_PENDING, &ns->flags)) continue; - distance = node_distance(node, dev_to_node(ns->ctrl->dev)); + distance = node_distance(node, ns->ctrl->numa_node); switch (ns->ana_state) { case NVME_ANA_OPTIMIZED: @@ -220,21 +220,6 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, return ret; } -static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) -{ - struct nvme_ns_head *head = q->queuedata; - struct nvme_ns *ns; - bool found = false; - int srcu_idx; - - srcu_idx = srcu_read_lock(&head->srcu); - ns = srcu_dereference(head->current_path[numa_node_id()], &head->srcu); - if (likely(ns && nvme_path_is_optimized(ns))) - found = ns->queue->poll_fn(q, qc); - srcu_read_unlock(&head->srcu, srcu_idx); - return found; -} - static void nvme_requeue_work(struct work_struct *work) { struct nvme_ns_head *head = @@ -276,12 +261,11 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) return 0; - q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL); + q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node); if (!q) goto out; q->queuedata = head; blk_queue_make_request(q, nvme_ns_head_make_request); - q->poll_fn = nvme_ns_head_poll; blk_queue_flag_set(QUEUE_FLAG_NONROT, q); /* set to a default value for 512 until disk is validated */ blk_queue_logical_block_size(q, 512); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 081cbdcce880..2b36ac922596 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -145,6 +145,7 @@ enum nvme_ctrl_state { }; struct nvme_ctrl { + bool comp_seen; enum nvme_ctrl_state state; bool identified; spinlock_t lock; @@ -153,6 +154,7 @@ struct nvme_ctrl { struct request_queue *connect_q; struct device *dev; int instance; + int numa_node; struct blk_mq_tag_set *tagset; struct blk_mq_tag_set *admin_tagset; struct list_head namespaces; @@ -179,6 +181,7 @@ struct nvme_ctrl { u32 page_size; u32 max_hw_sectors; u32 max_segments; + u16 crdt[3]; u16 oncs; u16 oacs; u16 nssa; @@ -193,6 +196,7 @@ struct nvme_ctrl { u8 apsta; u32 oaes; u32 aen_result; + u32 ctratt; unsigned int shutdown_timeout; unsigned int kato; bool subsystem; @@ -237,6 +241,9 @@ struct nvme_ctrl { u16 maxcmd; int nr_reconnects; struct nvmf_ctrl_options *opts; + + struct page *discard_page; + unsigned long discard_page_busy; }; struct nvme_subsystem { @@ -364,15 +371,6 @@ static inline void nvme_fault_inject_fini(struct nvme_ns *ns) {} static inline void nvme_should_fail(struct request *req) {} #endif -static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) -{ - u32 val = 0; - - if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) - return false; - return val & NVME_CSTS_RDY; -} - static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) { if (!ctrl->subsystem) @@ -408,7 +406,7 @@ static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl) } void nvme_complete_rq(struct request *req); -void nvme_cancel_request(struct request *req, void *data, bool reserved); +bool nvme_cancel_request(struct request *req, void *data, bool reserved); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state); int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); @@ -449,7 +447,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, unsigned timeout, int qid, int at_head, - blk_mq_req_flags_t flags); + blk_mq_req_flags_t flags, bool poll); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); @@ -545,13 +543,11 @@ static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl) #endif /* CONFIG_NVME_MULTIPATH */ #ifdef CONFIG_NVM -void nvme_nvm_update_nvm_info(struct nvme_ns *ns); int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); extern const struct attribute_group nvme_nvm_attr_group; int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg); #else -static inline void nvme_nvm_update_nvm_info(struct nvme_ns *ns) {}; static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) { @@ -572,6 +568,6 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) } int __init nvme_core_init(void); -void nvme_core_exit(void); +void __exit nvme_core_exit(void); #endif /* _NVME_H */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c33bb201b884..5a0bf6a24d50 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -32,6 +32,7 @@ #include #include +#include "trace.h" #include "nvme.h" #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) @@ -74,6 +75,22 @@ static int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); +static int queue_count_set(const char *val, const struct kernel_param *kp); +static const struct kernel_param_ops queue_count_ops = { + .set = queue_count_set, + .get = param_get_int, +}; + +static int write_queues; +module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644); +MODULE_PARM_DESC(write_queues, + "Number of queues to use for writes. If not set, reads and writes " + "will share a queue set."); + +static int poll_queues = 0; +module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644); +MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); + struct nvme_dev; struct nvme_queue; @@ -92,6 +109,7 @@ struct nvme_dev { struct dma_pool *prp_small_pool; unsigned online_queues; unsigned max_qid; + unsigned io_queues[HCTX_MAX_TYPES]; unsigned int num_vecs; int q_depth; u32 db_stride; @@ -105,7 +123,6 @@ struct nvme_dev { u32 cmbsz; u32 cmbloc; struct nvme_ctrl ctrl; - struct completion ioq_wait; mempool_t *iod_mempool; @@ -134,6 +151,17 @@ static int io_queue_depth_set(const char *val, const struct kernel_param *kp) return param_set_int(val, kp); } +static int queue_count_set(const char *val, const struct kernel_param *kp) +{ + int n = 0, ret; + + ret = kstrtoint(val, 10, &n); + if (n > num_possible_cpus()) + n = num_possible_cpus(); + + return param_set_int(val, kp); +} + static inline unsigned int sq_idx(unsigned int qid, u32 stride) { return qid * 2 * stride; @@ -158,8 +186,8 @@ struct nvme_queue { struct nvme_dev *dev; spinlock_t sq_lock; struct nvme_command *sq_cmds; - bool sq_cmds_is_io; - spinlock_t cq_lock ____cacheline_aligned_in_smp; + /* only used for poll queues: */ + spinlock_t cq_poll_lock ____cacheline_aligned_in_smp; volatile struct nvme_completion *cqes; struct blk_mq_tags **tags; dma_addr_t sq_dma_addr; @@ -168,14 +196,20 @@ struct nvme_queue { u16 q_depth; s16 cq_vector; u16 sq_tail; + u16 last_sq_tail; u16 cq_head; u16 last_cq_head; u16 qid; u8 cq_phase; + unsigned long flags; +#define NVMEQ_ENABLED 0 +#define NVMEQ_SQ_CMB 1 +#define NVMEQ_DELETE_ERROR 2 u32 *dbbuf_sq_db; u32 *dbbuf_cq_db; u32 *dbbuf_sq_ei; u32 *dbbuf_cq_ei; + struct completion delete_done; }; /* @@ -218,9 +252,20 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); } +static unsigned int max_io_queues(void) +{ + return num_possible_cpus() + write_queues + poll_queues; +} + +static unsigned int max_queue_count(void) +{ + /* IO queues + admin queue */ + return 1 + max_io_queues(); +} + static inline unsigned int nvme_dbbuf_size(u32 stride) { - return ((num_possible_cpus() + 1) * 8 * stride); + return (max_queue_count() * 8 * stride); } static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) @@ -431,30 +476,90 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, return 0; } +static int queue_irq_offset(struct nvme_dev *dev) +{ + /* if we have more than 1 vec, admin queue offsets us by 1 */ + if (dev->num_vecs > 1) + return 1; + + return 0; +} + static int nvme_pci_map_queues(struct blk_mq_tag_set *set) { struct nvme_dev *dev = set->driver_data; + int i, qoff, offset; - return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), - dev->num_vecs > 1 ? 1 /* admin queue */ : 0); + offset = queue_irq_offset(dev); + for (i = 0, qoff = 0; i < set->nr_maps; i++) { + struct blk_mq_queue_map *map = &set->map[i]; + + map->nr_queues = dev->io_queues[i]; + if (!map->nr_queues) { + BUG_ON(i == HCTX_TYPE_DEFAULT); + continue; + } + + /* + * The poll queue(s) doesn't have an IRQ (and hence IRQ + * affinity), so use the regular blk-mq cpu mapping + */ + map->queue_offset = qoff; + if (i != HCTX_TYPE_POLL) + blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); + else + blk_mq_map_queues(map); + qoff += map->nr_queues; + offset += map->nr_queues; + } + + return 0; +} + +/* + * Write sq tail if we are asked to, or if the next command would wrap. + */ +static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq) +{ + if (!write_sq) { + u16 next_tail = nvmeq->sq_tail + 1; + + if (next_tail == nvmeq->q_depth) + next_tail = 0; + if (next_tail != nvmeq->last_sq_tail) + return; + } + + if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, + nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) + writel(nvmeq->sq_tail, nvmeq->q_db); + nvmeq->last_sq_tail = nvmeq->sq_tail; } /** * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use * @cmd: The command to send + * @write_sq: whether to write to the SQ doorbell */ -static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, + bool write_sq) { spin_lock(&nvmeq->sq_lock); - memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd)); - if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; - if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, - nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) - writel(nvmeq->sq_tail, nvmeq->q_db); + nvme_write_sq_db(nvmeq, write_sq); + spin_unlock(&nvmeq->sq_lock); +} + +static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + + spin_lock(&nvmeq->sq_lock); + if (nvmeq->sq_tail != nvmeq->last_sq_tail) + nvme_write_sq_db(nvmeq, true); spin_unlock(&nvmeq->sq_lock); } @@ -822,7 +927,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, * We should not need to do this, but we're still using this to * ensure we can drain requests on a dying queue. */ - if (unlikely(nvmeq->cq_vector < 0)) + if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) return BLK_STS_IOERR; ret = nvme_setup_cmd(ns, req, &cmnd); @@ -840,7 +945,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, } blk_mq_start_request(req); - nvme_submit_cmd(nvmeq, &cmnd); + nvme_submit_cmd(nvmeq, &cmnd, bd->last); return BLK_STS_OK; out_cleanup_iod: nvme_free_iod(dev, req); @@ -899,6 +1004,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) } req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); + trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail); nvme_end_request(req, cqe->status, cqe->result); } @@ -919,15 +1025,15 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) } } -static inline bool nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, - u16 *end, int tag) +static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, + u16 *end, unsigned int tag) { - bool found = false; + int found = 0; *start = nvmeq->cq_head; - while (!found && nvme_cqe_pending(nvmeq)) { - if (nvmeq->cqes[nvmeq->cq_head].command_id == tag) - found = true; + while (nvme_cqe_pending(nvmeq)) { + if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag) + found++; nvme_update_cq_head(nvmeq); } *end = nvmeq->cq_head; @@ -943,12 +1049,16 @@ static irqreturn_t nvme_irq(int irq, void *data) irqreturn_t ret = IRQ_NONE; u16 start, end; - spin_lock(&nvmeq->cq_lock); + /* + * The rmb/wmb pair ensures we see all updates from a previous run of + * the irq handler, even if that was on another CPU. + */ + rmb(); if (nvmeq->cq_head != nvmeq->last_cq_head) ret = IRQ_HANDLED; nvme_process_cq(nvmeq, &start, &end, -1); nvmeq->last_cq_head = nvmeq->cq_head; - spin_unlock(&nvmeq->cq_lock); + wmb(); if (start != end) { nvme_complete_cqes(nvmeq, start, end); @@ -966,29 +1076,52 @@ static irqreturn_t nvme_irq_check(int irq, void *data) return IRQ_NONE; } -static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) +/* + * Poll for completions any queue, including those not dedicated to polling. + * Can be called from any context. + */ +static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag) { + struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); + u16 start, end; + int found; + + /* + * For a poll queue we need to protect against the polling thread + * using the CQ lock. For normal interrupt driven threads we have + * to disable the interrupt to avoid racing with it. + */ + if (nvmeq->cq_vector == -1) { + spin_lock(&nvmeq->cq_poll_lock); + found = nvme_process_cq(nvmeq, &start, &end, tag); + spin_unlock(&nvmeq->cq_poll_lock); + } else { + disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); + found = nvme_process_cq(nvmeq, &start, &end, tag); + enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); + } + + nvme_complete_cqes(nvmeq, start, end); + return found; +} + +static int nvme_poll(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_queue *nvmeq = hctx->driver_data; u16 start, end; bool found; if (!nvme_cqe_pending(nvmeq)) return 0; - spin_lock_irq(&nvmeq->cq_lock); - found = nvme_process_cq(nvmeq, &start, &end, tag); - spin_unlock_irq(&nvmeq->cq_lock); + spin_lock(&nvmeq->cq_poll_lock); + found = nvme_process_cq(nvmeq, &start, &end, -1); + spin_unlock(&nvmeq->cq_poll_lock); nvme_complete_cqes(nvmeq, start, end); return found; } -static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) -{ - struct nvme_queue *nvmeq = hctx->driver_data; - - return __nvme_poll(nvmeq, tag); -} - static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); @@ -998,7 +1131,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; - nvme_submit_cmd(nvmeq, &c); + nvme_submit_cmd(nvmeq, &c, true); } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) @@ -1016,7 +1149,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, struct nvme_queue *nvmeq, s16 vector) { struct nvme_command c; - int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; + int flags = NVME_QUEUE_PHYS_CONTIG; + + if (vector != -1) + flags |= NVME_CQ_IRQ_ENABLED; /* * Note: we (ab)use the fact that the prp fields survive if no data @@ -1028,7 +1164,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cqid = cpu_to_le16(qid); c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); c.create_cq.cq_flags = cpu_to_le16(flags); - c.create_cq.irq_vector = cpu_to_le16(vector); + if (vector != -1) + c.create_cq.irq_vector = cpu_to_le16(vector); + else + c.create_cq.irq_vector = 0; return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } @@ -1157,7 +1296,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) /* * Did we miss an interrupt? */ - if (__nvme_poll(nvmeq, req->tag)) { + if (nvme_poll_irqdisable(nvmeq, req->tag)) { dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, completion polled\n", req->tag, nvmeq->qid); @@ -1237,17 +1376,15 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) { dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + if (!nvmeq->sq_cmds) + return; - if (nvmeq->sq_cmds) { - if (nvmeq->sq_cmds_is_io) - pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev), - nvmeq->sq_cmds, - SQ_SIZE(nvmeq->q_depth)); - else - dma_free_coherent(nvmeq->q_dmadev, - SQ_SIZE(nvmeq->q_depth), - nvmeq->sq_cmds, - nvmeq->sq_dma_addr); + if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) { + pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev), + nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth)); + } else { + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); } } @@ -1267,47 +1404,32 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) */ static int nvme_suspend_queue(struct nvme_queue *nvmeq) { - int vector; - - spin_lock_irq(&nvmeq->cq_lock); - if (nvmeq->cq_vector == -1) { - spin_unlock_irq(&nvmeq->cq_lock); + if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags)) return 1; - } - vector = nvmeq->cq_vector; - nvmeq->dev->online_queues--; - nvmeq->cq_vector = -1; - spin_unlock_irq(&nvmeq->cq_lock); - /* - * Ensure that nvme_queue_rq() sees it ->cq_vector == -1 without - * having to grab the lock. - */ + /* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */ mb(); + nvmeq->dev->online_queues--; if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); - - pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq); - + if (nvmeq->cq_vector == -1) + return 0; + pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); + nvmeq->cq_vector = -1; return 0; } static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { struct nvme_queue *nvmeq = &dev->queues[0]; - u16 start, end; if (shutdown) nvme_shutdown_ctrl(&dev->ctrl); else nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); - spin_lock_irq(&nvmeq->cq_lock); - nvme_process_cq(nvmeq, &start, &end, -1); - spin_unlock_irq(&nvmeq->cq_lock); - - nvme_complete_cqes(nvmeq, start, end); + nvme_poll_irqdisable(nvmeq, -1); } static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, @@ -1343,15 +1465,14 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth)); nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, nvmeq->sq_cmds); - nvmeq->sq_cmds_is_io = true; - } - - if (!nvmeq->sq_cmds) { - nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), - &nvmeq->sq_dma_addr, GFP_KERNEL); - nvmeq->sq_cmds_is_io = false; + if (nvmeq->sq_dma_addr) { + set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); + return 0; + } } + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), + &nvmeq->sq_dma_addr, GFP_KERNEL); if (!nvmeq->sq_cmds) return -ENOMEM; return 0; @@ -1375,7 +1496,7 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) nvmeq->q_dmadev = dev->dev; nvmeq->dev = dev; spin_lock_init(&nvmeq->sq_lock); - spin_lock_init(&nvmeq->cq_lock); + spin_lock_init(&nvmeq->cq_poll_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; @@ -1411,28 +1532,34 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) { struct nvme_dev *dev = nvmeq->dev; - spin_lock_irq(&nvmeq->cq_lock); nvmeq->sq_tail = 0; + nvmeq->last_sq_tail = 0; nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); nvme_dbbuf_init(dev, nvmeq, qid); dev->online_queues++; - spin_unlock_irq(&nvmeq->cq_lock); + wmb(); /* ensure the first interrupt sees the initialization */ } -static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) { struct nvme_dev *dev = nvmeq->dev; int result; s16 vector; + clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); + /* * A queue's vector matches the queue identifier unless the controller * has only one vector available. */ - vector = dev->num_vecs == 1 ? 0 : qid; + if (!polled) + vector = dev->num_vecs == 1 ? 0 : qid; + else + vector = -1; + result = adapter_alloc_cq(dev, qid, nvmeq, vector); if (result) return result; @@ -1443,17 +1570,16 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) else if (result) goto release_cq; - /* - * Set cq_vector after alloc cq/sq, otherwise nvme_suspend_queue will - * invoke free_irq for it and cause a 'Trying to free already-free IRQ - * xxx' warning if the create CQ/SQ command times out. - */ nvmeq->cq_vector = vector; nvme_init_queue(nvmeq, qid); - result = queue_request_irq(nvmeq); - if (result < 0) - goto release_sq; + if (vector != -1) { + result = queue_request_irq(nvmeq); + if (result < 0) + goto release_sq; + } + + set_bit(NVMEQ_ENABLED, &nvmeq->flags); return result; release_sq: @@ -1477,6 +1603,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = { static const struct blk_mq_ops nvme_mq_ops = { .queue_rq = nvme_queue_rq, .complete = nvme_pci_complete_rq, + .commit_rqs = nvme_commit_rqs, .init_hctx = nvme_init_hctx, .init_request = nvme_init_request, .map_queues = nvme_pci_map_queues, @@ -1602,12 +1729,13 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) return result; } + set_bit(NVMEQ_ENABLED, &nvmeq->flags); return result; } static int nvme_create_io_queues(struct nvme_dev *dev) { - unsigned i, max; + unsigned i, max, rw_queues; int ret = 0; for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { @@ -1618,8 +1746,17 @@ static int nvme_create_io_queues(struct nvme_dev *dev) } max = min(dev->max_qid, dev->ctrl.queue_count - 1); + if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) { + rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] + + dev->io_queues[HCTX_TYPE_READ]; + } else { + rw_queues = max; + } + for (i = dev->online_queues; i <= max; i++) { - ret = nvme_create_queue(&dev->queues[i], i); + bool polled = i > rw_queues; + + ret = nvme_create_queue(&dev->queues[i], i, polled); if (ret) break; } @@ -1891,6 +2028,110 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) return ret; } +static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues) +{ + unsigned int this_w_queues = write_queues; + + /* + * Setup read/write queue split + */ + if (irq_queues == 1) { + dev->io_queues[HCTX_TYPE_DEFAULT] = 1; + dev->io_queues[HCTX_TYPE_READ] = 0; + return; + } + + /* + * If 'write_queues' is set, ensure it leaves room for at least + * one read queue + */ + if (this_w_queues >= irq_queues) + this_w_queues = irq_queues - 1; + + /* + * If 'write_queues' is set to zero, reads and writes will share + * a queue set. + */ + if (!this_w_queues) { + dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues; + dev->io_queues[HCTX_TYPE_READ] = 0; + } else { + dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues; + dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues; + } +} + +static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + int irq_sets[2]; + struct irq_affinity affd = { + .pre_vectors = 1, + .nr_sets = ARRAY_SIZE(irq_sets), + .sets = irq_sets, + }; + int result = 0; + unsigned int irq_queues, this_p_queues; + + /* + * Poll queues don't need interrupts, but we need at least one IO + * queue left over for non-polled IO. + */ + this_p_queues = poll_queues; + if (this_p_queues >= nr_io_queues) { + this_p_queues = nr_io_queues - 1; + irq_queues = 1; + } else { + irq_queues = nr_io_queues - this_p_queues; + } + dev->io_queues[HCTX_TYPE_POLL] = this_p_queues; + + /* + * For irq sets, we have to ask for minvec == maxvec. This passes + * any reduction back to us, so we can adjust our queue counts and + * IRQ vector needs. + */ + do { + nvme_calc_io_queues(dev, irq_queues); + irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT]; + irq_sets[1] = dev->io_queues[HCTX_TYPE_READ]; + if (!irq_sets[1]) + affd.nr_sets = 1; + + /* + * If we got a failure and we're down to asking for just + * 1 + 1 queues, just ask for a single vector. We'll share + * that between the single IO queue and the admin queue. + */ + if (result >= 0 && irq_queues > 1) + irq_queues = irq_sets[0] + irq_sets[1] + 1; + + result = pci_alloc_irq_vectors_affinity(pdev, irq_queues, + irq_queues, + PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); + + /* + * Need to reduce our vec counts. If we get ENOSPC, the + * platform should support mulitple vecs, we just need + * to decrease our ask. If we get EINVAL, the platform + * likely does not. Back down to ask for just one vector. + */ + if (result == -ENOSPC) { + irq_queues--; + if (!irq_queues) + return result; + continue; + } else if (result == -EINVAL) { + irq_queues = 1; + continue; + } else if (result <= 0) + return -EIO; + break; + } while (1); + + return result; +} + static int nvme_setup_io_queues(struct nvme_dev *dev) { struct nvme_queue *adminq = &dev->queues[0]; @@ -1898,17 +2139,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) int result, nr_io_queues; unsigned long size; - struct irq_affinity affd = { - .pre_vectors = 1 - }; - - nr_io_queues = num_possible_cpus(); + nr_io_queues = max_io_queues(); result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) return result; if (nr_io_queues == 0) return 0; + + clear_bit(NVMEQ_ENABLED, &adminq->flags); if (dev->cmb_use_sqes) { result = nvme_cmb_qdepth(dev, nr_io_queues, @@ -1937,12 +2176,19 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) * setting up the full range we need. */ pci_free_irq_vectors(pdev); - result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1, - PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); + + result = nvme_setup_irqs(dev, nr_io_queues); if (result <= 0) return -EIO; + dev->num_vecs = result; - dev->max_qid = max(result - 1, 1); + result = max(result - 1, 1); + dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL]; + + dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n", + dev->io_queues[HCTX_TYPE_DEFAULT], + dev->io_queues[HCTX_TYPE_READ], + dev->io_queues[HCTX_TYPE_POLL]); /* * Should investigate if there's a performance win from allocating @@ -1956,6 +2202,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) adminq->cq_vector = -1; return result; } + set_bit(NVMEQ_ENABLED, &adminq->flags); return nvme_create_io_queues(dev); } @@ -1964,23 +2211,15 @@ static void nvme_del_queue_end(struct request *req, blk_status_t error) struct nvme_queue *nvmeq = req->end_io_data; blk_mq_free_request(req); - complete(&nvmeq->dev->ioq_wait); + complete(&nvmeq->delete_done); } static void nvme_del_cq_end(struct request *req, blk_status_t error) { struct nvme_queue *nvmeq = req->end_io_data; - u16 start, end; - if (!error) { - unsigned long flags; - - spin_lock_irqsave(&nvmeq->cq_lock, flags); - nvme_process_cq(nvmeq, &start, &end, -1); - spin_unlock_irqrestore(&nvmeq->cq_lock, flags); - - nvme_complete_cqes(nvmeq, start, end); - } + if (error) + set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); nvme_del_queue_end(req, error); } @@ -2002,37 +2241,44 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) req->timeout = ADMIN_TIMEOUT; req->end_io_data = nvmeq; + init_completion(&nvmeq->delete_done); blk_execute_rq_nowait(q, NULL, req, false, opcode == nvme_admin_delete_cq ? nvme_del_cq_end : nvme_del_queue_end); return 0; } -static void nvme_disable_io_queues(struct nvme_dev *dev) +static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) { - int pass, queues = dev->online_queues - 1; + int nr_queues = dev->online_queues - 1, sent = 0; unsigned long timeout; - u8 opcode = nvme_admin_delete_sq; - for (pass = 0; pass < 2; pass++) { - int sent = 0, i = queues; - - reinit_completion(&dev->ioq_wait); retry: - timeout = ADMIN_TIMEOUT; - for (; i > 0; i--, sent++) - if (nvme_delete_queue(&dev->queues[i], opcode)) - break; - - while (sent--) { - timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout); - if (timeout == 0) - return; - if (i) - goto retry; - } - opcode = nvme_admin_delete_cq; + timeout = ADMIN_TIMEOUT; + while (nr_queues > 0) { + if (nvme_delete_queue(&dev->queues[nr_queues], opcode)) + break; + nr_queues--; + sent++; } + while (sent) { + struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent]; + + timeout = wait_for_completion_io_timeout(&nvmeq->delete_done, + timeout); + if (timeout == 0) + return false; + + /* handle any remaining CQEs */ + if (opcode == nvme_admin_delete_cq && + !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags)) + nvme_poll_irqdisable(nvmeq, -1); + + sent--; + if (nr_queues) + goto retry; + } + return true; } /* @@ -2045,6 +2291,10 @@ static int nvme_dev_add(struct nvme_dev *dev) if (!dev->ctrl.tagset) { dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; + dev->tagset.nr_maps = 2; /* default + read */ + if (dev->io_queues[HCTX_TYPE_POLL]) + dev->tagset.nr_maps++; + dev->tagset.nr_maps = HCTX_MAX_TYPES; dev->tagset.timeout = NVME_IO_TIMEOUT; dev->tagset.numa_node = dev_to_node(dev->dev); dev->tagset.queue_depth = @@ -2187,7 +2437,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_stop_queues(&dev->ctrl); if (!dead && dev->ctrl.queue_count > 0) { - nvme_disable_io_queues(dev); + if (nvme_disable_io_queues(dev, nvme_admin_delete_sq)) + nvme_disable_io_queues(dev, nvme_admin_delete_cq); nvme_disable_admin_queue(dev, shutdown); } for (i = dev->ctrl.queue_count - 1; i >= 0; i--) @@ -2491,8 +2742,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!dev) return -ENOMEM; - dev->queues = kcalloc_node(num_possible_cpus() + 1, - sizeof(struct nvme_queue), GFP_KERNEL, node); + dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue), + GFP_KERNEL, node); if (!dev->queues) goto free; @@ -2506,7 +2757,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); mutex_init(&dev->shutdown_lock); - init_completion(&dev->ioq_wait); result = nvme_setup_prp_pools(dev); if (result) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ab6ec7295bf9..0a2fd2949ad7 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -162,6 +162,13 @@ static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue) return queue - queue->ctrl->queues; } +static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue) +{ + return nvme_rdma_queue_idx(queue) > + queue->ctrl->ctrl.opts->nr_io_queues + + queue->ctrl->ctrl.opts->nr_write_queues; +} + static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue) { return queue->cmnd_capsule_len - sizeof(struct nvme_command); @@ -440,6 +447,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) const int send_wr_factor = 3; /* MR, SEND, INV */ const int cq_factor = send_wr_factor + 1; /* + RECV */ int comp_vector, idx = nvme_rdma_queue_idx(queue); + enum ib_poll_context poll_ctx; int ret; queue->device = nvme_rdma_find_get_device(queue->cm_id); @@ -456,10 +464,16 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) */ comp_vector = idx == 0 ? idx : idx - 1; + /* Polling queues need direct cq polling context */ + if (nvme_rdma_poll_queue(queue)) + poll_ctx = IB_POLL_DIRECT; + else + poll_ctx = IB_POLL_SOFTIRQ; + /* +1 for ib_stop_cq */ queue->ib_cq = ib_alloc_cq(ibdev, queue, cq_factor * queue->queue_size + 1, - comp_vector, IB_POLL_SOFTIRQ); + comp_vector, poll_ctx); if (IS_ERR(queue->ib_cq)) { ret = PTR_ERR(queue->ib_cq); goto out_put_dev; @@ -595,15 +609,17 @@ static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl) static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) { + struct nvme_rdma_queue *queue = &ctrl->queues[idx]; + bool poll = nvme_rdma_poll_queue(queue); int ret; if (idx) - ret = nvmf_connect_io_queue(&ctrl->ctrl, idx); + ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, poll); else ret = nvmf_connect_admin_queue(&ctrl->ctrl); if (!ret) - set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[idx].flags); + set_bit(NVME_RDMA_Q_LIVE, &queue->flags); else dev_info(ctrl->ctrl.device, "failed to connect queue: %d ret=%d\n", idx, ret); @@ -645,6 +661,9 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) nr_io_queues = min_t(unsigned int, nr_io_queues, ibdev->num_comp_vectors); + nr_io_queues += min(opts->nr_write_queues, num_online_cpus()); + nr_io_queues += min(opts->nr_poll_queues, num_online_cpus()); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) return ret; @@ -694,7 +713,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set->ops = &nvme_rdma_admin_mq_ops; set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; set->reserved_tags = 2; /* connect + keep-alive */ - set->numa_node = NUMA_NO_NODE; + set->numa_node = nctrl->numa_node; set->cmd_size = sizeof(struct nvme_rdma_request) + SG_CHUNK_SIZE * sizeof(struct scatterlist); set->driver_data = ctrl; @@ -707,13 +726,14 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set->ops = &nvme_rdma_mq_ops; set->queue_depth = nctrl->sqsize + 1; set->reserved_tags = 1; /* fabric connect */ - set->numa_node = NUMA_NO_NODE; + set->numa_node = nctrl->numa_node; set->flags = BLK_MQ_F_SHOULD_MERGE; set->cmd_size = sizeof(struct nvme_rdma_request) + SG_CHUNK_SIZE * sizeof(struct scatterlist); set->driver_data = ctrl; set->nr_hw_queues = nctrl->queue_count - 1; set->timeout = NVME_IO_TIMEOUT; + set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; } ret = blk_mq_alloc_tag_set(set); @@ -763,6 +783,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, return error; ctrl->device = ctrl->queues[0].device; + ctrl->ctrl.numa_node = dev_to_node(ctrl->device->dev->dma_device); ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev); @@ -1411,12 +1432,11 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) WARN_ON_ONCE(ret); } -static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, - struct nvme_completion *cqe, struct ib_wc *wc, int tag) +static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, + struct nvme_completion *cqe, struct ib_wc *wc) { struct request *rq; struct nvme_rdma_request *req; - int ret = 0; rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id); if (!rq) { @@ -1424,7 +1444,7 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, "tag 0x%x on QP %#x not found\n", cqe->command_id, queue->qp->qp_num); nvme_rdma_error_recovery(queue->ctrl); - return ret; + return; } req = blk_mq_rq_to_pdu(rq); @@ -1439,6 +1459,8 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, nvme_rdma_error_recovery(queue->ctrl); } } else if (req->mr) { + int ret; + ret = nvme_rdma_inv_rkey(queue, req); if (unlikely(ret < 0)) { dev_err(queue->ctrl->ctrl.device, @@ -1447,19 +1469,14 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, nvme_rdma_error_recovery(queue->ctrl); } /* the local invalidation completion will end the request */ - return 0; + return; } - if (refcount_dec_and_test(&req->ref)) { - if (rq->tag == tag) - ret = 1; + if (refcount_dec_and_test(&req->ref)) nvme_end_request(rq, req->status, req->result); - } - - return ret; } -static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) +static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvme_rdma_qe *qe = container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); @@ -1467,11 +1484,10 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) struct ib_device *ibdev = queue->device->dev; struct nvme_completion *cqe = qe->data; const size_t len = sizeof(struct nvme_completion); - int ret = 0; if (unlikely(wc->status != IB_WC_SUCCESS)) { nvme_rdma_wr_error(cq, wc, "RECV"); - return 0; + return; } ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE); @@ -1486,16 +1502,10 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); else - ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag); + nvme_rdma_process_nvme_rsp(queue, cqe, wc); ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE); nvme_rdma_post_recv(queue, qe); - return ret; -} - -static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) -{ - __nvme_rdma_recv_done(cq, wc, -1); } static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue) @@ -1749,25 +1759,11 @@ err: return BLK_STS_IOERR; } -static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) +static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx) { struct nvme_rdma_queue *queue = hctx->driver_data; - struct ib_cq *cq = queue->ib_cq; - struct ib_wc wc; - int found = 0; - while (ib_poll_cq(cq, 1, &wc) > 0) { - struct ib_cqe *cqe = wc.wr_cqe; - - if (cqe) { - if (cqe->done == nvme_rdma_recv_done) - found |= __nvme_rdma_recv_done(cq, &wc, tag); - else - cqe->done(cq, &wc); - } - } - - return found; + return ib_process_cq_direct(queue->ib_cq, -1); } static void nvme_rdma_complete_rq(struct request *rq) @@ -1782,7 +1778,36 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) { struct nvme_rdma_ctrl *ctrl = set->driver_data; - return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0); + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues; + if (ctrl->ctrl.opts->nr_write_queues) { + /* separate read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->ctrl.opts->nr_write_queues; + set->map[HCTX_TYPE_READ].queue_offset = + ctrl->ctrl.opts->nr_write_queues; + } else { + /* mixed read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->ctrl.opts->nr_io_queues; + set->map[HCTX_TYPE_READ].queue_offset = 0; + } + blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT], + ctrl->device->dev, 0); + blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ], + ctrl->device->dev, 0); + + if (ctrl->ctrl.opts->nr_poll_queues) { + set->map[HCTX_TYPE_POLL].nr_queues = + ctrl->ctrl.opts->nr_poll_queues; + set->map[HCTX_TYPE_POLL].queue_offset = + ctrl->ctrl.opts->nr_io_queues; + if (ctrl->ctrl.opts->nr_write_queues) + set->map[HCTX_TYPE_POLL].queue_offset += + ctrl->ctrl.opts->nr_write_queues; + blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); + } + return 0; } static const struct blk_mq_ops nvme_rdma_mq_ops = { @@ -1791,9 +1816,9 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = { .init_request = nvme_rdma_init_request, .exit_request = nvme_rdma_exit_request, .init_hctx = nvme_rdma_init_hctx, - .poll = nvme_rdma_poll, .timeout = nvme_rdma_timeout, .map_queues = nvme_rdma_map_queues, + .poll = nvme_rdma_poll, }; static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { @@ -1938,7 +1963,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); - ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ + ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + + opts->nr_poll_queues + 1; ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; @@ -1989,7 +2015,8 @@ static struct nvmf_transport_ops nvme_rdma_transport = { .module = THIS_MODULE, .required_opts = NVMF_OPT_TRADDR, .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | - NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO, + NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES, .create_ctrl = nvme_rdma_create_ctrl, }; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c new file mode 100644 index 000000000000..de174912445e --- /dev/null +++ b/drivers/nvme/host/tcp.c @@ -0,0 +1,2278 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics TCP host. + * Copyright (c) 2018 Lightbits Labs. All rights reserved. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvme.h" +#include "fabrics.h" + +struct nvme_tcp_queue; + +enum nvme_tcp_send_state { + NVME_TCP_SEND_CMD_PDU = 0, + NVME_TCP_SEND_H2C_PDU, + NVME_TCP_SEND_DATA, + NVME_TCP_SEND_DDGST, +}; + +struct nvme_tcp_request { + struct nvme_request req; + void *pdu; + struct nvme_tcp_queue *queue; + u32 data_len; + u32 pdu_len; + u32 pdu_sent; + u16 ttag; + struct list_head entry; + __le32 ddgst; + + struct bio *curr_bio; + struct iov_iter iter; + + /* send state */ + size_t offset; + size_t data_sent; + enum nvme_tcp_send_state state; +}; + +enum nvme_tcp_queue_flags { + NVME_TCP_Q_ALLOCATED = 0, + NVME_TCP_Q_LIVE = 1, +}; + +enum nvme_tcp_recv_state { + NVME_TCP_RECV_PDU = 0, + NVME_TCP_RECV_DATA, + NVME_TCP_RECV_DDGST, +}; + +struct nvme_tcp_ctrl; +struct nvme_tcp_queue { + struct socket *sock; + struct work_struct io_work; + int io_cpu; + + spinlock_t lock; + struct list_head send_list; + + /* recv state */ + void *pdu; + int pdu_remaining; + int pdu_offset; + size_t data_remaining; + size_t ddgst_remaining; + + /* send state */ + struct nvme_tcp_request *request; + + int queue_size; + size_t cmnd_capsule_len; + struct nvme_tcp_ctrl *ctrl; + unsigned long flags; + bool rd_enabled; + + bool hdr_digest; + bool data_digest; + struct ahash_request *rcv_hash; + struct ahash_request *snd_hash; + __le32 exp_ddgst; + __le32 recv_ddgst; + + struct page_frag_cache pf_cache; + + void (*state_change)(struct sock *); + void (*data_ready)(struct sock *); + void (*write_space)(struct sock *); +}; + +struct nvme_tcp_ctrl { + /* read only in the hot path */ + struct nvme_tcp_queue *queues; + struct blk_mq_tag_set tag_set; + + /* other member variables */ + struct list_head list; + struct blk_mq_tag_set admin_tag_set; + struct sockaddr_storage addr; + struct sockaddr_storage src_addr; + struct nvme_ctrl ctrl; + + struct work_struct err_work; + struct delayed_work connect_work; + struct nvme_tcp_request async_req; +}; + +static LIST_HEAD(nvme_tcp_ctrl_list); +static DEFINE_MUTEX(nvme_tcp_ctrl_mutex); +static struct workqueue_struct *nvme_tcp_wq; +static struct blk_mq_ops nvme_tcp_mq_ops; +static struct blk_mq_ops nvme_tcp_admin_mq_ops; + +static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_tcp_ctrl, ctrl); +} + +static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue) +{ + return queue - queue->ctrl->queues; +} + +static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue) +{ + u32 queue_idx = nvme_tcp_queue_id(queue); + + if (queue_idx == 0) + return queue->ctrl->admin_tag_set.tags[queue_idx]; + return queue->ctrl->tag_set.tags[queue_idx - 1]; +} + +static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue) +{ + return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue) +{ + return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue) +{ + return queue->cmnd_capsule_len - sizeof(struct nvme_command); +} + +static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req) +{ + return req == &req->queue->ctrl->async_req; +} + +static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req) +{ + struct request *rq; + unsigned int bytes; + + if (unlikely(nvme_tcp_async_req(req))) + return false; /* async events don't have a request */ + + rq = blk_mq_rq_from_pdu(req); + bytes = blk_rq_payload_bytes(rq); + + return rq_data_dir(rq) == WRITE && bytes && + bytes <= nvme_tcp_inline_data_size(req->queue); +} + +static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req) +{ + return req->iter.bvec->bv_page; +} + +static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req) +{ + return req->iter.bvec->bv_offset + req->iter.iov_offset; +} + +static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req) +{ + return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset, + req->pdu_len - req->pdu_sent); +} + +static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req) +{ + return req->iter.iov_offset; +} + +static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req) +{ + return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ? + req->pdu_len - req->pdu_sent : 0; +} + +static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req, + int len) +{ + return nvme_tcp_pdu_data_left(req) <= len; +} + +static void nvme_tcp_init_iter(struct nvme_tcp_request *req, + unsigned int dir) +{ + struct request *rq = blk_mq_rq_from_pdu(req); + struct bio_vec *vec; + unsigned int size; + int nsegs; + size_t offset; + + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { + vec = &rq->special_vec; + nsegs = 1; + size = blk_rq_payload_bytes(rq); + offset = 0; + } else { + struct bio *bio = req->curr_bio; + + vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); + nsegs = bio_segments(bio); + size = bio->bi_iter.bi_size; + offset = bio->bi_iter.bi_bvec_done; + } + + iov_iter_bvec(&req->iter, dir, vec, nsegs, size); + req->iter.iov_offset = offset; +} + +static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, + int len) +{ + req->data_sent += len; + req->pdu_sent += len; + iov_iter_advance(&req->iter, len); + if (!iov_iter_count(&req->iter) && + req->data_sent < req->data_len) { + req->curr_bio = req->curr_bio->bi_next; + nvme_tcp_init_iter(req, WRITE); + } +} + +static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + + spin_lock(&queue->lock); + list_add_tail(&req->entry, &queue->send_list); + spin_unlock(&queue->lock); + + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + +static inline struct nvme_tcp_request * +nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_request *req; + + spin_lock(&queue->lock); + req = list_first_entry_or_null(&queue->send_list, + struct nvme_tcp_request, entry); + if (req) + list_del(&req->entry); + spin_unlock(&queue->lock); + + return req; +} + +static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, + __le32 *dgst) +{ + ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0); + crypto_ahash_final(hash); +} + +static inline void nvme_tcp_ddgst_update(struct ahash_request *hash, + struct page *page, off_t off, size_t len) +{ + struct scatterlist sg; + + sg_init_marker(&sg, 1); + sg_set_page(&sg, page, len, off); + ahash_request_set_crypt(hash, &sg, NULL, len); + crypto_ahash_update(hash); +} + +static inline void nvme_tcp_hdgst(struct ahash_request *hash, + void *pdu, size_t len) +{ + struct scatterlist sg; + + sg_init_one(&sg, pdu, len); + ahash_request_set_crypt(hash, &sg, pdu + len, len); + crypto_ahash_digest(hash); +} + +static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, + void *pdu, size_t pdu_len) +{ + struct nvme_tcp_hdr *hdr = pdu; + __le32 recv_digest; + __le32 exp_digest; + + if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) { + dev_err(queue->ctrl->ctrl.device, + "queue %d: header digest flag is cleared\n", + nvme_tcp_queue_id(queue)); + return -EPROTO; + } + + recv_digest = *(__le32 *)(pdu + hdr->hlen); + nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len); + exp_digest = *(__le32 *)(pdu + hdr->hlen); + if (recv_digest != exp_digest) { + dev_err(queue->ctrl->ctrl.device, + "header digest error: recv %#x expected %#x\n", + le32_to_cpu(recv_digest), le32_to_cpu(exp_digest)); + return -EIO; + } + + return 0; +} + +static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu) +{ + struct nvme_tcp_hdr *hdr = pdu; + u8 digest_len = nvme_tcp_hdgst_len(queue); + u32 len; + + len = le32_to_cpu(hdr->plen) - hdr->hlen - + ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0); + + if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) { + dev_err(queue->ctrl->ctrl.device, + "queue %d: data digest flag is cleared\n", + nvme_tcp_queue_id(queue)); + return -EPROTO; + } + crypto_ahash_init(queue->rcv_hash); + + return 0; +} + +static void nvme_tcp_exit_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + + page_frag_free(req->pdu); +} + +static int nvme_tcp_init_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx, + unsigned int numa_node) +{ + struct nvme_tcp_ctrl *ctrl = set->driver_data; + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; + struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx]; + u8 hdgst = nvme_tcp_hdgst_len(queue); + + req->pdu = page_frag_alloc(&queue->pf_cache, + sizeof(struct nvme_tcp_cmd_pdu) + hdgst, + GFP_KERNEL | __GFP_ZERO); + if (!req->pdu) + return -ENOMEM; + + req->queue = queue; + nvme_req(rq)->ctrl = &ctrl->ctrl; + + return 0; +} + +static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_tcp_ctrl *ctrl = data; + struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1]; + + hctx->driver_data = queue; + return 0; +} + +static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_tcp_ctrl *ctrl = data; + struct nvme_tcp_queue *queue = &ctrl->queues[0]; + + hctx->driver_data = queue; + return 0; +} + +static enum nvme_tcp_recv_state +nvme_tcp_recv_state(struct nvme_tcp_queue *queue) +{ + return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU : + (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST : + NVME_TCP_RECV_DATA; +} + +static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue) +{ + queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) + + nvme_tcp_hdgst_len(queue); + queue->pdu_offset = 0; + queue->data_remaining = -1; + queue->ddgst_remaining = 0; +} + +static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) + return; + + queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work); +} + +static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, + struct nvme_completion *cqe) +{ + struct request *rq; + + rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag 0x%x not found\n", + nvme_tcp_queue_id(queue), cqe->command_id); + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + return -EINVAL; + } + + nvme_end_request(rq, cqe->status, cqe->result); + + return 0; +} + +static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue, + struct nvme_tcp_data_pdu *pdu) +{ + struct request *rq; + + rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x not found\n", + nvme_tcp_queue_id(queue), pdu->command_id); + return -ENOENT; + } + + if (!blk_rq_payload_bytes(rq)) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x unexpected data\n", + nvme_tcp_queue_id(queue), rq->tag); + return -EIO; + } + + queue->data_remaining = le32_to_cpu(pdu->data_length); + + return 0; + +} + +static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, + struct nvme_tcp_rsp_pdu *pdu) +{ + struct nvme_completion *cqe = &pdu->cqe; + int ret = 0; + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvme_tcp_queue_id(queue) == 0 && + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) + nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, + &cqe->result); + else + ret = nvme_tcp_process_nvme_cqe(queue, cqe); + + return ret; +} + +static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req, + struct nvme_tcp_r2t_pdu *pdu) +{ + struct nvme_tcp_data_pdu *data = req->pdu; + struct nvme_tcp_queue *queue = req->queue; + struct request *rq = blk_mq_rq_from_pdu(req); + u8 hdgst = nvme_tcp_hdgst_len(queue); + u8 ddgst = nvme_tcp_ddgst_len(queue); + + req->pdu_len = le32_to_cpu(pdu->r2t_length); + req->pdu_sent = 0; + + if (unlikely(req->data_sent + req->pdu_len > req->data_len)) { + dev_err(queue->ctrl->ctrl.device, + "req %d r2t len %u exceeded data len %u (%zu sent)\n", + rq->tag, req->pdu_len, req->data_len, + req->data_sent); + return -EPROTO; + } + + if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) { + dev_err(queue->ctrl->ctrl.device, + "req %d unexpected r2t offset %u (expected %zu)\n", + rq->tag, le32_to_cpu(pdu->r2t_offset), + req->data_sent); + return -EPROTO; + } + + memset(data, 0, sizeof(*data)); + data->hdr.type = nvme_tcp_h2c_data; + data->hdr.flags = NVME_TCP_F_DATA_LAST; + if (queue->hdr_digest) + data->hdr.flags |= NVME_TCP_F_HDGST; + if (queue->data_digest) + data->hdr.flags |= NVME_TCP_F_DDGST; + data->hdr.hlen = sizeof(*data); + data->hdr.pdo = data->hdr.hlen + hdgst; + data->hdr.plen = + cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst); + data->ttag = pdu->ttag; + data->command_id = rq->tag; + data->data_offset = cpu_to_le32(req->data_sent); + data->data_length = cpu_to_le32(req->pdu_len); + return 0; +} + +static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, + struct nvme_tcp_r2t_pdu *pdu) +{ + struct nvme_tcp_request *req; + struct request *rq; + int ret; + + rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x not found\n", + nvme_tcp_queue_id(queue), pdu->command_id); + return -ENOENT; + } + req = blk_mq_rq_to_pdu(rq); + + ret = nvme_tcp_setup_h2c_data_pdu(req, pdu); + if (unlikely(ret)) + return ret; + + req->state = NVME_TCP_SEND_H2C_PDU; + req->offset = 0; + + nvme_tcp_queue_request(req); + + return 0; +} + +static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, + unsigned int *offset, size_t *len) +{ + struct nvme_tcp_hdr *hdr; + char *pdu = queue->pdu; + size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining); + int ret; + + ret = skb_copy_bits(skb, *offset, + &pdu[queue->pdu_offset], rcv_len); + if (unlikely(ret)) + return ret; + + queue->pdu_remaining -= rcv_len; + queue->pdu_offset += rcv_len; + *offset += rcv_len; + *len -= rcv_len; + if (queue->pdu_remaining) + return 0; + + hdr = queue->pdu; + if (queue->hdr_digest) { + ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen); + if (unlikely(ret)) + return ret; + } + + + if (queue->data_digest) { + ret = nvme_tcp_check_ddgst(queue, queue->pdu); + if (unlikely(ret)) + return ret; + } + + switch (hdr->type) { + case nvme_tcp_c2h_data: + ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); + break; + case nvme_tcp_rsp: + nvme_tcp_init_recv_ctx(queue); + ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu); + break; + case nvme_tcp_r2t: + nvme_tcp_init_recv_ctx(queue); + ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu); + break; + default: + dev_err(queue->ctrl->ctrl.device, + "unsupported pdu type (%d)\n", hdr->type); + return -EINVAL; + } + + return ret; +} + +static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, + unsigned int *offset, size_t *len) +{ + struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; + struct nvme_tcp_request *req; + struct request *rq; + + rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x not found\n", + nvme_tcp_queue_id(queue), pdu->command_id); + return -ENOENT; + } + req = blk_mq_rq_to_pdu(rq); + + while (true) { + int recv_len, ret; + + recv_len = min_t(size_t, *len, queue->data_remaining); + if (!recv_len) + break; + + if (!iov_iter_count(&req->iter)) { + req->curr_bio = req->curr_bio->bi_next; + + /* + * If we don`t have any bios it means that controller + * sent more data than we requested, hence error + */ + if (!req->curr_bio) { + dev_err(queue->ctrl->ctrl.device, + "queue %d no space in request %#x", + nvme_tcp_queue_id(queue), rq->tag); + nvme_tcp_init_recv_ctx(queue); + return -EIO; + } + nvme_tcp_init_iter(req, READ); + } + + /* we can read only from what is left in this bio */ + recv_len = min_t(size_t, recv_len, + iov_iter_count(&req->iter)); + + if (queue->data_digest) + ret = skb_copy_and_hash_datagram_iter(skb, *offset, + &req->iter, recv_len, queue->rcv_hash); + else + ret = skb_copy_datagram_iter(skb, *offset, + &req->iter, recv_len); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "queue %d failed to copy request %#x data", + nvme_tcp_queue_id(queue), rq->tag); + return ret; + } + + *len -= recv_len; + *offset += recv_len; + queue->data_remaining -= recv_len; + } + + if (!queue->data_remaining) { + if (queue->data_digest) { + nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); + queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; + } else { + nvme_tcp_init_recv_ctx(queue); + } + } + + return 0; +} + +static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, + struct sk_buff *skb, unsigned int *offset, size_t *len) +{ + char *ddgst = (char *)&queue->recv_ddgst; + size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining); + off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining; + int ret; + + ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len); + if (unlikely(ret)) + return ret; + + queue->ddgst_remaining -= recv_len; + *offset += recv_len; + *len -= recv_len; + if (queue->ddgst_remaining) + return 0; + + if (queue->recv_ddgst != queue->exp_ddgst) { + dev_err(queue->ctrl->ctrl.device, + "data digest error: recv %#x expected %#x\n", + le32_to_cpu(queue->recv_ddgst), + le32_to_cpu(queue->exp_ddgst)); + return -EIO; + } + + nvme_tcp_init_recv_ctx(queue); + return 0; +} + +static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct nvme_tcp_queue *queue = desc->arg.data; + size_t consumed = len; + int result; + + while (len) { + switch (nvme_tcp_recv_state(queue)) { + case NVME_TCP_RECV_PDU: + result = nvme_tcp_recv_pdu(queue, skb, &offset, &len); + break; + case NVME_TCP_RECV_DATA: + result = nvme_tcp_recv_data(queue, skb, &offset, &len); + break; + case NVME_TCP_RECV_DDGST: + result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len); + break; + default: + result = -EFAULT; + } + if (result) { + dev_err(queue->ctrl->ctrl.device, + "receive failed: %d\n", result); + queue->rd_enabled = false; + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + return result; + } + } + + return consumed; +} + +static void nvme_tcp_data_ready(struct sock *sk) +{ + struct nvme_tcp_queue *queue; + + read_lock(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (likely(queue && queue->rd_enabled)) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); + read_unlock(&sk->sk_callback_lock); +} + +static void nvme_tcp_write_space(struct sock *sk) +{ + struct nvme_tcp_queue *queue; + + read_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (likely(queue && sk_stream_is_writeable(sk))) { + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); + } + read_unlock_bh(&sk->sk_callback_lock); +} + +static void nvme_tcp_state_change(struct sock *sk) +{ + struct nvme_tcp_queue *queue; + + read_lock(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (!queue) + goto done; + + switch (sk->sk_state) { + case TCP_CLOSE: + case TCP_CLOSE_WAIT: + case TCP_LAST_ACK: + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + /* fallthrough */ + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + break; + default: + dev_info(queue->ctrl->ctrl.device, + "queue %d socket state %d\n", + nvme_tcp_queue_id(queue), sk->sk_state); + } + + queue->state_change(sk); +done: + read_unlock(&sk->sk_callback_lock); +} + +static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) +{ + queue->request = NULL; +} + +static void nvme_tcp_fail_request(struct nvme_tcp_request *req) +{ + union nvme_result res = {}; + + nvme_end_request(blk_mq_rq_from_pdu(req), + cpu_to_le16(NVME_SC_DATA_XFER_ERROR), res); +} + +static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + + while (true) { + struct page *page = nvme_tcp_req_cur_page(req); + size_t offset = nvme_tcp_req_cur_offset(req); + size_t len = nvme_tcp_req_cur_length(req); + bool last = nvme_tcp_pdu_last_send(req, len); + int ret, flags = MSG_DONTWAIT; + + if (last && !queue->data_digest) + flags |= MSG_EOR; + else + flags |= MSG_MORE; + + ret = kernel_sendpage(queue->sock, page, offset, len, flags); + if (ret <= 0) + return ret; + + nvme_tcp_advance_req(req, ret); + if (queue->data_digest) + nvme_tcp_ddgst_update(queue->snd_hash, page, + offset, ret); + + /* fully successful last write*/ + if (last && ret == len) { + if (queue->data_digest) { + nvme_tcp_ddgst_final(queue->snd_hash, + &req->ddgst); + req->state = NVME_TCP_SEND_DDGST; + req->offset = 0; + } else { + nvme_tcp_done_send_req(queue); + } + return 1; + } + } + return -EAGAIN; +} + +static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + struct nvme_tcp_cmd_pdu *pdu = req->pdu; + bool inline_data = nvme_tcp_has_inline_data(req); + int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR); + u8 hdgst = nvme_tcp_hdgst_len(queue); + int len = sizeof(*pdu) + hdgst - req->offset; + int ret; + + if (queue->hdr_digest && !req->offset) + nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + + ret = kernel_sendpage(queue->sock, virt_to_page(pdu), + offset_in_page(pdu) + req->offset, len, flags); + if (unlikely(ret <= 0)) + return ret; + + len -= ret; + if (!len) { + if (inline_data) { + req->state = NVME_TCP_SEND_DATA; + if (queue->data_digest) + crypto_ahash_init(queue->snd_hash); + nvme_tcp_init_iter(req, WRITE); + } else { + nvme_tcp_done_send_req(queue); + } + return 1; + } + req->offset += ret; + + return -EAGAIN; +} + +static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + struct nvme_tcp_data_pdu *pdu = req->pdu; + u8 hdgst = nvme_tcp_hdgst_len(queue); + int len = sizeof(*pdu) - req->offset + hdgst; + int ret; + + if (queue->hdr_digest && !req->offset) + nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + + ret = kernel_sendpage(queue->sock, virt_to_page(pdu), + offset_in_page(pdu) + req->offset, len, + MSG_DONTWAIT | MSG_MORE); + if (unlikely(ret <= 0)) + return ret; + + len -= ret; + if (!len) { + req->state = NVME_TCP_SEND_DATA; + if (queue->data_digest) + crypto_ahash_init(queue->snd_hash); + if (!req->data_sent) + nvme_tcp_init_iter(req, WRITE); + return 1; + } + req->offset += ret; + + return -EAGAIN; +} + +static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + int ret; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; + struct kvec iov = { + .iov_base = &req->ddgst + req->offset, + .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset + }; + + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (unlikely(ret <= 0)) + return ret; + + if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) { + nvme_tcp_done_send_req(queue); + return 1; + } + + req->offset += ret; + return -EAGAIN; +} + +static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_request *req; + int ret = 1; + + if (!queue->request) { + queue->request = nvme_tcp_fetch_request(queue); + if (!queue->request) + return 0; + } + req = queue->request; + + if (req->state == NVME_TCP_SEND_CMD_PDU) { + ret = nvme_tcp_try_send_cmd_pdu(req); + if (ret <= 0) + goto done; + if (!nvme_tcp_has_inline_data(req)) + return ret; + } + + if (req->state == NVME_TCP_SEND_H2C_PDU) { + ret = nvme_tcp_try_send_data_pdu(req); + if (ret <= 0) + goto done; + } + + if (req->state == NVME_TCP_SEND_DATA) { + ret = nvme_tcp_try_send_data(req); + if (ret <= 0) + goto done; + } + + if (req->state == NVME_TCP_SEND_DDGST) + ret = nvme_tcp_try_send_ddgst(req); +done: + if (ret == -EAGAIN) + ret = 0; + return ret; +} + +static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue) +{ + struct sock *sk = queue->sock->sk; + read_descriptor_t rd_desc; + int consumed; + + rd_desc.arg.data = queue; + rd_desc.count = 1; + lock_sock(sk); + consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb); + release_sock(sk); + return consumed; +} + +static void nvme_tcp_io_work(struct work_struct *w) +{ + struct nvme_tcp_queue *queue = + container_of(w, struct nvme_tcp_queue, io_work); + unsigned long start = jiffies + msecs_to_jiffies(1); + + do { + bool pending = false; + int result; + + result = nvme_tcp_try_send(queue); + if (result > 0) { + pending = true; + } else if (unlikely(result < 0)) { + dev_err(queue->ctrl->ctrl.device, + "failed to send request %d\n", result); + if (result != -EPIPE) + nvme_tcp_fail_request(queue->request); + nvme_tcp_done_send_req(queue); + return; + } + + result = nvme_tcp_try_recv(queue); + if (result > 0) + pending = true; + + if (!pending) + return; + + } while (time_after(jiffies, start)); /* quota is exhausted */ + + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + +static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); + + ahash_request_free(queue->rcv_hash); + ahash_request_free(queue->snd_hash); + crypto_free_ahash(tfm); +} + +static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue) +{ + struct crypto_ahash *tfm; + + tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->snd_hash) + goto free_tfm; + ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); + + queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->rcv_hash) + goto free_snd_hash; + ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); + + return 0; +free_snd_hash: + ahash_request_free(queue->snd_hash); +free_tfm: + crypto_free_ahash(tfm); + return -ENOMEM; +} + +static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl) +{ + struct nvme_tcp_request *async = &ctrl->async_req; + + page_frag_free(async->pdu); +} + +static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl) +{ + struct nvme_tcp_queue *queue = &ctrl->queues[0]; + struct nvme_tcp_request *async = &ctrl->async_req; + u8 hdgst = nvme_tcp_hdgst_len(queue); + + async->pdu = page_frag_alloc(&queue->pf_cache, + sizeof(struct nvme_tcp_cmd_pdu) + hdgst, + GFP_KERNEL | __GFP_ZERO); + if (!async->pdu) + return -ENOMEM; + + async->queue = &ctrl->queues[0]; + return 0; +} + +static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + + if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) + return; + + if (queue->hdr_digest || queue->data_digest) + nvme_tcp_free_crypto(queue); + + sock_release(queue->sock); + kfree(queue->pdu); +} + +static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_icreq_pdu *icreq; + struct nvme_tcp_icresp_pdu *icresp; + struct msghdr msg = {}; + struct kvec iov; + bool ctrl_hdgst, ctrl_ddgst; + int ret; + + icreq = kzalloc(sizeof(*icreq), GFP_KERNEL); + if (!icreq) + return -ENOMEM; + + icresp = kzalloc(sizeof(*icresp), GFP_KERNEL); + if (!icresp) { + ret = -ENOMEM; + goto free_icreq; + } + + icreq->hdr.type = nvme_tcp_icreq; + icreq->hdr.hlen = sizeof(*icreq); + icreq->hdr.pdo = 0; + icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen); + icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); + icreq->maxr2t = 0; /* single inflight r2t supported */ + icreq->hpda = 0; /* no alignment constraint */ + if (queue->hdr_digest) + icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE; + if (queue->data_digest) + icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE; + + iov.iov_base = icreq; + iov.iov_len = sizeof(*icreq); + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (ret < 0) + goto free_icresp; + + memset(&msg, 0, sizeof(msg)); + iov.iov_base = icresp; + iov.iov_len = sizeof(*icresp); + ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, + iov.iov_len, msg.msg_flags); + if (ret < 0) + goto free_icresp; + + ret = -EINVAL; + if (icresp->hdr.type != nvme_tcp_icresp) { + pr_err("queue %d: bad type returned %d\n", + nvme_tcp_queue_id(queue), icresp->hdr.type); + goto free_icresp; + } + + if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) { + pr_err("queue %d: bad pdu length returned %d\n", + nvme_tcp_queue_id(queue), icresp->hdr.plen); + goto free_icresp; + } + + if (icresp->pfv != NVME_TCP_PFV_1_0) { + pr_err("queue %d: bad pfv returned %d\n", + nvme_tcp_queue_id(queue), icresp->pfv); + goto free_icresp; + } + + ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE); + if ((queue->data_digest && !ctrl_ddgst) || + (!queue->data_digest && ctrl_ddgst)) { + pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n", + nvme_tcp_queue_id(queue), + queue->data_digest ? "enabled" : "disabled", + ctrl_ddgst ? "enabled" : "disabled"); + goto free_icresp; + } + + ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE); + if ((queue->hdr_digest && !ctrl_hdgst) || + (!queue->hdr_digest && ctrl_hdgst)) { + pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n", + nvme_tcp_queue_id(queue), + queue->hdr_digest ? "enabled" : "disabled", + ctrl_hdgst ? "enabled" : "disabled"); + goto free_icresp; + } + + if (icresp->cpda != 0) { + pr_err("queue %d: unsupported cpda returned %d\n", + nvme_tcp_queue_id(queue), icresp->cpda); + goto free_icresp; + } + + ret = 0; +free_icresp: + kfree(icresp); +free_icreq: + kfree(icreq); + return ret; +} + +static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, + int qid, size_t queue_size) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + struct linger sol = { .l_onoff = 1, .l_linger = 0 }; + int ret, opt, rcv_pdu_size, n; + + queue->ctrl = ctrl; + INIT_LIST_HEAD(&queue->send_list); + spin_lock_init(&queue->lock); + INIT_WORK(&queue->io_work, nvme_tcp_io_work); + queue->queue_size = queue_size; + + if (qid > 0) + queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; + else + queue->cmnd_capsule_len = sizeof(struct nvme_command) + + NVME_TCP_ADMIN_CCSZ; + + ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &queue->sock); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to create socket: %d\n", ret); + return ret; + } + + /* Single syn retry */ + opt = 1; + ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT, + (char *)&opt, sizeof(opt)); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to set TCP_SYNCNT sock opt %d\n", ret); + goto err_sock; + } + + /* Set TCP no delay */ + opt = 1; + ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, + TCP_NODELAY, (char *)&opt, sizeof(opt)); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to set TCP_NODELAY sock opt %d\n", ret); + goto err_sock; + } + + /* + * Cleanup whatever is sitting in the TCP transmit queue on socket + * close. This is done to prevent stale data from being sent should + * the network connection be restored before TCP times out. + */ + ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER, + (char *)&sol, sizeof(sol)); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to set SO_LINGER sock opt %d\n", ret); + goto err_sock; + } + + queue->sock->sk->sk_allocation = GFP_ATOMIC; + if (!qid) + n = 0; + else + n = (qid - 1) % num_online_cpus(); + queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); + queue->request = NULL; + queue->data_remaining = 0; + queue->ddgst_remaining = 0; + queue->pdu_remaining = 0; + queue->pdu_offset = 0; + sk_set_memalloc(queue->sock->sk); + + if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) { + ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr, + sizeof(ctrl->src_addr)); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to bind queue %d socket %d\n", + qid, ret); + goto err_sock; + } + } + + queue->hdr_digest = nctrl->opts->hdr_digest; + queue->data_digest = nctrl->opts->data_digest; + if (queue->hdr_digest || queue->data_digest) { + ret = nvme_tcp_alloc_crypto(queue); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to allocate queue %d crypto\n", qid); + goto err_sock; + } + } + + rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) + + nvme_tcp_hdgst_len(queue); + queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL); + if (!queue->pdu) { + ret = -ENOMEM; + goto err_crypto; + } + + dev_dbg(ctrl->ctrl.device, "connecting queue %d\n", + nvme_tcp_queue_id(queue)); + + ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr, + sizeof(ctrl->addr), 0); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to connect socket: %d\n", ret); + goto err_rcv_pdu; + } + + ret = nvme_tcp_init_connection(queue); + if (ret) + goto err_init_connect; + + queue->rd_enabled = true; + set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags); + nvme_tcp_init_recv_ctx(queue); + + write_lock_bh(&queue->sock->sk->sk_callback_lock); + queue->sock->sk->sk_user_data = queue; + queue->state_change = queue->sock->sk->sk_state_change; + queue->data_ready = queue->sock->sk->sk_data_ready; + queue->write_space = queue->sock->sk->sk_write_space; + queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; + queue->sock->sk->sk_state_change = nvme_tcp_state_change; + queue->sock->sk->sk_write_space = nvme_tcp_write_space; + write_unlock_bh(&queue->sock->sk->sk_callback_lock); + + return 0; + +err_init_connect: + kernel_sock_shutdown(queue->sock, SHUT_RDWR); +err_rcv_pdu: + kfree(queue->pdu); +err_crypto: + if (queue->hdr_digest || queue->data_digest) + nvme_tcp_free_crypto(queue); +err_sock: + sock_release(queue->sock); + queue->sock = NULL; + return ret; +} + +static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue) +{ + struct socket *sock = queue->sock; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = NULL; + sock->sk->sk_data_ready = queue->data_ready; + sock->sk->sk_state_change = queue->state_change; + sock->sk->sk_write_space = queue->write_space; + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue) +{ + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + nvme_tcp_restore_sock_calls(queue); + cancel_work_sync(&queue->io_work); +} + +static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + + if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) + return; + + __nvme_tcp_stop_queue(queue); +} + +static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + int ret; + + if (idx) + ret = nvmf_connect_io_queue(nctrl, idx, false); + else + ret = nvmf_connect_admin_queue(nctrl); + + if (!ret) { + set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags); + } else { + __nvme_tcp_stop_queue(&ctrl->queues[idx]); + dev_err(nctrl->device, + "failed to connect queue: %d ret=%d\n", idx, ret); + } + return ret; +} + +static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, + bool admin) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct blk_mq_tag_set *set; + int ret; + + if (admin) { + set = &ctrl->admin_tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_tcp_admin_mq_ops; + set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; + set->reserved_tags = 2; /* connect + keep-alive */ + set->numa_node = NUMA_NO_NODE; + set->cmd_size = sizeof(struct nvme_tcp_request); + set->driver_data = ctrl; + set->nr_hw_queues = 1; + set->timeout = ADMIN_TIMEOUT; + } else { + set = &ctrl->tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_tcp_mq_ops; + set->queue_depth = nctrl->sqsize + 1; + set->reserved_tags = 1; /* fabric connect */ + set->numa_node = NUMA_NO_NODE; + set->flags = BLK_MQ_F_SHOULD_MERGE; + set->cmd_size = sizeof(struct nvme_tcp_request); + set->driver_data = ctrl; + set->nr_hw_queues = nctrl->queue_count - 1; + set->timeout = NVME_IO_TIMEOUT; + set->nr_maps = 2 /* default + read */; + } + + ret = blk_mq_alloc_tag_set(set); + if (ret) + return ERR_PTR(ret); + + return set; +} + +static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl) +{ + if (to_tcp_ctrl(ctrl)->async_req.pdu) { + nvme_tcp_free_async_req(to_tcp_ctrl(ctrl)); + to_tcp_ctrl(ctrl)->async_req.pdu = NULL; + } + + nvme_tcp_free_queue(ctrl, 0); +} + +static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->queue_count; i++) + nvme_tcp_free_queue(ctrl, i); +} + +static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->queue_count; i++) + nvme_tcp_stop_queue(ctrl, i); +} + +static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl) +{ + int i, ret = 0; + + for (i = 1; i < ctrl->queue_count; i++) { + ret = nvme_tcp_start_queue(ctrl, i); + if (ret) + goto out_stop_queues; + } + + return 0; + +out_stop_queues: + for (i--; i >= 1; i--) + nvme_tcp_stop_queue(ctrl, i); + return ret; +} + +static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) +{ + int ret; + + ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); + if (ret) + return ret; + + ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl)); + if (ret) + goto out_free_queue; + + return 0; + +out_free_queue: + nvme_tcp_free_queue(ctrl, 0); + return ret; +} + +static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) +{ + int i, ret; + + for (i = 1; i < ctrl->queue_count; i++) { + ret = nvme_tcp_alloc_queue(ctrl, i, + ctrl->sqsize + 1); + if (ret) + goto out_free_queues; + } + + return 0; + +out_free_queues: + for (i--; i >= 1; i--) + nvme_tcp_free_queue(ctrl, i); + + return ret; +} + +static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl) +{ + unsigned int nr_io_queues; + + nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus()); + nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus()); + + return nr_io_queues; +} + +static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl) +{ + unsigned int nr_io_queues; + int ret; + + nr_io_queues = nvme_tcp_nr_io_queues(ctrl); + ret = nvme_set_queue_count(ctrl, &nr_io_queues); + if (ret) + return ret; + + ctrl->queue_count = nr_io_queues + 1; + if (ctrl->queue_count < 2) + return 0; + + dev_info(ctrl->device, + "creating %d I/O queues.\n", nr_io_queues); + + return nvme_tcp_alloc_io_queues(ctrl); +} + +static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) +{ + nvme_tcp_stop_io_queues(ctrl); + if (remove) { + if (ctrl->ops->flags & NVME_F_FABRICS) + blk_cleanup_queue(ctrl->connect_q); + blk_mq_free_tag_set(ctrl->tagset); + } + nvme_tcp_free_io_queues(ctrl); +} + +static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) +{ + int ret; + + ret = nvme_alloc_io_queues(ctrl); + if (ret) + return ret; + + if (new) { + ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false); + if (IS_ERR(ctrl->tagset)) { + ret = PTR_ERR(ctrl->tagset); + goto out_free_io_queues; + } + + if (ctrl->ops->flags & NVME_F_FABRICS) { + ctrl->connect_q = blk_mq_init_queue(ctrl->tagset); + if (IS_ERR(ctrl->connect_q)) { + ret = PTR_ERR(ctrl->connect_q); + goto out_free_tag_set; + } + } + } else { + blk_mq_update_nr_hw_queues(ctrl->tagset, + ctrl->queue_count - 1); + } + + ret = nvme_tcp_start_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; + + return 0; + +out_cleanup_connect_q: + if (new && (ctrl->ops->flags & NVME_F_FABRICS)) + blk_cleanup_queue(ctrl->connect_q); +out_free_tag_set: + if (new) + blk_mq_free_tag_set(ctrl->tagset); +out_free_io_queues: + nvme_tcp_free_io_queues(ctrl); + return ret; +} + +static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove) +{ + nvme_tcp_stop_queue(ctrl, 0); + if (remove) { + free_opal_dev(ctrl->opal_dev); + blk_cleanup_queue(ctrl->admin_q); + blk_mq_free_tag_set(ctrl->admin_tagset); + } + nvme_tcp_free_admin_queue(ctrl); +} + +static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) +{ + int error; + + error = nvme_tcp_alloc_admin_queue(ctrl); + if (error) + return error; + + if (new) { + ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true); + if (IS_ERR(ctrl->admin_tagset)) { + error = PTR_ERR(ctrl->admin_tagset); + goto out_free_queue; + } + + ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset); + if (IS_ERR(ctrl->admin_q)) { + error = PTR_ERR(ctrl->admin_q); + goto out_free_tagset; + } + } + + error = nvme_tcp_start_queue(ctrl, 0); + if (error) + goto out_cleanup_queue; + + error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); + if (error) { + dev_err(ctrl->device, + "prop_get NVME_REG_CAP failed\n"); + goto out_stop_queue; + } + + ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); + + error = nvme_enable_ctrl(ctrl, ctrl->cap); + if (error) + goto out_stop_queue; + + error = nvme_init_identify(ctrl); + if (error) + goto out_stop_queue; + + return 0; + +out_stop_queue: + nvme_tcp_stop_queue(ctrl, 0); +out_cleanup_queue: + if (new) + blk_cleanup_queue(ctrl->admin_q); +out_free_tagset: + if (new) + blk_mq_free_tag_set(ctrl->admin_tagset); +out_free_queue: + nvme_tcp_free_admin_queue(ctrl); + return error; +} + +static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, + bool remove) +{ + blk_mq_quiesce_queue(ctrl->admin_q); + nvme_tcp_stop_queue(ctrl, 0); + blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl); + blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_tcp_destroy_admin_queue(ctrl, remove); +} + +static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, + bool remove) +{ + if (ctrl->queue_count <= 1) + return; + nvme_stop_queues(ctrl); + nvme_tcp_stop_io_queues(ctrl); + blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl); + if (remove) + nvme_start_queues(ctrl); + nvme_tcp_destroy_io_queues(ctrl, remove); +} + +static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) +{ + /* If we are resetting/deleting then do nothing */ + if (ctrl->state != NVME_CTRL_CONNECTING) { + WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW || + ctrl->state == NVME_CTRL_LIVE); + return; + } + + if (nvmf_should_reconnect(ctrl)) { + dev_info(ctrl->device, "Reconnecting in %d seconds...\n", + ctrl->opts->reconnect_delay); + queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work, + ctrl->opts->reconnect_delay * HZ); + } else { + dev_info(ctrl->device, "Removing controller...\n"); + nvme_delete_ctrl(ctrl); + } +} + +static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) +{ + struct nvmf_ctrl_options *opts = ctrl->opts; + int ret = -EINVAL; + + ret = nvme_tcp_configure_admin_queue(ctrl, new); + if (ret) + return ret; + + if (ctrl->icdoff) { + dev_err(ctrl->device, "icdoff is not supported!\n"); + goto destroy_admin; + } + + if (opts->queue_size > ctrl->sqsize + 1) + dev_warn(ctrl->device, + "queue_size %zu > ctrl sqsize %u, clamping down\n", + opts->queue_size, ctrl->sqsize + 1); + + if (ctrl->sqsize + 1 > ctrl->maxcmd) { + dev_warn(ctrl->device, + "sqsize %u > ctrl maxcmd %u, clamping down\n", + ctrl->sqsize + 1, ctrl->maxcmd); + ctrl->sqsize = ctrl->maxcmd - 1; + } + + if (ctrl->queue_count > 1) { + ret = nvme_tcp_configure_io_queues(ctrl, new); + if (ret) + goto destroy_admin; + } + + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + ret = -EINVAL; + goto destroy_io; + } + + nvme_start_ctrl(ctrl); + return 0; + +destroy_io: + if (ctrl->queue_count > 1) + nvme_tcp_destroy_io_queues(ctrl, new); +destroy_admin: + nvme_tcp_stop_queue(ctrl, 0); + nvme_tcp_destroy_admin_queue(ctrl, new); + return ret; +} + +static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work) +{ + struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work), + struct nvme_tcp_ctrl, connect_work); + struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; + + ++ctrl->nr_reconnects; + + if (nvme_tcp_setup_ctrl(ctrl, false)) + goto requeue; + + dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n", + ctrl->nr_reconnects); + + ctrl->nr_reconnects = 0; + + return; + +requeue: + dev_info(ctrl->device, "Failed reconnect attempt %d\n", + ctrl->nr_reconnects); + nvme_tcp_reconnect_or_remove(ctrl); +} + +static void nvme_tcp_error_recovery_work(struct work_struct *work) +{ + struct nvme_tcp_ctrl *tcp_ctrl = container_of(work, + struct nvme_tcp_ctrl, err_work); + struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; + + nvme_stop_keep_alive(ctrl); + nvme_tcp_teardown_io_queues(ctrl, false); + /* unquiesce to fail fast pending requests */ + nvme_start_queues(ctrl); + nvme_tcp_teardown_admin_queue(ctrl, false); + + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + return; + } + + nvme_tcp_reconnect_or_remove(ctrl); +} + +static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) +{ + nvme_tcp_teardown_io_queues(ctrl, shutdown); + if (shutdown) + nvme_shutdown_ctrl(ctrl); + else + nvme_disable_ctrl(ctrl, ctrl->cap); + nvme_tcp_teardown_admin_queue(ctrl, shutdown); +} + +static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl) +{ + nvme_tcp_teardown_ctrl(ctrl, true); +} + +static void nvme_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, reset_work); + + nvme_stop_ctrl(ctrl); + nvme_tcp_teardown_ctrl(ctrl, false); + + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + return; + } + + if (nvme_tcp_setup_ctrl(ctrl, false)) + goto out_fail; + + return; + +out_fail: + ++ctrl->nr_reconnects; + nvme_tcp_reconnect_or_remove(ctrl); +} + +static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl) +{ + cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work); + cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); +} + +static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + mutex_lock(&nvme_tcp_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_tcp_ctrl_mutex); + + nvmf_free_options(nctrl->opts); +free_ctrl: + kfree(ctrl->queues); + kfree(ctrl); +} + +static void nvme_tcp_set_sg_null(struct nvme_command *c) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + sg->addr = 0; + sg->length = 0; + sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | + NVME_SGL_FMT_TRANSPORT_A; +} + +static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue, + struct nvme_command *c, u32 data_len) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); + sg->length = cpu_to_le32(data_len); + sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; +} + +static void nvme_tcp_set_sg_host_data(struct nvme_command *c, + u32 data_len) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + sg->addr = 0; + sg->length = cpu_to_le32(data_len); + sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | + NVME_SGL_FMT_TRANSPORT_A; +} + +static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg); + struct nvme_tcp_queue *queue = &ctrl->queues[0]; + struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu; + struct nvme_command *cmd = &pdu->cmd; + u8 hdgst = nvme_tcp_hdgst_len(queue); + + memset(pdu, 0, sizeof(*pdu)); + pdu->hdr.type = nvme_tcp_cmd; + if (queue->hdr_digest) + pdu->hdr.flags |= NVME_TCP_F_HDGST; + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); + + cmd->common.opcode = nvme_admin_async_event; + cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH; + cmd->common.flags |= NVME_CMD_SGL_METABUF; + nvme_tcp_set_sg_null(cmd); + + ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU; + ctrl->async_req.offset = 0; + ctrl->async_req.curr_bio = NULL; + ctrl->async_req.data_len = 0; + + nvme_tcp_queue_request(&ctrl->async_req); +} + +static enum blk_eh_timer_return +nvme_tcp_timeout(struct request *rq, bool reserved) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_ctrl *ctrl = req->queue->ctrl; + struct nvme_tcp_cmd_pdu *pdu = req->pdu; + + dev_dbg(ctrl->ctrl.device, + "queue %d: timeout request %#x type %d\n", + nvme_tcp_queue_id(req->queue), rq->tag, + pdu->hdr.type); + + if (ctrl->ctrl.state != NVME_CTRL_LIVE) { + union nvme_result res = {}; + + nvme_req(rq)->flags |= NVME_REQ_CANCELLED; + nvme_end_request(rq, cpu_to_le16(NVME_SC_ABORT_REQ), res); + return BLK_EH_DONE; + } + + /* queue error recovery */ + nvme_tcp_error_recovery(&ctrl->ctrl); + + return BLK_EH_RESET_TIMER; +} + +static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue, + struct request *rq) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_command *c = &pdu->cmd; + + c->common.flags |= NVME_CMD_SGL_METABUF; + + if (rq_data_dir(rq) == WRITE && req->data_len && + req->data_len <= nvme_tcp_inline_data_size(queue)) + nvme_tcp_set_sg_inline(queue, c, req->data_len); + else + nvme_tcp_set_sg_host_data(c, req->data_len); + + return 0; +} + +static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, + struct request *rq) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_queue *queue = req->queue; + u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0; + blk_status_t ret; + + ret = nvme_setup_cmd(ns, rq, &pdu->cmd); + if (ret) + return ret; + + req->state = NVME_TCP_SEND_CMD_PDU; + req->offset = 0; + req->data_sent = 0; + req->pdu_len = 0; + req->pdu_sent = 0; + req->data_len = blk_rq_payload_bytes(rq); + req->curr_bio = rq->bio; + + if (rq_data_dir(rq) == WRITE && + req->data_len <= nvme_tcp_inline_data_size(queue)) + req->pdu_len = req->data_len; + else if (req->curr_bio) + nvme_tcp_init_iter(req, READ); + + pdu->hdr.type = nvme_tcp_cmd; + pdu->hdr.flags = 0; + if (queue->hdr_digest) + pdu->hdr.flags |= NVME_TCP_F_HDGST; + if (queue->data_digest && req->pdu_len) { + pdu->hdr.flags |= NVME_TCP_F_DDGST; + ddgst = nvme_tcp_ddgst_len(queue); + } + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0; + pdu->hdr.plen = + cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst); + + ret = nvme_tcp_map_data(queue, rq); + if (unlikely(ret)) { + dev_err(queue->ctrl->ctrl.device, + "Failed to map data (%d)\n", ret); + return ret; + } + + return 0; +} + +static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_tcp_queue *queue = hctx->driver_data; + struct request *rq = bd->rq; + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags); + blk_status_t ret; + + if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); + + ret = nvme_tcp_setup_cmd_pdu(ns, rq); + if (unlikely(ret)) + return ret; + + blk_mq_start_request(rq); + + nvme_tcp_queue_request(req); + + return BLK_STS_OK; +} + +static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) +{ + struct nvme_tcp_ctrl *ctrl = set->driver_data; + + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues; + if (ctrl->ctrl.opts->nr_write_queues) { + /* separate read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->ctrl.opts->nr_write_queues; + set->map[HCTX_TYPE_READ].queue_offset = + ctrl->ctrl.opts->nr_write_queues; + } else { + /* mixed read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->ctrl.opts->nr_io_queues; + set->map[HCTX_TYPE_READ].queue_offset = 0; + } + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); + blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); + return 0; +} + +static struct blk_mq_ops nvme_tcp_mq_ops = { + .queue_rq = nvme_tcp_queue_rq, + .complete = nvme_complete_rq, + .init_request = nvme_tcp_init_request, + .exit_request = nvme_tcp_exit_request, + .init_hctx = nvme_tcp_init_hctx, + .timeout = nvme_tcp_timeout, + .map_queues = nvme_tcp_map_queues, +}; + +static struct blk_mq_ops nvme_tcp_admin_mq_ops = { + .queue_rq = nvme_tcp_queue_rq, + .complete = nvme_complete_rq, + .init_request = nvme_tcp_init_request, + .exit_request = nvme_tcp_exit_request, + .init_hctx = nvme_tcp_init_admin_hctx, + .timeout = nvme_tcp_timeout, +}; + +static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = { + .name = "tcp", + .module = THIS_MODULE, + .flags = NVME_F_FABRICS, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .free_ctrl = nvme_tcp_free_ctrl, + .submit_async_event = nvme_tcp_submit_async_event, + .delete_ctrl = nvme_tcp_delete_ctrl, + .get_address = nvmf_get_address, + .stop_ctrl = nvme_tcp_stop_ctrl, +}; + +static bool +nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts) +{ + struct nvme_tcp_ctrl *ctrl; + bool found = false; + + mutex_lock(&nvme_tcp_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) { + found = nvmf_ip_options_match(&ctrl->ctrl, opts); + if (found) + break; + } + mutex_unlock(&nvme_tcp_ctrl_mutex); + + return found; +} + +static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_tcp_ctrl *ctrl; + int ret; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ctrl->list); + ctrl->ctrl.opts = opts; + ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1; + ctrl->ctrl.sqsize = opts->queue_size - 1; + ctrl->ctrl.kato = opts->kato; + + INIT_DELAYED_WORK(&ctrl->connect_work, + nvme_tcp_reconnect_ctrl_work); + INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work); + INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work); + + if (!(opts->mask & NVMF_OPT_TRSVCID)) { + opts->trsvcid = + kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL); + if (!opts->trsvcid) { + ret = -ENOMEM; + goto out_free_ctrl; + } + opts->mask |= NVMF_OPT_TRSVCID; + } + + ret = inet_pton_with_scope(&init_net, AF_UNSPEC, + opts->traddr, opts->trsvcid, &ctrl->addr); + if (ret) { + pr_err("malformed address passed: %s:%s\n", + opts->traddr, opts->trsvcid); + goto out_free_ctrl; + } + + if (opts->mask & NVMF_OPT_HOST_TRADDR) { + ret = inet_pton_with_scope(&init_net, AF_UNSPEC, + opts->host_traddr, NULL, &ctrl->src_addr); + if (ret) { + pr_err("malformed src address passed: %s\n", + opts->host_traddr); + goto out_free_ctrl; + } + } + + if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) { + ret = -EALREADY; + goto out_free_ctrl; + } + + ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues), + GFP_KERNEL); + if (!ctrl->queues) { + ret = -ENOMEM; + goto out_free_ctrl; + } + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0); + if (ret) + goto out_kfree_queues; + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { + WARN_ON_ONCE(1); + ret = -EINTR; + goto out_uninit_ctrl; + } + + ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true); + if (ret) + goto out_uninit_ctrl; + + dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", + ctrl->ctrl.opts->subsysnqn, &ctrl->addr); + + nvme_get_ctrl(&ctrl->ctrl); + + mutex_lock(&nvme_tcp_ctrl_mutex); + list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list); + mutex_unlock(&nvme_tcp_ctrl_mutex); + + return &ctrl->ctrl; + +out_uninit_ctrl: + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); + if (ret > 0) + ret = -EIO; + return ERR_PTR(ret); +out_kfree_queues: + kfree(ctrl->queues); +out_free_ctrl: + kfree(ctrl); + return ERR_PTR(ret); +} + +static struct nvmf_transport_ops nvme_tcp_transport = { + .name = "tcp", + .module = THIS_MODULE, + .required_opts = NVMF_OPT_TRADDR, + .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | + NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | + NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | + NVMF_OPT_NR_WRITE_QUEUES, + .create_ctrl = nvme_tcp_create_ctrl, +}; + +static int __init nvme_tcp_init_module(void) +{ + nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + if (!nvme_tcp_wq) + return -ENOMEM; + + nvmf_register_transport(&nvme_tcp_transport); + return 0; +} + +static void __exit nvme_tcp_cleanup_module(void) +{ + struct nvme_tcp_ctrl *ctrl; + + nvmf_unregister_transport(&nvme_tcp_transport); + + mutex_lock(&nvme_tcp_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) + nvme_delete_ctrl(&ctrl->ctrl); + mutex_unlock(&nvme_tcp_ctrl_mutex); + flush_workqueue(nvme_delete_wq); + + destroy_workqueue(nvme_tcp_wq); +} + +module_init(nvme_tcp_init_module); +module_exit(nvme_tcp_cleanup_module); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 25b0e310f4a8..5566dda3237a 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -139,3 +139,6 @@ const char *nvme_trace_disk_name(struct trace_seq *p, char *name) return ret; } +EXPORT_SYMBOL_GPL(nvme_trace_disk_name); + +EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_sq); diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 196d5bd56718..3564120aa7b3 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -115,8 +115,8 @@ TRACE_EVENT(nvme_setup_cmd, __entry->nsid = le32_to_cpu(cmd->common.nsid); __entry->metadata = le64_to_cpu(cmd->common.metadata); __assign_disk_name(__entry->disk, req->rq_disk); - memcpy(__entry->cdw10, cmd->common.cdw10, - sizeof(__entry->cdw10)); + memcpy(__entry->cdw10, &cmd->common.cdw10, + 6 * sizeof(__entry->cdw10)); ), TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", __entry->ctrl_id, __print_disk_name(__entry->disk), @@ -184,6 +184,29 @@ TRACE_EVENT(nvme_async_event, #undef aer_name +TRACE_EVENT(nvme_sq, + TP_PROTO(struct request *req, __le16 sq_head, int sq_tail), + TP_ARGS(req, sq_head, sq_tail), + TP_STRUCT__entry( + __field(int, ctrl_id) + __array(char, disk, DISK_NAME_LEN) + __field(int, qid) + __field(u16, sq_head) + __field(u16, sq_tail) + ), + TP_fast_assign( + __entry->ctrl_id = nvme_req(req)->ctrl->instance; + __assign_disk_name(__entry->disk, req->rq_disk); + __entry->qid = nvme_req_qid(req); + __entry->sq_head = le16_to_cpu(sq_head); + __entry->sq_tail = sq_tail; + ), + TP_printk("nvme%d: %sqid=%d, head=%u, tail=%u", + __entry->ctrl_id, __print_disk_name(__entry->disk), + __entry->qid, __entry->sq_head, __entry->sq_tail + ) +); + #endif /* _TRACE_NVME_H */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 3c7b61ddb0d1..d94f25cde019 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -60,3 +60,13 @@ config NVME_TARGET_FCLOOP to test NVMe-FC transport interfaces. If unsure, say N. + +config NVME_TARGET_TCP + tristate "NVMe over Fabrics TCP target support" + depends on INET + depends on NVME_TARGET + help + This enables the NVMe TCP target support, which allows exporting NVMe + devices over TCP. + + If unsure, say N. diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile index 8118c93391c6..8c3ad0fb6860 100644 --- a/drivers/nvme/target/Makefile +++ b/drivers/nvme/target/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_NVME_TARGET_LOOP) += nvme-loop.o obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o obj-$(CONFIG_NVME_TARGET_FC) += nvmet-fc.o obj-$(CONFIG_NVME_TARGET_FCLOOP) += nvme-fcloop.o +obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \ discovery.o io-cmd-file.o io-cmd-bdev.o @@ -12,3 +13,4 @@ nvme-loop-y += loop.o nvmet-rdma-y += rdma.o nvmet-fc-y += fc.o nvme-fcloop-y += fcloop.o +nvmet-tcp-y += tcp.o diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 1179f6314323..11baeb14c388 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -19,19 +19,6 @@ #include #include "nvmet.h" -/* - * This helper allows us to clear the AEN based on the RAE bit, - * Please use this helper when processing the log pages which are - * associated with the AEN. - */ -static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit) -{ - int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15; - - if (!rae) - clear_bit(aen_bit, &req->sq->ctrl->aen_masked); -} - u32 nvmet_get_log_page_len(struct nvme_command *cmd) { u32 len = le16_to_cpu(cmd->get_log_page.numdu); @@ -50,6 +37,34 @@ static void nvmet_execute_get_log_page_noop(struct nvmet_req *req) nvmet_req_complete(req, nvmet_zero_sgl(req, 0, req->data_len)); } +static void nvmet_execute_get_log_page_error(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 status = NVME_SC_SUCCESS; + unsigned long flags; + off_t offset = 0; + u64 slot; + u64 i; + + spin_lock_irqsave(&ctrl->error_lock, flags); + slot = ctrl->err_counter % NVMET_ERROR_LOG_SLOTS; + + for (i = 0; i < NVMET_ERROR_LOG_SLOTS; i++) { + status = nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot], + sizeof(struct nvme_error_slot)); + if (status) + break; + + if (slot == 0) + slot = NVMET_ERROR_LOG_SLOTS - 1; + else + slot--; + offset += sizeof(struct nvme_error_slot); + } + spin_unlock_irqrestore(&ctrl->error_lock, flags); + nvmet_req_complete(req, status); +} + static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, struct nvme_smart_log *slog) { @@ -60,6 +75,7 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, if (!ns) { pr_err("Could not find namespace id : %d\n", le32_to_cpu(req->cmd->get_log_page.nsid)); + req->error_loc = offsetof(struct nvme_rw_command, nsid); return NVME_SC_INVALID_NS; } @@ -119,6 +135,7 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req) { struct nvme_smart_log *log; u16 status = NVME_SC_INTERNAL; + unsigned long flags; if (req->data_len != sizeof(*log)) goto out; @@ -134,6 +151,11 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req) if (status) goto out_free_log; + spin_lock_irqsave(&req->sq->ctrl->error_lock, flags); + put_unaligned_le64(req->sq->ctrl->err_counter, + &log->num_err_log_entries); + spin_unlock_irqrestore(&req->sq->ctrl->error_lock, flags); + status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); out_free_log: kfree(log); @@ -189,7 +211,7 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) if (!status) status = nvmet_zero_sgl(req, len, req->data_len - len); ctrl->nr_changed_ns = 0; - nvmet_clear_aen(req, NVME_AEN_CFG_NS_ATTR); + nvmet_clear_aen_bit(req, NVME_AEN_BIT_NS_ATTR); mutex_unlock(&ctrl->lock); out: nvmet_req_complete(req, status); @@ -252,7 +274,7 @@ static void nvmet_execute_get_log_page_ana(struct nvmet_req *req) hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt); hdr.ngrps = cpu_to_le16(ngrps); - nvmet_clear_aen(req, NVME_AEN_CFG_ANA_CHANGE); + nvmet_clear_aen_bit(req, NVME_AEN_BIT_ANA_CHANGE); up_read(&nvmet_ana_sem); kfree(desc); @@ -304,7 +326,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) /* XXX: figure out what to do about RTD3R/RTD3 */ id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL); - id->ctratt = cpu_to_le32(1 << 0); + id->ctratt = cpu_to_le32(NVME_CTRL_ATTR_HID_128_BIT | + NVME_CTRL_ATTR_TBKAS); id->oacs = 0; @@ -392,6 +415,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) u16 status = 0; if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) { + req->error_loc = offsetof(struct nvme_identify, nsid); status = NVME_SC_INVALID_NS | NVME_SC_DNR; goto out; } @@ -512,6 +536,7 @@ static void nvmet_execute_identify_desclist(struct nvmet_req *req) ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); if (!ns) { + req->error_loc = offsetof(struct nvme_identify, nsid); status = NVME_SC_INVALID_NS | NVME_SC_DNR; goto out; } @@ -569,13 +594,15 @@ static u16 nvmet_write_protect_flush_sync(struct nvmet_req *req) static u16 nvmet_set_feat_write_protect(struct nvmet_req *req) { - u32 write_protect = le32_to_cpu(req->cmd->common.cdw10[1]); + u32 write_protect = le32_to_cpu(req->cmd->common.cdw11); struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u16 status = NVME_SC_FEATURE_NOT_CHANGEABLE; req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->rw.nsid); - if (unlikely(!req->ns)) + if (unlikely(!req->ns)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); return status; + } mutex_lock(&subsys->lock); switch (write_protect) { @@ -599,11 +626,36 @@ static u16 nvmet_set_feat_write_protect(struct nvmet_req *req) return status; } +u16 nvmet_set_feat_kato(struct nvmet_req *req) +{ + u32 val32 = le32_to_cpu(req->cmd->common.cdw11); + + req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); + + nvmet_set_result(req, req->sq->ctrl->kato); + + return 0; +} + +u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask) +{ + u32 val32 = le32_to_cpu(req->cmd->common.cdw11); + + if (val32 & ~mask) { + req->error_loc = offsetof(struct nvme_common_command, cdw11); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + + WRITE_ONCE(req->sq->ctrl->aen_enabled, val32); + nvmet_set_result(req, val32); + + return 0; +} + static void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; - u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); - u32 val32; + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); u16 status = 0; switch (cdw10 & 0xff) { @@ -612,19 +664,10 @@ static void nvmet_execute_set_features(struct nvmet_req *req) (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16)); break; case NVME_FEAT_KATO: - val32 = le32_to_cpu(req->cmd->common.cdw10[1]); - req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); - nvmet_set_result(req, req->sq->ctrl->kato); + status = nvmet_set_feat_kato(req); break; case NVME_FEAT_ASYNC_EVENT: - val32 = le32_to_cpu(req->cmd->common.cdw10[1]); - if (val32 & ~NVMET_AEN_CFG_ALL) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; - break; - } - - WRITE_ONCE(req->sq->ctrl->aen_enabled, val32); - nvmet_set_result(req, val32); + status = nvmet_set_feat_async_event(req, NVMET_AEN_CFG_ALL); break; case NVME_FEAT_HOST_ID: status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; @@ -633,6 +676,7 @@ static void nvmet_execute_set_features(struct nvmet_req *req) status = nvmet_set_feat_write_protect(req); break; default: + req->error_loc = offsetof(struct nvme_common_command, cdw10); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; } @@ -646,9 +690,10 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req) u32 result; req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->common.nsid); - if (!req->ns) + if (!req->ns) { + req->error_loc = offsetof(struct nvme_common_command, nsid); return NVME_SC_INVALID_NS | NVME_SC_DNR; - + } mutex_lock(&subsys->lock); if (req->ns->readonly == true) result = NVME_NS_WRITE_PROTECT; @@ -660,10 +705,20 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req) return 0; } +void nvmet_get_feat_kato(struct nvmet_req *req) +{ + nvmet_set_result(req, req->sq->ctrl->kato * 1000); +} + +void nvmet_get_feat_async_event(struct nvmet_req *req) +{ + nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled)); +} + static void nvmet_execute_get_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; - u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); u16 status = 0; switch (cdw10 & 0xff) { @@ -689,7 +744,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req) break; #endif case NVME_FEAT_ASYNC_EVENT: - nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled)); + nvmet_get_feat_async_event(req); break; case NVME_FEAT_VOLATILE_WC: nvmet_set_result(req, 1); @@ -699,11 +754,13 @@ static void nvmet_execute_get_features(struct nvmet_req *req) (subsys->max_qid-1) | ((subsys->max_qid-1) << 16)); break; case NVME_FEAT_KATO: - nvmet_set_result(req, req->sq->ctrl->kato * 1000); + nvmet_get_feat_kato(req); break; case NVME_FEAT_HOST_ID: /* need 128-bit host identifier flag */ - if (!(req->cmd->common.cdw10[1] & cpu_to_le32(1 << 0))) { + if (!(req->cmd->common.cdw11 & cpu_to_le32(1 << 0))) { + req->error_loc = + offsetof(struct nvme_common_command, cdw11); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; } @@ -715,6 +772,8 @@ static void nvmet_execute_get_features(struct nvmet_req *req) status = nvmet_get_feat_write_protect(req); break; default: + req->error_loc = + offsetof(struct nvme_common_command, cdw10); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; } @@ -722,7 +781,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req) nvmet_req_complete(req, status); } -static void nvmet_execute_async_event(struct nvmet_req *req) +void nvmet_execute_async_event(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -738,7 +797,7 @@ static void nvmet_execute_async_event(struct nvmet_req *req) schedule_work(&ctrl->async_event_work); } -static void nvmet_execute_keep_alive(struct nvmet_req *req) +void nvmet_execute_keep_alive(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -764,13 +823,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) switch (cmd->get_log_page.lid) { case NVME_LOG_ERROR: - /* - * We currently never set the More bit in the status - * field, so all error log entries are invalid and can - * be zeroed out. This is called a minum viable - * implementation (TM) of this mandatory log page. - */ - req->execute = nvmet_execute_get_log_page_noop; + req->execute = nvmet_execute_get_log_page_error; return 0; case NVME_LOG_SMART: req->execute = nvmet_execute_get_log_page_smart; @@ -836,5 +889,6 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, req->sq->qid); + req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index d895579b6c5d..618bbd006544 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -25,12 +25,16 @@ static const struct config_item_type nvmet_host_type; static const struct config_item_type nvmet_subsys_type; +static LIST_HEAD(nvmet_ports_list); +struct list_head *nvmet_ports = &nvmet_ports_list; + static const struct nvmet_transport_name { u8 type; const char *name; } nvmet_transport_names[] = { { NVMF_TRTYPE_RDMA, "rdma" }, { NVMF_TRTYPE_FC, "fc" }, + { NVMF_TRTYPE_TCP, "tcp" }, { NVMF_TRTYPE_LOOP, "loop" }, }; @@ -150,7 +154,8 @@ CONFIGFS_ATTR(nvmet_, addr_traddr); static ssize_t nvmet_addr_treq_show(struct config_item *item, char *page) { - switch (to_nvmet_port(item)->disc_addr.treq) { + switch (to_nvmet_port(item)->disc_addr.treq & + NVME_TREQ_SECURE_CHANNEL_MASK) { case NVMF_TREQ_NOT_SPECIFIED: return sprintf(page, "not specified\n"); case NVMF_TREQ_REQUIRED: @@ -166,6 +171,7 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item, const char *page, size_t count) { struct nvmet_port *port = to_nvmet_port(item); + u8 treq = port->disc_addr.treq & ~NVME_TREQ_SECURE_CHANNEL_MASK; if (port->enabled) { pr_err("Cannot modify address while enabled\n"); @@ -174,15 +180,16 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item, } if (sysfs_streq(page, "not specified")) { - port->disc_addr.treq = NVMF_TREQ_NOT_SPECIFIED; + treq |= NVMF_TREQ_NOT_SPECIFIED; } else if (sysfs_streq(page, "required")) { - port->disc_addr.treq = NVMF_TREQ_REQUIRED; + treq |= NVMF_TREQ_REQUIRED; } else if (sysfs_streq(page, "not required")) { - port->disc_addr.treq = NVMF_TREQ_NOT_REQUIRED; + treq |= NVMF_TREQ_NOT_REQUIRED; } else { pr_err("Invalid value '%s' for treq\n", page); return -EINVAL; } + port->disc_addr.treq = treq; return count; } @@ -646,7 +653,8 @@ static int nvmet_port_subsys_allow_link(struct config_item *parent, } list_add_tail(&link->entry, &port->subsystems); - nvmet_genctr++; + nvmet_port_disc_changed(port, subsys); + up_write(&nvmet_config_sem); return 0; @@ -673,7 +681,8 @@ static void nvmet_port_subsys_drop_link(struct config_item *parent, found: list_del(&p->entry); - nvmet_genctr++; + nvmet_port_disc_changed(port, subsys); + if (list_empty(&port->subsystems)) nvmet_disable_port(port); up_write(&nvmet_config_sem); @@ -722,7 +731,8 @@ static int nvmet_allowed_hosts_allow_link(struct config_item *parent, goto out_free_link; } list_add_tail(&link->entry, &subsys->hosts); - nvmet_genctr++; + nvmet_subsys_disc_changed(subsys, host); + up_write(&nvmet_config_sem); return 0; out_free_link: @@ -748,7 +758,8 @@ static void nvmet_allowed_hosts_drop_link(struct config_item *parent, found: list_del(&p->entry); - nvmet_genctr++; + nvmet_subsys_disc_changed(subsys, host); + up_write(&nvmet_config_sem); kfree(p); } @@ -787,7 +798,11 @@ static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item, goto out_unlock; } - subsys->allow_any_host = allow_any_host; + if (subsys->allow_any_host != allow_any_host) { + subsys->allow_any_host = allow_any_host; + nvmet_subsys_disc_changed(subsys, NULL); + } + out_unlock: up_write(&nvmet_config_sem); return ret ? ret : count; @@ -936,7 +951,7 @@ static ssize_t nvmet_referral_enable_store(struct config_item *item, if (enable) nvmet_referral_enable(parent, port); else - nvmet_referral_disable(port); + nvmet_referral_disable(parent, port); return count; inval: @@ -962,9 +977,10 @@ static struct configfs_attribute *nvmet_referral_attrs[] = { static void nvmet_referral_release(struct config_item *item) { + struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent); struct nvmet_port *port = to_nvmet_port(item); - nvmet_referral_disable(port); + nvmet_referral_disable(parent, port); kfree(port); } @@ -1137,6 +1153,8 @@ static void nvmet_port_release(struct config_item *item) { struct nvmet_port *port = to_nvmet_port(item); + list_del(&port->global_entry); + kfree(port->ana_state); kfree(port); } @@ -1189,12 +1207,15 @@ static struct config_group *nvmet_ports_make(struct config_group *group, port->ana_state[i] = NVME_ANA_INACCESSIBLE; } + list_add(&port->global_entry, &nvmet_ports_list); + INIT_LIST_HEAD(&port->entry); INIT_LIST_HEAD(&port->subsystems); INIT_LIST_HEAD(&port->referrals); port->inline_data_size = -1; /* < 0 == let the transport choose */ port->disc_addr.portid = cpu_to_le16(portid); + port->disc_addr.treq = NVMF_TREQ_DISABLE_SQFLOW; config_group_init_type_name(&port->group, name, &nvmet_port_type); config_group_init_type_name(&port->subsys_group, diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index a5f9bbce863f..88d260f31835 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -45,28 +45,72 @@ u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; u64 nvmet_ana_chgcnt; DECLARE_RWSEM(nvmet_ana_sem); +inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno) +{ + u16 status; + + switch (errno) { + case -ENOSPC: + req->error_loc = offsetof(struct nvme_rw_command, length); + status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR; + break; + case -EREMOTEIO: + req->error_loc = offsetof(struct nvme_rw_command, slba); + status = NVME_SC_LBA_RANGE | NVME_SC_DNR; + break; + case -EOPNOTSUPP: + req->error_loc = offsetof(struct nvme_common_command, opcode); + switch (req->cmd->common.opcode) { + case nvme_cmd_dsm: + case nvme_cmd_write_zeroes: + status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR; + break; + default: + status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + break; + case -ENODATA: + req->error_loc = offsetof(struct nvme_rw_command, nsid); + status = NVME_SC_ACCESS_DENIED; + break; + case -EIO: + /* FALLTHRU */ + default: + req->error_loc = offsetof(struct nvme_common_command, opcode); + status = NVME_SC_INTERNAL | NVME_SC_DNR; + } + + return status; +} + static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, const char *subsysnqn); u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, size_t len) { - if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) { + req->error_loc = offsetof(struct nvme_common_command, dptr); return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + } return 0; } u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) { - if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) { + req->error_loc = offsetof(struct nvme_common_command, dptr); return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + } return 0; } u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len) { - if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) + if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) { + req->error_loc = offsetof(struct nvme_common_command, dptr); return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + } return 0; } @@ -130,7 +174,7 @@ static void nvmet_async_event_work(struct work_struct *work) } } -static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, +void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, u8 event_info, u8 log_page) { struct nvmet_async_event *aen; @@ -150,13 +194,6 @@ static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, schedule_work(&ctrl->async_event_work); } -static bool nvmet_aen_disabled(struct nvmet_ctrl *ctrl, u32 aen) -{ - if (!(READ_ONCE(ctrl->aen_enabled) & aen)) - return true; - return test_and_set_bit(aen, &ctrl->aen_masked); -} - static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid) { u32 i; @@ -187,7 +224,7 @@ void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid) list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid)); - if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_NS_ATTR)) + if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR)) continue; nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, NVME_AER_NOTICE_NS_CHANGED, @@ -204,7 +241,7 @@ void nvmet_send_ana_event(struct nvmet_subsys *subsys, list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { if (port && ctrl->port != port) continue; - if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_ANA_CHANGE)) + if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE)) continue; nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, NVME_AER_NOTICE_ANA, NVME_LOG_ANA); @@ -299,6 +336,15 @@ static void nvmet_keep_alive_timer(struct work_struct *work) { struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work), struct nvmet_ctrl, ka_work); + bool cmd_seen = ctrl->cmd_seen; + + ctrl->cmd_seen = false; + if (cmd_seen) { + pr_debug("ctrl %d reschedule traffic based keep-alive timer\n", + ctrl->cntlid); + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); + return; + } pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n", ctrl->cntlid, ctrl->kato); @@ -595,26 +641,58 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) return ns; } -static void __nvmet_req_complete(struct nvmet_req *req, u16 status) +static void nvmet_update_sq_head(struct nvmet_req *req) { - u32 old_sqhd, new_sqhd; - u16 sqhd; - - if (status) - nvmet_set_status(req, status); - if (req->sq->size) { + u32 old_sqhd, new_sqhd; + do { old_sqhd = req->sq->sqhd; new_sqhd = (old_sqhd + 1) % req->sq->size; } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) != old_sqhd); } - sqhd = req->sq->sqhd & 0x0000FFFF; - req->rsp->sq_head = cpu_to_le16(sqhd); + req->rsp->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF); +} + +static void nvmet_set_error(struct nvmet_req *req, u16 status) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_error_slot *new_error_slot; + unsigned long flags; + + req->rsp->status = cpu_to_le16(status << 1); + + if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC) + return; + + spin_lock_irqsave(&ctrl->error_lock, flags); + ctrl->err_counter++; + new_error_slot = + &ctrl->slots[ctrl->err_counter % NVMET_ERROR_LOG_SLOTS]; + + new_error_slot->error_count = cpu_to_le64(ctrl->err_counter); + new_error_slot->sqid = cpu_to_le16(req->sq->qid); + new_error_slot->cmdid = cpu_to_le16(req->cmd->common.command_id); + new_error_slot->status_field = cpu_to_le16(status << 1); + new_error_slot->param_error_location = cpu_to_le16(req->error_loc); + new_error_slot->lba = cpu_to_le64(req->error_slba); + new_error_slot->nsid = req->cmd->common.nsid; + spin_unlock_irqrestore(&ctrl->error_lock, flags); + + /* set the more bit for this request */ + req->rsp->status |= cpu_to_le16(1 << 14); +} + +static void __nvmet_req_complete(struct nvmet_req *req, u16 status) +{ + if (!req->sq->sqhd_disabled) + nvmet_update_sq_head(req); req->rsp->sq_id = cpu_to_le16(req->sq->qid); req->rsp->command_id = req->cmd->common.command_id; + if (unlikely(status)) + nvmet_set_error(req, status); if (req->ns) nvmet_put_namespace(req->ns); req->ops->queue_response(req); @@ -735,14 +813,20 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) return ret; req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); - if (unlikely(!req->ns)) + if (unlikely(!req->ns)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); return NVME_SC_INVALID_NS | NVME_SC_DNR; + } ret = nvmet_check_ana_state(req->port, req->ns); - if (unlikely(ret)) + if (unlikely(ret)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); return ret; + } ret = nvmet_io_cmd_check_access(req); - if (unlikely(ret)) + if (unlikely(ret)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); return ret; + } if (req->ns->file) return nvmet_file_parse_io_cmd(req); @@ -763,10 +847,14 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, req->sg_cnt = 0; req->transfer_len = 0; req->rsp->status = 0; + req->rsp->sq_head = 0; req->ns = NULL; + req->error_loc = NVMET_NO_ERROR_LOC; + req->error_slba = 0; /* no support for fused commands yet */ if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { + req->error_loc = offsetof(struct nvme_common_command, flags); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; goto fail; } @@ -777,6 +865,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, * byte aligned. */ if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) { + req->error_loc = offsetof(struct nvme_common_command, flags); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; goto fail; } @@ -801,6 +890,9 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, goto fail; } + if (sq->ctrl) + sq->ctrl->cmd_seen = true; + return true; fail: @@ -819,9 +911,10 @@ EXPORT_SYMBOL_GPL(nvmet_req_uninit); void nvmet_req_execute(struct nvmet_req *req) { - if (unlikely(req->data_len != req->transfer_len)) + if (unlikely(req->data_len != req->transfer_len)) { + req->error_loc = offsetof(struct nvme_common_command, dptr); nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR); - else + } else req->execute(req); } EXPORT_SYMBOL_GPL(nvmet_req_execute); @@ -1027,14 +1120,18 @@ u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd) return 0; } -static bool __nvmet_host_allowed(struct nvmet_subsys *subsys, - const char *hostnqn) +bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn) { struct nvmet_host_link *p; + lockdep_assert_held(&nvmet_config_sem); + if (subsys->allow_any_host) return true; + if (subsys->type == NVME_NQN_DISC) /* allow all access to disc subsys */ + return true; + list_for_each_entry(p, &subsys->hosts, entry) { if (!strcmp(nvmet_host_name(p->host), hostnqn)) return true; @@ -1043,30 +1140,6 @@ static bool __nvmet_host_allowed(struct nvmet_subsys *subsys, return false; } -static bool nvmet_host_discovery_allowed(struct nvmet_req *req, - const char *hostnqn) -{ - struct nvmet_subsys_link *s; - - list_for_each_entry(s, &req->port->subsystems, entry) { - if (__nvmet_host_allowed(s->subsys, hostnqn)) - return true; - } - - return false; -} - -bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, - const char *hostnqn) -{ - lockdep_assert_held(&nvmet_config_sem); - - if (subsys->type == NVME_NQN_DISC) - return nvmet_host_discovery_allowed(req, hostnqn); - else - return __nvmet_host_allowed(subsys, hostnqn); -} - /* * Note: ctrl->subsys->lock should be held when calling this function */ @@ -1117,7 +1190,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; down_read(&nvmet_config_sem); - if (!nvmet_host_allowed(req, subsys, hostnqn)) { + if (!nvmet_host_allowed(subsys, hostnqn)) { pr_info("connect by host %s for subsystem %s not allowed\n", hostnqn, subsysnqn); req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn); @@ -1175,31 +1248,20 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, ctrl->cntlid = ret; ctrl->ops = req->ops; - if (ctrl->subsys->type == NVME_NQN_DISC) { - /* Don't accept keep-alive timeout for discovery controllers */ - if (kato) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; - goto out_remove_ida; - } - /* - * Discovery controllers use some arbitrary high value in order - * to cleanup stale discovery sessions - * - * From the latest base diff RC: - * "The Keep Alive command is not supported by - * Discovery controllers. A transport may specify a - * fixed Discovery controller activity timeout value - * (e.g., 2 minutes). If no commands are received - * by a Discovery controller within that time - * period, the controller may perform the - * actions for Keep Alive Timer expiration". - */ - ctrl->kato = NVMET_DISC_KATO; - } else { - /* keep-alive timeout in seconds */ - ctrl->kato = DIV_ROUND_UP(kato, 1000); - } + /* + * Discovery controllers may use some arbitrary high value + * in order to cleanup stale discovery sessions + */ + if ((ctrl->subsys->type == NVME_NQN_DISC) && !kato) + kato = NVMET_DISC_KATO_MS; + + /* keep-alive timeout in seconds */ + ctrl->kato = DIV_ROUND_UP(kato, 1000); + + ctrl->err_counter = 0; + spin_lock_init(&ctrl->error_lock); + nvmet_start_keep_alive_timer(ctrl); mutex_lock(&subsys->lock); @@ -1210,8 +1272,6 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, *ctrlp = ctrl; return 0; -out_remove_ida: - ida_simple_remove(&cntlid_ida, ctrl->cntlid); out_free_sqs: kfree(ctrl->sqs); out_free_cqs: diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index bc0aa0bf1543..d2cb71a0b419 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -18,7 +18,65 @@ struct nvmet_subsys *nvmet_disc_subsys; -u64 nvmet_genctr; +static u64 nvmet_genctr; + +static void __nvmet_disc_changed(struct nvmet_port *port, + struct nvmet_ctrl *ctrl) +{ + if (ctrl->port != port) + return; + + if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_DISC_CHANGE)) + return; + + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, + NVME_AER_NOTICE_DISC_CHANGED, NVME_LOG_DISC); +} + +void nvmet_port_disc_changed(struct nvmet_port *port, + struct nvmet_subsys *subsys) +{ + struct nvmet_ctrl *ctrl; + + nvmet_genctr++; + + list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) { + if (subsys && !nvmet_host_allowed(subsys, ctrl->hostnqn)) + continue; + + __nvmet_disc_changed(port, ctrl); + } +} + +static void __nvmet_subsys_disc_changed(struct nvmet_port *port, + struct nvmet_subsys *subsys, + struct nvmet_host *host) +{ + struct nvmet_ctrl *ctrl; + + list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) { + if (host && strcmp(nvmet_host_name(host), ctrl->hostnqn)) + continue; + + __nvmet_disc_changed(port, ctrl); + } +} + +void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys, + struct nvmet_host *host) +{ + struct nvmet_port *port; + struct nvmet_subsys_link *s; + + nvmet_genctr++; + + list_for_each_entry(port, nvmet_ports, global_entry) + list_for_each_entry(s, &port->subsystems, entry) { + if (s->subsys != subsys) + continue; + __nvmet_subsys_disc_changed(port, subsys, host); + } +} void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port) { @@ -26,18 +84,18 @@ void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port) if (list_empty(&port->entry)) { list_add_tail(&port->entry, &parent->referrals); port->enabled = true; - nvmet_genctr++; + nvmet_port_disc_changed(parent, NULL); } up_write(&nvmet_config_sem); } -void nvmet_referral_disable(struct nvmet_port *port) +void nvmet_referral_disable(struct nvmet_port *parent, struct nvmet_port *port) { down_write(&nvmet_config_sem); if (!list_empty(&port->entry)) { port->enabled = false; list_del_init(&port->entry); - nvmet_genctr++; + nvmet_port_disc_changed(parent, NULL); } up_write(&nvmet_config_sem); } @@ -107,7 +165,7 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req) down_read(&nvmet_config_sem); list_for_each_entry(p, &req->port->subsystems, entry) { - if (!nvmet_host_allowed(req, p->subsys, ctrl->hostnqn)) + if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn)) continue; if (residual_len >= entry_size) { char traddr[NVMF_TRADDR_SIZE]; @@ -136,6 +194,8 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req) hdr->numrec = cpu_to_le64(numrec); hdr->recfmt = cpu_to_le16(0); + nvmet_clear_aen_bit(req, NVME_AEN_BIT_DISC_CHANGE); + up_read(&nvmet_config_sem); status = nvmet_copy_to_sgl(req, 0, hdr, data_len); @@ -174,6 +234,8 @@ static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req) if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); + id->oaes = cpu_to_le32(NVMET_DISC_AEN_CFG_OPTIONAL); + strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); @@ -183,6 +245,51 @@ out: nvmet_req_complete(req, status); } +static void nvmet_execute_disc_set_features(struct nvmet_req *req) +{ + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); + u16 stat; + + switch (cdw10 & 0xff) { + case NVME_FEAT_KATO: + stat = nvmet_set_feat_kato(req); + break; + case NVME_FEAT_ASYNC_EVENT: + stat = nvmet_set_feat_async_event(req, + NVMET_DISC_AEN_CFG_OPTIONAL); + break; + default: + req->error_loc = + offsetof(struct nvme_common_command, cdw10); + stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, stat); +} + +static void nvmet_execute_disc_get_features(struct nvmet_req *req) +{ + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); + u16 stat = 0; + + switch (cdw10 & 0xff) { + case NVME_FEAT_KATO: + nvmet_get_feat_kato(req); + break; + case NVME_FEAT_ASYNC_EVENT: + nvmet_get_feat_async_event(req); + break; + default: + req->error_loc = + offsetof(struct nvme_common_command, cdw10); + stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, stat); +} + u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; @@ -190,10 +297,28 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { pr_err("got cmd %d while not ready\n", cmd->common.opcode); + req->error_loc = + offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } switch (cmd->common.opcode) { + case nvme_admin_set_features: + req->execute = nvmet_execute_disc_set_features; + req->data_len = 0; + return 0; + case nvme_admin_get_features: + req->execute = nvmet_execute_disc_get_features; + req->data_len = 0; + return 0; + case nvme_admin_async_event: + req->execute = nvmet_execute_async_event; + req->data_len = 0; + return 0; + case nvme_admin_keep_alive: + req->execute = nvmet_execute_keep_alive; + req->data_len = 0; + return 0; case nvme_admin_get_log_page: req->data_len = nvmet_get_log_page_len(cmd); @@ -204,6 +329,8 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) default: pr_err("unsupported get_log_page lid %d\n", cmd->get_log_page.lid); + req->error_loc = + offsetof(struct nvme_get_log_page_command, lid); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } case nvme_admin_identify: @@ -216,10 +343,12 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) default: pr_err("unsupported identify cns %d\n", cmd->identify.cns); + req->error_loc = offsetof(struct nvme_identify, cns); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } default: pr_err("unhandled cmd %d\n", cmd->common.opcode); + req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index d84ae004cb85..6cf1fd9eb32e 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -17,23 +17,26 @@ static void nvmet_execute_prop_set(struct nvmet_req *req) { + u64 val = le64_to_cpu(req->cmd->prop_set.value); u16 status = 0; - if (!(req->cmd->prop_set.attrib & 1)) { - u64 val = le64_to_cpu(req->cmd->prop_set.value); - - switch (le32_to_cpu(req->cmd->prop_set.offset)) { - case NVME_REG_CC: - nvmet_update_cc(req->sq->ctrl, val); - break; - default: - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; - break; - } - } else { + if (req->cmd->prop_set.attrib & 1) { + req->error_loc = + offsetof(struct nvmf_property_set_command, attrib); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out; } + switch (le32_to_cpu(req->cmd->prop_set.offset)) { + case NVME_REG_CC: + nvmet_update_cc(req->sq->ctrl, val); + break; + default: + req->error_loc = + offsetof(struct nvmf_property_set_command, offset); + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } +out: nvmet_req_complete(req, status); } @@ -69,6 +72,14 @@ static void nvmet_execute_prop_get(struct nvmet_req *req) } } + if (status && req->cmd->prop_get.attrib & 1) { + req->error_loc = + offsetof(struct nvmf_property_get_command, offset); + } else { + req->error_loc = + offsetof(struct nvmf_property_get_command, attrib); + } + req->rsp->result.u64 = cpu_to_le64(val); nvmet_req_complete(req, status); } @@ -89,6 +100,7 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req) default: pr_err("received unknown capsule type 0x%x\n", cmd->fabrics.fctype); + req->error_loc = offsetof(struct nvmf_common_command, fctype); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } @@ -105,16 +117,34 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) old = cmpxchg(&req->sq->ctrl, NULL, ctrl); if (old) { pr_warn("queue already connected!\n"); + req->error_loc = offsetof(struct nvmf_connect_command, opcode); return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; } if (!sqsize) { pr_warn("queue size zero!\n"); + req->error_loc = offsetof(struct nvmf_connect_command, sqsize); return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; } /* note: convert queue size from 0's-based value to 1's-based value */ nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1); nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1); + + if (c->cattr & NVME_CONNECT_DISABLE_SQFLOW) { + req->sq->sqhd_disabled = true; + req->rsp->sq_head = cpu_to_le16(0xffff); + } + + if (ctrl->ops->install_queue) { + u16 ret = ctrl->ops->install_queue(req->sq); + + if (ret) { + pr_err("failed to install queue %d cntlid %d ret %x\n", + qid, ret, ctrl->cntlid); + return ret; + } + } + return 0; } @@ -141,6 +171,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) if (c->recfmt != 0) { pr_warn("invalid connect version (%d).\n", le16_to_cpu(c->recfmt)); + req->error_loc = offsetof(struct nvmf_connect_command, recfmt); status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; goto out; } @@ -155,8 +186,13 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, le32_to_cpu(c->kato), &ctrl); - if (status) + if (status) { + if (status == (NVME_SC_INVALID_FIELD | NVME_SC_DNR)) + req->error_loc = + offsetof(struct nvme_common_command, opcode); goto out; + } + uuid_copy(&ctrl->hostid, &d->hostid); status = nvmet_install_queue(ctrl, req); @@ -243,11 +279,13 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req) if (cmd->common.opcode != nvme_fabrics_command) { pr_err("invalid command 0x%x on unconnected queue.\n", cmd->fabrics.opcode); + req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } if (cmd->fabrics.fctype != nvme_fabrics_type_connect) { pr_err("invalid capsule type 0x%x on unconnected queue.\n", cmd->fabrics.fctype); + req->error_loc = offsetof(struct nvmf_common_command, fctype); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 409081a03b24..f98f5c5bea26 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -86,8 +86,6 @@ struct nvmet_fc_fcp_iod { spinlock_t flock; struct nvmet_req req; - struct work_struct work; - struct work_struct done_work; struct work_struct defer_work; struct nvmet_fc_tgtport *tgtport; @@ -134,7 +132,6 @@ struct nvmet_fc_tgt_queue { u16 sqsize; u16 ersp_ratio; __le16 sqhd; - int cpu; atomic_t connected; atomic_t sqtail; atomic_t zrspcnt; @@ -232,8 +229,6 @@ static LIST_HEAD(nvmet_fc_portentry_list); static void nvmet_fc_handle_ls_rqst_work(struct work_struct *work); -static void nvmet_fc_handle_fcp_rqst_work(struct work_struct *work); -static void nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work); static void nvmet_fc_fcp_rqst_op_defer_work(struct work_struct *work); static void nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc); static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc); @@ -438,8 +433,6 @@ nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport, int i; for (i = 0; i < queue->sqsize; fod++, i++) { - INIT_WORK(&fod->work, nvmet_fc_handle_fcp_rqst_work); - INIT_WORK(&fod->done_work, nvmet_fc_fcp_rqst_op_done_work); INIT_WORK(&fod->defer_work, nvmet_fc_fcp_rqst_op_defer_work); fod->tgtport = tgtport; fod->queue = queue; @@ -517,10 +510,7 @@ nvmet_fc_queue_fcp_req(struct nvmet_fc_tgtport *tgtport, fcpreq->hwqid = queue->qid ? ((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0; - if (tgtport->ops->target_features & NVMET_FCTGTFEAT_CMD_IN_ISR) - queue_work_on(queue->cpu, queue->work_q, &fod->work); - else - nvmet_fc_handle_fcp_rqst(tgtport, fod); + nvmet_fc_handle_fcp_rqst(tgtport, fod); } static void @@ -599,30 +589,6 @@ nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue, queue_work(queue->work_q, &fod->defer_work); } -static int -nvmet_fc_queue_to_cpu(struct nvmet_fc_tgtport *tgtport, int qid) -{ - int cpu, idx, cnt; - - if (tgtport->ops->max_hw_queues == 1) - return WORK_CPU_UNBOUND; - - /* Simple cpu selection based on qid modulo active cpu count */ - idx = !qid ? 0 : (qid - 1) % num_active_cpus(); - - /* find the n'th active cpu */ - for (cpu = 0, cnt = 0; ; ) { - if (cpu_active(cpu)) { - if (cnt == idx) - break; - cnt++; - } - cpu = (cpu + 1) % num_possible_cpus(); - } - - return cpu; -} - static struct nvmet_fc_tgt_queue * nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, u16 qid, u16 sqsize) @@ -653,7 +619,6 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, queue->qid = qid; queue->sqsize = sqsize; queue->assoc = assoc; - queue->cpu = nvmet_fc_queue_to_cpu(assoc->tgtport, qid); INIT_LIST_HEAD(&queue->fod_list); INIT_LIST_HEAD(&queue->avail_defer_list); INIT_LIST_HEAD(&queue->pending_cmd_list); @@ -2145,26 +2110,12 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) } } -static void -nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work) -{ - struct nvmet_fc_fcp_iod *fod = - container_of(work, struct nvmet_fc_fcp_iod, done_work); - - nvmet_fc_fod_op_done(fod); -} - static void nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq) { struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private; - struct nvmet_fc_tgt_queue *queue = fod->queue; - if (fod->tgtport->ops->target_features & NVMET_FCTGTFEAT_OPDONE_IN_ISR) - /* context switch so completion is not in ISR context */ - queue_work_on(queue->cpu, queue->work_q, &fod->done_work); - else - nvmet_fc_fod_op_done(fod); + nvmet_fc_fod_op_done(fod); } /* @@ -2332,19 +2283,6 @@ transport_error: nvmet_fc_abort_op(tgtport, fod); } -/* - * Actual processing routine for received FC-NVME LS Requests from the LLD - */ -static void -nvmet_fc_handle_fcp_rqst_work(struct work_struct *work) -{ - struct nvmet_fc_fcp_iod *fod = - container_of(work, struct nvmet_fc_fcp_iod, work); - struct nvmet_fc_tgtport *tgtport = fod->tgtport; - - nvmet_fc_handle_fcp_rqst(tgtport, fod); -} - /** * nvmet_fc_rcv_fcp_req - transport entry point called by an LLDD * upon the reception of a NVME FCP CMD IU. diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index c1ec3475a140..b6d030d3259f 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -44,13 +44,69 @@ void nvmet_bdev_ns_disable(struct nvmet_ns *ns) } } +static u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts) +{ + u16 status = NVME_SC_SUCCESS; + + if (likely(blk_sts == BLK_STS_OK)) + return status; + /* + * Right now there exists M : 1 mapping between block layer error + * to the NVMe status code (see nvme_error_status()). For consistency, + * when we reverse map we use most appropriate NVMe Status code from + * the group of the NVMe staus codes used in the nvme_error_status(). + */ + switch (blk_sts) { + case BLK_STS_NOSPC: + status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR; + req->error_loc = offsetof(struct nvme_rw_command, length); + break; + case BLK_STS_TARGET: + status = NVME_SC_LBA_RANGE | NVME_SC_DNR; + req->error_loc = offsetof(struct nvme_rw_command, slba); + break; + case BLK_STS_NOTSUPP: + req->error_loc = offsetof(struct nvme_common_command, opcode); + switch (req->cmd->common.opcode) { + case nvme_cmd_dsm: + case nvme_cmd_write_zeroes: + status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR; + break; + default: + status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + break; + case BLK_STS_MEDIUM: + status = NVME_SC_ACCESS_DENIED; + req->error_loc = offsetof(struct nvme_rw_command, nsid); + break; + case BLK_STS_IOERR: + /* fallthru */ + default: + status = NVME_SC_INTERNAL | NVME_SC_DNR; + req->error_loc = offsetof(struct nvme_common_command, opcode); + } + + switch (req->cmd->common.opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + req->error_slba = le64_to_cpu(req->cmd->rw.slba); + break; + case nvme_cmd_write_zeroes: + req->error_slba = + le64_to_cpu(req->cmd->write_zeroes.slba); + break; + default: + req->error_slba = 0; + } + return status; +} + static void nvmet_bio_done(struct bio *bio) { struct nvmet_req *req = bio->bi_private; - nvmet_req_complete(req, - bio->bi_status ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); - + nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status)); if (bio != &req->b.inline_bio) bio_put(bio); } @@ -61,7 +117,6 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) struct bio *bio; struct scatterlist *sg; sector_t sector; - blk_qc_t cookie; int op, op_flags = 0, i; if (!req->sg_cnt) { @@ -114,9 +169,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) sg_cnt--; } - cookie = submit_bio(bio); - - blk_poll(bdev_get_queue(req->ns->bdev), cookie); + submit_bio(bio); } static void nvmet_bdev_execute_flush(struct nvmet_req *req) @@ -139,18 +192,21 @@ u16 nvmet_bdev_flush(struct nvmet_req *req) return 0; } -static u16 nvmet_bdev_discard_range(struct nvmet_ns *ns, +static u16 nvmet_bdev_discard_range(struct nvmet_req *req, struct nvme_dsm_range *range, struct bio **bio) { + struct nvmet_ns *ns = req->ns; int ret; ret = __blkdev_issue_discard(ns->bdev, le64_to_cpu(range->slba) << (ns->blksize_shift - 9), le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), GFP_KERNEL, 0, bio); - if (ret && ret != -EOPNOTSUPP) - return NVME_SC_INTERNAL | NVME_SC_DNR; - return 0; + + if (ret) + req->error_slba = le64_to_cpu(range->slba); + + return blk_to_nvme_status(req, errno_to_blk_status(ret)); } static void nvmet_bdev_execute_discard(struct nvmet_req *req) @@ -166,7 +222,7 @@ static void nvmet_bdev_execute_discard(struct nvmet_req *req) if (status) break; - status = nvmet_bdev_discard_range(req->ns, &range, &bio); + status = nvmet_bdev_discard_range(req, &range, &bio); if (status) break; } @@ -207,16 +263,16 @@ static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req) u16 status = NVME_SC_SUCCESS; sector_t sector; sector_t nr_sector; + int ret; sector = le64_to_cpu(write_zeroes->slba) << (req->ns->blksize_shift - 9); nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) << (req->ns->blksize_shift - 9)); - if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector, - GFP_KERNEL, &bio, 0)) - status = NVME_SC_INTERNAL | NVME_SC_DNR; - + ret = __blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector, + GFP_KERNEL, &bio, 0); + status = blk_to_nvme_status(req, errno_to_blk_status(ret)); if (bio) { bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; @@ -251,6 +307,7 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req) default: pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, req->sq->qid); + req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } } diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 01feebec29ea..517522305e5c 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -83,17 +83,16 @@ static void nvmet_file_init_bvec(struct bio_vec *bv, struct sg_page_iter *iter) } static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, - unsigned long nr_segs, size_t count) + unsigned long nr_segs, size_t count, int ki_flags) { struct kiocb *iocb = &req->f.iocb; ssize_t (*call_iter)(struct kiocb *iocb, struct iov_iter *iter); struct iov_iter iter; - int ki_flags = 0, rw; - ssize_t ret; + int rw; if (req->cmd->rw.opcode == nvme_cmd_write) { if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) - ki_flags = IOCB_DSYNC; + ki_flags |= IOCB_DSYNC; call_iter = req->ns->file->f_op->write_iter; rw = WRITE; } else { @@ -107,17 +106,13 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, iocb->ki_filp = req->ns->file; iocb->ki_flags = ki_flags | iocb_flags(req->ns->file); - ret = call_iter(iocb, &iter); - - if (ret != -EIOCBQUEUED && iocb->ki_complete) - iocb->ki_complete(iocb, ret, 0); - - return ret; + return call_iter(iocb, &iter); } static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2) { struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb); + u16 status = NVME_SC_SUCCESS; if (req->f.bvec != req->inline_bvec) { if (likely(req->f.mpool_alloc == false)) @@ -126,11 +121,12 @@ static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2) mempool_free(req->f.bvec, req->ns->bvec_pool); } - nvmet_req_complete(req, ret != req->data_len ? - NVME_SC_INTERNAL | NVME_SC_DNR : 0); + if (unlikely(ret != req->data_len)) + status = errno_to_nvme_status(req, ret); + nvmet_req_complete(req, status); } -static void nvmet_file_execute_rw(struct nvmet_req *req) +static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags) { ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE); struct sg_page_iter sg_pg_iter; @@ -140,30 +136,14 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) ssize_t ret = 0; loff_t pos; - if (!req->sg_cnt || !nr_bvec) { - nvmet_req_complete(req, 0); - return; - } + + if (req->f.mpool_alloc && nr_bvec > NVMET_MAX_MPOOL_BVEC) + is_sync = true; pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift; if (unlikely(pos + req->data_len > req->ns->size)) { - nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR); - return; - } - - if (nr_bvec > NVMET_MAX_INLINE_BIOVEC) - req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), - GFP_KERNEL); - else - req->f.bvec = req->inline_bvec; - - req->f.mpool_alloc = false; - if (unlikely(!req->f.bvec)) { - /* fallback under memory pressure */ - req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL); - req->f.mpool_alloc = true; - if (nr_bvec > NVMET_MAX_MPOOL_BVEC) - is_sync = true; + nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC)); + return true; } memset(&req->f.iocb, 0, sizeof(struct kiocb)); @@ -177,9 +157,10 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) if (unlikely(is_sync) && (nr_bvec - 1 == 0 || bv_cnt == NVMET_MAX_MPOOL_BVEC)) { - ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len); + ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len, 0); if (ret < 0) - goto out; + goto complete; + pos += len; bv_cnt = 0; len = 0; @@ -187,35 +168,95 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) nr_bvec--; } - if (WARN_ON_ONCE(total_len != req->data_len)) + if (WARN_ON_ONCE(total_len != req->data_len)) { ret = -EIO; -out: - if (unlikely(is_sync || ret)) { - nvmet_file_io_done(&req->f.iocb, ret < 0 ? ret : total_len, 0); - return; + goto complete; } - req->f.iocb.ki_complete = nvmet_file_io_done; - nvmet_file_submit_bvec(req, pos, bv_cnt, total_len); + + if (unlikely(is_sync)) { + ret = total_len; + goto complete; + } + + /* + * A NULL ki_complete ask for synchronous execution, which we want + * for the IOCB_NOWAIT case. + */ + if (!(ki_flags & IOCB_NOWAIT)) + req->f.iocb.ki_complete = nvmet_file_io_done; + + ret = nvmet_file_submit_bvec(req, pos, bv_cnt, total_len, ki_flags); + + switch (ret) { + case -EIOCBQUEUED: + return true; + case -EAGAIN: + if (WARN_ON_ONCE(!(ki_flags & IOCB_NOWAIT))) + goto complete; + return false; + case -EOPNOTSUPP: + /* + * For file systems returning error -EOPNOTSUPP, handle + * IOCB_NOWAIT error case separately and retry without + * IOCB_NOWAIT. + */ + if ((ki_flags & IOCB_NOWAIT)) + return false; + break; + } + +complete: + nvmet_file_io_done(&req->f.iocb, ret, 0); + return true; } static void nvmet_file_buffered_io_work(struct work_struct *w) { struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); - nvmet_file_execute_rw(req); + nvmet_file_execute_io(req, 0); } -static void nvmet_file_execute_rw_buffered_io(struct nvmet_req *req) +static void nvmet_file_submit_buffered_io(struct nvmet_req *req) { INIT_WORK(&req->f.work, nvmet_file_buffered_io_work); queue_work(buffered_io_wq, &req->f.work); } +static void nvmet_file_execute_rw(struct nvmet_req *req) +{ + ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE); + + if (!req->sg_cnt || !nr_bvec) { + nvmet_req_complete(req, 0); + return; + } + + if (nr_bvec > NVMET_MAX_INLINE_BIOVEC) + req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), + GFP_KERNEL); + else + req->f.bvec = req->inline_bvec; + + if (unlikely(!req->f.bvec)) { + /* fallback under memory pressure */ + req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL); + req->f.mpool_alloc = true; + } else + req->f.mpool_alloc = false; + + if (req->ns->buffered_io) { + if (likely(!req->f.mpool_alloc) && + nvmet_file_execute_io(req, IOCB_NOWAIT)) + return; + nvmet_file_submit_buffered_io(req); + } else + nvmet_file_execute_io(req, 0); +} + u16 nvmet_file_flush(struct nvmet_req *req) { - if (vfs_fsync(req->ns->file, 1) < 0) - return NVME_SC_INTERNAL | NVME_SC_DNR; - return 0; + return errno_to_nvme_status(req, vfs_fsync(req->ns->file, 1)); } static void nvmet_file_flush_work(struct work_struct *w) @@ -236,30 +277,34 @@ static void nvmet_file_execute_discard(struct nvmet_req *req) int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; struct nvme_dsm_range range; loff_t offset, len; - u16 ret; + u16 status = 0; + int ret; int i; for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { - ret = nvmet_copy_from_sgl(req, i * sizeof(range), &range, + status = nvmet_copy_from_sgl(req, i * sizeof(range), &range, sizeof(range)); - if (ret) + if (status) break; offset = le64_to_cpu(range.slba) << req->ns->blksize_shift; len = le32_to_cpu(range.nlb); len <<= req->ns->blksize_shift; if (offset + len > req->ns->size) { - ret = NVME_SC_LBA_RANGE | NVME_SC_DNR; + req->error_slba = le64_to_cpu(range.slba); + status = errno_to_nvme_status(req, -ENOSPC); break; } - if (vfs_fallocate(req->ns->file, mode, offset, len)) { - ret = NVME_SC_INTERNAL | NVME_SC_DNR; + ret = vfs_fallocate(req->ns->file, mode, offset, len); + if (ret) { + req->error_slba = le64_to_cpu(range.slba); + status = errno_to_nvme_status(req, ret); break; } } - nvmet_req_complete(req, ret); + nvmet_req_complete(req, status); } static void nvmet_file_dsm_work(struct work_struct *w) @@ -299,12 +344,12 @@ static void nvmet_file_write_zeroes_work(struct work_struct *w) req->ns->blksize_shift); if (unlikely(offset + len > req->ns->size)) { - nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR); + nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC)); return; } ret = vfs_fallocate(req->ns->file, mode, offset, len); - nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); + nvmet_req_complete(req, ret < 0 ? errno_to_nvme_status(req, ret) : 0); } static void nvmet_file_execute_write_zeroes(struct nvmet_req *req) @@ -320,10 +365,7 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req) switch (cmd->common.opcode) { case nvme_cmd_read: case nvme_cmd_write: - if (req->ns->buffered_io) - req->execute = nvmet_file_execute_rw_buffered_io; - else - req->execute = nvmet_file_execute_rw; + req->execute = nvmet_file_execute_rw; req->data_len = nvmet_rw_len(req); return 0; case nvme_cmd_flush: @@ -342,6 +384,7 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req) default: pr_err("unhandled cmd for file ns %d on qid %d\n", cmd->common.opcode, req->sq->qid); + req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } } diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 9908082b32c4..4aac1b4a8112 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -345,7 +345,7 @@ static int nvme_loop_connect_io_queues(struct nvme_loop_ctrl *ctrl) int i, ret; for (i = 1; i < ctrl->ctrl.queue_count; i++) { - ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + ret = nvmf_connect_io_queue(&ctrl->ctrl, i, false); if (ret) return ret; set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index c2b4d9ee6391..3e4719fdba85 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -30,12 +30,15 @@ #define NVMET_ASYNC_EVENTS 4 #define NVMET_ERROR_LOG_SLOTS 128 +#define NVMET_NO_ERROR_LOC ((u16)-1) /* * Supported optional AENs: */ #define NVMET_AEN_CFG_OPTIONAL \ (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE) +#define NVMET_DISC_AEN_CFG_OPTIONAL \ + (NVME_AEN_CFG_DISC_CHANGE) /* * Plus mandatory SMART AENs (we'll never send them, but allow enabling them): @@ -104,6 +107,7 @@ struct nvmet_sq { u16 qid; u16 size; u32 sqhd; + bool sqhd_disabled; struct completion free_done; struct completion confirm_done; }; @@ -137,6 +141,7 @@ struct nvmet_port { struct list_head subsystems; struct config_group referrals_group; struct list_head referrals; + struct list_head global_entry; struct config_group ana_groups_group; struct nvmet_ana_group ana_default_group; enum nvme_ana_state *ana_state; @@ -163,6 +168,8 @@ struct nvmet_ctrl { struct nvmet_cq **cqs; struct nvmet_sq **sqs; + bool cmd_seen; + struct mutex lock; u64 cap; u32 cc; @@ -194,8 +201,12 @@ struct nvmet_ctrl { char subsysnqn[NVMF_NQN_FIELD_LEN]; char hostnqn[NVMF_NQN_FIELD_LEN]; - struct device *p2p_client; - struct radix_tree_root p2p_ns_map; + struct device *p2p_client; + struct radix_tree_root p2p_ns_map; + + spinlock_t error_lock; + u64 err_counter; + struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS]; }; struct nvmet_subsys { @@ -273,6 +284,7 @@ struct nvmet_fabrics_ops { void (*delete_ctrl)(struct nvmet_ctrl *ctrl); void (*disc_traddr)(struct nvmet_req *req, struct nvmet_port *port, char *traddr); + u16 (*install_queue)(struct nvmet_sq *nvme_sq); }; #define NVMET_MAX_INLINE_BIOVEC 8 @@ -308,17 +320,14 @@ struct nvmet_req { void (*execute)(struct nvmet_req *req); const struct nvmet_fabrics_ops *ops; - struct pci_dev *p2p_dev; - struct device *p2p_client; + struct pci_dev *p2p_dev; + struct device *p2p_client; + u16 error_loc; + u64 error_slba; }; extern struct workqueue_struct *buffered_io_wq; -static inline void nvmet_set_status(struct nvmet_req *req, u16 status) -{ - req->rsp->status = cpu_to_le16(status << 1); -} - static inline void nvmet_set_result(struct nvmet_req *req, u32 result) { req->rsp->result.u32 = cpu_to_le32(result); @@ -340,6 +349,27 @@ struct nvmet_async_event { u8 log_page; }; +static inline void nvmet_clear_aen_bit(struct nvmet_req *req, u32 bn) +{ + int rae = le32_to_cpu(req->cmd->common.cdw10) & 1 << 15; + + if (!rae) + clear_bit(bn, &req->sq->ctrl->aen_masked); +} + +static inline bool nvmet_aen_bit_disabled(struct nvmet_ctrl *ctrl, u32 bn) +{ + if (!(READ_ONCE(ctrl->aen_enabled) & (1 << bn))) + return true; + return test_and_set_bit(bn, &ctrl->aen_masked); +} + +void nvmet_get_feat_kato(struct nvmet_req *req); +void nvmet_get_feat_async_event(struct nvmet_req *req); +u16 nvmet_set_feat_kato(struct nvmet_req *req); +u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask); +void nvmet_execute_async_event(struct nvmet_req *req); + u16 nvmet_parse_connect_cmd(struct nvmet_req *req); u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req); u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); @@ -355,6 +385,8 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status); int nvmet_req_alloc_sgl(struct nvmet_req *req); void nvmet_req_free_sgl(struct nvmet_req *req); +void nvmet_execute_keep_alive(struct nvmet_req *req); + void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, u16 size); void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, @@ -395,7 +427,7 @@ int nvmet_enable_port(struct nvmet_port *port); void nvmet_disable_port(struct nvmet_port *port); void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port); -void nvmet_referral_disable(struct nvmet_port *port); +void nvmet_referral_disable(struct nvmet_port *parent, struct nvmet_port *port); u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, size_t len); @@ -405,6 +437,14 @@ u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len); u32 nvmet_get_log_page_len(struct nvme_command *cmd); +extern struct list_head *nvmet_ports; +void nvmet_port_disc_changed(struct nvmet_port *port, + struct nvmet_subsys *subsys); +void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys, + struct nvmet_host *host); +void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, + u8 event_info, u8 log_page); + #define NVMET_QUEUE_SIZE 1024 #define NVMET_NR_QUEUES 128 #define NVMET_MAX_CMD NVMET_QUEUE_SIZE @@ -425,7 +465,7 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd); #define NVMET_DEFAULT_ANA_GRPID 1 #define NVMET_KAS 10 -#define NVMET_DISC_KATO 120 +#define NVMET_DISC_KATO_MS 120000 int __init nvmet_init_configfs(void); void __exit nvmet_exit_configfs(void); @@ -434,15 +474,13 @@ int __init nvmet_init_discovery(void); void nvmet_exit_discovery(void); extern struct nvmet_subsys *nvmet_disc_subsys; -extern u64 nvmet_genctr; extern struct rw_semaphore nvmet_config_sem; extern u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; extern u64 nvmet_ana_chgcnt; extern struct rw_semaphore nvmet_ana_sem; -bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, - const char *hostnqn); +bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn); int nvmet_bdev_ns_enable(struct nvmet_ns *ns); int nvmet_file_ns_enable(struct nvmet_ns *ns); @@ -457,4 +495,6 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req) return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) << req->ns->blksize_shift; } + +u16 errno_to_nvme_status(struct nvmet_req *req, int errno); #endif /* _NVMET_H */ diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 583086dd9cb9..a8d23eb80192 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -196,7 +196,7 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) { unsigned long flags; - if (rsp->allocated) { + if (unlikely(rsp->allocated)) { kfree(rsp); return; } @@ -630,8 +630,11 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) u64 off = le64_to_cpu(sgl->addr); u32 len = le32_to_cpu(sgl->length); - if (!nvme_is_write(rsp->req.cmd)) + if (!nvme_is_write(rsp->req.cmd)) { + rsp->req.error_loc = + offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } if (off + len > rsp->queue->dev->inline_data_size) { pr_err("invalid inline data offset!\n"); @@ -696,6 +699,8 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) return nvmet_rdma_map_sgl_inline(rsp); default: pr_err("invalid SGL subtype: %#x\n", sgl->type); + rsp->req.error_loc = + offsetof(struct nvme_common_command, dptr); return NVME_SC_INVALID_FIELD | NVME_SC_DNR; } case NVME_KEY_SGL_FMT_DATA_DESC: @@ -706,10 +711,13 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) return nvmet_rdma_map_sgl_keyed(rsp, sgl, false); default: pr_err("invalid SGL subtype: %#x\n", sgl->type); + rsp->req.error_loc = + offsetof(struct nvme_common_command, dptr); return NVME_SC_INVALID_FIELD | NVME_SC_DNR; } default: pr_err("invalid SGL type: %#x\n", sgl->type); + rsp->req.error_loc = offsetof(struct nvme_common_command, dptr); return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR; } } diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c new file mode 100644 index 000000000000..44b37b202e39 --- /dev/null +++ b/drivers/nvme/target/tcp.c @@ -0,0 +1,1737 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics TCP target. + * Copyright (c) 2018 Lightbits Labs. All rights reserved. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvmet.h" + +#define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE) + +#define NVMET_TCP_RECV_BUDGET 8 +#define NVMET_TCP_SEND_BUDGET 8 +#define NVMET_TCP_IO_WORK_BUDGET 64 + +enum nvmet_tcp_send_state { + NVMET_TCP_SEND_DATA_PDU, + NVMET_TCP_SEND_DATA, + NVMET_TCP_SEND_R2T, + NVMET_TCP_SEND_DDGST, + NVMET_TCP_SEND_RESPONSE +}; + +enum nvmet_tcp_recv_state { + NVMET_TCP_RECV_PDU, + NVMET_TCP_RECV_DATA, + NVMET_TCP_RECV_DDGST, + NVMET_TCP_RECV_ERR, +}; + +enum { + NVMET_TCP_F_INIT_FAILED = (1 << 0), +}; + +struct nvmet_tcp_cmd { + struct nvmet_tcp_queue *queue; + struct nvmet_req req; + + struct nvme_tcp_cmd_pdu *cmd_pdu; + struct nvme_tcp_rsp_pdu *rsp_pdu; + struct nvme_tcp_data_pdu *data_pdu; + struct nvme_tcp_r2t_pdu *r2t_pdu; + + u32 rbytes_done; + u32 wbytes_done; + + u32 pdu_len; + u32 pdu_recv; + int sg_idx; + int nr_mapped; + struct msghdr recv_msg; + struct kvec *iov; + u32 flags; + + struct list_head entry; + struct llist_node lentry; + + /* send state */ + u32 offset; + struct scatterlist *cur_sg; + enum nvmet_tcp_send_state state; + + __le32 exp_ddgst; + __le32 recv_ddgst; +}; + +enum nvmet_tcp_queue_state { + NVMET_TCP_Q_CONNECTING, + NVMET_TCP_Q_LIVE, + NVMET_TCP_Q_DISCONNECTING, +}; + +struct nvmet_tcp_queue { + struct socket *sock; + struct nvmet_tcp_port *port; + struct work_struct io_work; + int cpu; + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + + /* send state */ + struct nvmet_tcp_cmd *cmds; + unsigned int nr_cmds; + struct list_head free_list; + struct llist_head resp_list; + struct list_head resp_send_list; + int send_list_len; + struct nvmet_tcp_cmd *snd_cmd; + + /* recv state */ + int offset; + int left; + enum nvmet_tcp_recv_state rcv_state; + struct nvmet_tcp_cmd *cmd; + union nvme_tcp_pdu pdu; + + /* digest state */ + bool hdr_digest; + bool data_digest; + struct ahash_request *snd_hash; + struct ahash_request *rcv_hash; + + spinlock_t state_lock; + enum nvmet_tcp_queue_state state; + + struct sockaddr_storage sockaddr; + struct sockaddr_storage sockaddr_peer; + struct work_struct release_work; + + int idx; + struct list_head queue_list; + + struct nvmet_tcp_cmd connect; + + struct page_frag_cache pf_cache; + + void (*data_ready)(struct sock *); + void (*state_change)(struct sock *); + void (*write_space)(struct sock *); +}; + +struct nvmet_tcp_port { + struct socket *sock; + struct work_struct accept_work; + struct nvmet_port *nport; + struct sockaddr_storage addr; + int last_cpu; + void (*data_ready)(struct sock *); +}; + +static DEFINE_IDA(nvmet_tcp_queue_ida); +static LIST_HEAD(nvmet_tcp_queue_list); +static DEFINE_MUTEX(nvmet_tcp_queue_mutex); + +static struct workqueue_struct *nvmet_tcp_wq; +static struct nvmet_fabrics_ops nvmet_tcp_ops; +static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c); +static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd); + +static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, + struct nvmet_tcp_cmd *cmd) +{ + return cmd - queue->cmds; +} + +static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd) +{ + return nvme_is_write(cmd->req.cmd) && + cmd->rbytes_done < cmd->req.transfer_len; +} + +static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd) +{ + return nvmet_tcp_has_data_in(cmd) && !cmd->req.rsp->status; +} + +static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd) +{ + return !nvme_is_write(cmd->req.cmd) && + cmd->req.transfer_len > 0 && + !cmd->req.rsp->status; +} + +static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd) +{ + return nvme_is_write(cmd->req.cmd) && cmd->pdu_len && + !cmd->rbytes_done; +} + +static inline struct nvmet_tcp_cmd * +nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd; + + cmd = list_first_entry_or_null(&queue->free_list, + struct nvmet_tcp_cmd, entry); + if (!cmd) + return NULL; + list_del_init(&cmd->entry); + + cmd->rbytes_done = cmd->wbytes_done = 0; + cmd->pdu_len = 0; + cmd->pdu_recv = 0; + cmd->iov = NULL; + cmd->flags = 0; + return cmd; +} + +static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd) +{ + if (unlikely(cmd == &cmd->queue->connect)) + return; + + list_add_tail(&cmd->entry, &cmd->queue->free_list); +} + +static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue) +{ + return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue) +{ + return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline void nvmet_tcp_hdgst(struct ahash_request *hash, + void *pdu, size_t len) +{ + struct scatterlist sg; + + sg_init_one(&sg, pdu, len); + ahash_request_set_crypt(hash, &sg, pdu + len, len); + crypto_ahash_digest(hash); +} + +static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue, + void *pdu, size_t len) +{ + struct nvme_tcp_hdr *hdr = pdu; + __le32 recv_digest; + __le32 exp_digest; + + if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) { + pr_err("queue %d: header digest enabled but no header digest\n", + queue->idx); + return -EPROTO; + } + + recv_digest = *(__le32 *)(pdu + hdr->hlen); + nvmet_tcp_hdgst(queue->rcv_hash, pdu, len); + exp_digest = *(__le32 *)(pdu + hdr->hlen); + if (recv_digest != exp_digest) { + pr_err("queue %d: header digest error: recv %#x expected %#x\n", + queue->idx, le32_to_cpu(recv_digest), + le32_to_cpu(exp_digest)); + return -EPROTO; + } + + return 0; +} + +static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu) +{ + struct nvme_tcp_hdr *hdr = pdu; + u8 digest_len = nvmet_tcp_hdgst_len(queue); + u32 len; + + len = le32_to_cpu(hdr->plen) - hdr->hlen - + (hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0); + + if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) { + pr_err("queue %d: data digest flag is cleared\n", queue->idx); + return -EPROTO; + } + + return 0; +} + +static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd) +{ + struct scatterlist *sg; + int i; + + sg = &cmd->req.sg[cmd->sg_idx]; + + for (i = 0; i < cmd->nr_mapped; i++) + kunmap(sg_page(&sg[i])); +} + +static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd) +{ + struct kvec *iov = cmd->iov; + struct scatterlist *sg; + u32 length, offset, sg_offset; + + length = cmd->pdu_len; + cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE); + offset = cmd->rbytes_done; + cmd->sg_idx = DIV_ROUND_UP(offset, PAGE_SIZE); + sg_offset = offset % PAGE_SIZE; + sg = &cmd->req.sg[cmd->sg_idx]; + + while (length) { + u32 iov_len = min_t(u32, length, sg->length - sg_offset); + + iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset; + iov->iov_len = iov_len; + + length -= iov_len; + sg = sg_next(sg); + iov++; + } + + iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov, + cmd->nr_mapped, cmd->pdu_len); +} + +static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) +{ + queue->rcv_state = NVMET_TCP_RECV_ERR; + if (queue->nvme_sq.ctrl) + nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); + else + kernel_sock_shutdown(queue->sock, SHUT_RDWR); +} + +static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl; + u32 len = le32_to_cpu(sgl->length); + + if (!cmd->req.data_len) + return 0; + + if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) | + NVME_SGL_FMT_OFFSET)) { + if (!nvme_is_write(cmd->req.cmd)) + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + + if (len > cmd->req.port->inline_data_size) + return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; + cmd->pdu_len = len; + } + cmd->req.transfer_len += len; + + cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt); + if (!cmd->req.sg) + return NVME_SC_INTERNAL; + cmd->cur_sg = cmd->req.sg; + + if (nvmet_tcp_has_data_in(cmd)) { + cmd->iov = kmalloc_array(cmd->req.sg_cnt, + sizeof(*cmd->iov), GFP_KERNEL); + if (!cmd->iov) + goto err; + } + + return 0; +err: + sgl_free(cmd->req.sg); + return NVME_SC_INTERNAL; +} + +static void nvmet_tcp_ddgst(struct ahash_request *hash, + struct nvmet_tcp_cmd *cmd) +{ + ahash_request_set_crypt(hash, cmd->req.sg, + (void *)&cmd->exp_ddgst, cmd->req.transfer_len); + crypto_ahash_digest(hash); +} + +static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_tcp_data_pdu *pdu = cmd->data_pdu; + struct nvmet_tcp_queue *queue = cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue); + + cmd->offset = 0; + cmd->state = NVMET_TCP_SEND_DATA_PDU; + + pdu->hdr.type = nvme_tcp_c2h_data; + pdu->hdr.flags = NVME_TCP_F_DATA_LAST; + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.pdo = pdu->hdr.hlen + hdgst; + pdu->hdr.plen = + cpu_to_le32(pdu->hdr.hlen + hdgst + + cmd->req.transfer_len + ddgst); + pdu->command_id = cmd->req.rsp->command_id; + pdu->data_length = cpu_to_le32(cmd->req.transfer_len); + pdu->data_offset = cpu_to_le32(cmd->wbytes_done); + + if (queue->data_digest) { + pdu->hdr.flags |= NVME_TCP_F_DDGST; + nvmet_tcp_ddgst(queue->snd_hash, cmd); + } + + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; + nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + } +} + +static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu; + struct nvmet_tcp_queue *queue = cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + + cmd->offset = 0; + cmd->state = NVMET_TCP_SEND_R2T; + + pdu->hdr.type = nvme_tcp_r2t; + pdu->hdr.flags = 0; + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.pdo = 0; + pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); + + pdu->command_id = cmd->req.cmd->common.command_id; + pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd); + pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done); + pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done); + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; + nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + } +} + +static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu; + struct nvmet_tcp_queue *queue = cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + + cmd->offset = 0; + cmd->state = NVMET_TCP_SEND_RESPONSE; + + pdu->hdr.type = nvme_tcp_rsp; + pdu->hdr.flags = 0; + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.pdo = 0; + pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; + nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + } +} + +static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue) +{ + struct llist_node *node; + + node = llist_del_all(&queue->resp_list); + if (!node) + return; + + while (node) { + struct nvmet_tcp_cmd *cmd = llist_entry(node, + struct nvmet_tcp_cmd, lentry); + + list_add(&cmd->entry, &queue->resp_send_list); + node = node->next; + queue->send_list_len++; + } +} + +static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue) +{ + queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list, + struct nvmet_tcp_cmd, entry); + if (!queue->snd_cmd) { + nvmet_tcp_process_resp_list(queue); + queue->snd_cmd = + list_first_entry_or_null(&queue->resp_send_list, + struct nvmet_tcp_cmd, entry); + if (unlikely(!queue->snd_cmd)) + return NULL; + } + + list_del_init(&queue->snd_cmd->entry); + queue->send_list_len--; + + if (nvmet_tcp_need_data_out(queue->snd_cmd)) + nvmet_setup_c2h_data_pdu(queue->snd_cmd); + else if (nvmet_tcp_need_data_in(queue->snd_cmd)) + nvmet_setup_r2t_pdu(queue->snd_cmd); + else + nvmet_setup_response_pdu(queue->snd_cmd); + + return queue->snd_cmd; +} + +static void nvmet_tcp_queue_response(struct nvmet_req *req) +{ + struct nvmet_tcp_cmd *cmd = + container_of(req, struct nvmet_tcp_cmd, req); + struct nvmet_tcp_queue *queue = cmd->queue; + + llist_add(&cmd->lentry, &queue->resp_list); + queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work); +} + +static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd) +{ + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst; + int ret; + + ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu), + offset_in_page(cmd->data_pdu) + cmd->offset, + left, MSG_DONTWAIT | MSG_MORE); + if (ret <= 0) + return ret; + + cmd->offset += ret; + left -= ret; + + if (left) + return -EAGAIN; + + cmd->state = NVMET_TCP_SEND_DATA; + cmd->offset = 0; + return 1; +} + +static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd) +{ + struct nvmet_tcp_queue *queue = cmd->queue; + int ret; + + while (cmd->cur_sg) { + struct page *page = sg_page(cmd->cur_sg); + u32 left = cmd->cur_sg->length - cmd->offset; + + ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset, + left, MSG_DONTWAIT | MSG_MORE); + if (ret <= 0) + return ret; + + cmd->offset += ret; + cmd->wbytes_done += ret; + + /* Done with sg?*/ + if (cmd->offset == cmd->cur_sg->length) { + cmd->cur_sg = sg_next(cmd->cur_sg); + cmd->offset = 0; + } + } + + if (queue->data_digest) { + cmd->state = NVMET_TCP_SEND_DDGST; + cmd->offset = 0; + } else { + nvmet_setup_response_pdu(cmd); + } + return 1; + +} + +static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd, + bool last_in_batch) +{ + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst; + int flags = MSG_DONTWAIT; + int ret; + + if (!last_in_batch && cmd->queue->send_list_len) + flags |= MSG_MORE; + else + flags |= MSG_EOR; + + ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu), + offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags); + if (ret <= 0) + return ret; + cmd->offset += ret; + left -= ret; + + if (left) + return -EAGAIN; + + kfree(cmd->iov); + sgl_free(cmd->req.sg); + cmd->queue->snd_cmd = NULL; + nvmet_tcp_put_cmd(cmd); + return 1; +} + +static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch) +{ + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst; + int flags = MSG_DONTWAIT; + int ret; + + if (!last_in_batch && cmd->queue->send_list_len) + flags |= MSG_MORE; + else + flags |= MSG_EOR; + + ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu), + offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags); + if (ret <= 0) + return ret; + cmd->offset += ret; + left -= ret; + + if (left) + return -EAGAIN; + + cmd->queue->snd_cmd = NULL; + return 1; +} + +static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd) +{ + struct nvmet_tcp_queue *queue = cmd->queue; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + struct kvec iov = { + .iov_base = &cmd->exp_ddgst + cmd->offset, + .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset + }; + int ret; + + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (unlikely(ret <= 0)) + return ret; + + cmd->offset += ret; + nvmet_setup_response_pdu(cmd); + return 1; +} + +static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue, + bool last_in_batch) +{ + struct nvmet_tcp_cmd *cmd = queue->snd_cmd; + int ret = 0; + + if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) { + cmd = nvmet_tcp_fetch_cmd(queue); + if (unlikely(!cmd)) + return 0; + } + + if (cmd->state == NVMET_TCP_SEND_DATA_PDU) { + ret = nvmet_try_send_data_pdu(cmd); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_DATA) { + ret = nvmet_try_send_data(cmd); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_DDGST) { + ret = nvmet_try_send_ddgst(cmd); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_R2T) { + ret = nvmet_try_send_r2t(cmd, last_in_batch); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_RESPONSE) + ret = nvmet_try_send_response(cmd, last_in_batch); + +done_send: + if (ret < 0) { + if (ret == -EAGAIN) + return 0; + return ret; + } + + return 1; +} + +static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue, + int budget, int *sends) +{ + int i, ret = 0; + + for (i = 0; i < budget; i++) { + ret = nvmet_tcp_try_send_one(queue, i == budget - 1); + if (ret <= 0) + break; + (*sends)++; + } + + return ret; +} + +static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue) +{ + queue->offset = 0; + queue->left = sizeof(struct nvme_tcp_hdr); + queue->cmd = NULL; + queue->rcv_state = NVMET_TCP_RECV_PDU; +} + +static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); + + ahash_request_free(queue->rcv_hash); + ahash_request_free(queue->snd_hash); + crypto_free_ahash(tfm); +} + +static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue) +{ + struct crypto_ahash *tfm; + + tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->snd_hash) + goto free_tfm; + ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); + + queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->rcv_hash) + goto free_snd_hash; + ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); + + return 0; +free_snd_hash: + ahash_request_free(queue->snd_hash); +free_tfm: + crypto_free_ahash(tfm); + return -ENOMEM; +} + + +static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq; + struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp; + struct msghdr msg = {}; + struct kvec iov; + int ret; + + if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) { + pr_err("bad nvme-tcp pdu length (%d)\n", + le32_to_cpu(icreq->hdr.plen)); + nvmet_tcp_fatal_error(queue); + } + + if (icreq->pfv != NVME_TCP_PFV_1_0) { + pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv); + return -EPROTO; + } + + if (icreq->hpda != 0) { + pr_err("queue %d: unsupported hpda %d\n", queue->idx, + icreq->hpda); + return -EPROTO; + } + + if (icreq->maxr2t != 0) { + pr_err("queue %d: unsupported maxr2t %d\n", queue->idx, + le32_to_cpu(icreq->maxr2t) + 1); + return -EPROTO; + } + + queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE); + queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE); + if (queue->hdr_digest || queue->data_digest) { + ret = nvmet_tcp_alloc_crypto(queue); + if (ret) + return ret; + } + + memset(icresp, 0, sizeof(*icresp)); + icresp->hdr.type = nvme_tcp_icresp; + icresp->hdr.hlen = sizeof(*icresp); + icresp->hdr.pdo = 0; + icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen); + icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); + icresp->maxdata = cpu_to_le32(0xffff); /* FIXME: support r2t */ + icresp->cpda = 0; + if (queue->hdr_digest) + icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE; + if (queue->data_digest) + icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE; + + iov.iov_base = icresp; + iov.iov_len = sizeof(*icresp); + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (ret < 0) + goto free_crypto; + + queue->state = NVMET_TCP_Q_LIVE; + nvmet_prepare_receive_pdu(queue); + return 0; +free_crypto: + if (queue->hdr_digest || queue->data_digest) + nvmet_tcp_free_crypto(queue); + return ret; +} + +static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, + struct nvmet_tcp_cmd *cmd, struct nvmet_req *req) +{ + int ret; + + /* recover the expected data transfer length */ + req->data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length); + + if (!nvme_is_write(cmd->req.cmd) || + req->data_len > cmd->req.port->inline_data_size) { + nvmet_prepare_receive_pdu(queue); + return; + } + + ret = nvmet_tcp_map_data(cmd); + if (unlikely(ret)) { + pr_err("queue %d: failed to map data\n", queue->idx); + nvmet_tcp_fatal_error(queue); + return; + } + + queue->rcv_state = NVMET_TCP_RECV_DATA; + nvmet_tcp_map_pdu_iovec(cmd); + cmd->flags |= NVMET_TCP_F_INIT_FAILED; +} + +static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_data_pdu *data = &queue->pdu.data; + struct nvmet_tcp_cmd *cmd; + + cmd = &queue->cmds[data->ttag]; + + if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) { + pr_err("ttag %u unexpected data offset %u (expected %u)\n", + data->ttag, le32_to_cpu(data->data_offset), + cmd->rbytes_done); + /* FIXME: use path and transport errors */ + nvmet_req_complete(&cmd->req, + NVME_SC_INVALID_FIELD | NVME_SC_DNR); + return -EPROTO; + } + + cmd->pdu_len = le32_to_cpu(data->data_length); + cmd->pdu_recv = 0; + nvmet_tcp_map_pdu_iovec(cmd); + queue->cmd = cmd; + queue->rcv_state = NVMET_TCP_RECV_DATA; + + return 0; +} + +static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; + struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd; + struct nvmet_req *req; + int ret; + + if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) { + if (hdr->type != nvme_tcp_icreq) { + pr_err("unexpected pdu type (%d) before icreq\n", + hdr->type); + nvmet_tcp_fatal_error(queue); + return -EPROTO; + } + return nvmet_tcp_handle_icreq(queue); + } + + if (hdr->type == nvme_tcp_h2c_data) { + ret = nvmet_tcp_handle_h2c_data_pdu(queue); + if (unlikely(ret)) + return ret; + return 0; + } + + queue->cmd = nvmet_tcp_get_cmd(queue); + if (unlikely(!queue->cmd)) { + /* This should never happen */ + pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d", + queue->idx, queue->nr_cmds, queue->send_list_len, + nvme_cmd->common.opcode); + nvmet_tcp_fatal_error(queue); + return -ENOMEM; + } + + req = &queue->cmd->req; + memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd)); + + if (unlikely(!nvmet_req_init(req, &queue->nvme_cq, + &queue->nvme_sq, &nvmet_tcp_ops))) { + pr_err("failed cmd %p id %d opcode %d, data_len: %d\n", + req->cmd, req->cmd->common.command_id, + req->cmd->common.opcode, + le32_to_cpu(req->cmd->common.dptr.sgl.length)); + + nvmet_tcp_handle_req_failure(queue, queue->cmd, req); + return -EAGAIN; + } + + ret = nvmet_tcp_map_data(queue->cmd); + if (unlikely(ret)) { + pr_err("queue %d: failed to map data\n", queue->idx); + if (nvmet_tcp_has_inline_data(queue->cmd)) + nvmet_tcp_fatal_error(queue); + else + nvmet_req_complete(req, ret); + ret = -EAGAIN; + goto out; + } + + if (nvmet_tcp_need_data_in(queue->cmd)) { + if (nvmet_tcp_has_inline_data(queue->cmd)) { + queue->rcv_state = NVMET_TCP_RECV_DATA; + nvmet_tcp_map_pdu_iovec(queue->cmd); + return 0; + } + /* send back R2T */ + nvmet_tcp_queue_response(&queue->cmd->req); + goto out; + } + + nvmet_req_execute(&queue->cmd->req); +out: + nvmet_prepare_receive_pdu(queue); + return ret; +} + +static const u8 nvme_tcp_pdu_sizes[] = { + [nvme_tcp_icreq] = sizeof(struct nvme_tcp_icreq_pdu), + [nvme_tcp_cmd] = sizeof(struct nvme_tcp_cmd_pdu), + [nvme_tcp_h2c_data] = sizeof(struct nvme_tcp_data_pdu), +}; + +static inline u8 nvmet_tcp_pdu_size(u8 type) +{ + size_t idx = type; + + return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) && + nvme_tcp_pdu_sizes[idx]) ? + nvme_tcp_pdu_sizes[idx] : 0; +} + +static inline bool nvmet_tcp_pdu_valid(u8 type) +{ + switch (type) { + case nvme_tcp_icreq: + case nvme_tcp_cmd: + case nvme_tcp_h2c_data: + /* fallthru */ + return true; + } + + return false; +} + +static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; + int len; + struct kvec iov; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + +recv: + iov.iov_base = (void *)&queue->pdu + queue->offset; + iov.iov_len = queue->left; + len = kernel_recvmsg(queue->sock, &msg, &iov, 1, + iov.iov_len, msg.msg_flags); + if (unlikely(len < 0)) + return len; + + queue->offset += len; + queue->left -= len; + if (queue->left) + return -EAGAIN; + + if (queue->offset == sizeof(struct nvme_tcp_hdr)) { + u8 hdgst = nvmet_tcp_hdgst_len(queue); + + if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) { + pr_err("unexpected pdu type %d\n", hdr->type); + nvmet_tcp_fatal_error(queue); + return -EIO; + } + + if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) { + pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen); + return -EIO; + } + + queue->left = hdr->hlen - queue->offset + hdgst; + goto recv; + } + + if (queue->hdr_digest && + nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) { + nvmet_tcp_fatal_error(queue); /* fatal */ + return -EPROTO; + } + + if (queue->data_digest && + nvmet_tcp_check_ddgst(queue, &queue->pdu)) { + nvmet_tcp_fatal_error(queue); /* fatal */ + return -EPROTO; + } + + return nvmet_tcp_done_recv_pdu(queue); +} + +static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd) +{ + struct nvmet_tcp_queue *queue = cmd->queue; + + nvmet_tcp_ddgst(queue->rcv_hash, cmd); + queue->offset = 0; + queue->left = NVME_TCP_DIGEST_LENGTH; + queue->rcv_state = NVMET_TCP_RECV_DDGST; +} + +static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd = queue->cmd; + int ret; + + while (msg_data_left(&cmd->recv_msg)) { + ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg, + cmd->recv_msg.msg_flags); + if (ret <= 0) + return ret; + + cmd->pdu_recv += ret; + cmd->rbytes_done += ret; + } + + nvmet_tcp_unmap_pdu_iovec(cmd); + + if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) && + cmd->rbytes_done == cmd->req.transfer_len) { + if (queue->data_digest) { + nvmet_tcp_prep_recv_ddgst(cmd); + return 0; + } + nvmet_req_execute(&cmd->req); + } + + nvmet_prepare_receive_pdu(queue); + return 0; +} + +static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd = queue->cmd; + int ret; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + struct kvec iov = { + .iov_base = (void *)&cmd->recv_ddgst + queue->offset, + .iov_len = queue->left + }; + + ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, + iov.iov_len, msg.msg_flags); + if (unlikely(ret < 0)) + return ret; + + queue->offset += ret; + queue->left -= ret; + if (queue->left) + return -EAGAIN; + + if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) { + pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n", + queue->idx, cmd->req.cmd->common.command_id, + queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst), + le32_to_cpu(cmd->exp_ddgst)); + nvmet_tcp_finish_cmd(cmd); + nvmet_tcp_fatal_error(queue); + ret = -EPROTO; + goto out; + } + + if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) && + cmd->rbytes_done == cmd->req.transfer_len) + nvmet_req_execute(&cmd->req); + ret = 0; +out: + nvmet_prepare_receive_pdu(queue); + return ret; +} + +static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue) +{ + int result; + + if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR)) + return 0; + + if (queue->rcv_state == NVMET_TCP_RECV_PDU) { + result = nvmet_tcp_try_recv_pdu(queue); + if (result != 0) + goto done_recv; + } + + if (queue->rcv_state == NVMET_TCP_RECV_DATA) { + result = nvmet_tcp_try_recv_data(queue); + if (result != 0) + goto done_recv; + } + + if (queue->rcv_state == NVMET_TCP_RECV_DDGST) { + result = nvmet_tcp_try_recv_ddgst(queue); + if (result != 0) + goto done_recv; + } + +done_recv: + if (result < 0) { + if (result == -EAGAIN) + return 0; + return result; + } + return 1; +} + +static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue, + int budget, int *recvs) +{ + int i, ret = 0; + + for (i = 0; i < budget; i++) { + ret = nvmet_tcp_try_recv_one(queue); + if (ret <= 0) + break; + (*recvs)++; + } + + return ret; +} + +static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue) +{ + spin_lock(&queue->state_lock); + if (queue->state != NVMET_TCP_Q_DISCONNECTING) { + queue->state = NVMET_TCP_Q_DISCONNECTING; + schedule_work(&queue->release_work); + } + spin_unlock(&queue->state_lock); +} + +static void nvmet_tcp_io_work(struct work_struct *w) +{ + struct nvmet_tcp_queue *queue = + container_of(w, struct nvmet_tcp_queue, io_work); + bool pending; + int ret, ops = 0; + + do { + pending = false; + + ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops); + if (ret > 0) { + pending = true; + } else if (ret < 0) { + if (ret == -EPIPE || ret == -ECONNRESET) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + else + nvmet_tcp_fatal_error(queue); + return; + } + + ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops); + if (ret > 0) { + /* transmitted message/data */ + pending = true; + } else if (ret < 0) { + if (ret == -EPIPE || ret == -ECONNRESET) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + else + nvmet_tcp_fatal_error(queue); + return; + } + + } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET); + + /* + * We exahusted our budget, requeue our selves + */ + if (pending) + queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); +} + +static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue, + struct nvmet_tcp_cmd *c) +{ + u8 hdgst = nvmet_tcp_hdgst_len(queue); + + c->queue = queue; + c->req.port = queue->port->nport; + + c->cmd_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->cmd_pdu) + return -ENOMEM; + c->req.cmd = &c->cmd_pdu->cmd; + + c->rsp_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->rsp_pdu) + goto out_free_cmd; + c->req.rsp = &c->rsp_pdu->cqe; + + c->data_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->data_pdu) + goto out_free_rsp; + + c->r2t_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->r2t_pdu) + goto out_free_data; + + c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + + list_add_tail(&c->entry, &queue->free_list); + + return 0; +out_free_data: + page_frag_free(c->data_pdu); +out_free_rsp: + page_frag_free(c->rsp_pdu); +out_free_cmd: + page_frag_free(c->cmd_pdu); + return -ENOMEM; +} + +static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c) +{ + page_frag_free(c->r2t_pdu); + page_frag_free(c->data_pdu); + page_frag_free(c->rsp_pdu); + page_frag_free(c->cmd_pdu); +} + +static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmds; + int i, ret = -EINVAL, nr_cmds = queue->nr_cmds; + + cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL); + if (!cmds) + goto out; + + for (i = 0; i < nr_cmds; i++) { + ret = nvmet_tcp_alloc_cmd(queue, cmds + i); + if (ret) + goto out_free; + } + + queue->cmds = cmds; + + return 0; +out_free: + while (--i >= 0) + nvmet_tcp_free_cmd(cmds + i); + kfree(cmds); +out: + return ret; +} + +static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmds = queue->cmds; + int i; + + for (i = 0; i < queue->nr_cmds; i++) + nvmet_tcp_free_cmd(cmds + i); + + nvmet_tcp_free_cmd(&queue->connect); + kfree(cmds); +} + +static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue) +{ + struct socket *sock = queue->sock; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_data_ready = queue->data_ready; + sock->sk->sk_state_change = queue->state_change; + sock->sk->sk_write_space = queue->write_space; + sock->sk->sk_user_data = NULL; + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd) +{ + nvmet_req_uninit(&cmd->req); + nvmet_tcp_unmap_pdu_iovec(cmd); + sgl_free(cmd->req.sg); +} + +static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd = queue->cmds; + int i; + + for (i = 0; i < queue->nr_cmds; i++, cmd++) { + if (nvmet_tcp_need_data_in(cmd)) + nvmet_tcp_finish_cmd(cmd); + } + + if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) { + /* failed in connect */ + nvmet_tcp_finish_cmd(&queue->connect); + } +} + +static void nvmet_tcp_release_queue_work(struct work_struct *w) +{ + struct nvmet_tcp_queue *queue = + container_of(w, struct nvmet_tcp_queue, release_work); + + mutex_lock(&nvmet_tcp_queue_mutex); + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_tcp_queue_mutex); + + nvmet_tcp_restore_socket_callbacks(queue); + flush_work(&queue->io_work); + + nvmet_tcp_uninit_data_in_cmds(queue); + nvmet_sq_destroy(&queue->nvme_sq); + cancel_work_sync(&queue->io_work); + sock_release(queue->sock); + nvmet_tcp_free_cmds(queue); + if (queue->hdr_digest || queue->data_digest) + nvmet_tcp_free_crypto(queue); + ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); + + kfree(queue); +} + +static void nvmet_tcp_data_ready(struct sock *sk) +{ + struct nvmet_tcp_queue *queue; + + read_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (likely(queue)) + queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + read_unlock_bh(&sk->sk_callback_lock); +} + +static void nvmet_tcp_write_space(struct sock *sk) +{ + struct nvmet_tcp_queue *queue; + + read_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (unlikely(!queue)) + goto out; + + if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) { + queue->write_space(sk); + goto out; + } + + if (sk_stream_is_writeable(sk)) { + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + } +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static void nvmet_tcp_state_change(struct sock *sk) +{ + struct nvmet_tcp_queue *queue; + + write_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (!queue) + goto done; + + switch (sk->sk_state) { + case TCP_FIN_WAIT1: + case TCP_CLOSE_WAIT: + case TCP_CLOSE: + /* FALLTHRU */ + sk->sk_user_data = NULL; + nvmet_tcp_schedule_release_queue(queue); + break; + default: + pr_warn("queue %d unhandled state %d\n", + queue->idx, sk->sk_state); + } +done: + write_unlock_bh(&sk->sk_callback_lock); +} + +static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) +{ + struct socket *sock = queue->sock; + struct linger sol = { .l_onoff = 1, .l_linger = 0 }; + int ret; + + ret = kernel_getsockname(sock, + (struct sockaddr *)&queue->sockaddr); + if (ret < 0) + return ret; + + ret = kernel_getpeername(sock, + (struct sockaddr *)&queue->sockaddr_peer); + if (ret < 0) + return ret; + + /* + * Cleanup whatever is sitting in the TCP transmit queue on socket + * close. This is done to prevent stale data from being sent should + * the network connection be restored before TCP times out. + */ + ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&sol, sizeof(sol)); + if (ret) + return ret; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = queue; + queue->data_ready = sock->sk->sk_data_ready; + sock->sk->sk_data_ready = nvmet_tcp_data_ready; + queue->state_change = sock->sk->sk_state_change; + sock->sk->sk_state_change = nvmet_tcp_state_change; + queue->write_space = sock->sk->sk_write_space; + sock->sk->sk_write_space = nvmet_tcp_write_space; + write_unlock_bh(&sock->sk->sk_callback_lock); + + return 0; +} + +static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, + struct socket *newsock) +{ + struct nvmet_tcp_queue *queue; + int ret; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) + return -ENOMEM; + + INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work); + INIT_WORK(&queue->io_work, nvmet_tcp_io_work); + queue->sock = newsock; + queue->port = port; + queue->nr_cmds = 0; + spin_lock_init(&queue->state_lock); + queue->state = NVMET_TCP_Q_CONNECTING; + INIT_LIST_HEAD(&queue->free_list); + init_llist_head(&queue->resp_list); + INIT_LIST_HEAD(&queue->resp_send_list); + + queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL); + if (queue->idx < 0) { + ret = queue->idx; + goto out_free_queue; + } + + ret = nvmet_tcp_alloc_cmd(queue, &queue->connect); + if (ret) + goto out_ida_remove; + + ret = nvmet_sq_init(&queue->nvme_sq); + if (ret) + goto out_free_connect; + + port->last_cpu = cpumask_next_wrap(port->last_cpu, + cpu_online_mask, -1, false); + queue->cpu = port->last_cpu; + nvmet_prepare_receive_pdu(queue); + + mutex_lock(&nvmet_tcp_queue_mutex); + list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list); + mutex_unlock(&nvmet_tcp_queue_mutex); + + ret = nvmet_tcp_set_queue_sock(queue); + if (ret) + goto out_destroy_sq; + + queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + + return 0; +out_destroy_sq: + mutex_lock(&nvmet_tcp_queue_mutex); + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_tcp_queue_mutex); + nvmet_sq_destroy(&queue->nvme_sq); +out_free_connect: + nvmet_tcp_free_cmd(&queue->connect); +out_ida_remove: + ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); +out_free_queue: + kfree(queue); + return ret; +} + +static void nvmet_tcp_accept_work(struct work_struct *w) +{ + struct nvmet_tcp_port *port = + container_of(w, struct nvmet_tcp_port, accept_work); + struct socket *newsock; + int ret; + + while (true) { + ret = kernel_accept(port->sock, &newsock, O_NONBLOCK); + if (ret < 0) { + if (ret != -EAGAIN) + pr_warn("failed to accept err=%d\n", ret); + return; + } + ret = nvmet_tcp_alloc_queue(port, newsock); + if (ret) { + pr_err("failed to allocate queue\n"); + sock_release(newsock); + } + } +} + +static void nvmet_tcp_listen_data_ready(struct sock *sk) +{ + struct nvmet_tcp_port *port; + + read_lock_bh(&sk->sk_callback_lock); + port = sk->sk_user_data; + if (!port) + goto out; + + if (sk->sk_state == TCP_LISTEN) + schedule_work(&port->accept_work); +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static int nvmet_tcp_add_port(struct nvmet_port *nport) +{ + struct nvmet_tcp_port *port; + __kernel_sa_family_t af; + int opt, ret; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + switch (nport->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + af = AF_INET; + break; + case NVMF_ADDR_FAMILY_IP6: + af = AF_INET6; + break; + default: + pr_err("address family %d not supported\n", + nport->disc_addr.adrfam); + ret = -EINVAL; + goto err_port; + } + + ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, + nport->disc_addr.trsvcid, &port->addr); + if (ret) { + pr_err("malformed ip/port passed: %s:%s\n", + nport->disc_addr.traddr, nport->disc_addr.trsvcid); + goto err_port; + } + + port->nport = nport; + port->last_cpu = -1; + INIT_WORK(&port->accept_work, nvmet_tcp_accept_work); + if (port->nport->inline_data_size < 0) + port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE; + + ret = sock_create(port->addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &port->sock); + if (ret) { + pr_err("failed to create a socket\n"); + goto err_port; + } + + port->sock->sk->sk_user_data = port; + port->data_ready = port->sock->sk->sk_data_ready; + port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; + + opt = 1; + ret = kernel_setsockopt(port->sock, IPPROTO_TCP, + TCP_NODELAY, (char *)&opt, sizeof(opt)); + if (ret) { + pr_err("failed to set TCP_NODELAY sock opt %d\n", ret); + goto err_sock; + } + + ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&opt, sizeof(opt)); + if (ret) { + pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret); + goto err_sock; + } + + ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr, + sizeof(port->addr)); + if (ret) { + pr_err("failed to bind port socket %d\n", ret); + goto err_sock; + } + + ret = kernel_listen(port->sock, 128); + if (ret) { + pr_err("failed to listen %d on port sock\n", ret); + goto err_sock; + } + + nport->priv = port; + pr_info("enabling port %d (%pISpc)\n", + le16_to_cpu(nport->disc_addr.portid), &port->addr); + + return 0; + +err_sock: + sock_release(port->sock); +err_port: + kfree(port); + return ret; +} + +static void nvmet_tcp_remove_port(struct nvmet_port *nport) +{ + struct nvmet_tcp_port *port = nport->priv; + + write_lock_bh(&port->sock->sk->sk_callback_lock); + port->sock->sk->sk_data_ready = port->data_ready; + port->sock->sk->sk_user_data = NULL; + write_unlock_bh(&port->sock->sk->sk_callback_lock); + cancel_work_sync(&port->accept_work); + + sock_release(port->sock); + kfree(port); +} + +static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl) +{ + struct nvmet_tcp_queue *queue; + + mutex_lock(&nvmet_tcp_queue_mutex); + list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) + if (queue->nvme_sq.ctrl == ctrl) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + mutex_unlock(&nvmet_tcp_queue_mutex); +} + +static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq) +{ + struct nvmet_tcp_queue *queue = + container_of(sq, struct nvmet_tcp_queue, nvme_sq); + + if (sq->qid == 0) { + /* Let inflight controller teardown complete */ + flush_scheduled_work(); + } + + queue->nr_cmds = sq->size * 2; + if (nvmet_tcp_alloc_cmds(queue)) + return NVME_SC_INTERNAL; + return 0; +} + +static void nvmet_tcp_disc_port_addr(struct nvmet_req *req, + struct nvmet_port *nport, char *traddr) +{ + struct nvmet_tcp_port *port = nport->priv; + + if (inet_addr_is_any((struct sockaddr *)&port->addr)) { + struct nvmet_tcp_cmd *cmd = + container_of(req, struct nvmet_tcp_cmd, req); + struct nvmet_tcp_queue *queue = cmd->queue; + + sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr); + } else { + memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); + } +} + +static struct nvmet_fabrics_ops nvmet_tcp_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_TCP, + .msdbd = 1, + .has_keyed_sgls = 0, + .add_port = nvmet_tcp_add_port, + .remove_port = nvmet_tcp_remove_port, + .queue_response = nvmet_tcp_queue_response, + .delete_ctrl = nvmet_tcp_delete_ctrl, + .install_queue = nvmet_tcp_install_queue, + .disc_traddr = nvmet_tcp_disc_port_addr, +}; + +static int __init nvmet_tcp_init(void) +{ + int ret; + + nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0); + if (!nvmet_tcp_wq) + return -ENOMEM; + + ret = nvmet_register_transport(&nvmet_tcp_ops); + if (ret) + goto err; + + return 0; +err: + destroy_workqueue(nvmet_tcp_wq); + return ret; +} + +static void __exit nvmet_tcp_exit(void) +{ + struct nvmet_tcp_queue *queue; + + nvmet_unregister_transport(&nvmet_tcp_ops); + + flush_scheduled_work(); + mutex_lock(&nvmet_tcp_queue_mutex); + list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + mutex_unlock(&nvmet_tcp_queue_mutex); + flush_scheduled_work(); + + destroy_workqueue(nvmet_tcp_wq); +} + +module_init(nvmet_tcp_init); +module_exit(nvmet_tcp_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */ diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 2016e0ed5865..8e26001dc11c 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -412,6 +412,7 @@ static int dasd_ioctl_information(struct dasd_block *block, struct ccw_dev_id dev_id; struct dasd_device *base; struct ccw_device *cdev; + struct list_head *l; unsigned long flags; int rc; @@ -462,23 +463,10 @@ static int dasd_ioctl_information(struct dasd_block *block, memcpy(dasd_info->type, base->discipline->name, 4); - if (block->request_queue->request_fn) { - struct list_head *l; -#ifdef DASD_EXTENDED_PROFILING - { - struct list_head *l; - spin_lock_irqsave(&block->lock, flags); - list_for_each(l, &block->request_queue->queue_head) - dasd_info->req_queue_len++; - spin_unlock_irqrestore(&block->lock, flags); - } -#endif /* DASD_EXTENDED_PROFILING */ - spin_lock_irqsave(get_ccwdev_lock(base->cdev), flags); - list_for_each(l, &base->ccw_queue) - dasd_info->chanq_len++; - spin_unlock_irqrestore(get_ccwdev_lock(base->cdev), - flags); - } + spin_lock_irqsave(&block->queue_lock, flags); + list_for_each(l, &base->ccw_queue) + dasd_info->chanq_len++; + spin_unlock_irqrestore(&block->queue_lock, flags); rc = 0; if (copy_to_user(argp, dasd_info, diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 640cd1b31a18..f38882f6f37d 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -50,18 +50,6 @@ config SCSI_NETLINK default n depends on NET -config SCSI_MQ_DEFAULT - bool "SCSI: use blk-mq I/O path by default" - default y - depends on SCSI - ---help--- - This option enables the blk-mq based I/O path for SCSI devices by - default. With this option the scsi_mod.use_blk_mq module/boot - option defaults to Y, without it to N, but it can still be - overridden either way. - - If unsure say Y. - config SCSI_PROC_FS bool "legacy /proc/scsi/ support" depends on SCSI && PROC_FS diff --git a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c index e9e669a6c2bc..6bad2689edd4 100644 --- a/drivers/scsi/bnx2i/bnx2i_hwi.c +++ b/drivers/scsi/bnx2i/bnx2i_hwi.c @@ -1906,7 +1906,6 @@ static int bnx2i_queue_scsi_cmd_resp(struct iscsi_session *session, struct iscsi_task *task; struct scsi_cmnd *sc; int rc = 0; - int cpu; spin_lock(&session->back_lock); task = iscsi_itt_to_task(bnx2i_conn->cls_conn->dd_data, @@ -1917,14 +1916,9 @@ static int bnx2i_queue_scsi_cmd_resp(struct iscsi_session *session, } sc = task->sc; - if (!blk_rq_cpu_valid(sc->request)) - cpu = smp_processor_id(); - else - cpu = sc->request->cpu; - spin_unlock(&session->back_lock); - p = &per_cpu(bnx2i_percpu, cpu); + p = &per_cpu(bnx2i_percpu, blk_mq_rq_cpu(sc->request)); spin_lock(&p->p_work_lock); if (unlikely(!p->iothread)) { rc = -EINVAL; diff --git a/drivers/scsi/csiostor/csio_scsi.c b/drivers/scsi/csiostor/csio_scsi.c index 8c15b7acb4b7..a95debbea0e4 100644 --- a/drivers/scsi/csiostor/csio_scsi.c +++ b/drivers/scsi/csiostor/csio_scsi.c @@ -1780,16 +1780,10 @@ csio_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmnd) int nsge = 0; int rv = SCSI_MLQUEUE_HOST_BUSY, nr; int retval; - int cpu; struct csio_scsi_qset *sqset; struct fc_rport *rport = starget_to_rport(scsi_target(cmnd->device)); - if (!blk_rq_cpu_valid(cmnd->request)) - cpu = smp_processor_id(); - else - cpu = cmnd->request->cpu; - - sqset = &hw->sqset[ln->portid][cpu]; + sqset = &hw->sqset[ln->portid][blk_mq_rq_cpu(cmnd->request)]; nr = fc_remote_port_chkready(rport); if (nr) { diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index 6637116529aa..abdc9eac4173 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -3088,12 +3088,6 @@ static ssize_t hwq_mode_store(struct device *dev, return -EINVAL; } - if ((mode == HWQ_MODE_TAG) && !shost_use_blk_mq(shost)) { - dev_info(cfgdev, "SCSI-MQ is not enabled, use a different " - "HWQ steering mode.\n"); - return -EINVAL; - } - afu->hwq_mode = mode; return count; diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index 12dc7100bb4c..d7ac498ba35a 100644 --- a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -1071,28 +1071,29 @@ static void alua_check(struct scsi_device *sdev, bool force) * Fail I/O to all paths not in state * active/optimized or active/non-optimized. */ -static int alua_prep_fn(struct scsi_device *sdev, struct request *req) +static blk_status_t alua_prep_fn(struct scsi_device *sdev, struct request *req) { struct alua_dh_data *h = sdev->handler_data; struct alua_port_group *pg; unsigned char state = SCSI_ACCESS_STATE_OPTIMAL; - int ret = BLKPREP_OK; rcu_read_lock(); pg = rcu_dereference(h->pg); if (pg) state = pg->state; rcu_read_unlock(); - if (state == SCSI_ACCESS_STATE_TRANSITIONING) - ret = BLKPREP_DEFER; - else if (state != SCSI_ACCESS_STATE_OPTIMAL && - state != SCSI_ACCESS_STATE_ACTIVE && - state != SCSI_ACCESS_STATE_LBA) { - ret = BLKPREP_KILL; - req->rq_flags |= RQF_QUIET; - } - return ret; + switch (state) { + case SCSI_ACCESS_STATE_OPTIMAL: + case SCSI_ACCESS_STATE_ACTIVE: + case SCSI_ACCESS_STATE_LBA: + return BLK_STS_OK; + case SCSI_ACCESS_STATE_TRANSITIONING: + return BLK_STS_RESOURCE; + default: + req->rq_flags |= RQF_QUIET; + return BLK_STS_IOERR; + } } static void alua_rescan(struct scsi_device *sdev) diff --git a/drivers/scsi/device_handler/scsi_dh_emc.c b/drivers/scsi/device_handler/scsi_dh_emc.c index 95c47909a58f..bea8e13febb6 100644 --- a/drivers/scsi/device_handler/scsi_dh_emc.c +++ b/drivers/scsi/device_handler/scsi_dh_emc.c @@ -341,17 +341,17 @@ static int clariion_check_sense(struct scsi_device *sdev, return SCSI_RETURN_NOT_HANDLED; } -static int clariion_prep_fn(struct scsi_device *sdev, struct request *req) +static blk_status_t clariion_prep_fn(struct scsi_device *sdev, + struct request *req) { struct clariion_dh_data *h = sdev->handler_data; - int ret = BLKPREP_OK; if (h->lun_state != CLARIION_LUN_OWNED) { - ret = BLKPREP_KILL; req->rq_flags |= RQF_QUIET; + return BLK_STS_IOERR; } - return ret; + return BLK_STS_OK; } static int clariion_std_inquiry(struct scsi_device *sdev, diff --git a/drivers/scsi/device_handler/scsi_dh_hp_sw.c b/drivers/scsi/device_handler/scsi_dh_hp_sw.c index e65a0ebb4b54..80129b033855 100644 --- a/drivers/scsi/device_handler/scsi_dh_hp_sw.c +++ b/drivers/scsi/device_handler/scsi_dh_hp_sw.c @@ -172,17 +172,16 @@ retry: return rc; } -static int hp_sw_prep_fn(struct scsi_device *sdev, struct request *req) +static blk_status_t hp_sw_prep_fn(struct scsi_device *sdev, struct request *req) { struct hp_sw_dh_data *h = sdev->handler_data; - int ret = BLKPREP_OK; if (h->path_state != HP_SW_PATH_ACTIVE) { - ret = BLKPREP_KILL; req->rq_flags |= RQF_QUIET; + return BLK_STS_IOERR; } - return ret; + return BLK_STS_OK; } /* diff --git a/drivers/scsi/device_handler/scsi_dh_rdac.c b/drivers/scsi/device_handler/scsi_dh_rdac.c index d27fabae8ddd..65f1fe343c64 100644 --- a/drivers/scsi/device_handler/scsi_dh_rdac.c +++ b/drivers/scsi/device_handler/scsi_dh_rdac.c @@ -642,17 +642,16 @@ done: return 0; } -static int rdac_prep_fn(struct scsi_device *sdev, struct request *req) +static blk_status_t rdac_prep_fn(struct scsi_device *sdev, struct request *req) { struct rdac_dh_data *h = sdev->handler_data; - int ret = BLKPREP_OK; if (h->state != RDAC_STATE_ACTIVE) { - ret = BLKPREP_KILL; req->rq_flags |= RQF_QUIET; + return BLK_STS_IOERR; } - return ret; + return BLK_STS_OK; } static int rdac_check_sense(struct scsi_device *sdev, diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c index 96acfcecd540..cafbcfb85bfa 100644 --- a/drivers/scsi/fnic/fnic_scsi.c +++ b/drivers/scsi/fnic/fnic_scsi.c @@ -2274,7 +2274,7 @@ fnic_scsi_host_start_tag(struct fnic *fnic, struct scsi_cmnd *sc) return SCSI_NO_TAG; sc->tag = sc->request->tag = dummy->tag; - sc->request->special = sc; + sc->host_scribble = (unsigned char *)dummy; return dummy->tag; } @@ -2286,7 +2286,7 @@ fnic_scsi_host_start_tag(struct fnic *fnic, struct scsi_cmnd *sc) static inline void fnic_scsi_host_end_tag(struct fnic *fnic, struct scsi_cmnd *sc) { - struct request *dummy = sc->request->special; + struct request *dummy = (struct request *)sc->host_scribble; blk_mq_free_request(dummy); } diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index ea4b0bb0c1cd..cc71136ba300 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -222,18 +222,9 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev, if (error) goto fail; - if (shost_use_blk_mq(shost)) { - error = scsi_mq_setup_tags(shost); - if (error) - goto fail; - } else { - shost->bqt = blk_init_tags(shost->can_queue, - shost->hostt->tag_alloc_policy); - if (!shost->bqt) { - error = -ENOMEM; - goto fail; - } - } + error = scsi_mq_setup_tags(shost); + if (error) + goto fail; if (!shost->shost_gendev.parent) shost->shost_gendev.parent = dev ? dev : &platform_bus; @@ -309,8 +300,7 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev, pm_runtime_disable(&shost->shost_gendev); pm_runtime_set_suspended(&shost->shost_gendev); pm_runtime_put_noidle(&shost->shost_gendev); - if (shost_use_blk_mq(shost)) - scsi_mq_destroy_tags(shost); + scsi_mq_destroy_tags(shost); fail: return error; } @@ -344,13 +334,8 @@ static void scsi_host_dev_release(struct device *dev) kfree(dev_name(&shost->shost_dev)); } - if (shost_use_blk_mq(shost)) { - if (shost->tag_set.tags) - scsi_mq_destroy_tags(shost); - } else { - if (shost->bqt) - blk_free_tags(shost->bqt); - } + if (shost->tag_set.tags) + scsi_mq_destroy_tags(shost); kfree(shost->shost_data); @@ -472,8 +457,6 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) else shost->dma_boundary = 0xffffffff; - shost->use_blk_mq = scsi_use_blk_mq || shost->hostt->force_blk_mq; - device_initialize(&shost->shost_gendev); dev_set_name(&shost->shost_gendev, "host%d", shost->host_no); shost->shost_gendev.bus = &scsi_bus_type; diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 4f6cdf53e913..c90b278cc28c 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -601,12 +601,7 @@ void sas_ata_task_abort(struct sas_task *task) /* Bounce SCSI-initiated commands to the SCSI EH */ if (qc->scsicmd) { - struct request_queue *q = qc->scsicmd->device->request_queue; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); blk_abort_request(qc->scsicmd->request); - spin_unlock_irqrestore(q->queue_lock, flags); return; } diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index 33229348dcb6..af085432c5fe 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -930,16 +930,10 @@ void sas_task_abort(struct sas_task *task) return; } - if (dev_is_sata(task->dev)) { + if (dev_is_sata(task->dev)) sas_ata_task_abort(task); - } else { - struct request_queue *q = sc->device->request_queue; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); + else blk_abort_request(sc->request); - spin_unlock_irqrestore(q->queue_lock, flags); - } } void sas_target_destroy(struct scsi_target *starget) diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index 4fa6703a9ec9..baed2b891efb 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -3914,7 +3914,7 @@ int lpfc_sli4_scmd_to_wqidx_distr(struct lpfc_hba *phba, uint32_t tag; uint16_t hwq; - if (cmnd && shost_use_blk_mq(cmnd->device->host)) { + if (cmnd) { tag = blk_mq_unique_tag(cmnd->request); hwq = blk_mq_unique_tag_to_hwq(tag); diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index e19fa883376f..60cf7c5eb880 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -506,11 +506,11 @@ static void osd_request_async_done(struct request *req, blk_status_t error) _set_error_resid(or, req, error); if (req->next_rq) { - __blk_put_request(req->q, req->next_rq); + blk_put_request(req->next_rq); req->next_rq = NULL; } - __blk_put_request(req->q, req); + blk_put_request(req); or->request = NULL; or->in.req = NULL; or->out.req = NULL; diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index 7a1a1edde35d..664c1238a87f 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -341,7 +341,7 @@ static void osst_end_async(struct request *req, blk_status_t status) blk_rq_unmap_user(SRpnt->bio); } - __blk_put_request(req->q, req); + blk_put_request(req); } /* osst_request memory management */ diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c index 105b0e4d7818..311eb22068e1 100644 --- a/drivers/scsi/qedi/qedi_main.c +++ b/drivers/scsi/qedi/qedi_main.c @@ -644,8 +644,7 @@ static struct qedi_ctx *qedi_host_alloc(struct pci_dev *pdev) qedi->max_active_conns = ISCSI_MAX_SESS_PER_HBA; qedi->max_sqes = QEDI_SQ_SIZE; - if (shost_use_blk_mq(shost)) - shost->nr_hw_queues = MIN_NUM_CPUS_MSIX(qedi); + shost->nr_hw_queues = MIN_NUM_CPUS_MSIX(qedi); pci_set_drvdata(pdev, qedi); diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index 7e78e7eff783..fccc733145fc 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -272,17 +272,6 @@ static void qla_nvme_fcp_abort(struct nvme_fc_local_port *lport, schedule_work(&priv->abort_work); } -static void qla_nvme_poll(struct nvme_fc_local_port *lport, void *hw_queue_handle) -{ - struct qla_qpair *qpair = hw_queue_handle; - unsigned long flags; - struct scsi_qla_host *vha = lport->private; - - spin_lock_irqsave(&qpair->qp_lock, flags); - qla24xx_process_response_queue(vha, qpair->rsp); - spin_unlock_irqrestore(&qpair->qp_lock, flags); -} - static inline int qla2x00_start_nvme_mq(srb_t *sp) { unsigned long flags; @@ -578,7 +567,6 @@ static struct nvme_fc_port_template qla_nvme_fc_transport = { .ls_abort = qla_nvme_ls_abort, .fcp_io = qla_nvme_post_cmd, .fcp_abort = qla_nvme_fcp_abort, - .poll_queue = qla_nvme_poll, .max_hw_queues = 8, .max_sgl_segments = 128, .max_dif_sgl_segments = 64, diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index d0ecc729a90a..f92196ec5489 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -857,13 +857,9 @@ qla2xxx_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) } if (ha->mqenable) { - if (shost_use_blk_mq(vha->host)) { - tag = blk_mq_unique_tag(cmd->request); - hwq = blk_mq_unique_tag_to_hwq(tag); - qpair = ha->queue_pair_map[hwq]; - } else if (vha->vp_idx && vha->qpair) { - qpair = vha->qpair; - } + tag = blk_mq_unique_tag(cmd->request); + hwq = blk_mq_unique_tag_to_hwq(tag); + qpair = ha->queue_pair_map[hwq]; if (qpair) return qla2xxx_mqueuecommand(host, cmd, qpair); @@ -1464,7 +1460,7 @@ __qla2xxx_eh_generic_reset(char *name, enum nexus_wait_type type, goto eh_reset_failed; } err = 2; - if (do_reset(fcport, cmd->device->lun, cmd->request->cpu + 1) + if (do_reset(fcport, cmd->device->lun, blk_mq_rq_cpu(cmd->request) + 1) != QLA_SUCCESS) { ql_log(ql_log_warn, vha, 0x800c, "do_reset failed for cmd=%p.\n", cmd); @@ -3159,7 +3155,7 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) goto probe_failed; } - if (ha->mqenable && shost_use_blk_mq(host)) { + if (ha->mqenable) { /* number of hardware queues supported by blk/scsi-mq*/ host->nr_hw_queues = ha->max_qpairs; @@ -3271,25 +3267,17 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) base_vha->mgmt_svr_loop_id, host->sg_tablesize); if (ha->mqenable) { - bool mq = false; bool startit = false; - if (QLA_TGT_MODE_ENABLED()) { - mq = true; + if (QLA_TGT_MODE_ENABLED()) startit = false; - } - if ((ql2x_ini_mode == QLA2XXX_INI_MODE_ENABLED) && - shost_use_blk_mq(host)) { - mq = true; + if (ql2x_ini_mode == QLA2XXX_INI_MODE_ENABLED) startit = true; - } - if (mq) { - /* Create start of day qpairs for Block MQ */ - for (i = 0; i < ha->max_qpairs; i++) - qla2xxx_create_qpair(base_vha, 5, 0, startit); - } + /* Create start of day qpairs for Block MQ */ + for (i = 0; i < ha->max_qpairs; i++) + qla2xxx_create_qpair(base_vha, 5, 0, startit); } if (ha->flags.running_gold_fw) @@ -6952,11 +6940,12 @@ static int qla2xxx_map_queues(struct Scsi_Host *shost) { int rc; scsi_qla_host_t *vha = (scsi_qla_host_t *)shost->hostdata; + struct blk_mq_queue_map *qmap = &shost->tag_set.map[0]; if (USER_CTRL_IRQ(vha->hw)) - rc = blk_mq_map_queues(&shost->tag_set); + rc = blk_mq_map_queues(qmap); else - rc = blk_mq_pci_map_queues(&shost->tag_set, vha->hw->pdev, 0); + rc = blk_mq_pci_map_queues(qmap, vha->hw->pdev, 0); return rc; } diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index fc1356d101b0..7675ff0ca2ea 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -780,11 +780,8 @@ MODULE_LICENSE("GPL"); module_param(scsi_logging_level, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(scsi_logging_level, "a bit mask of logging levels"); -#ifdef CONFIG_SCSI_MQ_DEFAULT +/* This should go away in the future, it doesn't do anything anymore */ bool scsi_use_blk_mq = true; -#else -bool scsi_use_blk_mq = false; -#endif module_param_named(use_blk_mq, scsi_use_blk_mq, bool, S_IWUSR | S_IRUGO); static int __init init_scsi(void) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 60bcc6df97a9..4740f1e9dd17 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -5881,8 +5881,7 @@ static int sdebug_driver_probe(struct device *dev) } /* Decide whether to tell scsi subsystem that we want mq */ /* Following should give the same answer for each host */ - if (shost_use_blk_mq(hpnt)) - hpnt->nr_hw_queues = submit_queues; + hpnt->nr_hw_queues = submit_queues; sdbg_host->shost = hpnt; *((struct sdebug_host_info **)hpnt->hostdata) = sdbg_host; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index c736d61b1648..16eef068e9e9 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -297,19 +297,19 @@ enum blk_eh_timer_return scsi_times_out(struct request *req) if (rtn == BLK_EH_DONE) { /* - * For blk-mq, we must set the request state to complete now - * before sending the request to the scsi error handler. This - * will prevent a use-after-free in the event the LLD manages - * to complete the request before the error handler finishes - * processing this timed out request. + * Set the command to complete first in order to prevent a real + * completion from releasing the command while error handling + * is using it. If the command was already completed, then the + * lower level driver beat the timeout handler, and it is safe + * to return without escalating error recovery. * - * If the request was already completed, then the LLD beat the - * time out handler from transferring the request to the scsi - * error handler. In that case we can return immediately as no - * further action is required. + * If timeout handling lost the race to a real completion, the + * block layer may ignore that due to a fake timeout injection, + * so return RESET_TIMER to allow error handling another shot + * at this command. */ - if (req->q->mq_ops && !blk_mq_mark_complete(req)) - return rtn; + if (test_and_set_bit(SCMD_STATE_COMPLETE, &scmd->state)) + return BLK_EH_RESET_TIMER; if (scsi_abort_command(scmd) != SUCCESS) { set_host_byte(scmd, DID_TIME_OUT); scsi_eh_scmd_add(scmd); @@ -1932,7 +1932,7 @@ maybe_retry: static void eh_lock_door_done(struct request *req, blk_status_t status) { - __blk_put_request(req->q, req); + blk_put_request(req); } /** diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index fa6e0c3b3aa6..0dbf25512778 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -168,8 +168,6 @@ static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd) static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy) { struct scsi_device *device = cmd->device; - struct request_queue *q = device->request_queue; - unsigned long flags; SCSI_LOG_MLQUEUE(1, scmd_printk(KERN_INFO, cmd, "Inserting command %p into mlqueue\n", cmd)); @@ -190,26 +188,20 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy) * before blk_cleanup_queue() finishes. */ cmd->result = 0; - if (q->mq_ops) { - /* - * Before a SCSI command is dispatched, - * get_device(&sdev->sdev_gendev) is called and the host, - * target and device busy counters are increased. Since - * requeuing a request causes these actions to be repeated and - * since scsi_device_unbusy() has already been called, - * put_device(&device->sdev_gendev) must still be called. Call - * put_device() after blk_mq_requeue_request() to avoid that - * removal of the SCSI device can start before requeueing has - * happened. - */ - blk_mq_requeue_request(cmd->request, true); - put_device(&device->sdev_gendev); - return; - } - spin_lock_irqsave(q->queue_lock, flags); - blk_requeue_request(q, cmd->request); - kblockd_schedule_work(&device->requeue_work); - spin_unlock_irqrestore(q->queue_lock, flags); + + /* + * Before a SCSI command is dispatched, + * get_device(&sdev->sdev_gendev) is called and the host, + * target and device busy counters are increased. Since + * requeuing a request causes these actions to be repeated and + * since scsi_device_unbusy() has already been called, + * put_device(&device->sdev_gendev) must still be called. Call + * put_device() after blk_mq_requeue_request() to avoid that + * removal of the SCSI device can start before requeueing has + * happened. + */ + blk_mq_requeue_request(cmd->request, true); + put_device(&device->sdev_gendev); } /* @@ -370,10 +362,7 @@ void scsi_device_unbusy(struct scsi_device *sdev) static void scsi_kick_queue(struct request_queue *q) { - if (q->mq_ops) - blk_mq_run_hw_queues(q, false); - else - blk_run_queue(q); + blk_mq_run_hw_queues(q, false); } /* @@ -534,10 +523,7 @@ static void scsi_run_queue(struct request_queue *q) if (!list_empty(&sdev->host->starved_list)) scsi_starved_list_run(sdev->host); - if (q->mq_ops) - blk_mq_run_hw_queues(q, false); - else - blk_run_queue(q); + blk_mq_run_hw_queues(q, false); } void scsi_requeue_run_queue(struct work_struct *work) @@ -550,42 +536,6 @@ void scsi_requeue_run_queue(struct work_struct *work) scsi_run_queue(q); } -/* - * Function: scsi_requeue_command() - * - * Purpose: Handle post-processing of completed commands. - * - * Arguments: q - queue to operate on - * cmd - command that may need to be requeued. - * - * Returns: Nothing - * - * Notes: After command completion, there may be blocks left - * over which weren't finished by the previous command - * this can be for a number of reasons - the main one is - * I/O errors in the middle of the request, in which case - * we need to request the blocks that come after the bad - * sector. - * Notes: Upon return, cmd is a stale pointer. - */ -static void scsi_requeue_command(struct request_queue *q, struct scsi_cmnd *cmd) -{ - struct scsi_device *sdev = cmd->device; - struct request *req = cmd->request; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - blk_unprep_request(req); - req->special = NULL; - scsi_put_command(cmd); - blk_requeue_request(q, req); - spin_unlock_irqrestore(q->queue_lock, flags); - - scsi_run_queue(q); - - put_device(&sdev->sdev_gendev); -} - void scsi_run_host_queues(struct Scsi_Host *shost) { struct scsi_device *sdev; @@ -626,42 +576,6 @@ static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd) scsi_del_cmd_from_list(cmd); } -/* - * Function: scsi_release_buffers() - * - * Purpose: Free resources allocate for a scsi_command. - * - * Arguments: cmd - command that we are bailing. - * - * Lock status: Assumed that no lock is held upon entry. - * - * Returns: Nothing - * - * Notes: In the event that an upper level driver rejects a - * command, we must release resources allocated during - * the __init_io() function. Primarily this would involve - * the scatter-gather table. - */ -static void scsi_release_buffers(struct scsi_cmnd *cmd) -{ - if (cmd->sdb.table.nents) - sg_free_table_chained(&cmd->sdb.table, false); - - memset(&cmd->sdb, 0, sizeof(cmd->sdb)); - - if (scsi_prot_sg_count(cmd)) - sg_free_table_chained(&cmd->prot_sdb->table, false); -} - -static void scsi_release_bidi_buffers(struct scsi_cmnd *cmd) -{ - struct scsi_data_buffer *bidi_sdb = cmd->request->next_rq->special; - - sg_free_table_chained(&bidi_sdb->table, false); - kmem_cache_free(scsi_sdb_cache, bidi_sdb); - cmd->request->next_rq->special = NULL; -} - /* Returns false when no more bytes to process, true if there are more */ static bool scsi_end_request(struct request *req, blk_status_t error, unsigned int bytes, unsigned int bidi_bytes) @@ -687,46 +601,30 @@ static bool scsi_end_request(struct request *req, blk_status_t error, destroy_rcu_head(&cmd->rcu); } - if (req->mq_ctx) { - /* - * In the MQ case the command gets freed by __blk_mq_end_request, - * so we have to do all cleanup that depends on it earlier. - * - * We also can't kick the queues from irq context, so we - * will have to defer it to a workqueue. - */ - scsi_mq_uninit_cmd(cmd); + /* + * In the MQ case the command gets freed by __blk_mq_end_request, + * so we have to do all cleanup that depends on it earlier. + * + * We also can't kick the queues from irq context, so we + * will have to defer it to a workqueue. + */ + scsi_mq_uninit_cmd(cmd); - /* - * queue is still alive, so grab the ref for preventing it - * from being cleaned up during running queue. - */ - percpu_ref_get(&q->q_usage_counter); + /* + * queue is still alive, so grab the ref for preventing it + * from being cleaned up during running queue. + */ + percpu_ref_get(&q->q_usage_counter); - __blk_mq_end_request(req, error); + __blk_mq_end_request(req, error); - if (scsi_target(sdev)->single_lun || - !list_empty(&sdev->host->starved_list)) - kblockd_schedule_work(&sdev->requeue_work); - else - blk_mq_run_hw_queues(q, true); - - percpu_ref_put(&q->q_usage_counter); - } else { - unsigned long flags; - - if (bidi_bytes) - scsi_release_bidi_buffers(cmd); - scsi_release_buffers(cmd); - scsi_put_command(cmd); - - spin_lock_irqsave(q->queue_lock, flags); - blk_finish_request(req, error); - spin_unlock_irqrestore(q->queue_lock, flags); - - scsi_run_queue(q); - } + if (scsi_target(sdev)->single_lun || + !list_empty(&sdev->host->starved_list)) + kblockd_schedule_work(&sdev->requeue_work); + else + blk_mq_run_hw_queues(q, true); + percpu_ref_put(&q->q_usage_counter); put_device(&sdev->sdev_gendev); return false; } @@ -774,13 +672,7 @@ static void scsi_io_completion_reprep(struct scsi_cmnd *cmd, struct request_queue *q) { /* A new command will be prepared and issued. */ - if (q->mq_ops) { - scsi_mq_requeue_cmd(cmd); - } else { - /* Unprep request and put it back at head of the queue. */ - scsi_release_buffers(cmd); - scsi_requeue_command(q, cmd); - } + scsi_mq_requeue_cmd(cmd); } /* Helper for scsi_io_completion() when special action required. */ @@ -1120,7 +1012,8 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) scsi_io_completion_action(cmd, result); } -static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb) +static blk_status_t scsi_init_sgtable(struct request *req, + struct scsi_data_buffer *sdb) { int count; @@ -1129,7 +1022,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb) */ if (unlikely(sg_alloc_table_chained(&sdb->table, blk_rq_nr_phys_segments(req), sdb->table.sgl))) - return BLKPREP_DEFER; + return BLK_STS_RESOURCE; /* * Next, walk the list, and fill in the addresses and sizes of @@ -1139,7 +1032,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb) BUG_ON(count > sdb->table.nents); sdb->table.nents = count; sdb->length = blk_rq_payload_bytes(req); - return BLKPREP_OK; + return BLK_STS_OK; } /* @@ -1149,62 +1042,48 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb) * * Arguments: cmd - Command descriptor we wish to initialize * - * Returns: 0 on success - * BLKPREP_DEFER if the failure is retryable - * BLKPREP_KILL if the failure is fatal + * Returns: BLK_STS_OK on success + * BLK_STS_RESOURCE if the failure is retryable + * BLK_STS_IOERR if the failure is fatal */ -int scsi_init_io(struct scsi_cmnd *cmd) +blk_status_t scsi_init_io(struct scsi_cmnd *cmd) { - struct scsi_device *sdev = cmd->device; struct request *rq = cmd->request; - bool is_mq = (rq->mq_ctx != NULL); - int error = BLKPREP_KILL; + blk_status_t ret; if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq))) - goto err_exit; + return BLK_STS_IOERR; - error = scsi_init_sgtable(rq, &cmd->sdb); - if (error) - goto err_exit; + ret = scsi_init_sgtable(rq, &cmd->sdb); + if (ret) + return ret; if (blk_bidi_rq(rq)) { - if (!rq->q->mq_ops) { - struct scsi_data_buffer *bidi_sdb = - kmem_cache_zalloc(scsi_sdb_cache, GFP_ATOMIC); - if (!bidi_sdb) { - error = BLKPREP_DEFER; - goto err_exit; - } - - rq->next_rq->special = bidi_sdb; - } - - error = scsi_init_sgtable(rq->next_rq, rq->next_rq->special); - if (error) - goto err_exit; + ret = scsi_init_sgtable(rq->next_rq, rq->next_rq->special); + if (ret) + goto out_free_sgtables; } if (blk_integrity_rq(rq)) { struct scsi_data_buffer *prot_sdb = cmd->prot_sdb; int ivecs, count; - if (prot_sdb == NULL) { + if (WARN_ON_ONCE(!prot_sdb)) { /* * This can happen if someone (e.g. multipath) * queues a command to a device on an adapter * that does not support DIX. */ - WARN_ON_ONCE(1); - error = BLKPREP_KILL; - goto err_exit; + ret = BLK_STS_IOERR; + goto out_free_sgtables; } ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio); if (sg_alloc_table_chained(&prot_sdb->table, ivecs, prot_sdb->table.sgl)) { - error = BLKPREP_DEFER; - goto err_exit; + ret = BLK_STS_RESOURCE; + goto out_free_sgtables; } count = blk_rq_map_integrity_sg(rq->q, rq->bio, @@ -1216,17 +1095,10 @@ int scsi_init_io(struct scsi_cmnd *cmd) cmd->prot_sdb->table.nents = count; } - return BLKPREP_OK; -err_exit: - if (is_mq) { - scsi_mq_free_sgtables(cmd); - } else { - scsi_release_buffers(cmd); - cmd->request->special = NULL; - scsi_put_command(cmd); - put_device(&sdev->sdev_gendev); - } - return error; + return BLK_STS_OK; +out_free_sgtables: + scsi_mq_free_sgtables(cmd); + return ret; } EXPORT_SYMBOL(scsi_init_io); @@ -1312,7 +1184,8 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd) scsi_add_cmd_to_list(cmd); } -static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req) +static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev, + struct request *req) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); @@ -1323,8 +1196,8 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req) * submit a request without an attached bio. */ if (req->bio) { - int ret = scsi_init_io(cmd); - if (unlikely(ret)) + blk_status_t ret = scsi_init_io(cmd); + if (unlikely(ret != BLK_STS_OK)) return ret; } else { BUG_ON(blk_rq_bytes(req)); @@ -1336,20 +1209,21 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req) cmd->cmnd = scsi_req(req)->cmd; cmd->transfersize = blk_rq_bytes(req); cmd->allowed = scsi_req(req)->retries; - return BLKPREP_OK; + return BLK_STS_OK; } /* * Setup a normal block command. These are simple request from filesystems * that still need to be translated to SCSI CDBs from the ULD. */ -static int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req) +static blk_status_t scsi_setup_fs_cmnd(struct scsi_device *sdev, + struct request *req) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); if (unlikely(sdev->handler && sdev->handler->prep_fn)) { - int ret = sdev->handler->prep_fn(sdev, req); - if (ret != BLKPREP_OK) + blk_status_t ret = sdev->handler->prep_fn(sdev, req); + if (ret != BLK_STS_OK) return ret; } @@ -1358,7 +1232,8 @@ static int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req) return scsi_cmd_to_driver(cmd)->init_command(cmd); } -static int scsi_setup_cmnd(struct scsi_device *sdev, struct request *req) +static blk_status_t scsi_setup_cmnd(struct scsi_device *sdev, + struct request *req) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); @@ -1375,129 +1250,48 @@ static int scsi_setup_cmnd(struct scsi_device *sdev, struct request *req) return scsi_setup_fs_cmnd(sdev, req); } -static int +static blk_status_t scsi_prep_state_check(struct scsi_device *sdev, struct request *req) { - int ret = BLKPREP_OK; - - /* - * If the device is not in running state we will reject some - * or all commands. - */ - if (unlikely(sdev->sdev_state != SDEV_RUNNING)) { - switch (sdev->sdev_state) { - case SDEV_OFFLINE: - case SDEV_TRANSPORT_OFFLINE: - /* - * If the device is offline we refuse to process any - * commands. The device must be brought online - * before trying any recovery commands. - */ - sdev_printk(KERN_ERR, sdev, - "rejecting I/O to offline device\n"); - ret = BLKPREP_KILL; - break; - case SDEV_DEL: - /* - * If the device is fully deleted, we refuse to - * process any commands as well. - */ - sdev_printk(KERN_ERR, sdev, - "rejecting I/O to dead device\n"); - ret = BLKPREP_KILL; - break; - case SDEV_BLOCK: - case SDEV_CREATED_BLOCK: - ret = BLKPREP_DEFER; - break; - case SDEV_QUIESCE: - /* - * If the devices is blocked we defer normal commands. - */ - if (req && !(req->rq_flags & RQF_PREEMPT)) - ret = BLKPREP_DEFER; - break; - default: - /* - * For any other not fully online state we only allow - * special commands. In particular any user initiated - * command is not allowed. - */ - if (req && !(req->rq_flags & RQF_PREEMPT)) - ret = BLKPREP_KILL; - break; - } - } - return ret; -} - -static int -scsi_prep_return(struct request_queue *q, struct request *req, int ret) -{ - struct scsi_device *sdev = q->queuedata; - - switch (ret) { - case BLKPREP_KILL: - case BLKPREP_INVALID: - scsi_req(req)->result = DID_NO_CONNECT << 16; - /* release the command and kill it */ - if (req->special) { - struct scsi_cmnd *cmd = req->special; - scsi_release_buffers(cmd); - scsi_put_command(cmd); - put_device(&sdev->sdev_gendev); - req->special = NULL; - } - break; - case BLKPREP_DEFER: + switch (sdev->sdev_state) { + case SDEV_OFFLINE: + case SDEV_TRANSPORT_OFFLINE: /* - * If we defer, the blk_peek_request() returns NULL, but the - * queue must be restarted, so we schedule a callback to happen - * shortly. + * If the device is offline we refuse to process any + * commands. The device must be brought online + * before trying any recovery commands. */ - if (atomic_read(&sdev->device_busy) == 0) - blk_delay_queue(q, SCSI_QUEUE_DELAY); - break; + sdev_printk(KERN_ERR, sdev, + "rejecting I/O to offline device\n"); + return BLK_STS_IOERR; + case SDEV_DEL: + /* + * If the device is fully deleted, we refuse to + * process any commands as well. + */ + sdev_printk(KERN_ERR, sdev, + "rejecting I/O to dead device\n"); + return BLK_STS_IOERR; + case SDEV_BLOCK: + case SDEV_CREATED_BLOCK: + return BLK_STS_RESOURCE; + case SDEV_QUIESCE: + /* + * If the devices is blocked we defer normal commands. + */ + if (req && !(req->rq_flags & RQF_PREEMPT)) + return BLK_STS_RESOURCE; + return BLK_STS_OK; default: - req->rq_flags |= RQF_DONTPREP; + /* + * For any other not fully online state we only allow + * special commands. In particular any user initiated + * command is not allowed. + */ + if (req && !(req->rq_flags & RQF_PREEMPT)) + return BLK_STS_IOERR; + return BLK_STS_OK; } - - return ret; -} - -static int scsi_prep_fn(struct request_queue *q, struct request *req) -{ - struct scsi_device *sdev = q->queuedata; - struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); - int ret; - - ret = scsi_prep_state_check(sdev, req); - if (ret != BLKPREP_OK) - goto out; - - if (!req->special) { - /* Bail if we can't get a reference to the device */ - if (unlikely(!get_device(&sdev->sdev_gendev))) { - ret = BLKPREP_DEFER; - goto out; - } - - scsi_init_command(sdev, cmd); - req->special = cmd; - } - - cmd->tag = req->tag; - cmd->request = req; - cmd->prot_op = SCSI_PROT_NORMAL; - - ret = scsi_setup_cmnd(sdev, req); -out: - return scsi_prep_return(q, req, ret); -} - -static void scsi_unprep_fn(struct request_queue *q, struct request *req) -{ - scsi_uninit_cmd(blk_mq_rq_to_pdu(req)); } /* @@ -1519,14 +1313,8 @@ static inline int scsi_dev_queue_ready(struct request_queue *q, /* * unblock after device_blocked iterates to zero */ - if (atomic_dec_return(&sdev->device_blocked) > 0) { - /* - * For the MQ case we take care of this in the caller. - */ - if (!q->mq_ops) - blk_delay_queue(q, SCSI_QUEUE_DELAY); + if (atomic_dec_return(&sdev->device_blocked) > 0) goto out_dec; - } SCSI_LOG_MLQUEUE(3, sdev_printk(KERN_INFO, sdev, "unblocking device at zero depth\n")); } @@ -1661,13 +1449,13 @@ out_dec: * needs to return 'not busy'. Otherwise, request stacking drivers * may hold requests forever. */ -static int scsi_lld_busy(struct request_queue *q) +static bool scsi_mq_lld_busy(struct request_queue *q) { struct scsi_device *sdev = q->queuedata; struct Scsi_Host *shost; if (blk_queue_dying(q)) - return 0; + return false; shost = sdev->host; @@ -1678,43 +1466,9 @@ static int scsi_lld_busy(struct request_queue *q) * in SCSI layer. */ if (scsi_host_in_recovery(shost) || scsi_device_is_busy(sdev)) - return 1; + return true; - return 0; -} - -/* - * Kill a request for a dead device - */ -static void scsi_kill_request(struct request *req, struct request_queue *q) -{ - struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); - struct scsi_device *sdev; - struct scsi_target *starget; - struct Scsi_Host *shost; - - blk_start_request(req); - - scmd_printk(KERN_INFO, cmd, "killing request\n"); - - sdev = cmd->device; - starget = scsi_target(sdev); - shost = sdev->host; - scsi_init_cmd_errh(cmd); - cmd->result = DID_NO_CONNECT << 16; - atomic_inc(&cmd->device->iorequest_cnt); - - /* - * SCSI request completion path will do scsi_device_unbusy(), - * bump busy counts. To bump the counters, we need to dance - * with the locks as normal issue path does. - */ - atomic_inc(&sdev->device_busy); - atomic_inc(&shost->host_busy); - if (starget->can_queue > 0) - atomic_inc(&starget->target_busy); - - blk_complete_request(req); + return false; } static void scsi_softirq_done(struct request *rq) @@ -1837,170 +1591,6 @@ static int scsi_dispatch_cmd(struct scsi_cmnd *cmd) return 0; } -/** - * scsi_done - Invoke completion on finished SCSI command. - * @cmd: The SCSI Command for which a low-level device driver (LLDD) gives - * ownership back to SCSI Core -- i.e. the LLDD has finished with it. - * - * Description: This function is the mid-level's (SCSI Core) interrupt routine, - * which regains ownership of the SCSI command (de facto) from a LLDD, and - * calls blk_complete_request() for further processing. - * - * This function is interrupt context safe. - */ -static void scsi_done(struct scsi_cmnd *cmd) -{ - trace_scsi_dispatch_cmd_done(cmd); - blk_complete_request(cmd->request); -} - -/* - * Function: scsi_request_fn() - * - * Purpose: Main strategy routine for SCSI. - * - * Arguments: q - Pointer to actual queue. - * - * Returns: Nothing - * - * Lock status: request queue lock assumed to be held when called. - * - * Note: See sd_zbc.c sd_zbc_write_lock_zone() for write order - * protection for ZBC disks. - */ -static void scsi_request_fn(struct request_queue *q) - __releases(q->queue_lock) - __acquires(q->queue_lock) -{ - struct scsi_device *sdev = q->queuedata; - struct Scsi_Host *shost; - struct scsi_cmnd *cmd; - struct request *req; - - /* - * To start with, we keep looping until the queue is empty, or until - * the host is no longer able to accept any more requests. - */ - shost = sdev->host; - for (;;) { - int rtn; - /* - * get next queueable request. We do this early to make sure - * that the request is fully prepared even if we cannot - * accept it. - */ - req = blk_peek_request(q); - if (!req) - break; - - if (unlikely(!scsi_device_online(sdev))) { - sdev_printk(KERN_ERR, sdev, - "rejecting I/O to offline device\n"); - scsi_kill_request(req, q); - continue; - } - - if (!scsi_dev_queue_ready(q, sdev)) - break; - - /* - * Remove the request from the request list. - */ - if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req))) - blk_start_request(req); - - spin_unlock_irq(q->queue_lock); - cmd = blk_mq_rq_to_pdu(req); - if (cmd != req->special) { - printk(KERN_CRIT "impossible request in %s.\n" - "please mail a stack trace to " - "linux-scsi@vger.kernel.org\n", - __func__); - blk_dump_rq_flags(req, "foo"); - BUG(); - } - - /* - * We hit this when the driver is using a host wide - * tag map. For device level tag maps the queue_depth check - * in the device ready fn would prevent us from trying - * to allocate a tag. Since the map is a shared host resource - * we add the dev to the starved list so it eventually gets - * a run when a tag is freed. - */ - if (blk_queue_tagged(q) && !(req->rq_flags & RQF_QUEUED)) { - spin_lock_irq(shost->host_lock); - if (list_empty(&sdev->starved_entry)) - list_add_tail(&sdev->starved_entry, - &shost->starved_list); - spin_unlock_irq(shost->host_lock); - goto not_ready; - } - - if (!scsi_target_queue_ready(shost, sdev)) - goto not_ready; - - if (!scsi_host_queue_ready(q, shost, sdev)) - goto host_not_ready; - - if (sdev->simple_tags) - cmd->flags |= SCMD_TAGGED; - else - cmd->flags &= ~SCMD_TAGGED; - - /* - * Finally, initialize any error handling parameters, and set up - * the timers for timeouts. - */ - scsi_init_cmd_errh(cmd); - - /* - * Dispatch the command to the low-level driver. - */ - cmd->scsi_done = scsi_done; - rtn = scsi_dispatch_cmd(cmd); - if (rtn) { - scsi_queue_insert(cmd, rtn); - spin_lock_irq(q->queue_lock); - goto out_delay; - } - spin_lock_irq(q->queue_lock); - } - - return; - - host_not_ready: - if (scsi_target(sdev)->can_queue > 0) - atomic_dec(&scsi_target(sdev)->target_busy); - not_ready: - /* - * lock q, handle tag, requeue req, and decrement device_busy. We - * must return with queue_lock held. - * - * Decrementing device_busy without checking it is OK, as all such - * cases (host limits or settings) should run the queue at some - * later time. - */ - spin_lock_irq(q->queue_lock); - blk_requeue_request(q, req); - atomic_dec(&sdev->device_busy); -out_delay: - if (!atomic_read(&sdev->device_busy) && !scsi_device_blocked(sdev)) - blk_delay_queue(q, SCSI_QUEUE_DELAY); -} - -static inline blk_status_t prep_to_mq(int ret) -{ - switch (ret) { - case BLKPREP_OK: - return BLK_STS_OK; - case BLKPREP_DEFER: - return BLK_STS_RESOURCE; - default: - return BLK_STS_IOERR; - } -} - /* Size in bytes of the sg-list stored in the scsi-mq command-private data. */ static unsigned int scsi_mq_sgl_size(struct Scsi_Host *shost) { @@ -2008,7 +1598,7 @@ static unsigned int scsi_mq_sgl_size(struct Scsi_Host *shost) sizeof(struct scatterlist); } -static int scsi_mq_prep_fn(struct request *req) +static blk_status_t scsi_mq_prep_fn(struct request *req) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); struct scsi_device *sdev = req->q->queuedata; @@ -2052,8 +1642,18 @@ static int scsi_mq_prep_fn(struct request *req) static void scsi_mq_done(struct scsi_cmnd *cmd) { + if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state))) + return; trace_scsi_dispatch_cmd_done(cmd); - blk_mq_complete_request(cmd->request); + + /* + * If the block layer didn't complete the request due to a timeout + * injection, scsi must clear its internal completed state so that the + * timeout handler will see it needs to escalate its own error + * recovery. + */ + if (unlikely(!blk_mq_complete_request(cmd->request))) + clear_bit(SCMD_STATE_COMPLETE, &cmd->state); } static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) @@ -2096,9 +1696,15 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, blk_status_t ret; int reason; - ret = prep_to_mq(scsi_prep_state_check(sdev, req)); - if (ret != BLK_STS_OK) - goto out_put_budget; + /* + * If the device is not in running state we will reject some or all + * commands. + */ + if (unlikely(sdev->sdev_state != SDEV_RUNNING)) { + ret = scsi_prep_state_check(sdev, req); + if (ret != BLK_STS_OK) + goto out_put_budget; + } ret = BLK_STS_RESOURCE; if (!scsi_target_queue_ready(shost, sdev)) @@ -2106,8 +1712,9 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (!scsi_host_queue_ready(q, shost, sdev)) goto out_dec_target_busy; + clear_bit(SCMD_STATE_COMPLETE, &cmd->state); if (!(req->rq_flags & RQF_DONTPREP)) { - ret = prep_to_mq(scsi_mq_prep_fn(req)); + ret = scsi_mq_prep_fn(req); if (ret != BLK_STS_OK) goto out_dec_host_busy; req->rq_flags |= RQF_DONTPREP; @@ -2208,7 +1815,7 @@ static int scsi_map_queues(struct blk_mq_tag_set *set) if (shost->hostt->map_queues) return shost->hostt->map_queues(shost); - return blk_mq_map_queues(set); + return blk_mq_map_queues(&set->map[0]); } void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q) @@ -2251,77 +1858,6 @@ void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q) } EXPORT_SYMBOL_GPL(__scsi_init_queue); -static int scsi_old_init_rq(struct request_queue *q, struct request *rq, - gfp_t gfp) -{ - struct Scsi_Host *shost = q->rq_alloc_data; - const bool unchecked_isa_dma = shost->unchecked_isa_dma; - struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); - - memset(cmd, 0, sizeof(*cmd)); - - if (unchecked_isa_dma) - cmd->flags |= SCMD_UNCHECKED_ISA_DMA; - cmd->sense_buffer = scsi_alloc_sense_buffer(unchecked_isa_dma, gfp, - NUMA_NO_NODE); - if (!cmd->sense_buffer) - goto fail; - cmd->req.sense = cmd->sense_buffer; - - if (scsi_host_get_prot(shost) >= SHOST_DIX_TYPE0_PROTECTION) { - cmd->prot_sdb = kmem_cache_zalloc(scsi_sdb_cache, gfp); - if (!cmd->prot_sdb) - goto fail_free_sense; - } - - return 0; - -fail_free_sense: - scsi_free_sense_buffer(unchecked_isa_dma, cmd->sense_buffer); -fail: - return -ENOMEM; -} - -static void scsi_old_exit_rq(struct request_queue *q, struct request *rq) -{ - struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); - - if (cmd->prot_sdb) - kmem_cache_free(scsi_sdb_cache, cmd->prot_sdb); - scsi_free_sense_buffer(cmd->flags & SCMD_UNCHECKED_ISA_DMA, - cmd->sense_buffer); -} - -struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev) -{ - struct Scsi_Host *shost = sdev->host; - struct request_queue *q; - - q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL); - if (!q) - return NULL; - q->cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size; - q->rq_alloc_data = shost; - q->request_fn = scsi_request_fn; - q->init_rq_fn = scsi_old_init_rq; - q->exit_rq_fn = scsi_old_exit_rq; - q->initialize_rq_fn = scsi_initialize_rq; - - if (blk_init_allocated_queue(q) < 0) { - blk_cleanup_queue(q); - return NULL; - } - - __scsi_init_queue(shost, q); - blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q); - blk_queue_prep_rq(q, scsi_prep_fn); - blk_queue_unprep_rq(q, scsi_unprep_fn); - blk_queue_softirq_done(q, scsi_softirq_done); - blk_queue_rq_timed_out(q, scsi_times_out); - blk_queue_lld_busy(q, scsi_lld_busy); - return q; -} - static const struct blk_mq_ops scsi_mq_ops = { .get_budget = scsi_mq_get_budget, .put_budget = scsi_mq_put_budget, @@ -2334,6 +1870,7 @@ static const struct blk_mq_ops scsi_mq_ops = { .init_request = scsi_mq_init_request, .exit_request = scsi_mq_exit_request, .initialize_rq_fn = scsi_initialize_rq, + .busy = scsi_mq_lld_busy, .map_queues = scsi_map_queues, }; @@ -2388,10 +1925,7 @@ struct scsi_device *scsi_device_from_queue(struct request_queue *q) { struct scsi_device *sdev = NULL; - if (q->mq_ops) { - if (q->mq_ops == &scsi_mq_ops) - sdev = q->queuedata; - } else if (q->request_fn == scsi_request_fn) + if (q->mq_ops == &scsi_mq_ops) sdev = q->queuedata; if (!sdev || !get_device(&sdev->sdev_gendev)) sdev = NULL; @@ -2994,39 +2528,6 @@ void sdev_evt_send_simple(struct scsi_device *sdev, } EXPORT_SYMBOL_GPL(sdev_evt_send_simple); -/** - * scsi_request_fn_active() - number of kernel threads inside scsi_request_fn() - * @sdev: SCSI device to count the number of scsi_request_fn() callers for. - */ -static int scsi_request_fn_active(struct scsi_device *sdev) -{ - struct request_queue *q = sdev->request_queue; - int request_fn_active; - - WARN_ON_ONCE(sdev->host->use_blk_mq); - - spin_lock_irq(q->queue_lock); - request_fn_active = q->request_fn_active; - spin_unlock_irq(q->queue_lock); - - return request_fn_active; -} - -/** - * scsi_wait_for_queuecommand() - wait for ongoing queuecommand() calls - * @sdev: SCSI device pointer. - * - * Wait until the ongoing shost->hostt->queuecommand() calls that are - * invoked from scsi_request_fn() have finished. - */ -static void scsi_wait_for_queuecommand(struct scsi_device *sdev) -{ - WARN_ON_ONCE(sdev->host->use_blk_mq); - - while (scsi_request_fn_active(sdev)) - msleep(20); -} - /** * scsi_device_quiesce - Block user issued commands. * @sdev: scsi device to quiesce. @@ -3150,7 +2651,6 @@ EXPORT_SYMBOL(scsi_target_resume); int scsi_internal_device_block_nowait(struct scsi_device *sdev) { struct request_queue *q = sdev->request_queue; - unsigned long flags; int err = 0; err = scsi_device_set_state(sdev, SDEV_BLOCK); @@ -3166,14 +2666,7 @@ int scsi_internal_device_block_nowait(struct scsi_device *sdev) * block layer from calling the midlayer with this device's * request queue. */ - if (q->mq_ops) { - blk_mq_quiesce_queue_nowait(q); - } else { - spin_lock_irqsave(q->queue_lock, flags); - blk_stop_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); - } - + blk_mq_quiesce_queue_nowait(q); return 0; } EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait); @@ -3204,12 +2697,8 @@ static int scsi_internal_device_block(struct scsi_device *sdev) mutex_lock(&sdev->state_mutex); err = scsi_internal_device_block_nowait(sdev); - if (err == 0) { - if (q->mq_ops) - blk_mq_quiesce_queue(q); - else - scsi_wait_for_queuecommand(sdev); - } + if (err == 0) + blk_mq_quiesce_queue(q); mutex_unlock(&sdev->state_mutex); return err; @@ -3218,15 +2707,8 @@ static int scsi_internal_device_block(struct scsi_device *sdev) void scsi_start_queue(struct scsi_device *sdev) { struct request_queue *q = sdev->request_queue; - unsigned long flags; - if (q->mq_ops) { - blk_mq_unquiesce_queue(q); - } else { - spin_lock_irqsave(q->queue_lock, flags); - blk_start_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); - } + blk_mq_unquiesce_queue(q); } /** diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h index 99f1db5e467e..5f21547b2ad2 100644 --- a/drivers/scsi/scsi_priv.h +++ b/drivers/scsi/scsi_priv.h @@ -92,7 +92,6 @@ extern void scsi_queue_insert(struct scsi_cmnd *cmd, int reason); extern void scsi_io_completion(struct scsi_cmnd *, unsigned int); extern void scsi_run_host_queues(struct Scsi_Host *shost); extern void scsi_requeue_run_queue(struct work_struct *work); -extern struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev); extern struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev); extern void scsi_start_queue(struct scsi_device *sdev); extern int scsi_mq_setup_tags(struct Scsi_Host *shost); diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 78ca63dfba4a..dd0d516f65e2 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -266,10 +266,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, */ sdev->borken = 1; - if (shost_use_blk_mq(shost)) - sdev->request_queue = scsi_mq_alloc_queue(sdev); - else - sdev->request_queue = scsi_old_alloc_queue(sdev); + sdev->request_queue = scsi_mq_alloc_queue(sdev); if (!sdev->request_queue) { /* release fn is set up in scsi_sysfs_device_initialise, so * have to free and put manually here */ @@ -280,11 +277,6 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, WARN_ON_ONCE(!blk_get_queue(sdev->request_queue)); sdev->request_queue->queuedata = sdev; - if (!shost_use_blk_mq(sdev->host)) { - blk_queue_init_tags(sdev->request_queue, - sdev->host->cmd_per_lun, shost->bqt, - shost->hostt->tag_alloc_policy); - } scsi_change_queue_depth(sdev, sdev->host->cmd_per_lun ? sdev->host->cmd_per_lun : 1); diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 3aee9464a7bf..6a9040faed00 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -367,7 +367,6 @@ store_shost_eh_deadline(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(eh_deadline, S_IRUGO | S_IWUSR, show_shost_eh_deadline, store_shost_eh_deadline); -shost_rd_attr(use_blk_mq, "%d\n"); shost_rd_attr(unique_id, "%u\n"); shost_rd_attr(cmd_per_lun, "%hd\n"); shost_rd_attr(can_queue, "%hd\n"); @@ -386,6 +385,13 @@ show_host_busy(struct device *dev, struct device_attribute *attr, char *buf) } static DEVICE_ATTR(host_busy, S_IRUGO, show_host_busy, NULL); +static ssize_t +show_use_blk_mq(struct device *dev, struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "1\n"); +} +static DEVICE_ATTR(use_blk_mq, S_IRUGO, show_use_blk_mq, NULL); + static struct attribute *scsi_sysfs_shost_attrs[] = { &dev_attr_use_blk_mq.attr, &dev_attr_unique_id.attr, diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index 381668fa135d..d7035270d274 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -3592,7 +3592,7 @@ fc_bsg_job_timeout(struct request *req) /* the blk_end_sync_io() doesn't check the error */ if (inflight) - __blk_complete_request(req); + blk_mq_end_request(req, BLK_STS_IOERR); return BLK_EH_DONE; } @@ -3684,14 +3684,9 @@ static void fc_bsg_goose_queue(struct fc_rport *rport) { struct request_queue *q = rport->rqst_q; - unsigned long flags; - if (!q) - return; - - spin_lock_irqsave(q->queue_lock, flags); - blk_run_queue_async(q); - spin_unlock_irqrestore(q->queue_lock, flags); + if (q) + blk_mq_run_hw_queues(q, true); } /** @@ -3759,6 +3754,37 @@ static int fc_bsg_dispatch(struct bsg_job *job) return fc_bsg_host_dispatch(shost, job); } +static blk_status_t fc_bsg_rport_prep(struct fc_rport *rport) +{ + if (rport->port_state == FC_PORTSTATE_BLOCKED && + !(rport->flags & FC_RPORT_FAST_FAIL_TIMEDOUT)) + return BLK_STS_RESOURCE; + + if (rport->port_state != FC_PORTSTATE_ONLINE) + return BLK_STS_IOERR; + + return BLK_STS_OK; +} + + +static int fc_bsg_dispatch_prep(struct bsg_job *job) +{ + struct fc_rport *rport = fc_bsg_to_rport(job); + blk_status_t ret; + + ret = fc_bsg_rport_prep(rport); + switch (ret) { + case BLK_STS_OK: + break; + case BLK_STS_RESOURCE: + return -EAGAIN; + default: + return -EIO; + } + + return fc_bsg_dispatch(job); +} + /** * fc_bsg_hostadd - Create and add the bsg hooks so we can receive requests * @shost: shost for fc_host @@ -3780,7 +3806,8 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host) snprintf(bsg_name, sizeof(bsg_name), "fc_host%d", shost->host_no); - q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, i->f->dd_bsg_size); + q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, fc_bsg_job_timeout, + i->f->dd_bsg_size); if (IS_ERR(q)) { dev_err(dev, "fc_host%d: bsg interface failed to initialize - setup queue\n", @@ -3788,26 +3815,11 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host) return PTR_ERR(q); } __scsi_init_queue(shost, q); - blk_queue_rq_timed_out(q, fc_bsg_job_timeout); blk_queue_rq_timeout(q, FC_DEFAULT_BSG_TIMEOUT); fc_host->rqst_q = q; return 0; } -static int fc_bsg_rport_prep(struct request_queue *q, struct request *req) -{ - struct fc_rport *rport = dev_to_rport(q->queuedata); - - if (rport->port_state == FC_PORTSTATE_BLOCKED && - !(rport->flags & FC_RPORT_FAST_FAIL_TIMEDOUT)) - return BLKPREP_DEFER; - - if (rport->port_state != FC_PORTSTATE_ONLINE) - return BLKPREP_KILL; - - return BLKPREP_OK; -} - /** * fc_bsg_rportadd - Create and add the bsg hooks so we can receive requests * @shost: shost that rport is attached to @@ -3825,15 +3837,13 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport) if (!i->f->bsg_request) return -ENOTSUPP; - q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch, - i->f->dd_bsg_size); + q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch_prep, + fc_bsg_job_timeout, i->f->dd_bsg_size); if (IS_ERR(q)) { dev_err(dev, "failed to setup bsg queue\n"); return PTR_ERR(q); } __scsi_init_queue(shost, q); - blk_queue_prep_rq(q, fc_bsg_rport_prep); - blk_queue_rq_timed_out(q, fc_bsg_job_timeout); blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); rport->rqst_q = q; return 0; @@ -3852,10 +3862,7 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport) static void fc_bsg_remove(struct request_queue *q) { - if (q) { - bsg_unregister_queue(q); - blk_cleanup_queue(q); - } + bsg_remove_queue(q); } diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 6fd2fe210fc3..ff123023e5a5 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -1542,7 +1542,7 @@ iscsi_bsg_host_add(struct Scsi_Host *shost, struct iscsi_cls_host *ihost) return -ENOTSUPP; snprintf(bsg_name, sizeof(bsg_name), "iscsi_host%d", shost->host_no); - q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, 0); + q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, NULL, 0); if (IS_ERR(q)) { shost_printk(KERN_ERR, shost, "bsg interface failed to " "initialize - no request queue\n"); @@ -1576,10 +1576,7 @@ static int iscsi_remove_host(struct transport_container *tc, struct Scsi_Host *shost = dev_to_shost(dev); struct iscsi_cls_host *ihost = shost->shost_data; - if (ihost->bsg_q) { - bsg_unregister_queue(ihost->bsg_q); - blk_cleanup_queue(ihost->bsg_q); - } + bsg_remove_queue(ihost->bsg_q); return 0; } diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c index 0a165b2b3e81..692b46937e52 100644 --- a/drivers/scsi/scsi_transport_sas.c +++ b/drivers/scsi/scsi_transport_sas.c @@ -198,7 +198,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) if (rphy) { q = bsg_setup_queue(&rphy->dev, dev_name(&rphy->dev), - sas_smp_dispatch, 0); + sas_smp_dispatch, NULL, 0); if (IS_ERR(q)) return PTR_ERR(q); rphy->q = q; @@ -207,7 +207,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) snprintf(name, sizeof(name), "sas_host%d", shost->host_no); q = bsg_setup_queue(&shost->shost_gendev, name, - sas_smp_dispatch, 0); + sas_smp_dispatch, NULL, 0); if (IS_ERR(q)) return PTR_ERR(q); to_sas_host_attrs(shost)->q = q; @@ -246,11 +246,7 @@ static int sas_host_remove(struct transport_container *tc, struct device *dev, struct Scsi_Host *shost = dev_to_shost(dev); struct request_queue *q = to_sas_host_attrs(shost)->q; - if (q) { - bsg_unregister_queue(q); - blk_cleanup_queue(q); - } - + bsg_remove_queue(q); return 0; } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index bd0a5c694a97..a1a44f52e0e8 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -114,7 +114,7 @@ static int sd_suspend_system(struct device *); static int sd_suspend_runtime(struct device *); static int sd_resume(struct device *); static void sd_rescan(struct device *); -static int sd_init_command(struct scsi_cmnd *SCpnt); +static blk_status_t sd_init_command(struct scsi_cmnd *SCpnt); static void sd_uninit_command(struct scsi_cmnd *SCpnt); static int sd_done(struct scsi_cmnd *); static void sd_eh_reset(struct scsi_cmnd *); @@ -751,7 +751,7 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); } -static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) +static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) { struct scsi_device *sdp = cmd->device; struct request *rq = cmd->request; @@ -762,7 +762,7 @@ static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); if (!rq->special_vec.bv_page) - return BLKPREP_DEFER; + return BLK_STS_RESOURCE; clear_highpage(rq->special_vec.bv_page); rq->special_vec.bv_offset = 0; rq->special_vec.bv_len = data_len; @@ -786,7 +786,8 @@ static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) return scsi_init_io(cmd); } -static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap) +static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, + bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = cmd->request; @@ -796,7 +797,7 @@ static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap) rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); if (!rq->special_vec.bv_page) - return BLKPREP_DEFER; + return BLK_STS_RESOURCE; clear_highpage(rq->special_vec.bv_page); rq->special_vec.bv_offset = 0; rq->special_vec.bv_len = data_len; @@ -817,7 +818,8 @@ static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap) return scsi_init_io(cmd); } -static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap) +static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, + bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = cmd->request; @@ -827,7 +829,7 @@ static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap) rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); if (!rq->special_vec.bv_page) - return BLKPREP_DEFER; + return BLK_STS_RESOURCE; clear_highpage(rq->special_vec.bv_page); rq->special_vec.bv_offset = 0; rq->special_vec.bv_len = data_len; @@ -848,7 +850,7 @@ static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap) return scsi_init_io(cmd); } -static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) +static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; struct scsi_device *sdp = cmd->device; @@ -866,7 +868,7 @@ static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) } if (sdp->no_write_same) - return BLKPREP_INVALID; + return BLK_STS_TARGET; if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff) return sd_setup_write_same16_cmnd(cmd, false); @@ -943,7 +945,7 @@ out: * Will set up either WRITE SAME(10) or WRITE SAME(16) depending on * the preference indicated by the target device. **/ -static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) +static blk_status_t sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; struct scsi_device *sdp = cmd->device; @@ -952,10 +954,10 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) sector_t sector = blk_rq_pos(rq); unsigned int nr_sectors = blk_rq_sectors(rq); unsigned int nr_bytes = blk_rq_bytes(rq); - int ret; + blk_status_t ret; if (sdkp->device->no_write_same) - return BLKPREP_INVALID; + return BLK_STS_TARGET; BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size); @@ -996,7 +998,7 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) return ret; } -static int sd_setup_flush_cmnd(struct scsi_cmnd *cmd) +static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; @@ -1009,10 +1011,10 @@ static int sd_setup_flush_cmnd(struct scsi_cmnd *cmd) cmd->allowed = SD_MAX_RETRIES; rq->timeout = rq->q->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; - return BLKPREP_OK; + return BLK_STS_OK; } -static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) +static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) { struct request *rq = SCpnt->request; struct scsi_device *sdp = SCpnt->device; @@ -1022,18 +1024,14 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) sector_t threshold; unsigned int this_count = blk_rq_sectors(rq); unsigned int dif, dix; - int ret; unsigned char protect; + blk_status_t ret; ret = scsi_init_io(SCpnt); - if (ret != BLKPREP_OK) + if (ret != BLK_STS_OK) return ret; WARN_ON_ONCE(SCpnt != rq->special); - /* from here on until we're complete, any goto out - * is used for a killable error condition */ - ret = BLKPREP_KILL; - SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt, "%s: block=%llu, count=%d\n", @@ -1046,7 +1044,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) blk_rq_sectors(rq))); SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, "Retry with 0x%p\n", SCpnt)); - goto out; + return BLK_STS_IOERR; } if (sdp->changed) { @@ -1055,7 +1053,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) * the changed bit has been reset */ /* printk("SCSI disk has been changed or is not present. Prohibiting further I/O.\n"); */ - goto out; + return BLK_STS_IOERR; } /* @@ -1093,31 +1091,28 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) if ((block & 1) || (blk_rq_sectors(rq) & 1)) { scmd_printk(KERN_ERR, SCpnt, "Bad block number requested\n"); - goto out; - } else { - block = block >> 1; - this_count = this_count >> 1; + return BLK_STS_IOERR; } + block = block >> 1; + this_count = this_count >> 1; } if (sdp->sector_size == 2048) { if ((block & 3) || (blk_rq_sectors(rq) & 3)) { scmd_printk(KERN_ERR, SCpnt, "Bad block number requested\n"); - goto out; - } else { - block = block >> 2; - this_count = this_count >> 2; + return BLK_STS_IOERR; } + block = block >> 2; + this_count = this_count >> 2; } if (sdp->sector_size == 4096) { if ((block & 7) || (blk_rq_sectors(rq) & 7)) { scmd_printk(KERN_ERR, SCpnt, "Bad block number requested\n"); - goto out; - } else { - block = block >> 3; - this_count = this_count >> 3; + return BLK_STS_IOERR; } + block = block >> 3; + this_count = this_count >> 3; } if (rq_data_dir(rq) == WRITE) { SCpnt->cmnd[0] = WRITE_6; @@ -1129,7 +1124,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) SCpnt->cmnd[0] = READ_6; } else { scmd_printk(KERN_ERR, SCpnt, "Unknown command %d\n", req_op(rq)); - goto out; + return BLK_STS_IOERR; } SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, @@ -1149,10 +1144,8 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) { SCpnt->cmnd = mempool_alloc(sd_cdb_pool, GFP_ATOMIC); - if (unlikely(SCpnt->cmnd == NULL)) { - ret = BLKPREP_DEFER; - goto out; - } + if (unlikely(!SCpnt->cmnd)) + return BLK_STS_RESOURCE; SCpnt->cmd_len = SD_EXT_CDB_SIZE; memset(SCpnt->cmnd, 0, SCpnt->cmd_len); @@ -1220,7 +1213,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) */ scmd_printk(KERN_ERR, SCpnt, "FUA write on READ/WRITE(6) drive\n"); - goto out; + return BLK_STS_IOERR; } SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f); @@ -1244,12 +1237,10 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) * This indicates that the command is ready from our end to be * queued. */ - ret = BLKPREP_OK; - out: - return ret; + return BLK_STS_OK; } -static int sd_init_command(struct scsi_cmnd *cmd) +static blk_status_t sd_init_command(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; @@ -1265,7 +1256,7 @@ static int sd_init_command(struct scsi_cmnd *cmd) case SD_LBP_ZERO: return sd_setup_write_same10_cmnd(cmd, false); default: - return BLKPREP_INVALID; + return BLK_STS_TARGET; } case REQ_OP_WRITE_ZEROES: return sd_setup_write_zeroes_cmnd(cmd); @@ -1280,7 +1271,7 @@ static int sd_init_command(struct scsi_cmnd *cmd) return sd_zbc_setup_reset_cmnd(cmd); default: WARN_ON_ONCE(1); - return BLKPREP_KILL; + return BLK_STS_NOTSUPP; } } diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 1d63f3a23ffb..7f43e6839bce 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -271,7 +271,7 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp) extern int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer); extern void sd_zbc_print_zones(struct scsi_disk *sdkp); -extern int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd); +extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd); extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr); extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, @@ -288,9 +288,9 @@ static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, static inline void sd_zbc_print_zones(struct scsi_disk *sdkp) {} -static inline int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) +static inline blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) { - return BLKPREP_INVALID; + return BLK_STS_TARGET; } static inline void sd_zbc_complete(struct scsi_cmnd *cmd, diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index e06c48c866e4..83365b29a4d8 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -185,7 +185,7 @@ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp) * * Called from sd_init_command() for a REQ_OP_ZONE_RESET request. */ -int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) +blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); @@ -194,14 +194,14 @@ int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) if (!sd_is_zoned(sdkp)) /* Not a zoned device */ - return BLKPREP_KILL; + return BLK_STS_IOERR; if (sdkp->device->changed) - return BLKPREP_KILL; + return BLK_STS_IOERR; if (sector & (sd_zbc_zone_sectors(sdkp) - 1)) /* Unaligned request */ - return BLKPREP_KILL; + return BLK_STS_IOERR; cmd->cmd_len = 16; memset(cmd->cmnd, 0, cmd->cmd_len); @@ -214,7 +214,7 @@ int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) cmd->transfersize = 0; cmd->allowed = 0; - return BLKPREP_OK; + return BLK_STS_OK; } /** diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index c6ad00703c5b..4e27460ec926 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1390,7 +1390,7 @@ sg_rq_end_io(struct request *rq, blk_status_t status) */ srp->rq = NULL; scsi_req_free_cmd(scsi_req(rq)); - __blk_put_request(rq->q, rq); + blk_put_request(rq); write_lock_irqsave(&sfp->rq_list_lock, iflags); if (unlikely(srp->orphan)) { diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index a25a07a0b7f0..bac084260d80 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -5319,7 +5319,8 @@ static int pqi_map_queues(struct Scsi_Host *shost) { struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); - return blk_mq_pci_map_queues(&shost->tag_set, ctrl_info->pci_dev, 0); + return blk_mq_pci_map_queues(&shost->tag_set.map[0], + ctrl_info->pci_dev, 0); } static int pqi_getpciinfo_ioctl(struct pqi_ctrl_info *ctrl_info, diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 54dd70ae9731..38ddbbfe5f3c 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -80,7 +80,7 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_WORM); static DEFINE_MUTEX(sr_mutex); static int sr_probe(struct device *); static int sr_remove(struct device *); -static int sr_init_command(struct scsi_cmnd *SCpnt); +static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt); static int sr_done(struct scsi_cmnd *); static int sr_runtime_suspend(struct device *dev); @@ -384,22 +384,22 @@ static int sr_done(struct scsi_cmnd *SCpnt) return good_bytes; } -static int sr_init_command(struct scsi_cmnd *SCpnt) +static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt) { int block = 0, this_count, s_size; struct scsi_cd *cd; struct request *rq = SCpnt->request; - int ret; + blk_status_t ret; ret = scsi_init_io(SCpnt); - if (ret != BLKPREP_OK) + if (ret != BLK_STS_OK) goto out; WARN_ON_ONCE(SCpnt != rq->special); cd = scsi_cd(rq->rq_disk); /* from here on until we're complete, any goto out * is used for a killable error condition */ - ret = BLKPREP_KILL; + ret = BLK_STS_IOERR; SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt, "Doing sr request, block = %d\n", block)); @@ -516,7 +516,7 @@ static int sr_init_command(struct scsi_cmnd *SCpnt) * This indicates that the command is ready from our end to be * queued. */ - ret = BLKPREP_OK; + ret = BLK_STS_OK; out: return ret; } diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 307df2fa39a3..7ff22d3f03e3 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -530,7 +530,7 @@ static void st_scsi_execute_end(struct request *req, blk_status_t status) complete(SRpnt->waiting); blk_rq_unmap_user(tmp); - __blk_put_request(req->q, req); + blk_put_request(req); } static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, diff --git a/drivers/scsi/ufs/ufs_bsg.c b/drivers/scsi/ufs/ufs_bsg.c index e5f8e54bf644..775bb4e5e36e 100644 --- a/drivers/scsi/ufs/ufs_bsg.c +++ b/drivers/scsi/ufs/ufs_bsg.c @@ -157,7 +157,7 @@ void ufs_bsg_remove(struct ufs_hba *hba) if (!hba->bsg_queue) return; - bsg_unregister_queue(hba->bsg_queue); + bsg_remove_queue(hba->bsg_queue); device_del(bsg_dev); put_device(bsg_dev); @@ -193,7 +193,7 @@ int ufs_bsg_probe(struct ufs_hba *hba) if (ret) goto out; - q = bsg_setup_queue(bsg_dev, dev_name(bsg_dev), ufs_bsg_request, 0); + q = bsg_setup_queue(bsg_dev, dev_name(bsg_dev), ufs_bsg_request, NULL, 0); if (IS_ERR(q)) { ret = PTR_ERR(q); goto out; diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 1c72db94270e..c3c95b314286 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -719,8 +719,9 @@ static void virtscsi_target_destroy(struct scsi_target *starget) static int virtscsi_map_queues(struct Scsi_Host *shost) { struct virtio_scsi *vscsi = shost_priv(shost); + struct blk_mq_queue_map *qmap = &shost->tag_set.map[0]; - return blk_mq_virtio_map_queues(&shost->tag_set, vscsi->vdev, 2); + return blk_mq_virtio_map_queues(qmap, vscsi->vdev, 2); } /* diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c index 36b742932c72..86987da86dd6 100644 --- a/drivers/target/iscsi/iscsi_target_util.c +++ b/drivers/target/iscsi/iscsi_target_util.c @@ -150,24 +150,26 @@ void iscsit_free_r2ts_from_list(struct iscsi_cmd *cmd) static int iscsit_wait_for_tag(struct se_session *se_sess, int state, int *cpup) { int tag = -1; - DEFINE_WAIT(wait); + DEFINE_SBQ_WAIT(wait); struct sbq_wait_state *ws; + struct sbitmap_queue *sbq; if (state == TASK_RUNNING) return tag; - ws = &se_sess->sess_tag_pool.ws[0]; + sbq = &se_sess->sess_tag_pool; + ws = &sbq->ws[0]; for (;;) { - prepare_to_wait_exclusive(&ws->wait, &wait, state); + sbitmap_prepare_to_wait(sbq, ws, &wait, state); if (signal_pending_state(state, current)) break; - tag = sbitmap_queue_get(&se_sess->sess_tag_pool, cpup); + tag = sbitmap_queue_get(sbq, cpup); if (tag >= 0) break; schedule(); } - finish_wait(&ws->wait, &wait); + sbitmap_finish_wait(sbq, ws, &wait); return tag; } diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 47d76c862014..c062d363dce3 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1094,7 +1094,7 @@ static void pscsi_req_done(struct request *req, blk_status_t status) break; } - __blk_put_request(req->q, req); + blk_put_request(req); kfree(pt); } diff --git a/fs/aio.c b/fs/aio.c index 76f72509f8c5..0f99cad35ffe 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1438,17 +1438,22 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); - fput(req->ki_filp); - return ret; + goto out_fput; } req->ki_ioprio = iocb->aio_reqprio; } else - req->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + req->ki_ioprio = get_current_ioprio(); ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); if (unlikely(ret)) - fput(req->ki_filp); + goto out_fput; + + req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ + return 0; + +out_fput: + fput(req->ki_filp); return ret; } diff --git a/fs/block_dev.c b/fs/block_dev.c index a80b4f0ee7c4..e1886cc7048f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -181,7 +181,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio) struct task_struct *waiter = bio->bi_private; WRITE_ONCE(bio->bi_private, NULL); - wake_up_process(waiter); + blk_wake_io_task(waiter); } static ssize_t @@ -232,14 +232,18 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio.bi_opf = dio_bio_write_op(iocb); task_io_account_write(ret); } + if (iocb->ki_flags & IOCB_HIPRI) + bio.bi_opf |= REQ_HIPRI; qc = submit_bio(&bio); for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(bio.bi_private)) break; + if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc, true)) io_schedule(); } __set_current_state(TASK_RUNNING); @@ -298,12 +302,13 @@ static void blkdev_bio_end_io(struct bio *bio) } dio->iocb->ki_complete(iocb, ret, 0); - bio_put(&dio->bio); + if (dio->multi_bio) + bio_put(&dio->bio); } else { struct task_struct *waiter = dio->waiter; WRITE_ONCE(dio->waiter, NULL); - wake_up_process(waiter); + blk_wake_io_task(waiter); } } @@ -328,6 +333,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; + bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; bool is_read = (iov_iter_rw(iter) == READ), is_sync; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; @@ -338,20 +344,27 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) return -EINVAL; bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); - bio_get(bio); /* extra ref for the completion handler */ dio = container_of(bio, struct blkdev_dio, bio); dio->is_sync = is_sync = is_sync_kiocb(iocb); - if (dio->is_sync) + if (dio->is_sync) { dio->waiter = current; - else + bio_get(bio); + } else { dio->iocb = iocb; + } dio->size = 0; dio->multi_bio = false; dio->should_dirty = is_read && iter_is_iovec(iter); - blk_start_plug(&plug); + /* + * Don't plug for HIPRI/polled IO, as those should go straight + * to issue + */ + if (!is_poll) + blk_start_plug(&plug); + for (;;) { bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> 9; @@ -381,11 +394,21 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); if (!nr_pages) { + if (iocb->ki_flags & IOCB_HIPRI) + bio->bi_opf |= REQ_HIPRI; + qc = submit_bio(bio); break; } if (!dio->multi_bio) { + /* + * AIO needs an extra reference to ensure the dio + * structure which is embedded into the first bio + * stays around. + */ + if (!is_sync) + bio_get(bio); dio->multi_bio = true; atomic_set(&dio->ref, 2); } else { @@ -395,18 +418,21 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) submit_bio(bio); bio = bio_alloc(GFP_KERNEL, nr_pages); } - blk_finish_plug(&plug); + + if (!is_poll) + blk_finish_plug(&plug); if (!is_sync) return -EIOCBQUEUED; for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->waiter)) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc, true)) io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/fs/buffer.c b/fs/buffer.c index 1286c2b95498..d60d61e8ed7d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3060,11 +3060,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, */ bio = bio_alloc(GFP_NOIO, 1); - if (wbc) { - wbc_init_bio(wbc, bio); - wbc_account_io(wbc, bh->b_page, bh->b_size); - } - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; @@ -3084,6 +3079,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, op_flags |= REQ_PRIO; bio_set_op_attrs(bio, op, op_flags); + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } + submit_bio(bio); return 0; } diff --git a/fs/direct-io.c b/fs/direct-io.c index 41a0e97252ae..dbc1a1f080ce 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -518,7 +518,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(dio->bio_disk->queue, dio->bio_cookie)) + !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); @@ -1265,6 +1265,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, } else { dio->op = REQ_OP_READ; } + if (iocb->ki_flags & IOCB_HIPRI) + dio->op_flags |= REQ_HIPRI; /* * For AIO O_(D)SYNC writes we need to defer completions to a workqueue diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index db7590178dfc..2aa62d58d8dd 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -374,13 +374,13 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); if (!bio) return -ENOMEM; - wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; io->io_next_block = bh->b_blocknr; + wbc_init_bio(io->io_wbc, bio); return 0; } diff --git a/fs/iomap.c b/fs/iomap.c index ce837d962d47..e87c288cd5ef 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1543,7 +1543,7 @@ static void iomap_dio_bio_end_io(struct bio *bio) if (dio->wait_for_completion) { struct task_struct *waiter = dio->submit.waiter; WRITE_ONCE(dio->submit.waiter, NULL); - wake_up_process(waiter); + blk_wake_io_task(waiter); } else if (dio->flags & IOMAP_DIO_WRITE) { struct inode *inode = file_inode(dio->iocb->ki_filp); @@ -1571,6 +1571,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, unsigned len) { struct page *page = ZERO_PAGE(0); + int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; bio = bio_alloc(GFP_KERNEL, 1); @@ -1579,9 +1580,12 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; + if (dio->iocb->ki_flags & IOCB_HIPRI) + flags |= REQ_HIPRI; + get_page(page); __bio_add_page(bio, page, len, 0); - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + bio_set_op_attrs(bio, REQ_OP_WRITE, flags); atomic_inc(&dio->ref); return submit_bio(bio); @@ -1687,6 +1691,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, bio_set_pages_dirty(bio); } + if (dio->iocb->ki_flags & IOCB_HIPRI) + bio->bi_opf |= REQ_HIPRI; + iov_iter_advance(dio->submit.iter, n); dio->size += n; @@ -1914,14 +1921,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, return -EIOCBQUEUED; for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->submit.waiter)) break; if (!(iocb->ki_flags & IOCB_HIPRI) || !dio->submit.last_queue || !blk_poll(dio->submit.last_queue, - dio->submit.cookie)) + dio->submit.cookie, true)) io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/include/linux/bio.h b/include/linux/bio.h index 056fb627edb3..7380b094dcca 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -491,35 +491,40 @@ do { \ bio_clear_flag(bio, BIO_THROTTLED);\ (bio)->bi_disk = (bdev)->bd_disk; \ (bio)->bi_partno = (bdev)->bd_partno; \ + bio_associate_blkg(bio); \ } while (0) #define bio_copy_dev(dst, src) \ do { \ (dst)->bi_disk = (src)->bi_disk; \ (dst)->bi_partno = (src)->bi_partno; \ + bio_clone_blkg_association(dst, src); \ } while (0) #define bio_dev(bio) \ disk_devt((bio)->bi_disk) #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -int bio_associate_blkcg_from_page(struct bio *bio, struct page *page); +void bio_associate_blkg_from_page(struct bio *bio, struct page *page); #else -static inline int bio_associate_blkcg_from_page(struct bio *bio, - struct page *page) { return 0; } +static inline void bio_associate_blkg_from_page(struct bio *bio, + struct page *page) { } #endif #ifdef CONFIG_BLK_CGROUP -int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); -int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); -void bio_disassociate_task(struct bio *bio); -void bio_clone_blkcg_association(struct bio *dst, struct bio *src); +void bio_disassociate_blkg(struct bio *bio); +void bio_associate_blkg(struct bio *bio); +void bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css); +void bio_clone_blkg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ -static inline int bio_associate_blkcg(struct bio *bio, - struct cgroup_subsys_state *blkcg_css) { return 0; } -static inline void bio_disassociate_task(struct bio *bio) { } -static inline void bio_clone_blkcg_association(struct bio *dst, - struct bio *src) { } +static inline void bio_disassociate_blkg(struct bio *bio) { } +static inline void bio_associate_blkg(struct bio *bio) { } +static inline void bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css) +{ } +static inline void bio_clone_blkg_association(struct bio *dst, + struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_HIGHMEM diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 6d766a19f2bb..f025fd1e22e6 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -21,6 +21,7 @@ #include #include #include +#include /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) @@ -122,11 +123,8 @@ struct blkcg_gq { /* all non-root blkcg_gq's are guaranteed to have access to parent */ struct blkcg_gq *parent; - /* request allocation list for this blkcg-q pair */ - struct request_list rl; - /* reference count */ - atomic_t refcnt; + struct percpu_ref refcnt; /* is this blkg online? protected by both blkcg and q locks */ bool online; @@ -184,6 +182,8 @@ extern struct cgroup_subsys_state * const blkcg_root_css; struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint); +struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q); int blkcg_init_queue(struct request_queue *q); @@ -230,22 +230,62 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); +/** + * blkcg_css - find the current css + * + * Find the css associated with either the kthread or the current task. + * This may return a dying css, so it is up to the caller to use tryget logic + * to confirm it is alive and well. + */ +static inline struct cgroup_subsys_state *blkcg_css(void) +{ + struct cgroup_subsys_state *css; + + css = kthread_blkcg(); + if (css) + return css; + return task_css(current, io_cgrp_id); +} static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } +/** + * __bio_blkcg - internal, inconsistent version to get blkcg + * + * DO NOT USE. + * This function is inconsistent and consequently is dangerous to use. The + * first part of the function returns a blkcg where a reference is owned by the + * bio. This means it does not need to be rcu protected as it cannot go away + * with the bio owning a reference to it. However, the latter potentially gets + * it from task_css(). This can race against task migration and the cgroup + * dying. It is also semantically different as it must be called rcu protected + * and is susceptible to failure when trying to get a reference to it. + * Therefore, it is not ok to assume that *_get() will always succeed on the + * blkcg returned here. + */ +static inline struct blkcg *__bio_blkcg(struct bio *bio) +{ + if (bio && bio->bi_blkg) + return bio->bi_blkg->blkcg; + return css_to_blkcg(blkcg_css()); +} + +/** + * bio_blkcg - grab the blkcg associated with a bio + * @bio: target bio + * + * This returns the blkcg associated with a bio, %NULL if not associated. + * Callers are expected to either handle %NULL or know association has been + * done prior to calling this. + */ static inline struct blkcg *bio_blkcg(struct bio *bio) { - struct cgroup_subsys_state *css; - - if (bio && bio->bi_css) - return css_to_blkcg(bio->bi_css); - css = kthread_blkcg(); - if (css) - return css_to_blkcg(css); - return css_to_blkcg(task_css(current, io_cgrp_id)); + if (bio && bio->bi_blkg) + return bio->bi_blkg->blkcg; + return NULL; } static inline bool blk_cgroup_congested(void) @@ -328,16 +368,12 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock and is guaranteed to return %NULL if @q is bypassing - * - see blk_queue_bypass_start() for details. + * under RCU read loc. */ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { WARN_ON_ONCE(!rcu_read_lock_held()); - - if (unlikely(blk_queue_bypass(q))) - return NULL; return __blkg_lookup(blkcg, q, false); } @@ -451,26 +487,35 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) */ static inline void blkg_get(struct blkcg_gq *blkg) { - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - atomic_inc(&blkg->refcnt); + percpu_ref_get(&blkg->refcnt); } /** - * blkg_try_get - try and get a blkg reference + * blkg_tryget - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. */ -static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg) +static inline bool blkg_tryget(struct blkcg_gq *blkg) { - if (atomic_inc_not_zero(&blkg->refcnt)) - return blkg; - return NULL; + return percpu_ref_tryget(&blkg->refcnt); } +/** + * blkg_tryget_closest - try and get a blkg ref on the closet blkg + * @blkg: blkg to get + * + * This walks up the blkg tree to find the closest non-dying blkg and returns + * the blkg that it did association with as it may not be the passed in blkg. + */ +static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg) +{ + while (blkg && !percpu_ref_tryget(&blkg->refcnt)) + blkg = blkg->parent; -void __blkg_release_rcu(struct rcu_head *rcu); + return blkg; +} /** * blkg_put - put a blkg reference @@ -478,9 +523,7 @@ void __blkg_release_rcu(struct rcu_head *rcu); */ static inline void blkg_put(struct blkcg_gq *blkg) { - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - if (atomic_dec_and_test(&blkg->refcnt)) - call_rcu(&blkg->rcu_head, __blkg_release_rcu); + percpu_ref_put(&blkg->refcnt); } /** @@ -515,94 +558,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q, false))) -/** - * blk_get_rl - get request_list to use - * @q: request_queue of interest - * @bio: bio which will be attached to the allocated request (may be %NULL) - * - * The caller wants to allocate a request from @q to use for @bio. Find - * the request_list to use and obtain a reference on it. Should be called - * under queue_lock. This function is guaranteed to return non-%NULL - * request_list. - */ -static inline struct request_list *blk_get_rl(struct request_queue *q, - struct bio *bio) -{ - struct blkcg *blkcg; - struct blkcg_gq *blkg; - - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - - /* bypass blkg lookup and use @q->root_rl directly for root */ - if (blkcg == &blkcg_root) - goto root_rl; - - /* - * Try to use blkg->rl. blkg lookup may fail under memory pressure - * or if either the blkcg or queue is going away. Fall back to - * root_rl in such cases. - */ - blkg = blkg_lookup(blkcg, q); - if (unlikely(!blkg)) - goto root_rl; - - blkg_get(blkg); - rcu_read_unlock(); - return &blkg->rl; -root_rl: - rcu_read_unlock(); - return &q->root_rl; -} - -/** - * blk_put_rl - put request_list - * @rl: request_list to put - * - * Put the reference acquired by blk_get_rl(). Should be called under - * queue_lock. - */ -static inline void blk_put_rl(struct request_list *rl) -{ - if (rl->blkg->blkcg != &blkcg_root) - blkg_put(rl->blkg); -} - -/** - * blk_rq_set_rl - associate a request with a request_list - * @rq: request of interest - * @rl: target request_list - * - * Associate @rq with @rl so that accounting and freeing can know the - * request_list @rq came from. - */ -static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) -{ - rq->rl = rl; -} - -/** - * blk_rq_rl - return the request_list a request came from - * @rq: request of interest - * - * Return the request_list @rq is allocated from. - */ -static inline struct request_list *blk_rq_rl(struct request *rq) -{ - return rq->rl; -} - -struct request_list *__blk_queue_next_rl(struct request_list *rl, - struct request_queue *q); -/** - * blk_queue_for_each_rl - iterate through all request_lists of a request_queue - * - * Should be used under queue_lock. - */ -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) - static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp) { int ret; @@ -797,32 +752,34 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg struct bio *bio) { return false; } #endif + +static inline void blkcg_bio_issue_init(struct bio *bio) +{ + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); +} + static inline bool blkcg_bio_issue_check(struct request_queue *q, struct bio *bio) { - struct blkcg *blkcg; struct blkcg_gq *blkg; bool throtl = false; rcu_read_lock(); - blkcg = bio_blkcg(bio); - /* associate blkcg if bio hasn't attached one */ - bio_associate_blkcg(bio, &blkcg->css); + if (!bio->bi_blkg) { + char b[BDEVNAME_SIZE]; - blkg = blkg_lookup(blkcg, q); - if (unlikely(!blkg)) { - spin_lock_irq(q->queue_lock); - blkg = blkg_lookup_create(blkcg, q); - if (IS_ERR(blkg)) - blkg = NULL; - spin_unlock_irq(q->queue_lock); + WARN_ONCE(1, + "no blkg associated for bio on block-device: %s\n", + bio_devname(bio, b)); + bio_associate_blkg(bio); } + blkg = bio->bi_blkg; + throtl = blk_throtl_bio(q, blkg, bio); if (!throtl) { - blkg = blkg ?: q->root_blkg; /* * If the bio is flagged with BIO_QUEUE_ENTERED it means this * is a split bio and we would have already accounted for the @@ -834,6 +791,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); } + blkcg_bio_issue_init(bio); + rcu_read_unlock(); return !throtl; } @@ -930,6 +889,7 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } +static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, @@ -939,12 +899,7 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } -static inline struct request_list *blk_get_rl(struct request_queue *q, - struct bio *bio) { return &q->root_rl; } -static inline void blk_put_rl(struct request_list *rl) { } -static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } -static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } - +static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline bool blkcg_bio_issue_check(struct request_queue *q, struct bio *bio) { return true; } diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h index 9f4c17f0d2d8..0b1f45c62623 100644 --- a/include/linux/blk-mq-pci.h +++ b/include/linux/blk-mq-pci.h @@ -2,10 +2,10 @@ #ifndef _LINUX_BLK_MQ_PCI_H #define _LINUX_BLK_MQ_PCI_H -struct blk_mq_tag_set; +struct blk_mq_queue_map; struct pci_dev; -int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev, +int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, int offset); #endif /* _LINUX_BLK_MQ_PCI_H */ diff --git a/include/linux/blk-mq-rdma.h b/include/linux/blk-mq-rdma.h index b4ade198007d..7b6ecf9ac4c3 100644 --- a/include/linux/blk-mq-rdma.h +++ b/include/linux/blk-mq-rdma.h @@ -4,7 +4,7 @@ struct blk_mq_tag_set; struct ib_device; -int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, +int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, struct ib_device *dev, int first_vec); #endif /* _LINUX_BLK_MQ_RDMA_H */ diff --git a/include/linux/blk-mq-virtio.h b/include/linux/blk-mq-virtio.h index 69b4da262c45..687ae287e1dc 100644 --- a/include/linux/blk-mq-virtio.h +++ b/include/linux/blk-mq-virtio.h @@ -2,10 +2,10 @@ #ifndef _LINUX_BLK_MQ_VIRTIO_H #define _LINUX_BLK_MQ_VIRTIO_H -struct blk_mq_tag_set; +struct blk_mq_queue_map; struct virtio_device; -int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set, +int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, struct virtio_device *vdev, int first_vec); #endif /* _LINUX_BLK_MQ_VIRTIO_H */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2286dc12c6bc..0e030f5f76b6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -37,7 +37,8 @@ struct blk_mq_hw_ctx { struct blk_mq_ctx *dispatch_from; unsigned int dispatch_busy; - unsigned int nr_ctx; + unsigned short type; + unsigned short nr_ctx; struct blk_mq_ctx **ctxs; spinlock_t dispatch_wait_lock; @@ -74,10 +75,31 @@ struct blk_mq_hw_ctx { struct srcu_struct srcu[0]; }; +struct blk_mq_queue_map { + unsigned int *mq_map; + unsigned int nr_queues; + unsigned int queue_offset; +}; + +enum hctx_type { + HCTX_TYPE_DEFAULT, /* all I/O not otherwise accounted for */ + HCTX_TYPE_READ, /* just for READ I/O */ + HCTX_TYPE_POLL, /* polled I/O of any kind */ + + HCTX_MAX_TYPES, +}; + struct blk_mq_tag_set { - unsigned int *mq_map; + /* + * map[] holds ctx -> hctx mappings, one map exists for each type + * that the driver wishes to support. There are no restrictions + * on maps being of the same size, and it's perfectly legal to + * share maps between types. + */ + struct blk_mq_queue_map map[HCTX_MAX_TYPES]; + unsigned int nr_maps; /* nr entries in map[] */ const struct blk_mq_ops *ops; - unsigned int nr_hw_queues; + unsigned int nr_hw_queues; /* nr hw queues across maps */ unsigned int queue_depth; /* max hw supported */ unsigned int reserved_tags; unsigned int cmd_size; /* per-request extra data */ @@ -99,6 +121,7 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); +typedef void (commit_rqs_fn)(struct blk_mq_hw_ctx *); typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *); typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); @@ -109,11 +132,13 @@ typedef int (init_request_fn)(struct blk_mq_tag_set *set, struct request *, typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *, unsigned int); -typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, +typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, bool); -typedef void (busy_tag_iter_fn)(struct request *, void *, bool); -typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int); +typedef bool (busy_tag_iter_fn)(struct request *, void *, bool); +typedef int (poll_fn)(struct blk_mq_hw_ctx *); typedef int (map_queues_fn)(struct blk_mq_tag_set *set); +typedef bool (busy_fn)(struct request_queue *); +typedef void (complete_fn)(struct request *); struct blk_mq_ops { @@ -122,6 +147,15 @@ struct blk_mq_ops { */ queue_rq_fn *queue_rq; + /* + * If a driver uses bd->last to judge when to submit requests to + * hardware, it must define this function. In case of errors that + * make us stop issuing further requests, this hook serves the + * purpose of kicking the hardware (which the last request otherwise + * would have done). + */ + commit_rqs_fn *commit_rqs; + /* * Reserve budget before queue request, once .queue_rq is * run, it is driver's responsibility to release the @@ -141,7 +175,7 @@ struct blk_mq_ops { */ poll_fn *poll; - softirq_done_fn *complete; + complete_fn *complete; /* * Called when the block layer side of a hardware queue has been @@ -165,6 +199,11 @@ struct blk_mq_ops { /* Called from inside blk_get_request() */ void (*initialize_rq_fn)(struct request *rq); + /* + * If set, returns whether or not this queue currently is busy + */ + busy_fn *busy; + map_queues_fn *map_queues; #ifdef CONFIG_BLK_DEBUG_FS @@ -218,6 +257,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); +bool blk_mq_queue_inflight(struct request_queue *q); + enum { /* return when out of requests */ BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), @@ -264,7 +305,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); -void blk_mq_complete_request(struct request *rq); +bool blk_mq_complete_request(struct request *rq); bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio); bool blk_mq_queue_stopped(struct request_queue *q); @@ -288,24 +329,12 @@ void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -int blk_mq_map_queues(struct blk_mq_tag_set *set); +int blk_mq_map_queues(struct blk_mq_queue_map *qmap); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); -/** - * blk_mq_mark_complete() - Set request state to complete - * @rq: request to set to complete state - * - * Returns true if request state was successfully set to complete. If - * successful, the caller is responsibile for seeing this request is ended, as - * blk_mq_complete_request will not work again. - */ -static inline bool blk_mq_mark_complete(struct request *rq) -{ - return cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) == - MQ_RQ_IN_FLIGHT; -} +unsigned int blk_mq_rq_cpu(struct request *rq); /* * Driver command data is immediately after the request. So subtract request @@ -328,4 +357,14 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq) for ((i) = 0; (i) < (hctx)->nr_ctx && \ ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++) +static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + if (rq->tag != -1) + return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT); + + return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) | + BLK_QC_T_INTERNAL; +} + #endif diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1dcf652ba0aa..5c7e7f859a24 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -174,11 +174,11 @@ struct bio { void *bi_private; #ifdef CONFIG_BLK_CGROUP /* - * Optional ioc and css associated with this bio. Put on bio - * release. Read comment on top of bio_associate_current(). + * Represents the association of the css and request_queue for the bio. + * If a bio goes direct to device, it will not have a blkg as it will + * not have a request_queue associated with it. The reference is put + * on release of the bio. */ - struct io_context *bi_ioc; - struct cgroup_subsys_state *bi_css; struct blkcg_gq *bi_blkg; struct bio_issue bi_issue; #endif @@ -228,6 +228,7 @@ struct bio { #define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion * of this bio. */ #define BIO_QUEUE_ENTERED 11 /* can use blk_queue_enter_live() */ +#define BIO_TRACKED 12 /* set if bio goes through the rq_qos path */ /* See BVEC_POOL_OFFSET below before adding new flags */ @@ -323,6 +324,8 @@ enum req_flag_bits { /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ + __REQ_HIPRI, + /* for driver use */ __REQ_DRV, __REQ_SWAP, /* swapping request. */ @@ -343,8 +346,8 @@ enum req_flag_bits { #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_NOWAIT (1ULL << __REQ_NOWAIT) - #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) +#define REQ_HIPRI (1ULL << __REQ_HIPRI) #define REQ_DRV (1ULL << __REQ_DRV) #define REQ_SWAP (1ULL << __REQ_SWAP) @@ -422,17 +425,6 @@ static inline bool blk_qc_t_valid(blk_qc_t cookie) return cookie != BLK_QC_T_NONE; } -static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num, - bool internal) -{ - blk_qc_t ret = tag | (queue_num << BLK_QC_T_SHIFT); - - if (internal) - ret |= BLK_QC_T_INTERNAL; - - return ret; -} - static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie) { return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4293dc1cd160..45552e6eae1e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -58,25 +58,6 @@ struct blk_stat_callback; typedef void (rq_end_io_fn)(struct request *, blk_status_t); -#define BLK_RL_SYNCFULL (1U << 0) -#define BLK_RL_ASYNCFULL (1U << 1) - -struct request_list { - struct request_queue *q; /* the queue this rl belongs to */ -#ifdef CONFIG_BLK_CGROUP - struct blkcg_gq *blkg; /* blkg this request pool belongs to */ -#endif - /* - * count[], starved[], and wait[] are indexed by - * BLK_RW_SYNC/BLK_RW_ASYNC - */ - int count[2]; - int starved[2]; - mempool_t *rq_pool; - wait_queue_head_t wait[2]; - unsigned int flags; -}; - /* * request flags */ typedef __u32 __bitwise req_flags_t; @@ -85,8 +66,6 @@ typedef __u32 __bitwise req_flags_t; #define RQF_SORTED ((__force req_flags_t)(1 << 0)) /* drive already may have started this one */ #define RQF_STARTED ((__force req_flags_t)(1 << 1)) -/* uses tagged queueing */ -#define RQF_QUEUED ((__force req_flags_t)(1 << 2)) /* may not be passed by ioscheduler */ #define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3)) /* request for flush sequence */ @@ -150,8 +129,8 @@ enum mq_rq_state { struct request { struct request_queue *q; struct blk_mq_ctx *mq_ctx; + struct blk_mq_hw_ctx *mq_hctx; - int cpu; unsigned int cmd_flags; /* op and common flags */ req_flags_t rq_flags; @@ -245,11 +224,7 @@ struct request { refcount_t ref; unsigned int timeout; - - /* access through blk_rq_set_deadline, blk_rq_deadline */ - unsigned long __deadline; - - struct list_head timeout_list; + unsigned long deadline; union { struct __call_single_data csd; @@ -264,10 +239,6 @@ struct request { /* for bidi */ struct request *next_rq; - -#ifdef CONFIG_BLK_CGROUP - struct request_list *rl; /* rl this rq is alloced from */ -#endif }; static inline bool blk_op_is_scsi(unsigned int op) @@ -311,41 +282,21 @@ static inline unsigned short req_get_ioprio(struct request *req) struct blk_queue_ctx; -typedef void (request_fn_proc) (struct request_queue *q); typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); -typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t); -typedef int (prep_rq_fn) (struct request_queue *, struct request *); -typedef void (unprep_rq_fn) (struct request_queue *, struct request *); struct bio_vec; -typedef void (softirq_done_fn)(struct request *); typedef int (dma_drain_needed_fn)(struct request *); -typedef int (lld_busy_fn) (struct request_queue *q); -typedef int (bsg_job_fn) (struct bsg_job *); -typedef int (init_rq_fn)(struct request_queue *, struct request *, gfp_t); -typedef void (exit_rq_fn)(struct request_queue *, struct request *); enum blk_eh_timer_return { BLK_EH_DONE, /* drivers has completed the command */ BLK_EH_RESET_TIMER, /* reset timer and try again */ }; -typedef enum blk_eh_timer_return (rq_timed_out_fn)(struct request *); - enum blk_queue_state { Queue_down, Queue_up, }; -struct blk_queue_tag { - struct request **tag_index; /* map of busy tags */ - unsigned long *tag_map; /* bit map of free/busy tags */ - int max_depth; /* what we will send to device */ - int real_max_depth; /* what the array can hold */ - atomic_t refcnt; /* map can be shared */ - int alloc_policy; /* tag allocation policy */ - int next_tag; /* next tag */ -}; #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ #define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ @@ -444,40 +395,15 @@ struct request_queue { struct list_head queue_head; struct request *last_merge; struct elevator_queue *elevator; - int nr_rqs[2]; /* # allocated [a]sync rqs */ - int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ struct blk_queue_stats *stats; struct rq_qos *rq_qos; - /* - * If blkcg is not used, @q->root_rl serves all requests. If blkcg - * is used, root blkg allocates from @q->root_rl and all other - * blkgs from their own blkg->rl. Which one to use should be - * determined using bio_request_list(). - */ - struct request_list root_rl; - - request_fn_proc *request_fn; make_request_fn *make_request_fn; - poll_q_fn *poll_fn; - prep_rq_fn *prep_rq_fn; - unprep_rq_fn *unprep_rq_fn; - softirq_done_fn *softirq_done_fn; - rq_timed_out_fn *rq_timed_out_fn; dma_drain_needed_fn *dma_drain_needed; - lld_busy_fn *lld_busy_fn; - /* Called just after a request is allocated */ - init_rq_fn *init_rq_fn; - /* Called just before a request is freed */ - exit_rq_fn *exit_rq_fn; - /* Called from inside blk_get_request() */ - void (*initialize_rq_fn)(struct request *rq); const struct blk_mq_ops *mq_ops; - unsigned int *mq_map; - /* sw queues */ struct blk_mq_ctx __percpu *queue_ctx; unsigned int nr_queues; @@ -488,17 +414,6 @@ struct request_queue { struct blk_mq_hw_ctx **queue_hw_ctx; unsigned int nr_hw_queues; - /* - * Dispatch queue sorting - */ - sector_t end_sector; - struct request *boundary_rq; - - /* - * Delayed queue handling - */ - struct delayed_work delay_work; - struct backing_dev_info *backing_dev_info; /* @@ -529,13 +444,7 @@ struct request_queue { */ gfp_t bounce_gfp; - /* - * protects queue structures from reentrancy. ->__queue_lock should - * _never_ be used directly, it is queue private. always use - * ->queue_lock. - */ - spinlock_t __queue_lock; - spinlock_t *queue_lock; + spinlock_t queue_lock; /* * queue kobject @@ -545,7 +454,7 @@ struct request_queue { /* * mq queue kobject */ - struct kobject mq_kobj; + struct kobject *mq_kobj; #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity integrity; @@ -561,27 +470,12 @@ struct request_queue { * queue settings */ unsigned long nr_requests; /* Max # of requests */ - unsigned int nr_congestion_on; - unsigned int nr_congestion_off; - unsigned int nr_batching; unsigned int dma_drain_size; void *dma_drain_buffer; unsigned int dma_pad_mask; unsigned int dma_alignment; - struct blk_queue_tag *queue_tags; - - unsigned int nr_sorted; - unsigned int in_flight[2]; - - /* - * Number of active block driver functions for which blk_drain_queue() - * must wait. Must be incremented around functions that unlock the - * queue_lock internally, e.g. scsi_request_fn(). - */ - unsigned int request_fn_active; - unsigned int rq_timeout; int poll_nsec; @@ -590,7 +484,6 @@ struct request_queue { struct timer_list timeout; struct work_struct timeout_work; - struct list_head timeout_list; struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP @@ -645,11 +538,9 @@ struct request_queue { struct mutex sysfs_lock; - int bypass_depth; atomic_t mq_freeze_depth; #if defined(CONFIG_BLK_DEV_BSG) - bsg_job_fn *bsg_job_fn; struct bsg_class_device bsg_dev; #endif @@ -669,12 +560,12 @@ struct request_queue { #ifdef CONFIG_BLK_DEBUG_FS struct dentry *debugfs_dir; struct dentry *sched_debugfs_dir; + struct dentry *rqos_debugfs_dir; #endif bool mq_sysfs_init_done; size_t cmd_size; - void *rq_alloc_data; struct work_struct release_work; @@ -682,10 +573,8 @@ struct request_queue { u64 write_hints[BLK_MAX_WRITE_HINTS]; }; -#define QUEUE_FLAG_QUEUED 0 /* uses generic tag queueing */ #define QUEUE_FLAG_STOPPED 1 /* queue is stopped */ #define QUEUE_FLAG_DYING 2 /* queue being torn down */ -#define QUEUE_FLAG_BYPASS 3 /* act as dumb FIFO queue */ #define QUEUE_FLAG_BIDI 4 /* queue supports bidi requests */ #define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ @@ -718,19 +607,15 @@ struct request_queue { (1 << QUEUE_FLAG_ADD_RANDOM)) #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_SAME_COMP) | \ - (1 << QUEUE_FLAG_POLL)) + (1 << QUEUE_FLAG_SAME_COMP)) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); -bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q); -#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) -#define blk_queue_bypass(q) test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags) #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ @@ -757,32 +642,20 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q); extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -static inline int queue_in_flight(struct request_queue *q) -{ - return q->in_flight[0] + q->in_flight[1]; -} - static inline bool blk_account_rq(struct request *rq) { return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq); } -#define blk_rq_cpu_valid(rq) ((rq)->cpu != -1) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) -/* rq->queuelist of dequeued request must be list_empty() */ -#define blk_queued_rq(rq) (!list_empty(&(rq)->queuelist)) #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) #define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) -/* - * Driver can handle struct request, if it either has an old style - * request_fn defined, or is blk-mq based. - */ -static inline bool queue_is_rq_based(struct request_queue *q) +static inline bool queue_is_mq(struct request_queue *q) { - return q->request_fn || q->mq_ops; + return q->mq_ops; } static inline unsigned int blk_queue_cluster(struct request_queue *q) @@ -845,27 +718,6 @@ static inline bool rq_is_sync(struct request *rq) return op_is_sync(rq->cmd_flags); } -static inline bool blk_rl_full(struct request_list *rl, bool sync) -{ - unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL; - - return rl->flags & flag; -} - -static inline void blk_set_rl_full(struct request_list *rl, bool sync) -{ - unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL; - - rl->flags |= flag; -} - -static inline void blk_clear_rl_full(struct request_list *rl, bool sync) -{ - unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL; - - rl->flags &= ~flag; -} - static inline bool rq_mergeable(struct request *rq) { if (blk_rq_is_passthrough(rq)) @@ -902,16 +754,6 @@ static inline unsigned int blk_queue_depth(struct request_queue *q) return q->nr_requests; } -/* - * q->prep_rq_fn return values - */ -enum { - BLKPREP_OK, /* serve it */ - BLKPREP_KILL, /* fatal error, kill, return -EIO */ - BLKPREP_DEFER, /* leave on queue */ - BLKPREP_INVALID, /* invalid command, kill, return -EREMOTEIO */ -}; - extern unsigned long blk_max_low_pfn, blk_max_pfn; /* @@ -983,10 +825,8 @@ extern blk_qc_t direct_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); -extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, unsigned int op, blk_mq_req_flags_t flags); -extern void blk_requeue_request(struct request_queue *, struct request *); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, @@ -996,7 +836,6 @@ extern void blk_rq_unprep_clone(struct request *rq); extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern int blk_rq_append_bio(struct request *rq, struct bio **bio); -extern void blk_delay_queue(struct request_queue *, unsigned long); extern void blk_queue_split(struct request_queue *, struct bio **); extern void blk_recount_segments(struct request_queue *, struct bio *); extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int); @@ -1009,15 +848,7 @@ extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); -extern void blk_start_queue(struct request_queue *q); -extern void blk_start_queue_async(struct request_queue *q); -extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); -extern void __blk_stop_queue(struct request_queue *q); -extern void __blk_run_queue(struct request_queue *q); -extern void __blk_run_queue_uncond(struct request_queue *q); -extern void blk_run_queue(struct request_queue *); -extern void blk_run_queue_async(struct request_queue *q); extern int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); @@ -1034,7 +865,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); -bool blk_poll(struct request_queue *q, blk_qc_t cookie); +int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { @@ -1172,13 +1003,6 @@ static inline unsigned int blk_rq_count_bios(struct request *rq) return nr_bios; } -/* - * Request issue related functions. - */ -extern struct request *blk_peek_request(struct request_queue *q); -extern void blk_start_request(struct request *rq); -extern struct request *blk_fetch_request(struct request_queue *q); - void blk_steal_bios(struct bio_list *list, struct request *rq); /* @@ -1196,27 +1020,18 @@ void blk_steal_bios(struct bio_list *list, struct request *rq); */ extern bool blk_update_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); -extern void blk_finish_request(struct request *rq, blk_status_t error); -extern bool blk_end_request(struct request *rq, blk_status_t error, - unsigned int nr_bytes); extern void blk_end_request_all(struct request *rq, blk_status_t error); extern bool __blk_end_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); extern void __blk_end_request_all(struct request *rq, blk_status_t error); extern bool __blk_end_request_cur(struct request *rq, blk_status_t error); -extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); extern void blk_abort_request(struct request *); -extern void blk_unprep_request(struct request *); /* * Access functions for manipulating queue properties */ -extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn, - spinlock_t *lock, int node_id); -extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *); -extern int blk_init_allocated_queue(struct request_queue *); extern void blk_cleanup_queue(struct request_queue *); extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); @@ -1255,15 +1070,10 @@ extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern int blk_queue_dma_drain(struct request_queue *q, dma_drain_needed_fn *dma_drain_needed, void *buf, unsigned int size); -extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn); extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); -extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn); -extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn); extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); -extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); -extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); @@ -1299,8 +1109,7 @@ extern long nr_blockdev_pages(void); bool __must_check blk_get_queue(struct request_queue *); struct request_queue *blk_alloc_queue(gfp_t); -struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, - spinlock_t *lock); +struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id); extern void blk_put_queue(struct request_queue *); extern void blk_set_queue_dying(struct request_queue *); @@ -1317,9 +1126,10 @@ extern void blk_set_queue_dying(struct request_queue *); * schedule() where blk_schedule_flush_plug() is called. */ struct blk_plug { - struct list_head list; /* requests */ struct list_head mq_list; /* blk-mq requests */ struct list_head cb_list; /* md requires an unplug callback */ + unsigned short rq_count; + bool multiple_queues; }; #define BLK_MAX_REQUEST_COUNT 16 #define BLK_PLUG_FLUSH_SIZE (128 * 1024) @@ -1358,31 +1168,10 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) struct blk_plug *plug = tsk->plug; return plug && - (!list_empty(&plug->list) || - !list_empty(&plug->mq_list) || + (!list_empty(&plug->mq_list) || !list_empty(&plug->cb_list)); } -/* - * tag stuff - */ -extern int blk_queue_start_tag(struct request_queue *, struct request *); -extern struct request *blk_queue_find_tag(struct request_queue *, int); -extern void blk_queue_end_tag(struct request_queue *, struct request *); -extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *, int); -extern void blk_queue_free_tags(struct request_queue *); -extern int blk_queue_resize_tags(struct request_queue *, int); -extern struct blk_queue_tag *blk_init_tags(int, int); -extern void blk_free_tags(struct blk_queue_tag *); - -static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, - int tag) -{ - if (unlikely(bqt == NULL || tag >= bqt->real_max_depth)) - return NULL; - return bqt->tag_index[tag]; -} - extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); @@ -1982,4 +1771,17 @@ static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, #endif /* CONFIG_BLOCK */ +static inline void blk_wake_io_task(struct task_struct *waiter) +{ + /* + * If we're polling, the task itself is doing the completions. For + * that case, we don't need to signal a wakeup, it's enough to just + * mark us as RUNNING. + */ + if (waiter == current) + __set_current_state(TASK_RUNNING); + else + wake_up_process(waiter); +} + #endif diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h index 6aeaf6472665..b356e0006731 100644 --- a/include/linux/bsg-lib.h +++ b/include/linux/bsg-lib.h @@ -31,6 +31,9 @@ struct device; struct scatterlist; struct request_queue; +typedef int (bsg_job_fn) (struct bsg_job *); +typedef enum blk_eh_timer_return (bsg_timeout_fn)(struct request *); + struct bsg_buffer { unsigned int payload_len; int sg_cnt; @@ -72,7 +75,8 @@ struct bsg_job { void bsg_job_done(struct bsg_job *job, int result, unsigned int reply_payload_rcv_len); struct request_queue *bsg_setup_queue(struct device *dev, const char *name, - bsg_job_fn *job_fn, int dd_job_size); + bsg_job_fn *job_fn, bsg_timeout_fn *timeout, int dd_job_size); +void bsg_remove_queue(struct request_queue *q); void bsg_job_put(struct bsg_job *job); int __must_check bsg_job_get(struct bsg_job *job); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9d12757a65b0..9968332cceed 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -93,6 +93,8 @@ extern struct css_set init_css_set; bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); +struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, + struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 015bb59c0331..2e9e2763bf47 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -23,74 +23,6 @@ enum elv_merge { ELEVATOR_DISCARD_MERGE = 3, }; -typedef enum elv_merge (elevator_merge_fn) (struct request_queue *, struct request **, - struct bio *); - -typedef void (elevator_merge_req_fn) (struct request_queue *, struct request *, struct request *); - -typedef void (elevator_merged_fn) (struct request_queue *, struct request *, enum elv_merge); - -typedef int (elevator_allow_bio_merge_fn) (struct request_queue *, - struct request *, struct bio *); - -typedef int (elevator_allow_rq_merge_fn) (struct request_queue *, - struct request *, struct request *); - -typedef void (elevator_bio_merged_fn) (struct request_queue *, - struct request *, struct bio *); - -typedef int (elevator_dispatch_fn) (struct request_queue *, int); - -typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); -typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *); -typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *); -typedef int (elevator_may_queue_fn) (struct request_queue *, unsigned int); - -typedef void (elevator_init_icq_fn) (struct io_cq *); -typedef void (elevator_exit_icq_fn) (struct io_cq *); -typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, - struct bio *, gfp_t); -typedef void (elevator_put_req_fn) (struct request *); -typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *); -typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *); - -typedef int (elevator_init_fn) (struct request_queue *, - struct elevator_type *e); -typedef void (elevator_exit_fn) (struct elevator_queue *); -typedef void (elevator_registered_fn) (struct request_queue *); - -struct elevator_ops -{ - elevator_merge_fn *elevator_merge_fn; - elevator_merged_fn *elevator_merged_fn; - elevator_merge_req_fn *elevator_merge_req_fn; - elevator_allow_bio_merge_fn *elevator_allow_bio_merge_fn; - elevator_allow_rq_merge_fn *elevator_allow_rq_merge_fn; - elevator_bio_merged_fn *elevator_bio_merged_fn; - - elevator_dispatch_fn *elevator_dispatch_fn; - elevator_add_req_fn *elevator_add_req_fn; - elevator_activate_req_fn *elevator_activate_req_fn; - elevator_deactivate_req_fn *elevator_deactivate_req_fn; - - elevator_completed_req_fn *elevator_completed_req_fn; - - elevator_request_list_fn *elevator_former_req_fn; - elevator_request_list_fn *elevator_latter_req_fn; - - elevator_init_icq_fn *elevator_init_icq_fn; /* see iocontext.h */ - elevator_exit_icq_fn *elevator_exit_icq_fn; /* ditto */ - - elevator_set_req_fn *elevator_set_req_fn; - elevator_put_req_fn *elevator_put_req_fn; - - elevator_may_queue_fn *elevator_may_queue_fn; - - elevator_init_fn *elevator_init_fn; - elevator_exit_fn *elevator_exit_fn; - elevator_registered_fn *elevator_registered_fn; -}; - struct blk_mq_alloc_data; struct blk_mq_hw_ctx; @@ -137,17 +69,14 @@ struct elevator_type struct kmem_cache *icq_cache; /* fields provided by elevator implementation */ - union { - struct elevator_ops sq; - struct elevator_mq_ops mq; - } ops; + struct elevator_mq_ops ops; + size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; char elevator_name[ELV_NAME_MAX]; const char *elevator_alias; struct module *elevator_owner; - bool uses_mq; #ifdef CONFIG_BLK_DEBUG_FS const struct blk_mq_debugfs_attr *queue_debugfs_attrs; const struct blk_mq_debugfs_attr *hctx_debugfs_attrs; @@ -175,40 +104,25 @@ struct elevator_queue struct kobject kobj; struct mutex sysfs_lock; unsigned int registered:1; - unsigned int uses_mq:1; DECLARE_HASHTABLE(hash, ELV_HASH_BITS); }; /* * block elevator interface */ -extern void elv_dispatch_sort(struct request_queue *, struct request *); -extern void elv_dispatch_add_tail(struct request_queue *, struct request *); -extern void elv_add_request(struct request_queue *, struct request *, int); -extern void __elv_add_request(struct request_queue *, struct request *, int); extern enum elv_merge elv_merge(struct request_queue *, struct request **, struct bio *); extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, enum elv_merge); -extern void elv_bio_merged(struct request_queue *q, struct request *, - struct bio *); extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); -extern void elv_requeue_request(struct request_queue *, struct request *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); -extern int elv_may_queue(struct request_queue *, unsigned int); -extern void elv_completed_request(struct request_queue *, struct request *); -extern int elv_set_request(struct request_queue *q, struct request *rq, - struct bio *bio, gfp_t gfp_mask); -extern void elv_put_request(struct request_queue *, struct request *); -extern void elv_drain_elevator(struct request_queue *); /* * io scheduler registration */ -extern void __init load_default_elevator_module(void); extern int elv_register(struct elevator_type *); extern void elv_unregister(struct elevator_type *); @@ -260,9 +174,5 @@ enum { #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist) -#else /* CONFIG_BLOCK */ - -static inline void load_default_elevator_module(void) { } - #endif /* CONFIG_BLOCK */ #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 26a8607b3c3c..6d52ce6af4ff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2026,7 +2026,7 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) .ki_filp = filp, .ki_flags = iocb_flags(filp), .ki_hint = ki_hint_validate(file_write_hint(filp)), - .ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0), + .ki_ioprio = get_current_ioprio(), }; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 70fc838e6773..06c0fd594097 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -17,6 +17,7 @@ #include #include #include +#include #ifdef CONFIG_BLOCK @@ -89,6 +90,7 @@ struct disk_stats { unsigned long merges[NR_STAT_GROUPS]; unsigned long io_ticks; unsigned long time_in_queue; + local_t in_flight[2]; }; #define PARTITION_META_INFO_VOLNAMELTH 64 @@ -122,14 +124,13 @@ struct hd_struct { int make_it_fail; #endif unsigned long stamp; - atomic_t in_flight[2]; #ifdef CONFIG_SMP struct disk_stats __percpu *dkstats; #else struct disk_stats dkstats; #endif struct percpu_ref ref; - struct rcu_head rcu_head; + struct rcu_work rcu_work; }; #define GENHD_FL_REMOVABLE 1 @@ -295,8 +296,11 @@ extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, #define part_stat_lock() ({ rcu_read_lock(); get_cpu(); }) #define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) -#define __part_stat_add(cpu, part, field, addnd) \ - (per_cpu_ptr((part)->dkstats, (cpu))->field += (addnd)) +#define part_stat_get_cpu(part, field, cpu) \ + (per_cpu_ptr((part)->dkstats, (cpu))->field) + +#define part_stat_get(part, field) \ + part_stat_get_cpu(part, field, smp_processor_id()) #define part_stat_read(part, field) \ ({ \ @@ -333,10 +337,9 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_lock() ({ rcu_read_lock(); 0; }) #define part_stat_unlock() rcu_read_unlock() -#define __part_stat_add(cpu, part, field, addnd) \ - ((part)->dkstats.field += addnd) - -#define part_stat_read(part, field) ((part)->dkstats.field) +#define part_stat_get(part, field) ((part)->dkstats.field) +#define part_stat_get_cpu(part, field, cpu) part_stat_get(part, field) +#define part_stat_read(part, field) part_stat_get(part, field) static inline void part_stat_set_all(struct hd_struct *part, int value) { @@ -362,22 +365,33 @@ static inline void free_part_stats(struct hd_struct *part) part_stat_read(part, field[STAT_WRITE]) + \ part_stat_read(part, field[STAT_DISCARD])) -#define part_stat_add(cpu, part, field, addnd) do { \ - __part_stat_add((cpu), (part), field, addnd); \ +#define __part_stat_add(part, field, addnd) \ + (part_stat_get(part, field) += (addnd)) + +#define part_stat_add(part, field, addnd) do { \ + __part_stat_add((part), field, addnd); \ if ((part)->partno) \ - __part_stat_add((cpu), &part_to_disk((part))->part0, \ + __part_stat_add(&part_to_disk((part))->part0, \ field, addnd); \ } while (0) -#define part_stat_dec(cpu, gendiskp, field) \ - part_stat_add(cpu, gendiskp, field, -1) -#define part_stat_inc(cpu, gendiskp, field) \ - part_stat_add(cpu, gendiskp, field, 1) -#define part_stat_sub(cpu, gendiskp, field, subnd) \ - part_stat_add(cpu, gendiskp, field, -subnd) +#define part_stat_dec(gendiskp, field) \ + part_stat_add(gendiskp, field, -1) +#define part_stat_inc(gendiskp, field) \ + part_stat_add(gendiskp, field, 1) +#define part_stat_sub(gendiskp, field, subnd) \ + part_stat_add(gendiskp, field, -subnd) -void part_in_flight(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]); +#define part_stat_local_dec(gendiskp, field) \ + local_dec(&(part_stat_get(gendiskp, field))) +#define part_stat_local_inc(gendiskp, field) \ + local_inc(&(part_stat_get(gendiskp, field))) +#define part_stat_local_read(gendiskp, field) \ + local_read(&(part_stat_get(gendiskp, field))) +#define part_stat_local_read_cpu(gendiskp, field, cpu) \ + local_read(&(part_stat_get_cpu(gendiskp, field, cpu))) + +unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part); void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, @@ -398,8 +412,7 @@ static inline void free_part_info(struct hd_struct *part) kfree(part->info); } -/* block/blk-core.c */ -extern void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part); +void update_io_ticks(struct hd_struct *part, unsigned long now); /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk, diff --git a/include/linux/ide.h b/include/linux/ide.h index c74b0321922a..e7d29ae633cd 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include @@ -50,6 +50,7 @@ struct ide_request { struct scsi_request sreq; u8 sense[SCSI_SENSE_BUFFERSIZE]; u8 type; + void *special; }; static inline struct ide_request *ide_req(struct request *rq) @@ -529,6 +530,10 @@ struct ide_drive_s { struct request_queue *queue; /* request queue */ + bool (*prep_rq)(struct ide_drive_s *, struct request *); + + struct blk_mq_tag_set tag_set; + struct request *rq; /* current request */ void *driver_data; /* extra driver data */ u16 *id; /* identification info */ @@ -612,6 +617,10 @@ struct ide_drive_s { bool sense_rq_armed; struct request *sense_rq; struct request_sense sense_data; + + /* async sense insertion */ + struct work_struct rq_work; + struct list_head rq_list; }; typedef struct ide_drive_s ide_drive_t; @@ -1089,6 +1098,7 @@ extern int ide_pci_clk; int ide_end_rq(ide_drive_t *, struct request *, blk_status_t, unsigned int); void ide_kill_rq(ide_drive_t *, struct request *); +void ide_insert_request_head(ide_drive_t *, struct request *); void __ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int); void ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int); @@ -1208,7 +1218,7 @@ extern void ide_stall_queue(ide_drive_t *drive, unsigned long timeout); extern void ide_timer_expiry(struct timer_list *t); extern irqreturn_t ide_intr(int irq, void *dev_id); -extern void do_ide_request(struct request_queue *); +extern blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); extern void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq); void ide_init_disk(struct gendisk *, ide_drive_t *); diff --git a/include/linux/init.h b/include/linux/init.h index 9c2aba1dbabf..5255069f5a9f 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -146,7 +146,6 @@ extern unsigned int reset_devices; /* used by init/main.c */ void setup_arch(char **); void prepare_namespace(void); -void __init load_default_modules(void); int __init init_rootfs(void); #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX) diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 9e30ed6443db..e9bfe6972aed 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -70,6 +70,19 @@ static inline int task_nice_ioclass(struct task_struct *task) return IOPRIO_CLASS_BE; } +/* + * If the calling process has set an I/O priority, use that. Otherwise, return + * the default I/O priority. + */ +static inline int get_current_ioprio(void) +{ + struct io_context *ioc = current->io_context; + + if (ioc) + return ioc->ioprio; + return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); +} + /* * For inheritance, return the highest of the two given priorities */ diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 2fdeac1a420d..5d865a5d5cdc 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -90,7 +90,7 @@ typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, sector_t, int, struct nvm_chk_meta *); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *); -typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); +typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *, int); typedef void (nvm_destroy_dma_pool_fn)(void *); typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, dma_addr_t *); @@ -357,6 +357,7 @@ struct nvm_geo { u32 clba; /* sectors per chunk */ u16 csecs; /* sector size */ u16 sos; /* out-of-band area size */ + bool ext; /* metadata in extended data buffer */ /* device write constrains */ u32 ws_min; /* minimum write size */ diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 496ff759f84c..91745cc3704c 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -403,7 +403,6 @@ struct nvme_fc_port_template { void **handle); void (*delete_queue)(struct nvme_fc_local_port *, unsigned int qidx, void *handle); - void (*poll_queue)(struct nvme_fc_local_port *, void *handle); int (*ls_req)(struct nvme_fc_local_port *, struct nvme_fc_remote_port *, struct nvmefc_ls_req *); @@ -649,22 +648,6 @@ enum { * sequence in one LLDD operation. Errors during Data * sequence transmit must not allow RSP sequence to be sent. */ - NVMET_FCTGTFEAT_CMD_IN_ISR = (1 << 1), - /* Bit 2: When 0, the LLDD is calling the cmd rcv handler - * in a non-isr context, allowing the transport to finish - * op completion in the calling context. When 1, the LLDD - * is calling the cmd rcv handler in an ISR context, - * requiring the transport to transition to a workqueue - * for op completion. - */ - NVMET_FCTGTFEAT_OPDONE_IN_ISR = (1 << 2), - /* Bit 3: When 0, the LLDD is calling the op done handler - * in a non-isr context, allowing the transport to finish - * op completion in the calling context. When 1, the LLDD - * is calling the op done handler in an ISR context, - * requiring the transport to transition to a workqueue - * for op completion. - */ }; diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h new file mode 100644 index 000000000000..03d87c0550a9 --- /dev/null +++ b/include/linux/nvme-tcp.h @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics TCP protocol header. + * Copyright (c) 2018 Lightbits Labs. All rights reserved. + */ + +#ifndef _LINUX_NVME_TCP_H +#define _LINUX_NVME_TCP_H + +#include + +#define NVME_TCP_DISC_PORT 8009 +#define NVME_TCP_ADMIN_CCSZ SZ_8K +#define NVME_TCP_DIGEST_LENGTH 4 + +enum nvme_tcp_pfv { + NVME_TCP_PFV_1_0 = 0x0, +}; + +enum nvme_tcp_fatal_error_status { + NVME_TCP_FES_INVALID_PDU_HDR = 0x01, + NVME_TCP_FES_PDU_SEQ_ERR = 0x02, + NVME_TCP_FES_HDR_DIGEST_ERR = 0x03, + NVME_TCP_FES_DATA_OUT_OF_RANGE = 0x04, + NVME_TCP_FES_R2T_LIMIT_EXCEEDED = 0x05, + NVME_TCP_FES_DATA_LIMIT_EXCEEDED = 0x05, + NVME_TCP_FES_UNSUPPORTED_PARAM = 0x06, +}; + +enum nvme_tcp_digest_option { + NVME_TCP_HDR_DIGEST_ENABLE = (1 << 0), + NVME_TCP_DATA_DIGEST_ENABLE = (1 << 1), +}; + +enum nvme_tcp_pdu_type { + nvme_tcp_icreq = 0x0, + nvme_tcp_icresp = 0x1, + nvme_tcp_h2c_term = 0x2, + nvme_tcp_c2h_term = 0x3, + nvme_tcp_cmd = 0x4, + nvme_tcp_rsp = 0x5, + nvme_tcp_h2c_data = 0x6, + nvme_tcp_c2h_data = 0x7, + nvme_tcp_r2t = 0x9, +}; + +enum nvme_tcp_pdu_flags { + NVME_TCP_F_HDGST = (1 << 0), + NVME_TCP_F_DDGST = (1 << 1), + NVME_TCP_F_DATA_LAST = (1 << 2), + NVME_TCP_F_DATA_SUCCESS = (1 << 3), +}; + +/** + * struct nvme_tcp_hdr - nvme tcp pdu common header + * + * @type: pdu type + * @flags: pdu specific flags + * @hlen: pdu header length + * @pdo: pdu data offset + * @plen: pdu wire byte length + */ +struct nvme_tcp_hdr { + __u8 type; + __u8 flags; + __u8 hlen; + __u8 pdo; + __le32 plen; +}; + +/** + * struct nvme_tcp_icreq_pdu - nvme tcp initialize connection request pdu + * + * @hdr: pdu generic header + * @pfv: pdu version format + * @hpda: host pdu data alignment (dwords, 0's based) + * @digest: digest types enabled + * @maxr2t: maximum r2ts per request supported + */ +struct nvme_tcp_icreq_pdu { + struct nvme_tcp_hdr hdr; + __le16 pfv; + __u8 hpda; + __u8 digest; + __le32 maxr2t; + __u8 rsvd2[112]; +}; + +/** + * struct nvme_tcp_icresp_pdu - nvme tcp initialize connection response pdu + * + * @hdr: pdu common header + * @pfv: pdu version format + * @cpda: controller pdu data alignment (dowrds, 0's based) + * @digest: digest types enabled + * @maxdata: maximum data capsules per r2t supported + */ +struct nvme_tcp_icresp_pdu { + struct nvme_tcp_hdr hdr; + __le16 pfv; + __u8 cpda; + __u8 digest; + __le32 maxdata; + __u8 rsvd[112]; +}; + +/** + * struct nvme_tcp_term_pdu - nvme tcp terminate connection pdu + * + * @hdr: pdu common header + * @fes: fatal error status + * @fei: fatal error information + */ +struct nvme_tcp_term_pdu { + struct nvme_tcp_hdr hdr; + __le16 fes; + __le32 fei; + __u8 rsvd[8]; +}; + +/** + * struct nvme_tcp_cmd_pdu - nvme tcp command capsule pdu + * + * @hdr: pdu common header + * @cmd: nvme command + */ +struct nvme_tcp_cmd_pdu { + struct nvme_tcp_hdr hdr; + struct nvme_command cmd; +}; + +/** + * struct nvme_tcp_rsp_pdu - nvme tcp response capsule pdu + * + * @hdr: pdu common header + * @hdr: nvme-tcp generic header + * @cqe: nvme completion queue entry + */ +struct nvme_tcp_rsp_pdu { + struct nvme_tcp_hdr hdr; + struct nvme_completion cqe; +}; + +/** + * struct nvme_tcp_r2t_pdu - nvme tcp ready-to-transfer pdu + * + * @hdr: pdu common header + * @command_id: nvme command identifier which this relates to + * @ttag: transfer tag (controller generated) + * @r2t_offset: offset from the start of the command data + * @r2t_length: length the host is allowed to send + */ +struct nvme_tcp_r2t_pdu { + struct nvme_tcp_hdr hdr; + __u16 command_id; + __u16 ttag; + __le32 r2t_offset; + __le32 r2t_length; + __u8 rsvd[4]; +}; + +/** + * struct nvme_tcp_data_pdu - nvme tcp data pdu + * + * @hdr: pdu common header + * @command_id: nvme command identifier which this relates to + * @ttag: transfer tag (controller generated) + * @data_offset: offset from the start of the command data + * @data_length: length of the data stream + */ +struct nvme_tcp_data_pdu { + struct nvme_tcp_hdr hdr; + __u16 command_id; + __u16 ttag; + __le32 data_offset; + __le32 data_length; + __u8 rsvd[4]; +}; + +union nvme_tcp_pdu { + struct nvme_tcp_icreq_pdu icreq; + struct nvme_tcp_icresp_pdu icresp; + struct nvme_tcp_cmd_pdu cmd; + struct nvme_tcp_rsp_pdu rsp; + struct nvme_tcp_r2t_pdu r2t; + struct nvme_tcp_data_pdu data; +}; + +#endif /* _LINUX_NVME_TCP_H */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 818dbe9331be..bbcc83886899 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -52,15 +52,20 @@ enum { enum { NVMF_TRTYPE_RDMA = 1, /* RDMA */ NVMF_TRTYPE_FC = 2, /* Fibre Channel */ + NVMF_TRTYPE_TCP = 3, /* TCP/IP */ NVMF_TRTYPE_LOOP = 254, /* Reserved for host usage */ NVMF_TRTYPE_MAX, }; /* Transport Requirements codes for Discovery Log Page entry TREQ field */ enum { - NVMF_TREQ_NOT_SPECIFIED = 0, /* Not specified */ - NVMF_TREQ_REQUIRED = 1, /* Required */ - NVMF_TREQ_NOT_REQUIRED = 2, /* Not Required */ + NVMF_TREQ_NOT_SPECIFIED = 0, /* Not specified */ + NVMF_TREQ_REQUIRED = 1, /* Required */ + NVMF_TREQ_NOT_REQUIRED = 2, /* Not Required */ +#define NVME_TREQ_SECURE_CHANNEL_MASK \ + (NVMF_TREQ_REQUIRED | NVMF_TREQ_NOT_REQUIRED) + + NVMF_TREQ_DISABLE_SQFLOW = (1 << 2), /* Supports SQ flow control disable */ }; /* RDMA QP Service Type codes for Discovery Log Page entry TSAS @@ -198,6 +203,11 @@ enum { NVME_PS_FLAGS_NON_OP_STATE = 1 << 1, }; +enum nvme_ctrl_attr { + NVME_CTRL_ATTR_HID_128_BIT = (1 << 0), + NVME_CTRL_ATTR_TBKAS = (1 << 6), +}; + struct nvme_id_ctrl { __le16 vid; __le16 ssvid; @@ -214,7 +224,11 @@ struct nvme_id_ctrl { __le32 rtd3e; __le32 oaes; __le32 ctratt; - __u8 rsvd100[156]; + __u8 rsvd100[28]; + __le16 crdt1; + __le16 crdt2; + __le16 crdt3; + __u8 rsvd134[122]; __le16 oacs; __u8 acl; __u8 aerl; @@ -481,12 +495,21 @@ enum { NVME_AER_NOTICE_NS_CHANGED = 0x00, NVME_AER_NOTICE_FW_ACT_STARTING = 0x01, NVME_AER_NOTICE_ANA = 0x03, + NVME_AER_NOTICE_DISC_CHANGED = 0xf0, }; enum { - NVME_AEN_CFG_NS_ATTR = 1 << 8, - NVME_AEN_CFG_FW_ACT = 1 << 9, - NVME_AEN_CFG_ANA_CHANGE = 1 << 11, + NVME_AEN_BIT_NS_ATTR = 8, + NVME_AEN_BIT_FW_ACT = 9, + NVME_AEN_BIT_ANA_CHANGE = 11, + NVME_AEN_BIT_DISC_CHANGE = 31, +}; + +enum { + NVME_AEN_CFG_NS_ATTR = 1 << NVME_AEN_BIT_NS_ATTR, + NVME_AEN_CFG_FW_ACT = 1 << NVME_AEN_BIT_FW_ACT, + NVME_AEN_CFG_ANA_CHANGE = 1 << NVME_AEN_BIT_ANA_CHANGE, + NVME_AEN_CFG_DISC_CHANGE = 1 << NVME_AEN_BIT_DISC_CHANGE, }; struct nvme_lba_range_type { @@ -639,7 +662,12 @@ struct nvme_common_command { __le32 cdw2[2]; __le64 metadata; union nvme_data_ptr dptr; - __le32 cdw10[6]; + __le32 cdw10; + __le32 cdw11; + __le32 cdw12; + __le32 cdw13; + __le32 cdw14; + __le32 cdw15; }; struct nvme_rw_command { @@ -738,6 +766,15 @@ enum { NVME_HOST_MEM_RETURN = (1 << 1), }; +struct nvme_feat_host_behavior { + __u8 acre; + __u8 resv1[511]; +}; + +enum { + NVME_ENABLE_ACRE = 1, +}; + /* Admin commands */ enum nvme_admin_opcode { @@ -792,6 +829,7 @@ enum { NVME_FEAT_RRL = 0x12, NVME_FEAT_PLM_CONFIG = 0x13, NVME_FEAT_PLM_WINDOW = 0x14, + NVME_FEAT_HOST_BEHAVIOR = 0x16, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, NVME_FEAT_RESV_MASK = 0x82, @@ -1030,6 +1068,10 @@ struct nvmf_disc_rsp_page_hdr { struct nvmf_disc_rsp_page_entry entries[0]; }; +enum { + NVME_CONNECT_DISABLE_SQFLOW = (1 << 2), +}; + struct nvmf_connect_command { __u8 opcode; __u8 resv1; @@ -1126,6 +1168,20 @@ struct nvme_command { }; }; +struct nvme_error_slot { + __le64 error_count; + __le16 sqid; + __le16 cmdid; + __le16 status_field; + __le16 param_error_location; + __le64 lba; + __le32 nsid; + __u8 vs; + __u8 resv[3]; + __le64 cs; + __u8 resv2[24]; +}; + static inline bool nvme_is_write(struct nvme_command *cmd) { /* @@ -1243,6 +1299,7 @@ enum { NVME_SC_ANA_TRANSITION = 0x303, NVME_SC_HOST_PATH_ERROR = 0x370, + NVME_SC_CRD = 0x1800, NVME_SC_DNR = 0x4000, }; diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 804a50983ec5..14d558146aea 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -30,14 +30,24 @@ struct seq_file; */ struct sbitmap_word { /** - * @word: The bitmap word itself. - */ - unsigned long word; - - /** - * @depth: Number of bits being used in @word. + * @depth: Number of bits being used in @word/@cleared */ unsigned long depth; + + /** + * @word: word holding free bits + */ + unsigned long word ____cacheline_aligned_in_smp; + + /** + * @cleared: word holding cleared bits + */ + unsigned long cleared ____cacheline_aligned_in_smp; + + /** + * @swap_lock: Held while swapping word <-> cleared + */ + spinlock_t swap_lock; } ____cacheline_aligned_in_smp; /** @@ -125,6 +135,11 @@ struct sbitmap_queue { */ struct sbq_wait_state *ws; + /* + * @ws_active: count of currently active ws waitqueues + */ + atomic_t ws_active; + /** * @round_robin: Allocate bits in strict round-robin order. */ @@ -250,12 +265,14 @@ static inline void __sbitmap_for_each_set(struct sbitmap *sb, nr = SB_NR_TO_BIT(sb, start); while (scanned < sb->depth) { - struct sbitmap_word *word = &sb->map[index]; - unsigned int depth = min_t(unsigned int, word->depth - nr, + unsigned long word; + unsigned int depth = min_t(unsigned int, + sb->map[index].depth - nr, sb->depth - scanned); scanned += depth; - if (!word->word) + word = sb->map[index].word & ~sb->map[index].cleared; + if (!word) goto next; /* @@ -265,7 +282,7 @@ static inline void __sbitmap_for_each_set(struct sbitmap *sb, */ depth += nr; while (1) { - nr = find_next_bit(&word->word, depth, nr); + nr = find_next_bit(&word, depth, nr); if (nr >= depth) break; if (!fn(sb, (index << sb->shift) + nr, data)) @@ -310,6 +327,19 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr) clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } +/* + * This one is special, since it doesn't actually clear the bit, rather it + * sets the corresponding bit in the ->cleared mask instead. Paired with + * the caller doing sbitmap_batch_clear() if a given index is full, which + * will clear the previously freed entries in the corresponding ->word. + */ +static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr) +{ + unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared; + + set_bit(SB_NR_TO_BIT(sb, bitnr), addr); +} + static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb, unsigned int bitnr) { @@ -321,8 +351,6 @@ static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } -unsigned int sbitmap_weight(const struct sbitmap *sb); - /** * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file. * @sb: Bitmap to show. @@ -531,4 +559,45 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); */ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m); +struct sbq_wait { + struct sbitmap_queue *sbq; /* if set, sbq_wait is accounted */ + struct wait_queue_entry wait; +}; + +#define DEFINE_SBQ_WAIT(name) \ + struct sbq_wait name = { \ + .sbq = NULL, \ + .wait = { \ + .private = current, \ + .func = autoremove_wake_function, \ + .entry = LIST_HEAD_INIT((name).wait.entry), \ + } \ + } + +/* + * Wrapper around prepare_to_wait_exclusive(), which maintains some extra + * internal state. + */ +void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, + struct sbq_wait_state *ws, + struct sbq_wait *sbq_wait, int state); + +/* + * Must be paired with sbitmap_prepare_to_wait(). + */ +void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, + struct sbq_wait *sbq_wait); + +/* + * Wrapper around add_wait_queue(), which maintains some extra internal state + */ +void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, + struct sbq_wait_state *ws, + struct sbq_wait *sbq_wait); + +/* + * Must be paired with sbitmap_add_wait_queue() + */ +void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait); + #endif /* __LINUX_SCALE_BITMAP_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2a57a365c711..93f56fddd92a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3339,6 +3339,9 @@ static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, } int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg); +int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, + struct ahash_request *hash); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); diff --git a/include/linux/uio.h b/include/linux/uio.h index 55ce99ddb912..ecf584f6b82d 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -11,6 +11,7 @@ #include #include +#include #include struct page; @@ -266,9 +267,11 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { i->count = count; } -size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); +size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); +size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, + struct iov_iter *i); int import_iovec(int type, const struct iovec __user * uvector, unsigned nr_segs, unsigned fast_segs, diff --git a/include/linux/writeback.h b/include/linux/writeback.h index fdfd04e348f6..738a0c24874f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -246,7 +246,8 @@ static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, * * @bio is a part of the writeback in progress controlled by @wbc. Perform * writeback specific initialization. This is used to apply the cgroup - * writeback context. + * writeback context. Must be called after the bio has been associated with + * a device. */ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { @@ -257,7 +258,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) * regular writeback instead of writing things out itself. */ if (wbc->wb) - bio_associate_blkcg(bio, wbc->wb->blkcg_css); + bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); } #else /* CONFIG_CGROUP_WRITEBACK */ diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index c891ada3c5c2..d85e6befa26b 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -61,6 +61,9 @@ struct scsi_pointer { /* flags preserved across unprep / reprep */ #define SCMD_PRESERVED_FLAGS (SCMD_UNCHECKED_ISA_DMA | SCMD_INITIALIZED) +/* for scmd->state */ +#define SCMD_STATE_COMPLETE 0 + struct scsi_cmnd { struct scsi_request req; struct scsi_device *device; @@ -145,6 +148,7 @@ struct scsi_cmnd { int result; /* Status code from lower level driver */ int flags; /* Command flags */ + unsigned long state; /* Command completion state */ unsigned char tag; /* SCSI-II queued command tag */ }; @@ -171,7 +175,7 @@ extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count, size_t *offset, size_t *len); extern void scsi_kunmap_atomic_sg(void *virt); -extern int scsi_init_io(struct scsi_cmnd *cmd); +extern blk_status_t scsi_init_io(struct scsi_cmnd *cmd); #ifdef CONFIG_SCSI_DMA extern int scsi_dma_map(struct scsi_cmnd *cmd); diff --git a/include/scsi/scsi_dh.h b/include/scsi/scsi_dh.h index c7bba2b24849..a862dc23c68d 100644 --- a/include/scsi/scsi_dh.h +++ b/include/scsi/scsi_dh.h @@ -69,7 +69,7 @@ struct scsi_device_handler { int (*attach)(struct scsi_device *); void (*detach)(struct scsi_device *); int (*activate)(struct scsi_device *, activate_complete, void *); - int (*prep_fn)(struct scsi_device *, struct request *); + blk_status_t (*prep_fn)(struct scsi_device *, struct request *); int (*set_params)(struct scsi_device *, const char *); void (*rescan)(struct scsi_device *); }; diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h index fae8b465233e..6dffa8555a39 100644 --- a/include/scsi/scsi_driver.h +++ b/include/scsi/scsi_driver.h @@ -2,6 +2,7 @@ #ifndef _SCSI_SCSI_DRIVER_H #define _SCSI_SCSI_DRIVER_H +#include #include struct module; @@ -13,7 +14,7 @@ struct scsi_driver { struct device_driver gendrv; void (*rescan)(struct device *); - int (*init_command)(struct scsi_cmnd *); + blk_status_t (*init_command)(struct scsi_cmnd *); void (*uninit_command)(struct scsi_cmnd *); int (*done)(struct scsi_cmnd *); int (*eh_action)(struct scsi_cmnd *, int); diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 5ea06d310a25..aa760df8c6b3 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -11,7 +11,6 @@ #include #include -struct request_queue; struct block_device; struct completion; struct module; @@ -22,7 +21,6 @@ struct scsi_target; struct Scsi_Host; struct scsi_host_cmd_pool; struct scsi_transport_template; -struct blk_queue_tags; /* @@ -547,14 +545,8 @@ struct Scsi_Host { struct scsi_host_template *hostt; struct scsi_transport_template *transportt; - /* - * Area to keep a shared tag map (if needed, will be - * NULL if not). - */ - union { - struct blk_queue_tag *bqt; - struct blk_mq_tag_set tag_set; - }; + /* Area to keep a shared tag map */ + struct blk_mq_tag_set tag_set; atomic_t host_busy; /* commands actually active on low-level */ atomic_t host_blocked; @@ -648,7 +640,6 @@ struct Scsi_Host { /* The controller does not support WRITE SAME */ unsigned no_write_same:1; - unsigned use_blk_mq:1; unsigned use_cmd_list:1; /* Host responded with short (<36 bytes) INQUIRY result */ @@ -742,11 +733,6 @@ static inline int scsi_host_in_recovery(struct Scsi_Host *shost) shost->tmf_in_progress; } -static inline bool shost_use_blk_mq(struct Scsi_Host *shost) -{ - return shost->use_blk_mq; -} - extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *); extern void scsi_flush_work(struct Scsi_Host *); diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h index e192a0caa850..6053d46e794e 100644 --- a/include/scsi/scsi_tcq.h +++ b/include/scsi/scsi_tcq.h @@ -23,19 +23,15 @@ static inline struct scsi_cmnd *scsi_host_find_tag(struct Scsi_Host *shost, int tag) { struct request *req = NULL; + u16 hwq; if (tag == SCSI_NO_TAG) return NULL; - if (shost_use_blk_mq(shost)) { - u16 hwq = blk_mq_unique_tag_to_hwq(tag); - - if (hwq < shost->tag_set.nr_hw_queues) { - req = blk_mq_tag_to_rq(shost->tag_set.tags[hwq], - blk_mq_unique_tag_to_tag(tag)); - } - } else { - req = blk_map_queue_find_tag(shost->bqt, tag); + hwq = blk_mq_unique_tag_to_hwq(tag); + if (hwq < shost->tag_set.nr_hw_queues) { + req = blk_mq_tag_to_rq(shost->tag_set.tags[hwq], + blk_mq_unique_tag_to_tag(tag)); } if (!req) diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 2cbd6e42ad83..e4526f85c19d 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -221,9 +221,30 @@ DEFINE_EVENT(cache_set, bcache_journal_entry_full, TP_ARGS(c) ); -DEFINE_EVENT(bcache_bio, bcache_journal_write, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) +TRACE_EVENT(bcache_journal_write, + TP_PROTO(struct bio *bio, u32 keys), + TP_ARGS(bio, keys), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(sector_t, sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __field(u32, nr_keys ) + ), + + TP_fast_assign( + __entry->dev = bio_dev(bio); + __entry->sector = bio->bi_iter.bi_sector; + __entry->nr_sector = bio->bi_iter.bi_size >> 9; + __entry->nr_keys = keys; + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + ), + + TP_printk("%d,%d %s %llu + %u keys %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + (unsigned long long)__entry->sector, __entry->nr_sector, + __entry->nr_keys) ); /* Btree */ diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index ce43d340f010..8387e0af0f76 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -50,6 +50,8 @@ enum { * * IOCB_FLAG_RESFD - Set if the "aio_resfd" member of the "struct iocb" * is valid. + * IOCB_FLAG_IOPRIO - Set if the "aio_reqprio" member of the "struct iocb" + * is valid. */ #define IOCB_FLAG_RESFD (1 << 0) #define IOCB_FLAG_IOPRIO (1 << 1) diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index d1a5d885ce13..73e02ea5d5d1 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -53,9 +53,6 @@ static void __init handle_initrd(void) ksys_mkdir("/old", 0700); ksys_chdir("/old"); - /* try loading default modules from initrd */ - load_default_modules(); - /* * In case that a resume from disk is carried out by linuxrc or one of * its children, we need to tell the freezer not to wait for us. diff --git a/init/initramfs.c b/init/initramfs.c index f6f4a1e4cd54..fca899622937 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -646,12 +646,6 @@ static int __init populate_rootfs(void) #endif } flush_delayed_fput(); - /* - * Try loading default modules from initramfs. This gives - * us a chance to load before device_initcalls. - */ - load_default_modules(); - return 0; } rootfs_initcall(populate_rootfs); diff --git a/init/main.c b/init/main.c index 954d9b6c62c6..0f8cc626e634 100644 --- a/init/main.c +++ b/init/main.c @@ -992,17 +992,6 @@ static void __init do_pre_smp_initcalls(void) do_one_initcall(initcall_from_entry(fn)); } -/* - * This function requests modules which should be loaded by default and is - * called twice right after initrd is mounted and right before init is - * exec'd. If such modules are on either initrd or rootfs, they will be - * loaded before control is passed to userland. - */ -void __init load_default_modules(void) -{ - load_default_elevator_module(); -} - static int run_init_process(const char *init_filename) { argv_init[0] = init_filename; @@ -1176,5 +1165,4 @@ static noinline void __init kernel_init_freeable(void) */ integrity_load_keys(); - load_default_modules(); } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7a8429f8e280..39eb36ba36ad 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -493,7 +493,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, } /** - * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * @@ -502,8 +502,8 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. */ -static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) +static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, + struct cgroup_subsys *ss) { lockdep_assert_held(&cgroup_mutex); @@ -523,6 +523,35 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, return cgroup_css(cgrp, ss); } +/** + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get the effective css of @cgrp for @ss. The effective css is + * defined as the matching css of the nearest ancestor including self which + * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, + * the root css is returned, so this function always returns a valid css. + * + * The returned css is not guaranteed to be online, and therefore it is the + * callers responsiblity to tryget a reference for it. + */ +struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + do { + css = cgroup_css(cgrp, ss); + + if (css) + return css; + cgrp = cgroup_parent(cgrp); + } while (cgrp); + + return init_css_set.subsys[ss->id]; +} + /** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest @@ -605,10 +634,11 @@ EXPORT_SYMBOL_GPL(of_css); * * Should be called under cgroup_[tree_]mutex. */ -#define for_each_e_css(css, ssid, cgrp) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ - ; \ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css_by_mask(cgrp, \ + cgroup_subsys[(ssid)]))) \ + ; \ else /** @@ -1007,7 +1037,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * @ss is in this hierarchy, so we want the * effective css from @cgrp. */ - template[i] = cgroup_e_css(cgrp, ss); + template[i] = cgroup_e_css_by_mask(cgrp, ss); } else { /* * @ss is not in this hierarchy, so we don't want @@ -3024,7 +3054,7 @@ static int cgroup_apply_control(struct cgroup *cgrp) return ret; /* - * At this point, cgroup_e_css() results reflect the new csses + * At this point, cgroup_e_css_by_mask() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2868d85f1fb1..fac0ddf8a8e2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -764,9 +764,9 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return NULL; - if (!bio->bi_css) + if (!bio->bi_blkg) return NULL; - return cgroup_get_kernfs_id(bio->bi_css->cgroup); + return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); } #else static union kernfs_node_id * diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 54c248526b55..1928009f506e 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -6,6 +6,7 @@ #include #include #include +#include #define PIPE_PARANOIA /* for now */ @@ -1464,10 +1465,11 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, } EXPORT_SYMBOL(csum_and_copy_from_iter_full); -size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, +size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, struct iov_iter *i) { const char *from = addr; + __wsum *csum = csump; __wsum sum, next; size_t off = 0; @@ -1510,6 +1512,21 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, } EXPORT_SYMBOL(csum_and_copy_to_iter); +size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, + struct iov_iter *i) +{ + struct ahash_request *hash = hashp; + struct scatterlist sg; + size_t copied; + + copied = copy_to_iter(addr, bytes, i); + sg_init_one(&sg, addr, copied); + ahash_request_set_crypt(hash, &sg, NULL, copied); + crypto_ahash_update(hash); + return copied; +} +EXPORT_SYMBOL(hash_and_copy_to_iter); + int iov_iter_npages(const struct iov_iter *i, int maxpages) { size_t size = i->count; diff --git a/lib/sbitmap.c b/lib/sbitmap.c index fdd1b8aa8ac6..65c2d06250a6 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -20,6 +20,47 @@ #include #include +/* + * See if we have deferred clears that we can batch move + */ +static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index) +{ + unsigned long mask, val; + unsigned long __maybe_unused flags; + bool ret = false; + + /* Silence bogus lockdep warning */ +#if defined(CONFIG_LOCKDEP) + local_irq_save(flags); +#endif + spin_lock(&sb->map[index].swap_lock); + + if (!sb->map[index].cleared) + goto out_unlock; + + /* + * First get a stable cleared mask, setting the old mask to 0. + */ + do { + mask = sb->map[index].cleared; + } while (cmpxchg(&sb->map[index].cleared, mask, 0) != mask); + + /* + * Now clear the masked bits in our free word + */ + do { + val = sb->map[index].word; + } while (cmpxchg(&sb->map[index].word, val, val & ~mask) != val); + + ret = true; +out_unlock: + spin_unlock(&sb->map[index].swap_lock); +#if defined(CONFIG_LOCKDEP) + local_irq_restore(flags); +#endif + return ret; +} + int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node) { @@ -59,6 +100,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, for (i = 0; i < sb->map_nr; i++) { sb->map[i].depth = min(depth, bits_per_word); depth -= sb->map[i].depth; + spin_lock_init(&sb->map[i].swap_lock); } return 0; } @@ -69,6 +111,9 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth) unsigned int bits_per_word = 1U << sb->shift; unsigned int i; + for (i = 0; i < sb->map_nr; i++) + sbitmap_deferred_clear(sb, i); + sb->depth = depth; sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); @@ -111,6 +156,24 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth, return nr; } +static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index, + unsigned int alloc_hint, bool round_robin) +{ + int nr; + + do { + nr = __sbitmap_get_word(&sb->map[index].word, + sb->map[index].depth, alloc_hint, + !round_robin); + if (nr != -1) + break; + if (!sbitmap_deferred_clear(sb, index)) + break; + } while (1); + + return nr; +} + int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin) { unsigned int i, index; @@ -118,24 +181,28 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin) index = SB_NR_TO_INDEX(sb, alloc_hint); + /* + * Unless we're doing round robin tag allocation, just use the + * alloc_hint to find the right word index. No point in looping + * twice in find_next_zero_bit() for that case. + */ + if (round_robin) + alloc_hint = SB_NR_TO_BIT(sb, alloc_hint); + else + alloc_hint = 0; + for (i = 0; i < sb->map_nr; i++) { - nr = __sbitmap_get_word(&sb->map[index].word, - sb->map[index].depth, - SB_NR_TO_BIT(sb, alloc_hint), - !round_robin); + nr = sbitmap_find_bit_in_index(sb, index, alloc_hint, + round_robin); if (nr != -1) { nr += index << sb->shift; break; } /* Jump to next index. */ - index++; - alloc_hint = index << sb->shift; - - if (index >= sb->map_nr) { + alloc_hint = 0; + if (++index >= sb->map_nr) index = 0; - alloc_hint = 0; - } } return nr; @@ -151,6 +218,7 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, index = SB_NR_TO_INDEX(sb, alloc_hint); for (i = 0; i < sb->map_nr; i++) { +again: nr = __sbitmap_get_word(&sb->map[index].word, min(sb->map[index].depth, shallow_depth), SB_NR_TO_BIT(sb, alloc_hint), true); @@ -159,6 +227,9 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, break; } + if (sbitmap_deferred_clear(sb, index)) + goto again; + /* Jump to next index. */ index++; alloc_hint = index << sb->shift; @@ -178,7 +249,7 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb) unsigned int i; for (i = 0; i < sb->map_nr; i++) { - if (sb->map[i].word) + if (sb->map[i].word & ~sb->map[i].cleared) return true; } return false; @@ -191,9 +262,10 @@ bool sbitmap_any_bit_clear(const struct sbitmap *sb) for (i = 0; i < sb->map_nr; i++) { const struct sbitmap_word *word = &sb->map[i]; + unsigned long mask = word->word & ~word->cleared; unsigned long ret; - ret = find_first_zero_bit(&word->word, word->depth); + ret = find_first_zero_bit(&mask, word->depth); if (ret < word->depth) return true; } @@ -201,23 +273,36 @@ bool sbitmap_any_bit_clear(const struct sbitmap *sb) } EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear); -unsigned int sbitmap_weight(const struct sbitmap *sb) +static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set) { unsigned int i, weight = 0; for (i = 0; i < sb->map_nr; i++) { const struct sbitmap_word *word = &sb->map[i]; - weight += bitmap_weight(&word->word, word->depth); + if (set) + weight += bitmap_weight(&word->word, word->depth); + else + weight += bitmap_weight(&word->cleared, word->depth); } return weight; } -EXPORT_SYMBOL_GPL(sbitmap_weight); + +static unsigned int sbitmap_weight(const struct sbitmap *sb) +{ + return __sbitmap_weight(sb, true); +} + +static unsigned int sbitmap_cleared(const struct sbitmap *sb) +{ + return __sbitmap_weight(sb, false); +} void sbitmap_show(struct sbitmap *sb, struct seq_file *m) { seq_printf(m, "depth=%u\n", sb->depth); - seq_printf(m, "busy=%u\n", sbitmap_weight(sb)); + seq_printf(m, "busy=%u\n", sbitmap_weight(sb) - sbitmap_cleared(sb)); + seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb)); seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift); seq_printf(m, "map_nr=%u\n", sb->map_nr); } @@ -325,6 +410,7 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, sbq->min_shallow_depth = UINT_MAX; sbq->wake_batch = sbq_calc_wake_batch(sbq, depth); atomic_set(&sbq->wake_index, 0); + atomic_set(&sbq->ws_active, 0); sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); if (!sbq->ws) { @@ -440,6 +526,9 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) { int i, wake_index; + if (!atomic_read(&sbq->ws_active)) + return NULL; + wake_index = atomic_read(&sbq->wake_index); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; @@ -509,7 +598,8 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu) { - sbitmap_clear_bit_unlock(&sbq->sb, nr); + sbitmap_deferred_clear_bit(&sbq->sb, nr); + /* * Pairs with the memory barrier in set_current_state() to ensure the * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker @@ -564,6 +654,7 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) seq_printf(m, "wake_batch=%u\n", sbq->wake_batch); seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index)); + seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active)); seq_puts(m, "ws={\n"); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { @@ -579,3 +670,48 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_show); + +void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, + struct sbq_wait_state *ws, + struct sbq_wait *sbq_wait) +{ + if (!sbq_wait->sbq) { + sbq_wait->sbq = sbq; + atomic_inc(&sbq->ws_active); + } + add_wait_queue(&ws->wait, &sbq_wait->wait); +} +EXPORT_SYMBOL_GPL(sbitmap_add_wait_queue); + +void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait) +{ + list_del_init(&sbq_wait->wait.entry); + if (sbq_wait->sbq) { + atomic_dec(&sbq_wait->sbq->ws_active); + sbq_wait->sbq = NULL; + } +} +EXPORT_SYMBOL_GPL(sbitmap_del_wait_queue); + +void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, + struct sbq_wait_state *ws, + struct sbq_wait *sbq_wait, int state) +{ + if (!sbq_wait->sbq) { + atomic_inc(&sbq->ws_active); + sbq_wait->sbq = sbq; + } + prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state); +} +EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait); + +void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, + struct sbq_wait *sbq_wait) +{ + finish_wait(&ws->wait, &sbq_wait->wait); + if (sbq_wait->sbq) { + atomic_dec(&sbq->ws_active); + sbq_wait->sbq = NULL; + } +} +EXPORT_SYMBOL_GPL(sbitmap_finish_wait); diff --git a/mm/page_io.c b/mm/page_io.c index d4d1c89bcddd..3475733b1926 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -140,7 +140,7 @@ out: unlock_page(page); WRITE_ONCE(bio->bi_private, NULL); bio_put(bio); - wake_up_process(waiter); + blk_wake_io_task(waiter); put_task_struct(waiter); } @@ -339,7 +339,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, goto out; } bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); - bio_associate_blkcg_from_page(bio, page); + bio_associate_blkg_from_page(bio, page); count_swpout_vm_event(page); set_page_writeback(page); unlock_page(page); @@ -405,11 +405,12 @@ int swap_readpage(struct page *page, bool synchronous) bio_get(bio); qc = submit_bio(bio); while (synchronous) { - set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(bio->bi_private)) break; - if (!blk_poll(disk->queue, qc)) + if (!blk_poll(disk->queue, qc, true)) break; } __set_current_state(TASK_RUNNING); diff --git a/net/core/datagram.c b/net/core/datagram.c index 4bf62b1afa3b..b2651bb6d2a3 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -408,27 +408,20 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) } EXPORT_SYMBOL(skb_kill_datagram); -/** - * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. - * @skb: buffer to copy - * @offset: offset in the buffer to start copying from - * @to: iovec iterator to copy to - * @len: amount of data to copy from buffer to iovec - */ -int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, - struct iov_iter *to, int len) +int __skb_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, bool fault_short, + size_t (*cb)(const void *, size_t, void *, struct iov_iter *), + void *data) { int start = skb_headlen(skb); int i, copy = start - offset, start_off = offset, n; struct sk_buff *frag_iter; - trace_skb_copy_datagram_iovec(skb, len); - /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; - n = copy_to_iter(skb->data + offset, copy, to); + n = cb(skb->data + offset, copy, data, to); offset += n; if (n != copy) goto short_copy; @@ -445,11 +438,14 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { + struct page *page = skb_frag_page(frag); + u8 *vaddr = kmap(page); + if (copy > len) copy = len; - n = copy_page_to_iter(skb_frag_page(frag), - frag->page_offset + offset - - start, copy, to); + n = cb(vaddr + frag->page_offset + + offset - start, copy, data, to); + kunmap(page); offset += n; if (n != copy) goto short_copy; @@ -468,8 +464,8 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, if ((copy = end - offset) > 0) { if (copy > len) copy = len; - if (skb_copy_datagram_iter(frag_iter, offset - start, - to, copy)) + if (__skb_datagram_iter(frag_iter, offset - start, + to, copy, fault_short, cb, data)) goto fault; if ((len -= copy) == 0) return 0; @@ -490,11 +486,50 @@ fault: return -EFAULT; short_copy: - if (iov_iter_count(to)) + if (fault_short || iov_iter_count(to)) goto fault; return 0; } + +/** + * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator + * and update a hash. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + * @hash: hash request to update + */ +int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, + struct ahash_request *hash) +{ + return __skb_datagram_iter(skb, offset, to, len, true, + hash_and_copy_to_iter, hash); +} +EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); + +static size_t simple_copy_to_iter(const void *addr, size_t bytes, + void *data __always_unused, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} + +/** + * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + */ +int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len) +{ + trace_skb_copy_datagram_iovec(skb, len); + return __skb_datagram_iter(skb, offset, to, len, false, + simple_copy_to_iter, NULL); +} EXPORT_SYMBOL(skb_copy_datagram_iter); /** @@ -645,87 +680,21 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) } EXPORT_SYMBOL(zerocopy_sg_from_iter); +/** + * skb_copy_and_csum_datagram_iter - Copy datagram to an iovec iterator + * and update a checksum. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + * @csump: checksum pointer + */ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, __wsum *csump) { - int start = skb_headlen(skb); - int i, copy = start - offset, start_off = offset; - struct sk_buff *frag_iter; - int pos = 0; - int n; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to); - offset += n; - if (n != copy) - goto fault; - if ((len -= copy) == 0) - return 0; - pos = copy; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - if ((copy = end - offset) > 0) { - __wsum csum2 = 0; - struct page *page = skb_frag_page(frag); - u8 *vaddr = kmap(page); - - if (copy > len) - copy = len; - n = csum_and_copy_to_iter(vaddr + frag->page_offset + - offset - start, copy, - &csum2, to); - kunmap(page); - offset += n; - if (n != copy) - goto fault; - *csump = csum_block_add(*csump, csum2, pos); - if (!(len -= copy)) - return 0; - pos += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - if ((copy = end - offset) > 0) { - __wsum csum2 = 0; - if (copy > len) - copy = len; - if (skb_copy_and_csum_datagram(frag_iter, - offset - start, - to, copy, - &csum2)) - goto fault; - *csump = csum_block_add(*csump, csum2, pos); - if ((len -= copy) == 0) - return 0; - offset += copy; - pos += copy; - } - start = end; - } - if (!len) - return 0; - -fault: - iov_iter_revert(to, offset - start_off); - return -EFAULT; + return __skb_datagram_iter(skb, offset, to, len, true, + csum_and_copy_to_iter, csump); } /**