
for-5.1/block-20190302

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAlx63XIQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpp2vEACfrrQsap7R+Av28mmXpmXi2FPa3g5Tev1t
 yYjK2qHvhlMZjPTYw3hCmbYdDDczlF7PEgSE2x2DjdcsYapb8Fy1lZ2X16c7ztBR
 HD/t9b5AVSQsczZzKgv3RqsNtTnjzS5V0A8XH8FAP2QRgiwDMwSN6G0FP0JBLbE/
 ZgxQrH1Iy1F33Wz4hI3Z7dEghKPZrH1IlegkZCEu47q9SlWS76qUetSy2GEtchOl
 3Lgu54mQZyVdI5/QZf9DyMDLF6dIz3tYU2qhuo01AHjGRCC72v86p8sIiXcUr94Q
 8pbegJhJ/g8KBol9Qhv3+pWG/QUAZwi/ZwasTkK+MJ4klRXfOrznxPubW1z6t9Vn
 QRo39Po5SqqP0QWAscDxCFjESIQlWlKa+LZurJL7DJDCUGrSgzTpnVwFqKwc5zTP
 HJa5MT2tEeL2TfUYRYCfh0ZV0elINdHA1y1klDBh38drh4EWr2gW8xdseGYXqRjh
 fLgEpoF7VQ8kTvxKN+E4jZXkcZmoLmefp0ZyAbblS6IawpPVC7kXM9Fdn2OU8f2c
 fjVjvSiqxfeN6dnpfeLDRbbN9894HwgP/LPropJOQ7KmjCorQq5zMDkAvoh3tElq
 qwluRqdBJpWT/F05KweY+XVW8OawIycmUWqt6JrVNoIDAK31auHQv47kR0VA4OvE
 DRVVhYpocw==
 =VBaU
 -----END PGP SIGNATURE-----

Merge tag 'for-5.1/block-20190302' of git://git.kernel.dk/linux-block

Pull block layer updates from Jens Axboe:
 "Not a huge amount of changes in this round, the biggest one is that we
  finally have Ming's multi-page bvec support merged. Apart from that,
  this pull request contains:

   - Small series that avoids quiescing the queue for sysfs changes that
     match what we currently have (Aleksei)

   - Series of bcache fixes (via Coly)

   - Series of lightnvm fixes (via Mathias)

   - NVMe pull request from Christoph. Nothing major, just SPDX/license
     cleanups, RR mp policy (Hannes), and little fixes (Bart,
     Chaitanya).

   - BFQ series (Paolo)

   - Save blk-mq cpu -> hw queue mapping, removing a pointer indirection
     for the fast path (Jianchao)

   - fops->iopoll() added for async IO polling, this is a feature that
     the upcoming io_uring interface will use (Christoph, me)

   - Partition scan loop fixes (Dongli)

   - mtip32xx conversion from managed resource API (Christoph)

   - cdrom registration race fix (Guenter)

   - MD pull from Song, two minor fixes.

   - Various documentation fixes (Marcos)

   - Multi-page bvec feature. This brings a lot of nice improvements
     with it, like more efficient splitting, larger IOs can be supported
     without growing the bvec table size, and so on. (Ming)

   - Various little fixes to core and drivers"

* tag 'for-5.1/block-20190302' of git://git.kernel.dk/linux-block: (117 commits)
  block: fix updating bio's front segment size
  block: Replace function name in string with __func__
  nbd: propagate genlmsg_reply return code
  floppy: remove set but not used variable 'q'
  null_blk: fix checking for REQ_FUA
  block: fix NULL pointer dereference in register_disk
  fs: fix guard_bio_eod to check for real EOD errors
  blk-mq: use HCTX_TYPE_DEFAULT but not 0 to index blk_mq_tag_set->map
  block: optimize bvec iteration in bvec_iter_advance
  block: introduce mp_bvec_for_each_page() for iterating over page
  block: optimize blk_bio_segment_split for single-page bvec
  block: optimize __blk_segment_map_sg() for single-page bvec
  block: introduce bvec_nth_page()
  iomap: wire up the iopoll method
  block: add bio_set_polled() helper
  block: wire up block device iopoll method
  fs: add an iopoll method to struct file_operations
  loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()
  loop: do not print warn message if partition scan is successful
  block: bounce: make sure that bvec table is updated
  ...
hifive-unleashed-5.1
Linus Torvalds 2019-03-08 14:12:17 -08:00
commit 80201fe175
114 changed files with 1471 additions and 1124 deletions

@@ -117,3 +117,28 @@ Other implications:
 size limitations and the limitations of the underlying devices. Thus
 there's no need to define ->merge_bvec_fn() callbacks for individual block
 drivers.
+
+Usage of helpers:
+=================
+
+* The following helpers whose names have the suffix of "_all" can only be used
+on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers
+shouldn't use them because the bio may have been split before it reached the
+driver.
+
+	bio_for_each_segment_all()
+	bio_first_bvec_all()
+	bio_first_page_all()
+	bio_last_bvec_all()
+
+* The following helpers iterate over single-page segment. The passed 'struct
+bio_vec' will contain a single-page IO vector during the iteration
+
+	bio_for_each_segment()
+	bio_for_each_segment_all()
+
+* The following helpers iterate over multi-page bvec. The passed 'struct
+bio_vec' will contain a multi-page IO vector during the iteration
+
+	bio_for_each_bvec()
+	rq_for_each_bvec()
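
As a rough illustration of the distinction above (not part of this merge; the two helper functions below are made up for the example), a driver-style walk over multi-page bvecs and a single-page segment walk of the same bio both arrive at the same byte total:

	#include <linux/bio.h>

	/* Hypothetical example: sum the payload of a fully built bio. */
	static unsigned int count_bytes_multipage(struct bio *bio)
	{
		struct bio_vec bv;	/* may span several contiguous pages */
		struct bvec_iter iter;
		unsigned int bytes = 0;

		bio_for_each_bvec(bv, bio, iter)
			bytes += bv.bv_len;
		return bytes;
	}

	static unsigned int count_bytes_singlepage(struct bio *bio)
	{
		struct bio_vec bv;	/* always confined to one page */
		struct bvec_iter iter;
		unsigned int bytes = 0;

		bio_for_each_segment(bv, bio, iter)
			bytes += bv.bv_len;
		return bytes;
	}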

@@ -857,6 +857,7 @@ struct file_operations {
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+	int (*iopoll)(struct kiocb *kiocb, bool spin);
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
@@ -902,6 +903,8 @@ otherwise noted.
   write_iter: possibly asynchronous write with iov_iter as source
 
+  iopoll: called when aio wants to poll for completions on HIPRI iocbs
+
   iterate: called when the VFS needs to read the directory contents
 
   iterate_shared: called when the VFS needs to read the directory contents
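
As a hedged sketch of how the new hook is meant to be used (the function name below is illustrative, not the exact in-tree code), a block-device ->iopoll implementation essentially maps the kiocb back to its request queue and polls the completion cookie recorded at submission time:

	#include <linux/blkdev.h>
	#include <linux/fs.h>

	/* Hypothetical example; assumes ki_cookie was set when the I/O was queued. */
	static int example_blkdev_iopoll(struct kiocb *kiocb, bool spin)
	{
		struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
		struct request_queue *q = bdev_get_queue(bdev);

		return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
	}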

@@ -230,11 +230,16 @@ static struct kmem_cache *bfq_pool;
 #define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)
 
 /* hw_tag detection: parallel requests threshold and min samples needed. */
-#define BFQ_HW_QUEUE_THRESHOLD	4
+#define BFQ_HW_QUEUE_THRESHOLD	3
 #define BFQ_HW_QUEUE_SAMPLES	32
 
 #define BFQQ_SEEK_THR		(sector_t)(8 * 100)
 #define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
+#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
+	(get_sdist(last_pos, rq) >		\
+	 BFQQ_SEEK_THR &&			\
+	 (!blk_queue_nonrot(bfqd->queue) ||	\
+	  blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
 #define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
 #define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 19)
@@ -623,26 +628,6 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 		bfqq->pos_root = NULL;
 }
 
-/*
- * Tell whether there are active queues with different weights or
- * active groups.
- */
-static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
-{
-	/*
-	 * For queue weights to differ, queue_weights_tree must contain
-	 * at least two nodes.
-	 */
-	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
-		(bfqd->queue_weights_tree.rb_node->rb_left ||
-		 bfqd->queue_weights_tree.rb_node->rb_right)
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	       ) ||
-	       (bfqd->num_groups_with_pending_reqs > 0
-#endif
-	       );
-}
-
 /*
  * The following function returns true if every queue must receive the
  * same share of the throughput (this condition is used when deciding
@@ -651,25 +636,48 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
  *
  * Such a scenario occurs when:
  * 1) all active queues have the same weight,
- * 2) all active groups at the same level in the groups tree have the same
- *    weight,
+ * 2) all active queues belong to the same I/O-priority class,
  * 3) all active groups at the same level in the groups tree have the same
+ *    weight,
+ * 4) all active groups at the same level in the groups tree have the same
  *    number of children.
  *
  * Unfortunately, keeping the necessary state for evaluating exactly
  * the last two symmetry sub-conditions above would be quite complex
  * and time consuming. Therefore this function evaluates, instead,
- * only the following stronger two sub-conditions, for which it is
+ * only the following stronger three sub-conditions, for which it is
  * much easier to maintain the needed state:
  * 1) all active queues have the same weight,
- * 2) there are no active groups.
+ * 2) all active queues belong to the same I/O-priority class,
+ * 3) there are no active groups.
  * In particular, the last condition is always true if hierarchical
  * support or the cgroups interface are not enabled, thus no state
  * needs to be maintained in this case.
  */
 static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
 {
-	return !bfq_varied_queue_weights_or_active_groups(bfqd);
+	/*
+	 * For queue weights to differ, queue_weights_tree must contain
+	 * at least two nodes.
+	 */
+	bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
+		(bfqd->queue_weights_tree.rb_node->rb_left ||
+		 bfqd->queue_weights_tree.rb_node->rb_right);
+
+	bool multiple_classes_busy =
+		(bfqd->busy_queues[0] && bfqd->busy_queues[1]) ||
+		(bfqd->busy_queues[0] && bfqd->busy_queues[2]) ||
+		(bfqd->busy_queues[1] && bfqd->busy_queues[2]);
+
+	/*
+	 * For queue weights to differ, queue_weights_tree must contain
+	 * at least two nodes.
+	 */
+	return !(varied_queue_weights || multiple_classes_busy
+#ifdef BFQ_GROUP_IOSCHED_ENABLED
+		 || bfqd->num_groups_with_pending_reqs > 0
+#endif
+		);
 }
 
 /*
@@ -728,15 +736,14 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	/*
 	 * In the unlucky event of an allocation failure, we just
 	 * exit. This will cause the weight of queue to not be
-	 * considered in bfq_varied_queue_weights_or_active_groups,
-	 * which, in its turn, causes the scenario to be deemed
-	 * wrongly symmetric in case bfqq's weight would have been
-	 * the only weight making the scenario asymmetric. On the
-	 * bright side, no unbalance will however occur when bfqq
-	 * becomes inactive again (the invocation of this function
-	 * is triggered by an activation of queue). In fact,
-	 * bfq_weights_tree_remove does nothing if
-	 * !bfqq->weight_counter.
+	 * considered in bfq_symmetric_scenario, which, in its turn,
+	 * causes the scenario to be deemed wrongly symmetric in case
+	 * bfqq's weight would have been the only weight making the
+	 * scenario asymmetric. On the bright side, no unbalance will
+	 * however occur when bfqq becomes inactive again (the
+	 * invocation of this function is triggered by an activation
+	 * of queue). In fact, bfq_weights_tree_remove does nothing
+	 * if !bfqq->weight_counter.
 	 */
 	if (unlikely(!bfqq->weight_counter))
 		return;
@@ -747,6 +754,7 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 inc_counter:
 	bfqq->weight_counter->num_active++;
+	bfqq->ref++;
 }
 
 /*
@@ -771,6 +779,7 @@ void __bfq_weights_tree_remove(struct bfq_data *bfqd,
 
 reset_entity_pointer:
 	bfqq->weight_counter = NULL;
+	bfq_put_queue(bfqq);
 }
 
 /*
@@ -782,9 +791,6 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
 {
 	struct bfq_entity *entity = bfqq->entity.parent;
 
-	__bfq_weights_tree_remove(bfqd, bfqq,
-				  &bfqd->queue_weights_tree);
-
 	for_each_entity(entity) {
 		struct bfq_sched_data *sd = entity->my_sched_data;
@@ -818,6 +824,15 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
 			bfqd->num_groups_with_pending_reqs--;
 		}
 	}
+
+	/*
+	 * Next function is invoked last, because it causes bfqq to be
+	 * freed if the following holds: bfqq is not in service and
+	 * has no dispatched request. DO NOT use bfqq after the next
+	 * function invocation.
+	 */
+	__bfq_weights_tree_remove(bfqd, bfqq,
+				  &bfqd->queue_weights_tree);
 }
 
 /*
@@ -873,7 +888,8 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
 static unsigned long bfq_serv_to_charge(struct request *rq,
 					struct bfq_queue *bfqq)
 {
-	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
+	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 ||
+	    !bfq_symmetric_scenario(bfqq->bfqd))
 		return blk_rq_sectors(rq);
 
 	return blk_rq_sectors(rq) * bfq_async_charge_factor;
@@ -907,8 +923,10 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
 		 */
 		return;
 
-	new_budget = max_t(unsigned long, bfqq->max_budget,
-			   bfq_serv_to_charge(next_rq, bfqq));
+	new_budget = max_t(unsigned long,
+			   max_t(unsigned long, bfqq->max_budget,
+				 bfq_serv_to_charge(next_rq, bfqq)),
+			   entity->service);
 	if (entity->budget != new_budget) {
 		entity->budget = new_budget;
 		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
@@ -1011,7 +1029,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
 
 static int bfqq_process_refs(struct bfq_queue *bfqq)
 {
-	return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
+	return bfqq->ref - bfqq->allocated - bfqq->entity.on_st -
+		(bfqq->weight_counter != NULL);
 }
 
 /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
@@ -1380,7 +1399,15 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
 {
 	struct bfq_entity *entity = &bfqq->entity;
 
-	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
+	/*
+	 * In the next compound condition, we check also whether there
+	 * is some budget left, because otherwise there is no point in
+	 * trying to go on serving bfqq with this same budget: bfqq
+	 * would be expired immediately after being selected for
+	 * service. This would only cause useless overhead.
+	 */
+	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time &&
+	    bfq_bfqq_budget_left(bfqq) > 0) {
 		/*
 		 * We do not clear the flag non_blocking_wait_rq here, as
 		 * the latter is used in bfq_activate_bfqq to signal
@@ -2217,14 +2244,15 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		return NULL;
 
 	/* If there is only one backlogged queue, don't search. */
-	if (bfqd->busy_queues == 1)
+	if (bfq_tot_busy_queues(bfqd) == 1)
 		return NULL;
 
 	in_service_bfqq = bfqd->in_service_queue;
 
 	if (in_service_bfqq && in_service_bfqq != bfqq &&
 	    likely(in_service_bfqq != &bfqd->oom_bfqq) &&
-	    bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
+	    bfq_rq_close_to_sector(io_struct, request,
+				   bfqd->in_serv_last_pos) &&
 	    bfqq->entity.parent == in_service_bfqq->entity.parent &&
 	    bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
 		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
@@ -2742,7 +2770,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
 		if ((bfqd->rq_in_driver > 0 ||
 		     now_ns - bfqd->last_completion < BFQ_MIN_TT)
-		    && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
+		    && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq))
 			bfqd->sequential_samples++;
 
 		bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
@@ -2764,6 +2792,8 @@ update_rate_and_reset:
 	bfq_update_rate_reset(bfqd, rq);
 update_last_values:
 	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
+	if (RQ_BFQQ(rq) == bfqd->in_service_queue)
+		bfqd->in_serv_last_pos = bfqd->last_position;
 	bfqd->last_dispatch = now_ns;
 }
@@ -3274,16 +3304,32 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
 		 * requests, then the request pattern is isochronous
 		 * (see the comments on the function
 		 * bfq_bfqq_softrt_next_start()). Thus we can compute
-		 * soft_rt_next_start. If, instead, the queue still
-		 * has outstanding requests, then we have to wait for
-		 * the completion of all the outstanding requests to
-		 * discover whether the request pattern is actually
-		 * isochronous.
+		 * soft_rt_next_start. And we do it, unless bfqq is in
+		 * interactive weight raising. We do not do it in the
+		 * latter subcase, for the following reason. bfqq may
+		 * be conveying the I/O needed to load a soft
+		 * real-time application. Such an application will
+		 * actually exhibit a soft real-time I/O pattern after
+		 * it finally starts doing its job. But, if
+		 * soft_rt_next_start is computed here for an
+		 * interactive bfqq, and bfqq had received a lot of
+		 * service before remaining with no outstanding
+		 * request (likely to happen on a fast device), then
+		 * soft_rt_next_start would be assigned such a high
+		 * value that, for a very long time, bfqq would be
+		 * prevented from being possibly considered as soft
+		 * real time.
+		 *
+		 * If, instead, the queue still has outstanding
+		 * requests, then we have to wait for the completion
+		 * of all the outstanding requests to discover whether
+		 * the request pattern is actually isochronous.
 		 */
-		if (bfqq->dispatched == 0)
+		if (bfqq->dispatched == 0 &&
+		    bfqq->wr_coeff != bfqd->bfq_wr_coeff)
 			bfqq->soft_rt_next_start =
 				bfq_bfqq_softrt_next_start(bfqd, bfqq);
-		else {
+		else if (bfqq->dispatched > 0) {
 			/*
 			 * Schedule an update of soft_rt_next_start to when
 			 * the task may be discovered to be isochronous.
@@ -3376,53 +3422,13 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
 		bfq_bfqq_budget_timeout(bfqq);
 }
 
-/*
- * For a queue that becomes empty, device idling is allowed only if
- * this function returns true for the queue. As a consequence, since
- * device idling plays a critical role in both throughput boosting and
- * service guarantees, the return value of this function plays a
- * critical role in both these aspects as well.
- *
- * In a nutshell, this function returns true only if idling is
- * beneficial for throughput or, even if detrimental for throughput,
- * idling is however necessary to preserve service guarantees (low
- * latency, desired throughput distribution, ...). In particular, on
- * NCQ-capable devices, this function tries to return false, so as to
- * help keep the drives' internal queues full, whenever this helps the
- * device boost the throughput without causing any service-guarantee
- * issue.
- *
- * In more detail, the return value of this function is obtained by,
- * first, computing a number of boolean variables that take into
- * account throughput and service-guarantee issues, and, then,
- * combining these variables in a logical expression. Most of the
- * issues taken into account are not trivial. We discuss these issues
- * individually while introducing the variables.
- */
-static bool bfq_better_to_idle(struct bfq_queue *bfqq)
+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
+					     struct bfq_queue *bfqq)
 {
-	struct bfq_data *bfqd = bfqq->bfqd;
 	bool rot_without_queueing =
 		!blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
 		bfqq_sequential_and_IO_bound,
-		idling_boosts_thr, idling_boosts_thr_without_issues,
-		idling_needed_for_service_guarantees,
-		asymmetric_scenario;
-
-	if (bfqd->strict_guarantees)
-		return true;
-
-	/*
-	 * Idling is performed only if slice_idle > 0. In addition, we
-	 * do not idle if
-	 * (a) bfqq is async
-	 * (b) bfqq is in the idle io prio class: in this case we do
-	 * not idle because we want to minimize the bandwidth that
-	 * queues in this class can steal to higher-priority queues
-	 */
-	if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
-	    bfq_class_idle(bfqq))
-		return false;
+		idling_boosts_thr;
 
 	bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
 		bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq);
@@ -3454,8 +3460,7 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
 		bfqq_sequential_and_IO_bound);
 
 	/*
-	 * The value of the next variable,
-	 * idling_boosts_thr_without_issues, is equal to that of
+	 * The return value of this function is equal to that of
 	 * idling_boosts_thr, unless a special case holds. In this
 	 * special case, described below, idling may cause problems to
 	 * weight-raised queues.
@ -3472,217 +3477,252 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
* which enqueue several requests in advance, and further * which enqueue several requests in advance, and further
* reorder internally-queued requests. * reorder internally-queued requests.
* *
* For this reason, we force to false the value of * For this reason, we force to false the return value if
* idling_boosts_thr_without_issues if there are weight-raised * there are weight-raised busy queues. In this case, and if
* busy queues. In this case, and if bfqq is not weight-raised, * bfqq is not weight-raised, this guarantees that the device
* this guarantees that the device is not idled for bfqq (if, * is not idled for bfqq (if, instead, bfqq is weight-raised,
* instead, bfqq is weight-raised, then idling will be * then idling will be guaranteed by another variable, see
* guaranteed by another variable, see below). Combined with * below). Combined with the timestamping rules of BFQ (see
* the timestamping rules of BFQ (see [1] for details), this * [1] for details), this behavior causes bfqq, and hence any
* behavior causes bfqq, and hence any sync non-weight-raised * sync non-weight-raised queue, to get a lower number of
* queue, to get a lower number of requests served, and thus * requests served, and thus to ask for a lower number of
* to ask for a lower number of requests from the request * requests from the request pool, before the busy
* pool, before the busy weight-raised queues get served * weight-raised queues get served again. This often mitigates
* again. This often mitigates starvation problems in the * starvation problems in the presence of heavy write
* presence of heavy write workloads and NCQ, thereby * workloads and NCQ, thereby guaranteeing a higher
* guaranteeing a higher application and system responsiveness * application and system responsiveness in these hostile
* in these hostile scenarios. * scenarios.
*/ */
idling_boosts_thr_without_issues = idling_boosts_thr && return idling_boosts_thr &&
bfqd->wr_busy_queues == 0; bfqd->wr_busy_queues == 0;
}
/* /*
* There is then a case where idling must be performed not * There is a case where idling must be performed not for
* for throughput concerns, but to preserve service * throughput concerns, but to preserve service guarantees.
* guarantees. *
* * To introduce this case, we can note that allowing the drive
* To introduce this case, we can note that allowing the drive * to enqueue more than one request at a time, and hence
* to enqueue more than one request at a time, and hence * delegating de facto final scheduling decisions to the
* delegating de facto final scheduling decisions to the * drive's internal scheduler, entails loss of control on the
* drive's internal scheduler, entails loss of control on the * actual request service order. In particular, the critical
* actual request service order. In particular, the critical * situation is when requests from different processes happen
* situation is when requests from different processes happen * to be present, at the same time, in the internal queue(s)
* to be present, at the same time, in the internal queue(s) * of the drive. In such a situation, the drive, by deciding
* of the drive. In such a situation, the drive, by deciding * the service order of the internally-queued requests, does
* the service order of the internally-queued requests, does * determine also the actual throughput distribution among
* determine also the actual throughput distribution among * these processes. But the drive typically has no notion or
* these processes. But the drive typically has no notion or * concern about per-process throughput distribution, and
* concern about per-process throughput distribution, and * makes its decisions only on a per-request basis. Therefore,
* makes its decisions only on a per-request basis. Therefore, * the service distribution enforced by the drive's internal
* the service distribution enforced by the drive's internal * scheduler is likely to coincide with the desired
* scheduler is likely to coincide with the desired * device-throughput distribution only in a completely
* device-throughput distribution only in a completely * symmetric scenario where:
* symmetric scenario where: * (i) each of these processes must get the same throughput as
* (i) each of these processes must get the same throughput as * the others;
* the others; * (ii) the I/O of each process has the same properties, in
* (ii) the I/O of each process has the same properties, in * terms of locality (sequential or random), direction
* terms of locality (sequential or random), direction * (reads or writes), request sizes, greediness
* (reads or writes), request sizes, greediness * (from I/O-bound to sporadic), and so on.
* (from I/O-bound to sporadic), and so on. * In fact, in such a scenario, the drive tends to treat
* In fact, in such a scenario, the drive tends to treat * the requests of each of these processes in about the same
* the requests of each of these processes in about the same * way as the requests of the others, and thus to provide
* way as the requests of the others, and thus to provide * each of these processes with about the same throughput
* each of these processes with about the same throughput * (which is exactly the desired throughput distribution). In
* (which is exactly the desired throughput distribution). In * contrast, in any asymmetric scenario, device idling is
* contrast, in any asymmetric scenario, device idling is * certainly needed to guarantee that bfqq receives its
* certainly needed to guarantee that bfqq receives its * assigned fraction of the device throughput (see [1] for
* assigned fraction of the device throughput (see [1] for * details).
* details). * The problem is that idling may significantly reduce
* The problem is that idling may significantly reduce * throughput with certain combinations of types of I/O and
* throughput with certain combinations of types of I/O and * devices. An important example is sync random I/O, on flash
* devices. An important example is sync random I/O, on flash * storage with command queueing. So, unless bfqq falls in the
* storage with command queueing. So, unless bfqq falls in the * above cases where idling also boosts throughput, it would
* above cases where idling also boosts throughput, it would * be important to check conditions (i) and (ii) accurately,
* be important to check conditions (i) and (ii) accurately, * so as to avoid idling when not strictly needed for service
* so as to avoid idling when not strictly needed for service * guarantees.
* guarantees. *
* * Unfortunately, it is extremely difficult to thoroughly
* Unfortunately, it is extremely difficult to thoroughly * check condition (ii). And, in case there are active groups,
* check condition (ii). And, in case there are active groups, * it becomes very difficult to check condition (i) too. In
* it becomes very difficult to check condition (i) too. In * fact, if there are active groups, then, for condition (i)
* fact, if there are active groups, then, for condition (i) * to become false, it is enough that an active group contains
* to become false, it is enough that an active group contains * more active processes or sub-groups than some other active
* more active processes or sub-groups than some other active * group. More precisely, for condition (i) to hold because of
* group. More precisely, for condition (i) to hold because of * such a group, it is not even necessary that the group is
* such a group, it is not even necessary that the group is * (still) active: it is sufficient that, even if the group
* (still) active: it is sufficient that, even if the group * has become inactive, some of its descendant processes still
* has become inactive, some of its descendant processes still * have some request already dispatched but still waiting for
* have some request already dispatched but still waiting for * completion. In fact, requests have still to be guaranteed
* completion. In fact, requests have still to be guaranteed * their share of the throughput even after being
* their share of the throughput even after being * dispatched. In this respect, it is easy to show that, if a
* dispatched. In this respect, it is easy to show that, if a * group frequently becomes inactive while still having
* group frequently becomes inactive while still having * in-flight requests, and if, when this happens, the group is
* in-flight requests, and if, when this happens, the group is * not considered in the calculation of whether the scenario
* not considered in the calculation of whether the scenario * is asymmetric, then the group may fail to be guaranteed its
* is asymmetric, then the group may fail to be guaranteed its * fair share of the throughput (basically because idling may
* fair share of the throughput (basically because idling may * not be performed for the descendant processes of the group,
* not be performed for the descendant processes of the group, * but it had to be). We address this issue with the
* but it had to be). We address this issue with the * following bi-modal behavior, implemented in the function
* following bi-modal behavior, implemented in the function * bfq_symmetric_scenario().
* bfq_symmetric_scenario(). *
* * If there are groups with requests waiting for completion
* If there are groups with requests waiting for completion * (as commented above, some of these groups may even be
* (as commented above, some of these groups may even be * already inactive), then the scenario is tagged as
* already inactive), then the scenario is tagged as * asymmetric, conservatively, without checking any of the
* asymmetric, conservatively, without checking any of the * conditions (i) and (ii). So the device is idled for bfqq.
* conditions (i) and (ii). So the device is idled for bfqq. * This behavior matches also the fact that groups are created
* This behavior matches also the fact that groups are created * exactly if controlling I/O is a primary concern (to
* exactly if controlling I/O is a primary concern (to * preserve bandwidth and latency guarantees).
* preserve bandwidth and latency guarantees). *
* * On the opposite end, if there are no groups with requests
* On the opposite end, if there are no groups with requests * waiting for completion, then only condition (i) is actually
* waiting for completion, then only condition (i) is actually * controlled, i.e., provided that condition (i) holds, idling
* controlled, i.e., provided that condition (i) holds, idling * is not performed, regardless of whether condition (ii)
* is not performed, regardless of whether condition (ii) * holds. In other words, only if condition (i) does not hold,
* holds. In other words, only if condition (i) does not hold, * then idling is allowed, and the device tends to be
* then idling is allowed, and the device tends to be * prevented from queueing many requests, possibly of several
* prevented from queueing many requests, possibly of several * processes. Since there are no groups with requests waiting
* processes. Since there are no groups with requests waiting * for completion, then, to control condition (i) it is enough
* for completion, then, to control condition (i) it is enough * to check just whether all the queues with requests waiting
* to check just whether all the queues with requests waiting * for completion also have the same weight.
* for completion also have the same weight. *
* * Not checking condition (ii) evidently exposes bfqq to the
* Not checking condition (ii) evidently exposes bfqq to the * risk of getting less throughput than its fair share.
* risk of getting less throughput than its fair share. * However, for queues with the same weight, a further
* However, for queues with the same weight, a further * mechanism, preemption, mitigates or even eliminates this
* mechanism, preemption, mitigates or even eliminates this * problem. And it does so without consequences on overall
* problem. And it does so without consequences on overall * throughput. This mechanism and its benefits are explained
* throughput. This mechanism and its benefits are explained * in the next three paragraphs.
* in the next three paragraphs. *
* * Even if a queue, say Q, is expired when it remains idle, Q
* Even if a queue, say Q, is expired when it remains idle, Q * can still preempt the new in-service queue if the next
* can still preempt the new in-service queue if the next * request of Q arrives soon (see the comments on
* request of Q arrives soon (see the comments on * bfq_bfqq_update_budg_for_activation). If all queues and
* bfq_bfqq_update_budg_for_activation). If all queues and * groups have the same weight, this form of preemption,
* groups have the same weight, this form of preemption, * combined with the hole-recovery heuristic described in the
* combined with the hole-recovery heuristic described in the * comments on function bfq_bfqq_update_budg_for_activation,
* comments on function bfq_bfqq_update_budg_for_activation, * are enough to preserve a correct bandwidth distribution in
* are enough to preserve a correct bandwidth distribution in * the mid term, even without idling. In fact, even if not
* the mid term, even without idling. In fact, even if not * idling allows the internal queues of the device to contain
* idling allows the internal queues of the device to contain * many requests, and thus to reorder requests, we can rather
* many requests, and thus to reorder requests, we can rather * safely assume that the internal scheduler still preserves a
* safely assume that the internal scheduler still preserves a * minimum of mid-term fairness.
* minimum of mid-term fairness. *
* * More precisely, this preemption-based, idleless approach
* More precisely, this preemption-based, idleless approach * provides fairness in terms of IOPS, and not sectors per
* provides fairness in terms of IOPS, and not sectors per * second. This can be seen with a simple example. Suppose
* second. This can be seen with a simple example. Suppose * that there are two queues with the same weight, but that
* that there are two queues with the same weight, but that * the first queue receives requests of 8 sectors, while the
* the first queue receives requests of 8 sectors, while the * second queue receives requests of 1024 sectors. In
* second queue receives requests of 1024 sectors. In * addition, suppose that each of the two queues contains at
* addition, suppose that each of the two queues contains at * most one request at a time, which implies that each queue
* most one request at a time, which implies that each queue * always remains idle after it is served. Finally, after
* always remains idle after it is served. Finally, after * remaining idle, each queue receives very quickly a new
* remaining idle, each queue receives very quickly a new * request. It follows that the two queues are served
* request. It follows that the two queues are served * alternatively, preempting each other if needed. This
* alternatively, preempting each other if needed. This * implies that, although both queues have the same weight,
* implies that, although both queues have the same weight, * the queue with large requests receives a service that is
* the queue with large requests receives a service that is * 1024/8 times as high as the service received by the other
* 1024/8 times as high as the service received by the other * queue.
* queue. *
* * The motivation for using preemption instead of idling (for
* The motivation for using preemption instead of idling (for * queues with the same weight) is that, by not idling,
* queues with the same weight) is that, by not idling, * service guarantees are preserved (completely or at least in
* service guarantees are preserved (completely or at least in * part) without minimally sacrificing throughput. And, if
* part) without minimally sacrificing throughput. And, if * there is no active group, then the primary expectation for
* there is no active group, then the primary expectation for * this device is probably a high throughput.
* this device is probably a high throughput. *
* * We are now left only with explaining the additional
* We are now left only with explaining the additional * compound condition that is checked below for deciding
* compound condition that is checked below for deciding * whether the scenario is asymmetric. To explain this
* whether the scenario is asymmetric. To explain this * compound condition, we need to add that the function
* compound condition, we need to add that the function * bfq_symmetric_scenario checks the weights of only
* bfq_symmetric_scenario checks the weights of only * non-weight-raised queues, for efficiency reasons (see
* non-weight-raised queues, for efficiency reasons (see * comments on bfq_weights_tree_add()). Then the fact that
* comments on bfq_weights_tree_add()). Then the fact that * bfqq is weight-raised is checked explicitly here. More
* bfqq is weight-raised is checked explicitly here. More * precisely, the compound condition below takes into account
* precisely, the compound condition below takes into account * also the fact that, even if bfqq is being weight-raised,
* also the fact that, even if bfqq is being weight-raised, * the scenario is still symmetric if all queues with requests
* the scenario is still symmetric if all queues with requests * waiting for completion happen to be
* waiting for completion happen to be * weight-raised. Actually, we should be even more precise
* weight-raised. Actually, we should be even more precise * here, and differentiate between interactive weight raising
* here, and differentiate between interactive weight raising * and soft real-time weight raising.
* and soft real-time weight raising. *
* * As a side note, it is worth considering that the above
* As a side note, it is worth considering that the above * device-idling countermeasures may however fail in the
* device-idling countermeasures may however fail in the * following unlucky scenario: if idling is (correctly)
* following unlucky scenario: if idling is (correctly) * disabled in a time period during which all symmetry
* disabled in a time period during which all symmetry * sub-conditions hold, and hence the device is allowed to
* sub-conditions hold, and hence the device is allowed to * enqueue many requests, but at some later point in time some
* enqueue many requests, but at some later point in time some * sub-condition stops to hold, then it may become impossible
* sub-condition stops to hold, then it may become impossible * to let requests be served in the desired order until all
* to let requests be served in the desired order until all * the requests already queued in the device have been served.
* the requests already queued in the device have been served. */
*/ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
asymmetric_scenario = (bfqq->wr_coeff > 1 && struct bfq_queue *bfqq)
bfqd->wr_busy_queues < bfqd->busy_queues) || {
return (bfqq->wr_coeff > 1 &&
bfqd->wr_busy_queues <
bfq_tot_busy_queues(bfqd)) ||
!bfq_symmetric_scenario(bfqd); !bfq_symmetric_scenario(bfqd);
}
/*
* For a queue that becomes empty, device idling is allowed only if
* this function returns true for that queue. As a consequence, since
* device idling plays a critical role for both throughput boosting
* and service guarantees, the return value of this function plays a
* critical role as well.
*
* In a nutshell, this function returns true only if idling is
* beneficial for throughput or, even if detrimental for throughput,
* idling is however necessary to preserve service guarantees (low
* latency, desired throughput distribution, ...). In particular, on
* NCQ-capable devices, this function tries to return false, so as to
* help keep the drives' internal queues full, whenever this helps the
* device boost the throughput without causing any service-guarantee
* issue.
*
* Most of the issues taken into account to get the return value of
* this function are not trivial. We discuss these issues in the two
* functions providing the main pieces of information needed by this
* function.
*/
static bool bfq_better_to_idle(struct bfq_queue *bfqq)
{
struct bfq_data *bfqd = bfqq->bfqd;
bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar;
if (unlikely(bfqd->strict_guarantees))
return true;
/* /*
* Finally, there is a case where maximizing throughput is the * Idling is performed only if slice_idle > 0. In addition, we
* best choice even if it may cause unfairness toward * do not idle if
* bfqq. Such a case is when bfqq became active in a burst of * (a) bfqq is async
* queue activations. Queues that became active during a large * (b) bfqq is in the idle io prio class: in this case we do
* burst benefit only from throughput, as discussed in the * not idle because we want to minimize the bandwidth that
* comments on bfq_handle_burst. Thus, if bfqq became active * queues in this class can steal to higher-priority queues
* in a burst and not idling the device maximizes throughput,
* then the device must no be idled, because not idling the
* device provides bfqq and all other queues in the burst with
* maximum benefit. Combining this and the above case, we can
* now establish when idling is actually needed to preserve
* service guarantees.
*/ */
idling_needed_for_service_guarantees = if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); bfq_class_idle(bfqq))
return false;
idling_boosts_thr_with_no_issue =
idling_boosts_thr_without_issues(bfqd, bfqq);
idling_needed_for_service_guar =
idling_needed_for_service_guarantees(bfqd, bfqq);
/* /*
* We have now all the components we need to compute the * We have now the two components we need to compute the
* return value of the function, which is true only if idling * return value of the function, which is true only if idling
* either boosts the throughput (without issues), or is * either boosts the throughput (without issues), or is
* necessary to preserve service guarantees. * necessary to preserve service guarantees.
*/ */
return idling_boosts_thr_without_issues || return idling_boosts_thr_with_no_issue ||
idling_needed_for_service_guarantees; idling_needed_for_service_guar;
} }
/* /*
@@ -3934,7 +3974,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
 	 * belongs to CLASS_IDLE and other queues are waiting for
 	 * service.
 	 */
-	if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq)))
+	if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)))
 		goto return_rq;
 
 	bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
@@ -3952,7 +3992,7 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
 	 * most a call to dispatch for nothing
 	 */
 	return !list_empty_careful(&bfqd->dispatch) ||
-		bfqd->busy_queues > 0;
+		bfq_tot_busy_queues(bfqd) > 0;
 }
 
 static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
@@ -4006,9 +4046,10 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 			goto start_rq;
 	}
 
-	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
+	bfq_log(bfqd, "dispatch requests: %d busy queues",
+		bfq_tot_busy_queues(bfqd));
 
-	if (bfqd->busy_queues == 0)
+	if (bfq_tot_busy_queues(bfqd) == 0)
 		goto exit;
 
 	/*
@@ -4488,10 +4529,7 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		       struct request *rq)
 {
 	bfqq->seek_history <<= 1;
-	bfqq->seek_history |=
-		get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
-		(!blk_queue_nonrot(bfqd->queue) ||
-		  blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
+	bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq);
 }
 
 static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
@@ -4560,28 +4598,31 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
 
 		/*
-		 * There is just this request queued: if the request
-		 * is small and the queue is not to be expired, then
-		 * just exit.
+		 * There is just this request queued: if
+		 * - the request is small, and
+		 * - we are idling to boost throughput, and
+		 * - the queue is not to be expired,
+		 * then just exit.
 		 *
 		 * In this way, if the device is being idled to wait
 		 * for a new request from the in-service queue, we
 		 * avoid unplugging the device and committing the
-		 * device to serve just a small request. On the
-		 * contrary, we wait for the block layer to decide
-		 * when to unplug the device: hopefully, new requests
-		 * will be merged to this one quickly, then the device
-		 * will be unplugged and larger requests will be
-		 * dispatched.
+		 * device to serve just a small request. In contrast
+		 * we wait for the block layer to decide when to
+		 * unplug the device: hopefully, new requests will be
+		 * merged to this one quickly, then the device will be
+		 * unplugged and larger requests will be dispatched.
 		 */
-		if (small_req && !budget_timeout)
+		if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) &&
+		    !budget_timeout)
 			return;
 
 		/*
-		 * A large enough request arrived, or the queue is to
-		 * be expired: in both cases disk idling is to be
-		 * stopped, so clear wait_request flag and reset
-		 * timer.
+		 * A large enough request arrived, or idling is being
+		 * performed to preserve service guarantees, or
+		 * finally the queue is to be expired: in all these
+		 * cases disk idling is to be stopped, so clear
+		 * wait_request flag and reset timer.
 		 */
 		bfq_clear_bfqq_wait_request(bfqq);
 		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
@@ -4607,8 +4648,6 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
 	bool waiting, idle_timer_disabled = false;
 
 	if (new_bfqq) {
-		if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
-			new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
 		/*
 		 * Release the request's reference to the old bfqq
 		 * and make sure one is taken to the shared queue.
@@ -4751,6 +4790,8 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
 
 static void bfq_update_hw_tag(struct bfq_data *bfqd)
 {
+	struct bfq_queue *bfqq = bfqd->in_service_queue;
+
 	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
 				       bfqd->rq_in_driver);
@@ -4763,7 +4804,18 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
 	 * sum is not exact, as it's not taking into account deactivated
 	 * requests.
 	 */
-	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
+	if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
+		return;
+
+	/*
+	 * If active queue hasn't enough requests and can idle, bfq might not
+	 * dispatch sufficient requests to hardware. Don't zero hw_tag in this
+	 * case
+	 */
+	if (bfqq && bfq_bfqq_has_short_ttime(bfqq) &&
+	    bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] <
+	    BFQ_HW_QUEUE_THRESHOLD &&
+	    bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
 		return;
 
 	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
@@ -4834,11 +4886,14 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 	 * isochronous, and both requisites for this condition to hold
 	 * are now satisfied, then compute soft_rt_next_start (see the
 	 * comments on the function bfq_bfqq_softrt_next_start()). We
-	 * schedule this delayed check when bfqq expires, if it still
-	 * has in-flight requests.
+	 * do not compute soft_rt_next_start if bfqq is in interactive
+	 * weight raising (see the comments in bfq_bfqq_expire() for
+	 * an explanation). We schedule this delayed update when bfqq
+	 * expires, if it still has in-flight requests.
 	 */
 	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
-	    RB_EMPTY_ROOT(&bfqq->sort_list))
+	    RB_EMPTY_ROOT(&bfqq->sort_list) &&
+	    bfqq->wr_coeff != bfqd->bfq_wr_coeff)
 		bfqq->soft_rt_next_start =
 			bfq_bfqq_softrt_next_start(bfqd, bfqq);

@@ -501,10 +501,11 @@ struct bfq_data {
 	unsigned int num_groups_with_pending_reqs;
 
 	/*
-	 * Number of bfq_queues containing requests (including the
-	 * queue in service, even if it is idling).
+	 * Per-class (RT, BE, IDLE) number of bfq_queues containing
+	 * requests (including the queue in service, even if it is
+	 * idling).
 	 */
-	int busy_queues;
+	unsigned int busy_queues[3];
 	/* number of weight-raised busy @bfq_queues */
 	int wr_busy_queues;
 	/* number of queued requests */
@@ -537,6 +538,9 @@ struct bfq_data {
 	/* on-disk position of the last served request */
 	sector_t last_position;
 
+	/* position of the last served request for the in-service queue */
+	sector_t in_serv_last_pos;
+
 	/* time of last request completion (ns) */
 	u64 last_completion;
@@ -974,6 +978,7 @@ extern struct blkcg_policy blkcg_policy_bfq;
 struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
 struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
+unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
 struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
 struct bfq_entity *bfq_entity_of(struct rb_node *node);
 unsigned short bfq_ioprio_to_weight(int ioprio);

@@ -44,6 +44,12 @@ static unsigned int bfq_class_idx(struct bfq_entity *entity)
 		BFQ_DEFAULT_GRP_CLASS - 1;
 }
 
+unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd)
+{
+	return bfqd->busy_queues[0] + bfqd->busy_queues[1] +
+		bfqd->busy_queues[2];
+}
+
 static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
 						 bool expiration);
 
@@ -1513,7 +1519,7 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
 	struct bfq_sched_data *sd;
 	struct bfq_queue *bfqq;
 
-	if (bfqd->busy_queues == 0)
+	if (bfq_tot_busy_queues(bfqd) == 0)
 		return NULL;
 
 	/*
@@ -1665,10 +1671,7 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	bfq_clear_bfqq_busy(bfqq);
 
-	bfqd->busy_queues--;
-
-	if (!bfqq->dispatched)
-		bfq_weights_tree_remove(bfqd, bfqq);
+	bfqd->busy_queues[bfqq->ioprio_class - 1]--;
 
 	if (bfqq->wr_coeff > 1)
 		bfqd->wr_busy_queues--;
@@ -1676,6 +1679,9 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	bfqg_stats_update_dequeue(bfqq_group(bfqq));
 
 	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
+
+	if (!bfqq->dispatched)
+		bfq_weights_tree_remove(bfqd, bfqq);
 }
 
 /*
@@ -1688,7 +1694,7 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 	bfq_activate_bfqq(bfqd, bfqq);
 
 	bfq_mark_bfqq_busy(bfqq);
-	bfqd->busy_queues++;
+	bfqd->busy_queues[bfqq->ioprio_class - 1]++;
 
 	if (!bfqq->dispatched)
 		if (bfqq->wr_coeff == 1)

@@ -753,6 +753,8 @@ EXPORT_SYMBOL(bio_add_pc_page);
  * @page: page to add
  * @len: length of the data to add
  * @off: offset of the data in @page
+ * @same_page: if %true only merge if the new data is in the same physical
+ *	page as the last segment of the bio.
  *
  * Try to add the data at @page + @off to the last bvec of @bio. This is a
  * a useful optimisation for file systems with a block size smaller than the
@@ -761,19 +763,25 @@ EXPORT_SYMBOL(bio_add_pc_page);
  * Return %true on success or %false on failure.
  */
 bool __bio_try_merge_page(struct bio *bio, struct page *page,
-		unsigned int len, unsigned int off)
+		unsigned int len, unsigned int off, bool same_page)
 {
 	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
 		return false;
 
 	if (bio->bi_vcnt > 0) {
 		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+		phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
+			bv->bv_offset + bv->bv_len - 1;
+		phys_addr_t page_addr = page_to_phys(page);
 
-		if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
-			bv->bv_len += len;
-			bio->bi_iter.bi_size += len;
-			return true;
-		}
+		if (vec_end_addr + 1 != page_addr + off)
+			return false;
+		if (same_page && (vec_end_addr & PAGE_MASK) != page_addr)
+			return false;
+
+		bv->bv_len += len;
+		bio->bi_iter.bi_size += len;
+		return true;
 	}
 	return false;
 }
@@ -819,7 +827,7 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
 int bio_add_page(struct bio *bio, struct page *page,
 		 unsigned int len, unsigned int offset)
 {
-	if (!__bio_try_merge_page(bio, page, len, offset)) {
+	if (!__bio_try_merge_page(bio, page, len, offset, false)) {
 		if (bio_full(bio))
 			return 0;
 		__bio_add_page(bio, page, len, offset);
@@ -1072,8 +1080,9 @@ static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
 {
 	int i;
 	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		ssize_t ret;
 
 		ret = copy_page_from_iter(bvec->bv_page,
@@ -1103,8 +1112,9 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
 {
 	int i;
 	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		ssize_t ret;
 
 		ret = copy_page_to_iter(bvec->bv_page,
@@ -1126,8 +1136,9 @@ void bio_free_pages(struct bio *bio)
 {
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i)
+	bio_for_each_segment_all(bvec, bio, i, iter_all)
 		__free_page(bvec->bv_page);
 }
 EXPORT_SYMBOL(bio_free_pages);
@@ -1295,6 +1306,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
 	struct bio *bio;
 	int ret;
 	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
 
 	if (!iov_iter_count(iter))
 		return ERR_PTR(-EINVAL);
@@ -1368,7 +1380,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
 	return bio;
 
  out_unmap:
-	bio_for_each_segment_all(bvec, bio, j) {
+	bio_for_each_segment_all(bvec, bio, j, iter_all) {
 		put_page(bvec->bv_page);
 	}
 	bio_put(bio);
@@ -1379,11 +1391,12 @@ static void __bio_unmap_user(struct bio *bio)
 {
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	/*
 	 * make sure we dirty pages we wrote to
 	 */
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		if (bio_data_dir(bio) == READ)
 			set_page_dirty_lock(bvec->bv_page);
@@ -1475,8 +1488,9 @@ static void bio_copy_kern_endio_read(struct bio *bio)
 	char *p = bio->bi_private;
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
 		p += bvec->bv_len;
 	}
@@ -1585,8 +1599,9 @@ void bio_set_pages_dirty(struct bio *bio)
{ {
struct bio_vec *bvec; struct bio_vec *bvec;
int i; int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i) { bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (!PageCompound(bvec->bv_page)) if (!PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page); set_page_dirty_lock(bvec->bv_page);
} }
@ -1596,8 +1611,9 @@ static void bio_release_pages(struct bio *bio)
{ {
struct bio_vec *bvec; struct bio_vec *bvec;
int i; int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i) bio_for_each_segment_all(bvec, bio, i, iter_all)
put_page(bvec->bv_page); put_page(bvec->bv_page);
} }
@ -1644,8 +1660,9 @@ void bio_check_pages_dirty(struct bio *bio)
struct bio_vec *bvec; struct bio_vec *bvec;
unsigned long flags; unsigned long flags;
int i; int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i) { bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page)) if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
goto defer; goto defer;
} }
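For reference, the reworked __bio_try_merge_page() above now decides mergeability purely from physical addresses: the incoming data must begin exactly one byte past the end of the last bvec, and with same_page set it must also stay within that bvec's final physical page. A rough userspace sketch of the predicate (the 4 KiB page size is an assumed stand-in for the kernel's PAGE_SIZE, and physical addresses are passed in directly instead of via page_to_phys()):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SKETCH_PAGE_SIZE 4096ULL
    #define SKETCH_PAGE_MASK (~(SKETCH_PAGE_SIZE - 1))

    /*
     * vec_end_addr: physical address of the last byte already in the bvec
     * page_addr:    physical address of the page being appended
     * off:          offset of the new data within that page
     */
    static bool can_merge(uint64_t vec_end_addr, uint64_t page_addr,
                          uint64_t off, bool same_page)
    {
        if (vec_end_addr + 1 != page_addr + off)
            return false;                     /* not physically contiguous */
        if (same_page && (vec_end_addr & SKETCH_PAGE_MASK) != page_addr)
            return false;                     /* new data is in another page */
        return true;
    }

    int main(void)
    {
        /* bvec currently ends at 0x1fff; new data starts at 0x2000 */
        printf("%d\n", can_merge(0x1fff, 0x2000, 0, false));  /* prints 1 */
        printf("%d\n", can_merge(0x1fff, 0x2000, 0, true));   /* prints 0 */
        return 0;
    }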


@ -1269,7 +1269,7 @@ void blkcg_drain_queue(struct request_queue *q)
* blkcg_exit_queue - exit and release blkcg part of request_queue * blkcg_exit_queue - exit and release blkcg part of request_queue
* @q: request_queue being released * @q: request_queue being released
* *
* Called from blk_release_queue(). Responsible for exiting blkcg part. * Called from blk_exit_queue(). Responsible for exiting blkcg part.
*/ */
void blkcg_exit_queue(struct request_queue *q) void blkcg_exit_queue(struct request_queue *q)
{ {


@ -161,6 +161,73 @@ static inline unsigned get_max_io_size(struct request_queue *q,
return sectors; return sectors;
} }
static unsigned get_max_segment_size(struct request_queue *q,
unsigned offset)
{
unsigned long mask = queue_segment_boundary(q);
/* default segment boundary mask means no boundary limit */
if (mask == BLK_SEG_BOUNDARY_MASK)
return queue_max_segment_size(q);
return min_t(unsigned long, mask - (mask & offset) + 1,
queue_max_segment_size(q));
}
/*
* Split the bvec @bv into segments, and update all kinds of
* variables.
*/
static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
unsigned *nsegs, unsigned *last_seg_size,
unsigned *front_seg_size, unsigned *sectors)
{
unsigned len = bv->bv_len;
unsigned total_len = 0;
unsigned new_nsegs = 0, seg_size = 0;
/*
* Multi-page bvec may be too big to hold in one segment, so the
* current bvec has to be split into multiple segments.
*/
while (len && new_nsegs + *nsegs < queue_max_segments(q)) {
seg_size = get_max_segment_size(q, bv->bv_offset + total_len);
seg_size = min(seg_size, len);
new_nsegs++;
total_len += seg_size;
len -= seg_size;
if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
break;
}
if (!new_nsegs)
return !!len;
/* update front segment size */
if (!*nsegs) {
unsigned first_seg_size;
if (new_nsegs == 1)
first_seg_size = get_max_segment_size(q, bv->bv_offset);
else
first_seg_size = queue_max_segment_size(q);
if (*front_seg_size < first_seg_size)
*front_seg_size = first_seg_size;
}
/* update other variables */
*last_seg_size = seg_size;
*nsegs += new_nsegs;
if (sectors)
*sectors += total_len >> 9;
/* split in the middle of the bvec if len != 0 */
return !!len;
}
static struct bio *blk_bio_segment_split(struct request_queue *q, static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *bio, struct bio *bio,
struct bio_set *bs, struct bio_set *bs,
@ -174,7 +241,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *new = NULL; struct bio *new = NULL;
const unsigned max_sectors = get_max_io_size(q, bio); const unsigned max_sectors = get_max_io_size(q, bio);
bio_for_each_segment(bv, bio, iter) { bio_for_each_bvec(bv, bio, iter) {
/* /*
* If the queue doesn't support SG gaps and adding this * If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it. * offset would create a gap, disallow it.
@ -189,8 +256,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
*/ */
if (nsegs < queue_max_segments(q) && if (nsegs < queue_max_segments(q) &&
sectors < max_sectors) { sectors < max_sectors) {
nsegs++; /* split in the middle of bvec */
sectors = max_sectors; bv.bv_len = (max_sectors - sectors) << 9;
bvec_split_segs(q, &bv, &nsegs,
&seg_size,
&front_seg_size,
&sectors);
} }
goto split; goto split;
} }
@ -206,21 +277,28 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
bvprvp = &bvprv; bvprvp = &bvprv;
sectors += bv.bv_len >> 9; sectors += bv.bv_len >> 9;
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
continue; continue;
} }
new_segment: new_segment:
if (nsegs == queue_max_segments(q)) if (nsegs == queue_max_segments(q))
goto split; goto split;
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
nsegs++;
bvprv = bv; bvprv = bv;
bvprvp = &bvprv; bvprvp = &bvprv;
seg_size = bv.bv_len;
sectors += bv.bv_len >> 9;
if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
nsegs++;
seg_size = bv.bv_len;
sectors += bv.bv_len >> 9;
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
} else if (bvec_split_segs(q, &bv, &nsegs, &seg_size,
&front_seg_size, &sectors)) {
goto split;
}
} }
do_split = false; do_split = false;
@ -233,8 +311,6 @@ split:
bio = new; bio = new;
} }
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
bio->bi_seg_front_size = front_seg_size; bio->bi_seg_front_size = front_seg_size;
if (seg_size > bio->bi_seg_back_size) if (seg_size > bio->bi_seg_back_size)
bio->bi_seg_back_size = seg_size; bio->bi_seg_back_size = seg_size;
@ -291,18 +367,20 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
EXPORT_SYMBOL(blk_queue_split); EXPORT_SYMBOL(blk_queue_split);
static unsigned int __blk_recalc_rq_segments(struct request_queue *q, static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
struct bio *bio, struct bio *bio)
bool no_sg_merge)
{ {
struct bio_vec bv, bvprv = { NULL }; struct bio_vec bv, bvprv = { NULL };
int prev = 0; int prev = 0;
unsigned int seg_size, nr_phys_segs; unsigned int seg_size, nr_phys_segs;
unsigned front_seg_size;
struct bio *fbio, *bbio; struct bio *fbio, *bbio;
struct bvec_iter iter; struct bvec_iter iter;
if (!bio) if (!bio)
return 0; return 0;
front_seg_size = bio->bi_seg_front_size;
switch (bio_op(bio)) { switch (bio_op(bio)) {
case REQ_OP_DISCARD: case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE: case REQ_OP_SECURE_ERASE:
@ -316,14 +394,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
seg_size = 0; seg_size = 0;
nr_phys_segs = 0; nr_phys_segs = 0;
for_each_bio(bio) { for_each_bio(bio) {
bio_for_each_segment(bv, bio, iter) { bio_for_each_bvec(bv, bio, iter) {
/*
* If SG merging is disabled, each bio vector is
* a segment
*/
if (no_sg_merge)
goto new_segment;
if (prev) { if (prev) {
if (seg_size + bv.bv_len if (seg_size + bv.bv_len
> queue_max_segment_size(q)) > queue_max_segment_size(q))
@ -333,23 +404,23 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
seg_size += bv.bv_len; seg_size += bv.bv_len;
bvprv = bv; bvprv = bv;
if (nr_phys_segs == 1 && seg_size >
front_seg_size)
front_seg_size = seg_size;
continue; continue;
} }
new_segment: new_segment:
if (nr_phys_segs == 1 && seg_size >
fbio->bi_seg_front_size)
fbio->bi_seg_front_size = seg_size;
nr_phys_segs++;
bvprv = bv; bvprv = bv;
prev = 1; prev = 1;
seg_size = bv.bv_len; bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size,
&front_seg_size, NULL);
} }
bbio = bio; bbio = bio;
} }
if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size) fbio->bi_seg_front_size = front_seg_size;
fbio->bi_seg_front_size = seg_size;
if (seg_size > bbio->bi_seg_back_size) if (seg_size > bbio->bi_seg_back_size)
bbio->bi_seg_back_size = seg_size; bbio->bi_seg_back_size = seg_size;
@ -358,33 +429,16 @@ new_segment:
void blk_recalc_rq_segments(struct request *rq) void blk_recalc_rq_segments(struct request *rq)
{ {
bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE, rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
&rq->q->queue_flags);
rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio,
no_sg_merge);
} }
void blk_recount_segments(struct request_queue *q, struct bio *bio) void blk_recount_segments(struct request_queue *q, struct bio *bio)
{ {
unsigned short seg_cnt; struct bio *nxt = bio->bi_next;
/* estimate segment number by bi_vcnt for non-cloned bio */ bio->bi_next = NULL;
if (bio_flagged(bio, BIO_CLONED)) bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
seg_cnt = bio_segments(bio); bio->bi_next = nxt;
else
seg_cnt = bio->bi_vcnt;
if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
(seg_cnt < queue_max_segments(q)))
bio->bi_phys_segments = seg_cnt;
else {
struct bio *nxt = bio->bi_next;
bio->bi_next = NULL;
bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
bio->bi_next = nxt;
}
bio_set_flag(bio, BIO_SEG_VALID); bio_set_flag(bio, BIO_SEG_VALID);
} }
@ -407,6 +461,54 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
return biovec_phys_mergeable(q, &end_bv, &nxt_bv); return biovec_phys_mergeable(q, &end_bv, &nxt_bv);
} }
static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
struct scatterlist *sglist)
{
if (!*sg)
return sglist;
/*
* If the driver previously mapped a shorter list, we could see a
* termination bit prematurely unless it fully inits the sg table
* on each mapping. We KNOW that there must be more entries here
* or the driver would be buggy, so force clear the termination bit
* to avoid doing a full sg_init_table() in drivers for each command.
*/
sg_unmark_end(*sg);
return sg_next(*sg);
}
static unsigned blk_bvec_map_sg(struct request_queue *q,
struct bio_vec *bvec, struct scatterlist *sglist,
struct scatterlist **sg)
{
unsigned nbytes = bvec->bv_len;
unsigned nsegs = 0, total = 0, offset = 0;
while (nbytes > 0) {
unsigned seg_size;
struct page *pg;
unsigned idx;
*sg = blk_next_sg(sg, sglist);
seg_size = get_max_segment_size(q, bvec->bv_offset + total);
seg_size = min(nbytes, seg_size);
offset = (total + bvec->bv_offset) % PAGE_SIZE;
idx = (total + bvec->bv_offset) / PAGE_SIZE;
pg = bvec_nth_page(bvec->bv_page, idx);
sg_set_page(*sg, pg, seg_size, offset);
total += seg_size;
nbytes -= seg_size;
nsegs++;
}
return nsegs;
}
static inline void static inline void
__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
struct scatterlist *sglist, struct bio_vec *bvprv, struct scatterlist *sglist, struct bio_vec *bvprv,
@ -424,25 +526,12 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
(*sg)->length += nbytes; (*sg)->length += nbytes;
} else { } else {
new_segment: new_segment:
if (!*sg) if (bvec->bv_offset + bvec->bv_len <= PAGE_SIZE) {
*sg = sglist; *sg = blk_next_sg(sg, sglist);
else { sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
/* (*nsegs) += 1;
* If the driver previously mapped a shorter } else
* list, we could see a termination bit (*nsegs) += blk_bvec_map_sg(q, bvec, sglist, sg);
* prematurely unless it fully inits the sg
* table on each mapping. We KNOW that there
* must be more entries here or the driver
* would be buggy, so force clear the
* termination bit to avoid doing a full
* sg_init_table() in drivers for each command.
*/
sg_unmark_end(*sg);
*sg = sg_next(*sg);
}
sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
(*nsegs)++;
} }
*bvprv = *bvec; *bvprv = *bvec;
} }
@ -464,7 +553,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
int nsegs = 0; int nsegs = 0;
for_each_bio(bio) for_each_bio(bio)
bio_for_each_segment(bvec, bio, iter) bio_for_each_bvec(bvec, bio, iter)
__blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg, __blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
&nsegs); &nsegs);
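The bvec_split_segs() and get_max_segment_size() helpers added earlier in this file carve a single multi-page bvec into hardware segments bounded by both the queue's maximum segment size and its segment boundary mask. The core arithmetic can be sketched standalone; the 64 KiB size limit and 4 KiB boundary below are invented example limits, not values read from any real queue:

    #include <stdio.h>

    #define MAX_SEG_SIZE (64 * 1024U)   /* example queue_max_segment_size() */
    #define SEG_BOUNDARY (4096U - 1)    /* example queue_segment_boundary() mask */

    /* bytes allowed in one segment when it starts at this offset */
    static unsigned max_seg(unsigned offset)
    {
        unsigned to_boundary = SEG_BOUNDARY - (SEG_BOUNDARY & offset) + 1;

        return to_boundary < MAX_SEG_SIZE ? to_boundary : MAX_SEG_SIZE;
    }

    int main(void)
    {
        unsigned off = 512, len = 20000, total = 0, nsegs = 0;

        while (len) {
            unsigned seg = max_seg(off + total);

            if (seg > len)
                seg = len;
            printf("segment %u: offset %u, length %u\n",
                   nsegs++, off + total, seg);
            total += seg;
            len -= seg;
        }
        return 0;
    }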


@ -128,11 +128,9 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(DEAD), QUEUE_FLAG_NAME(DEAD),
QUEUE_FLAG_NAME(INIT_DONE), QUEUE_FLAG_NAME(INIT_DONE),
QUEUE_FLAG_NAME(NO_SG_MERGE),
QUEUE_FLAG_NAME(POLL), QUEUE_FLAG_NAME(POLL),
QUEUE_FLAG_NAME(WC), QUEUE_FLAG_NAME(WC),
QUEUE_FLAG_NAME(FUA), QUEUE_FLAG_NAME(FUA),
QUEUE_FLAG_NAME(FLUSH_NQ),
QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(DAX),
QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(STATS),
QUEUE_FLAG_NAME(POLL_STATS), QUEUE_FLAG_NAME(POLL_STATS),
@ -251,7 +249,6 @@ static const char *const alloc_policy_name[] = {
static const char *const hctx_flag_name[] = { static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(SHOULD_MERGE), HCTX_FLAG_NAME(SHOULD_MERGE),
HCTX_FLAG_NAME(TAG_SHARED), HCTX_FLAG_NAME(TAG_SHARED),
HCTX_FLAG_NAME(SG_MERGE),
HCTX_FLAG_NAME(BLOCKING), HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED), HCTX_FLAG_NAME(NO_SCHED),
}; };


@ -321,7 +321,7 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{ {
struct elevator_queue *e = q->elevator; struct elevator_queue *e = q->elevator;
struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu); struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
bool ret = false; bool ret = false;
enum hctx_type type; enum hctx_type type;


@ -170,7 +170,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
data->ctx = blk_mq_get_ctx(data->q); data->ctx = blk_mq_get_ctx(data->q);
data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
data->ctx->cpu); data->ctx);
tags = blk_mq_tags_from_data(data); tags = blk_mq_tags_from_data(data);
if (data->flags & BLK_MQ_REQ_RESERVED) if (data->flags & BLK_MQ_REQ_RESERVED)
bt = &tags->breserved_tags; bt = &tags->breserved_tags;


@ -364,7 +364,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
} }
if (likely(!data->hctx)) if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->hctx = blk_mq_map_queue(q, data->cmd_flags,
data->ctx->cpu); data->ctx);
if (data->cmd_flags & REQ_NOWAIT) if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT; data->flags |= BLK_MQ_REQ_NOWAIT;
@ -2069,7 +2069,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
struct blk_mq_tags *tags; struct blk_mq_tags *tags;
int node; int node;
node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx); node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE) if (node == NUMA_NO_NODE)
node = set->numa_node; node = set->numa_node;
@ -2125,7 +2125,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
size_t rq_size, left; size_t rq_size, left;
int node; int node;
node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx); node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE) if (node == NUMA_NO_NODE)
node = set->numa_node; node = set->numa_node;
@ -2424,7 +2424,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
* If the cpu isn't present, the cpu is mapped to first hctx. * If the cpu isn't present, the cpu is mapped to first hctx.
*/ */
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
hctx_idx = set->map[0].mq_map[i]; hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */ /* unmapped hw queue can be remapped after CPU topo changed */
if (!set->tags[hctx_idx] && if (!set->tags[hctx_idx] &&
!__blk_mq_alloc_rq_map(set, hctx_idx)) { !__blk_mq_alloc_rq_map(set, hctx_idx)) {
@ -2434,16 +2434,19 @@ static void blk_mq_map_swqueue(struct request_queue *q)
* case, remap the current ctx to hctx[0] which * case, remap the current ctx to hctx[0] which
* is guaranteed to always have tags allocated * is guaranteed to always have tags allocated
*/ */
set->map[0].mq_map[i] = 0; set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0;
} }
ctx = per_cpu_ptr(q->queue_ctx, i); ctx = per_cpu_ptr(q->queue_ctx, i);
for (j = 0; j < set->nr_maps; j++) { for (j = 0; j < set->nr_maps; j++) {
if (!set->map[j].nr_queues) if (!set->map[j].nr_queues) {
ctx->hctxs[j] = blk_mq_map_queue_type(q,
HCTX_TYPE_DEFAULT, i);
continue; continue;
}
hctx = blk_mq_map_queue_type(q, j, i); hctx = blk_mq_map_queue_type(q, j, i);
ctx->hctxs[j] = hctx;
/* /*
* If the CPU is already set in the mask, then we've * If the CPU is already set in the mask, then we've
* mapped this one already. This can happen if * mapped this one already. This can happen if
@ -2463,6 +2466,10 @@ static void blk_mq_map_swqueue(struct request_queue *q)
*/ */
BUG_ON(!hctx->nr_ctx); BUG_ON(!hctx->nr_ctx);
} }
for (; j < HCTX_MAX_TYPES; j++)
ctx->hctxs[j] = blk_mq_map_queue_type(q,
HCTX_TYPE_DEFAULT, i);
} }
mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_lock);
@ -2734,7 +2741,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
int node; int node;
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
node = blk_mq_hw_queue_to_node(&set->map[0], i); node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
/* /*
* If the hw queue has been mapped to another numa node, * If the hw queue has been mapped to another numa node,
* we need to realloc the hctx. If allocation fails, fallback * we need to realloc the hctx. If allocation fails, fallback
@ -2838,9 +2845,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
set->map[HCTX_TYPE_POLL].nr_queues) set->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, q); blk_queue_flag_set(QUEUE_FLAG_POLL, q);
if (!(set->flags & BLK_MQ_F_SG_MERGE))
blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
q->sg_reserved_size = INT_MAX; q->sg_reserved_size = INT_MAX;
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
@ -2968,7 +2972,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
return set->ops->map_queues(set); return set->ops->map_queues(set);
} else { } else {
BUG_ON(set->nr_maps > 1); BUG_ON(set->nr_maps > 1);
return blk_mq_map_queues(&set->map[0]); return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
} }
} }
@ -3090,6 +3094,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
if (!set) if (!set)
return -EINVAL; return -EINVAL;
if (q->nr_requests == nr)
return 0;
blk_mq_freeze_queue(q); blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q); blk_mq_quiesce_queue(q);
@ -3235,7 +3242,7 @@ fallback:
pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
nr_hw_queues, prev_nr_hw_queues); nr_hw_queues, prev_nr_hw_queues);
set->nr_hw_queues = prev_nr_hw_queues; set->nr_hw_queues = prev_nr_hw_queues;
blk_mq_map_queues(&set->map[0]); blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
goto fallback; goto fallback;
} }
blk_mq_map_swqueue(q); blk_mq_map_swqueue(q);


@ -23,6 +23,7 @@ struct blk_mq_ctx {
unsigned int cpu; unsigned int cpu;
unsigned short index_hw[HCTX_MAX_TYPES]; unsigned short index_hw[HCTX_MAX_TYPES];
struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES];
/* incremented at dispatch time */ /* incremented at dispatch time */
unsigned long rq_dispatched[2]; unsigned long rq_dispatched[2];
@ -96,26 +97,23 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
* blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
* @q: request queue * @q: request queue
* @flags: request command flags * @flags: request command flags
* @cpu: CPU * @cpu: cpu ctx
*/ */
static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
unsigned int flags, unsigned int flags,
unsigned int cpu) struct blk_mq_ctx *ctx)
{ {
enum hctx_type type = HCTX_TYPE_DEFAULT; enum hctx_type type = HCTX_TYPE_DEFAULT;
if ((flags & REQ_HIPRI) && /*
q->tag_set->nr_maps > HCTX_TYPE_POLL && * The caller ensures that poll is enabled if REQ_HIPRI is set.
q->tag_set->map[HCTX_TYPE_POLL].nr_queues && */
test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) if (flags & REQ_HIPRI)
type = HCTX_TYPE_POLL; type = HCTX_TYPE_POLL;
else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
else if (((flags & REQ_OP_MASK) == REQ_OP_READ) &&
q->tag_set->nr_maps > HCTX_TYPE_READ &&
q->tag_set->map[HCTX_TYPE_READ].nr_queues)
type = HCTX_TYPE_READ; type = HCTX_TYPE_READ;
return blk_mq_map_queue_type(q, type, cpu); return ctx->hctxs[type];
} }
/* /*
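With the blk-mq.h change above, blk_mq_map_queue() stops recomputing the mapping on every call and simply indexes the hctxs[] array that blk_mq_map_swqueue() now caches on each software context. Stripped of the kernel types, the pattern is a precomputed per-context lookup table; a hypothetical miniature (the modulo policy in map_swqueue() is a stand-in, not the kernel's CPU-to-queue mapping):

    #include <stdio.h>

    enum hctx_type { TYPE_DEFAULT, TYPE_READ, TYPE_POLL, TYPE_MAX };

    struct ctx {
        int hctxs[TYPE_MAX];    /* cached mapping, filled at setup time */
    };

    /* slow path: run once per context whenever the queue map changes */
    static void map_swqueue(struct ctx *ctx, int cpu, int nr_maps)
    {
        for (int t = 0; t < TYPE_MAX; t++)
            ctx->hctxs[t] = t < nr_maps ? cpu % (t + 1)   /* stand-in policy */
                                        : ctx->hctxs[TYPE_DEFAULT];
    }

    /* fast path: roughly what blk_mq_map_queue() now amounts to */
    static int map_queue(const struct ctx *ctx, enum hctx_type type)
    {
        return ctx->hctxs[type];
    }

    int main(void)
    {
        struct ctx c = { .hctxs = { 0 } };

        map_swqueue(&c, 3, 2);
        printf("default=%d read=%d poll=%d\n",
               map_queue(&c, TYPE_DEFAULT), map_queue(&c, TYPE_READ),
               map_queue(&c, TYPE_POLL));
        return 0;
    }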


@ -799,15 +799,6 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
} }
EXPORT_SYMBOL(blk_queue_update_dma_alignment); EXPORT_SYMBOL(blk_queue_update_dma_alignment);
void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
{
if (queueable)
blk_queue_flag_clear(QUEUE_FLAG_FLUSH_NQ, q);
else
blk_queue_flag_set(QUEUE_FLAG_FLUSH_NQ, q);
}
EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
/** /**
* blk_set_queue_depth - tell the block layer about the device queue depth * blk_set_queue_depth - tell the block layer about the device queue depth
* @q: the request queue for the device * @q: the request queue for the device


@ -468,6 +468,9 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
else if (val >= 0) else if (val >= 0)
val *= 1000ULL; val *= 1000ULL;
if (wbt_get_min_lat(q) == val)
return count;
/* /*
* Ensure that the queue is idled, in case the latency update * Ensure that the queue is idled, in case the latency update
* ends up either enabling or disabling wbt completely. We can't * ends up either enabling or disabling wbt completely. We can't
@ -817,21 +820,16 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
} }
/** /**
* __blk_release_queue - release a request queue when it is no longer needed * __blk_release_queue - release a request queue
* @work: pointer to the release_work member of the request queue to be released * @work: pointer to the release_work member of the request queue to be released
* *
* Description: * Description:
* blk_release_queue is the counterpart of blk_init_queue(). It should be * This function is called when a block device is being unregistered. The
* called when a request queue is being released; typically when a block * process of releasing a request queue starts with blk_cleanup_queue, which
* device is being de-registered. Its primary task it to free the queue * set the appropriate flags and then calls blk_put_queue, that decrements
* itself. * the reference counter of the request queue. Once the reference counter
* * of the request queue reaches zero, blk_release_queue is called to release
* Notes: * all allocated resources of the request queue.
* The low level driver must have finished any outstanding requests first
* via blk_cleanup_queue().
*
* Although blk_release_queue() may be called with preemption disabled,
* __blk_release_queue() may sleep.
*/ */
static void __blk_release_queue(struct work_struct *work) static void __blk_release_queue(struct work_struct *work)
{ {


@ -38,7 +38,7 @@ extern struct ida blk_queue_ida;
static inline struct blk_flush_queue * static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
{ {
return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx->cpu)->fq; return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
} }
static inline void __blk_get_queue(struct request_queue *q) static inline void __blk_get_queue(struct request_queue *q)


@ -165,11 +165,12 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
struct bio_vec *bvec, orig_vec; struct bio_vec *bvec, orig_vec;
int i; int i;
struct bvec_iter orig_iter = bio_orig->bi_iter; struct bvec_iter orig_iter = bio_orig->bi_iter;
struct bvec_iter_all iter_all;
/* /*
* free up bounce indirect pages used * free up bounce indirect pages used
*/ */
bio_for_each_segment_all(bvec, bio, i) { bio_for_each_segment_all(bvec, bio, i, iter_all) {
orig_vec = bio_iter_iovec(bio_orig, orig_iter); orig_vec = bio_iter_iovec(bio_orig, orig_iter);
if (bvec->bv_page != orig_vec.bv_page) { if (bvec->bv_page != orig_vec.bv_page) {
dec_zone_page_state(bvec->bv_page, NR_BOUNCE); dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
@ -313,7 +314,12 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL : bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
&bounce_bio_set); &bounce_bio_set);
bio_for_each_segment_all(to, bio, i) { /*
* Bvec table can't be updated by bio_for_each_segment_all(),
* so retrieve bvec from the table directly. This way is safe
* because the 'bio' is single-page bvec.
*/
for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) {
struct page *page = to->bv_page; struct page *page = to->bv_page;
if (page_to_pfn(page) <= q->limits.bounce_pfn) if (page_to_pfn(page) <= q->limits.bounce_pfn)


@ -667,8 +667,11 @@ static int __elevator_change(struct request_queue *q, const char *name)
/* /*
* Special case for mq, turn off scheduling * Special case for mq, turn off scheduling
*/ */
if (!strncmp(name, "none", 4)) if (!strncmp(name, "none", 4)) {
if (!q->elevator)
return 0;
return elevator_switch(q, NULL); return elevator_switch(q, NULL);
}
strlcpy(elevator_name, name, sizeof(elevator_name)); strlcpy(elevator_name, name, sizeof(elevator_name));
e = elevator_get(q, strstrip(elevator_name), true); e = elevator_get(q, strstrip(elevator_name), true);


@ -365,8 +365,8 @@ int register_blkdev(unsigned int major, const char *name)
} }
if (index == 0) { if (index == 0) {
printk("register_blkdev: failed to get major for %s\n", printk("%s: failed to get major for %s\n",
name); __func__, name);
ret = -EBUSY; ret = -EBUSY;
goto out; goto out;
} }
@ -375,8 +375,8 @@ int register_blkdev(unsigned int major, const char *name)
} }
if (major >= BLKDEV_MAJOR_MAX) { if (major >= BLKDEV_MAJOR_MAX) {
pr_err("register_blkdev: major requested (%u) is greater than the maximum (%u) for %s\n", pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
major, BLKDEV_MAJOR_MAX-1, name); __func__, major, BLKDEV_MAJOR_MAX-1, name);
ret = -EINVAL; ret = -EINVAL;
goto out; goto out;
@ -655,10 +655,12 @@ exit:
kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
disk_part_iter_exit(&piter); disk_part_iter_exit(&piter);
err = sysfs_create_link(&ddev->kobj, if (disk->queue->backing_dev_info->dev) {
&disk->queue->backing_dev_info->dev->kobj, err = sysfs_create_link(&ddev->kobj,
"bdi"); &disk->queue->backing_dev_info->dev->kobj,
WARN_ON(err); "bdi");
WARN_ON(err);
}
} }
/** /**


@ -1318,8 +1318,6 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
scsi_change_queue_depth(sdev, depth); scsi_change_queue_depth(sdev, depth);
} }
blk_queue_flush_queueable(q, false);
if (dev->flags & ATA_DFLAG_TRUSTED) if (dev->flags & ATA_DFLAG_TRUSTED)
sdev->security_supported = 1; sdev->security_supported = 1;


@ -2230,7 +2230,6 @@ static void floppy_end_request(struct request *req, blk_status_t error)
static void request_done(int uptodate) static void request_done(int uptodate)
{ {
struct request *req = current_req; struct request *req = current_req;
struct request_queue *q;
int block; int block;
char msg[sizeof("request done ") + sizeof(int) * 3]; char msg[sizeof("request done ") + sizeof(int) * 3];
@ -2243,8 +2242,6 @@ static void request_done(int uptodate)
return; return;
} }
q = req->q;
if (uptodate) { if (uptodate) {
/* maintain values for invalidation on geometry /* maintain values for invalidation on geometry
* change */ * change */


@ -511,21 +511,22 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
loff_t pos, bool rw) loff_t pos, bool rw)
{ {
struct iov_iter iter; struct iov_iter iter;
struct req_iterator rq_iter;
struct bio_vec *bvec; struct bio_vec *bvec;
struct request *rq = blk_mq_rq_from_pdu(cmd); struct request *rq = blk_mq_rq_from_pdu(cmd);
struct bio *bio = rq->bio; struct bio *bio = rq->bio;
struct file *file = lo->lo_backing_file; struct file *file = lo->lo_backing_file;
struct bio_vec tmp;
unsigned int offset; unsigned int offset;
int segments = 0; int nr_bvec = 0;
int ret; int ret;
if (rq->bio != rq->biotail) { rq_for_each_bvec(tmp, rq, rq_iter)
struct req_iterator iter; nr_bvec++;
struct bio_vec tmp;
__rq_for_each_bio(bio, rq) if (rq->bio != rq->biotail) {
segments += bio_segments(bio);
bvec = kmalloc_array(segments, sizeof(struct bio_vec), bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
GFP_NOIO); GFP_NOIO);
if (!bvec) if (!bvec)
return -EIO; return -EIO;
@ -534,10 +535,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
/* /*
* The bios of the request may be started from the middle of * The bios of the request may be started from the middle of
* the 'bvec' because of bio splitting, so we can't directly * the 'bvec' because of bio splitting, so we can't directly
* copy bio->bi_iov_vec to new bvec. The rq_for_each_segment * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
* API will take care of all details for us. * API will take care of all details for us.
*/ */
rq_for_each_segment(tmp, rq, iter) { rq_for_each_bvec(tmp, rq, rq_iter) {
*bvec = tmp; *bvec = tmp;
bvec++; bvec++;
} }
@ -551,11 +552,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
*/ */
offset = bio->bi_iter.bi_bvec_done; offset = bio->bi_iter.bi_bvec_done;
bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
segments = bio_segments(bio);
} }
atomic_set(&cmd->ref, 2); atomic_set(&cmd->ref, 2);
iov_iter_bvec(&iter, rw, bvec, segments, blk_rq_bytes(rq)); iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
iter.iov_offset = offset; iter.iov_offset = offset;
cmd->iocb.ki_pos = pos; cmd->iocb.ki_pos = pos;
@ -1089,16 +1089,12 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
} }
mapping_set_gfp_mask(filp->f_mapping, gfp); mapping_set_gfp_mask(filp->f_mapping, gfp);
lo->lo_state = Lo_unbound;
/* This is safe: open() is still holding a reference. */ /* This is safe: open() is still holding a reference. */
module_put(THIS_MODULE); module_put(THIS_MODULE);
blk_mq_unfreeze_queue(lo->lo_queue); blk_mq_unfreeze_queue(lo->lo_queue);
partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev; partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
lo_number = lo->lo_number; lo_number = lo->lo_number;
lo->lo_flags = 0;
if (!part_shift)
lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
loop_unprepare_queue(lo); loop_unprepare_queue(lo);
out_unlock: out_unlock:
mutex_unlock(&loop_ctl_mutex); mutex_unlock(&loop_ctl_mutex);
@ -1115,11 +1111,29 @@ out_unlock:
err = __blkdev_reread_part(bdev); err = __blkdev_reread_part(bdev);
else else
err = blkdev_reread_part(bdev); err = blkdev_reread_part(bdev);
pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", if (err)
__func__, lo_number, err); pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
__func__, lo_number, err);
/* Device is gone, no point in returning error */ /* Device is gone, no point in returning error */
err = 0; err = 0;
} }
/*
* lo->lo_state is set to Lo_unbound here after above partscan has
* finished.
*
* There cannot be anybody else entering __loop_clr_fd() as
* lo->lo_backing_file is already cleared and Lo_rundown state
* protects us from all the other places trying to change the 'lo'
* device.
*/
mutex_lock(&loop_ctl_mutex);
lo->lo_flags = 0;
if (!part_shift)
lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
lo->lo_state = Lo_unbound;
mutex_unlock(&loop_ctl_mutex);
/* /*
* Need not hold loop_ctl_mutex to fput backing file. * Need not hold loop_ctl_mutex to fput backing file.
* Calling fput holding loop_ctl_mutex triggers a circular * Calling fput holding loop_ctl_mutex triggers a circular
@ -1937,7 +1951,7 @@ static int loop_add(struct loop_device **l, int i)
lo->tag_set.queue_depth = 128; lo->tag_set.queue_depth = 128;
lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.numa_node = NUMA_NO_NODE;
lo->tag_set.cmd_size = sizeof(struct loop_cmd); lo->tag_set.cmd_size = sizeof(struct loop_cmd);
lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
lo->tag_set.driver_data = lo; lo->tag_set.driver_data = lo;
err = blk_mq_alloc_tag_set(&lo->tag_set); err = blk_mq_alloc_tag_set(&lo->tag_set);


@ -1416,7 +1416,7 @@ static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba,
WARN_ON(sizeof(struct mtip_trim) > ATA_SECT_SIZE); WARN_ON(sizeof(struct mtip_trim) > ATA_SECT_SIZE);
/* Allocate a DMA buffer for the trim structure */ /* Allocate a DMA buffer for the trim structure */
buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr, buf = dma_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
GFP_KERNEL); GFP_KERNEL);
if (!buf) if (!buf)
return BLK_STS_RESOURCE; return BLK_STS_RESOURCE;
@ -1453,7 +1453,7 @@ static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba,
MTIP_TRIM_TIMEOUT_MS) < 0) MTIP_TRIM_TIMEOUT_MS) < 0)
ret = BLK_STS_IOERR; ret = BLK_STS_IOERR;
dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr); dma_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
return ret; return ret;
} }
@ -1656,7 +1656,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
if (!user_buffer) if (!user_buffer)
return -EFAULT; return -EFAULT;
buf = dmam_alloc_coherent(&port->dd->pdev->dev, buf = dma_alloc_coherent(&port->dd->pdev->dev,
ATA_SECT_SIZE * xfer_sz, ATA_SECT_SIZE * xfer_sz,
&dma_addr, &dma_addr,
GFP_KERNEL); GFP_KERNEL);
@ -1734,7 +1734,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
} }
exit_drive_command: exit_drive_command:
if (buf) if (buf)
dmam_free_coherent(&port->dd->pdev->dev, dma_free_coherent(&port->dd->pdev->dev,
ATA_SECT_SIZE * xfer_sz, buf, dma_addr); ATA_SECT_SIZE * xfer_sz, buf, dma_addr);
return rv; return rv;
} }
@ -2838,11 +2838,11 @@ static void mtip_dma_free(struct driver_data *dd)
struct mtip_port *port = dd->port; struct mtip_port *port = dd->port;
if (port->block1) if (port->block1)
dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
port->block1, port->block1_dma); port->block1, port->block1_dma);
if (port->command_list) { if (port->command_list) {
dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, dma_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
port->command_list, port->command_list_dma); port->command_list, port->command_list_dma);
} }
} }
@ -2861,7 +2861,7 @@ static int mtip_dma_alloc(struct driver_data *dd)
/* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */ /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */
port->block1 = port->block1 =
dmam_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, dma_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
&port->block1_dma, GFP_KERNEL); &port->block1_dma, GFP_KERNEL);
if (!port->block1) if (!port->block1)
return -ENOMEM; return -ENOMEM;
@ -2869,10 +2869,10 @@ static int mtip_dma_alloc(struct driver_data *dd)
/* Allocate dma memory for command list */ /* Allocate dma memory for command list */
port->command_list = port->command_list =
dmam_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, dma_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
&port->command_list_dma, GFP_KERNEL); &port->command_list_dma, GFP_KERNEL);
if (!port->command_list) { if (!port->command_list) {
dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
port->block1, port->block1_dma); port->block1, port->block1_dma);
port->block1 = NULL; port->block1 = NULL;
port->block1_dma = 0; port->block1_dma = 0;
@ -3057,13 +3057,8 @@ static int mtip_hw_init(struct driver_data *dd)
mtip_start_port(dd->port); mtip_start_port(dd->port);
/* Setup the ISR and enable interrupts. */ /* Setup the ISR and enable interrupts. */
rv = devm_request_irq(&dd->pdev->dev, rv = request_irq(dd->pdev->irq, mtip_irq_handler, IRQF_SHARED,
dd->pdev->irq, dev_driver_string(&dd->pdev->dev), dd);
mtip_irq_handler,
IRQF_SHARED,
dev_driver_string(&dd->pdev->dev),
dd);
if (rv) { if (rv) {
dev_err(&dd->pdev->dev, dev_err(&dd->pdev->dev,
"Unable to allocate IRQ %d\n", dd->pdev->irq); "Unable to allocate IRQ %d\n", dd->pdev->irq);
@ -3091,7 +3086,7 @@ out3:
/* Release the IRQ. */ /* Release the IRQ. */
irq_set_affinity_hint(dd->pdev->irq, NULL); irq_set_affinity_hint(dd->pdev->irq, NULL);
devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); free_irq(dd->pdev->irq, dd);
out2: out2:
mtip_deinit_port(dd->port); mtip_deinit_port(dd->port);
@ -3146,7 +3141,7 @@ static int mtip_hw_exit(struct driver_data *dd)
/* Release the IRQ. */ /* Release the IRQ. */
irq_set_affinity_hint(dd->pdev->irq, NULL); irq_set_affinity_hint(dd->pdev->irq, NULL);
devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); free_irq(dd->pdev->irq, dd);
msleep(1000); msleep(1000);
/* Free dma regions */ /* Free dma regions */
@ -3610,8 +3605,8 @@ static void mtip_free_cmd(struct blk_mq_tag_set *set, struct request *rq,
if (!cmd->command) if (!cmd->command)
return; return;
dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, dma_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, cmd->command,
cmd->command, cmd->command_dma); cmd->command_dma);
} }
static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq, static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq,
@ -3620,7 +3615,7 @@ static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq,
struct driver_data *dd = set->driver_data; struct driver_data *dd = set->driver_data;
struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, cmd->command = dma_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
&cmd->command_dma, GFP_KERNEL); &cmd->command_dma, GFP_KERNEL);
if (!cmd->command) if (!cmd->command)
return -ENOMEM; return -ENOMEM;


@ -1571,7 +1571,7 @@ static int nbd_dev_add(int index)
nbd->tag_set.numa_node = NUMA_NO_NODE; nbd->tag_set.numa_node = NUMA_NO_NODE;
nbd->tag_set.cmd_size = sizeof(struct nbd_cmd); nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING; BLK_MQ_F_BLOCKING;
nbd->tag_set.driver_data = nbd; nbd->tag_set.driver_data = nbd;
err = blk_mq_alloc_tag_set(&nbd->tag_set); err = blk_mq_alloc_tag_set(&nbd->tag_set);
@ -2118,8 +2118,7 @@ static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
} }
nla_nest_end(reply, dev_list); nla_nest_end(reply, dev_list);
genlmsg_end(reply, reply_head); genlmsg_end(reply, reply_head);
genlmsg_reply(reply, info); ret = genlmsg_reply(reply, info);
ret = 0;
out: out:
mutex_unlock(&nbd_index_mutex); mutex_unlock(&nbd_index_mutex);
return ret; return ret;


@ -1104,7 +1104,7 @@ static int null_handle_bio(struct nullb_cmd *cmd)
len = bvec.bv_len; len = bvec.bv_len;
err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
op_is_write(bio_op(bio)), sector, op_is_write(bio_op(bio)), sector,
bio_op(bio) & REQ_FUA); bio->bi_opf & REQ_FUA);
if (err) { if (err) {
spin_unlock_irq(&nullb->lock); spin_unlock_irq(&nullb->lock);
return err; return err;
@ -1678,7 +1678,6 @@ static int null_add_dev(struct nullb_device *dev)
if (dev->cache_size > 0) { if (dev->cache_size > 0) {
set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
blk_queue_write_cache(nullb->q, true, true); blk_queue_write_cache(nullb->q, true, true);
blk_queue_flush_queueable(nullb->q, true);
} }
if (dev->zoned) { if (dev->zoned) {


@ -3987,7 +3987,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
rbd_dev->tag_set.ops = &rbd_mq_ops; rbd_dev->tag_set.ops = &rbd_mq_ops;
rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
rbd_dev->tag_set.numa_node = NUMA_NO_NODE; rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
rbd_dev->tag_set.nr_hw_queues = 1; rbd_dev->tag_set.nr_hw_queues = 1;
rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);


@ -2843,7 +2843,6 @@ static int skd_cons_disk(struct skd_device *skdev)
skdev->sgs_per_request * sizeof(struct scatterlist); skdev->sgs_per_request * sizeof(struct scatterlist);
skdev->tag_set.numa_node = NUMA_NO_NODE; skdev->tag_set.numa_node = NUMA_NO_NODE;
skdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | skdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
BLK_MQ_F_SG_MERGE |
BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_FIFO); BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_FIFO);
skdev->tag_set.driver_data = skdev; skdev->tag_set.driver_data = skdev;
rc = blk_mq_alloc_tag_set(&skdev->tag_set); rc = blk_mq_alloc_tag_set(&skdev->tag_set);


@ -977,7 +977,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
} else } else
info->tag_set.queue_depth = BLK_RING_SIZE(info); info->tag_set.queue_depth = BLK_RING_SIZE(info);
info->tag_set.numa_node = NUMA_NO_NODE; info->tag_set.numa_node = NUMA_NO_NODE;
info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
info->tag_set.cmd_size = sizeof(struct blkif_req); info->tag_set.cmd_size = sizeof(struct blkif_req);
info->tag_set.driver_data = info; info->tag_set.driver_data = info;


@ -265,6 +265,7 @@
/* #define ERRLOGMASK (CD_WARNING|CD_OPEN|CD_COUNT_TRACKS|CD_CLOSE) */ /* #define ERRLOGMASK (CD_WARNING|CD_OPEN|CD_COUNT_TRACKS|CD_CLOSE) */
/* #define ERRLOGMASK (CD_WARNING|CD_REG_UNREG|CD_DO_IOCTL|CD_OPEN|CD_CLOSE|CD_COUNT_TRACKS) */ /* #define ERRLOGMASK (CD_WARNING|CD_REG_UNREG|CD_DO_IOCTL|CD_OPEN|CD_CLOSE|CD_COUNT_TRACKS) */
#include <linux/atomic.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/major.h> #include <linux/major.h>
@ -3692,9 +3693,9 @@ static struct ctl_table_header *cdrom_sysctl_header;
static void cdrom_sysctl_register(void) static void cdrom_sysctl_register(void)
{ {
static int initialized; static atomic_t initialized = ATOMIC_INIT(0);
if (initialized == 1) if (!atomic_add_unless(&initialized, 1, 1))
return; return;
cdrom_sysctl_header = register_sysctl_table(cdrom_root_table); cdrom_sysctl_header = register_sysctl_table(cdrom_root_table);
@ -3705,8 +3706,6 @@ static void cdrom_sysctl_register(void)
cdrom_sysctl_settings.debug = debug; cdrom_sysctl_settings.debug = debug;
cdrom_sysctl_settings.lock = lockdoor; cdrom_sysctl_settings.lock = lockdoor;
cdrom_sysctl_settings.check = check_media_type; cdrom_sysctl_settings.check = check_media_type;
initialized = 1;
} }
static void cdrom_sysctl_unregister(void) static void cdrom_sysctl_unregister(void)
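The cdrom_sysctl_register() change above swaps a plain static int flag for an atomic counter so that two racing callers cannot both see it as uninitialized. The same register-once pattern in portable C11, using a compare-and-swap in place of the kernel's atomic_add_unless(), might look like this sketch:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int initialized;   /* static storage: starts at 0 */

    static void register_once(void)
    {
        int expected = 0;

        /* only the thread that flips 0 -> 1 does the registration */
        if (!atomic_compare_exchange_strong(&initialized, &expected, 1))
            return;

        printf("registering sysctl table\n");
    }

    int main(void)
    {
        register_once();
        register_once();   /* second call is a no-op */
        return 0;
    }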


@ -141,7 +141,7 @@ struct nvm_chk_meta *pblk_get_chunk_meta(struct pblk *pblk)
ret = nvm_get_chunk_meta(dev, ppa, geo->all_chunks, meta); ret = nvm_get_chunk_meta(dev, ppa, geo->all_chunks, meta);
if (ret) { if (ret) {
kfree(meta); vfree(meta);
return ERR_PTR(-EIO); return ERR_PTR(-EIO);
} }
@ -1065,7 +1065,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len); bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len);
smeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); smeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC);
memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16); guid_copy((guid_t *)&smeta_buf->header.uuid, &pblk->instance_uuid);
smeta_buf->header.id = cpu_to_le32(line->id); smeta_buf->header.id = cpu_to_le32(line->id);
smeta_buf->header.type = cpu_to_le16(line->type); smeta_buf->header.type = cpu_to_le16(line->type);
smeta_buf->header.version_major = SMETA_VERSION_MAJOR; smeta_buf->header.version_major = SMETA_VERSION_MAJOR;
@ -1278,6 +1278,7 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
spin_unlock(&line->lock); spin_unlock(&line->lock);
kref_init(&line->ref); kref_init(&line->ref);
atomic_set(&line->sec_to_update, 0);
return 0; return 0;
} }
@ -1874,7 +1875,8 @@ void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line)
if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) { if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) {
emeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); emeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC);
memcpy(emeta_buf->header.uuid, pblk->instance_uuid, 16); guid_copy((guid_t *)&emeta_buf->header.uuid,
&pblk->instance_uuid);
emeta_buf->header.id = cpu_to_le32(line->id); emeta_buf->header.id = cpu_to_le32(line->id);
emeta_buf->header.type = cpu_to_le16(line->type); emeta_buf->header.type = cpu_to_le16(line->type);
emeta_buf->header.version_major = EMETA_VERSION_MAJOR; emeta_buf->header.version_major = EMETA_VERSION_MAJOR;


@ -365,16 +365,22 @@ static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk,
struct list_head *group_list) struct list_head *group_list)
{ {
struct pblk_line *line, *victim; struct pblk_line *line, *victim;
int line_vsc, victim_vsc; unsigned int line_vsc = ~0x0L, victim_vsc = ~0x0L;
victim = list_first_entry(group_list, struct pblk_line, list); victim = list_first_entry(group_list, struct pblk_line, list);
list_for_each_entry(line, group_list, list) { list_for_each_entry(line, group_list, list) {
line_vsc = le32_to_cpu(*line->vsc); if (!atomic_read(&line->sec_to_update))
victim_vsc = le32_to_cpu(*victim->vsc); line_vsc = le32_to_cpu(*line->vsc);
if (line_vsc < victim_vsc) if (line_vsc < victim_vsc) {
victim = line; victim = line;
victim_vsc = le32_to_cpu(*victim->vsc);
}
} }
if (victim_vsc == ~0x0)
return NULL;
return victim; return victim;
} }
@ -448,13 +454,13 @@ next_gc_group:
do { do {
spin_lock(&l_mg->gc_lock); spin_lock(&l_mg->gc_lock);
if (list_empty(group_list)) {
line = pblk_gc_get_victim_line(pblk, group_list);
if (!line) {
spin_unlock(&l_mg->gc_lock); spin_unlock(&l_mg->gc_lock);
break; break;
} }
line = pblk_gc_get_victim_line(pblk, group_list);
spin_lock(&line->lock); spin_lock(&line->lock);
WARN_ON(line->state != PBLK_LINESTATE_CLOSED); WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
line->state = PBLK_LINESTATE_GC; line->state = PBLK_LINESTATE_GC;


@ -130,7 +130,7 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
struct pblk_line *line = NULL; struct pblk_line *line = NULL;
if (factory_init) { if (factory_init) {
pblk_setup_uuid(pblk); guid_gen(&pblk->instance_uuid);
} else { } else {
line = pblk_recov_l2p(pblk); line = pblk_recov_l2p(pblk);
if (IS_ERR(line)) { if (IS_ERR(line)) {
@ -584,14 +584,12 @@ static void pblk_lines_free(struct pblk *pblk)
struct pblk_line *line; struct pblk_line *line;
int i; int i;
spin_lock(&l_mg->free_lock);
for (i = 0; i < l_mg->nr_lines; i++) { for (i = 0; i < l_mg->nr_lines; i++) {
line = &pblk->lines[i]; line = &pblk->lines[i];
pblk_line_free(line); pblk_line_free(line);
pblk_line_meta_free(l_mg, line); pblk_line_meta_free(l_mg, line);
} }
spin_unlock(&l_mg->free_lock);
pblk_line_mg_free(pblk); pblk_line_mg_free(pblk);


@ -73,6 +73,7 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
*/ */
if (i < valid_secs) { if (i < valid_secs) {
kref_get(&line->ref); kref_get(&line->ref);
atomic_inc(&line->sec_to_update);
w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i); w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i);
w_ctx->ppa = ppa_list[i]; w_ctx->ppa = ppa_list[i];
meta->lba = cpu_to_le64(w_ctx->lba); meta->lba = cpu_to_le64(w_ctx->lba);


@ -45,10 +45,23 @@ void pblk_rb_free(struct pblk_rb *rb)
/* /*
* pblk_rb_calculate_size -- calculate the size of the write buffer * pblk_rb_calculate_size -- calculate the size of the write buffer
*/ */
static unsigned int pblk_rb_calculate_size(unsigned int nr_entries) static unsigned int pblk_rb_calculate_size(unsigned int nr_entries,
unsigned int threshold)
{ {
/* Alloc a write buffer that can at least fit 128 entries */ unsigned int thr_sz = 1 << (get_count_order(threshold + NVM_MAX_VLBA));
return (1 << max(get_count_order(nr_entries), 7)); unsigned int max_sz = max(thr_sz, nr_entries);
unsigned int max_io;
/* Alloc a write buffer that can (i) fit at least two split bios
* (considering the max I/O size NVM_MAX_VLBA), and (ii) guarantee that the
* threshold will be respected
*/
max_io = (1 << max((int)(get_count_order(max_sz)),
(int)(get_count_order(NVM_MAX_VLBA << 1))));
if ((threshold + NVM_MAX_VLBA) >= max_io)
max_io <<= 1;
return max_io;
} }
/* /*
@ -67,12 +80,12 @@ int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold,
unsigned int alloc_order, order, iter; unsigned int alloc_order, order, iter;
unsigned int nr_entries; unsigned int nr_entries;
nr_entries = pblk_rb_calculate_size(size); nr_entries = pblk_rb_calculate_size(size, threshold);
entries = vzalloc(array_size(nr_entries, sizeof(struct pblk_rb_entry))); entries = vzalloc(array_size(nr_entries, sizeof(struct pblk_rb_entry)));
if (!entries) if (!entries)
return -ENOMEM; return -ENOMEM;
power_size = get_count_order(size); power_size = get_count_order(nr_entries);
power_seg_sz = get_count_order(seg_size); power_seg_sz = get_count_order(seg_size);
down_write(&pblk_rb_lock); down_write(&pblk_rb_lock);
@ -149,7 +162,7 @@ int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold,
* Initialize rate-limiter, which controls access to the write buffer * Initialize rate-limiter, which controls access to the write buffer
* by user and GC I/O * by user and GC I/O
*/ */
pblk_rl_init(&pblk->rl, rb->nr_entries); pblk_rl_init(&pblk->rl, rb->nr_entries, threshold);
return 0; return 0;
} }
@ -247,6 +260,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update)
entry->cacheline); entry->cacheline);
line = pblk_ppa_to_line(pblk, w_ctx->ppa); line = pblk_ppa_to_line(pblk, w_ctx->ppa);
atomic_dec(&line->sec_to_update);
kref_put(&line->ref, pblk_line_put); kref_put(&line->ref, pblk_line_put);
clean_wctx(w_ctx); clean_wctx(w_ctx);
rb->l2p_update = pblk_rb_ptr_wrap(rb, rb->l2p_update, 1); rb->l2p_update = pblk_rb_ptr_wrap(rb, rb->l2p_update, 1);
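The reworked pblk_rb_calculate_size() above drops the fixed 128-entry floor and instead rounds the buffer up to a power of two that holds the flush threshold plus a worst-case I/O, and at least two maximum-size I/Os. A rough standalone model of that rounding, with NVM_MAX_VLBA assumed to be 64 purely for illustration:

    #include <stdio.h>

    #define NVM_MAX_VLBA 64U   /* assumed value for illustration only */

    /* smallest power of two >= v */
    static unsigned roundup_pow2(unsigned v)
    {
        unsigned r = 1;

        while (r < v)
            r <<= 1;
        return r;
    }

    static unsigned rb_size(unsigned nr_entries, unsigned threshold)
    {
        unsigned thr_sz = roundup_pow2(threshold + NVM_MAX_VLBA);
        unsigned max_sz = thr_sz > nr_entries ? thr_sz : nr_entries;
        unsigned max_io = roundup_pow2(max_sz);

        if (max_io < 2 * NVM_MAX_VLBA)          /* fit at least two split bios */
            max_io = roundup_pow2(2 * NVM_MAX_VLBA);
        if (threshold + NVM_MAX_VLBA >= max_io) /* keep the threshold honest */
            max_io <<= 1;
        return max_io;
    }

    int main(void)
    {
        printf("%u\n", rb_size(1000, 256));   /* prints 1024 */
        return 0;
    }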


@ -302,35 +302,55 @@ static int pblk_pad_distance(struct pblk *pblk, struct pblk_line *line)
return (distance > line->left_msecs) ? line->left_msecs : distance; return (distance > line->left_msecs) ? line->left_msecs : distance;
} }
static int pblk_line_wp_is_unbalanced(struct pblk *pblk, /* Return a chunk belonging to a line by stripe(write order) index */
struct pblk_line *line) static struct nvm_chk_meta *pblk_get_stripe_chunk(struct pblk *pblk,
struct pblk_line *line,
int index)
{ {
struct nvm_tgt_dev *dev = pblk->dev; struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo; struct nvm_geo *geo = &dev->geo;
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_lun *rlun; struct pblk_lun *rlun;
struct nvm_chk_meta *chunk;
struct ppa_addr ppa; struct ppa_addr ppa;
u64 line_wp; int pos;
int pos, i;
rlun = &pblk->luns[0]; rlun = &pblk->luns[index];
ppa = rlun->bppa; ppa = rlun->bppa;
pos = pblk_ppa_to_pos(geo, ppa); pos = pblk_ppa_to_pos(geo, ppa);
chunk = &line->chks[pos];
line_wp = chunk->wp; return &line->chks[pos];
}
for (i = 1; i < lm->blk_per_line; i++) { static int pblk_line_wps_are_unbalanced(struct pblk *pblk,
rlun = &pblk->luns[i]; struct pblk_line *line)
ppa = rlun->bppa; {
pos = pblk_ppa_to_pos(geo, ppa); struct pblk_line_meta *lm = &pblk->lm;
chunk = &line->chks[pos]; int blk_in_line = lm->blk_per_line;
struct nvm_chk_meta *chunk;
u64 max_wp, min_wp;
int i;
if (chunk->wp > line_wp) i = find_first_zero_bit(line->blk_bitmap, blk_in_line);
/* If there is one or zero good chunks in the line,
* the write pointers can't be unbalanced.
*/
if (i >= (blk_in_line - 1))
return 0;
chunk = pblk_get_stripe_chunk(pblk, line, i);
max_wp = chunk->wp;
if (max_wp > pblk->max_write_pgs)
min_wp = max_wp - pblk->max_write_pgs;
else
min_wp = 0;
i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1);
while (i < blk_in_line) {
chunk = pblk_get_stripe_chunk(pblk, line, i);
if (chunk->wp > max_wp || chunk->wp < min_wp)
return 1; return 1;
else if (chunk->wp < line_wp)
line_wp = chunk->wp; i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1);
} }
return 0; return 0;
@ -356,7 +376,7 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
int ret; int ret;
u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec; u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec;
if (pblk_line_wp_is_unbalanced(pblk, line)) if (pblk_line_wps_are_unbalanced(pblk, line))
pblk_warn(pblk, "recovering unbalanced line (%d)\n", line->id); pblk_warn(pblk, "recovering unbalanced line (%d)\n", line->id);
ppa_list = p.ppa_list; ppa_list = p.ppa_list;
@ -703,11 +723,13 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
/* The first valid instance uuid is used for initialization */ /* The first valid instance uuid is used for initialization */
if (!valid_uuid) { if (!valid_uuid) {
-			memcpy(pblk->instance_uuid, smeta_buf->header.uuid, 16);
+			guid_copy(&pblk->instance_uuid,
+				  (guid_t *)&smeta_buf->header.uuid);
 			valid_uuid = 1;
 		}

-		if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) {
+		if (!guid_equal(&pblk->instance_uuid,
+				(guid_t *)&smeta_buf->header.uuid)) {
 			pblk_debug(pblk, "ignore line %u due to uuid mismatch\n",
 				   i);
 			continue;

@@ -737,7 +759,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
 		}

 	if (!found_lines) {
-		pblk_setup_uuid(pblk);
+		guid_gen(&pblk->instance_uuid);
spin_lock(&l_mg->free_lock); spin_lock(&l_mg->free_lock);
WARN_ON_ONCE(!test_and_clear_bit(meta_line, WARN_ON_ONCE(!test_and_clear_bit(meta_line,
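Since the pblk hunks above switch from raw 16-byte arrays to the typed guid_t helpers, here is a minimal, self-contained sketch of that API from <linux/uuid.h>; the demo_* struct and function names are invented for illustration, only the guid_* calls are the real interface:

#include <linux/uuid.h>

struct demo_instance {			/* hypothetical container */
	guid_t id;
};

static void demo_adopt_uuid(struct demo_instance *inst, const u8 *raw16)
{
	/* copy a raw 16-byte on-media UUID into the typed guid_t */
	guid_copy(&inst->id, (const guid_t *)raw16);
}

static bool demo_same_instance(struct demo_instance *a, struct demo_instance *b)
{
	return guid_equal(&a->id, &b->id);
}

static void demo_new_instance(struct demo_instance *inst)
{
	guid_gen(&inst->id);		/* random GUID, replaces uuid_le_gen() */
}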


@ -207,7 +207,7 @@ void pblk_rl_free(struct pblk_rl *rl)
del_timer(&rl->u_timer); del_timer(&rl->u_timer);
} }
void pblk_rl_init(struct pblk_rl *rl, int budget) void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold)
{ {
struct pblk *pblk = container_of(rl, struct pblk, rl); struct pblk *pblk = container_of(rl, struct pblk, rl);
struct nvm_tgt_dev *dev = pblk->dev; struct nvm_tgt_dev *dev = pblk->dev;
@ -217,7 +217,6 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
int sec_meta, blk_meta; int sec_meta, blk_meta;
unsigned int rb_windows; unsigned int rb_windows;
/* Consider sectors used for metadata */ /* Consider sectors used for metadata */
sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);
@ -234,7 +233,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
/* To start with, all buffer is available to user I/O writers */ /* To start with, all buffer is available to user I/O writers */
rl->rb_budget = budget; rl->rb_budget = budget;
rl->rb_user_max = budget; rl->rb_user_max = budget;
-	rl->rb_max_io = budget >> 1;
+	rl->rb_max_io = threshold ? (budget - threshold) : (budget - 1);
rl->rb_gc_max = 0; rl->rb_gc_max = 0;
rl->rb_state = PBLK_RL_HIGH; rl->rb_state = PBLK_RL_HIGH;
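Illustrative arithmetic for the new rb_max_io value (numbers invented for the example): with a write-buffer budget of 1024 entries and a flush threshold of 64, rb_max_io becomes 1024 - 64 = 960, so a single I/O can no longer consume the entries the threshold reserves; with a zero threshold it falls back to budget - 1 rather than the old budget / 2.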


@ -139,7 +139,7 @@ TRACE_EVENT(pblk_state,
/* This part must be outside protection */ /* This part must be outside protection */
#undef TRACE_INCLUDE_PATH #undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH ../../../drivers/lightnvm #define TRACE_INCLUDE_PATH ../../drivers/lightnvm
#undef TRACE_INCLUDE_FILE #undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE pblk-trace #define TRACE_INCLUDE_FILE pblk-trace
#include <trace/define_trace.h> #include <trace/define_trace.h>


@ -177,6 +177,7 @@ static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry,
* re-map these entries * re-map these entries
*/ */
line = pblk_ppa_to_line(pblk, w_ctx->ppa); line = pblk_ppa_to_line(pblk, w_ctx->ppa);
atomic_dec(&line->sec_to_update);
kref_put(&line->ref, pblk_line_put); kref_put(&line->ref, pblk_line_put);
} }
spin_unlock(&pblk->trans_lock); spin_unlock(&pblk->trans_lock);


@ -131,8 +131,8 @@ struct pblk_pr_ctx {
unsigned int bio_init_idx; unsigned int bio_init_idx;
void *ppa_ptr; void *ppa_ptr;
dma_addr_t dma_ppa_list; dma_addr_t dma_ppa_list;
__le64 lba_list_mem[NVM_MAX_VLBA]; u64 lba_list_mem[NVM_MAX_VLBA];
__le64 lba_list_media[NVM_MAX_VLBA]; u64 lba_list_media[NVM_MAX_VLBA];
}; };
/* Pad context */ /* Pad context */
@ -487,6 +487,7 @@ struct pblk_line {
__le32 *vsc; /* Valid sector count in line */ __le32 *vsc; /* Valid sector count in line */
struct kref ref; /* Write buffer L2P references */ struct kref ref; /* Write buffer L2P references */
atomic_t sec_to_update; /* Outstanding L2P updates to ppa */
struct pblk_w_err_gc *w_err_gc; /* Write error gc recovery metadata */ struct pblk_w_err_gc *w_err_gc; /* Write error gc recovery metadata */
@ -646,7 +647,7 @@ struct pblk {
int sec_per_write; int sec_per_write;
unsigned char instance_uuid[16]; guid_t instance_uuid;
/* Persistent write amplification counters, 4kb sector I/Os */ /* Persistent write amplification counters, 4kb sector I/Os */
atomic64_t user_wa; /* Sectors written by user */ atomic64_t user_wa; /* Sectors written by user */
@ -924,7 +925,7 @@ int pblk_gc_sysfs_force(struct pblk *pblk, int force);
/* /*
* pblk rate limiter * pblk rate limiter
*/ */
void pblk_rl_init(struct pblk_rl *rl, int budget); void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold);
void pblk_rl_free(struct pblk_rl *rl); void pblk_rl_free(struct pblk_rl *rl);
void pblk_rl_update_rates(struct pblk_rl *rl); void pblk_rl_update_rates(struct pblk_rl *rl);
int pblk_rl_high_thrs(struct pblk_rl *rl); int pblk_rl_high_thrs(struct pblk_rl *rl);
@ -1360,14 +1361,6 @@ static inline unsigned int pblk_get_secs(struct bio *bio)
return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE; return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE;
} }
static inline void pblk_setup_uuid(struct pblk *pblk)
{
uuid_le uuid;
uuid_le_gen(&uuid);
memcpy(pblk->instance_uuid, uuid.b, 16);
}
static inline char *pblk_disk_name(struct pblk *pblk) static inline char *pblk_disk_name(struct pblk *pblk)
{ {
struct gendisk *disk = pblk->disk; struct gendisk *disk = pblk->disk;


@ -432,8 +432,9 @@ static void do_btree_node_write(struct btree *b)
int j; int j;
struct bio_vec *bv; struct bio_vec *bv;
void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
struct bvec_iter_all iter_all;
-		bio_for_each_segment_all(bv, b->bio, j)
+		bio_for_each_segment_all(bv, b->bio, j, iter_all)
memcpy(page_address(bv->bv_page), memcpy(page_address(bv->bv_page),
base + j * PAGE_SIZE, PAGE_SIZE); base + j * PAGE_SIZE, PAGE_SIZE);
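The extra bvec_iter_all argument seen here recurs in several drivers in this pull (dm-crypt, raid1, and others). As a reference, a minimal sketch of the updated iterator pattern introduced by the multi-page bvec work; the helper below is hypothetical:

#include <linux/bio.h>

static unsigned int demo_bio_bytes(struct bio *bio)
{
	struct bio_vec *bv;
	struct bvec_iter_all iter_all;	/* new: per-page iteration state */
	unsigned int bytes = 0;
	int i;

	/* each step now yields a single-page bio_vec, even when the
	 * underlying multi-page bvec spans several contiguous pages */
	bio_for_each_segment_all(bv, bio, i, iter_all)
		bytes += bv->bv_len;

	return bytes;
}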


@ -538,6 +538,7 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
{ {
struct btree *b = container_of(bk, struct btree, keys); struct btree *b = container_of(bk, struct btree, keys);
unsigned int i, stale; unsigned int i, stale;
char buf[80];
if (!KEY_PTRS(k) || if (!KEY_PTRS(k) ||
bch_extent_invalid(bk, k)) bch_extent_invalid(bk, k))
@ -547,19 +548,19 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
if (!ptr_available(b->c, k, i)) if (!ptr_available(b->c, k, i))
return true; return true;
-	if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
-		return false;
-
 	for (i = 0; i < KEY_PTRS(k); i++) {
 		stale = ptr_stale(b->c, k, i);

+		if (stale && KEY_DIRTY(k)) {
+			bch_extent_to_text(buf, sizeof(buf), k);
+			pr_info("stale dirty pointer, stale %u, key: %s",
+				stale, buf);
+		}
+
 		btree_bug_on(stale > BUCKET_GC_GEN_MAX, b,
 			     "key too stale: %i, need_gc %u",
 			     stale, b->c->need_gc);

-		btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
-			     b, "stale dirty pointer");
-
 		if (stale)
 			return true;


@ -392,10 +392,11 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
/* /*
* Flag for bypass if the IO is for read-ahead or background, * Flag for bypass if the IO is for read-ahead or background,
-	 * unless the read-ahead request is for metadata (eg, for gfs2).
+	 * unless the read-ahead request is for metadata
+	 * (eg, for gfs2 or xfs).
 	 */
 	if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
-	    !(bio->bi_opf & REQ_PRIO))
+	    !(bio->bi_opf & (REQ_META|REQ_PRIO)))
goto skip; goto skip;
if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
@ -877,7 +878,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
} }
if (!(bio->bi_opf & REQ_RAHEAD) && if (!(bio->bi_opf & REQ_RAHEAD) &&
!(bio->bi_opf & REQ_PRIO) && !(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
reada = min_t(sector_t, dc->readahead >> 9, reada = min_t(sector_t, dc->readahead >> 9,
get_capacity(bio->bi_disk) - bio_end_sector(bio)); get_capacity(bio->bi_disk) - bio_end_sector(bio));


@ -111,7 +111,7 @@ void bch_cache_accounting_clear(struct cache_accounting *acc)
{ {
memset(&acc->total.cache_hits, memset(&acc->total.cache_hits,
0, 0,
sizeof(unsigned long) * 7); sizeof(struct cache_stats));
} }
void bch_cache_accounting_destroy(struct cache_accounting *acc) void bch_cache_accounting_destroy(struct cache_accounting *acc)


@ -1615,21 +1615,21 @@ static void conditional_stop_bcache_device(struct cache_set *c,
*/ */
pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.", pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
d->disk->disk_name); d->disk->disk_name);
/* /*
* There might be a small time gap that cache set is * There might be a small time gap that cache set is
* released but bcache device is not. Inside this time * released but bcache device is not. Inside this time
* gap, regular I/O requests will directly go into * gap, regular I/O requests will directly go into
* backing device as no cache set attached to. This * backing device as no cache set attached to. This
* behavior may also introduce potential inconsistence * behavior may also introduce potential inconsistence
* data in writeback mode while cache is dirty. * data in writeback mode while cache is dirty.
* Therefore before calling bcache_device_stop() due * Therefore before calling bcache_device_stop() due
* to a broken cache device, dc->io_disable should be * to a broken cache device, dc->io_disable should be
* explicitly set to true. * explicitly set to true.
*/ */
dc->io_disable = true; dc->io_disable = true;
/* make others know io_disable is true earlier */ /* make others know io_disable is true earlier */
smp_mb(); smp_mb();
bcache_device_stop(d); bcache_device_stop(d);
} else { } else {
/* /*
* dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO


@ -67,6 +67,8 @@ read_attribute(written);
read_attribute(btree_written); read_attribute(btree_written);
read_attribute(metadata_written); read_attribute(metadata_written);
read_attribute(active_journal_entries); read_attribute(active_journal_entries);
read_attribute(backing_dev_name);
read_attribute(backing_dev_uuid);
sysfs_time_stats_attribute(btree_gc, sec, ms); sysfs_time_stats_attribute(btree_gc, sec, ms);
sysfs_time_stats_attribute(btree_split, sec, us); sysfs_time_stats_attribute(btree_split, sec, us);
@ -243,6 +245,19 @@ SHOW(__bch_cached_dev)
return strlen(buf); return strlen(buf);
} }
if (attr == &sysfs_backing_dev_name) {
snprintf(buf, BDEVNAME_SIZE + 1, "%s", dc->backing_dev_name);
strcat(buf, "\n");
return strlen(buf);
}
if (attr == &sysfs_backing_dev_uuid) {
/* convert binary uuid into 36-byte string plus '\0' */
snprintf(buf, 36+1, "%pU", dc->sb.uuid);
strcat(buf, "\n");
return strlen(buf);
}
#undef var #undef var
return 0; return 0;
} }
@ -262,10 +277,10 @@ STORE(__cached_dev)
 	sysfs_strtoul(data_csum, dc->disk.data_csum);
 	d_strtoul(verify);
-	d_strtoul(bypass_torture_test);
-	d_strtoul(writeback_metadata);
-	d_strtoul(writeback_running);
-	d_strtoul(writeback_delay);
+	sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test);
+	sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata);
+	sysfs_strtoul_bool(writeback_running, dc->writeback_running);
+	sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX);
sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
0, bch_cutoff_writeback); 0, bch_cutoff_writeback);
@ -287,9 +302,15 @@ STORE(__cached_dev)
sysfs_strtoul_clamp(writeback_rate_update_seconds, sysfs_strtoul_clamp(writeback_rate_update_seconds,
dc->writeback_rate_update_seconds, dc->writeback_rate_update_seconds,
1, WRITEBACK_RATE_UPDATE_SECS_MAX); 1, WRITEBACK_RATE_UPDATE_SECS_MAX);
-	d_strtoul(writeback_rate_i_term_inverse);
-	d_strtoul_nonzero(writeback_rate_p_term_inverse);
-	d_strtoul_nonzero(writeback_rate_minimum);
+	sysfs_strtoul_clamp(writeback_rate_i_term_inverse,
+			    dc->writeback_rate_i_term_inverse,
+			    1, UINT_MAX);
+	sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
+			    dc->writeback_rate_p_term_inverse,
+			    1, UINT_MAX);
+	sysfs_strtoul_clamp(writeback_rate_minimum,
+			    dc->writeback_rate_minimum,
+			    1, UINT_MAX);
sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX); sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX);
@ -299,7 +320,9 @@ STORE(__cached_dev)
dc->io_disable = v ? 1 : 0; dc->io_disable = v ? 1 : 0;
} }
-	d_strtoi_h(sequential_cutoff);
+	sysfs_strtoul_clamp(sequential_cutoff,
+			    dc->sequential_cutoff,
+			    0, UINT_MAX);
d_strtoi_h(readahead); d_strtoi_h(readahead);
if (attr == &sysfs_clear_stats) if (attr == &sysfs_clear_stats)
@ -452,6 +475,8 @@ static struct attribute *bch_cached_dev_files[] = {
&sysfs_verify, &sysfs_verify,
&sysfs_bypass_torture_test, &sysfs_bypass_torture_test,
#endif #endif
&sysfs_backing_dev_name,
&sysfs_backing_dev_uuid,
NULL NULL
}; };
KTYPE(bch_cached_dev); KTYPE(bch_cached_dev);
@ -761,10 +786,12 @@ STORE(__bch_cache_set)
c->shrink.scan_objects(&c->shrink, &sc); c->shrink.scan_objects(&c->shrink, &sc);
} }
-	sysfs_strtoul(congested_read_threshold_us,
-		      c->congested_read_threshold_us);
-	sysfs_strtoul(congested_write_threshold_us,
-		      c->congested_write_threshold_us);
+	sysfs_strtoul_clamp(congested_read_threshold_us,
+			    c->congested_read_threshold_us,
+			    0, UINT_MAX);
+	sysfs_strtoul_clamp(congested_write_threshold_us,
+			    c->congested_write_threshold_us,
+			    0, UINT_MAX);
if (attr == &sysfs_errors) { if (attr == &sysfs_errors) {
v = __sysfs_match_string(error_actions, -1, buf); v = __sysfs_match_string(error_actions, -1, buf);
@ -774,12 +801,20 @@ STORE(__bch_cache_set)
c->on_error = v; c->on_error = v;
} }
-	if (attr == &sysfs_io_error_limit)
-		c->error_limit = strtoul_or_return(buf);
+	sysfs_strtoul_clamp(io_error_limit, c->error_limit, 0, UINT_MAX);

 	/* See count_io_errors() for why 88 */
-	if (attr == &sysfs_io_error_halflife)
-		c->error_decay = strtoul_or_return(buf) / 88;
+	if (attr == &sysfs_io_error_halflife) {
+		unsigned long v = 0;
+		ssize_t ret;
+
+		ret = strtoul_safe_clamp(buf, v, 0, UINT_MAX);
+		if (!ret) {
+			c->error_decay = v / 88;
+			return size;
+		}
+		return ret;
+	}
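Worked example for the reworked io_error_halflife store (value chosen for illustration): writing the string "880" now goes through strtoul_safe_clamp(), c->error_decay is set to 880 / 88 = 10 and the write returns size, while a malformed or out-of-range string propagates the parser's error code instead of being silently accepted.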
if (attr == &sysfs_io_disable) { if (attr == &sysfs_io_disable) {
v = strtoul_or_return(buf); v = strtoul_or_return(buf);
@ -794,13 +829,15 @@ STORE(__bch_cache_set)
} }
} }
-	sysfs_strtoul(journal_delay_ms, c->journal_delay_ms);
-	sysfs_strtoul(verify, c->verify);
-	sysfs_strtoul(key_merging_disabled, c->key_merging_disabled);
+	sysfs_strtoul_clamp(journal_delay_ms,
+			    c->journal_delay_ms,
+			    0, USHRT_MAX);
+	sysfs_strtoul_bool(verify, c->verify);
+	sysfs_strtoul_bool(key_merging_disabled, c->key_merging_disabled);
 	sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks);
-	sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
-	sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
-	sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled);
+	sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite);
+	sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled);
+	sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled);
/* /*
* write gc_after_writeback here may overwrite an already set * write gc_after_writeback here may overwrite an already set
* BCH_DO_AUTO_GC, it doesn't matter because this flag will be * BCH_DO_AUTO_GC, it doesn't matter because this flag will be


@ -79,11 +79,28 @@ do { \
return strtoul_safe(buf, var) ?: (ssize_t) size; \ return strtoul_safe(buf, var) ?: (ssize_t) size; \
} while (0) } while (0)
#define sysfs_strtoul_bool(file, var) \
do { \
if (attr == &sysfs_ ## file) { \
unsigned long v = strtoul_or_return(buf); \
\
var = v ? 1 : 0; \
return size; \
} \
} while (0)
 #define sysfs_strtoul_clamp(file, var, min, max) \
 do { \
-	if (attr == &sysfs_ ## file) \
-		return strtoul_safe_clamp(buf, var, min, max) \
-			?: (ssize_t) size; \
+	if (attr == &sysfs_ ## file) { \
+		unsigned long v = 0; \
+		ssize_t ret; \
+ \
+		ret = strtoul_safe_clamp(buf, v, min, max); \
+		if (!ret) { \
+			var = v; \
+			return size; \
+		} \
+		return ret; \
+	} \
 } while (0)
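For context, a sketch of how these helpers are used from a STORE() body; every name below (demo_dev, demo_enabled, demo_delay_ms) is hypothetical, and the macros rely on the attr, buf and size locals that bcache's STORE() boilerplate provides, plus matching rw_attribute() declarations:

/* hypothetical illustration only; assumes rw_attribute(demo_enabled)
 * and rw_attribute(demo_delay_ms), inside a bcache file that already
 * includes sysfs.h */
struct demo_dev {
	struct kobject		kobj;
	unsigned int		enabled;
	unsigned int		delay_ms;
};

STORE(__demo_dev)
{
	struct demo_dev *d = container_of(kobj, struct demo_dev, kobj);

	/* writes of any non-zero value are stored as 1 */
	sysfs_strtoul_bool(demo_enabled, d->enabled);

	/* out-of-range or malformed input now returns an error
	 * instead of being stored truncated */
	sysfs_strtoul_clamp(demo_delay_ms, d->delay_ms, 0, USHRT_MAX);

	return size;
}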
#define strtoul_or_return(cp) \ #define strtoul_or_return(cp) \


@ -270,7 +270,11 @@ int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
int i; int i;
struct bio_vec *bv; struct bio_vec *bv;
-	bio_for_each_segment_all(bv, bio, i) {
+	/*
+	 * This is called on freshly new bio, so it is safe to access the
+	 * bvec table directly.
+	 */
+	for (i = 0, bv = bio->bi_io_vec; i < bio->bi_vcnt; bv++, i++) {
bv->bv_page = alloc_page(gfp_mask); bv->bv_page = alloc_page(gfp_mask);
if (!bv->bv_page) { if (!bv->bv_page) {
while (--bv >= bio->bi_io_vec) while (--bv >= bio->bi_io_vec)


@ -71,6 +71,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
in_use > bch_cutoff_writeback_sync) in_use > bch_cutoff_writeback_sync)
return false; return false;
if (bio_op(bio) == REQ_OP_DISCARD)
return false;
if (dc->partial_stripes_expensive && if (dc->partial_stripes_expensive &&
bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector, bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector,
bio_sectors(bio))) bio_sectors(bio)))


@ -1447,8 +1447,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
{ {
unsigned int i; unsigned int i;
struct bio_vec *bv; struct bio_vec *bv;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bv, clone, i) { bio_for_each_segment_all(bv, clone, i, iter_all) {
BUG_ON(!bv->bv_page); BUG_ON(!bv->bv_page);
mempool_free(bv->bv_page, &cc->page_pool); mempool_free(bv->bv_page, &cc->page_pool);
} }


@ -527,7 +527,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
md->tag_set->ops = &dm_mq_ops; md->tag_set->ops = &dm_mq_ops;
md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
md->tag_set->numa_node = md->numa_node_id; md->tag_set->numa_node = md->numa_node_id;
md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
md->tag_set->driver_data = md; md->tag_set->driver_data = md;


@ -1698,14 +1698,6 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
return q && !blk_queue_add_random(q); return q && !blk_queue_add_random(q);
} }
static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
struct request_queue *q = bdev_get_queue(dev->bdev);
return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
}
static bool dm_table_all_devices_attribute(struct dm_table *t, static bool dm_table_all_devices_attribute(struct dm_table *t,
iterate_devices_callout_fn func) iterate_devices_callout_fn func)
{ {
@ -1902,11 +1894,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (!dm_table_supports_write_zeroes(t)) if (!dm_table_supports_write_zeroes(t))
q->limits.max_write_zeroes_sectors = 0; q->limits.max_write_zeroes_sectors = 0;
if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
blk_queue_flag_clear(QUEUE_FLAG_NO_SG_MERGE, q);
else
blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
dm_table_verify_integrity(t); dm_table_verify_integrity(t);
/* /*


@ -96,8 +96,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
int i, cnt; int i, cnt;
bool discard_supported = false; bool discard_supported = false;
-	conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
-			GFP_KERNEL);
+	conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL);
if (!conf) if (!conf)
return NULL; return NULL;
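struct_size() from <linux/overflow.h> appears in a few conversions like this one; a small self-contained sketch with invented names, in case the idiom is unfamiliar:

#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/types.h>

struct frob {				/* hypothetical example struct */
	int nr;
	u64 items[];			/* flexible array member */
};

static struct frob *frob_alloc(int nr_items)
{
	struct frob *f;

	/* computes sizeof(*f) + nr_items * sizeof(f->items[0]), with the
	 * multiply and add checked for overflow (saturates on overflow,
	 * making the allocation fail instead of being undersized) */
	f = kzalloc(struct_size(f, items, nr_items), GFP_KERNEL);
	if (f)
		f->nr = nr_items;
	return f;
}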


@ -1603,11 +1603,9 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
return; return;
} }
set_bit(Blocked, &rdev->flags); set_bit(Blocked, &rdev->flags);
-	if (test_and_clear_bit(In_sync, &rdev->flags)) {
+	if (test_and_clear_bit(In_sync, &rdev->flags))
 		mddev->degraded++;
-		set_bit(Faulty, &rdev->flags);
-	} else
-		set_bit(Faulty, &rdev->flags);
+	set_bit(Faulty, &rdev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags); spin_unlock_irqrestore(&conf->device_lock, flags);
/* /*
* if recovery is running, make sure it aborts. * if recovery is running, make sure it aborts.
@ -2120,13 +2118,14 @@ static void process_checks(struct r1bio *r1_bio)
struct page **spages = get_resync_pages(sbio)->pages; struct page **spages = get_resync_pages(sbio)->pages;
struct bio_vec *bi; struct bio_vec *bi;
int page_len[RESYNC_PAGES] = { 0 }; int page_len[RESYNC_PAGES] = { 0 };
struct bvec_iter_all iter_all;
if (sbio->bi_end_io != end_sync_read) if (sbio->bi_end_io != end_sync_read)
continue; continue;
/* Now we can 'fixup' the error value */ /* Now we can 'fixup' the error value */
sbio->bi_status = 0; sbio->bi_status = 0;
bio_for_each_segment_all(bi, sbio, j) bio_for_each_segment_all(bi, sbio, j, iter_all)
page_len[j] = bi->bv_len; page_len[j] = bi->bv_len;
if (!status) { if (!status) {


@ -417,8 +417,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card)
else else
mq->tag_set.queue_depth = MMC_QUEUE_DEPTH; mq->tag_set.queue_depth = MMC_QUEUE_DEPTH;
mq->tag_set.numa_node = NUMA_NO_NODE; mq->tag_set.numa_node = NUMA_NO_NODE;
mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE | mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
BLK_MQ_F_BLOCKING;
mq->tag_set.nr_hw_queues = 1; mq->tag_set.nr_hw_queues = 1;
mq->tag_set.cmd_size = sizeof(struct mmc_queue_req); mq->tag_set.cmd_size = sizeof(struct mmc_queue_req);
mq->tag_set.driver_data = mq; mq->tag_set.driver_data = mq;


@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* /*
* NVM Express device driver * NVM Express device driver
* Copyright (c) 2011-2014, Intel Corporation. * Copyright (c) 2011-2014, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/ */
#include <linux/blkdev.h> #include <linux/blkdev.h>
@ -151,11 +143,8 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
} }
EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync); EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
-static void nvme_delete_ctrl_work(struct work_struct *work)
+static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
 {
-	struct nvme_ctrl *ctrl =
-		container_of(work, struct nvme_ctrl, delete_work);
-
 	dev_info(ctrl->device,
 		"Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn);

@@ -167,6 +156,14 @@ static void nvme_delete_ctrl_work(struct work_struct *work)
 	nvme_put_ctrl(ctrl);
 }

+static void nvme_delete_ctrl_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(work, struct nvme_ctrl, delete_work);
+
+	nvme_do_delete_ctrl(ctrl);
+}
int nvme_delete_ctrl(struct nvme_ctrl *ctrl) int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
{ {
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
@ -177,7 +174,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
} }
EXPORT_SYMBOL_GPL(nvme_delete_ctrl); EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
-int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
+static int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
 {
 	int ret = 0;

@@ -186,13 +183,13 @@ int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
 	 * can free the controller.
 	 */
 	nvme_get_ctrl(ctrl);
-	ret = nvme_delete_ctrl(ctrl);
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
+		ret = -EBUSY;
 	if (!ret)
-		flush_work(&ctrl->delete_work);
+		nvme_do_delete_ctrl(ctrl);
 	nvme_put_ctrl(ctrl);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync);
static inline bool nvme_ns_has_pi(struct nvme_ns *ns) static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
{ {
@ -611,6 +608,22 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
return BLK_STS_OK; return BLK_STS_OK;
} }
static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
struct request *req, struct nvme_command *cmnd)
{
if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
return nvme_setup_discard(ns, req, cmnd);
cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->write_zeroes.slba =
cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
cmnd->write_zeroes.length =
cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
cmnd->write_zeroes.control = 0;
return BLK_STS_OK;
}
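Quick sanity check on the encoding (example values only): a 4 KiB Write Zeroes request starting at LBA 0 on a namespace with 512-byte blocks (lba_shift = 9) gives blk_rq_bytes() = 4096, so the 0's-based length field is (4096 >> 9) - 1 = 7, i.e. eight logical blocks beginning at slba 0.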
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
struct request *req, struct nvme_command *cmnd) struct request *req, struct nvme_command *cmnd)
{ {
@ -705,7 +718,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
nvme_setup_flush(ns, cmd); nvme_setup_flush(ns, cmd);
break; break;
 	case REQ_OP_WRITE_ZEROES:
-		/* currently only aliased to deallocate for a few ctrls: */
+		ret = nvme_setup_write_zeroes(ns, req, cmd);
+		break;
case REQ_OP_DISCARD: case REQ_OP_DISCARD:
ret = nvme_setup_discard(ns, req, cmd); ret = nvme_setup_discard(ns, req, cmd);
break; break;
@ -1512,6 +1526,37 @@ static void nvme_config_discard(struct nvme_ns *ns)
blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
} }
static inline void nvme_config_write_zeroes(struct nvme_ns *ns)
{
u32 max_sectors;
unsigned short bs = 1 << ns->lba_shift;
if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES))
return;
/*
* Even though NVMe spec explicitly states that MDTS is not
* applicable to the write-zeroes:- "The restriction does not apply to
* commands that do not transfer data between the host and the
* controller (e.g., Write Uncorrectable or Write Zeroes command).".
* In order to be more cautious use controller's max_hw_sectors value
* to configure the maximum sectors for the write-zeroes which is
* configured based on the controller's MDTS field in the
* nvme_init_identify() if available.
*/
if (ns->ctrl->max_hw_sectors == UINT_MAX)
max_sectors = ((u32)(USHRT_MAX + 1) * bs) >> 9;
else
max_sectors = ((u32)(ns->ctrl->max_hw_sectors + 1) * bs) >> 9;
blk_queue_max_write_zeroes_sectors(ns->queue, max_sectors);
}
static inline void nvme_ns_config_oncs(struct nvme_ns *ns)
{
nvme_config_discard(ns);
nvme_config_write_zeroes(ns);
}
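Rough numbers for the resulting cap (illustrative): with 512-byte logical blocks and a controller that reports no MDTS limit (max_hw_sectors == UINT_MAX), max_sectors evaluates to ((USHRT_MAX + 1) * 512) >> 9 = 65536 512-byte sectors, about 32 MiB per Write Zeroes command; when MDTS is known, the limit scales from max_hw_sectors + 1 in the same way.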
static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
struct nvme_id_ns *id, struct nvme_ns_ids *ids) struct nvme_id_ns *id, struct nvme_ns_ids *ids)
{ {
@ -1565,7 +1610,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
capacity = 0; capacity = 0;
set_capacity(disk, capacity); set_capacity(disk, capacity);
-	nvme_config_discard(ns);
+	nvme_ns_config_oncs(ns);
if (id->nsattr & (1 << 0)) if (id->nsattr & (1 << 0))
set_disk_ro(disk, true); set_disk_ro(disk, true);
@ -2280,6 +2325,9 @@ static struct attribute *nvme_subsys_attrs[] = {
&subsys_attr_serial.attr, &subsys_attr_serial.attr,
&subsys_attr_firmware_rev.attr, &subsys_attr_firmware_rev.attr,
&subsys_attr_subsysnqn.attr, &subsys_attr_subsysnqn.attr,
#ifdef CONFIG_NVME_MULTIPATH
&subsys_attr_iopolicy.attr,
#endif
NULL, NULL,
}; };
@ -2332,6 +2380,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
subsys->vendor_id = le16_to_cpu(id->vid); subsys->vendor_id = le16_to_cpu(id->vid);
subsys->cmic = id->cmic; subsys->cmic = id->cmic;
#ifdef CONFIG_NVME_MULTIPATH
subsys->iopolicy = NVME_IOPOLICY_NUMA;
#endif
subsys->dev.class = nvme_subsys_class; subsys->dev.class = nvme_subsys_class;
subsys->dev.release = nvme_release_subsystem; subsys->dev.release = nvme_release_subsystem;
@ -3163,21 +3214,23 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
return 0; return 0;
} }
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{ {
struct nvme_ns *ns; struct nvme_ns *ns;
struct gendisk *disk; struct gendisk *disk;
struct nvme_id_ns *id; struct nvme_id_ns *id;
char disk_name[DISK_NAME_LEN]; char disk_name[DISK_NAME_LEN];
int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT; int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret;
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns) if (!ns)
return; return -ENOMEM;
ns->queue = blk_mq_init_queue(ctrl->tagset); ns->queue = blk_mq_init_queue(ctrl->tagset);
if (IS_ERR(ns->queue)) if (IS_ERR(ns->queue)) {
ret = PTR_ERR(ns->queue);
goto out_free_ns; goto out_free_ns;
}
blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
@ -3193,20 +3246,27 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
nvme_set_queue_limits(ctrl, ns->queue); nvme_set_queue_limits(ctrl, ns->queue);
id = nvme_identify_ns(ctrl, nsid); id = nvme_identify_ns(ctrl, nsid);
if (!id) if (!id) {
ret = -EIO;
goto out_free_queue; goto out_free_queue;
}
if (id->ncap == 0) if (id->ncap == 0) {
ret = -EINVAL;
goto out_free_id; goto out_free_id;
}
if (nvme_init_ns_head(ns, nsid, id)) ret = nvme_init_ns_head(ns, nsid, id);
if (ret)
goto out_free_id; goto out_free_id;
nvme_setup_streams_ns(ctrl, ns); nvme_setup_streams_ns(ctrl, ns);
nvme_set_disk_name(disk_name, ns, ctrl, &flags); nvme_set_disk_name(disk_name, ns, ctrl, &flags);
disk = alloc_disk_node(0, node); disk = alloc_disk_node(0, node);
if (!disk) if (!disk) {
ret = -ENOMEM;
goto out_unlink_ns; goto out_unlink_ns;
}
disk->fops = &nvme_fops; disk->fops = &nvme_fops;
disk->private_data = ns; disk->private_data = ns;
@ -3218,7 +3278,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
__nvme_revalidate_disk(disk, id); __nvme_revalidate_disk(disk, id);
if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
if (nvme_nvm_register(ns, disk_name, node)) { ret = nvme_nvm_register(ns, disk_name, node);
if (ret) {
dev_warn(ctrl->device, "LightNVM init failure\n"); dev_warn(ctrl->device, "LightNVM init failure\n");
goto out_put_disk; goto out_put_disk;
} }
@ -3236,7 +3297,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
nvme_fault_inject_init(ns); nvme_fault_inject_init(ns);
kfree(id); kfree(id);
return; return 0;
out_put_disk: out_put_disk:
put_disk(ns->disk); put_disk(ns->disk);
out_unlink_ns: out_unlink_ns:
@ -3249,6 +3310,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
blk_cleanup_queue(ns->queue); blk_cleanup_queue(ns->queue);
out_free_ns: out_free_ns:
kfree(ns); kfree(ns);
return ret;
} }
static void nvme_ns_remove(struct nvme_ns *ns) static void nvme_ns_remove(struct nvme_ns *ns)
@ -3596,8 +3658,6 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
nvme_stop_keep_alive(ctrl); nvme_stop_keep_alive(ctrl);
flush_work(&ctrl->async_event_work); flush_work(&ctrl->async_event_work);
cancel_work_sync(&ctrl->fw_act_work); cancel_work_sync(&ctrl->fw_act_work);
if (ctrl->ops->stop_ctrl)
ctrl->ops->stop_ctrl(ctrl);
} }
EXPORT_SYMBOL_GPL(nvme_stop_ctrl); EXPORT_SYMBOL_GPL(nvme_stop_ctrl);


@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* /*
* NVMe over Fabrics common host code. * NVMe over Fabrics common host code.
* Copyright (c) 2015-2016 HGST, a Western Digital Company. * Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/ */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h> #include <linux/init.h>
@ -430,6 +422,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue);
* @qid: NVMe I/O queue number for the new I/O connection between * @qid: NVMe I/O queue number for the new I/O connection between
* host and target (note qid == 0 is illegal as this is * host and target (note qid == 0 is illegal as this is
* the Admin queue, per NVMe standard). * the Admin queue, per NVMe standard).
* @poll: Whether or not to poll for the completion of the connect cmd.
* *
* This function issues a fabrics-protocol connection * This function issues a fabrics-protocol connection
* of a NVMe I/O queue (via NVMe Fabrics "Connect" command) * of a NVMe I/O queue (via NVMe Fabrics "Connect" command)


@ -1,15 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* /*
* NVMe over Fabrics common host code. * NVMe over Fabrics common host code.
* Copyright (c) 2015-2016 HGST, a Western Digital Company. * Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/ */
#ifndef _NVME_FABRICS_H #ifndef _NVME_FABRICS_H
#define _NVME_FABRICS_H 1 #define _NVME_FABRICS_H 1


@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
/* /*
* fault injection support for nvme. * fault injection support for nvme.
* *
* Copyright (c) 2018, Oracle and/or its affiliates * Copyright (c) 2018, Oracle and/or its affiliates
*
*/ */
#include <linux/moduleparam.h> #include <linux/moduleparam.h>


@ -1,18 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* /*
* Copyright (c) 2016 Avago Technologies. All rights reserved. * Copyright (c) 2016 Avago Technologies. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful.
* ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
* INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
* THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
* See the GNU General Public License for more details, a copy of which
* can be found in the file COPYING included with this package
*
*/ */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h> #include <linux/module.h>


@ -1,23 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
/* /*
* nvme-lightnvm.c - LightNVM NVMe device * nvme-lightnvm.c - LightNVM NVMe device
* *
* Copyright (C) 2014-2015 IT University of Copenhagen * Copyright (C) 2014-2015 IT University of Copenhagen
* Initial release: Matias Bjorling <mb@lightnvm.io> * Initial release: Matias Bjorling <mb@lightnvm.io>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
* USA.
*
*/ */
#include "nvme.h" #include "nvme.h"


@ -1,14 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* /*
* Copyright (c) 2017-2018 Christoph Hellwig. * Copyright (c) 2017-2018 Christoph Hellwig.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/ */
#include <linux/moduleparam.h> #include <linux/moduleparam.h>
@ -141,7 +133,10 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
test_bit(NVME_NS_ANA_PENDING, &ns->flags)) test_bit(NVME_NS_ANA_PENDING, &ns->flags))
continue; continue;
-		distance = node_distance(node, ns->ctrl->numa_node);
+		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+			distance = node_distance(node, ns->ctrl->numa_node);
+		else
+			distance = LOCAL_DISTANCE;
switch (ns->ana_state) { switch (ns->ana_state) {
case NVME_ANA_OPTIMIZED: case NVME_ANA_OPTIMIZED:
@ -168,6 +163,47 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
return found; return found;
} }
static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
struct nvme_ns *ns)
{
ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
siblings);
if (ns)
return ns;
return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
int node, struct nvme_ns *old)
{
struct nvme_ns *ns, *found, *fallback = NULL;
if (list_is_singular(&head->list))
return old;
for (ns = nvme_next_ns(head, old);
ns != old;
ns = nvme_next_ns(head, ns)) {
if (ns->ctrl->state != NVME_CTRL_LIVE ||
test_bit(NVME_NS_ANA_PENDING, &ns->flags))
continue;
if (ns->ana_state == NVME_ANA_OPTIMIZED) {
found = ns;
goto out;
}
if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
fallback = ns;
}
if (!fallback)
return NULL;
found = fallback;
out:
rcu_assign_pointer(head->current_path[node], found);
return found;
}
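To illustrate the traversal with a hypothetical topology: given three live, ANA-optimized paths A, B and C under one head and current_path pointing at A, successive calls hand out B, then C, then A again; a sibling that is merely NVME_ANA_NONOPTIMIZED is remembered as a fallback and only returned when a full loop finds no optimized path.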
static inline bool nvme_path_is_optimized(struct nvme_ns *ns) static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{ {
return ns->ctrl->state == NVME_CTRL_LIVE && return ns->ctrl->state == NVME_CTRL_LIVE &&
@ -180,6 +216,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
struct nvme_ns *ns; struct nvme_ns *ns;
ns = srcu_dereference(head->current_path[node], &head->srcu); ns = srcu_dereference(head->current_path[node], &head->srcu);
if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
ns = nvme_round_robin_path(head, node, ns);
if (unlikely(!ns || !nvme_path_is_optimized(ns))) if (unlikely(!ns || !nvme_path_is_optimized(ns)))
ns = __nvme_find_path(head, node); ns = __nvme_find_path(head, node);
return ns; return ns;
@ -471,6 +509,44 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl)
cancel_work_sync(&ctrl->ana_work); cancel_work_sync(&ctrl->ana_work);
} }
#define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \
struct device_attribute subsys_attr_##_name = \
__ATTR(_name, _mode, _show, _store)
static const char *nvme_iopolicy_names[] = {
[NVME_IOPOLICY_NUMA] = "numa",
[NVME_IOPOLICY_RR] = "round-robin",
};
static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nvme_subsystem *subsys =
container_of(dev, struct nvme_subsystem, dev);
return sprintf(buf, "%s\n",
nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}
static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct nvme_subsystem *subsys =
container_of(dev, struct nvme_subsystem, dev);
int i;
for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
WRITE_ONCE(subsys->iopolicy, i);
return count;
}
}
return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
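In practice the policy is switched from userspace by writing one of the strings above, "numa" or "round-robin", to the subsystem's iopolicy attribute; on a typical system this should show up under the nvme-subsystem sysfs class (for example /sys/class/nvme-subsystem/nvme-subsys0/iopolicy, with the instance number varying).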
static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
char *buf) char *buf)
{ {


@ -1,14 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* /*
* Copyright (c) 2011-2014, Intel Corporation. * Copyright (c) 2011-2014, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/ */
#ifndef _NVME_H #ifndef _NVME_H
@ -252,6 +244,11 @@ struct nvme_ctrl {
unsigned long discard_page_busy; unsigned long discard_page_busy;
}; };
enum nvme_iopolicy {
NVME_IOPOLICY_NUMA,
NVME_IOPOLICY_RR,
};
struct nvme_subsystem { struct nvme_subsystem {
int instance; int instance;
struct device dev; struct device dev;
@ -271,6 +268,9 @@ struct nvme_subsystem {
u8 cmic; u8 cmic;
u16 vendor_id; u16 vendor_id;
struct ida ns_ida; struct ida ns_ida;
#ifdef CONFIG_NVME_MULTIPATH
enum nvme_iopolicy iopolicy;
#endif
}; };
/* /*
@ -364,7 +364,6 @@ struct nvme_ctrl_ops {
void (*submit_async_event)(struct nvme_ctrl *ctrl); void (*submit_async_event)(struct nvme_ctrl *ctrl);
void (*delete_ctrl)(struct nvme_ctrl *ctrl); void (*delete_ctrl)(struct nvme_ctrl *ctrl);
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
void (*stop_ctrl)(struct nvme_ctrl *ctrl);
}; };
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
@ -459,7 +458,6 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
void *log, size_t size, u64 offset); void *log, size_t size, u64 offset);
@ -492,6 +490,7 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state; extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute subsys_attr_iopolicy;
#else #else
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)


@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* /*
* NVM Express device driver * NVM Express device driver
* Copyright (c) 2011-2014, Intel Corporation. * Copyright (c) 2011-2014, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/ */
#include <linux/aer.h> #include <linux/aer.h>
@ -157,6 +149,8 @@ static int queue_count_set(const char *val, const struct kernel_param *kp)
int n = 0, ret; int n = 0, ret;
ret = kstrtoint(val, 10, &n); ret = kstrtoint(val, 10, &n);
if (ret)
return ret;
if (n > num_possible_cpus()) if (n > num_possible_cpus())
n = num_possible_cpus(); n = num_possible_cpus();


@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* /*
* NVMe over Fabrics RDMA host code. * NVMe over Fabrics RDMA host code.
* Copyright (c) 2015-2016 HGST, a Western Digital Company. * Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/ */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h> #include <linux/module.h>
@ -942,14 +934,6 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
} }
} }
static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
cancel_work_sync(&ctrl->err_work);
cancel_delayed_work_sync(&ctrl->reconnect_work);
}
static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
{ {
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@ -1158,7 +1142,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
struct nvme_rdma_device *dev = queue->device; struct nvme_rdma_device *dev = queue->device;
struct ib_device *ibdev = dev->dev; struct ib_device *ibdev = dev->dev;
if (!blk_rq_payload_bytes(rq)) if (!blk_rq_nr_phys_segments(rq))
return; return;
if (req->mr) { if (req->mr) {
@ -1281,7 +1265,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
c->common.flags |= NVME_CMD_SGL_METABUF; c->common.flags |= NVME_CMD_SGL_METABUF;
if (!blk_rq_payload_bytes(rq)) if (!blk_rq_nr_phys_segments(rq))
return nvme_rdma_set_sg_null(c); return nvme_rdma_set_sg_null(c);
req->sg_table.sgl = req->first_sgl; req->sg_table.sgl = req->first_sgl;
@ -1854,6 +1838,9 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{ {
cancel_work_sync(&ctrl->err_work);
cancel_delayed_work_sync(&ctrl->reconnect_work);
nvme_rdma_teardown_io_queues(ctrl, shutdown); nvme_rdma_teardown_io_queues(ctrl, shutdown);
if (shutdown) if (shutdown)
nvme_shutdown_ctrl(&ctrl->ctrl); nvme_shutdown_ctrl(&ctrl->ctrl);
@ -1902,7 +1889,6 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
.submit_async_event = nvme_rdma_submit_async_event, .submit_async_event = nvme_rdma_submit_async_event,
.delete_ctrl = nvme_rdma_delete_ctrl, .delete_ctrl = nvme_rdma_delete_ctrl,
.get_address = nvmf_get_address, .get_address = nvmf_get_address,
.stop_ctrl = nvme_rdma_stop_ctrl,
}; };
/* /*


@ -1822,6 +1822,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{ {
cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
nvme_tcp_teardown_io_queues(ctrl, shutdown); nvme_tcp_teardown_io_queues(ctrl, shutdown);
if (shutdown) if (shutdown)
nvme_shutdown_ctrl(ctrl); nvme_shutdown_ctrl(ctrl);
@ -1859,12 +1862,6 @@ out_fail:
nvme_tcp_reconnect_or_remove(ctrl); nvme_tcp_reconnect_or_remove(ctrl);
} }
static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
{
cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
}
static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl) static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
{ {
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
@ -2115,7 +2112,6 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
.submit_async_event = nvme_tcp_submit_async_event, .submit_async_event = nvme_tcp_submit_async_event,
.delete_ctrl = nvme_tcp_delete_ctrl, .delete_ctrl = nvme_tcp_delete_ctrl,
.get_address = nvmf_get_address, .get_address = nvmf_get_address,
.stop_ctrl = nvme_tcp_stop_ctrl,
}; };
static bool static bool


@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* /*
* NVM Express device driver tracepoints * NVM Express device driver tracepoints
* Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/ */
#include <asm/unaligned.h> #include <asm/unaligned.h>


@ -1,15 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* /*
* NVM Express device driver tracepoints * NVM Express device driver tracepoints
* Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/ */
#undef TRACE_SYSTEM #undef TRACE_SYSTEM


@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* /*
* NVMe admin command implementation. * NVMe admin command implementation.
* Copyright (c) 2015-2016 HGST, a Western Digital Company. * Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/ */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h> #include <linux/module.h>


@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* /*
* Configfs interface for the NVMe target. * Configfs interface for the NVMe target.
* Copyright (c) 2015-2016 HGST, a Western Digital Company. * Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/ */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h> #include <linux/kernel.h>


@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* /*
* Common code for the NVMe target. * Common code for the NVMe target.
* Copyright (c) 2015-2016 HGST, a Western Digital Company. * Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/ */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h> #include <linux/module.h>


@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* /*
* Discovery service for the NVMe over Fabrics target. * Discovery service for the NVMe over Fabrics target.
* Copyright (C) 2016 Intel Corporation. All rights reserved. * Copyright (C) 2016 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/ */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/slab.h> #include <linux/slab.h>
@ -331,7 +323,7 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
cmd->get_log_page.lid); cmd->get_log_page.lid);
req->error_loc = req->error_loc =
offsetof(struct nvme_get_log_page_command, lid); offsetof(struct nvme_get_log_page_command, lid);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
} }
case nvme_admin_identify: case nvme_admin_identify:
req->data_len = NVME_IDENTIFY_DATA_SIZE; req->data_len = NVME_IDENTIFY_DATA_SIZE;


@ -1,15 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* /*
* NVMe Fabrics command implementation. * NVMe Fabrics command implementation.
* Copyright (c) 2015-2016 HGST, a Western Digital Company. * Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/ */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/blkdev.h> #include <linux/blkdev.h>


@@ -1,18 +1,6 @@
+ // SPDX-License-Identifier: GPL-2.0
  /*
  * Copyright (c) 2016 Avago Technologies. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful.
- * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
- * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
- * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
- * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
- * See the GNU General Public License for more details, a copy of which
- * can be found in the file COPYING included with this package
- *
  */
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  #include <linux/module.h>

View File

@@ -1,17 +1,6 @@
+ // SPDX-License-Identifier: GPL-2.0
  /*
  * Copyright (c) 2016 Avago Technologies. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful.
- * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
- * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
- * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
- * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
- * See the GNU General Public License for more details, a copy of which
- * can be found in the file COPYING included with this package
  */
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  #include <linux/module.h>

View File

@@ -1,15 +1,7 @@
+ // SPDX-License-Identifier: GPL-2.0
  /*
  * NVMe I/O command implementation.
  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
  */
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  #include <linux/blkdev.h>

View File

@@ -1,15 +1,7 @@
+ // SPDX-License-Identifier: GPL-2.0
  /*
  * NVMe over Fabrics loopback device.
  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
  */
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  #include <linux/scatterlist.h>

View File

@@ -1,14 +1,6 @@
+ /* SPDX-License-Identifier: GPL-2.0 */
  /*
  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
  */
  #ifndef _NVMET_H

View File

@@ -1,15 +1,7 @@
+ // SPDX-License-Identifier: GPL-2.0
  /*
  * NVMe over Fabrics RDMA target.
  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
  */
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  #include <linux/atomic.h>

View File

@@ -1900,7 +1900,7 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
  shost->tag_set.queue_depth = shost->can_queue;
  shost->tag_set.cmd_size = cmd_size;
  shost->tag_set.numa_node = NUMA_NO_NODE;
- shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
  shost->tag_set.flags |=
  BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
  shost->tag_set.driver_data = shost;

View File

@@ -20,8 +20,9 @@ static inline void read_endio(struct bio *bio)
  int i;
  struct bio_vec *bvec;
  const blk_status_t err = bio->bi_status;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
  struct page *page = bvec->bv_page;
  /* page is already locked */
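
The change above is the first of many identical conversions in this pull: with multi-page bvecs, bio_for_each_segment_all() needs per-call iteration state, so callers now declare a struct bvec_iter_all and pass it as a fourth argument, while the loop body still sees one page-sized struct bio_vec per pass. A minimal sketch of the new calling convention, using a made-up completion handler (only the iterator usage is taken from the hunks in this diff):

static void example_read_end_io(struct bio *bio)
{
        struct bio_vec *bvec;
        struct bvec_iter_all iter_all;  /* on-stack cursor for the per-page walk */
        int i;

        /* Visit every page-sized segment, even inside a multi-page bvec. */
        bio_for_each_segment_all(bvec, bio, i, iter_all) {
                struct page *page = bvec->bv_page;

                if (bio->bi_status)
                        SetPageError(page);
                else
                        SetPageUptodate(page);
                unlock_page(page);
        }
        bio_put(bio);
}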

View File

@@ -849,8 +849,9 @@ static inline void z_erofs_vle_read_endio(struct bio *bio)
  #ifdef EROFS_FS_HAS_MANAGED_CACHE
  struct address_space *mc = NULL;
  #endif
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
  struct page *page = bvec->bv_page;
  bool cachemngd = false;

View File

@@ -211,6 +211,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
  ssize_t ret;
  blk_qc_t qc;
  int i;
+ struct bvec_iter_all iter_all;
  if ((pos | iov_iter_alignment(iter)) &
  (bdev_logical_block_size(bdev) - 1))
@@ -247,7 +248,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
  task_io_account_write(ret);
  }
  if (iocb->ki_flags & IOCB_HIPRI)
- bio.bi_opf |= REQ_HIPRI;
+ bio_set_polled(&bio, iocb);
  qc = submit_bio(&bio);
  for (;;) {
@@ -260,7 +261,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
  }
  __set_current_state(TASK_RUNNING);
- bio_for_each_segment_all(bvec, &bio, i) {
+ bio_for_each_segment_all(bvec, &bio, i, iter_all) {
  if (should_dirty && !PageCompound(bvec->bv_page))
  set_page_dirty_lock(bvec->bv_page);
  put_page(bvec->bv_page);
@@ -293,6 +294,14 @@ struct blkdev_dio {
  static struct bio_set blkdev_dio_pool;
+ static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
+ {
+ struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
+ struct request_queue *q = bdev_get_queue(bdev);
+ return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
+ }
  static void blkdev_bio_end_io(struct bio *bio)
  {
  struct blkdev_dio *dio = bio->bi_private;
@@ -329,8 +338,9 @@ static void blkdev_bio_end_io(struct bio *bio)
  } else {
  struct bio_vec *bvec;
  int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
  put_page(bvec->bv_page);
  bio_put(bio);
  }
@@ -406,10 +416,17 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
  nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
  if (!nr_pages) {
- if (iocb->ki_flags & IOCB_HIPRI)
- bio->bi_opf |= REQ_HIPRI;
+ bool polled = false;
+ if (iocb->ki_flags & IOCB_HIPRI) {
+ bio_set_polled(bio, iocb);
+ polled = true;
+ }
  qc = submit_bio(bio);
+ if (polled)
+ WRITE_ONCE(iocb->ki_cookie, qc);
  break;
  }
@@ -2076,6 +2093,7 @@ const struct file_operations def_blk_fops = {
  .llseek = block_llseek,
  .read_iter = blkdev_read_iter,
  .write_iter = blkdev_write_iter,
+ .iopoll = blkdev_iopoll,
  .mmap = generic_file_mmap,
  .fsync = blkdev_fsync,
  .unlocked_ioctl = block_ioctl,
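
The iopoll wiring above has two halves: at submission time the bio is flagged for polling and the cookie returned by submit_bio() is published in the kiocb, and the new ->iopoll() hook (blkdev_iopoll() above) later feeds that cookie to blk_poll(). A condensed sketch of the submission half, as a hypothetical helper (the name submit_bio_polled is not from the patch):

static blk_qc_t submit_bio_polled(struct bio *bio, struct kiocb *iocb)
{
        bool polled = false;
        blk_qc_t qc;

        /* Mark the bio so its completion is left for the poller. */
        if (iocb->ki_flags & IOCB_HIPRI) {
                bio_set_polled(bio, iocb);
                polled = true;
        }

        qc = submit_bio(bio);

        /* Publish the cookie; f_op->iopoll() reads it back for blk_poll(). */
        if (polled)
                WRITE_ONCE(iocb->ki_cookie, qc);
        return qc;
}

This mirrors what the __blkdev_direct_IO() hunk above open-codes in its final-bio branch.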

View File

@@ -162,13 +162,14 @@ csum_failed:
  } else {
  int i;
  struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
  /*
  * we have verified the checksum already, set page
  * checked so the end_io handlers know about it
  */
  ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, i)
+ bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all)
  SetPageChecked(bvec->bv_page);
  bio_endio(cb->orig_bio);

View File

@@ -833,9 +833,10 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
  struct bio_vec *bvec;
  struct btrfs_root *root;
  int i, ret = 0;
+ struct bvec_iter_all iter_all;
  ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
  root = BTRFS_I(bvec->bv_page->mapping->host)->root;
  ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
  if (ret)

View File

@@ -152,11 +152,12 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
  {
  blk_status_t ret = 0;
  struct bio_vec *bvec = bio_last_bvec_all(bio);
- struct page *page = bvec->bv_page;
+ struct bio_vec bv;
  struct extent_io_tree *tree = bio->bi_private;
  u64 start;
- start = page_offset(page) + bvec->bv_offset;
+ mp_bvec_last_segment(bvec, &bv);
+ start = page_offset(bv.bv_page) + bv.bv_offset;
  bio->bi_private = NULL;
@@ -2379,7 +2380,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
  int read_mode = 0;
  blk_status_t status;
  int ret;
- unsigned failed_bio_pages = bio_pages_all(failed_bio);
+ unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
  BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2451,9 +2452,10 @@ static void end_bio_extent_writepage(struct bio *bio)
  u64 start;
  u64 end;
  int i;
+ struct bvec_iter_all iter_all;
  ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
  struct page *page = bvec->bv_page;
  struct inode *inode = page->mapping->host;
  struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2522,9 +2524,10 @@ static void end_bio_extent_readpage(struct bio *bio)
  int mirror;
  int ret;
  int i;
+ struct bvec_iter_all iter_all;
  ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
  struct page *page = bvec->bv_page;
  struct inode *inode = page->mapping->host;
  struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3641,9 +3644,10 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
  struct bio_vec *bvec;
  struct extent_buffer *eb;
  int i, done;
+ struct bvec_iter_all iter_all;
  ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
  struct page *page = bvec->bv_page;
  eb = (struct extent_buffer *)page->private;
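
One hunk above is not the iterator conversion: bio_readpage_error() used to size its repair state with bio_pages_all(), which effectively counted bvec table entries, and with multi-page bvecs one entry may now cover several pages. The replacement derives the page count from the total byte size instead; a sketch of that logic with an illustrative helper name (btrfs data bios are built from whole pages, so the shift is exact):

static inline unsigned int failed_bio_page_count(const struct bio *bio)
{
        /* Page-aligned payload: total bytes divided by the page size. */
        return bio->bi_iter.bi_size >> PAGE_SHIFT;
}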

View File

@@ -7829,6 +7829,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
  struct bio_vec *bvec;
  struct extent_io_tree *io_tree, *failure_tree;
  int i;
+ struct bvec_iter_all iter_all;
  if (bio->bi_status)
  goto end;
@@ -7840,7 +7841,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
  done->uptodate = 1;
  ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
  clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
  io_tree, done->start, bvec->bv_page,
  btrfs_ino(BTRFS_I(inode)), 0);
@@ -7919,6 +7920,7 @@ static void btrfs_retry_endio(struct bio *bio)
  int uptodate;
  int ret;
  int i;
+ struct bvec_iter_all iter_all;
  if (bio->bi_status)
  goto end;
@@ -7932,7 +7934,7 @@ static void btrfs_retry_endio(struct bio *bio)
  failure_tree = &BTRFS_I(inode)->io_failure_tree;
  ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
  ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
  bvec->bv_offset, done->start,
  bvec->bv_len);

View File

@@ -1443,10 +1443,11 @@ static void set_bio_pages_uptodate(struct bio *bio)
  {
  struct bio_vec *bvec;
  int i;
+ struct bvec_iter_all iter_all;
  ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
  SetPageUptodate(bvec->bv_page);
  }

View File

@@ -3027,13 +3027,23 @@ void guard_bio_eod(int op, struct bio *bio)
  /* Uhhuh. We've got a bio that straddles the device size! */
  truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
+ /*
+ * The bio contains more than one segment which spans EOD, just return
+ * and let IO layer turn it into an EIO
+ */
+ if (truncated_bytes > bvec->bv_len)
+ return;
  /* Truncate the bio.. */
  bio->bi_iter.bi_size -= truncated_bytes;
  bvec->bv_len -= truncated_bytes;
  /* ..and clear the end of the buffer for reads */
  if (op == REQ_OP_READ) {
- zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+ struct bio_vec bv;
+ mp_bvec_last_segment(bvec, &bv);
+ zero_user(bv.bv_page, bv.bv_offset + bv.bv_len,
  truncated_bytes);
  }
  }
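
mp_bvec_last_segment() is one of the new multi-page bvec helpers: guard_bio_eod() can no longer assume the last bvec covers at most one page, so it first reduces that bvec to the bytes sitting in its final page and only zeroes those. The helper's implementation is not part of this excerpt; the sketch below captures the idea only and is not the in-tree code:

static void last_segment_sketch(const struct bio_vec *bvec, struct bio_vec *seg)
{
        unsigned int total = bvec->bv_offset + bvec->bv_len;
        unsigned int last_page = (total - 1) / PAGE_SIZE;

        seg->bv_page = nth_page(bvec->bv_page, last_page);
        /* A single-page bvec keeps its offset; otherwise the chunk starts
         * at the final page boundary. */
        seg->bv_offset = bvec->bv_offset > last_page * PAGE_SIZE ?
                         bvec->bv_offset - last_page * PAGE_SIZE : 0;
        seg->bv_len = total - last_page * PAGE_SIZE - seg->bv_offset;
}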

View File

@@ -30,8 +30,9 @@ static void __fscrypt_decrypt_bio(struct bio *bio, bool done)
  {
  struct bio_vec *bv;
  int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
  struct page *page = bv->bv_page;
  int ret = fscrypt_decrypt_page(page->mapping->host, page,
  PAGE_SIZE, 0, page->index);

View File

@@ -551,7 +551,9 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
  if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
  bio_check_pages_dirty(bio); /* transfers ownership */
  } else {
- bio_for_each_segment_all(bvec, bio, i) {
+ struct bvec_iter_all iter_all;
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
  struct page *page = bvec->bv_page;
  if (dio->op == REQ_OP_READ && !PageCompound(page) &&

View File

@@ -420,8 +420,9 @@ static void _clear_bio(struct bio *bio)
  {
  struct bio_vec *bv;
  unsigned i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
  unsigned this_count = bv->bv_len;
  if (likely(PAGE_SIZE == this_count))

View File

@@ -468,11 +468,12 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
  /* loop on all devices all pages */
  for (d = 0; d < ios->numdevs; d++) {
  struct bio *bio = ios->per_dev[d].bio;
+ struct bvec_iter_all iter_all;
  if (!bio)
  continue;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
  struct page *page = bv->bv_page;
  SetPageUptodate(page);

View File

@@ -63,8 +63,9 @@ static void ext4_finish_bio(struct bio *bio)
  {
  int i;
  struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
  struct page *page = bvec->bv_page;
  #ifdef CONFIG_EXT4_FS_ENCRYPTION
  struct page *data_page = NULL;

View File

@@ -72,6 +72,7 @@ static void mpage_end_io(struct bio *bio)
  {
  struct bio_vec *bv;
  int i;
+ struct bvec_iter_all iter_all;
  if (ext4_bio_encrypted(bio)) {
  if (bio->bi_status) {
@@ -81,7 +82,7 @@ static void mpage_end_io(struct bio *bio)
  return;
  }
  }
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
  struct page *page = bv->bv_page;
  if (!bio->bi_status) {

View File

@@ -87,8 +87,9 @@ static void __read_end_io(struct bio *bio)
  struct page *page;
  struct bio_vec *bv;
  int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
  page = bv->bv_page;
  /* PG_error was set if any post_read step failed */
@@ -164,13 +165,14 @@ static void f2fs_write_end_io(struct bio *bio)
  struct f2fs_sb_info *sbi = bio->bi_private;
  struct bio_vec *bvec;
  int i;
+ struct bvec_iter_all iter_all;
  if (time_to_inject(sbi, FAULT_WRITE_IO)) {
  f2fs_show_injection_info(FAULT_WRITE_IO);
  bio->bi_status = BLK_STS_IOERR;
  }
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
  struct page *page = bvec->bv_page;
  enum count_type type = WB_DATA_TYPE(page);
@@ -347,6 +349,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
  struct bio_vec *bvec;
  struct page *target;
  int i;
+ struct bvec_iter_all iter_all;
  if (!io->bio)
  return false;
@@ -354,7 +357,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
  if (!inode && !page && !ino)
  return true;
- bio_for_each_segment_all(bvec, io->bio, i) {
+ bio_for_each_segment_all(bvec, io->bio, i, iter_all) {
  if (bvec->bv_page->mapping)
  target = bvec->bv_page;

View File

@@ -1280,6 +1280,7 @@ const struct file_operations gfs2_file_fops = {
  .llseek = gfs2_llseek,
  .read_iter = gfs2_file_read_iter,
  .write_iter = gfs2_file_write_iter,
+ .iopoll = iomap_dio_iopoll,
  .unlocked_ioctl = gfs2_ioctl,
  .mmap = gfs2_mmap,
  .open = gfs2_open,
@@ -1310,6 +1311,7 @@ const struct file_operations gfs2_file_fops_nolock = {
  .llseek = gfs2_llseek,
  .read_iter = gfs2_file_read_iter,
  .write_iter = gfs2_file_write_iter,
+ .iopoll = iomap_dio_iopoll,
  .unlocked_ioctl = gfs2_ioctl,
  .mmap = gfs2_mmap,
  .open = gfs2_open,
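
gfs2 only needs the one-line additions above because iomap_dio_rw() now records the submitting request queue and cookie in the kiocb (see the iomap hunks further down); any filesystem that drives its direct I/O through iomap can opt in the same way. A sketch for a hypothetical examplefs, where every examplefs_* name is a placeholder for the filesystem's existing iomap-based handlers:

static ssize_t examplefs_read_iter(struct kiocb *iocb, struct iov_iter *to);
static ssize_t examplefs_write_iter(struct kiocb *iocb, struct iov_iter *from);

static const struct file_operations examplefs_file_fops = {
        .llseek         = generic_file_llseek,
        .read_iter      = examplefs_read_iter,   /* routes O_DIRECT through iomap_dio_rw() */
        .write_iter     = examplefs_write_iter,
        .iopoll         = iomap_dio_iopoll,      /* polls the queue/cookie stashed by iomap */
        .mmap           = generic_file_mmap,
};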

View File

@@ -168,7 +168,8 @@ u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
  * that is pinned in the pagecache.
  */
- static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
+ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
+ struct bio_vec *bvec,
  blk_status_t error)
  {
  struct buffer_head *bh, *next;
@@ -207,6 +208,7 @@ static void gfs2_end_log_write(struct bio *bio)
  struct bio_vec *bvec;
  struct page *page;
  int i;
+ struct bvec_iter_all iter_all;
  if (bio->bi_status) {
  fs_err(sdp, "Error %d writing to journal, jid=%u\n",
@@ -214,7 +216,7 @@ static void gfs2_end_log_write(struct bio *bio)
  wake_up(&sdp->sd_logd_waitq);
  }
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
  page = bvec->bv_page;
  if (page_has_buffers(page))
  gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);

View File

@@ -190,8 +190,9 @@ static void gfs2_meta_read_endio(struct bio *bio)
  {
  struct bio_vec *bvec;
  int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
  struct page *page = bvec->bv_page;
  struct buffer_head *bh = page_buffers(page);
  unsigned int len = bvec->bv_len;

View File

@@ -274,8 +274,9 @@ iomap_read_end_io(struct bio *bio)
  int error = blk_status_to_errno(bio->bi_status);
  struct bio_vec *bvec;
  int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
  iomap_read_page_end_io(bvec, error);
  bio_put(bio);
  }
@@ -324,7 +325,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  */
  sector = iomap_sector(iomap, pos);
  if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
- if (__bio_try_merge_page(ctx->bio, page, plen, poff))
+ if (__bio_try_merge_page(ctx->bio, page, plen, poff, true))
  goto done;
  is_contig = true;
  }
@@ -355,7 +356,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  ctx->bio->bi_end_io = iomap_read_end_io;
  }
- __bio_add_page(ctx->bio, page, plen, poff);
+ bio_add_page(ctx->bio, page, plen, poff);
  done:
  /*
  * Move the caller beyond our range so that it keeps making progress.
@@ -1463,6 +1464,28 @@ struct iomap_dio {
  };
  };
+ int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
+ {
+ struct request_queue *q = READ_ONCE(kiocb->private);
+ if (!q)
+ return 0;
+ return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
+ }
+ EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
+ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
+ struct bio *bio)
+ {
+ atomic_inc(&dio->ref);
+ if (dio->iocb->ki_flags & IOCB_HIPRI)
+ bio_set_polled(bio, dio->iocb);
+ dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+ dio->submit.cookie = submit_bio(bio);
+ }
  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
  {
  struct kiocb *iocb = dio->iocb;
@@ -1568,14 +1591,15 @@ static void iomap_dio_bio_end_io(struct bio *bio)
  } else {
  struct bio_vec *bvec;
  int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
  put_page(bvec->bv_page);
  bio_put(bio);
  }
  }
- static blk_qc_t
+ static void
  iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
  unsigned len)
  {
@@ -1589,15 +1613,10 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
  bio->bi_private = dio;
  bio->bi_end_io = iomap_dio_bio_end_io;
- if (dio->iocb->ki_flags & IOCB_HIPRI)
- flags |= REQ_HIPRI;
  get_page(page);
  __bio_add_page(bio, page, len, 0);
  bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
- atomic_inc(&dio->ref);
- return submit_bio(bio);
+ iomap_dio_submit_bio(dio, iomap, bio);
  }
  static loff_t
@@ -1700,9 +1719,6 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
  bio_set_pages_dirty(bio);
  }
- if (dio->iocb->ki_flags & IOCB_HIPRI)
- bio->bi_opf |= REQ_HIPRI;
  iov_iter_advance(dio->submit.iter, n);
  dio->size += n;
@@ -1710,11 +1726,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
  copied += n;
  nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
- atomic_inc(&dio->ref);
- dio->submit.last_queue = bdev_get_queue(iomap->bdev);
- dio->submit.cookie = submit_bio(bio);
+ iomap_dio_submit_bio(dio, iomap, bio);
  } while (nr_pages);
  /*
@@ -1925,6 +1937,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
  if (dio->flags & IOMAP_DIO_WRITE_FUA)
  dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+ WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
+ WRITE_ONCE(iocb->private, dio->submit.last_queue);
  /*
  * We are about to drop our additional submission reference, which
  * might be the last reference to the dio. There are three three

View File

@@ -48,8 +48,9 @@ static void mpage_end_io(struct bio *bio)
  {
  struct bio_vec *bv;
  int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
  struct page *page = bv->bv_page;
  page_endio(page, bio_op(bio),
  blk_status_to_errno(bio->bi_status));

Some files were not shown because too many files have changed in this diff.