alistair23-linux/block/blk.h
Jeff Moyer 4853abaae7 block: fix flush machinery for stacking drivers with differring flush flags
Commit ae1b153962, block: reimplement
FLUSH/FUA to support merge, introduced a performance regression when
running any sort of fsyncing workload using dm-multipath and certain
storage (in our case, an HP EVA).  The test I ran was fs_mark, and it
dropped from ~800 files/sec on ext4 to ~100 files/sec.  It turns out
that dm-multipath always advertised flush+fua support, and passed
commands on down the stack, where those flags used to get stripped off.
The above commit changed that behavior:

static inline struct request *__elv_next_request(struct request_queue *q)
{
        struct request *rq;

        while (1) {
-               while (!list_empty(&q->queue_head)) {
+               if (!list_empty(&q->queue_head)) {
                        rq = list_entry_rq(q->queue_head.next);
-                       if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
-                           (rq->cmd_flags & REQ_FLUSH_SEQ))
-                               return rq;
-                       rq = blk_do_flush(q, rq);
-                       if (rq)
-                               return rq;
+                       return rq;
                }

Note that previously, a command would come in here, have
REQ_FLUSH|REQ_FUA set, and then get handed off to blk_do_flush:

struct request *blk_do_flush(struct request_queue *q, struct request *rq)
{
        unsigned int fflags = q->flush_flags; /* may change, cache it */
        bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA;
        bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
        bool do_postflush = has_flush && !has_fua && (rq->cmd_flags &
        REQ_FUA);
        unsigned skip = 0;
...
        if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) {
                rq->cmd_flags &= ~REQ_FLUSH;
		if (!has_fua)
			rq->cmd_flags &= ~REQ_FUA;
	        return rq;
	}

So, the flush machinery was bypassed in such cases (q->flush_flags == 0
&& rq->cmd_flags & (REQ_FLUSH|REQ_FUA)).

Now, however, we don't get into the flush machinery at all.  Instead,
__elv_next_request just hands a request with flush and fua bits set to
the scsi_request_fn, even if the underlying request_queue does not
support flush or fua.

The agreed upon approach is to fix the flush machinery to allow
stacking.  While this isn't used in practice (since there is only one
request-based dm target, and that target will now reflect the flush
flags of the underlying device), it does future-proof the solution, and
make it function as designed.

In order to make this work, I had to add a field to the struct request,
inside the flush structure (to store the original req->end_io).  Shaohua
had suggested overloading the union with rb_node and completion_data,
but the completion data is used by device mapper and can also be used by
other drivers.  So, I didn't see a way around the additional field.

I tested this patch on an HP EVA with both ext4 and xfs, and it recovers
the lost performance.  Comments and other testers, as always, are
appreciated.

Cheers,
Jeff

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2011-08-15 21:37:25 +02:00

192 lines
5.5 KiB
C

#ifndef BLK_INTERNAL_H
#define BLK_INTERNAL_H
/* Amount of time in which a process may batch requests */
#define BLK_BATCH_TIME (HZ/50UL)
/* Number of requests a "batching" process may submit */
#define BLK_BATCH_REQ 32
extern struct kmem_cache *blk_requestq_cachep;
extern struct kobj_type blk_queue_ktype;
void init_request_from_bio(struct request *req, struct bio *bio);
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio);
int blk_rq_append_bio(struct request_queue *q, struct request *rq,
struct bio *bio);
void blk_dequeue_request(struct request *rq);
void __blk_queue_free_tags(struct request_queue *q);
bool __blk_end_bidi_request(struct request *rq, int error,
unsigned int nr_bytes, unsigned int bidi_bytes);
void blk_rq_timed_out_timer(unsigned long data);
void blk_delete_timer(struct request *);
void blk_add_timer(struct request *);
void __generic_unplug_device(struct request_queue *);
/*
* Internal atomic flags for request handling
*/
enum rq_atomic_flags {
REQ_ATOM_COMPLETE = 0,
};
/*
* EH timer and IO completion will both attempt to 'grab' the request, make
* sure that only one of them succeeds
*/
static inline int blk_mark_rq_complete(struct request *rq)
{
return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
}
static inline void blk_clear_rq_complete(struct request *rq)
{
clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
}
/*
* Internal elevator interface
*/
#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
void blk_insert_flush(struct request *rq);
void blk_abort_flushes(struct request_queue *q);
static inline struct request *__elv_next_request(struct request_queue *q)
{
struct request *rq;
while (1) {
if (!list_empty(&q->queue_head)) {
rq = list_entry_rq(q->queue_head.next);
return rq;
}
/*
* Flush request is running and flush request isn't queueable
* in the drive, we can hold the queue till flush request is
* finished. Even we don't do this, driver can't dispatch next
* requests and will requeue them. And this can improve
* throughput too. For example, we have request flush1, write1,
* flush 2. flush1 is dispatched, then queue is hold, write1
* isn't inserted to queue. After flush1 is finished, flush2
* will be dispatched. Since disk cache is already clean,
* flush2 will be finished very soon, so looks like flush2 is
* folded to flush1.
* Since the queue is hold, a flag is set to indicate the queue
* should be restarted later. Please see flush_end_io() for
* details.
*/
if (q->flush_pending_idx != q->flush_running_idx &&
!queue_flush_queueable(q)) {
q->flush_queue_delayed = 1;
return NULL;
}
if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) ||
!q->elevator->ops->elevator_dispatch_fn(q, 0))
return NULL;
}
}
static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
if (e->ops->elevator_activate_req_fn)
e->ops->elevator_activate_req_fn(q, rq);
}
static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
if (e->ops->elevator_deactivate_req_fn)
e->ops->elevator_deactivate_req_fn(q, rq);
}
#ifdef CONFIG_FAIL_IO_TIMEOUT
int blk_should_fake_timeout(struct request_queue *);
ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
#else
static inline int blk_should_fake_timeout(struct request_queue *q)
{
return 0;
}
#endif
struct io_context *current_io_context(gfp_t gfp_flags, int node);
int ll_back_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio);
int ll_front_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio);
int attempt_back_merge(struct request_queue *q, struct request *rq);
int attempt_front_merge(struct request_queue *q, struct request *rq);
int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
struct request *next);
void blk_recalc_rq_segments(struct request *rq);
void blk_rq_set_mixed_merge(struct request *rq);
void blk_queue_congestion_threshold(struct request_queue *q);
int blk_dev_init(void);
void elv_quiesce_start(struct request_queue *q);
void elv_quiesce_end(struct request_queue *q);
/*
* Return the threshold (number of used requests) at which the queue is
* considered to be congested. It include a little hysteresis to keep the
* context switch rate down.
*/
static inline int queue_congestion_on_threshold(struct request_queue *q)
{
return q->nr_congestion_on;
}
/*
* The threshold at which a queue is considered to be uncongested
*/
static inline int queue_congestion_off_threshold(struct request_queue *q)
{
return q->nr_congestion_off;
}
static inline int blk_cpu_to_group(int cpu)
{
int group = NR_CPUS;
#ifdef CONFIG_SCHED_MC
const struct cpumask *mask = cpu_coregroup_mask(cpu);
group = cpumask_first(mask);
#elif defined(CONFIG_SCHED_SMT)
group = cpumask_first(topology_thread_cpumask(cpu));
#else
return cpu;
#endif
if (likely(group < NR_CPUS))
return group;
return cpu;
}
/*
* Contribute to IO statistics IFF:
*
* a) it's attached to a gendisk, and
* b) the queue had IO stats enabled when this request was started, and
* c) it's a file system request or a discard request
*/
static inline int blk_do_io_stat(struct request *rq)
{
return rq->rq_disk &&
(rq->cmd_flags & REQ_IO_STAT) &&
(rq->cmd_type == REQ_TYPE_FS ||
(rq->cmd_flags & REQ_DISCARD));
}
#endif