alistair23-linux/include/linux/blk_types.h
Jens Axboe 320ae51fee blk-mq: new multi-queue block IO queueing mechanism
Linux currently has two models for block devices:

- The classic request_fn based approach, where drivers use struct
  request units for IO. The block layer provides various helper
  functionalities to let drivers share code, things like tag
  management, timeout handling, queueing, etc.

- The "stacked" approach, where a driver squeezes in between the
  block layer and IO submitter. Since this bypasses the IO stack,
  driver generally have to manage everything themselves.

With drivers being written for new high IOPS devices, the classic
request_fn based driver doesn't work well enough. The design dates
back to when both SMP and high IOPS was rare. It has problems with
scaling to bigger machines, and runs into scaling issues even on
smaller machines when you have IOPS in the hundreds of thousands
per device.

The stacked approach is then most often selected as the model
for the driver. But this means that everybody has to re-invent
everything, and along with that we get all the problems again
that the shared approach solved.

This commit introduces blk-mq, block multi queue support. The
design is centered around per-cpu queues for queueing IO, which
then funnel down into x number of hardware submission queues.
We might have a 1:1 mapping between the two, or it might be
an N:M mapping. That all depends on what the hardware supports.

blk-mq provides various helper functions, which include:

- Scalable support for request tagging. Most devices need to
  be able to uniquely identify a request both in the driver and
  to the hardware. The tagging uses per-cpu caches for freed
  tags, to enable cache hot reuse.

- Timeout handling without tracking request on a per-device
  basis. Basically the driver should be able to get a notification,
  if a request happens to fail.

- Optional support for non 1:1 mappings between issue and
  submission queues. blk-mq can redirect IO completions to the
  desired location.

- Support for per-request payloads. Drivers almost always need
  to associate a request structure with some driver private
  command structure. Drivers can tell blk-mq this at init time,
  and then any request handed to the driver will have the
  required size of memory associated with it.

- Support for merging of IO, and plugging. The stacked model
  gets neither of these. Even for high IOPS devices, merging
  sequential IO reduces per-command overhead and thus
  increases bandwidth.

For now, this is provided as a potential 3rd queueing model, with
the hope being that, as it matures, it can replace both the classic
and stacked model. That would get us back to having just 1 real
model for block devices, leaving the stacked approach to dm/md
devices (as it was originally intended).

Contributions in this patch from the following people:

Shaohua Li <shli@fusionio.com>
Alexander Gordeev <agordeev@redhat.com>
Christoph Hellwig <hch@infradead.org>
Mike Christie <michaelc@cs.wisc.edu>
Matias Bjorling <m@bjorling.me>
Jeff Moyer <jmoyer@redhat.com>

Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2013-10-25 11:56:00 +01:00

236 lines
7.8 KiB
C

/*
* Block data types and constants. Directly include this file only to
* break include dependency loop.
*/
#ifndef __LINUX_BLK_TYPES_H
#define __LINUX_BLK_TYPES_H
#ifdef CONFIG_BLOCK
#include <linux/types.h>
struct bio_set;
struct bio;
struct bio_integrity_payload;
struct page;
struct block_device;
struct io_context;
struct cgroup_subsys_state;
typedef void (bio_end_io_t) (struct bio *, int);
typedef void (bio_destructor_t) (struct bio *);
/*
* was unsigned short, but we might as well be ready for > 64kB I/O pages
*/
struct bio_vec {
struct page *bv_page;
unsigned int bv_len;
unsigned int bv_offset;
};
/*
* main unit of I/O for the block layer and lower layers (ie drivers and
* stacking drivers)
*/
struct bio {
sector_t bi_sector; /* device address in 512 byte
sectors */
struct bio *bi_next; /* request queue link */
struct block_device *bi_bdev;
unsigned long bi_flags; /* status, command, etc */
unsigned long bi_rw; /* bottom bits READ/WRITE,
* top bits priority
*/
unsigned short bi_vcnt; /* how many bio_vec's */
unsigned short bi_idx; /* current index into bvl_vec */
/* Number of segments in this BIO after
* physical address coalescing is performed.
*/
unsigned int bi_phys_segments;
unsigned int bi_size; /* residual I/O count */
/*
* To keep track of the max segment size, we account for the
* sizes of the first and last mergeable segments in this bio.
*/
unsigned int bi_seg_front_size;
unsigned int bi_seg_back_size;
bio_end_io_t *bi_end_io;
void *bi_private;
#ifdef CONFIG_BLK_CGROUP
/*
* Optional ioc and css associated with this bio. Put on bio
* release. Read comment on top of bio_associate_current().
*/
struct io_context *bi_ioc;
struct cgroup_subsys_state *bi_css;
#endif
#if defined(CONFIG_BLK_DEV_INTEGRITY)
struct bio_integrity_payload *bi_integrity; /* data integrity */
#endif
/*
* Everything starting with bi_max_vecs will be preserved by bio_reset()
*/
unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
atomic_t bi_cnt; /* pin count */
struct bio_vec *bi_io_vec; /* the actual vec list */
struct bio_set *bi_pool;
/*
* We can inline a number of vecs at the end of the bio, to avoid
* double allocations for a small number of bio_vecs. This member
* MUST obviously be kept at the very end of the bio.
*/
struct bio_vec bi_inline_vecs[0];
};
#define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs)
/*
* bio flags
*/
#define BIO_UPTODATE 0 /* ok after I/O completion */
#define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */
#define BIO_EOF 2 /* out-out-bounds error */
#define BIO_SEG_VALID 3 /* bi_phys_segments valid */
#define BIO_CLONED 4 /* doesn't own data */
#define BIO_BOUNCED 5 /* bio is a bounce bio */
#define BIO_USER_MAPPED 6 /* contains user pages */
#define BIO_EOPNOTSUPP 7 /* not supported */
#define BIO_NULL_MAPPED 8 /* contains invalid user pages */
#define BIO_FS_INTEGRITY 9 /* fs owns integrity data, not block layer */
#define BIO_QUIET 10 /* Make BIO Quiet */
#define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */
#define BIO_SNAP_STABLE 12 /* bio data must be snapshotted during write */
/*
* Flags starting here get preserved by bio_reset() - this includes
* BIO_POOL_IDX()
*/
#define BIO_RESET_BITS 13
#define BIO_OWNS_VEC 13 /* bio_free() should free bvec */
#define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
/*
* top 4 bits of bio flags indicate the pool this bio came from
*/
#define BIO_POOL_BITS (4)
#define BIO_POOL_NONE ((1UL << BIO_POOL_BITS) - 1)
#define BIO_POOL_OFFSET (BITS_PER_LONG - BIO_POOL_BITS)
#define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET)
#define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET)
#endif /* CONFIG_BLOCK */
/*
* Request flags. For use in the cmd_flags field of struct request, and in
* bi_rw of struct bio. Note that some flags are only valid in either one.
*/
enum rq_flag_bits {
/* common flags */
__REQ_WRITE, /* not set, read. set, write */
__REQ_FAILFAST_DEV, /* no driver retries of device errors */
__REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
__REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */
__REQ_SYNC, /* request is sync (sync write or read) */
__REQ_META, /* metadata io request */
__REQ_PRIO, /* boost priority in cfq */
__REQ_DISCARD, /* request to discard sectors */
__REQ_SECURE, /* secure discard (used with __REQ_DISCARD) */
__REQ_WRITE_SAME, /* write same block many times */
__REQ_NOIDLE, /* don't anticipate more IO after this one */
__REQ_FUA, /* forced unit access */
__REQ_FLUSH, /* request for cache flush */
/* bio only flags */
__REQ_RAHEAD, /* read ahead, can fail anytime */
__REQ_THROTTLED, /* This bio has already been subjected to
* throttling rules. Don't do it again. */
/* request only flags */
__REQ_SORTED, /* elevator knows about this request */
__REQ_SOFTBARRIER, /* may not be passed by ioscheduler */
__REQ_NOMERGE, /* don't touch this for merging */
__REQ_STARTED, /* drive already may have started this one */
__REQ_DONTPREP, /* don't call prep for this one */
__REQ_QUEUED, /* uses queueing */
__REQ_ELVPRIV, /* elevator private data attached */
__REQ_FAILED, /* set if the request failed */
__REQ_QUIET, /* don't worry about errors */
__REQ_PREEMPT, /* set for "ide_preempt" requests */
__REQ_ALLOCED, /* request came from our alloc pool */
__REQ_COPY_USER, /* contains copies of user pages */
__REQ_FLUSH_SEQ, /* request for flush sequence */
__REQ_IO_STAT, /* account I/O stat */
__REQ_MIXED_MERGE, /* merge of different types, fail separately */
__REQ_KERNEL, /* direct IO to kernel pages */
__REQ_PM, /* runtime pm request */
__REQ_END, /* last of chain of requests */
__REQ_NR_BITS, /* stops here */
};
#define REQ_WRITE (1ULL << __REQ_WRITE)
#define REQ_FAILFAST_DEV (1ULL << __REQ_FAILFAST_DEV)
#define REQ_FAILFAST_TRANSPORT (1ULL << __REQ_FAILFAST_TRANSPORT)
#define REQ_FAILFAST_DRIVER (1ULL << __REQ_FAILFAST_DRIVER)
#define REQ_SYNC (1ULL << __REQ_SYNC)
#define REQ_META (1ULL << __REQ_META)
#define REQ_PRIO (1ULL << __REQ_PRIO)
#define REQ_DISCARD (1ULL << __REQ_DISCARD)
#define REQ_WRITE_SAME (1ULL << __REQ_WRITE_SAME)
#define REQ_NOIDLE (1ULL << __REQ_NOIDLE)
#define REQ_FAILFAST_MASK \
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
#define REQ_COMMON_MASK \
(REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \
REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \
REQ_SECURE)
#define REQ_CLONE_MASK REQ_COMMON_MASK
#define BIO_NO_ADVANCE_ITER_MASK (REQ_DISCARD|REQ_WRITE_SAME)
/* This mask is used for both bio and request merge checking */
#define REQ_NOMERGE_FLAGS \
(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
#define REQ_THROTTLED (1ULL << __REQ_THROTTLED)
#define REQ_SORTED (1ULL << __REQ_SORTED)
#define REQ_SOFTBARRIER (1ULL << __REQ_SOFTBARRIER)
#define REQ_FUA (1ULL << __REQ_FUA)
#define REQ_NOMERGE (1ULL << __REQ_NOMERGE)
#define REQ_STARTED (1ULL << __REQ_STARTED)
#define REQ_DONTPREP (1ULL << __REQ_DONTPREP)
#define REQ_QUEUED (1ULL << __REQ_QUEUED)
#define REQ_ELVPRIV (1ULL << __REQ_ELVPRIV)
#define REQ_FAILED (1ULL << __REQ_FAILED)
#define REQ_QUIET (1ULL << __REQ_QUIET)
#define REQ_PREEMPT (1ULL << __REQ_PREEMPT)
#define REQ_ALLOCED (1ULL << __REQ_ALLOCED)
#define REQ_COPY_USER (1ULL << __REQ_COPY_USER)
#define REQ_FLUSH (1ULL << __REQ_FLUSH)
#define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ)
#define REQ_IO_STAT (1ULL << __REQ_IO_STAT)
#define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE)
#define REQ_SECURE (1ULL << __REQ_SECURE)
#define REQ_KERNEL (1ULL << __REQ_KERNEL)
#define REQ_PM (1ULL << __REQ_PM)
#define REQ_END (1ULL << __REQ_END)
#endif /* __LINUX_BLK_TYPES_H */