1
0
Fork 0

Merge branch 'for-4.8/drivers' of git://git.kernel.dk/linux-block

Pull block driver updates from Jens Axboe:
 "This branch also contains core changes.  I've come to the conclusion
  that from 4.9 and forward, I'll be doing just a single branch.  We
  often have dependencies between core and drivers, and it's hard to
  always split them up appropriately without pulling core into drivers
  when that happens.

  That said, this contains:

   - separate secure erase type for the core block layer, from
     Christoph.

   - set of discard fixes, from Christoph.

   - bio shrinking fixes from Christoph, as a followup up to the
     op/flags change in the core branch.

   - map and append request fixes from Christoph.

   - NVMeF (NVMe over Fabrics) code from Christoph.  This is pretty
     exciting!

   - nvme-loop fixes from Arnd.

   - removal of ->driverfs_dev from Dan, after providing a
     device_add_disk() helper.

   - bcache fixes from Bhaktipriya and Yijing.

   - cdrom subchannel read fix from Vchannaiah.

   - set of lightnvm updates from Wenwei, Matias, Johannes, and Javier.

   - set of drbd updates and fixes from Fabian, Lars, and Philipp.

   - mg_disk error path fix from Bart.

   - user notification for failed device add for loop, from Minfei.

   - NVMe in general:
        + NVMe delay quirk from Guilherme.
        + SR-IOV support and command retry limits from Keith.
        + fix for memory-less NUMA node from Masayoshi.
        + use UINT_MAX for discard sectors, from Minfei.
        + cancel IO fixes from Ming.
        + don't allocate unused major, from Neil.
        + error code fixup from Dan.
        + use constants for PSDT/FUSE from James.
        + variable init fix from Jay.
        + fabrics fixes from Ming, Sagi, and Wei.
        + various fixes"

* 'for-4.8/drivers' of git://git.kernel.dk/linux-block: (115 commits)
  nvme/pci: Provide SR-IOV support
  nvme: initialize variable before logical OR'ing it
  block: unexport various bio mapping helpers
  scsi/osd: open code blk_make_request
  target: stop using blk_make_request
  block: simplify and export blk_rq_append_bio
  block: ensure bios return from blk_get_request are properly initialized
  virtio_blk: use blk_rq_map_kern
  memstick: don't allow REQ_TYPE_BLOCK_PC requests
  block: shrink bio size again
  block: simplify and cleanup bvec pool handling
  block: get rid of bio_rw and READA
  block: don't ignore -EOPNOTSUPP blkdev_issue_write_same
  block: introduce BLKDEV_DISCARD_ZERO to fix zeroout
  NVMe: don't allocate unused nvme_major
  nvme: avoid crashes when node 0 is memoryless node.
  nvme: Limit command retries
  loop: Make user notify for adding loop device failed
  nvme-loop: fix nvme-loop Kconfig dependencies
  nvmet: fix return value check in nvmet_subsys_alloc()
  ...
hifive-unleashed-5.1
Linus Torvalds 2016-07-26 15:37:51 -07:00
commit 3fc9d69093
129 changed files with 11614 additions and 1280 deletions

View File

@ -340,7 +340,8 @@ CDROMSUBCHNL Read subchannel data (struct cdrom_subchnl)
EINVAL format not CDROM_MSF or CDROM_LBA
notes:
Format is converted to CDROM_MSF on return
Format is converted to CDROM_MSF or CDROM_LBA
as per user request on return

View File

@ -8184,6 +8184,13 @@ S: Supported
F: drivers/nvme/host/
F: include/linux/nvme.h
NVM EXPRESS TARGET DRIVER
M: Christoph Hellwig <hch@lst.de>
M: Sagi Grimberg <sagi@grimberg.me>
L: linux-nvme@lists.infradead.org
S: Supported
F: drivers/nvme/target/
NVMEM FRAMEWORK
M: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
M: Maxime Ripard <maxime.ripard@free-electrons.com>

View File

@ -223,7 +223,6 @@ static int axon_ram_probe(struct platform_device *device)
bank->disk->first_minor = azfs_minor;
bank->disk->fops = &axon_ram_devops;
bank->disk->private_data = bank;
bank->disk->driverfs_dev = &device->dev;
sprintf(bank->disk->disk_name, "%s%d",
AXON_RAM_DEVICE_NAME, axon_ram_bank_id);
@ -238,7 +237,7 @@ static int axon_ram_probe(struct platform_device *device)
set_capacity(bank->disk, bank->size >> AXON_RAM_SECTOR_SHIFT);
blk_queue_make_request(bank->disk->queue, axon_ram_make_request);
blk_queue_logical_block_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE);
add_disk(bank->disk);
device_add_disk(&device->dev, bank->disk);
bank->irq_id = irq_of_parse_and_map(device->dev.of_node, 0);
if (bank->irq_id == NO_IRQ) {

View File

@ -801,6 +801,7 @@ static void ubd_device_release(struct device *dev)
static int ubd_disk_register(int major, u64 size, int unit,
struct gendisk **disk_out)
{
struct device *parent = NULL;
struct gendisk *disk;
disk = alloc_disk(1 << UBD_SHIFT);
@ -823,12 +824,12 @@ static int ubd_disk_register(int major, u64 size, int unit,
ubd_devs[unit].pdev.dev.release = ubd_device_release;
dev_set_drvdata(&ubd_devs[unit].pdev.dev, &ubd_devs[unit]);
platform_device_register(&ubd_devs[unit].pdev);
disk->driverfs_dev = &ubd_devs[unit].pdev.dev;
parent = &ubd_devs[unit].pdev.dev;
}
disk->private_data = &ubd_devs[unit];
disk->queue = ubd_devs[unit].queue;
add_disk(disk);
device_add_disk(parent, disk);
*disk_out = disk;
return 0;

View File

@ -54,7 +54,6 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
{
struct bio_integrity_payload *bip;
struct bio_set *bs = bio->bi_pool;
unsigned long idx = BIO_POOL_NONE;
unsigned inline_vecs;
if (!bs || !bs->bio_integrity_pool) {
@ -72,17 +71,19 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
memset(bip, 0, sizeof(*bip));
if (nr_vecs > inline_vecs) {
unsigned long idx = 0;
bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
bs->bvec_integrity_pool);
if (!bip->bip_vec)
goto err;
bip->bip_max_vcnt = bvec_nr_vecs(idx);
bip->bip_slab = idx;
} else {
bip->bip_vec = bip->bip_inline_vecs;
bip->bip_max_vcnt = inline_vecs;
}
bip->bip_slab = idx;
bip->bip_bio = bio;
bio->bi_integrity = bip;
bio->bi_rw |= REQ_INTEGRITY;
@ -111,9 +112,7 @@ void bio_integrity_free(struct bio *bio)
bip->bip_vec->bv_offset);
if (bs && bs->bio_integrity_pool) {
if (bip->bip_slab != BIO_POOL_NONE)
bvec_free(bs->bvec_integrity_pool, bip->bip_vec,
bip->bip_slab);
bvec_free(bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab);
mempool_free(bip, bs->bio_integrity_pool);
} else {

View File

@ -43,7 +43,7 @@
* unsigned short
*/
#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = {
BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
};
#undef BV
@ -160,11 +160,15 @@ unsigned int bvec_nr_vecs(unsigned short idx)
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
{
BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
if (!idx)
return;
idx--;
if (idx == BIOVEC_MAX_IDX)
BIO_BUG_ON(idx >= BVEC_POOL_NR);
if (idx == BVEC_POOL_MAX) {
mempool_free(bv, pool);
else {
} else {
struct biovec_slab *bvs = bvec_slabs + idx;
kmem_cache_free(bvs->slab, bv);
@ -206,7 +210,7 @@ struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
* idx now points to the pool we want to allocate from. only the
* 1-vec entry pool is mempool backed.
*/
if (*idx == BIOVEC_MAX_IDX) {
if (*idx == BVEC_POOL_MAX) {
fallback:
bvl = mempool_alloc(pool, gfp_mask);
} else {
@ -226,11 +230,12 @@ fallback:
*/
bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) {
*idx = BIOVEC_MAX_IDX;
*idx = BVEC_POOL_MAX;
goto fallback;
}
}
(*idx)++;
return bvl;
}
@ -250,8 +255,7 @@ static void bio_free(struct bio *bio)
__bio_free(bio);
if (bs) {
if (bio_flagged(bio, BIO_OWNS_VEC))
bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));
bvec_free(bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
/*
* If we have front padding, adjust the bio pointer before freeing
@ -420,7 +424,6 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
gfp_t saved_gfp = gfp_mask;
unsigned front_pad;
unsigned inline_vecs;
unsigned long idx = BIO_POOL_NONE;
struct bio_vec *bvl = NULL;
struct bio *bio;
void *p;
@ -480,6 +483,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
bio_init(bio);
if (nr_iovecs > inline_vecs) {
unsigned long idx = 0;
bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
if (!bvl && gfp_mask != saved_gfp) {
punt_bios_to_rescuer(bs);
@ -490,13 +495,12 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
if (unlikely(!bvl))
goto err_free;
bio_set_flag(bio, BIO_OWNS_VEC);
bio->bi_flags |= idx << BVEC_POOL_OFFSET;
} else if (nr_iovecs) {
bvl = bio->bi_inline_vecs;
}
bio->bi_pool = bs;
bio->bi_flags |= idx << BIO_POOL_OFFSET;
bio->bi_max_vecs = nr_iovecs;
bio->bi_io_vec = bvl;
return bio;
@ -568,7 +572,7 @@ EXPORT_SYMBOL(bio_phys_segments);
*/
void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
{
BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE);
BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio));
/*
* most users will be overriding ->bi_bdev with a new target,
@ -1097,7 +1101,6 @@ int bio_uncopy_user(struct bio *bio)
bio_put(bio);
return ret;
}
EXPORT_SYMBOL(bio_uncopy_user);
/**
* bio_copy_user_iov - copy user data to bio
@ -1392,7 +1395,6 @@ void bio_unmap_user(struct bio *bio)
__bio_unmap_user(bio);
bio_put(bio);
}
EXPORT_SYMBOL(bio_unmap_user);
static void bio_map_kern_endio(struct bio *bio)
{
@ -1538,7 +1540,6 @@ cleanup:
bio_put(bio);
return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(bio_copy_kern);
/*
* bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
@ -1832,7 +1833,7 @@ EXPORT_SYMBOL_GPL(bio_trim);
*/
mempool_t *biovec_create_pool(int pool_entries)
{
struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX;
return mempool_create_slab_pool(pool_entries, bp->slab);
}
@ -2009,7 +2010,7 @@ static void __init biovec_init_slabs(void)
{
int i;
for (i = 0; i < BIOVEC_NR_POOLS; i++) {
for (i = 0; i < BVEC_POOL_NR; i++) {
int size;
struct biovec_slab *bvs = bvec_slabs + i;

View File

@ -1294,10 +1294,15 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
spin_lock_irq(q->queue_lock);
rq = get_request(q, rw, 0, NULL, gfp_mask);
if (IS_ERR(rq))
if (IS_ERR(rq)) {
spin_unlock_irq(q->queue_lock);
/* q->queue_lock is unlocked at this point */
return rq;
}
/* q->queue_lock is unlocked at this point */
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
return rq;
}
@ -1312,63 +1317,6 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
}
EXPORT_SYMBOL(blk_get_request);
/**
* blk_make_request - given a bio, allocate a corresponding struct request.
* @q: target request queue
* @bio: The bio describing the memory mappings that will be submitted for IO.
* It may be a chained-bio properly constructed by block/bio layer.
* @gfp_mask: gfp flags to be used for memory allocation
*
* blk_make_request is the parallel of generic_make_request for BLOCK_PC
* type commands. Where the struct request needs to be farther initialized by
* the caller. It is passed a &struct bio, which describes the memory info of
* the I/O transfer.
*
* The caller of blk_make_request must make sure that bi_io_vec
* are set to describe the memory buffers. That bio_data_dir() will return
* the needed direction of the request. (And all bio's in the passed bio-chain
* are properly set accordingly)
*
* If called under none-sleepable conditions, mapped bio buffers must not
* need bouncing, by calling the appropriate masked or flagged allocator,
* suitable for the target device. Otherwise the call to blk_queue_bounce will
* BUG.
*
* WARNING: When allocating/cloning a bio-chain, careful consideration should be
* given to how you allocate bios. In particular, you cannot use
* __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise
* you risk waiting for IO completion of a bio that hasn't been submitted yet,
* thus resulting in a deadlock. Alternatively bios should be allocated using
* bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock.
* If possible a big IO should be split into smaller parts when allocation
* fails. Partial allocation should not be an error, or you risk a live-lock.
*/
struct request *blk_make_request(struct request_queue *q, struct bio *bio,
gfp_t gfp_mask)
{
struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask);
if (IS_ERR(rq))
return rq;
blk_rq_set_block_pc(rq);
for_each_bio(bio) {
struct bio *bounce_bio = bio;
int ret;
blk_queue_bounce(q, &bounce_bio);
ret = blk_rq_append_bio(q, rq, bounce_bio);
if (unlikely(ret)) {
blk_put_request(rq);
return ERR_PTR(ret);
}
}
return rq;
}
EXPORT_SYMBOL(blk_make_request);
/**
* blk_rq_set_block_pc - initialize a request to type BLOCK_PC
* @rq: request to be initialized
@ -1377,9 +1325,6 @@ EXPORT_SYMBOL(blk_make_request);
void blk_rq_set_block_pc(struct request *rq)
{
rq->cmd_type = REQ_TYPE_BLOCK_PC;
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
memset(rq->__cmd, 0, sizeof(rq->__cmd));
}
EXPORT_SYMBOL(blk_rq_set_block_pc);
@ -1982,16 +1927,21 @@ generic_make_request_checks(struct bio *bio)
}
}
if ((bio_op(bio) == REQ_OP_DISCARD) &&
(!blk_queue_discard(q) ||
((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
err = -EOPNOTSUPP;
goto end_io;
}
if (bio_op(bio) == REQ_OP_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
err = -EOPNOTSUPP;
goto end_io;
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
if (!blk_queue_discard(q))
goto not_supported;
break;
case REQ_OP_SECURE_ERASE:
if (!blk_queue_secure_erase(q))
goto not_supported;
break;
case REQ_OP_WRITE_SAME:
if (!bdev_write_same(bio->bi_bdev))
goto not_supported;
break;
default:
break;
}
/*
@ -2008,6 +1958,8 @@ generic_make_request_checks(struct bio *bio)
trace_block_bio_queue(q, bio);
return true;
not_supported:
err = -EOPNOTSUPP;
end_io:
bio->bi_error = err;
bio_endio(bio);
@ -3383,6 +3335,7 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie)
return false;
}
EXPORT_SYMBOL_GPL(blk_poll);
#ifdef CONFIG_PM
/**

View File

@ -23,20 +23,32 @@ static struct bio *next_bio(struct bio *bio, unsigned int nr_pages,
}
int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, int op_flags,
sector_t nr_sects, gfp_t gfp_mask, int flags,
struct bio **biop)
{
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = *biop;
unsigned int granularity;
enum req_op op;
int alignment;
if (!q)
return -ENXIO;
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
if ((op_flags & REQ_SECURE) && !blk_queue_secdiscard(q))
return -EOPNOTSUPP;
if (flags & BLKDEV_DISCARD_SECURE) {
if (flags & BLKDEV_DISCARD_ZERO)
return -EOPNOTSUPP;
if (!blk_queue_secure_erase(q))
return -EOPNOTSUPP;
op = REQ_OP_SECURE_ERASE;
} else {
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
if ((flags & BLKDEV_DISCARD_ZERO) &&
!q->limits.discard_zeroes_data)
return -EOPNOTSUPP;
op = REQ_OP_DISCARD;
}
/* Zero-sector (unknown) and one-sector granularities are the same. */
granularity = max(q->limits.discard_granularity >> 9, 1U);
@ -66,7 +78,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
bio = next_bio(bio, 1, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio->bi_bdev = bdev;
bio_set_op_attrs(bio, REQ_OP_DISCARD, op_flags);
bio_set_op_attrs(bio, op, 0);
bio->bi_iter.bi_size = req_sects << 9;
nr_sects -= req_sects;
@ -100,20 +112,16 @@ EXPORT_SYMBOL(__blkdev_issue_discard);
int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
{
int op_flags = 0;
struct bio *bio = NULL;
struct blk_plug plug;
int ret;
if (flags & BLKDEV_DISCARD_SECURE)
op_flags |= REQ_SECURE;
blk_start_plug(&plug);
ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, op_flags,
ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags,
&bio);
if (!ret && bio) {
ret = submit_bio_wait(bio);
if (ret == -EOPNOTSUPP)
if (ret == -EOPNOTSUPP && !(flags & BLKDEV_DISCARD_ZERO))
ret = 0;
bio_put(bio);
}
@ -173,7 +181,7 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
ret = submit_bio_wait(bio);
bio_put(bio);
}
return ret != -EOPNOTSUPP ? ret : 0;
return ret;
}
EXPORT_SYMBOL(blkdev_issue_write_same);
@ -244,11 +252,11 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, bool discard)
{
struct request_queue *q = bdev_get_queue(bdev);
if (discard && blk_queue_discard(q) && q->limits.discard_zeroes_data &&
blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, 0) == 0)
return 0;
if (discard) {
if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
BLKDEV_DISCARD_ZERO))
return 0;
}
if (bdev_write_same(bdev) &&
blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,

View File

@ -9,21 +9,26 @@
#include "blk.h"
int blk_rq_append_bio(struct request_queue *q, struct request *rq,
struct bio *bio)
/*
* Append a bio to a passthrough request. Only works can be merged into
* the request based on the driver constraints.
*/
int blk_rq_append_bio(struct request *rq, struct bio *bio)
{
if (!rq->bio)
blk_rq_bio_prep(q, rq, bio);
else if (!ll_back_merge_fn(q, rq, bio))
return -EINVAL;
else {
if (!rq->bio) {
blk_rq_bio_prep(rq->q, rq, bio);
} else {
if (!ll_back_merge_fn(rq->q, rq, bio))
return -EINVAL;
rq->biotail->bi_next = bio;
rq->biotail = bio;
rq->__data_len += bio->bi_iter.bi_size;
}
return 0;
}
EXPORT_SYMBOL(blk_rq_append_bio);
static int __blk_rq_unmap_user(struct bio *bio)
{
@ -71,7 +76,7 @@ static int __blk_rq_map_user_iov(struct request *rq,
*/
bio_get(bio);
ret = blk_rq_append_bio(q, rq, bio);
ret = blk_rq_append_bio(rq, bio);
if (ret) {
bio_endio(bio);
__blk_rq_unmap_user(orig_bio);
@ -229,7 +234,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
if (do_copy)
rq->cmd_flags |= REQ_COPY_USER;
ret = blk_rq_append_bio(q, rq, bio);
ret = blk_rq_append_bio(rq, bio);
if (unlikely(ret)) {
/* request is too big */
bio_put(bio);

View File

@ -649,8 +649,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
if (!rq_mergeable(req) || !rq_mergeable(next))
return 0;
if (!blk_check_merge_flags(req->cmd_flags, req_op(req), next->cmd_flags,
req_op(next)))
if (req_op(req) != req_op(next))
return 0;
/*
@ -758,8 +757,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (!rq_mergeable(rq) || !bio_mergeable(bio))
return false;
if (!blk_check_merge_flags(rq->cmd_flags, req_op(rq), bio->bi_rw,
bio_op(bio)))
if (req_op(rq) != bio_op(bio))
return false;
/* different data direction or already started, don't merge */

View File

@ -485,6 +485,32 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
int blk_mq_reinit_tagset(struct blk_mq_tag_set *set)
{
int i, j, ret = 0;
if (!set->ops->reinit_request)
goto out;
for (i = 0; i < set->nr_hw_queues; i++) {
struct blk_mq_tags *tags = set->tags[i];
for (j = 0; j < tags->nr_tags; j++) {
if (!tags->rqs[j])
continue;
ret = set->ops->reinit_request(set->driver_data,
tags->rqs[j]);
if (ret)
goto out;
}
}
out:
return ret;
}
EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv)
{

View File

@ -263,10 +263,53 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
blk_queue_exit(q);
return ERR_PTR(-EWOULDBLOCK);
}
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_request);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
unsigned int flags, unsigned int hctx_idx)
{
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
struct request *rq;
struct blk_mq_alloc_data alloc_data;
int ret;
/*
* If the tag allocator sleeps we could get an allocation for a
* different hardware context. No need to complicate the low level
* allocator for this for the rare use case of a command tied to
* a specific queue.
*/
if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
return ERR_PTR(-EINVAL);
if (hctx_idx >= q->nr_hw_queues)
return ERR_PTR(-EIO);
ret = blk_queue_enter(q, true);
if (ret)
return ERR_PTR(ret);
hctx = q->queue_hw_ctx[hctx_idx];
ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, rw, 0);
if (!rq) {
blk_queue_exit(q);
return ERR_PTR(-EWOULDBLOCK);
}
return rq;
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx, struct request *rq)
{

View File

@ -64,8 +64,6 @@ void blk_exit_rl(struct request_list *rl);
void init_request_from_bio(struct request *req, struct bio *bio);
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio);
int blk_rq_append_bio(struct request_queue *q, struct request *rq,
struct bio *bio);
void blk_queue_bypass_start(struct request_queue *q);
void blk_queue_bypass_end(struct request_queue *q);
void blk_dequeue_request(struct request *rq);

View File

@ -506,7 +506,7 @@ static int exact_lock(dev_t devt, void *data)
return 0;
}
static void register_disk(struct gendisk *disk)
static void register_disk(struct device *parent, struct gendisk *disk)
{
struct device *ddev = disk_to_dev(disk);
struct block_device *bdev;
@ -514,7 +514,7 @@ static void register_disk(struct gendisk *disk)
struct hd_struct *part;
int err;
ddev->parent = disk->driverfs_dev;
ddev->parent = parent;
dev_set_name(ddev, "%s", disk->disk_name);
@ -573,7 +573,8 @@ exit:
}
/**
* add_disk - add partitioning information to kernel list
* device_add_disk - add partitioning information to kernel list
* @parent: parent device for the disk
* @disk: per-device partitioning information
*
* This function registers the partitioning information in @disk
@ -581,7 +582,7 @@ exit:
*
* FIXME: error handling
*/
void add_disk(struct gendisk *disk)
void device_add_disk(struct device *parent, struct gendisk *disk)
{
struct backing_dev_info *bdi;
dev_t devt;
@ -617,7 +618,7 @@ void add_disk(struct gendisk *disk)
blk_register_region(disk_devt(disk), disk->minors, NULL,
exact_match, exact_lock, disk);
register_disk(disk);
register_disk(parent, disk);
blk_register_queue(disk);
/*
@ -633,7 +634,7 @@ void add_disk(struct gendisk *disk)
disk_add_events(disk);
blk_integrity_add(disk);
}
EXPORT_SYMBOL(add_disk);
EXPORT_SYMBOL(device_add_disk);
void del_gendisk(struct gendisk *disk)
{
@ -799,10 +800,9 @@ void __init printk_all_partitions(void)
, disk_name(disk, part->partno, name_buf),
part->info ? part->info->uuid : "");
if (is_part0) {
if (disk->driverfs_dev != NULL &&
disk->driverfs_dev->driver != NULL)
if (dev->parent && dev->parent->driver)
printk(" driver: %s\n",
disk->driverfs_dev->driver->name);
dev->parent->driver->name);
else
printk(" (driver?)\n");
} else

View File

@ -347,9 +347,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
goto out;
}
rw = bio_rw(bio);
if (rw == READA)
rw = READ;
rw = bio_data_dir(bio);
bio_for_each_segment(bvec, bio, iter) {
unsigned int len = bvec.bv_len;

View File

@ -1951,7 +1951,6 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
if (cciss_create_ld_sysfs_entry(h, drv_index))
goto cleanup_queue;
disk->private_data = h->drv[drv_index];
disk->driverfs_dev = &h->drv[drv_index]->dev;
/* Set up queue information */
blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask);
@ -1973,7 +1972,7 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
/* allows the interrupt handler to start the queue */
wmb();
h->drv[drv_index]->queue = disk->queue;
add_disk(disk);
device_add_disk(&h->drv[drv_index]->dev, disk);
return 0;
cleanup_queue:

View File

@ -258,7 +258,7 @@ bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval
unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
D_ASSERT(device, (unsigned)(last - first) <= 1);
D_ASSERT(device, first <= last);
D_ASSERT(device, atomic_read(&device->local_cnt) > 0);
/* FIXME figure out a fast path for bios crossing AL extent boundaries */
@ -341,6 +341,8 @@ static int __al_write_transaction(struct drbd_device *device, struct al_transact
i = 0;
drbd_bm_reset_al_hints(device);
/* Even though no one can start to change this list
* once we set the LC_LOCKED -- from drbd_al_begin_io(),
* lc_try_lock_for_transaction() --, someone may still
@ -770,10 +772,18 @@ static bool lazy_bitmap_update_due(struct drbd_device *device)
static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done)
{
if (rs_done)
set_bit(RS_DONE, &device->flags);
/* and also set RS_PROGRESS below */
else if (!lazy_bitmap_update_due(device))
if (rs_done) {
struct drbd_connection *connection = first_peer_device(device)->connection;
if (connection->agreed_pro_version <= 95 ||
is_sync_target_state(device->state.conn))
set_bit(RS_DONE, &device->flags);
/* and also set RS_PROGRESS below */
/* Else: rather wait for explicit notification via receive_state,
* to avoid uuids-rotated-too-fast causing full resync
* in next handshake, in case the replication link breaks
* at the most unfortunate time... */
} else if (!lazy_bitmap_update_due(device))
return;
drbd_device_post_work(device, RS_PROGRESS);
@ -832,6 +842,13 @@ static int update_sync_bits(struct drbd_device *device,
return count;
}
static bool plausible_request_size(int size)
{
return size > 0
&& size <= DRBD_MAX_BATCH_BIO_SIZE
&& IS_ALIGNED(size, 512);
}
/* clear the bit corresponding to the piece of storage in question:
* size byte of data starting from sector. Only clear a bits of the affected
* one ore more _aligned_ BM_BLOCK_SIZE blocks.
@ -851,7 +868,7 @@ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
if ((mode == SET_OUT_OF_SYNC) && size == 0)
return 0;
if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
if (!plausible_request_size(size)) {
drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
drbd_change_sync_fname[mode],
(unsigned long long)sector, size);

View File

@ -96,6 +96,13 @@ struct drbd_bitmap {
struct page **bm_pages;
spinlock_t bm_lock;
/* exclusively to be used by __al_write_transaction(),
* drbd_bm_mark_for_writeout() and
* and drbd_bm_write_hinted() -> bm_rw() called from there.
*/
unsigned int n_bitmap_hints;
unsigned int al_bitmap_hints[AL_UPDATES_PER_TRANSACTION];
/* see LIMITATIONS: above */
unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
@ -242,6 +249,11 @@ static void bm_set_page_need_writeout(struct page *page)
set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
}
void drbd_bm_reset_al_hints(struct drbd_device *device)
{
device->bitmap->n_bitmap_hints = 0;
}
/**
* drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
* @device: DRBD device.
@ -253,6 +265,7 @@ static void bm_set_page_need_writeout(struct page *page)
*/
void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
{
struct drbd_bitmap *b = device->bitmap;
struct page *page;
if (page_nr >= device->bitmap->bm_number_of_pages) {
drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n",
@ -260,7 +273,9 @@ void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
return;
}
page = device->bitmap->bm_pages[page_nr];
set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page));
BUG_ON(b->n_bitmap_hints >= ARRAY_SIZE(b->al_bitmap_hints));
if (!test_and_set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)))
b->al_bitmap_hints[b->n_bitmap_hints++] = page_nr;
}
static int bm_test_page_unchanged(struct page *page)
@ -427,8 +442,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
}
/*
* called on driver init only. TODO call when a device is created.
* allocates the drbd_bitmap, and stores it in device->bitmap.
* allocates the drbd_bitmap and stores it in device->bitmap.
*/
int drbd_bm_init(struct drbd_device *device)
{
@ -633,7 +647,8 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
unsigned long bits, words, owords, obits;
unsigned long want, have, onpages; /* number of pages */
struct page **npages, **opages = NULL;
int err = 0, growing;
int err = 0;
bool growing;
if (!expect(b))
return -ENOMEM;
@ -1030,7 +1045,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
{
struct drbd_bm_aio_ctx *ctx;
struct drbd_bitmap *b = device->bitmap;
int num_pages, i, count = 0;
unsigned int num_pages, i, count = 0;
unsigned long now;
char ppb[10];
int err = 0;
@ -1078,16 +1093,37 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
now = jiffies;
/* let the layers below us try to merge these bios... */
for (i = 0; i < num_pages; i++) {
/* ignore completely unchanged pages */
if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
break;
if (!(flags & BM_AIO_READ)) {
if ((flags & BM_AIO_WRITE_HINTED) &&
!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
&page_private(b->bm_pages[i])))
continue;
if (flags & BM_AIO_READ) {
for (i = 0; i < num_pages; i++) {
atomic_inc(&ctx->in_flight);
bm_page_io_async(ctx, i);
++count;
cond_resched();
}
} else if (flags & BM_AIO_WRITE_HINTED) {
/* ASSERT: BM_AIO_WRITE_ALL_PAGES is not set. */
unsigned int hint;
for (hint = 0; hint < b->n_bitmap_hints; hint++) {
i = b->al_bitmap_hints[hint];
if (i >= num_pages) /* == -1U: no hint here. */
continue;
/* Several AL-extents may point to the same page. */
if (!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
&page_private(b->bm_pages[i])))
continue;
/* Has it even changed? */
if (bm_test_page_unchanged(b->bm_pages[i]))
continue;
atomic_inc(&ctx->in_flight);
bm_page_io_async(ctx, i);
++count;
}
} else {
for (i = 0; i < num_pages; i++) {
/* ignore completely unchanged pages */
if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
break;
if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
bm_test_page_unchanged(b->bm_pages[i])) {
dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
@ -1100,11 +1136,11 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i);
continue;
}
atomic_inc(&ctx->in_flight);
bm_page_io_async(ctx, i);
++count;
cond_resched();
}
atomic_inc(&ctx->in_flight);
bm_page_io_async(ctx, i);
++count;
cond_resched();
}
/*
@ -1121,10 +1157,14 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
/* summary for global bitmap IO */
if (flags == 0)
drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n",
(flags & BM_AIO_READ) ? "READ" : "WRITE",
count, jiffies - now);
if (flags == 0) {
unsigned int ms = jiffies_to_msecs(jiffies - now);
if (ms > 5) {
drbd_info(device, "bitmap %s of %u pages took %u ms\n",
(flags & BM_AIO_READ) ? "READ" : "WRITE",
count, ms);
}
}
if (ctx->error) {
drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n");

View File

@ -237,14 +237,9 @@ static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_re
seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
if (f & EE_IS_TRIM) {
seq_putc(m, sep);
sep = '|';
if (f & EE_IS_TRIM_USE_ZEROOUT)
seq_puts(m, "zero-out");
else
seq_puts(m, "trim");
}
if (f & EE_IS_TRIM)
__seq_print_rq_state_bit(m, f & EE_IS_TRIM_USE_ZEROOUT, &sep, "zero-out", "trim");
seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same");
seq_putc(m, '\n');
}
@ -908,7 +903,7 @@ static int drbd_version_open(struct inode *inode, struct file *file)
return single_open(file, drbd_version_show, NULL);
}
static struct file_operations drbd_version_fops = {
static const struct file_operations drbd_version_fops = {
.owner = THIS_MODULE,
.open = drbd_version_open,
.llseek = seq_lseek,

View File

@ -468,9 +468,15 @@ enum {
/* this is/was a write request */
__EE_WRITE,
/* this is/was a write same request */
__EE_WRITE_SAME,
/* this originates from application on peer
* (not some resync or verify or other DRBD internal request) */
__EE_APPLICATION,
/* If it contains only 0 bytes, send back P_RS_DEALLOCATED */
__EE_RS_THIN_REQ,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
@ -484,7 +490,9 @@ enum {
#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
#define EE_SUBMITTED (1<<__EE_SUBMITTED)
#define EE_WRITE (1<<__EE_WRITE)
#define EE_WRITE_SAME (1<<__EE_WRITE_SAME)
#define EE_APPLICATION (1<<__EE_APPLICATION)
#define EE_RS_THIN_REQ (1<<__EE_RS_THIN_REQ)
/* flag bits per device */
enum {
@ -1123,6 +1131,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
extern int drbd_send_bitmap(struct drbd_device *device);
extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
extern int drbd_send_rs_deallocated(struct drbd_peer_device *, struct drbd_peer_request *);
extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev);
extern void drbd_device_cleanup(struct drbd_device *device);
void drbd_print_uuids(struct drbd_device *device, const char *text);
@ -1342,11 +1351,11 @@ struct bm_extent {
#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
/* For now, don't allow more than one activity log extent worth of data
* to be discarded in one go. We may need to rework drbd_al_begin_io()
* to allow for even larger discard ranges */
#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE
#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9)
/* For now, don't allow more than half of what we can "activate" in one
* activity log transaction to be discarded in one go. We may need to rework
* drbd_al_begin_io() to allow for even larger discard ranges */
#define DRBD_MAX_BATCH_BIO_SIZE (AL_UPDATES_PER_TRANSACTION/2*AL_EXTENT_SIZE)
#define DRBD_MAX_BBIO_SECTORS (DRBD_MAX_BATCH_BIO_SIZE >> 9)
extern int drbd_bm_init(struct drbd_device *device);
extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
@ -1369,6 +1378,7 @@ extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
extern int drbd_bm_read(struct drbd_device *device) __must_hold(local);
extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
extern int drbd_bm_write(struct drbd_device *device) __must_hold(local);
extern void drbd_bm_reset_al_hints(struct drbd_device *device) __must_hold(local);
extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local);
extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
@ -1483,12 +1493,14 @@ enum determine_dev_size {
extern enum determine_dev_size
drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
extern void resync_after_online_grow(struct drbd_device *);
extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev);
extern void drbd_reconsider_queue_parameters(struct drbd_device *device,
struct drbd_backing_dev *bdev, struct o_qlim *o);
extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
enum drbd_role new_role,
int force);
extern bool conn_try_outdate_peer(struct drbd_connection *connection);
extern void conn_try_outdate_peer_async(struct drbd_connection *connection);
extern enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd);
extern int drbd_khelper(struct drbd_device *device, char *cmd);
/* drbd_worker.c */
@ -1548,6 +1560,8 @@ extern void start_resync_timer_fn(unsigned long data);
extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
/* drbd_receiver.c */
extern int drbd_issue_discard_or_zero_out(struct drbd_device *device,
sector_t start, unsigned int nr_sectors, bool discard);
extern int drbd_receiver(struct drbd_thread *thi);
extern int drbd_ack_receiver(struct drbd_thread *thi);
extern void drbd_send_ping_wf(struct work_struct *ws);
@ -1561,7 +1575,7 @@ extern int drbd_submit_peer_request(struct drbd_device *,
extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
sector_t, unsigned int,
bool,
unsigned int,
gfp_t) __must_hold(local);
extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
int);
@ -1635,8 +1649,6 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
/* drbd_proc.c */
extern struct proc_dir_entry *drbd_proc;
extern const struct file_operations drbd_proc_fops;
extern const char *drbd_conn_str(enum drbd_conns s);
extern const char *drbd_role_str(enum drbd_role s);
/* drbd_actlog.c */
extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
@ -2095,13 +2107,22 @@ static inline void _sub_unacked(struct drbd_device *device, int n, const char *f
ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
}
static inline bool is_sync_target_state(enum drbd_conns connection_state)
{
return connection_state == C_SYNC_TARGET ||
connection_state == C_PAUSED_SYNC_T;
}
static inline bool is_sync_source_state(enum drbd_conns connection_state)
{
return connection_state == C_SYNC_SOURCE ||
connection_state == C_PAUSED_SYNC_S;
}
static inline bool is_sync_state(enum drbd_conns connection_state)
{
return
(connection_state == C_SYNC_SOURCE
|| connection_state == C_SYNC_TARGET
|| connection_state == C_PAUSED_SYNC_S
|| connection_state == C_PAUSED_SYNC_T);
return is_sync_source_state(connection_state) ||
is_sync_target_state(connection_state);
}
/**

View File

@ -6,13 +6,13 @@
struct drbd_interval {
struct rb_node rb;
sector_t sector; /* start sector of the interval */
unsigned int size; /* size in bytes */
sector_t end; /* highest interval end in subtree */
int local:1 /* local or remote request? */;
int waiting:1; /* someone is waiting for this to complete */
int completed:1; /* this has been completed already;
* ignore for conflict detection */
sector_t sector; /* start sector of the interval */
unsigned int size; /* size in bytes */
sector_t end; /* highest interval end in subtree */
unsigned int local:1 /* local or remote request? */;
unsigned int waiting:1; /* someone is waiting for completion */
unsigned int completed:1; /* this has been completed already;
* ignore for conflict detection */
};
static inline void drbd_clear_interval(struct drbd_interval *i)

View File

@ -31,7 +31,7 @@
#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <linux/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
@ -920,6 +920,31 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
}
}
/* communicated if (agreed_features & DRBD_FF_WSAME) */
void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct request_queue *q)
{
if (q) {
p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
p->qlim->alignment_offset = cpu_to_be32(queue_alignment_offset(q));
p->qlim->io_min = cpu_to_be32(queue_io_min(q));
p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
p->qlim->discard_enabled = blk_queue_discard(q);
p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q);
p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
} else {
q = device->rq_queue;
p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
p->qlim->alignment_offset = 0;
p->qlim->io_min = cpu_to_be32(queue_io_min(q));
p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
p->qlim->discard_enabled = 0;
p->qlim->discard_zeroes_data = 0;
p->qlim->write_same_capable = 0;
}
}
int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags)
{
struct drbd_device *device = peer_device->device;
@ -928,29 +953,37 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu
sector_t d_size, u_size;
int q_order_type;
unsigned int max_bio_size;
unsigned int packet_size;
sock = &peer_device->connection->data;
p = drbd_prepare_command(peer_device, sock);
if (!p)
return -EIO;
packet_size = sizeof(*p);
if (peer_device->connection->agreed_features & DRBD_FF_WSAME)
packet_size += sizeof(p->qlim[0]);
memset(p, 0, packet_size);
if (get_ldev_if_state(device, D_NEGOTIATING)) {
D_ASSERT(device, device->ldev->backing_bdev);
struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
d_size = drbd_get_max_capacity(device->ldev);
rcu_read_lock();
u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
rcu_read_unlock();
q_order_type = drbd_queue_order_type(device);
max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
max_bio_size = queue_max_hw_sectors(q) << 9;
max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
assign_p_sizes_qlim(device, p, q);
put_ldev(device);
} else {
d_size = 0;
u_size = 0;
q_order_type = QUEUE_ORDERED_NONE;
max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
assign_p_sizes_qlim(device, p, NULL);
}
sock = &peer_device->connection->data;
p = drbd_prepare_command(peer_device, sock);
if (!p)
return -EIO;
if (peer_device->connection->agreed_pro_version <= 94)
max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
else if (peer_device->connection->agreed_pro_version < 100)
@ -962,7 +995,8 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu
p->max_bio_size = cpu_to_be32(max_bio_size);
p->queue_order_type = cpu_to_be16(q_order_type);
p->dds_flags = cpu_to_be16(flags);
return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0);
return drbd_send_command(peer_device, sock, P_SIZES, packet_size, NULL, 0);
}
/**
@ -1377,6 +1411,22 @@ int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
cpu_to_be64(block_id));
}
int drbd_send_rs_deallocated(struct drbd_peer_device *peer_device,
struct drbd_peer_request *peer_req)
{
struct drbd_socket *sock;
struct p_block_desc *p;
sock = &peer_device->connection->data;
p = drbd_prepare_command(peer_device, sock);
if (!p)
return -EIO;
p->sector = cpu_to_be64(peer_req->i.sector);
p->blksize = cpu_to_be32(peer_req->i.size);
p->pad = 0;
return drbd_send_command(peer_device, sock, P_RS_DEALLOCATED, sizeof(*p), NULL, 0);
}
int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd,
sector_t sector, int size, u64 block_id)
{
@ -1561,6 +1611,9 @@ static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio)
? 0 : MSG_MORE);
if (err)
return err;
/* REQ_OP_WRITE_SAME has only one segment */
if (bio_op(bio) == REQ_OP_WRITE_SAME)
break;
}
return 0;
}
@ -1579,6 +1632,9 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b
bio_iter_last(bvec, iter) ? 0 : MSG_MORE);
if (err)
return err;
/* REQ_OP_WRITE_SAME has only one segment */
if (bio_op(bio) == REQ_OP_WRITE_SAME)
break;
}
return 0;
}
@ -1610,6 +1666,7 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection,
return (bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
(bio->bi_rw & REQ_FUA ? DP_FUA : 0) |
(bio->bi_rw & REQ_PREFLUSH ? DP_FLUSH : 0) |
(bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
(bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0);
else
return bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
@ -1623,6 +1680,8 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
struct drbd_device *device = peer_device->device;
struct drbd_socket *sock;
struct p_data *p;
struct p_wsame *wsame = NULL;
void *digest_out;
unsigned int dp_flags = 0;
int digest_size;
int err;
@ -1658,12 +1717,29 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
goto out;
}
if (dp_flags & DP_WSAME) {
/* this will only work if DRBD_FF_WSAME is set AND the
* handshake agreed that all nodes and backend devices are
* WRITE_SAME capable and agree on logical_block_size */
wsame = (struct p_wsame*)p;
digest_out = wsame + 1;
wsame->size = cpu_to_be32(req->i.size);
} else
digest_out = p + 1;
/* our digest is still only over the payload.
* TRIM does not carry any payload. */
if (digest_size)
drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1);
err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + digest_size, NULL, req->i.size);
drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest_out);
if (wsame) {
err =
__send_command(peer_device->connection, device->vnr, sock, P_WSAME,
sizeof(*wsame) + digest_size, NULL,
bio_iovec(req->master_bio).bv_len);
} else
err =
__send_command(peer_device->connection, device->vnr, sock, P_DATA,
sizeof(*p) + digest_size, NULL, req->i.size);
if (!err) {
/* For protocol A, we have to memcpy the payload into
* socket buffers, as we may complete right away
@ -3507,7 +3583,12 @@ static int w_bitmap_io(struct drbd_work *w, int unused)
struct bm_io_work *work = &device->bm_io_work;
int rv = -EIO;
D_ASSERT(device, atomic_read(&device->ap_bio_cnt) == 0);
if (work->flags != BM_LOCKED_CHANGE_ALLOWED) {
int cnt = atomic_read(&device->ap_bio_cnt);
if (cnt)
drbd_err(device, "FIXME: ap_bio_cnt %d, expected 0; queued for '%s'\n",
cnt, work->why);
}
if (get_ldev(device)) {
drbd_bm_lock(device, work->why, work->flags);
@ -3587,18 +3668,20 @@ void drbd_queue_bitmap_io(struct drbd_device *device,
int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *),
char *why, enum bm_flag flags)
{
/* Only suspend io, if some operation is supposed to be locked out */
const bool do_suspend_io = flags & (BM_DONT_CLEAR|BM_DONT_SET|BM_DONT_TEST);
int rv;
D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
if (do_suspend_io)
drbd_suspend_io(device);
drbd_bm_lock(device, why, flags);
rv = io_fn(device);
drbd_bm_unlock(device);
if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
if (do_suspend_io)
drbd_resume_io(device);
return rv;
@ -3637,6 +3720,8 @@ const char *cmdname(enum drbd_packet cmd)
* one PRO_VERSION */
static const char *cmdnames[] = {
[P_DATA] = "Data",
[P_WSAME] = "WriteSame",
[P_TRIM] = "Trim",
[P_DATA_REPLY] = "DataReply",
[P_RS_DATA_REPLY] = "RSDataReply",
[P_BARRIER] = "Barrier",
@ -3681,6 +3766,8 @@ const char *cmdname(enum drbd_packet cmd)
[P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply",
[P_RETRY_WRITE] = "retry_write",
[P_PROTOCOL_UPDATE] = "protocol_update",
[P_RS_THIN_REQ] = "rs_thin_req",
[P_RS_DEALLOCATED] = "rs_deallocated",
/* enum drbd_packet, but not commands - obsoleted flags:
* P_MAY_IGNORE

View File

@ -343,7 +343,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
(char[20]) { }, /* address family */
(char[60]) { }, /* address */
NULL };
char mb[12];
char mb[14];
char *argv[] = {usermode_helper, cmd, mb, NULL };
struct drbd_connection *connection = first_peer_device(device)->connection;
struct sib_info sib;
@ -352,7 +352,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
if (current == connection->worker.task)
set_bit(CALLBACK_PENDING, &connection->flags);
snprintf(mb, 12, "minor-%d", device_to_minor(device));
snprintf(mb, 14, "minor-%d", device_to_minor(device));
setup_khelper_env(connection, envp);
/* The helper may take some time.
@ -387,7 +387,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
return ret;
}
static int conn_khelper(struct drbd_connection *connection, char *cmd)
enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd)
{
char *envp[] = { "HOME=/",
"TERM=linux",
@ -442,19 +442,17 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connec
}
rcu_read_unlock();
if (fp == FP_NOT_AVAIL) {
/* IO Suspending works on the whole resource.
Do it only for one device. */
vnr = 0;
peer_device = idr_get_next(&connection->peer_devices, &vnr);
drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0));
}
return fp;
}
static bool resource_is_supended(struct drbd_resource *resource)
{
return resource->susp || resource->susp_fen || resource->susp_nod;
}
bool conn_try_outdate_peer(struct drbd_connection *connection)
{
struct drbd_resource * const resource = connection->resource;
unsigned int connect_cnt;
union drbd_state mask = { };
union drbd_state val = { };
@ -462,21 +460,41 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
char *ex_to_string;
int r;
spin_lock_irq(&connection->resource->req_lock);
spin_lock_irq(&resource->req_lock);
if (connection->cstate >= C_WF_REPORT_PARAMS) {
drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
spin_unlock_irq(&connection->resource->req_lock);
spin_unlock_irq(&resource->req_lock);
return false;
}
connect_cnt = connection->connect_cnt;
spin_unlock_irq(&connection->resource->req_lock);
spin_unlock_irq(&resource->req_lock);
fp = highest_fencing_policy(connection);
switch (fp) {
case FP_NOT_AVAIL:
drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n");
goto out;
spin_lock_irq(&resource->req_lock);
if (connection->cstate < C_WF_REPORT_PARAMS) {
_conn_request_state(connection,
(union drbd_state) { { .susp_fen = 1 } },
(union drbd_state) { { .susp_fen = 0 } },
CS_VERBOSE | CS_HARD | CS_DC_SUSP);
/* We are no longer suspended due to the fencing policy.
* We may still be suspended due to the on-no-data-accessible policy.
* If that was OND_IO_ERROR, fail pending requests. */
if (!resource_is_supended(resource))
_tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
}
/* Else: in case we raced with a connection handshake,
* let the handshake figure out if we maybe can RESEND,
* and do not resume/fail pending requests here.
* Worst case is we stay suspended for now, which may be
* resolved by either re-establishing the replication link, or
* the next link failure, or eventually the administrator. */
spin_unlock_irq(&resource->req_lock);
return false;
case FP_DONT_CARE:
return true;
default: ;
@ -485,17 +503,17 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
r = conn_khelper(connection, "fence-peer");
switch ((r>>8) & 0xff) {
case 3: /* peer is inconsistent */
case P_INCONSISTENT: /* peer is inconsistent */
ex_to_string = "peer is inconsistent or worse";
mask.pdsk = D_MASK;
val.pdsk = D_INCONSISTENT;
break;
case 4: /* peer got outdated, or was already outdated */
case P_OUTDATED: /* peer got outdated, or was already outdated */
ex_to_string = "peer was fenced";
mask.pdsk = D_MASK;
val.pdsk = D_OUTDATED;
break;
case 5: /* peer was down */
case P_DOWN: /* peer was down */
if (conn_highest_disk(connection) == D_UP_TO_DATE) {
/* we will(have) create(d) a new UUID anyways... */
ex_to_string = "peer is unreachable, assumed to be dead";
@ -505,7 +523,7 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
}
break;
case 6: /* Peer is primary, voluntarily outdate myself.
case P_PRIMARY: /* Peer is primary, voluntarily outdate myself.
* This is useful when an unconnected R_SECONDARY is asked to
* become R_PRIMARY, but finds the other peer being active. */
ex_to_string = "peer is active";
@ -513,7 +531,9 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
mask.disk = D_MASK;
val.disk = D_OUTDATED;
break;
case 7:
case P_FENCING:
/* THINK: do we need to handle this
* like case 4, or more like case 5? */
if (fp != FP_STONITH)
drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n");
ex_to_string = "peer was stonithed";
@ -529,13 +549,11 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
drbd_info(connection, "fence-peer helper returned %d (%s)\n",
(r>>8) & 0xff, ex_to_string);
out:
/* Not using
conn_request_state(connection, mask, val, CS_VERBOSE);
here, because we might were able to re-establish the connection in the
meantime. */
spin_lock_irq(&connection->resource->req_lock);
spin_lock_irq(&resource->req_lock);
if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) {
if (connection->connect_cnt != connect_cnt)
/* In case the connection was established and droped
@ -544,7 +562,7 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
else
_conn_request_state(connection, mask, val, CS_VERBOSE);
}
spin_unlock_irq(&connection->resource->req_lock);
spin_unlock_irq(&resource->req_lock);
return conn_highest_pdsk(connection) <= D_OUTDATED;
}
@ -1154,51 +1172,160 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
return 0;
}
static void blk_queue_discard_granularity(struct request_queue *q, unsigned int granularity)
{
q->limits.discard_granularity = granularity;
}
static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection)
{
/* when we introduced REQ_WRITE_SAME support, we also bumped
* our maximum supported batch bio size used for discards. */
if (connection->agreed_features & DRBD_FF_WSAME)
return DRBD_MAX_BBIO_SECTORS;
/* before, with DRBD <= 8.4.6, we only allowed up to one AL_EXTENT_SIZE. */
return AL_EXTENT_SIZE >> 9;
}
static void decide_on_discard_support(struct drbd_device *device,
struct request_queue *q,
struct request_queue *b,
bool discard_zeroes_if_aligned)
{
/* q = drbd device queue (device->rq_queue)
* b = backing device queue (device->ldev->backing_bdev->bd_disk->queue),
* or NULL if diskless
*/
struct drbd_connection *connection = first_peer_device(device)->connection;
bool can_do = b ? blk_queue_discard(b) : true;
if (can_do && b && !b->limits.discard_zeroes_data && !discard_zeroes_if_aligned) {
can_do = false;
drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n");
}
if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) {
can_do = false;
drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n");
}
if (can_do) {
/* We don't care for the granularity, really.
* Stacking limits below should fix it for the local
* device. Whether or not it is a suitable granularity
* on the remote device is not our problem, really. If
* you care, you need to use devices with similar
* topology on all peers. */
blk_queue_discard_granularity(q, 512);
q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
} else {
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
blk_queue_discard_granularity(q, 0);
q->limits.max_discard_sectors = 0;
}
}
static void fixup_discard_if_not_supported(struct request_queue *q)
{
/* To avoid confusion, if this queue does not support discard, clear
* max_discard_sectors, which is what lsblk -D reports to the user.
* Older kernels got this wrong in "stack limits".
* */
if (!blk_queue_discard(q)) {
blk_queue_max_discard_sectors(q, 0);
blk_queue_discard_granularity(q, 0);
}
}
static void decide_on_write_same_support(struct drbd_device *device,
struct request_queue *q,
struct request_queue *b, struct o_qlim *o)
{
struct drbd_peer_device *peer_device = first_peer_device(device);
struct drbd_connection *connection = peer_device->connection;
bool can_do = b ? b->limits.max_write_same_sectors : true;
if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_WSAME)) {
can_do = false;
drbd_info(peer_device, "peer does not support WRITE_SAME\n");
}
if (o) {
/* logical block size; queue_logical_block_size(NULL) is 512 */
unsigned int peer_lbs = be32_to_cpu(o->logical_block_size);
unsigned int me_lbs_b = queue_logical_block_size(b);
unsigned int me_lbs = queue_logical_block_size(q);
if (me_lbs_b != me_lbs) {
drbd_warn(device,
"logical block size of local backend does not match (drbd:%u, backend:%u); was this a late attach?\n",
me_lbs, me_lbs_b);
/* rather disable write same than trigger some BUG_ON later in the scsi layer. */
can_do = false;
}
if (me_lbs_b != peer_lbs) {
drbd_warn(peer_device, "logical block sizes do not match (me:%u, peer:%u); this may cause problems.\n",
me_lbs, peer_lbs);
if (can_do) {
drbd_dbg(peer_device, "logical block size mismatch: WRITE_SAME disabled.\n");
can_do = false;
}
me_lbs = max(me_lbs, me_lbs_b);
/* We cannot change the logical block size of an in-use queue.
* We can only hope that access happens to be properly aligned.
* If not, the peer will likely produce an IO error, and detach. */
if (peer_lbs > me_lbs) {
if (device->state.role != R_PRIMARY) {
blk_queue_logical_block_size(q, peer_lbs);
drbd_warn(peer_device, "logical block size set to %u\n", peer_lbs);
} else {
drbd_warn(peer_device,
"current Primary must NOT adjust logical block size (%u -> %u); hope for the best.\n",
me_lbs, peer_lbs);
}
}
}
if (can_do && !o->write_same_capable) {
/* If we introduce an open-coded write-same loop on the receiving side,
* the peer would present itself as "capable". */
drbd_dbg(peer_device, "WRITE_SAME disabled (peer device not capable)\n");
can_do = false;
}
}
blk_queue_max_write_same_sectors(q, can_do ? DRBD_MAX_BBIO_SECTORS : 0);
}
static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
unsigned int max_bio_size)
unsigned int max_bio_size, struct o_qlim *o)
{
struct request_queue * const q = device->rq_queue;
unsigned int max_hw_sectors = max_bio_size >> 9;
unsigned int max_segments = 0;
struct request_queue *b = NULL;
struct disk_conf *dc;
bool discard_zeroes_if_aligned = true;
if (bdev) {
b = bdev->backing_bdev->bd_disk->queue;
max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
rcu_read_lock();
max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs;
dc = rcu_dereference(device->ldev->disk_conf);
max_segments = dc->max_bio_bvecs;
discard_zeroes_if_aligned = dc->discard_zeroes_if_aligned;
rcu_read_unlock();
blk_set_stacking_limits(&q->limits);
blk_queue_max_write_same_sectors(q, 0);
}
blk_queue_logical_block_size(q, 512);
blk_queue_max_hw_sectors(q, max_hw_sectors);
/* This is the workaround for "bio would need to, but cannot, be split" */
blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
blk_queue_segment_boundary(q, PAGE_SIZE-1);
decide_on_discard_support(device, q, b, discard_zeroes_if_aligned);
decide_on_write_same_support(device, q, b, o);
if (b) {
struct drbd_connection *connection = first_peer_device(device)->connection;
blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS);
if (blk_queue_discard(b) &&
(connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
/* We don't care, stacking below should fix it for the local device.
* Whether or not it is a suitable granularity on the remote device
* is not our problem, really. If you care, you need to
* use devices with similar topology on all peers. */
q->limits.discard_granularity = 512;
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
} else {
blk_queue_max_discard_sectors(q, 0);
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
q->limits.discard_granularity = 0;
}
blk_queue_stack_limits(q, b);
if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
@ -1208,15 +1335,10 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
}
}
/* To avoid confusion, if this queue does not support discard, clear
* max_discard_sectors, which is what lsblk -D reports to the user. */
if (!blk_queue_discard(q)) {
blk_queue_max_discard_sectors(q, 0);
q->limits.discard_granularity = 0;
}
fixup_discard_if_not_supported(q);
}
void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev)
void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o)
{
unsigned int now, new, local, peer;
@ -1259,7 +1381,7 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backin
if (new != now)
drbd_info(device, "max BIO size = %u\n", new);
drbd_setup_queue_param(device, bdev, new);
drbd_setup_queue_param(device, bdev, new, o);
}
/* Starts the worker thread */
@ -1348,6 +1470,43 @@ static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b)
a->disk_drain != b->disk_drain;
}
static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *disk_conf,
struct drbd_backing_dev *nbc)
{
struct request_queue * const q = nbc->backing_bdev->bd_disk->queue;
if (disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
if (disk_conf->al_extents > drbd_al_extents_max(nbc))
disk_conf->al_extents = drbd_al_extents_max(nbc);
if (!blk_queue_discard(q)
|| (!q->limits.discard_zeroes_data && !disk_conf->discard_zeroes_if_aligned)) {
if (disk_conf->rs_discard_granularity) {
disk_conf->rs_discard_granularity = 0; /* disable feature */
drbd_info(device, "rs_discard_granularity feature disabled\n");
}
}
if (disk_conf->rs_discard_granularity) {
int orig_value = disk_conf->rs_discard_granularity;
int remainder;
if (q->limits.discard_granularity > disk_conf->rs_discard_granularity)
disk_conf->rs_discard_granularity = q->limits.discard_granularity;
remainder = disk_conf->rs_discard_granularity % q->limits.discard_granularity;
disk_conf->rs_discard_granularity += remainder;
if (disk_conf->rs_discard_granularity > q->limits.max_discard_sectors << 9)
disk_conf->rs_discard_granularity = q->limits.max_discard_sectors << 9;
if (disk_conf->rs_discard_granularity != orig_value)
drbd_info(device, "rs_discard_granularity changed to %d\n",
disk_conf->rs_discard_granularity);
}
}
int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
{
struct drbd_config_context adm_ctx;
@ -1395,10 +1554,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
if (!expect(new_disk_conf->resync_rate >= 1))
new_disk_conf->resync_rate = 1;
if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev))
new_disk_conf->al_extents = drbd_al_extents_max(device->ldev);
sanitize_disk_conf(device, new_disk_conf, device->ldev);
if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
@ -1457,6 +1613,9 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
if (write_ordering_changed(old_disk_conf, new_disk_conf))
drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH);
if (old_disk_conf->discard_zeroes_if_aligned != new_disk_conf->discard_zeroes_if_aligned)
drbd_reconsider_queue_parameters(device, device->ldev, NULL);
drbd_md_sync(device);
if (device->state.conn >= C_CONNECTED) {
@ -1693,10 +1852,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
if (retcode != NO_ERROR)
goto fail;
if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
new_disk_conf->al_extents = drbd_al_extents_max(nbc);
sanitize_disk_conf(device, new_disk_conf, nbc);
if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
drbd_err(device, "max capacity %llu smaller than disk size %llu\n",
@ -1838,7 +1994,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
device->read_cnt = 0;
device->writ_cnt = 0;
drbd_reconsider_max_bio_size(device, device->ldev);
drbd_reconsider_queue_parameters(device, device->ldev, NULL);
/* If I am currently not R_PRIMARY,
* but meta data primary indicator is set,

View File

@ -25,7 +25,7 @@
#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
@ -122,18 +122,18 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
x = res/50;
y = 20-x;
seq_printf(seq, "\t[");
seq_puts(seq, "\t[");
for (i = 1; i < x; i++)
seq_printf(seq, "=");
seq_printf(seq, ">");
seq_putc(seq, '=');
seq_putc(seq, '>');
for (i = 0; i < y; i++)
seq_printf(seq, ".");
seq_printf(seq, "] ");
seq_puts(seq, "] ");
if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
seq_printf(seq, "verified:");
seq_puts(seq, "verified:");
else
seq_printf(seq, "sync'ed:");
seq_puts(seq, "sync'ed:");
seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
/* if more than a few GB, display in MB */
@ -146,7 +146,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
(unsigned long) Bit2KB(rs_left),
(unsigned long) Bit2KB(rs_total));
seq_printf(seq, "\n\t");
seq_puts(seq, "\n\t");
/* see drivers/md/md.c
* We do not want to overflow, so the order of operands and
@ -175,9 +175,9 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
rt / 3600, (rt % 3600) / 60, rt % 60);
dbdt = Bit2KB(db/dt);
seq_printf(seq, " speed: ");
seq_puts(seq, " speed: ");
seq_printf_with_thousands_grouping(seq, dbdt);
seq_printf(seq, " (");
seq_puts(seq, " (");
/* ------------------------- ~3s average ------------------------ */
if (proc_details >= 1) {
/* this is what drbd_rs_should_slow_down() uses */
@ -188,7 +188,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
db = device->rs_mark_left[i] - rs_left;
dbdt = Bit2KB(db/dt);
seq_printf_with_thousands_grouping(seq, dbdt);
seq_printf(seq, " -- ");
seq_puts(seq, " -- ");
}
/* --------------------- long term average ---------------------- */
@ -200,11 +200,11 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
db = rs_total - rs_left;
dbdt = Bit2KB(db/dt);
seq_printf_with_thousands_grouping(seq, dbdt);
seq_printf(seq, ")");
seq_putc(seq, ')');
if (state.conn == C_SYNC_TARGET ||
state.conn == C_VERIFY_S) {
seq_printf(seq, " want: ");
seq_puts(seq, " want: ");
seq_printf_with_thousands_grouping(seq, device->c_sync_rate);
}
seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
@ -231,7 +231,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
(unsigned long long)bm_bits * BM_SECT_PER_BIT);
if (stop_sector != 0 && stop_sector != ULLONG_MAX)
seq_printf(seq, " stop sector: %llu", stop_sector);
seq_printf(seq, "\n");
seq_putc(seq, '\n');
}
}
@ -276,7 +276,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
rcu_read_lock();
idr_for_each_entry(&drbd_devices, device, i) {
if (prev_i != i - 1)
seq_printf(seq, "\n");
seq_putc(seq, '\n');
prev_i = i;
state = device->state;

View File

@ -60,6 +60,15 @@ enum drbd_packet {
* which is why I chose TRIM here, to disambiguate. */
P_TRIM = 0x31,
/* Only use these two if both support FF_THIN_RESYNC */
P_RS_THIN_REQ = 0x32, /* Request a block for resync or reply P_RS_DEALLOCATED */
P_RS_DEALLOCATED = 0x33, /* Contains only zeros on sync source node */
/* REQ_WRITE_SAME.
* On a receiving side without REQ_WRITE_SAME,
* we may fall back to an opencoded loop instead. */
P_WSAME = 0x34,
P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
P_MAX_OPT_CMD = 0x101,
@ -106,8 +115,11 @@ struct p_header100 {
u32 pad;
} __packed;
/* these defines must not be changed without changing the protocol version */
#define DP_HARDBARRIER 1 /* depricated */
/* These defines must not be changed without changing the protocol version.
* New defines may only be introduced together with protocol version bump or
* new protocol feature flags.
*/
#define DP_HARDBARRIER 1 /* no longer used */
#define DP_RW_SYNC 2 /* equals REQ_SYNC */
#define DP_MAY_SET_IN_SYNC 4
#define DP_UNPLUG 8 /* not used anymore */
@ -116,6 +128,7 @@ struct p_header100 {
#define DP_DISCARD 64 /* equals REQ_DISCARD */
#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */
#define DP_WSAME 512 /* equiv. REQ_WRITE_SAME */
struct p_data {
u64 sector; /* 64 bits sector number */
@ -129,6 +142,11 @@ struct p_trim {
u32 size; /* == bio->bi_size */
} __packed;
struct p_wsame {
struct p_data p_data;
u32 size; /* == bio->bi_size */
} __packed;
/*
* commands which share a struct:
* p_block_ack:
@ -160,7 +178,23 @@ struct p_block_req {
* ReportParams
*/
#define FF_TRIM 1
/* supports TRIM/DISCARD on the "wire" protocol */
#define DRBD_FF_TRIM 1
/* Detect all-zeros during resync, and rather TRIM/UNMAP/DISCARD those blocks
* instead of fully allocate a supposedly thin volume on initial resync */
#define DRBD_FF_THIN_RESYNC 2
/* supports REQ_WRITE_SAME on the "wire" protocol.
* Note: this flag is overloaded,
* its presence also
* - indicates support for 128 MiB "batch bios",
* max discard size of 128 MiB
* instead of 4M before that.
* - indicates that we exchange additional settings in p_sizes
* drbd_send_sizes()/receive_sizes()
*/
#define DRBD_FF_WSAME 4
struct p_connection_features {
u32 protocol_min;
@ -235,6 +269,40 @@ struct p_rs_uuid {
u64 uuid;
} __packed;
/* optional queue_limits if (agreed_features & DRBD_FF_WSAME)
* see also struct queue_limits, as of late 2015 */
struct o_qlim {
/* we don't need it yet, but we may as well communicate it now */
u32 physical_block_size;
/* so the original in struct queue_limits is unsigned short,
* but I'd have to put in padding anyways. */
u32 logical_block_size;
/* One incoming bio becomes one DRBD request,
* which may be translated to several bio on the receiving side.
* We don't need to communicate chunk/boundary/segment ... limits.
*/
/* various IO hints may be useful with "diskless client" setups */
u32 alignment_offset;
u32 io_min;
u32 io_opt;
/* We may need to communicate integrity stuff at some point,
* but let's not get ahead of ourselves. */
/* Backend discard capabilities.
* Receiving side uses "blkdev_issue_discard()", no need to communicate
* more specifics. If the backend cannot do discards, the DRBD peer
* may fall back to blkdev_issue_zeroout().
*/
u8 discard_enabled;
u8 discard_zeroes_data;
u8 write_same_capable;
u8 _pad;
} __packed;
struct p_sizes {
u64 d_size; /* size of disk */
u64 u_size; /* user requested size */
@ -242,6 +310,9 @@ struct p_sizes {
u32 max_bio_size; /* Maximal size of a BIO */
u16 queue_order_type; /* not yet implemented in DRBD*/
u16 dds_flags; /* use enum dds_flags here. */
/* optional queue_limits if (agreed_features & DRBD_FF_WSAME) */
struct o_qlim qlim[0];
} __packed;
struct p_state {

View File

@ -25,7 +25,7 @@
#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/uaccess.h>
#include <net/sock.h>
#include <linux/drbd.h>
@ -48,7 +48,7 @@
#include "drbd_req.h"
#include "drbd_vli.h"
#define PRO_FEATURES (FF_TRIM)
#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME)
struct packet_info {
enum drbd_packet cmd;
@ -361,14 +361,17 @@ You must not have the req_lock:
drbd_wait_ee_list_empty()
*/
/* normal: payload_size == request size (bi_size)
* w_same: payload_size == logical_block_size
* trim: payload_size == 0 */
struct drbd_peer_request *
drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local)
unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local)
{
struct drbd_device *device = peer_device->device;
struct drbd_peer_request *peer_req;
struct page *page = NULL;
unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT;
if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
return NULL;
@ -380,7 +383,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
return NULL;
}
if (has_payload && data_size) {
if (nr_pages) {
page = drbd_alloc_pages(peer_device, nr_pages,
gfpflags_allow_blocking(gfp_mask));
if (!page)
@ -390,7 +393,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
memset(peer_req, 0, sizeof(*peer_req));
INIT_LIST_HEAD(&peer_req->w.list);
drbd_clear_interval(&peer_req->i);
peer_req->i.size = data_size;
peer_req->i.size = request_size;
peer_req->i.sector = sector;
peer_req->submit_jif = jiffies;
peer_req->peer_device = peer_device;
@ -1204,13 +1207,84 @@ static int drbd_recv_header(struct drbd_connection *connection, struct packet_in
return err;
}
/* This is blkdev_issue_flush, but asynchronous.
* We want to submit to all component volumes in parallel,
* then wait for all completions.
*/
struct issue_flush_context {
atomic_t pending;
int error;
struct completion done;
};
struct one_flush_context {
struct drbd_device *device;
struct issue_flush_context *ctx;
};
void one_flush_endio(struct bio *bio)
{
struct one_flush_context *octx = bio->bi_private;
struct drbd_device *device = octx->device;
struct issue_flush_context *ctx = octx->ctx;
if (bio->bi_error) {
ctx->error = bio->bi_error;
drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_error);
}
kfree(octx);
bio_put(bio);
clear_bit(FLUSH_PENDING, &device->flags);
put_ldev(device);
kref_put(&device->kref, drbd_destroy_device);
if (atomic_dec_and_test(&ctx->pending))
complete(&ctx->done);
}
static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
{
struct bio *bio = bio_alloc(GFP_NOIO, 0);
struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO);
if (!bio || !octx) {
drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n");
/* FIXME: what else can I do now? disconnecting or detaching
* really does not help to improve the state of the world, either.
*/
kfree(octx);
if (bio)
bio_put(bio);
ctx->error = -ENOMEM;
put_ldev(device);
kref_put(&device->kref, drbd_destroy_device);
return;
}
octx->device = device;
octx->ctx = ctx;
bio->bi_bdev = device->ldev->backing_bdev;
bio->bi_private = octx;
bio->bi_end_io = one_flush_endio;
bio_set_op_attrs(bio, REQ_OP_FLUSH, WRITE_FLUSH);
device->flush_jif = jiffies;
set_bit(FLUSH_PENDING, &device->flags);
atomic_inc(&ctx->pending);
submit_bio(bio);
}
static void drbd_flush(struct drbd_connection *connection)
{
int rv;
struct drbd_peer_device *peer_device;
int vnr;
if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
struct drbd_peer_device *peer_device;
struct issue_flush_context ctx;
int vnr;
atomic_set(&ctx.pending, 1);
ctx.error = 0;
init_completion(&ctx.done);
rcu_read_lock();
idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
struct drbd_device *device = peer_device->device;
@ -1220,31 +1294,24 @@ static void drbd_flush(struct drbd_connection *connection)
kref_get(&device->kref);
rcu_read_unlock();
/* Right now, we have only this one synchronous code path
* for flushes between request epochs.
* We may want to make those asynchronous,
* or at least parallelize the flushes to the volume devices.
*/
device->flush_jif = jiffies;
set_bit(FLUSH_PENDING, &device->flags);
rv = blkdev_issue_flush(device->ldev->backing_bdev,
GFP_NOIO, NULL);
clear_bit(FLUSH_PENDING, &device->flags);
if (rv) {
drbd_info(device, "local disk flush failed with status %d\n", rv);
/* would rather check on EOPNOTSUPP, but that is not reliable.
* don't try again for ANY return value != 0
* if (rv == -EOPNOTSUPP) */
drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
}
put_ldev(device);
kref_put(&device->kref, drbd_destroy_device);
submit_one_flush(device, &ctx);
rcu_read_lock();
if (rv)
break;
}
rcu_read_unlock();
/* Do we want to add a timeout,
* if disk-timeout is set? */
if (!atomic_dec_and_test(&ctx.pending))
wait_for_completion(&ctx.done);
if (ctx.error) {
/* would rather check on EOPNOTSUPP, but that is not reliable.
* don't try again for ANY return value != 0
* if (rv == -EOPNOTSUPP) */
/* Any error is already reported by bio_endio callback. */
drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
}
}
}
@ -1379,6 +1446,120 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
}
/*
* We *may* ignore the discard-zeroes-data setting, if so configured.
*
* Assumption is that it "discard_zeroes_data=0" is only because the backend
* may ignore partial unaligned discards.
*
* LVM/DM thin as of at least
* LVM version: 2.02.115(2)-RHEL7 (2015-01-28)
* Library version: 1.02.93-RHEL7 (2015-01-28)
* Driver version: 4.29.0
* still behaves this way.
*
* For unaligned (wrt. alignment and granularity) or too small discards,
* we zero-out the initial (and/or) trailing unaligned partial chunks,
* but discard all the aligned full chunks.
*
* At least for LVM/DM thin, the result is effectively "discard_zeroes_data=1".
*/
int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, bool discard)
{
struct block_device *bdev = device->ldev->backing_bdev;
struct request_queue *q = bdev_get_queue(bdev);
sector_t tmp, nr;
unsigned int max_discard_sectors, granularity;
int alignment;
int err = 0;
if (!discard)
goto zero_out;
/* Zero-sector (unknown) and one-sector granularities are the same. */
granularity = max(q->limits.discard_granularity >> 9, 1U);
alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22));
max_discard_sectors -= max_discard_sectors % granularity;
if (unlikely(!max_discard_sectors))
goto zero_out;
if (nr_sectors < granularity)
goto zero_out;
tmp = start;
if (sector_div(tmp, granularity) != alignment) {
if (nr_sectors < 2*granularity)
goto zero_out;
/* start + gran - (start + gran - align) % gran */
tmp = start + granularity - alignment;
tmp = start + granularity - sector_div(tmp, granularity);
nr = tmp - start;
err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
nr_sectors -= nr;
start = tmp;
}
while (nr_sectors >= granularity) {
nr = min_t(sector_t, nr_sectors, max_discard_sectors);
err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0);
nr_sectors -= nr;
start += nr;
}
zero_out:
if (nr_sectors) {
err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, 0);
}
return err != 0;
}
static bool can_do_reliable_discards(struct drbd_device *device)
{
struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
struct disk_conf *dc;
bool can_do;
if (!blk_queue_discard(q))
return false;
if (q->limits.discard_zeroes_data)
return true;
rcu_read_lock();
dc = rcu_dereference(device->ldev->disk_conf);
can_do = dc->discard_zeroes_if_aligned;
rcu_read_unlock();
return can_do;
}
static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
/* If the backend cannot discard, or does not guarantee
* read-back zeroes in discarded ranges, we fall back to
* zero-out. Unless configuration specifically requested
* otherwise. */
if (!can_do_reliable_discards(device))
peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector,
peer_req->i.size >> 9, !(peer_req->flags & EE_IS_TRIM_USE_ZEROOUT)))
peer_req->flags |= EE_WAS_ERROR;
drbd_endio_write_sec_final(peer_req);
}
static void drbd_issue_peer_wsame(struct drbd_device *device,
struct drbd_peer_request *peer_req)
{
struct block_device *bdev = device->ldev->backing_bdev;
sector_t s = peer_req->i.sector;
sector_t nr = peer_req->i.size >> 9;
if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages))
peer_req->flags |= EE_WAS_ERROR;
drbd_endio_write_sec_final(peer_req);
}
/**
* drbd_submit_peer_request()
* @device: DRBD device.
@ -1410,7 +1591,13 @@ int drbd_submit_peer_request(struct drbd_device *device,
unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
int err = -ENOMEM;
if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
/* TRIM/DISCARD: for now, always use the helper function
* blkdev_issue_zeroout(..., discard=true).
* It's synchronous, but it does the right thing wrt. bio splitting.
* Correctness first, performance later. Next step is to code an
* asynchronous variant of the same.
*/
if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) {
/* wait for all pending IO completions, before we start
* zeroing things out. */
conn_wait_active_ee_empty(peer_req->peer_device->connection);
@ -1418,22 +1605,22 @@ int drbd_submit_peer_request(struct drbd_device *device,
* so we can find it to present it in debugfs */
peer_req->submit_jif = jiffies;
peer_req->flags |= EE_SUBMITTED;
spin_lock_irq(&device->resource->req_lock);
list_add_tail(&peer_req->w.list, &device->active_ee);
spin_unlock_irq(&device->resource->req_lock);
if (blkdev_issue_zeroout(device->ldev->backing_bdev,
sector, data_size >> 9, GFP_NOIO, false))
peer_req->flags |= EE_WAS_ERROR;
drbd_endio_write_sec_final(peer_req);
/* If this was a resync request from receive_rs_deallocated(),
* it is already on the sync_ee list */
if (list_empty(&peer_req->w.list)) {
spin_lock_irq(&device->resource->req_lock);
list_add_tail(&peer_req->w.list, &device->active_ee);
spin_unlock_irq(&device->resource->req_lock);
}
if (peer_req->flags & EE_IS_TRIM)
drbd_issue_peer_discard(device, peer_req);
else /* EE_WRITE_SAME */
drbd_issue_peer_wsame(device, peer_req);
return 0;
}
/* Discards don't have any payload.
* But the scsi layer still expects a bio_vec it can use internally,
* see sd_setup_discard_cmnd() and blk_add_request_payload(). */
if (peer_req->flags & EE_IS_TRIM)
nr_pages = 1;
/* In most cases, we will only need one bio. But in case the lower
* level restrictions happen to be different at this offset on this
* side than those of the sending peer, we may need to submit the
@ -1459,11 +1646,6 @@ next_bio:
bios = bio;
++n_bios;
if (op == REQ_OP_DISCARD) {
bio->bi_iter.bi_size = data_size;
goto submit;
}
page_chain_for_each(page) {
unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
if (!bio_add_page(bio, page, len, 0)) {
@ -1485,7 +1667,6 @@ next_bio:
--nr_pages;
}
D_ASSERT(device, data_size == 0);
submit:
D_ASSERT(device, page == NULL);
atomic_set(&peer_req->pending_bios, n_bios);
@ -1609,8 +1790,26 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
return 0;
}
/* quick wrapper in case payload size != request_size (write same) */
static void drbd_csum_ee_size(struct crypto_ahash *h,
struct drbd_peer_request *r, void *d,
unsigned int payload_size)
{
unsigned int tmp = r->i.size;
r->i.size = payload_size;
drbd_csum_ee(h, r, d);
r->i.size = tmp;
}
/* used from receive_RSDataReply (recv_resync_read)
* and from receive_Data */
* and from receive_Data.
* data_size: actual payload ("data in")
* for normal writes that is bi_size.
* for discards, that is zero.
* for write same, it is logical_block_size.
* both trim and write same have the bi_size ("data len to be affected")
* as extra argument in the packet header.
*/
static struct drbd_peer_request *
read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
struct packet_info *pi) __must_hold(local)
@ -1625,6 +1824,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
void *dig_vv = peer_device->connection->int_dig_vv;
unsigned long *data;
struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL;
digest_size = 0;
if (!trim && peer_device->connection->peer_integrity_tfm) {
@ -1639,38 +1839,60 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
data_size -= digest_size;
}
/* assume request_size == data_size, but special case trim and wsame. */
ds = data_size;
if (trim) {
D_ASSERT(peer_device, data_size == 0);
data_size = be32_to_cpu(trim->size);
if (!expect(data_size == 0))
return NULL;
ds = be32_to_cpu(trim->size);
} else if (wsame) {
if (data_size != queue_logical_block_size(device->rq_queue)) {
drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n",
data_size, queue_logical_block_size(device->rq_queue));
return NULL;
}
if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) {
drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n",
data_size, bdev_logical_block_size(device->ldev->backing_bdev));
return NULL;
}
ds = be32_to_cpu(wsame->size);
}
if (!expect(IS_ALIGNED(data_size, 512)))
if (!expect(IS_ALIGNED(ds, 512)))
return NULL;
/* prepare for larger trim requests. */
if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE))
if (trim || wsame) {
if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
return NULL;
} else if (!expect(ds <= DRBD_MAX_BIO_SIZE))
return NULL;
/* even though we trust out peer,
* we sometimes have to double check. */
if (sector + (data_size>>9) > capacity) {
if (sector + (ds>>9) > capacity) {
drbd_err(device, "request from peer beyond end of local disk: "
"capacity: %llus < sector: %llus + size: %u\n",
(unsigned long long)capacity,
(unsigned long long)sector, data_size);
(unsigned long long)sector, ds);
return NULL;
}
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO);
peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO);
if (!peer_req)
return NULL;
peer_req->flags |= EE_WRITE;
if (trim)
if (trim) {
peer_req->flags |= EE_IS_TRIM;
return peer_req;
}
if (wsame)
peer_req->flags |= EE_WRITE_SAME;
/* receive payload size bytes into page chain */
ds = data_size;
page = peer_req->pages;
page_chain_for_each(page) {
@ -1690,7 +1912,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
}
if (digest_size) {
drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv);
drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size);
if (memcmp(dig_in, dig_vv, digest_size)) {
drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
(unsigned long long)sector, data_size);
@ -2067,13 +2289,13 @@ static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
struct drbd_peer_request *rs_req;
bool rv = 0;
bool rv = false;
spin_lock_irq(&device->resource->req_lock);
list_for_each_entry(rs_req, &device->sync_ee, w.list) {
if (overlaps(peer_req->i.sector, peer_req->i.size,
rs_req->i.sector, rs_req->i.size)) {
rv = 1;
rv = true;
break;
}
}
@ -2354,10 +2576,6 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
op = wire_flags_to_bio_op(dp_flags);
op_flags = wire_flags_to_bio_flags(dp_flags);
if (pi->cmd == P_TRIM) {
struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
peer_req->flags |= EE_IS_TRIM;
if (!blk_queue_discard(q))
peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
D_ASSERT(peer_device, peer_req->i.size > 0);
D_ASSERT(peer_device, op == REQ_OP_DISCARD);
D_ASSERT(peer_device, peer_req->pages == NULL);
@ -2424,11 +2642,11 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
update_peer_seq(peer_device, peer_seq);
spin_lock_irq(&device->resource->req_lock);
}
/* if we use the zeroout fallback code, we process synchronously
* and we wait for all pending requests, respectively wait for
/* TRIM and WRITE_SAME are processed synchronously,
* we wait for all pending requests, respectively wait for
* active_ee to become empty in drbd_submit_peer_request();
* better not add ourselves here. */
if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0)
list_add_tail(&peer_req->w.list, &device->active_ee);
spin_unlock_irq(&device->resource->req_lock);
@ -2460,7 +2678,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
}
out_interrupted:
drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP);
put_ldev(device);
drbd_free_peer_req(device, peer_req);
return err;
@ -2585,6 +2803,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
case P_DATA_REQUEST:
drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
break;
case P_RS_THIN_REQ:
case P_RS_DATA_REQUEST:
case P_CSUM_RS_REQUEST:
case P_OV_REQUEST:
@ -2610,7 +2829,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
true /* has real payload */, GFP_NOIO);
size, GFP_NOIO);
if (!peer_req) {
put_ldev(device);
return -ENOMEM;
@ -2624,6 +2843,12 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
peer_req->flags |= EE_APPLICATION;
goto submit;
case P_RS_THIN_REQ:
/* If at some point in the future we have a smart way to
find out if this data block is completely deallocated,
then we would do something smarter here than reading
the block... */
peer_req->flags |= EE_RS_THIN_REQ;
case P_RS_DATA_REQUEST:
peer_req->w.cb = w_e_end_rsdata_req;
fault_type = DRBD_FAULT_RS_RD;
@ -2969,7 +3194,8 @@ static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
-1091 requires proto 91
-1096 requires proto 96
*/
static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local)
static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local)
{
struct drbd_peer_device *const peer_device = first_peer_device(device);
struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
@ -3049,8 +3275,39 @@ static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __m
* next bit (weight 2) is set when peer was primary */
*rule_nr = 40;
/* Neither has the "crashed primary" flag set,
* only a replication link hickup. */
if (rct == 0)
return 0;
/* Current UUID equal and no bitmap uuid; does not necessarily
* mean this was a "simultaneous hard crash", maybe IO was
* frozen, so no UUID-bump happened.
* This is a protocol change, overload DRBD_FF_WSAME as flag
* for "new-enough" peer DRBD version. */
if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) {
*rule_nr = 41;
if (!(connection->agreed_features & DRBD_FF_WSAME)) {
drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n");
return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8));
}
if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) {
/* At least one has the "crashed primary" bit set,
* both are primary now, but neither has rotated its UUIDs?
* "Can not happen." */
drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n");
return -100;
}
if (device->state.role == R_PRIMARY)
return 1;
return -1;
}
/* Both are secondary.
* Really looks like recovery from simultaneous hard crash.
* Check which had been primary before, and arbitrate. */
switch (rct) {
case 0: /* !self_pri && !peer_pri */ return 0;
case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */
case 1: /* self_pri && !peer_pri */ return 1;
case 2: /* !self_pri && peer_pri */ return -1;
case 3: /* self_pri && peer_pri */
@ -3177,7 +3434,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
drbd_uuid_dump(device, "peer", device->p_uuid,
device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
hg = drbd_uuid_compare(device, &rule_nr);
hg = drbd_uuid_compare(device, peer_role, &rule_nr);
spin_unlock_irq(&device->ldev->md.uuid_lock);
drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
@ -3186,6 +3443,15 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
drbd_alert(device, "Unrelated data, aborting!\n");
return C_MASK;
}
if (hg < -0x10000) {
int proto, fflags;
hg = -hg;
proto = hg & 0xff;
fflags = (hg >> 8) & 0xff;
drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n",
proto, fflags);
return C_MASK;
}
if (hg < -1000) {
drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
return C_MASK;
@ -3415,7 +3681,8 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in
*/
peer_integrity_tfm = crypto_alloc_ahash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
if (!peer_integrity_tfm) {
if (IS_ERR(peer_integrity_tfm)) {
peer_integrity_tfm = NULL;
drbd_err(connection, "peer data-integrity-alg %s not supported\n",
integrity_alg);
goto disconnect;
@ -3766,6 +4033,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
struct drbd_peer_device *peer_device;
struct drbd_device *device;
struct p_sizes *p = pi->data;
struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL;
enum determine_dev_size dd = DS_UNCHANGED;
sector_t p_size, p_usize, p_csize, my_usize;
int ldsc = 0; /* local disk size changed */
@ -3785,6 +4053,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
device->p_size = p_size;
if (get_ldev(device)) {
sector_t new_size, cur_size;
rcu_read_lock();
my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
rcu_read_unlock();
@ -3801,11 +4070,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
/* Never shrink a device with usable data during connect.
But allow online shrinking if we are connected. */
if (drbd_new_dev_size(device, device->ldev, p_usize, 0) <
drbd_get_capacity(device->this_bdev) &&
new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0);
cur_size = drbd_get_capacity(device->this_bdev);
if (new_size < cur_size &&
device->state.disk >= D_OUTDATED &&
device->state.conn < C_CONNECTED) {
drbd_err(device, "The peer's disk size is too small!\n");
drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n",
(unsigned long long)new_size, (unsigned long long)cur_size);
conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
put_ldev(device);
return -EIO;
@ -3839,14 +4110,14 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
}
device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
/* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
/* Leave drbd_reconsider_queue_parameters() before drbd_determine_dev_size().
In case we cleared the QUEUE_FLAG_DISCARD from our queue in
drbd_reconsider_max_bio_size(), we can be sure that after
drbd_reconsider_queue_parameters(), we can be sure that after
drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
ddsf = be16_to_cpu(p->dds_flags);
if (get_ldev(device)) {
drbd_reconsider_max_bio_size(device, device->ldev);
drbd_reconsider_queue_parameters(device, device->ldev, o);
dd = drbd_determine_dev_size(device, ddsf, NULL);
put_ldev(device);
if (dd == DS_ERROR)
@ -3866,7 +4137,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
* However, if he sends a zero current size,
* take his (user-capped or) backing disk size anyways.
*/
drbd_reconsider_max_bio_size(device, NULL);
drbd_reconsider_queue_parameters(device, NULL, o);
drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
}
@ -4599,9 +4870,75 @@ static int receive_out_of_sync(struct drbd_connection *connection, struct packet
return 0;
}
static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi)
{
struct drbd_peer_device *peer_device;
struct p_block_desc *p = pi->data;
struct drbd_device *device;
sector_t sector;
int size, err = 0;
peer_device = conn_peer_device(connection, pi->vnr);
if (!peer_device)
return -EIO;
device = peer_device->device;
sector = be64_to_cpu(p->sector);
size = be32_to_cpu(p->blksize);
dec_rs_pending(device);
if (get_ldev(device)) {
struct drbd_peer_request *peer_req;
const int op = REQ_OP_DISCARD;
peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
size, 0, GFP_NOIO);
if (!peer_req) {
put_ldev(device);
return -ENOMEM;
}
peer_req->w.cb = e_end_resync_block;
peer_req->submit_jif = jiffies;
peer_req->flags |= EE_IS_TRIM;
spin_lock_irq(&device->resource->req_lock);
list_add_tail(&peer_req->w.list, &device->sync_ee);
spin_unlock_irq(&device->resource->req_lock);
atomic_add(pi->size >> 9, &device->rs_sect_ev);
err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR);
if (err) {
spin_lock_irq(&device->resource->req_lock);
list_del(&peer_req->w.list);
spin_unlock_irq(&device->resource->req_lock);
drbd_free_peer_req(device, peer_req);
put_ldev(device);
err = 0;
goto fail;
}
inc_unacked(device);
/* No put_ldev() here. Gets called in drbd_endio_write_sec_final(),
as well as drbd_rs_complete_io() */
} else {
fail:
drbd_rs_complete_io(device, sector);
drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER);
}
atomic_add(size >> 9, &device->rs_sect_in);
return err;
}
struct data_cmd {
int expect_payload;
size_t pkt_size;
unsigned int pkt_size;
int (*fn)(struct drbd_connection *, struct packet_info *);
};
@ -4626,11 +4963,14 @@ static struct data_cmd drbd_cmd_handler[] = {
[P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
[P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
[P_RS_THIN_REQ] = { 0, sizeof(struct p_block_req), receive_DataRequest },
[P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
[P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
[P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
[P_TRIM] = { 0, sizeof(struct p_trim), receive_Data },
[P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
[P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data },
};
static void drbdd(struct drbd_connection *connection)
@ -4640,7 +4980,7 @@ static void drbdd(struct drbd_connection *connection)
int err;
while (get_t_state(&connection->receiver) == RUNNING) {
struct data_cmd *cmd;
struct data_cmd const *cmd;
drbd_thread_current_set_cpu(&connection->receiver);
update_receiver_timing_details(connection, drbd_recv_header);
@ -4655,11 +4995,18 @@ static void drbdd(struct drbd_connection *connection)
}
shs = cmd->pkt_size;
if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME)
shs += sizeof(struct o_qlim);
if (pi.size > shs && !cmd->expect_payload) {
drbd_err(connection, "No payload expected %s l:%d\n",
cmdname(pi.cmd), pi.size);
goto err_out;
}
if (pi.size < shs) {
drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n",
cmdname(pi.cmd), (int)shs, pi.size);
goto err_out;
}
if (shs) {
update_receiver_timing_details(connection, drbd_recv_all_warn);
@ -4795,9 +5142,11 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device)
drbd_md_sync(device);
/* serialize with bitmap writeout triggered by the state change,
* if any. */
wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
if (get_ldev(device)) {
drbd_bitmap_io(device, &drbd_bm_write_copy_pages,
"write from disconnected", BM_LOCKED_CHANGE_ALLOWED);
put_ldev(device);
}
/* tcp_close and release of sendpage pages can be deferred. I don't
* want to use SO_LINGER, because apparently it can be deferred for
@ -4904,8 +5253,12 @@ static int drbd_do_features(struct drbd_connection *connection)
drbd_info(connection, "Handshake successful: "
"Agreed network protocol version %d\n", connection->agreed_pro_version);
drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n",
connection->agreed_features & FF_TRIM ? " " : " not ");
drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n",
connection->agreed_features,
connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "",
connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "",
connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" :
connection->agreed_features ? "" : " none");
return 1;

View File

@ -47,8 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r
&device->vdisk->part0, req->start_jif);
}
static struct drbd_request *drbd_req_new(struct drbd_device *device,
struct bio *bio_src)
static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio *bio_src)
{
struct drbd_request *req;
@ -58,10 +57,12 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
memset(req, 0, sizeof(*req));
drbd_req_make_private_bio(req, bio_src);
req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
req->device = device;
req->master_bio = bio_src;
req->epoch = 0;
req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
| (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0)
| (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0);
req->device = device;
req->master_bio = bio_src;
req->epoch = 0;
drbd_clear_interval(&req->i);
req->i.sector = bio_src->bi_iter.bi_sector;
@ -218,7 +219,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
{
const unsigned s = req->rq_state;
struct drbd_device *device = req->device;
int rw;
int error, ok;
/* we must not complete the master bio, while it is
@ -242,8 +242,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
return;
}
rw = bio_rw(req->master_bio);
/*
* figure out whether to report success or failure.
*
@ -267,7 +265,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
* epoch number. If they match, increase the current_tle_nr,
* and reset the transfer log epoch write_cnt.
*/
if (rw == WRITE &&
if (op_is_write(bio_op(req->master_bio)) &&
req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr))
start_new_tl_epoch(first_peer_device(device)->connection);
@ -284,11 +282,14 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
* because no path was available, in which case
* it was not even added to the transfer_log.
*
* READA may fail, and will not be retried.
* read-ahead may fail, and will not be retried.
*
* WRITE should have used all available paths already.
*/
if (!ok && rw == READ && !list_empty(&req->tl_requests))
if (!ok &&
bio_op(req->master_bio) == REQ_OP_READ &&
!(req->master_bio->bi_rw & REQ_RAHEAD) &&
!list_empty(&req->tl_requests))
req->rq_state |= RQ_POSTPONED;
if (!(req->rq_state & RQ_POSTPONED)) {
@ -644,7 +645,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
__drbd_chk_io_error(device, DRBD_READ_ERROR);
/* fall through. */
case READ_AHEAD_COMPLETED_WITH_ERROR:
/* it is legal to fail READA, no __drbd_chk_io_error in that case. */
/* it is legal to fail read-ahead, no __drbd_chk_io_error in that case. */
mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
break;
@ -656,7 +657,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
break;
case QUEUE_FOR_NET_READ:
/* READ or READA, and
/* READ, and
* no local disk,
* or target area marked as invalid,
* or just got an io-error. */
@ -977,16 +978,20 @@ static void complete_conflicting_writes(struct drbd_request *req)
sector_t sector = req->i.sector;
int size = req->i.size;
i = drbd_find_overlap(&device->write_requests, sector, size);
if (!i)
return;
for (;;) {
prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
i = drbd_find_overlap(&device->write_requests, sector, size);
if (!i)
drbd_for_each_overlap(i, &device->write_requests, sector, size) {
/* Ignore, if already completed to upper layers. */
if (i->completed)
continue;
/* Handle the first found overlap. After the schedule
* we have to restart the tree walk. */
break;
}
if (!i) /* if any */
break;
/* Indicate to wake up device->misc_wait on progress. */
prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
i->waiting = true;
spin_unlock_irq(&device->resource->req_lock);
schedule();
@ -995,7 +1000,7 @@ static void complete_conflicting_writes(struct drbd_request *req)
finish_wait(&device->misc_wait, &wait);
}
/* called within req_lock and rcu_read_lock() */
/* called within req_lock */
static void maybe_pull_ahead(struct drbd_device *device)
{
struct drbd_connection *connection = first_peer_device(device)->connection;
@ -1152,12 +1157,29 @@ static int drbd_process_write_request(struct drbd_request *req)
return remote;
}
static void drbd_process_discard_req(struct drbd_request *req)
{
int err = drbd_issue_discard_or_zero_out(req->device,
req->i.sector, req->i.size >> 9, true);
if (err)
req->private_bio->bi_error = -EIO;
bio_endio(req->private_bio);
}
static void
drbd_submit_req_private_bio(struct drbd_request *req)
{
struct drbd_device *device = req->device;
struct bio *bio = req->private_bio;
const int rw = bio_rw(bio);
unsigned int type;
if (bio_op(bio) != REQ_OP_READ)
type = DRBD_FAULT_DT_WR;
else if (bio->bi_rw & REQ_RAHEAD)
type = DRBD_FAULT_DT_RA;
else
type = DRBD_FAULT_DT_RD;
bio->bi_bdev = device->ldev->backing_bdev;
@ -1167,11 +1189,10 @@ drbd_submit_req_private_bio(struct drbd_request *req)
* stable storage, and this is a WRITE, we may not even submit
* this bio. */
if (get_ldev(device)) {
if (drbd_insert_fault(device,
rw == WRITE ? DRBD_FAULT_DT_WR
: rw == READ ? DRBD_FAULT_DT_RD
: DRBD_FAULT_DT_RA))
if (drbd_insert_fault(device, type))
bio_io_error(bio);
else if (bio_op(bio) == REQ_OP_DISCARD)
drbd_process_discard_req(req);
else
generic_make_request(bio);
put_ldev(device);
@ -1223,24 +1244,45 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
/* Update disk stats */
_drbd_start_io_acct(device, req);
/* process discards always from our submitter thread */
if (bio_op(bio) & REQ_OP_DISCARD)
goto queue_for_submitter_thread;
if (rw == WRITE && req->private_bio && req->i.size
&& !test_bit(AL_SUSPENDED, &device->flags)) {
if (!drbd_al_begin_io_fastpath(device, &req->i)) {
atomic_inc(&device->ap_actlog_cnt);
drbd_queue_write(device, req);
return NULL;
}
if (!drbd_al_begin_io_fastpath(device, &req->i))
goto queue_for_submitter_thread;
req->rq_state |= RQ_IN_ACT_LOG;
req->in_actlog_jif = jiffies;
}
return req;
queue_for_submitter_thread:
atomic_inc(&device->ap_actlog_cnt);
drbd_queue_write(device, req);
return NULL;
}
/* Require at least one path to current data.
* We don't want to allow writes on C_STANDALONE D_INCONSISTENT:
* We would not allow to read what was written,
* we would not have bumped the data generation uuids,
* we would cause data divergence for all the wrong reasons.
*
* If we don't see at least one D_UP_TO_DATE, we will fail this request,
* which either returns EIO, or, if OND_SUSPEND_IO is set, suspends IO,
* and queues for retry later.
*/
static bool may_do_writes(struct drbd_device *device)
{
const union drbd_dev_state s = device->state;
return s.disk == D_UP_TO_DATE || s.pdsk == D_UP_TO_DATE;
}
static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req)
{
struct drbd_resource *resource = device->resource;
const int rw = bio_rw(req->master_bio);
const int rw = bio_data_dir(req->master_bio);
struct bio_and_error m = { NULL, };
bool no_remote = false;
bool submit_private_bio = false;
@ -1270,7 +1312,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
goto out;
}
/* We fail READ/READA early, if we can not serve it.
/* We fail READ early, if we can not serve it.
* We must do this before req is registered on any lists.
* Otherwise, drbd_req_complete() will queue failed READ for retry. */
if (rw != WRITE) {
@ -1291,6 +1333,12 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
}
if (rw == WRITE) {
if (req->private_bio && !may_do_writes(device)) {
bio_put(req->private_bio);
req->private_bio = NULL;
put_ldev(device);
goto nodata;
}
if (!drbd_process_write_request(req))
no_remote = true;
} else {

View File

@ -206,6 +206,8 @@ enum drbd_req_state_bits {
/* Set when this is a write, clear for a read */
__RQ_WRITE,
__RQ_WSAME,
__RQ_UNMAP,
/* Should call drbd_al_complete_io() for this request... */
__RQ_IN_ACT_LOG,
@ -241,10 +243,11 @@ enum drbd_req_state_bits {
#define RQ_NET_OK (1UL << __RQ_NET_OK)
#define RQ_NET_SIS (1UL << __RQ_NET_SIS)
/* 0x1f8 */
#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
#define RQ_WRITE (1UL << __RQ_WRITE)
#define RQ_WSAME (1UL << __RQ_WSAME)
#define RQ_UNMAP (1UL << __RQ_UNMAP)
#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)

View File

@ -814,7 +814,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns)
}
if (rv <= 0)
/* already found a reason to abort */;
goto out; /* already found a reason to abort */
else if (ns.role == R_SECONDARY && device->open_cnt)
rv = SS_DEVICE_IN_USE;
@ -862,6 +862,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns)
else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
rv = SS_CONNECTED_OUTDATES;
out:
rcu_read_unlock();
return rv;
@ -906,6 +907,15 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_c
(ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS)))
rv = SS_IN_TRANSIENT_STATE;
/* Do not promote during resync handshake triggered by "force primary".
* This is a hack. It should really be rejected by the peer during the
* cluster wide state change request. */
if (os.role != R_PRIMARY && ns.role == R_PRIMARY
&& ns.pdsk == D_UP_TO_DATE
&& ns.disk != D_UP_TO_DATE && ns.disk != D_DISKLESS
&& (ns.conn <= C_WF_SYNC_UUID || ns.conn != os.conn))
rv = SS_IN_TRANSIENT_STATE;
if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
rv = SS_NEED_CONNECTION;
@ -1628,6 +1638,26 @@ static void broadcast_state_change(struct drbd_state_change *state_change)
#undef REMEMBER_STATE_CHANGE
}
/* takes old and new peer disk state */
static bool lost_contact_to_peer_data(enum drbd_disk_state os, enum drbd_disk_state ns)
{
if ((os >= D_INCONSISTENT && os != D_UNKNOWN && os != D_OUTDATED)
&& (ns < D_INCONSISTENT || ns == D_UNKNOWN || ns == D_OUTDATED))
return true;
/* Scenario, starting with normal operation
* Connected Primary/Secondary UpToDate/UpToDate
* NetworkFailure Primary/Unknown UpToDate/DUnknown (frozen)
* ...
* Connected Primary/Secondary UpToDate/Diskless (resumed; needs to bump uuid!)
*/
if (os == D_UNKNOWN
&& (ns == D_DISKLESS || ns == D_FAILED || ns == D_OUTDATED))
return true;
return false;
}
/**
* after_state_ch() - Perform after state change actions that may sleep
* @device: DRBD device.
@ -1675,7 +1705,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
what = RESEND;
if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
conn_lowest_disk(connection) > D_NEGOTIATING)
conn_lowest_disk(connection) == D_UP_TO_DATE)
what = RESTART_FROZEN_DISK_IO;
if (resource->susp_nod && what != NOTHING) {
@ -1699,6 +1729,13 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
clear_bit(NEW_CUR_UUID, &peer_device->device->flags);
rcu_read_unlock();
/* We should actively create a new uuid, _before_
* we resume/resent, if the peer is diskless
* (recovery from a multiple error scenario).
* Currently, this happens with a slight delay
* below when checking lost_contact_to_peer_data() ...
*/
_tl_restart(connection, RESEND);
_conn_request_state(connection,
(union drbd_state) { { .susp_fen = 1 } },
@ -1742,12 +1779,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
BM_LOCKED_TEST_ALLOWED);
/* Lost contact to peer's copy of the data */
if ((os.pdsk >= D_INCONSISTENT &&
os.pdsk != D_UNKNOWN &&
os.pdsk != D_OUTDATED)
&& (ns.pdsk < D_INCONSISTENT ||
ns.pdsk == D_UNKNOWN ||
ns.pdsk == D_OUTDATED)) {
if (lost_contact_to_peer_data(os.pdsk, ns.pdsk)) {
if (get_ldev(device)) {
if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
@ -1934,12 +1966,17 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
/* This triggers bitmap writeout of potentially still unwritten pages
* if the resync finished cleanly, or aborted because of peer disk
* failure, or because of connection loss.
* failure, or on transition from resync back to AHEAD/BEHIND.
*
* Connection loss is handled in drbd_disconnected() by the receiver.
*
* For resync aborted because of local disk failure, we cannot do
* any bitmap writeout anymore.
*
* No harm done if some bits change during this phase.
*/
if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(device)) {
if ((os.conn > C_CONNECTED && os.conn < C_AHEAD) &&
(ns.conn == C_CONNECTED || ns.conn >= C_AHEAD) && get_ldev(device)) {
drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL,
"write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
put_ldev(device);
@ -2160,9 +2197,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union
ns.disk = os.disk;
rv = _drbd_set_state(device, ns, flags, NULL);
if (rv < SS_SUCCESS)
BUG();
BUG_ON(rv < SS_SUCCESS);
ns.i = device->state.i;
ns_max.role = max_role(ns.role, ns_max.role);
ns_max.peer = max_role(ns.peer, ns_max.peer);

View File

@ -140,7 +140,7 @@ extern void drbd_resume_al(struct drbd_device *device);
extern bool conn_all_vols_unconf(struct drbd_connection *connection);
/**
* drbd_request_state() - Reqest a state change
* drbd_request_state() - Request a state change
* @device: DRBD device.
* @mask: mask of state bits to change.
* @val: value of new state bits.

View File

@ -26,7 +26,7 @@
#include <linux/drbd.h>
#include "drbd_strings.h"
static const char *drbd_conn_s_names[] = {
static const char * const drbd_conn_s_names[] = {
[C_STANDALONE] = "StandAlone",
[C_DISCONNECTING] = "Disconnecting",
[C_UNCONNECTED] = "Unconnected",
@ -53,13 +53,13 @@ static const char *drbd_conn_s_names[] = {
[C_BEHIND] = "Behind",
};
static const char *drbd_role_s_names[] = {
static const char * const drbd_role_s_names[] = {
[R_PRIMARY] = "Primary",
[R_SECONDARY] = "Secondary",
[R_UNKNOWN] = "Unknown"
};
static const char *drbd_disk_s_names[] = {
static const char * const drbd_disk_s_names[] = {
[D_DISKLESS] = "Diskless",
[D_ATTACHING] = "Attaching",
[D_FAILED] = "Failed",
@ -71,7 +71,7 @@ static const char *drbd_disk_s_names[] = {
[D_UP_TO_DATE] = "UpToDate",
};
static const char *drbd_state_sw_errors[] = {
static const char * const drbd_state_sw_errors[] = {
[-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
[-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data",
[-SS_NO_LOCAL_DISK] = "Can not resync without local disk",

View File

@ -173,8 +173,8 @@ void drbd_peer_request_endio(struct bio *bio)
{
struct drbd_peer_request *peer_req = bio->bi_private;
struct drbd_device *device = peer_req->peer_device->device;
int is_write = bio_data_dir(bio) == WRITE;
int is_discard = !!(bio_op(bio) == REQ_OP_DISCARD);
bool is_write = bio_data_dir(bio) == WRITE;
bool is_discard = !!(bio_op(bio) == REQ_OP_DISCARD);
if (bio->bi_error && __ratelimit(&drbd_ratelimit_state))
drbd_warn(device, "%s: error=%d s=%llus\n",
@ -248,18 +248,26 @@ void drbd_request_endio(struct bio *bio)
/* to avoid recursion in __req_mod */
if (unlikely(bio->bi_error)) {
if (bio_op(bio) == REQ_OP_DISCARD)
what = (bio->bi_error == -EOPNOTSUPP)
? DISCARD_COMPLETED_NOTSUPP
: DISCARD_COMPLETED_WITH_ERROR;
else
what = (bio_data_dir(bio) == WRITE)
? WRITE_COMPLETED_WITH_ERROR
: (bio_rw(bio) == READ)
? READ_COMPLETED_WITH_ERROR
: READ_AHEAD_COMPLETED_WITH_ERROR;
} else
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
if (bio->bi_error == -EOPNOTSUPP)
what = DISCARD_COMPLETED_NOTSUPP;
else
what = DISCARD_COMPLETED_WITH_ERROR;
break;
case REQ_OP_READ:
if (bio->bi_rw & REQ_RAHEAD)
what = READ_AHEAD_COMPLETED_WITH_ERROR;
else
what = READ_COMPLETED_WITH_ERROR;
break;
default:
what = WRITE_COMPLETED_WITH_ERROR;
break;
}
} else {
what = COMPLETED_OK;
}
bio_put(req->private_bio);
req->private_bio = ERR_PTR(bio->bi_error);
@ -320,6 +328,10 @@ void drbd_csum_bio(struct crypto_ahash *tfm, struct bio *bio, void *digest)
sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
ahash_request_set_crypt(req, &sg, NULL, sg.length);
crypto_ahash_update(req);
/* REQ_OP_WRITE_SAME has only one segment,
* checksum the payload only once. */
if (bio_op(bio) == REQ_OP_WRITE_SAME)
break;
}
ahash_request_set_crypt(req, NULL, digest, 0);
crypto_ahash_final(req);
@ -387,7 +399,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
size, true /* has real payload */, GFP_TRY);
size, size, GFP_TRY);
if (!peer_req)
goto defer;
@ -583,6 +595,7 @@ static int make_resync_request(struct drbd_device *const device, int cancel)
int number, rollback_i, size;
int align, requeue = 0;
int i = 0;
int discard_granularity = 0;
if (unlikely(cancel))
return 0;
@ -602,6 +615,12 @@ static int make_resync_request(struct drbd_device *const device, int cancel)
return 0;
}
if (connection->agreed_features & DRBD_FF_THIN_RESYNC) {
rcu_read_lock();
discard_granularity = rcu_dereference(device->ldev->disk_conf)->rs_discard_granularity;
rcu_read_unlock();
}
max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
number = drbd_rs_number_requests(device);
if (number <= 0)
@ -666,6 +685,9 @@ next_sector:
if (sector & ((1<<(align+3))-1))
break;
if (discard_granularity && size == discard_granularity)
break;
/* do not cross extent boundaries */
if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
break;
@ -712,7 +734,8 @@ next_sector:
int err;
inc_rs_pending(device);
err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST,
err = drbd_send_drequest(peer_device,
size == discard_granularity ? P_RS_THIN_REQ : P_RS_DATA_REQUEST,
sector, size, ID_SYNCER);
if (err) {
drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
@ -829,6 +852,7 @@ static void ping_peer(struct drbd_device *device)
int drbd_resync_finished(struct drbd_device *device)
{
struct drbd_connection *connection = first_peer_device(device)->connection;
unsigned long db, dt, dbdt;
unsigned long n_oos;
union drbd_state os, ns;
@ -850,8 +874,7 @@ int drbd_resync_finished(struct drbd_device *device)
if (dw) {
dw->w.cb = w_resync_finished;
dw->device = device;
drbd_queue_work(&first_peer_device(device)->connection->sender_work,
&dw->w);
drbd_queue_work(&connection->sender_work, &dw->w);
return 1;
}
drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n");
@ -964,6 +987,30 @@ int drbd_resync_finished(struct drbd_device *device)
_drbd_set_state(device, ns, CS_VERBOSE, NULL);
out_unlock:
spin_unlock_irq(&device->resource->req_lock);
/* If we have been sync source, and have an effective fencing-policy,
* once *all* volumes are back in sync, call "unfence". */
if (os.conn == C_SYNC_SOURCE) {
enum drbd_disk_state disk_state = D_MASK;
enum drbd_disk_state pdsk_state = D_MASK;
enum drbd_fencing_p fp = FP_DONT_CARE;
rcu_read_lock();
fp = rcu_dereference(device->ldev->disk_conf)->fencing;
if (fp != FP_DONT_CARE) {
struct drbd_peer_device *peer_device;
int vnr;
idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
struct drbd_device *device = peer_device->device;
disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk);
pdsk_state = min_t(enum drbd_disk_state, pdsk_state, device->state.pdsk);
}
}
rcu_read_unlock();
if (disk_state == D_UP_TO_DATE && pdsk_state == D_UP_TO_DATE)
conn_khelper(connection, "unfence-peer");
}
put_ldev(device);
out:
device->rs_total = 0;
@ -1000,7 +1047,6 @@ static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_
/**
* w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
* @device: DRBD device.
* @w: work object.
* @cancel: The connection will be closed anyways
*/
@ -1036,6 +1082,30 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
return err;
}
static bool all_zero(struct drbd_peer_request *peer_req)
{
struct page *page = peer_req->pages;
unsigned int len = peer_req->i.size;
page_chain_for_each(page) {
unsigned int l = min_t(unsigned int, len, PAGE_SIZE);
unsigned int i, words = l / sizeof(long);
unsigned long *d;
d = kmap_atomic(page);
for (i = 0; i < words; i++) {
if (d[i]) {
kunmap_atomic(d);
return false;
}
}
kunmap_atomic(d);
len -= l;
}
return true;
}
/**
* w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
* @w: work object.
@ -1064,7 +1134,10 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
if (likely(device->state.pdsk >= D_INCONSISTENT)) {
inc_rs_pending(device);
err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
if (peer_req->flags & EE_RS_THIN_REQ && all_zero(peer_req))
err = drbd_send_rs_deallocated(peer_device, peer_req);
else
err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
} else {
if (__ratelimit(&drbd_ratelimit_state))
drbd_err(device, "Not sending RSDataReply, "
@ -1634,7 +1707,7 @@ static bool use_checksum_based_resync(struct drbd_connection *connection, struct
rcu_read_unlock();
return connection->agreed_pro_version >= 89 && /* supported? */
connection->csums_tfm && /* configured? */
(csums_after_crash_only == 0 /* use for each resync? */
(csums_after_crash_only == false /* use for each resync? */
|| test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */
}
@ -1769,7 +1842,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
device->bm_resync_fo = 0;
device->use_csums = use_checksum_based_resync(connection, device);
} else {
device->use_csums = 0;
device->use_csums = false;
}
/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid

View File

@ -4350,8 +4350,7 @@ static int __init do_floppy_init(void)
/* to be cleaned up... */
disks[drive]->private_data = (void *)(long)drive;
disks[drive]->flags |= GENHD_FL_REMOVABLE;
disks[drive]->driverfs_dev = &floppy_device[drive].dev;
add_disk(disks[drive]);
device_add_disk(&floppy_device[drive].dev, disks[drive]);
}
return 0;

View File

@ -1765,6 +1765,7 @@ static int loop_add(struct loop_device **l, int i)
*/
queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
err = -ENOMEM;
disk = lo->lo_disk = alloc_disk(1 << part_shift);
if (!disk)
goto out_free_queue;

View File

@ -687,15 +687,13 @@ static unsigned int mg_issue_req(struct request *req,
unsigned int sect_num,
unsigned int sect_cnt)
{
switch (rq_data_dir(req)) {
case READ:
if (rq_data_dir(req) == READ) {
if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr)
!= MG_ERR_NONE) {
mg_bad_rw_intr(host);
return host->error;
}
break;
case WRITE:
} else {
/* TODO : handler */
outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr)
@ -714,7 +712,6 @@ static unsigned int mg_issue_req(struct request *req,
mod_timer(&host->timer, jiffies + 3 * HZ);
outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
MG_REG_COMMAND);
break;
}
return MG_ERR_NONE;
}
@ -1018,7 +1015,7 @@ probe_err_7:
probe_err_6:
blk_cleanup_queue(host->breq);
probe_err_5:
unregister_blkdev(MG_DISK_MAJ, MG_DISK_NAME);
unregister_blkdev(host->major, MG_DISK_NAME);
probe_err_4:
if (!prv_data->use_polling)
free_irq(host->irq, host);

View File

@ -3956,7 +3956,6 @@ static int mtip_block_initialize(struct driver_data *dd)
if (rv)
goto disk_index_error;
dd->disk->driverfs_dev = &dd->pdev->dev;
dd->disk->major = dd->major;
dd->disk->first_minor = index * MTIP_MAX_MINORS;
dd->disk->minors = MTIP_MAX_MINORS;
@ -4008,7 +4007,7 @@ skip_create_disk:
/*
* if rebuild pending, start the service thread, and delay the block
* queue creation and add_disk()
* queue creation and device_add_disk()
*/
if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
goto start_service_thread;
@ -4042,7 +4041,7 @@ skip_create_disk:
set_capacity(dd->disk, capacity);
/* Enable the block device and add it to /dev */
add_disk(dd->disk);
device_add_disk(&dd->pdev->dev, dd->disk);
dd->bdev = bdget_disk(dd->disk, 0);
/*

View File

@ -448,7 +448,7 @@ static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
struct request *rq;
struct bio *bio = rqd->bio;
rq = blk_mq_alloc_request(q, bio_rw(bio), 0);
rq = blk_mq_alloc_request(q, bio_data_dir(bio), 0);
if (IS_ERR(rq))
return -ENOMEM;

View File

@ -487,7 +487,6 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
gendisk->fops = &ps3disk_fops;
gendisk->queue = queue;
gendisk->private_data = dev;
gendisk->driverfs_dev = &dev->sbd.core;
snprintf(gendisk->disk_name, sizeof(gendisk->disk_name), PS3DISK_NAME,
devidx+'a');
priv->blocking_factor = dev->blk_size >> 9;
@ -499,7 +498,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
gendisk->disk_name, priv->model, priv->raw_capacity >> 11,
get_capacity(gendisk) >> 11);
add_disk(gendisk);
device_add_disk(&dev->sbd.core, gendisk);
return 0;
fail_cleanup_queue:

View File

@ -773,14 +773,13 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev)
gendisk->fops = &ps3vram_fops;
gendisk->queue = queue;
gendisk->private_data = dev;
gendisk->driverfs_dev = &dev->core;
strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name));
set_capacity(gendisk, priv->size >> 9);
dev_info(&dev->core, "%s: Using %lu MiB of GPU memory\n",
gendisk->disk_name, get_capacity(gendisk) >> 11);
add_disk(gendisk);
device_add_disk(&dev->core, gendisk);
return 0;
fail_cleanup_queue:

View File

@ -230,8 +230,7 @@ int rsxx_attach_dev(struct rsxx_cardinfo *card)
set_capacity(card->gendisk, card->size8 >> 9);
else
set_capacity(card->gendisk, 0);
add_disk(card->gendisk);
device_add_disk(CARD_TO_DEV(card), card->gendisk);
card->bdev_attached = 1;
}
@ -308,7 +307,6 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name),
"rsxx%d", card->disk_id);
card->gendisk->driverfs_dev = &card->dev->dev;
card->gendisk->major = card->major;
card->gendisk->first_minor = 0;
card->gendisk->fops = &rsxx_fops;

View File

@ -4690,10 +4690,10 @@ static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return -EIO;
}
static int skd_bdev_attach(struct skd_device *skdev)
static int skd_bdev_attach(struct device *parent, struct skd_device *skdev)
{
pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__);
add_disk(skdev->disk);
device_add_disk(parent, skdev->disk);
return 0;
}
@ -4812,8 +4812,6 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
pci_set_drvdata(pdev, skdev);
skdev->disk->driverfs_dev = &pdev->dev;
for (i = 0; i < SKD_MAX_BARS; i++) {
skdev->mem_phys[i] = pci_resource_start(pdev, i);
skdev->mem_size[i] = (u32)pci_resource_len(pdev, i);
@ -4851,7 +4849,7 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
(SKD_START_WAIT_SECONDS * HZ));
if (skdev->gendisk_on > 0) {
/* device came on-line after reset */
skd_bdev_attach(skdev);
skd_bdev_attach(&pdev->dev, skdev);
rc = 0;
} else {
/* we timed out, something is wrong with the device,

View File

@ -804,7 +804,6 @@ static int probe_disk(struct vdc_port *port)
g->fops = &vdc_fops;
g->queue = q;
g->private_data = port;
g->driverfs_dev = &port->vio.vdev->dev;
set_capacity(g, port->vdisk_size);
@ -835,7 +834,7 @@ static int probe_disk(struct vdc_port *port)
port->vdisk_size, (port->vdisk_size >> (20 - 9)),
port->vio.ver.major, port->vio.ver.minor);
add_disk(g);
device_add_disk(&port->vio.vdev->dev, g);
return 0;
}

View File

@ -344,7 +344,6 @@ static int add_bio(struct cardinfo *card)
int offset;
struct bio *bio;
struct bio_vec vec;
int rw;
bio = card->currentbio;
if (!bio && card->bio) {
@ -359,7 +358,6 @@ static int add_bio(struct cardinfo *card)
if (!bio)
return 0;
rw = bio_rw(bio);
if (card->mm_pages[card->Ready].cnt >= DESC_PER_PAGE)
return 0;
@ -369,7 +367,7 @@ static int add_bio(struct cardinfo *card)
vec.bv_page,
vec.bv_offset,
vec.bv_len,
(rw == READ) ?
bio_op(bio) == REQ_OP_READ ?
PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE);
p = &card->mm_pages[card->Ready];
@ -398,7 +396,7 @@ static int add_bio(struct cardinfo *card)
DMASCR_CHAIN_EN |
DMASCR_SEM_EN |
pci_cmds);
if (rw == WRITE)
if (bio_op(bio) == REQ_OP_WRITE)
desc->control_bits |= cpu_to_le32(DMASCR_TRANSFER_READ);
desc->sem_control_bits = desc->control_bits;

View File

@ -236,25 +236,22 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
static int virtblk_get_id(struct gendisk *disk, char *id_str)
{
struct virtio_blk *vblk = disk->private_data;
struct request_queue *q = vblk->disk->queue;
struct request *req;
struct bio *bio;
int err;
bio = bio_map_kern(vblk->disk->queue, id_str, VIRTIO_BLK_ID_BYTES,
GFP_KERNEL);
if (IS_ERR(bio))
return PTR_ERR(bio);
req = blk_make_request(vblk->disk->queue, bio, GFP_KERNEL);
if (IS_ERR(req)) {
bio_put(bio);
req = blk_get_request(q, READ, GFP_KERNEL);
if (IS_ERR(req))
return PTR_ERR(req);
}
req->cmd_type = REQ_TYPE_DRV_PRIV;
err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
blk_put_request(req);
err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
if (err)
goto out;
err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
out:
blk_put_request(req);
return err;
}
@ -656,7 +653,6 @@ static int virtblk_probe(struct virtio_device *vdev)
vblk->disk->first_minor = index_to_minor(index);
vblk->disk->private_data = vblk;
vblk->disk->fops = &virtblk_fops;
vblk->disk->driverfs_dev = &vdev->dev;
vblk->disk->flags |= GENHD_FL_EXT_DEVT;
vblk->index = index;
@ -733,7 +729,7 @@ static int virtblk_probe(struct virtio_device *vdev)
virtio_device_ready(vdev);
add_disk(vblk->disk);
device_add_disk(&vdev->dev, vblk->disk);
err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
if (err)
goto out_del_disk;

View File

@ -480,7 +480,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags))
vbd->flush_support = true;
if (q && blk_queue_secdiscard(q))
if (q && blk_queue_secure_erase(q))
vbd->discard_secure = true;
pr_debug("Successful creation of handle=%04x (dom=%u)\n",

View File

@ -548,7 +548,7 @@ static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_inf
ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
ring_req->u.discard.id = id;
ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
if (req_op(req) == REQ_OP_SECURE_ERASE && info->feature_secdiscard)
ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
else
ring_req->u.discard.flag = 0;
@ -844,7 +844,7 @@ static int blkif_queue_request(struct request *req, struct blkfront_ring_info *r
return 1;
if (unlikely(req_op(req) == REQ_OP_DISCARD ||
req->cmd_flags & REQ_SECURE))
req_op(req) == REQ_OP_SECURE_ERASE))
return blkif_queue_discard_req(req, rinfo);
else
return blkif_queue_rw_req(req, rinfo);
@ -952,7 +952,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
rq->limits.discard_granularity = info->discard_granularity;
rq->limits.discard_alignment = info->discard_alignment;
if (info->feature_secdiscard)
queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, rq);
}
/* Hard sector size and max sectors impersonate the equiv. hardware. */
@ -1134,7 +1134,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
gd->first_minor = minor;
gd->fops = &xlvbd_block_fops;
gd->private_data = info;
gd->driverfs_dev = &(info->xbdev->dev);
set_capacity(gd, capacity);
if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
@ -1592,7 +1591,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
info->feature_discard = 0;
info->feature_secdiscard = 0;
queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
queue_flag_clear(QUEUE_FLAG_SECERASE, rq);
}
blk_mq_complete_request(req, error);
break;
@ -2106,11 +2105,14 @@ static int blkfront_resume(struct xenbus_device *dev)
*/
if (req_op(shadow[i].request) == REQ_OP_FLUSH ||
req_op(shadow[i].request) == REQ_OP_DISCARD ||
shadow[j].request->cmd_flags & (REQ_FUA | REQ_SECURE)) {
req_op(shadow[i].request) == REQ_OP_SECURE_ERASE ||
shadow[j].request->cmd_flags & REQ_FUA) {
/*
* Flush operations don't contain bios, so
* we need to requeue the whole request
*
* XXX: but this doesn't make any sense for a
* write with the FUA flag set..
*/
list_add(&shadow[j].request->queuelist, &info->requests);
continue;
@ -2445,7 +2447,7 @@ static void blkfront_connect(struct blkfront_info *info)
for (i = 0; i < info->nr_rings; i++)
kick_pending_request_queues(&info->rinfo[i]);
add_disk(info->gd);
device_add_disk(&info->xbdev->dev, info->gd);
info->is_ready = 1;
}

View File

@ -2032,7 +2032,7 @@ static int cdrom_read_subchannel(struct cdrom_device_info *cdi,
init_cdrom_command(&cgc, buffer, 16, CGC_DATA_READ);
cgc.cmd[0] = GPCMD_READ_SUBCHANNEL;
cgc.cmd[1] = 2; /* MSF addressing */
cgc.cmd[1] = subchnl->cdsc_format;/* MSF or LBA addressing */
cgc.cmd[2] = 0x40; /* request subQ data */
cgc.cmd[3] = mcn ? 2 : 1;
cgc.cmd[8] = 16;
@ -2041,17 +2041,27 @@ static int cdrom_read_subchannel(struct cdrom_device_info *cdi,
return ret;
subchnl->cdsc_audiostatus = cgc.buffer[1];
subchnl->cdsc_format = CDROM_MSF;
subchnl->cdsc_ctrl = cgc.buffer[5] & 0xf;
subchnl->cdsc_trk = cgc.buffer[6];
subchnl->cdsc_ind = cgc.buffer[7];
subchnl->cdsc_reladdr.msf.minute = cgc.buffer[13];
subchnl->cdsc_reladdr.msf.second = cgc.buffer[14];
subchnl->cdsc_reladdr.msf.frame = cgc.buffer[15];
subchnl->cdsc_absaddr.msf.minute = cgc.buffer[9];
subchnl->cdsc_absaddr.msf.second = cgc.buffer[10];
subchnl->cdsc_absaddr.msf.frame = cgc.buffer[11];
if (subchnl->cdsc_format == CDROM_LBA) {
subchnl->cdsc_absaddr.lba = ((cgc.buffer[8] << 24) |
(cgc.buffer[9] << 16) |
(cgc.buffer[10] << 8) |
(cgc.buffer[11]));
subchnl->cdsc_reladdr.lba = ((cgc.buffer[12] << 24) |
(cgc.buffer[13] << 16) |
(cgc.buffer[14] << 8) |
(cgc.buffer[15]));
} else {
subchnl->cdsc_reladdr.msf.minute = cgc.buffer[13];
subchnl->cdsc_reladdr.msf.second = cgc.buffer[14];
subchnl->cdsc_reladdr.msf.frame = cgc.buffer[15];
subchnl->cdsc_absaddr.msf.minute = cgc.buffer[9];
subchnl->cdsc_absaddr.msf.second = cgc.buffer[10];
subchnl->cdsc_absaddr.msf.frame = cgc.buffer[11];
}
return 0;
}
@ -3022,7 +3032,7 @@ static noinline int mmc_ioctl_cdrom_subchannel(struct cdrom_device_info *cdi,
if (!((requested == CDROM_MSF) ||
(requested == CDROM_LBA)))
return -EINVAL;
q.cdsc_format = CDROM_MSF;
ret = cdrom_read_subchannel(cdi, &q, 0);
if (ret)
return ret;

View File

@ -1770,7 +1770,6 @@ static int ide_cd_probe(ide_drive_t *drive)
drive->driver_data = info;
g->minors = 1;
g->driverfs_dev = &drive->gendev;
g->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE;
if (ide_cdrom_setup(drive)) {
put_device(&info->dev);
@ -1780,7 +1779,7 @@ static int ide_cd_probe(ide_drive_t *drive)
ide_cd_read_toc(drive, &sense);
g->fops = &idecd_ops;
g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
add_disk(g);
device_add_disk(&drive->gendev, g);
return 0;
out_free_disk:

View File

@ -412,12 +412,11 @@ static int ide_gd_probe(ide_drive_t *drive)
set_capacity(g, ide_gd_capacity(drive));
g->minors = IDE_DISK_MINORS;
g->driverfs_dev = &drive->gendev;
g->flags |= GENHD_FL_EXT_DEVT;
if (drive->dev_flags & IDE_DFLAG_REMOVABLE)
g->flags = GENHD_FL_REMOVABLE;
g->fops = &ide_gd_ops;
add_disk(g);
device_add_disk(&drive->gendev, g);
return 0;
out_free_disk:

View File

@ -27,11 +27,13 @@ config NVM_DEBUG
It is required to create/remove targets without IOCTLs.
config NVM_GENNVM
tristate "Generic NVM manager for Open-Channel SSDs"
tristate "General Non-Volatile Memory Manager for Open-Channel SSDs"
---help---
NVM media manager for Open-Channel SSDs that offload management
functionality to device, while keeping data placement and garbage
collection decisions on the host.
Non-volatile memory media manager for Open-Channel SSDs that implements
physical media metadata management and block provisioning API.
This is the standard media manager for using Open-Channel SSDs, and
required for targets to be instantiated.
config NVM_RRPC
tristate "Round-robin Hybrid Open-Channel SSD target"

View File

@ -18,8 +18,6 @@
*
*/
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/sem.h>
@ -28,46 +26,42 @@
#include <linux/miscdevice.h>
#include <linux/lightnvm.h>
#include <linux/sched/sysctl.h>
#include <uapi/linux/lightnvm.h>
static LIST_HEAD(nvm_tgt_types);
static DECLARE_RWSEM(nvm_tgtt_lock);
static LIST_HEAD(nvm_mgrs);
static LIST_HEAD(nvm_devices);
static LIST_HEAD(nvm_targets);
static DECLARE_RWSEM(nvm_lock);
static struct nvm_target *nvm_find_target(const char *name)
struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock)
{
struct nvm_target *tgt;
struct nvm_tgt_type *tmp, *tt = NULL;
list_for_each_entry(tgt, &nvm_targets, list)
if (!strcmp(name, tgt->disk->disk_name))
return tgt;
if (lock)
down_write(&nvm_tgtt_lock);
return NULL;
}
static struct nvm_tgt_type *nvm_find_target_type(const char *name)
{
struct nvm_tgt_type *tt;
list_for_each_entry(tt, &nvm_tgt_types, list)
if (!strcmp(name, tt->name))
return tt;
return NULL;
list_for_each_entry(tmp, &nvm_tgt_types, list)
if (!strcmp(name, tmp->name)) {
tt = tmp;
break;
}
if (lock)
up_write(&nvm_tgtt_lock);
return tt;
}
EXPORT_SYMBOL(nvm_find_target_type);
int nvm_register_tgt_type(struct nvm_tgt_type *tt)
{
int ret = 0;
down_write(&nvm_lock);
if (nvm_find_target_type(tt->name))
down_write(&nvm_tgtt_lock);
if (nvm_find_target_type(tt->name, 0))
ret = -EEXIST;
else
list_add(&tt->list, &nvm_tgt_types);
up_write(&nvm_lock);
up_write(&nvm_tgtt_lock);
return ret;
}
@ -110,7 +104,7 @@ static struct nvmm_type *nvm_find_mgr_type(const char *name)
return NULL;
}
struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev)
static struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev)
{
struct nvmm_type *mt;
int ret;
@ -182,20 +176,6 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name)
return NULL;
}
struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *dev, struct nvm_lun *lun,
unsigned long flags)
{
return dev->mt->get_blk_unlocked(dev, lun, flags);
}
EXPORT_SYMBOL(nvm_get_blk_unlocked);
/* Assumes that all valid pages have already been moved on release to bm */
void nvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk)
{
return dev->mt->put_blk_unlocked(dev, blk);
}
EXPORT_SYMBOL(nvm_put_blk_unlocked);
struct nvm_block *nvm_get_blk(struct nvm_dev *dev, struct nvm_lun *lun,
unsigned long flags)
{
@ -210,6 +190,12 @@ void nvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk)
}
EXPORT_SYMBOL(nvm_put_blk);
void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
{
return dev->mt->mark_blk(dev, ppa, type);
}
EXPORT_SYMBOL(nvm_mark_blk);
int nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
{
return dev->mt->submit_io(dev, rqd);
@ -251,9 +237,10 @@ void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
EXPORT_SYMBOL(nvm_generic_to_addr_mode);
int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
struct ppa_addr *ppas, int nr_ppas, int vblk)
const struct ppa_addr *ppas, int nr_ppas, int vblk)
{
int i, plane_cnt, pl_idx;
struct ppa_addr ppa;
if ((!vblk || dev->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) {
rqd->nr_ppas = nr_ppas;
@ -278,8 +265,9 @@ int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
for (i = 0; i < nr_ppas; i++) {
for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
ppas[i].g.pl = pl_idx;
rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppas[i];
ppa = ppas[i];
ppa.g.pl = pl_idx;
rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa;
}
}
}
@ -337,7 +325,7 @@ static void nvm_end_io_sync(struct nvm_rq *rqd)
complete(waiting);
}
int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode,
static int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode,
int flags, void *buf, int len)
{
DECLARE_COMPLETION_ONSTACK(wait);
@ -367,7 +355,9 @@ int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode,
/* Prevent hang_check timer from firing at us during very long I/O */
hang_check = sysctl_hung_task_timeout_secs;
if (hang_check)
while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
while (!wait_for_completion_io_timeout(&wait,
hang_check * (HZ/2)))
;
else
wait_for_completion_io(&wait);
@ -510,7 +500,8 @@ static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
/* The lower page table encoding consists of a list of bytes, where each
* has a lower and an upper half. The first half byte maintains the
* increment value and every value after is an offset added to the
* previous incrementation value */
* previous incrementation value
*/
dev->lptbl[0] = mlc->pairs[0] & 0xF;
for (i = 1; i < dev->lps_per_blk; i++) {
p = mlc->pairs[i >> 1];
@ -596,42 +587,11 @@ err_fmtype:
return ret;
}
static void nvm_remove_target(struct nvm_target *t)
{
struct nvm_tgt_type *tt = t->type;
struct gendisk *tdisk = t->disk;
struct request_queue *q = tdisk->queue;
lockdep_assert_held(&nvm_lock);
del_gendisk(tdisk);
blk_cleanup_queue(q);
if (tt->exit)
tt->exit(tdisk->private_data);
put_disk(tdisk);
list_del(&t->list);
kfree(t);
}
static void nvm_free_mgr(struct nvm_dev *dev)
{
struct nvm_target *tgt, *tmp;
if (!dev->mt)
return;
down_write(&nvm_lock);
list_for_each_entry_safe(tgt, tmp, &nvm_targets, list) {
if (tgt->dev != dev)
continue;
nvm_remove_target(tgt);
}
up_write(&nvm_lock);
dev->mt->unregister_mgr(dev);
dev->mt = NULL;
}
@ -778,91 +738,6 @@ void nvm_unregister(char *disk_name)
}
EXPORT_SYMBOL(nvm_unregister);
static const struct block_device_operations nvm_fops = {
.owner = THIS_MODULE,
};
static int nvm_create_target(struct nvm_dev *dev,
struct nvm_ioctl_create *create)
{
struct nvm_ioctl_create_simple *s = &create->conf.s;
struct request_queue *tqueue;
struct gendisk *tdisk;
struct nvm_tgt_type *tt;
struct nvm_target *t;
void *targetdata;
if (!dev->mt) {
pr_info("nvm: device has no media manager registered.\n");
return -ENODEV;
}
down_write(&nvm_lock);
tt = nvm_find_target_type(create->tgttype);
if (!tt) {
pr_err("nvm: target type %s not found\n", create->tgttype);
up_write(&nvm_lock);
return -EINVAL;
}
t = nvm_find_target(create->tgtname);
if (t) {
pr_err("nvm: target name already exists.\n");
up_write(&nvm_lock);
return -EINVAL;
}
up_write(&nvm_lock);
t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL);
if (!t)
return -ENOMEM;
tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
if (!tqueue)
goto err_t;
blk_queue_make_request(tqueue, tt->make_rq);
tdisk = alloc_disk(0);
if (!tdisk)
goto err_queue;
sprintf(tdisk->disk_name, "%s", create->tgtname);
tdisk->flags = GENHD_FL_EXT_DEVT;
tdisk->major = 0;
tdisk->first_minor = 0;
tdisk->fops = &nvm_fops;
tdisk->queue = tqueue;
targetdata = tt->init(dev, tdisk, s->lun_begin, s->lun_end);
if (IS_ERR(targetdata))
goto err_init;
tdisk->private_data = targetdata;
tqueue->queuedata = targetdata;
blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect);
set_capacity(tdisk, tt->capacity(targetdata));
add_disk(tdisk);
t->type = tt;
t->disk = tdisk;
t->dev = dev;
down_write(&nvm_lock);
list_add_tail(&t->list, &nvm_targets);
up_write(&nvm_lock);
return 0;
err_init:
put_disk(tdisk);
err_queue:
blk_cleanup_queue(tqueue);
err_t:
kfree(t);
return -ENOMEM;
}
static int __nvm_configure_create(struct nvm_ioctl_create *create)
{
struct nvm_dev *dev;
@ -871,11 +746,17 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
down_write(&nvm_lock);
dev = nvm_find_nvm_dev(create->dev);
up_write(&nvm_lock);
if (!dev) {
pr_err("nvm: device not found\n");
return -EINVAL;
}
if (!dev->mt) {
pr_info("nvm: device has no media manager registered.\n");
return -ENODEV;
}
if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) {
pr_err("nvm: config type not valid\n");
return -EINVAL;
@ -888,25 +769,7 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
return -EINVAL;
}
return nvm_create_target(dev, create);
}
static int __nvm_configure_remove(struct nvm_ioctl_remove *remove)
{
struct nvm_target *t;
down_write(&nvm_lock);
t = nvm_find_target(remove->tgtname);
if (!t) {
pr_err("nvm: target \"%s\" doesn't exist.\n", remove->tgtname);
up_write(&nvm_lock);
return -EINVAL;
}
nvm_remove_target(t);
up_write(&nvm_lock);
return 0;
return dev->mt->create_tgt(dev, create);
}
#ifdef CONFIG_NVM_DEBUG
@ -941,8 +804,9 @@ static int nvm_configure_show(const char *val)
static int nvm_configure_remove(const char *val)
{
struct nvm_ioctl_remove remove;
struct nvm_dev *dev;
char opcode;
int ret;
int ret = 0;
ret = sscanf(val, "%c %256s", &opcode, remove.tgtname);
if (ret != 2) {
@ -952,7 +816,13 @@ static int nvm_configure_remove(const char *val)
remove.flags = 0;
return __nvm_configure_remove(&remove);
list_for_each_entry(dev, &nvm_devices, devices) {
ret = dev->mt->remove_tgt(dev, &remove);
if (!ret)
break;
}
return ret;
}
static int nvm_configure_create(const char *val)
@ -1149,6 +1019,8 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
static long nvm_ioctl_dev_remove(struct file *file, void __user *arg)
{
struct nvm_ioctl_remove remove;
struct nvm_dev *dev;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@ -1163,7 +1035,13 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg)
return -EINVAL;
}
return __nvm_configure_remove(&remove);
list_for_each_entry(dev, &nvm_devices, devices) {
ret = dev->mt->remove_tgt(dev, &remove);
if (!ret)
break;
}
return ret;
}
static void nvm_setup_nvm_sb_info(struct nvm_sb_info *info)

View File

@ -15,22 +15,160 @@
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
* USA.
*
* Implementation of a generic nvm manager for Open-Channel SSDs.
* Implementation of a general nvm manager for Open-Channel SSDs.
*/
#include "gennvm.h"
static int gennvm_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len)
static struct nvm_target *gen_find_target(struct gen_dev *gn, const char *name)
{
struct gen_nvm *gn = dev->mp;
struct gennvm_area *area, *prev, *next;
struct nvm_target *tgt;
list_for_each_entry(tgt, &gn->targets, list)
if (!strcmp(name, tgt->disk->disk_name))
return tgt;
return NULL;
}
static const struct block_device_operations gen_fops = {
.owner = THIS_MODULE,
};
static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
{
struct gen_dev *gn = dev->mp;
struct nvm_ioctl_create_simple *s = &create->conf.s;
struct request_queue *tqueue;
struct gendisk *tdisk;
struct nvm_tgt_type *tt;
struct nvm_target *t;
void *targetdata;
tt = nvm_find_target_type(create->tgttype, 1);
if (!tt) {
pr_err("nvm: target type %s not found\n", create->tgttype);
return -EINVAL;
}
mutex_lock(&gn->lock);
t = gen_find_target(gn, create->tgtname);
if (t) {
pr_err("nvm: target name already exists.\n");
mutex_unlock(&gn->lock);
return -EINVAL;
}
mutex_unlock(&gn->lock);
t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL);
if (!t)
return -ENOMEM;
tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
if (!tqueue)
goto err_t;
blk_queue_make_request(tqueue, tt->make_rq);
tdisk = alloc_disk(0);
if (!tdisk)
goto err_queue;
sprintf(tdisk->disk_name, "%s", create->tgtname);
tdisk->flags = GENHD_FL_EXT_DEVT;
tdisk->major = 0;
tdisk->first_minor = 0;
tdisk->fops = &gen_fops;
tdisk->queue = tqueue;
targetdata = tt->init(dev, tdisk, s->lun_begin, s->lun_end);
if (IS_ERR(targetdata))
goto err_init;
tdisk->private_data = targetdata;
tqueue->queuedata = targetdata;
blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect);
set_capacity(tdisk, tt->capacity(targetdata));
add_disk(tdisk);
t->type = tt;
t->disk = tdisk;
t->dev = dev;
mutex_lock(&gn->lock);
list_add_tail(&t->list, &gn->targets);
mutex_unlock(&gn->lock);
return 0;
err_init:
put_disk(tdisk);
err_queue:
blk_cleanup_queue(tqueue);
err_t:
kfree(t);
return -ENOMEM;
}
static void __gen_remove_target(struct nvm_target *t)
{
struct nvm_tgt_type *tt = t->type;
struct gendisk *tdisk = t->disk;
struct request_queue *q = tdisk->queue;
del_gendisk(tdisk);
blk_cleanup_queue(q);
if (tt->exit)
tt->exit(tdisk->private_data);
put_disk(tdisk);
list_del(&t->list);
kfree(t);
}
/**
* gen_remove_tgt - Removes a target from the media manager
* @dev: device
* @remove: ioctl structure with target name to remove.
*
* Returns:
* 0: on success
* 1: on not found
* <0: on error
*/
static int gen_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove)
{
struct gen_dev *gn = dev->mp;
struct nvm_target *t;
if (!gn)
return 1;
mutex_lock(&gn->lock);
t = gen_find_target(gn, remove->tgtname);
if (!t) {
mutex_unlock(&gn->lock);
return 1;
}
__gen_remove_target(t);
mutex_unlock(&gn->lock);
return 0;
}
static int gen_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len)
{
struct gen_dev *gn = dev->mp;
struct gen_area *area, *prev, *next;
sector_t begin = 0;
sector_t max_sectors = (dev->sec_size * dev->total_secs) >> 9;
if (len > max_sectors)
return -EINVAL;
area = kmalloc(sizeof(struct gennvm_area), GFP_KERNEL);
area = kmalloc(sizeof(struct gen_area), GFP_KERNEL);
if (!area)
return -ENOMEM;
@ -64,10 +202,10 @@ static int gennvm_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len)
return 0;
}
static void gennvm_put_area(struct nvm_dev *dev, sector_t begin)
static void gen_put_area(struct nvm_dev *dev, sector_t begin)
{
struct gen_nvm *gn = dev->mp;
struct gennvm_area *area;
struct gen_dev *gn = dev->mp;
struct gen_area *area;
spin_lock(&dev->lock);
list_for_each_entry(area, &gn->area_list, list) {
@ -82,27 +220,27 @@ static void gennvm_put_area(struct nvm_dev *dev, sector_t begin)
spin_unlock(&dev->lock);
}
static void gennvm_blocks_free(struct nvm_dev *dev)
static void gen_blocks_free(struct nvm_dev *dev)
{
struct gen_nvm *gn = dev->mp;
struct gen_dev *gn = dev->mp;
struct gen_lun *lun;
int i;
gennvm_for_each_lun(gn, lun, i) {
gen_for_each_lun(gn, lun, i) {
if (!lun->vlun.blocks)
break;
vfree(lun->vlun.blocks);
}
}
static void gennvm_luns_free(struct nvm_dev *dev)
static void gen_luns_free(struct nvm_dev *dev)
{
struct gen_nvm *gn = dev->mp;
struct gen_dev *gn = dev->mp;
kfree(gn->luns);
}
static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn)
static int gen_luns_init(struct nvm_dev *dev, struct gen_dev *gn)
{
struct gen_lun *lun;
int i;
@ -111,7 +249,7 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn)
if (!gn->luns)
return -ENOMEM;
gennvm_for_each_lun(gn, lun, i) {
gen_for_each_lun(gn, lun, i) {
spin_lock_init(&lun->vlun.lock);
INIT_LIST_HEAD(&lun->free_list);
INIT_LIST_HEAD(&lun->used_list);
@ -122,14 +260,11 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn)
lun->vlun.lun_id = i % dev->luns_per_chnl;
lun->vlun.chnl_id = i / dev->luns_per_chnl;
lun->vlun.nr_free_blocks = dev->blks_per_lun;
lun->vlun.nr_open_blocks = 0;
lun->vlun.nr_closed_blocks = 0;
lun->vlun.nr_bad_blocks = 0;
}
return 0;
}
static int gennvm_block_bb(struct gen_nvm *gn, struct ppa_addr ppa,
static int gen_block_bb(struct gen_dev *gn, struct ppa_addr ppa,
u8 *blks, int nr_blks)
{
struct nvm_dev *dev = gn->dev;
@ -149,17 +284,16 @@ static int gennvm_block_bb(struct gen_nvm *gn, struct ppa_addr ppa,
blk = &lun->vlun.blocks[i];
list_move_tail(&blk->list, &lun->bb_list);
lun->vlun.nr_bad_blocks++;
lun->vlun.nr_free_blocks--;
}
return 0;
}
static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
static int gen_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
{
struct nvm_dev *dev = private;
struct gen_nvm *gn = dev->mp;
struct gen_dev *gn = dev->mp;
u64 elba = slba + nlb;
struct gen_lun *lun;
struct nvm_block *blk;
@ -167,7 +301,7 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
int lun_id;
if (unlikely(elba > dev->total_secs)) {
pr_err("gennvm: L2P data from device is out of bounds!\n");
pr_err("gen: L2P data from device is out of bounds!\n");
return -EINVAL;
}
@ -175,7 +309,7 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
u64 pba = le64_to_cpu(entries[i]);
if (unlikely(pba >= dev->total_secs && pba != U64_MAX)) {
pr_err("gennvm: L2P data entry is out of bounds!\n");
pr_err("gen: L2P data entry is out of bounds!\n");
return -EINVAL;
}
@ -200,16 +334,15 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
* block state. The block is assumed to be open.
*/
list_move_tail(&blk->list, &lun->used_list);
blk->state = NVM_BLK_ST_OPEN;
blk->state = NVM_BLK_ST_TGT;
lun->vlun.nr_free_blocks--;
lun->vlun.nr_open_blocks++;
}
}
return 0;
}
static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn)
static int gen_blocks_init(struct nvm_dev *dev, struct gen_dev *gn)
{
struct gen_lun *lun;
struct nvm_block *block;
@ -222,7 +355,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn)
if (!blks)
return -ENOMEM;
gennvm_for_each_lun(gn, lun, lun_iter) {
gen_for_each_lun(gn, lun, lun_iter) {
lun->vlun.blocks = vzalloc(sizeof(struct nvm_block) *
dev->blks_per_lun);
if (!lun->vlun.blocks) {
@ -256,20 +389,20 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn)
ret = nvm_get_bb_tbl(dev, ppa, blks);
if (ret)
pr_err("gennvm: could not get BB table\n");
pr_err("gen: could not get BB table\n");
ret = gennvm_block_bb(gn, ppa, blks, nr_blks);
ret = gen_block_bb(gn, ppa, blks, nr_blks);
if (ret)
pr_err("gennvm: BB table map failed\n");
pr_err("gen: BB table map failed\n");
}
}
if ((dev->identity.dom & NVM_RSP_L2P) && dev->ops->get_l2p_tbl) {
ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_secs,
gennvm_block_map, dev);
gen_block_map, dev);
if (ret) {
pr_err("gennvm: could not read L2P table.\n");
pr_warn("gennvm: default block initialization");
pr_err("gen: could not read L2P table.\n");
pr_warn("gen: default block initialization");
}
}
@ -277,67 +410,79 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn)
return 0;
}
static void gennvm_free(struct nvm_dev *dev)
static void gen_free(struct nvm_dev *dev)
{
gennvm_blocks_free(dev);
gennvm_luns_free(dev);
gen_blocks_free(dev);
gen_luns_free(dev);
kfree(dev->mp);
dev->mp = NULL;
}
static int gennvm_register(struct nvm_dev *dev)
static int gen_register(struct nvm_dev *dev)
{
struct gen_nvm *gn;
struct gen_dev *gn;
int ret;
if (!try_module_get(THIS_MODULE))
return -ENODEV;
gn = kzalloc(sizeof(struct gen_nvm), GFP_KERNEL);
gn = kzalloc(sizeof(struct gen_dev), GFP_KERNEL);
if (!gn)
return -ENOMEM;
gn->dev = dev;
gn->nr_luns = dev->nr_luns;
INIT_LIST_HEAD(&gn->area_list);
mutex_init(&gn->lock);
INIT_LIST_HEAD(&gn->targets);
dev->mp = gn;
ret = gennvm_luns_init(dev, gn);
ret = gen_luns_init(dev, gn);
if (ret) {
pr_err("gennvm: could not initialize luns\n");
pr_err("gen: could not initialize luns\n");
goto err;
}
ret = gennvm_blocks_init(dev, gn);
ret = gen_blocks_init(dev, gn);
if (ret) {
pr_err("gennvm: could not initialize blocks\n");
pr_err("gen: could not initialize blocks\n");
goto err;
}
return 1;
err:
gennvm_free(dev);
gen_free(dev);
module_put(THIS_MODULE);
return ret;
}
static void gennvm_unregister(struct nvm_dev *dev)
static void gen_unregister(struct nvm_dev *dev)
{
gennvm_free(dev);
struct gen_dev *gn = dev->mp;
struct nvm_target *t, *tmp;
mutex_lock(&gn->lock);
list_for_each_entry_safe(t, tmp, &gn->targets, list) {
if (t->dev != dev)
continue;
__gen_remove_target(t);
}
mutex_unlock(&gn->lock);
gen_free(dev);
module_put(THIS_MODULE);
}
static struct nvm_block *gennvm_get_blk_unlocked(struct nvm_dev *dev,
static struct nvm_block *gen_get_blk(struct nvm_dev *dev,
struct nvm_lun *vlun, unsigned long flags)
{
struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun);
struct nvm_block *blk = NULL;
int is_gc = flags & NVM_IOTYPE_GC;
assert_spin_locked(&vlun->lock);
spin_lock(&vlun->lock);
if (list_empty(&lun->free_list)) {
pr_err_ratelimited("gennvm: lun %u have no free pages available",
pr_err_ratelimited("gen: lun %u have no free pages available",
lun->vlun.id);
goto out;
}
@ -346,88 +491,58 @@ static struct nvm_block *gennvm_get_blk_unlocked(struct nvm_dev *dev,
goto out;
blk = list_first_entry(&lun->free_list, struct nvm_block, list);
list_move_tail(&blk->list, &lun->used_list);
blk->state = NVM_BLK_ST_OPEN;
blk->state = NVM_BLK_ST_TGT;
lun->vlun.nr_free_blocks--;
lun->vlun.nr_open_blocks++;
out:
return blk;
}
static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev,
struct nvm_lun *vlun, unsigned long flags)
{
struct nvm_block *blk;
spin_lock(&vlun->lock);
blk = gennvm_get_blk_unlocked(dev, vlun, flags);
spin_unlock(&vlun->lock);
return blk;
}
static void gennvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk)
static void gen_put_blk(struct nvm_dev *dev, struct nvm_block *blk)
{
struct nvm_lun *vlun = blk->lun;
struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun);
assert_spin_locked(&vlun->lock);
if (blk->state & NVM_BLK_ST_OPEN) {
spin_lock(&vlun->lock);
if (blk->state & NVM_BLK_ST_TGT) {
list_move_tail(&blk->list, &lun->free_list);
lun->vlun.nr_open_blocks--;
lun->vlun.nr_free_blocks++;
blk->state = NVM_BLK_ST_FREE;
} else if (blk->state & NVM_BLK_ST_CLOSED) {
list_move_tail(&blk->list, &lun->free_list);
lun->vlun.nr_closed_blocks--;
lun->vlun.nr_free_blocks++;
blk->state = NVM_BLK_ST_FREE;
} else if (blk->state & NVM_BLK_ST_BAD) {
list_move_tail(&blk->list, &lun->bb_list);
lun->vlun.nr_bad_blocks++;
blk->state = NVM_BLK_ST_BAD;
} else {
WARN_ON_ONCE(1);
pr_err("gennvm: erroneous block type (%lu -> %u)\n",
pr_err("gen: erroneous block type (%lu -> %u)\n",
blk->id, blk->state);
list_move_tail(&blk->list, &lun->bb_list);
lun->vlun.nr_bad_blocks++;
blk->state = NVM_BLK_ST_BAD;
}
}
static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk)
{
struct nvm_lun *vlun = blk->lun;
spin_lock(&vlun->lock);
gennvm_put_blk_unlocked(dev, blk);
spin_unlock(&vlun->lock);
}
static void gennvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
static void gen_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
{
struct gen_nvm *gn = dev->mp;
struct gen_dev *gn = dev->mp;
struct gen_lun *lun;
struct nvm_block *blk;
pr_debug("gennvm: ppa (ch: %u lun: %u blk: %u pg: %u) -> %u\n",
pr_debug("gen: ppa (ch: %u lun: %u blk: %u pg: %u) -> %u\n",
ppa.g.ch, ppa.g.lun, ppa.g.blk, ppa.g.pg, type);
if (unlikely(ppa.g.ch > dev->nr_chnls ||
ppa.g.lun > dev->luns_per_chnl ||
ppa.g.blk > dev->blks_per_lun)) {
WARN_ON_ONCE(1);
pr_err("gennvm: ppa broken (ch: %u > %u lun: %u > %u blk: %u > %u",
pr_err("gen: ppa broken (ch: %u > %u lun: %u > %u blk: %u > %u",
ppa.g.ch, dev->nr_chnls,
ppa.g.lun, dev->luns_per_chnl,
ppa.g.blk, dev->blks_per_lun);
return;
}
lun = &gn->luns[ppa.g.lun * ppa.g.ch];
lun = &gn->luns[(dev->luns_per_chnl * ppa.g.ch) + ppa.g.lun];
blk = &lun->vlun.blocks[ppa.g.blk];
/* will be moved to bb list on put_blk from target */
@ -435,9 +550,9 @@ static void gennvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
}
/*
* mark block bad in gennvm. It is expected that the target recovers separately
* mark block bad in gen. It is expected that the target recovers separately
*/
static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd)
static void gen_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd)
{
int bit = -1;
int max_secs = dev->ops->max_phys_sect;
@ -447,25 +562,25 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd)
/* look up blocks and mark them as bad */
if (rqd->nr_ppas == 1) {
gennvm_mark_blk(dev, rqd->ppa_addr, NVM_BLK_ST_BAD);
gen_mark_blk(dev, rqd->ppa_addr, NVM_BLK_ST_BAD);
return;
}
while ((bit = find_next_bit(comp_bits, max_secs, bit + 1)) < max_secs)
gennvm_mark_blk(dev, rqd->ppa_list[bit], NVM_BLK_ST_BAD);
gen_mark_blk(dev, rqd->ppa_list[bit], NVM_BLK_ST_BAD);
}
static void gennvm_end_io(struct nvm_rq *rqd)
static void gen_end_io(struct nvm_rq *rqd)
{
struct nvm_tgt_instance *ins = rqd->ins;
if (rqd->error == NVM_RSP_ERR_FAILWRITE)
gennvm_mark_blk_bad(rqd->dev, rqd);
gen_mark_blk_bad(rqd->dev, rqd);
ins->tt->end_io(rqd);
}
static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
static int gen_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
{
if (!dev->ops->submit_io)
return -ENODEV;
@ -474,11 +589,11 @@ static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
nvm_generic_to_addr_mode(dev, rqd);
rqd->dev = dev;
rqd->end_io = gennvm_end_io;
rqd->end_io = gen_end_io;
return dev->ops->submit_io(dev, rqd);
}
static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk,
static int gen_erase_blk(struct nvm_dev *dev, struct nvm_block *blk,
unsigned long flags)
{
struct ppa_addr addr = block_to_ppa(dev, blk);
@ -486,19 +601,19 @@ static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk,
return nvm_erase_ppa(dev, &addr, 1);
}
static int gennvm_reserve_lun(struct nvm_dev *dev, int lunid)
static int gen_reserve_lun(struct nvm_dev *dev, int lunid)
{
return test_and_set_bit(lunid, dev->lun_map);
}
static void gennvm_release_lun(struct nvm_dev *dev, int lunid)
static void gen_release_lun(struct nvm_dev *dev, int lunid)
{
WARN_ON(!test_and_clear_bit(lunid, dev->lun_map));
}
static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid)
static struct nvm_lun *gen_get_lun(struct nvm_dev *dev, int lunid)
{
struct gen_nvm *gn = dev->mp;
struct gen_dev *gn = dev->mp;
if (unlikely(lunid >= dev->nr_luns))
return NULL;
@ -506,66 +621,62 @@ static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid)
return &gn->luns[lunid].vlun;
}
static void gennvm_lun_info_print(struct nvm_dev *dev)
static void gen_lun_info_print(struct nvm_dev *dev)
{
struct gen_nvm *gn = dev->mp;
struct gen_dev *gn = dev->mp;
struct gen_lun *lun;
unsigned int i;
gennvm_for_each_lun(gn, lun, i) {
gen_for_each_lun(gn, lun, i) {
spin_lock(&lun->vlun.lock);
pr_info("%s: lun%8u\t%u\t%u\t%u\t%u\n",
dev->name, i,
lun->vlun.nr_free_blocks,
lun->vlun.nr_open_blocks,
lun->vlun.nr_closed_blocks,
lun->vlun.nr_bad_blocks);
pr_info("%s: lun%8u\t%u\n", dev->name, i,
lun->vlun.nr_free_blocks);
spin_unlock(&lun->vlun.lock);
}
}
static struct nvmm_type gennvm = {
static struct nvmm_type gen = {
.name = "gennvm",
.version = {0, 1, 0},
.register_mgr = gennvm_register,
.unregister_mgr = gennvm_unregister,
.register_mgr = gen_register,
.unregister_mgr = gen_unregister,
.get_blk_unlocked = gennvm_get_blk_unlocked,
.put_blk_unlocked = gennvm_put_blk_unlocked,
.create_tgt = gen_create_tgt,
.remove_tgt = gen_remove_tgt,
.get_blk = gennvm_get_blk,
.put_blk = gennvm_put_blk,
.get_blk = gen_get_blk,
.put_blk = gen_put_blk,
.submit_io = gennvm_submit_io,
.erase_blk = gennvm_erase_blk,
.submit_io = gen_submit_io,
.erase_blk = gen_erase_blk,
.mark_blk = gennvm_mark_blk,
.mark_blk = gen_mark_blk,
.get_lun = gennvm_get_lun,
.reserve_lun = gennvm_reserve_lun,
.release_lun = gennvm_release_lun,
.lun_info_print = gennvm_lun_info_print,
.get_lun = gen_get_lun,
.reserve_lun = gen_reserve_lun,
.release_lun = gen_release_lun,
.lun_info_print = gen_lun_info_print,
.get_area = gennvm_get_area,
.put_area = gennvm_put_area,
.get_area = gen_get_area,
.put_area = gen_put_area,
};
static int __init gennvm_module_init(void)
static int __init gen_module_init(void)
{
return nvm_register_mgr(&gennvm);
return nvm_register_mgr(&gen);
}
static void gennvm_module_exit(void)
static void gen_module_exit(void)
{
nvm_unregister_mgr(&gennvm);
nvm_unregister_mgr(&gen);
}
module_init(gennvm_module_init);
module_exit(gennvm_module_exit);
module_init(gen_module_init);
module_exit(gen_module_exit);
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("Generic media manager for Open-Channel SSDs");
MODULE_DESCRIPTION("General media manager for Open-Channel SSDs");

View File

@ -34,20 +34,24 @@ struct gen_lun {
*/
};
struct gen_nvm {
struct gen_dev {
struct nvm_dev *dev;
int nr_luns;
struct gen_lun *luns;
struct list_head area_list;
struct mutex lock;
struct list_head targets;
};
struct gennvm_area {
struct gen_area {
struct list_head list;
sector_t begin;
sector_t end; /* end is excluded */
};
#define gennvm_for_each_lun(bm, lun, i) \
#define gen_for_each_lun(bm, lun, i) \
for ((i) = 0, lun = &(bm)->luns[0]; \
(i) < (bm)->nr_luns; (i)++, lun = &(bm)->luns[(i)])

View File

@ -48,7 +48,7 @@ static void rrpc_page_invalidate(struct rrpc *rrpc, struct rrpc_addr *a)
}
static void rrpc_invalidate_range(struct rrpc *rrpc, sector_t slba,
unsigned len)
unsigned int len)
{
sector_t i;
@ -96,10 +96,13 @@ static void rrpc_discard(struct rrpc *rrpc, struct bio *bio)
sector_t len = bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE;
struct nvm_rq *rqd;
do {
while (1) {
rqd = rrpc_inflight_laddr_acquire(rrpc, slba, len);
if (rqd)
break;
schedule();
} while (!rqd);
}
if (IS_ERR(rqd)) {
pr_err("rrpc: unable to acquire inflight IO\n");
@ -172,39 +175,32 @@ static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_dev *dev, u64 addr)
}
/* requires lun->lock taken */
static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *rblk)
static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *new_rblk,
struct rrpc_block **cur_rblk)
{
struct rrpc *rrpc = rlun->rrpc;
BUG_ON(!rblk);
if (rlun->cur) {
spin_lock(&rlun->cur->lock);
WARN_ON(!block_is_full(rrpc, rlun->cur));
spin_unlock(&rlun->cur->lock);
if (*cur_rblk) {
spin_lock(&(*cur_rblk)->lock);
WARN_ON(!block_is_full(rrpc, *cur_rblk));
spin_unlock(&(*cur_rblk)->lock);
}
rlun->cur = rblk;
*cur_rblk = new_rblk;
}
static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun,
unsigned long flags)
{
struct nvm_lun *lun = rlun->parent;
struct nvm_block *blk;
struct rrpc_block *rblk;
spin_lock(&lun->lock);
blk = nvm_get_blk_unlocked(rrpc->dev, rlun->parent, flags);
blk = nvm_get_blk(rrpc->dev, rlun->parent, flags);
if (!blk) {
pr_err("nvm: rrpc: cannot get new block from media manager\n");
spin_unlock(&lun->lock);
return NULL;
}
rblk = rrpc_get_rblk(rlun, blk->id);
list_add_tail(&rblk->list, &rlun->open_list);
spin_unlock(&lun->lock);
blk->priv = rblk;
bitmap_zero(rblk->invalid_pages, rrpc->dev->sec_per_blk);
rblk->next_page = 0;
@ -216,13 +212,7 @@ static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun,
static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk)
{
struct rrpc_lun *rlun = rblk->rlun;
struct nvm_lun *lun = rlun->parent;
spin_lock(&lun->lock);
nvm_put_blk_unlocked(rrpc->dev, rblk->parent);
list_del(&rblk->list);
spin_unlock(&lun->lock);
nvm_put_blk(rrpc->dev, rblk->parent);
}
static void rrpc_put_blks(struct rrpc *rrpc)
@ -508,21 +498,11 @@ static void rrpc_gc_queue(struct work_struct *work)
struct rrpc *rrpc = gcb->rrpc;
struct rrpc_block *rblk = gcb->rblk;
struct rrpc_lun *rlun = rblk->rlun;
struct nvm_lun *lun = rblk->parent->lun;
struct nvm_block *blk = rblk->parent;
spin_lock(&rlun->lock);
list_add_tail(&rblk->prio, &rlun->prio_list);
spin_unlock(&rlun->lock);
spin_lock(&lun->lock);
lun->nr_open_blocks--;
lun->nr_closed_blocks++;
blk->state &= ~NVM_BLK_ST_OPEN;
blk->state |= NVM_BLK_ST_CLOSED;
list_move_tail(&rblk->list, &rlun->closed_list);
spin_unlock(&lun->lock);
mempool_free(gcb, rrpc->gcb_pool);
pr_debug("nvm: block '%lu' is full, allow GC (sched)\n",
rblk->parent->id);
@ -596,21 +576,20 @@ out:
return addr;
}
/* Simple round-robin Logical to physical address translation.
/* Map logical address to a physical page. The mapping implements a round robin
* approach and allocates a page from the next lun available.
*
* Retrieve the mapping using the active append point. Then update the ap for
* the next write to the disk.
*
* Returns rrpc_addr with the physical address and block. Remember to return to
* rrpc->addr_cache when request is finished.
* Returns rrpc_addr with the physical address and block. Returns NULL if no
* blocks in the next rlun are available.
*/
static struct rrpc_addr *rrpc_map_page(struct rrpc *rrpc, sector_t laddr,
int is_gc)
{
struct rrpc_lun *rlun;
struct rrpc_block *rblk;
struct rrpc_block *rblk, **cur_rblk;
struct nvm_lun *lun;
u64 paddr;
int gc_force = 0;
rlun = rrpc_get_lun_rr(rrpc, is_gc);
lun = rlun->parent;
@ -618,41 +597,65 @@ static struct rrpc_addr *rrpc_map_page(struct rrpc *rrpc, sector_t laddr,
if (!is_gc && lun->nr_free_blocks < rrpc->nr_luns * 4)
return NULL;
spin_lock(&rlun->lock);
/*
* page allocation steps:
* 1. Try to allocate new page from current rblk
* 2a. If succeed, proceed to map it in and return
* 2b. If fail, first try to allocate a new block from media manger,
* and then retry step 1. Retry until the normal block pool is
* exhausted.
* 3. If exhausted, and garbage collector is requesting the block,
* go to the reserved block and retry step 1.
* In the case that this fails as well, or it is not GC
* requesting, report not able to retrieve a block and let the
* caller handle further processing.
*/
spin_lock(&rlun->lock);
cur_rblk = &rlun->cur;
rblk = rlun->cur;
retry:
paddr = rrpc_alloc_addr(rrpc, rblk);
if (paddr == ADDR_EMPTY) {
rblk = rrpc_get_blk(rrpc, rlun, 0);
if (rblk) {
rrpc_set_lun_cur(rlun, rblk);
goto retry;
}
if (paddr != ADDR_EMPTY)
goto done;
if (is_gc) {
/* retry from emergency gc block */
paddr = rrpc_alloc_addr(rrpc, rlun->gc_cur);
if (paddr == ADDR_EMPTY) {
rblk = rrpc_get_blk(rrpc, rlun, 1);
if (!rblk) {
pr_err("rrpc: no more blocks");
goto err;
}
if (!list_empty(&rlun->wblk_list)) {
new_blk:
rblk = list_first_entry(&rlun->wblk_list, struct rrpc_block,
prio);
rrpc_set_lun_cur(rlun, rblk, cur_rblk);
list_del(&rblk->prio);
goto retry;
}
spin_unlock(&rlun->lock);
rlun->gc_cur = rblk;
paddr = rrpc_alloc_addr(rrpc, rlun->gc_cur);
}
rblk = rlun->gc_cur;
}
rblk = rrpc_get_blk(rrpc, rlun, gc_force);
if (rblk) {
spin_lock(&rlun->lock);
list_add_tail(&rblk->prio, &rlun->wblk_list);
/*
* another thread might already have added a new block,
* Therefore, make sure that one is used, instead of the
* one just added.
*/
goto new_blk;
}
if (unlikely(is_gc) && !gc_force) {
/* retry from emergency gc block */
cur_rblk = &rlun->gc_cur;
rblk = rlun->gc_cur;
gc_force = 1;
spin_lock(&rlun->lock);
goto retry;
}
pr_err("rrpc: failed to allocate new block\n");
return NULL;
done:
spin_unlock(&rlun->lock);
return rrpc_update_map(rrpc, laddr, rblk, paddr);
err:
spin_unlock(&rlun->lock);
return NULL;
}
static void rrpc_run_gc(struct rrpc *rrpc, struct rrpc_block *rblk)
@ -850,14 +853,14 @@ static int rrpc_setup_rq(struct rrpc *rrpc, struct bio *bio,
return NVM_IO_ERR;
}
if (bio_rw(bio) == WRITE)
if (bio_op(bio) == REQ_OP_WRITE)
return rrpc_write_ppalist_rq(rrpc, bio, rqd, flags,
npages);
return rrpc_read_ppalist_rq(rrpc, bio, rqd, flags, npages);
}
if (bio_rw(bio) == WRITE)
if (bio_op(bio) == REQ_OP_WRITE)
return rrpc_write_rq(rrpc, bio, rqd, flags);
return rrpc_read_rq(rrpc, bio, rqd, flags);
@ -1196,8 +1199,7 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
rlun->rrpc = rrpc;
INIT_LIST_HEAD(&rlun->prio_list);
INIT_LIST_HEAD(&rlun->open_list);
INIT_LIST_HEAD(&rlun->closed_list);
INIT_LIST_HEAD(&rlun->wblk_list);
INIT_WORK(&rlun->ws_gc, rrpc_lun_gc);
spin_lock_init(&rlun->lock);
@ -1338,14 +1340,13 @@ static int rrpc_luns_configure(struct rrpc *rrpc)
rblk = rrpc_get_blk(rrpc, rlun, 0);
if (!rblk)
goto err;
rrpc_set_lun_cur(rlun, rblk);
rrpc_set_lun_cur(rlun, rblk, &rlun->cur);
/* Emergency gc block */
rblk = rrpc_get_blk(rrpc, rlun, 1);
if (!rblk)
goto err;
rlun->gc_cur = rblk;
rrpc_set_lun_cur(rlun, rblk, &rlun->gc_cur);
}
return 0;

View File

@ -56,7 +56,6 @@ struct rrpc_block {
struct nvm_block *parent;
struct rrpc_lun *rlun;
struct list_head prio;
struct list_head list;
#define MAX_INVALID_PAGES_STORAGE 8
/* Bitmap for invalid page intries */
@ -77,13 +76,7 @@ struct rrpc_lun {
struct rrpc_block *blocks; /* Reference to block allocation */
struct list_head prio_list; /* Blocks that may be GC'ed */
struct list_head open_list; /* In-use open blocks. These are blocks
* that can be both written to and read
* from
*/
struct list_head closed_list; /* In-use closed blocks. These are
* blocks that can _only_ be read from
*/
struct list_head wblk_list; /* Queued blocks to be written to */
struct work_struct ws_gc;
@ -188,7 +181,7 @@ static inline int request_intersects(struct rrpc_inflight_rq *r,
}
static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
unsigned pages, struct rrpc_inflight_rq *r)
unsigned int pages, struct rrpc_inflight_rq *r)
{
sector_t laddr_end = laddr + pages - 1;
struct rrpc_inflight_rq *rtmp;
@ -213,7 +206,7 @@ static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
}
static inline int rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
unsigned pages,
unsigned int pages,
struct rrpc_inflight_rq *r)
{
BUG_ON((laddr + pages) > rrpc->nr_sects);

View File

@ -39,7 +39,8 @@ static inline int scan_ppa_idx(int row, int blkid)
return (row * MAX_BLKS_PR_SYSBLK) + blkid;
}
void nvm_sysblk_to_cpu(struct nvm_sb_info *info, struct nvm_system_block *sb)
static void nvm_sysblk_to_cpu(struct nvm_sb_info *info,
struct nvm_system_block *sb)
{
info->seqnr = be32_to_cpu(sb->seqnr);
info->erase_cnt = be32_to_cpu(sb->erase_cnt);
@ -48,7 +49,8 @@ void nvm_sysblk_to_cpu(struct nvm_sb_info *info, struct nvm_system_block *sb)
info->fs_ppa.ppa = be64_to_cpu(sb->fs_ppa);
}
void nvm_cpu_to_sysblk(struct nvm_system_block *sb, struct nvm_sb_info *info)
static void nvm_cpu_to_sysblk(struct nvm_system_block *sb,
struct nvm_sb_info *info)
{
sb->magic = cpu_to_be32(NVM_SYSBLK_MAGIC);
sb->seqnr = cpu_to_be32(info->seqnr);
@ -86,7 +88,7 @@ static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas)
return nr_rows;
}
void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s,
static void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s,
struct ppa_addr *sysblk_ppas)
{
memset(s, 0, sizeof(struct sysblk_scan));

View File

@ -112,7 +112,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
EXPORT_SYMBOL(closure_wait);
/**
* closure_sync - sleep until a closure a closure has nothing left to wait on
* closure_sync - sleep until a closure has nothing left to wait on
*
* Sleeps until the refcount hits 1 - the thread that's running the closure owns
* the last refcount.

View File

@ -31,7 +31,8 @@
* passing it, as you might expect, the function to run when nothing is pending
* and the workqueue to run that function out of.
*
* continue_at() also, critically, is a macro that returns the calling function.
* continue_at() also, critically, requires a 'return' immediately following the
* location where this macro is referenced, to return to the calling function.
* There's good reason for this.
*
* To use safely closures asynchronously, they must always have a refcount while

View File

@ -25,7 +25,6 @@ struct bio *bch_bbio_alloc(struct cache_set *c)
struct bio *bio = &b->bio;
bio_init(bio);
bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
bio->bi_max_vecs = bucket_pages(c);
bio->bi_io_vec = bio->bi_inline_vecs;

View File

@ -134,7 +134,6 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
case BCACHE_SB_VERSION_CDEV:
case BCACHE_SB_VERSION_CDEV_WITH_UUID:
sb->nbuckets = le64_to_cpu(s->nbuckets);
sb->block_size = le16_to_cpu(s->block_size);
sb->bucket_size = le16_to_cpu(s->bucket_size);
sb->nr_in_set = le16_to_cpu(s->nr_in_set);
@ -1520,7 +1519,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
!(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
!(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
!(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
!(c->moving_gc_wq = create_workqueue("bcache_gc")) ||
!(c->moving_gc_wq = alloc_workqueue("bcache_gc",
WQ_MEM_RECLAIM, 0)) ||
bch_journal_alloc(c) ||
bch_btree_cache_alloc(c) ||
bch_open_buckets_alloc(c) ||
@ -1805,7 +1805,7 @@ void bch_cache_release(struct kobject *kobj)
module_put(THIS_MODULE);
}
static int cache_alloc(struct cache_sb *sb, struct cache *ca)
static int cache_alloc(struct cache *ca)
{
size_t free;
struct bucket *b;
@ -1860,7 +1860,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
if (blk_queue_discard(bdev_get_queue(ca->bdev)))
ca->discard = CACHE_DISCARD(&ca->sb);
ret = cache_alloc(sb, ca);
ret = cache_alloc(ca);
if (ret != 0)
goto err;
@ -2099,7 +2099,7 @@ static int __init bcache_init(void)
return bcache_major;
}
if (!(bcache_wq = create_workqueue("bcache")) ||
if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
!(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
sysfs_create_files(bcache_kobj, files) ||
bch_request_init() ||

View File

@ -528,7 +528,7 @@ static void read_callback(unsigned long error, void *context)
DMWARN_LIMIT("Read failure on mirror device %s. "
"Trying alternative device.",
m->dev->name);
queue_bio(m->ms, bio, bio_rw(bio));
queue_bio(m->ms, bio, bio_data_dir(bio));
return;
}
@ -1193,7 +1193,7 @@ static void mirror_dtr(struct dm_target *ti)
*/
static int mirror_map(struct dm_target *ti, struct bio *bio)
{
int r, rw = bio_rw(bio);
int r, rw = bio_data_dir(bio);
struct mirror *m;
struct mirror_set *ms = ti->private;
struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
@ -1217,7 +1217,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
* If region is not in-sync queue the bio.
*/
if (!r || (r == -EWOULDBLOCK)) {
if (rw == READA)
if (bio->bi_rw & REQ_RAHEAD)
return -EWOULDBLOCK;
queue_bio(ms, bio, rw);
@ -1242,7 +1242,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
{
int rw = bio_rw(bio);
int rw = bio_data_dir(bio);
struct mirror_set *ms = (struct mirror_set *) ti->private;
struct mirror *m = NULL;
struct dm_bio_details *bd = NULL;

View File

@ -1696,7 +1696,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
* to copy an exception */
down_write(&s->lock);
if (!s->valid || (unlikely(s->snapshot_overflowed) && bio_rw(bio) == WRITE)) {
if (!s->valid || (unlikely(s->snapshot_overflowed) &&
bio_data_dir(bio) == WRITE)) {
r = -EIO;
goto out_unlock;
}
@ -1713,7 +1714,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
* flags so we should only get this if we are
* writeable.
*/
if (bio_rw(bio) == WRITE) {
if (bio_data_dir(bio) == WRITE) {
pe = __lookup_pending_exception(s, chunk);
if (!pe) {
up_write(&s->lock);
@ -1819,7 +1820,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
e = dm_lookup_exception(&s->complete, chunk);
if (e) {
/* Queue writes overlapping with chunks being merged */
if (bio_rw(bio) == WRITE &&
if (bio_data_dir(bio) == WRITE &&
chunk >= s->first_merging_chunk &&
chunk < (s->first_merging_chunk +
s->num_merging_chunks)) {
@ -1831,7 +1832,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
remap_exception(s, e, bio, chunk);
if (bio_rw(bio) == WRITE)
if (bio_data_dir(bio) == WRITE)
track_chunk(s, bio, chunk);
goto out_unlock;
}
@ -1839,7 +1840,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
redirect_to_origin:
bio->bi_bdev = s->origin->bdev;
if (bio_rw(bio) == WRITE) {
if (bio_data_dir(bio) == WRITE) {
up_write(&s->lock);
return do_origin(s->origin, bio);
}
@ -2288,7 +2289,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
if (unlikely(bio->bi_rw & REQ_PREFLUSH))
return DM_MAPIO_REMAPPED;
if (bio_rw(bio) != WRITE)
if (bio_data_dir(bio) != WRITE)
return DM_MAPIO_REMAPPED;
available_sectors = o->split_boundary -

View File

@ -35,16 +35,19 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
*/
static int zero_map(struct dm_target *ti, struct bio *bio)
{
switch(bio_rw(bio)) {
case READ:
switch (bio_op(bio)) {
case REQ_OP_READ:
if (bio->bi_rw & REQ_RAHEAD) {
/* readahead of null bytes only wastes buffer cache */
return -EIO;
}
zero_fill_bio(bio);
break;
case READA:
/* readahead of null bytes only wastes buffer cache */
return -EIO;
case WRITE:
case REQ_OP_WRITE:
/* writes get silently dropped */
break;
default:
return -EIO;
}
bio_endio(bio);

View File

@ -1833,7 +1833,7 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
dm_put_live_table(md, srcu_idx);
if (bio_rw(bio) != READA)
if (!(bio->bi_rw & REQ_RAHEAD))
queue_io(md, bio);
else
bio_io_error(bio);

View File

@ -1058,7 +1058,6 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
const unsigned long do_flush_fua = (bio->bi_rw &
(REQ_PREFLUSH | REQ_FUA));
const unsigned long do_sec = (bio->bi_rw & REQ_SECURE);
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
struct raid1_plug_cb *plug = NULL;
@ -1106,7 +1105,7 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
bitmap = mddev->bitmap;
/*
* make_request() can abort the operation when READA is being
* make_request() can abort the operation when read-ahead is being
* used and no empty request is available.
*
*/
@ -1376,7 +1375,7 @@ read_again:
conf->mirrors[i].rdev->data_offset);
mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = raid1_end_write_request;
bio_set_op_attrs(mbio, op, do_flush_fua | do_sync | do_sec);
bio_set_op_attrs(mbio, op, do_flush_fua | do_sync);
mbio->bi_private = r1_bio;
atomic_inc(&r1_bio->remaining);

View File

@ -1062,7 +1062,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
const unsigned long do_sec = (bio->bi_rw & REQ_SECURE);
unsigned long flags;
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
@ -1362,7 +1361,7 @@ retry_write:
rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
bio_set_op_attrs(mbio, op, do_sync | do_fua | do_sec);
bio_set_op_attrs(mbio, op, do_sync | do_fua);
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
@ -1404,7 +1403,7 @@ retry_write:
r10_bio, rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
bio_set_op_attrs(mbio, op, do_sync | do_fua | do_sec);
bio_set_op_attrs(mbio, op, do_sync | do_fua);
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);

View File

@ -5233,7 +5233,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
(unsigned long long)logical_sector);
sh = raid5_get_active_stripe(conf, new_sector, previous,
(bi->bi_rw&RWA_MASK), 0);
(bi->bi_rw & REQ_RAHEAD), 0);
if (sh) {
if (unlikely(previous)) {
/* expansion might have moved on while waiting for a

View File

@ -2002,8 +2002,7 @@ static int msb_bd_getgeo(struct block_device *bdev,
static int msb_prepare_req(struct request_queue *q, struct request *req)
{
if (req->cmd_type != REQ_TYPE_FS &&
req->cmd_type != REQ_TYPE_BLOCK_PC) {
if (req->cmd_type != REQ_TYPE_FS) {
blk_dump_rq_flags(req, "MS unsupported request");
return BLKPREP_KILL;
}
@ -2146,7 +2145,6 @@ static int msb_init_disk(struct memstick_dev *card)
msb->disk->fops = &msb_bdops;
msb->disk->private_data = msb;
msb->disk->queue = msb->queue;
msb->disk->driverfs_dev = &card->dev;
msb->disk->flags |= GENHD_FL_EXT_DEVT;
capacity = msb->pages_in_block * msb->logical_block_count;
@ -2163,7 +2161,7 @@ static int msb_init_disk(struct memstick_dev *card)
set_disk_ro(msb->disk, 1);
msb_start(card);
add_disk(msb->disk);
device_add_disk(&card->dev, msb->disk);
dbg("Disk added");
return 0;

View File

@ -829,8 +829,7 @@ static void mspro_block_start(struct memstick_dev *card)
static int mspro_block_prepare_req(struct request_queue *q, struct request *req)
{
if (req->cmd_type != REQ_TYPE_FS &&
req->cmd_type != REQ_TYPE_BLOCK_PC) {
if (req->cmd_type != REQ_TYPE_FS) {
blk_dump_rq_flags(req, "MSPro unsupported request");
return BLKPREP_KILL;
}
@ -1243,7 +1242,6 @@ static int mspro_block_init_disk(struct memstick_dev *card)
msb->usage_count = 1;
msb->disk->private_data = msb;
msb->disk->queue = msb->queue;
msb->disk->driverfs_dev = &card->dev;
sprintf(msb->disk->disk_name, "mspblk%d", disk_id);
@ -1255,7 +1253,7 @@ static int mspro_block_init_disk(struct memstick_dev *card)
set_capacity(msb->disk, capacity);
dev_dbg(&card->dev, "capacity set %ld\n", capacity);
add_disk(msb->disk);
device_add_disk(&card->dev, msb->disk);
msb->active = 1;
return 0;

View File

@ -93,6 +93,7 @@ static DEFINE_SPINLOCK(mmc_blk_lock);
*/
struct mmc_blk_data {
spinlock_t lock;
struct device *parent;
struct gendisk *disk;
struct mmc_queue queue;
struct list_head part;
@ -2169,10 +2170,12 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
/* complete ongoing async transfer before issuing discard */
if (card->host->areq)
mmc_blk_issue_rw_rq(mq, NULL);
if (req->cmd_flags & REQ_SECURE)
ret = mmc_blk_issue_secdiscard_rq(mq, req);
else
ret = mmc_blk_issue_discard_rq(mq, req);
ret = mmc_blk_issue_discard_rq(mq, req);
} else if (req && req_op(req) == REQ_OP_SECURE_ERASE) {
/* complete ongoing async transfer before issuing secure erase*/
if (card->host->areq)
mmc_blk_issue_rw_rq(mq, NULL);
ret = mmc_blk_issue_secdiscard_rq(mq, req);
} else if (req && req_op(req) == REQ_OP_FLUSH) {
/* complete ongoing async transfer before issuing flush */
if (card->host->areq)
@ -2270,7 +2273,7 @@ again:
md->disk->fops = &mmc_bdops;
md->disk->private_data = md;
md->disk->queue = md->queue.queue;
md->disk->driverfs_dev = parent;
md->parent = parent;
set_disk_ro(md->disk, md->read_only || default_ro);
md->disk->flags = GENHD_FL_EXT_DEVT;
if (area_type & (MMC_BLK_DATA_AREA_RPMB | MMC_BLK_DATA_AREA_BOOT))
@ -2458,7 +2461,7 @@ static int mmc_add_disk(struct mmc_blk_data *md)
int ret;
struct mmc_card *card = md->queue.card;
add_disk(md->disk);
device_add_disk(md->parent, md->disk);
md->force_ro.show = force_ro_show;
md->force_ro.store = force_ro_store;
sysfs_attr_init(&md->force_ro.attr);

View File

@ -171,7 +171,7 @@ static void mmc_queue_setup_discard(struct request_queue *q,
if (card->pref_erase > max_discard)
q->limits.discard_granularity = 0;
if (mmc_can_secure_erase_trim(card))
queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q);
queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, q);
}
/**

View File

@ -431,12 +431,10 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
goto error4;
INIT_WORK(&new->work, mtd_blktrans_work);
gd->driverfs_dev = &new->mtd->dev;
if (new->readonly)
set_disk_ro(gd, 1);
add_disk(gd);
device_add_disk(&new->mtd->dev, gd);
if (new->disk_attributes) {
ret = sysfs_create_group(&disk_to_dev(gd)->kobj,

View File

@ -287,14 +287,13 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk)
return -ENOMEM;
}
disk->driverfs_dev = dev;
disk->first_minor = 0;
disk->fops = &nd_blk_fops;
disk->queue = q;
disk->flags = GENHD_FL_EXT_DEVT;
nvdimm_namespace_disk_name(&nsblk->common, disk->disk_name);
set_capacity(disk, 0);
add_disk(disk);
device_add_disk(dev, disk);
if (nsblk_meta_size(nsblk)) {
int rc = nd_integrity_init(disk, nsblk_meta_size(nsblk));

View File

@ -1243,7 +1243,6 @@ static int btt_blk_init(struct btt *btt)
}
nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name);
btt->btt_disk->driverfs_dev = &btt->nd_btt->dev;
btt->btt_disk->first_minor = 0;
btt->btt_disk->fops = &btt_fops;
btt->btt_disk->private_data = btt;
@ -1258,7 +1257,7 @@ static int btt_blk_init(struct btt *btt)
btt->btt_queue->queuedata = btt;
set_capacity(btt->btt_disk, 0);
add_disk(btt->btt_disk);
device_add_disk(&btt->nd_btt->dev, btt->btt_disk);
if (btt_meta_size(btt)) {
int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt));

View File

@ -312,7 +312,7 @@ EXPORT_SYMBOL(__nd_driver_register);
int nvdimm_revalidate_disk(struct gendisk *disk)
{
struct device *dev = disk->driverfs_dev;
struct device *dev = disk_to_dev(disk)->parent;
struct nd_region *nd_region = to_nd_region(dev->parent);
const char *pol = nd_region->ro ? "only" : "write";

View File

@ -298,14 +298,13 @@ static int pmem_attach_disk(struct device *dev,
disk->queue = q;
disk->flags = GENHD_FL_EXT_DEVT;
nvdimm_namespace_disk_name(ndns, disk->disk_name);
disk->driverfs_dev = dev;
set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
/ 512);
if (devm_init_badblocks(dev, &pmem->bb))
return -ENOMEM;
nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb, res);
disk->bb = &pmem->bb;
add_disk(disk);
device_add_disk(dev, disk);
revalidate_disk(disk);
return 0;

View File

@ -1 +1,2 @@
source "drivers/nvme/host/Kconfig"
source "drivers/nvme/target/Kconfig"

View File

@ -1,2 +1,3 @@
obj-y += host/
obj-y += target/

View File

@ -24,3 +24,22 @@ config BLK_DEV_NVME_SCSI
to say N here, unless you run a distro that abuses the SCSI
emulation to provide stable device names for mount by id, like
some OpenSuSE and SLES versions.
config NVME_FABRICS
tristate
config NVME_RDMA
tristate "NVM Express over Fabrics RDMA host driver"
depends on INFINIBAND
depends on BLK_DEV_NVME
select NVME_FABRICS
select SG_POOL
help
This provides support for the NVMe over Fabrics protocol using
the RDMA (Infiniband, RoCE, iWarp) transport. This allows you
to use remote block devices exported using the NVMe protocol set.
To configure a NVMe over Fabrics controller use the nvme-cli tool
from https://github.com/linux-nvme/nvme-cli.
If unsure, say N.

View File

@ -1,8 +1,14 @@
obj-$(CONFIG_NVME_CORE) += nvme-core.o
obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o
obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o
nvme-core-y := core.o
nvme-core-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o
nvme-core-$(CONFIG_NVM) += lightnvm.o
nvme-y += pci.o
nvme-fabrics-y += fabrics.o
nvme-rdma-y += rdma.o

View File

@ -30,6 +30,7 @@
#include <asm/unaligned.h>
#include "nvme.h"
#include "fabrics.h"
#define NVME_MINORS (1U << MINORBITS)
@ -47,8 +48,10 @@ unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
static int nvme_major;
module_param(nvme_major, int, 0);
unsigned int nvme_max_retries = 5;
module_param_named(max_retries, nvme_max_retries, uint, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
EXPORT_SYMBOL_GPL(nvme_max_retries);
static int nvme_char_major;
module_param(nvme_char_major, int, 0);
@ -58,6 +61,23 @@ static DEFINE_SPINLOCK(dev_list_lock);
static struct class *nvme_class;
void nvme_cancel_request(struct request *req, void *data, bool reserved)
{
int status;
if (!blk_mq_request_started(req))
return;
dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
"Cancelling I/O %d", req->tag);
status = NVME_SC_ABORT_REQ;
if (blk_queue_dying(req->q))
status |= NVME_SC_DNR;
blk_mq_complete_request(req, status);
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state)
{
@ -68,7 +88,9 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
switch (new_state) {
case NVME_CTRL_LIVE:
switch (old_state) {
case NVME_CTRL_NEW:
case NVME_CTRL_RESETTING:
case NVME_CTRL_RECONNECTING:
changed = true;
/* FALLTHRU */
default:
@ -78,6 +100,16 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_RESETTING:
switch (old_state) {
case NVME_CTRL_NEW:
case NVME_CTRL_LIVE:
case NVME_CTRL_RECONNECTING:
changed = true;
/* FALLTHRU */
default:
break;
}
break;
case NVME_CTRL_RECONNECTING:
switch (old_state) {
case NVME_CTRL_LIVE:
changed = true;
/* FALLTHRU */
@ -89,6 +121,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
switch (old_state) {
case NVME_CTRL_LIVE:
case NVME_CTRL_RESETTING:
case NVME_CTRL_RECONNECTING:
changed = true;
/* FALLTHRU */
default:
@ -174,21 +207,21 @@ void nvme_requeue_req(struct request *req)
EXPORT_SYMBOL_GPL(nvme_requeue_req);
struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, unsigned int flags)
struct nvme_command *cmd, unsigned int flags, int qid)
{
bool write = cmd->common.opcode & 1;
struct request *req;
req = blk_mq_alloc_request(q, write, flags);
if (qid == NVME_QID_ANY) {
req = blk_mq_alloc_request(q, nvme_is_write(cmd), flags);
} else {
req = blk_mq_alloc_request_hctx(q, nvme_is_write(cmd), flags,
qid ? qid - 1 : 0);
}
if (IS_ERR(req))
return req;
req->cmd_type = REQ_TYPE_DRV_PRIV;
req->cmd_flags |= REQ_FAILFAST_DRIVER;
req->__data_len = 0;
req->__sector = (sector_t) -1;
req->bio = req->biotail = NULL;
req->cmd = (unsigned char *)cmd;
req->cmd_len = sizeof(struct nvme_command);
@ -307,12 +340,12 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd);
*/
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
struct nvme_completion *cqe, void *buffer, unsigned bufflen,
unsigned timeout)
unsigned timeout, int qid, int at_head, int flags)
{
struct request *req;
int ret;
req = nvme_alloc_request(q, cmd, 0);
req = nvme_alloc_request(q, cmd, flags, qid);
if (IS_ERR(req))
return PTR_ERR(req);
@ -325,17 +358,19 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
goto out;
}
blk_execute_rq(req->q, NULL, req, 0);
blk_execute_rq(req->q, NULL, req, at_head);
ret = req->errors;
out:
blk_mq_free_request(req);
return ret;
}
EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buffer, unsigned bufflen)
{
return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0);
return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
NVME_QID_ANY, 0, 0);
}
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
@ -344,7 +379,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
u32 *result, unsigned timeout)
{
bool write = cmd->common.opcode & 1;
bool write = nvme_is_write(cmd);
struct nvme_completion cqe;
struct nvme_ns *ns = q->queuedata;
struct gendisk *disk = ns ? ns->disk : NULL;
@ -353,7 +388,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
void *meta = NULL;
int ret;
req = nvme_alloc_request(q, cmd, 0);
req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
if (IS_ERR(req))
return PTR_ERR(req);
@ -439,6 +474,74 @@ int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
result, timeout);
}
static void nvme_keep_alive_end_io(struct request *rq, int error)
{
struct nvme_ctrl *ctrl = rq->end_io_data;
blk_mq_free_request(rq);
if (error) {
dev_err(ctrl->device,
"failed nvme_keep_alive_end_io error=%d\n", error);
return;
}
schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
}
static int nvme_keep_alive(struct nvme_ctrl *ctrl)
{
struct nvme_command c;
struct request *rq;
memset(&c, 0, sizeof(c));
c.common.opcode = nvme_admin_keep_alive;
rq = nvme_alloc_request(ctrl->admin_q, &c, BLK_MQ_REQ_RESERVED,
NVME_QID_ANY);
if (IS_ERR(rq))
return PTR_ERR(rq);
rq->timeout = ctrl->kato * HZ;
rq->end_io_data = ctrl;
blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
return 0;
}
static void nvme_keep_alive_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
struct nvme_ctrl, ka_work);
if (nvme_keep_alive(ctrl)) {
/* allocation failure, reset the controller */
dev_err(ctrl->device, "keep-alive failed\n");
ctrl->ops->reset_ctrl(ctrl);
return;
}
}
void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
{
if (unlikely(ctrl->kato == 0))
return;
INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
}
EXPORT_SYMBOL_GPL(nvme_start_keep_alive);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
{
if (unlikely(ctrl->kato == 0))
return;
cancel_delayed_work_sync(&ctrl->ka_work);
}
EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
{
struct nvme_command c = { };
@ -500,10 +603,11 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
memset(&c, 0, sizeof(c));
c.features.opcode = nvme_admin_get_features;
c.features.nsid = cpu_to_le32(nsid);
c.features.prp1 = cpu_to_le64(dma_addr);
c.features.dptr.prp1 = cpu_to_le64(dma_addr);
c.features.fid = cpu_to_le32(fid);
ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0);
ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0,
NVME_QID_ANY, 0, 0);
if (ret >= 0)
*result = le32_to_cpu(cqe.result);
return ret;
@ -518,11 +622,12 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
memset(&c, 0, sizeof(c));
c.features.opcode = nvme_admin_set_features;
c.features.prp1 = cpu_to_le64(dma_addr);
c.features.dptr.prp1 = cpu_to_le64(dma_addr);
c.features.fid = cpu_to_le32(fid);
c.features.dword11 = cpu_to_le32(dword11);
ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0);
ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0,
NVME_QID_ANY, 0, 0);
if (ret >= 0)
*result = le32_to_cpu(cqe.result);
return ret;
@ -558,11 +663,22 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
&result);
if (status)
if (status < 0)
return status;
nr_io_queues = min(result & 0xffff, result >> 16) + 1;
*count = min(*count, nr_io_queues);
/*
* Degraded controllers might return an error when setting the queue
* count. We still want to be able to bring them online and offer
* access to the admin queue, as that might be only way to fix them up.
*/
if (status > 0) {
dev_err(ctrl->dev, "Could not set queue count (%d)\n", status);
*count = 0;
} else {
nr_io_queues = min(result & 0xffff, result >> 16) + 1;
*count = min(*count, nr_io_queues);
}
return 0;
}
EXPORT_SYMBOL_GPL(nvme_set_queue_count);
@ -726,6 +842,7 @@ static void nvme_init_integrity(struct nvme_ns *ns)
{
struct blk_integrity integrity;
memset(&integrity, 0, sizeof(integrity));
switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
integrity.profile = &t10_pi_type3_crc;
@ -764,7 +881,7 @@ static void nvme_config_discard(struct nvme_ns *ns)
ns->queue->limits.discard_alignment = logical_block_size;
ns->queue->limits.discard_granularity = logical_block_size;
blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
}
@ -991,6 +1108,15 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
/* Checking for ctrl->tagset is a trick to avoid sleeping on module
* load, since we only need the quirk on reset_controller. Notice
* that the HGST device needs this delay only in firmware activation
* procedure; unfortunately we have no (easy) way to verify this.
*/
if ((ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) && ctrl->tagset)
msleep(NVME_QUIRK_DELAY_AMOUNT);
return nvme_wait_ready(ctrl, cap, false);
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
@ -1088,6 +1214,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
struct nvme_id_ctrl *id;
u64 cap;
int ret, page_shift;
u32 max_hw_sectors;
ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
if (ret) {
@ -1120,9 +1247,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
memcpy(ctrl->model, id->mn, sizeof(id->mn));
memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
if (id->mdts)
ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9);
max_hw_sectors = 1 << (id->mdts + page_shift - 9);
else
ctrl->max_hw_sectors = UINT_MAX;
max_hw_sectors = UINT_MAX;
ctrl->max_hw_sectors =
min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) {
unsigned int max_hw_sectors;
@ -1138,9 +1267,33 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
}
nvme_set_queue_limits(ctrl, ctrl->admin_q);
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
if (ctrl->ops->is_fabrics) {
ctrl->icdoff = le16_to_cpu(id->icdoff);
ctrl->ioccsz = le32_to_cpu(id->ioccsz);
ctrl->iorcsz = le32_to_cpu(id->iorcsz);
ctrl->maxcmd = le16_to_cpu(id->maxcmd);
/*
* In fabrics we need to verify the cntlid matches the
* admin connect
*/
if (ctrl->cntlid != le16_to_cpu(id->cntlid))
ret = -EINVAL;
if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
dev_err(ctrl->dev,
"keep-alive support is mandatory for fabrics\n");
ret = -EINVAL;
}
} else {
ctrl->cntlid = le16_to_cpu(id->cntlid);
}
kfree(id);
return 0;
return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_identify);
@ -1322,7 +1475,7 @@ static struct attribute *nvme_ns_attrs[] = {
NULL,
};
static umode_t nvme_attrs_are_visible(struct kobject *kobj,
static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
{
struct device *dev = container_of(kobj, struct device, kobj);
@ -1341,7 +1494,7 @@ static umode_t nvme_attrs_are_visible(struct kobject *kobj,
static const struct attribute_group nvme_ns_attr_group = {
.attrs = nvme_ns_attrs,
.is_visible = nvme_attrs_are_visible,
.is_visible = nvme_ns_attrs_are_visible,
};
#define nvme_show_str_function(field) \
@ -1367,6 +1520,49 @@ nvme_show_str_function(serial);
nvme_show_str_function(firmware_rev);
nvme_show_int_function(cntlid);
static ssize_t nvme_sysfs_delete(struct device *dev,
struct device_attribute *attr, const char *buf,
size_t count)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
if (device_remove_file_self(dev, attr))
ctrl->ops->delete_ctrl(ctrl);
return count;
}
static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
static ssize_t nvme_sysfs_show_transport(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name);
}
static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
return snprintf(buf, PAGE_SIZE, "%s\n",
ctrl->ops->get_subsysnqn(ctrl));
}
static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
static ssize_t nvme_sysfs_show_address(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
}
static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
static struct attribute *nvme_dev_attrs[] = {
&dev_attr_reset_controller.attr,
&dev_attr_rescan_controller.attr,
@ -1374,11 +1570,38 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_serial.attr,
&dev_attr_firmware_rev.attr,
&dev_attr_cntlid.attr,
&dev_attr_delete_controller.attr,
&dev_attr_transport.attr,
&dev_attr_subsysnqn.attr,
&dev_attr_address.attr,
NULL
};
#define CHECK_ATTR(ctrl, a, name) \
if ((a) == &dev_attr_##name.attr && \
!(ctrl)->ops->get_##name) \
return 0
static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
{
struct device *dev = container_of(kobj, struct device, kobj);
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
if (a == &dev_attr_delete_controller.attr) {
if (!ctrl->ops->delete_ctrl)
return 0;
}
CHECK_ATTR(ctrl, a, subsysnqn);
CHECK_ATTR(ctrl, a, address);
return a->mode;
}
static struct attribute_group nvme_dev_attrs_group = {
.attrs = nvme_dev_attrs,
.attrs = nvme_dev_attrs,
.is_visible = nvme_dev_attrs_are_visible,
};
static const struct attribute_group *nvme_dev_attr_groups[] = {
@ -1446,12 +1669,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
nvme_set_queue_limits(ctrl, ns->queue);
disk->major = nvme_major;
disk->first_minor = 0;
disk->fops = &nvme_fops;
disk->private_data = ns;
disk->queue = ns->queue;
disk->driverfs_dev = ctrl->device;
disk->flags = GENHD_FL_EXT_DEVT;
sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
@ -1466,7 +1686,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
if (ns->type == NVME_NS_LIGHTNVM)
return;
add_disk(ns->disk);
device_add_disk(ctrl->device, ns->disk);
if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
&nvme_ns_attr_group))
pr_warn("%s: failed to create sysfs group for identification\n",
@ -1517,6 +1737,17 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
nvme_alloc_ns(ctrl, nsid);
}
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid)
{
struct nvme_ns *ns, *next;
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
if (ns->ns_id > nsid)
nvme_ns_remove(ns);
}
}
static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
{
struct nvme_ns *ns;
@ -1531,7 +1762,7 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
for (i = 0; i < num_lists; i++) {
ret = nvme_identify_ns_list(ctrl, prev, ns_list);
if (ret)
goto out;
goto free;
for (j = 0; j < min(nn, 1024U); j++) {
nsid = le32_to_cpu(ns_list[j]);
@ -1551,22 +1782,20 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
nn -= j;
}
out:
nvme_remove_invalid_namespaces(ctrl, prev);
free:
kfree(ns_list);
return ret;
}
static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
{
struct nvme_ns *ns, *next;
unsigned i;
for (i = 1; i <= nn; i++)
nvme_validate_ns(ctrl, i);
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
if (ns->ns_id > nn)
nvme_ns_remove(ns);
}
nvme_remove_invalid_namespaces(ctrl, nn);
}
static void nvme_scan_work(struct work_struct *work)
@ -1852,16 +2081,10 @@ int __init nvme_core_init(void)
{
int result;
result = register_blkdev(nvme_major, "nvme");
if (result < 0)
return result;
else if (result > 0)
nvme_major = result;
result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
&nvme_dev_fops);
if (result < 0)
goto unregister_blkdev;
return result;
else if (result > 0)
nvme_char_major = result;
@ -1875,8 +2098,6 @@ int __init nvme_core_init(void)
unregister_chrdev:
__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
unregister_blkdev:
unregister_blkdev(nvme_major, "nvme");
return result;
}
@ -1884,7 +2105,6 @@ void nvme_core_exit(void)
{
class_destroy(nvme_class);
__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
unregister_blkdev(nvme_major, "nvme");
}
MODULE_LICENSE("GPL");

View File

@ -0,0 +1,952 @@
/*
* NVMe over Fabrics common host code.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/parser.h>
#include <linux/seq_file.h>
#include "nvme.h"
#include "fabrics.h"
static LIST_HEAD(nvmf_transports);
static DEFINE_MUTEX(nvmf_transports_mutex);
static LIST_HEAD(nvmf_hosts);
static DEFINE_MUTEX(nvmf_hosts_mutex);
static struct nvmf_host *nvmf_default_host;
static struct nvmf_host *__nvmf_host_find(const char *hostnqn)
{
struct nvmf_host *host;
list_for_each_entry(host, &nvmf_hosts, list) {
if (!strcmp(host->nqn, hostnqn))
return host;
}
return NULL;
}
static struct nvmf_host *nvmf_host_add(const char *hostnqn)
{
struct nvmf_host *host;
mutex_lock(&nvmf_hosts_mutex);
host = __nvmf_host_find(hostnqn);
if (host)
goto out_unlock;
host = kmalloc(sizeof(*host), GFP_KERNEL);
if (!host)
goto out_unlock;
kref_init(&host->ref);
memcpy(host->nqn, hostnqn, NVMF_NQN_SIZE);
uuid_le_gen(&host->id);
list_add_tail(&host->list, &nvmf_hosts);
out_unlock:
mutex_unlock(&nvmf_hosts_mutex);
return host;
}
static struct nvmf_host *nvmf_host_default(void)
{
struct nvmf_host *host;
host = kmalloc(sizeof(*host), GFP_KERNEL);
if (!host)
return NULL;
kref_init(&host->ref);
uuid_le_gen(&host->id);
snprintf(host->nqn, NVMF_NQN_SIZE,
"nqn.2014-08.org.nvmexpress:NVMf:uuid:%pUl", &host->id);
mutex_lock(&nvmf_hosts_mutex);
list_add_tail(&host->list, &nvmf_hosts);
mutex_unlock(&nvmf_hosts_mutex);
return host;
}
static void nvmf_host_destroy(struct kref *ref)
{
struct nvmf_host *host = container_of(ref, struct nvmf_host, ref);
mutex_lock(&nvmf_hosts_mutex);
list_del(&host->list);
mutex_unlock(&nvmf_hosts_mutex);
kfree(host);
}
static void nvmf_host_put(struct nvmf_host *host)
{
if (host)
kref_put(&host->ref, nvmf_host_destroy);
}
/**
* nvmf_get_address() - Get address/port
* @ctrl: Host NVMe controller instance which we got the address
* @buf: OUTPUT parameter that will contain the address/port
* @size: buffer size
*/
int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
return snprintf(buf, size, "traddr=%s,trsvcid=%s\n",
ctrl->opts->traddr, ctrl->opts->trsvcid);
}
EXPORT_SYMBOL_GPL(nvmf_get_address);
/**
* nvmf_get_subsysnqn() - Get subsystem NQN
* @ctrl: Host NVMe controller instance which we got the NQN
*/
const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl)
{
return ctrl->opts->subsysnqn;
}
EXPORT_SYMBOL_GPL(nvmf_get_subsysnqn);
/**
* nvmf_reg_read32() - NVMe Fabrics "Property Get" API function.
* @ctrl: Host NVMe controller instance maintaining the admin
* queue used to submit the property read command to
* the allocated NVMe controller resource on the target system.
* @off: Starting offset value of the targeted property
* register (see the fabrics section of the NVMe standard).
* @val: OUTPUT parameter that will contain the value of
* the property after a successful read.
*
* Used by the host system to retrieve a 32-bit capsule property value
* from an NVMe controller on the target system.
*
* ("Capsule property" is an "PCIe register concept" applied to the
* NVMe fabrics space.)
*
* Return:
* 0: successful read
* > 0: NVMe error status code
* < 0: Linux errno error code
*/
int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
struct nvme_command cmd;
struct nvme_completion cqe;
int ret;
memset(&cmd, 0, sizeof(cmd));
cmd.prop_get.opcode = nvme_fabrics_command;
cmd.prop_get.fctype = nvme_fabrics_type_property_get;
cmd.prop_get.offset = cpu_to_le32(off);
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe, NULL, 0, 0,
NVME_QID_ANY, 0, 0);
if (ret >= 0)
*val = le64_to_cpu(cqe.result64);
if (unlikely(ret != 0))
dev_err(ctrl->device,
"Property Get error: %d, offset %#x\n",
ret > 0 ? ret & ~NVME_SC_DNR : ret, off);
return ret;
}
EXPORT_SYMBOL_GPL(nvmf_reg_read32);
/**
* nvmf_reg_read64() - NVMe Fabrics "Property Get" API function.
* @ctrl: Host NVMe controller instance maintaining the admin
* queue used to submit the property read command to
* the allocated controller resource on the target system.
* @off: Starting offset value of the targeted property
* register (see the fabrics section of the NVMe standard).
* @val: OUTPUT parameter that will contain the value of
* the property after a successful read.
*
* Used by the host system to retrieve a 64-bit capsule property value
* from an NVMe controller on the target system.
*
* ("Capsule property" is an "PCIe register concept" applied to the
* NVMe fabrics space.)
*
* Return:
* 0: successful read
* > 0: NVMe error status code
* < 0: Linux errno error code
*/
int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
struct nvme_command cmd;
struct nvme_completion cqe;
int ret;
memset(&cmd, 0, sizeof(cmd));
cmd.prop_get.opcode = nvme_fabrics_command;
cmd.prop_get.fctype = nvme_fabrics_type_property_get;
cmd.prop_get.attrib = 1;
cmd.prop_get.offset = cpu_to_le32(off);
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe, NULL, 0, 0,
NVME_QID_ANY, 0, 0);
if (ret >= 0)
*val = le64_to_cpu(cqe.result64);
if (unlikely(ret != 0))
dev_err(ctrl->device,
"Property Get error: %d, offset %#x\n",
ret > 0 ? ret & ~NVME_SC_DNR : ret, off);
return ret;
}
EXPORT_SYMBOL_GPL(nvmf_reg_read64);
/**
* nvmf_reg_write32() - NVMe Fabrics "Property Write" API function.
* @ctrl: Host NVMe controller instance maintaining the admin
* queue used to submit the property read command to
* the allocated NVMe controller resource on the target system.
* @off: Starting offset value of the targeted property
* register (see the fabrics section of the NVMe standard).
* @val: Input parameter that contains the value to be
* written to the property.
*
* Used by the NVMe host system to write a 32-bit capsule property value
* to an NVMe controller on the target system.
*
* ("Capsule property" is an "PCIe register concept" applied to the
* NVMe fabrics space.)
*
* Return:
* 0: successful write
* > 0: NVMe error status code
* < 0: Linux errno error code
*/
int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
struct nvme_command cmd;
int ret;
memset(&cmd, 0, sizeof(cmd));
cmd.prop_set.opcode = nvme_fabrics_command;
cmd.prop_set.fctype = nvme_fabrics_type_property_set;
cmd.prop_set.attrib = 0;
cmd.prop_set.offset = cpu_to_le32(off);
cmd.prop_set.value = cpu_to_le64(val);
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0,
NVME_QID_ANY, 0, 0);
if (unlikely(ret))
dev_err(ctrl->device,
"Property Set error: %d, offset %#x\n",
ret > 0 ? ret & ~NVME_SC_DNR : ret, off);
return ret;
}
EXPORT_SYMBOL_GPL(nvmf_reg_write32);
/**
* nvmf_log_connect_error() - Error-parsing-diagnostic print
* out function for connect() errors.
*
* @ctrl: the specific /dev/nvmeX device that had the error.
*
* @errval: Error code to be decoded in a more human-friendly
* printout.
*
* @offset: For use with the NVMe error code NVME_SC_CONNECT_INVALID_PARAM.
*
* @cmd: This is the SQE portion of a submission capsule.
*
* @data: This is the "Data" portion of a submission capsule.
*/
static void nvmf_log_connect_error(struct nvme_ctrl *ctrl,
int errval, int offset, struct nvme_command *cmd,
struct nvmf_connect_data *data)
{
int err_sctype = errval & (~NVME_SC_DNR);
switch (err_sctype) {
case (NVME_SC_CONNECT_INVALID_PARAM):
if (offset >> 16) {
char *inv_data = "Connect Invalid Data Parameter";
switch (offset & 0xffff) {
case (offsetof(struct nvmf_connect_data, cntlid)):
dev_err(ctrl->device,
"%s, cntlid: %d\n",
inv_data, data->cntlid);
break;
case (offsetof(struct nvmf_connect_data, hostnqn)):
dev_err(ctrl->device,
"%s, hostnqn \"%s\"\n",
inv_data, data->hostnqn);
break;
case (offsetof(struct nvmf_connect_data, subsysnqn)):
dev_err(ctrl->device,
"%s, subsysnqn \"%s\"\n",
inv_data, data->subsysnqn);
break;
default:
dev_err(ctrl->device,
"%s, starting byte offset: %d\n",
inv_data, offset & 0xffff);
break;
}
} else {
char *inv_sqe = "Connect Invalid SQE Parameter";
switch (offset) {
case (offsetof(struct nvmf_connect_command, qid)):
dev_err(ctrl->device,
"%s, qid %d\n",
inv_sqe, cmd->connect.qid);
break;
default:
dev_err(ctrl->device,
"%s, starting byte offset: %d\n",
inv_sqe, offset);
}
}
break;
default:
dev_err(ctrl->device,
"Connect command failed, error wo/DNR bit: %d\n",
err_sctype);
break;
} /* switch (err_sctype) */
}
/**
* nvmf_connect_admin_queue() - NVMe Fabrics Admin Queue "Connect"
* API function.
* @ctrl: Host nvme controller instance used to request
* a new NVMe controller allocation on the target
* system and establish an NVMe Admin connection to
* that controller.
*
* This function enables an NVMe host device to request a new allocation of
* an NVMe controller resource on a target system as well establish a
* fabrics-protocol connection of the NVMe Admin queue between the
* host system device and the allocated NVMe controller on the
* target system via a NVMe Fabrics "Connect" command.
*
* Return:
* 0: success
* > 0: NVMe error status code
* < 0: Linux errno error code
*
*/
int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
{
struct nvme_command cmd;
struct nvme_completion cqe;
struct nvmf_connect_data *data;
int ret;
memset(&cmd, 0, sizeof(cmd));
cmd.connect.opcode = nvme_fabrics_command;
cmd.connect.fctype = nvme_fabrics_type_connect;
cmd.connect.qid = 0;
cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize);
/*
* Set keep-alive timeout in seconds granularity (ms * 1000)
* and add a grace period for controller kato enforcement
*/
cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 :
cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000);
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_le));
data->cntlid = cpu_to_le16(0xffff);
strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE);
strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe,
data, sizeof(*data), 0, NVME_QID_ANY, 1,
BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
if (ret) {
nvmf_log_connect_error(ctrl, ret, le32_to_cpu(cqe.result),
&cmd, data);
goto out_free_data;
}
ctrl->cntlid = le16_to_cpu(cqe.result16);
out_free_data:
kfree(data);
return ret;
}
EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue);
/**
* nvmf_connect_io_queue() - NVMe Fabrics I/O Queue "Connect"
* API function.
* @ctrl: Host nvme controller instance used to establish an
* NVMe I/O queue connection to the already allocated NVMe
* controller on the target system.
* @qid: NVMe I/O queue number for the new I/O connection between
* host and target (note qid == 0 is illegal as this is
* the Admin queue, per NVMe standard).
*
* This function issues a fabrics-protocol connection
* of a NVMe I/O queue (via NVMe Fabrics "Connect" command)
* between the host system device and the allocated NVMe controller
* on the target system.
*
* Return:
* 0: success
* > 0: NVMe error status code
* < 0: Linux errno error code
*/
int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
{
struct nvme_command cmd;
struct nvmf_connect_data *data;
struct nvme_completion cqe;
int ret;
memset(&cmd, 0, sizeof(cmd));
cmd.connect.opcode = nvme_fabrics_command;
cmd.connect.fctype = nvme_fabrics_type_connect;
cmd.connect.qid = cpu_to_le16(qid);
cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize);
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_le));
data->cntlid = cpu_to_le16(ctrl->cntlid);
strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE);
strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &cqe,
data, sizeof(*data), 0, qid, 1,
BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
if (ret) {
nvmf_log_connect_error(ctrl, ret, le32_to_cpu(cqe.result),
&cmd, data);
}
kfree(data);
return ret;
}
EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
/**
* nvmf_register_transport() - NVMe Fabrics Library registration function.
* @ops: Transport ops instance to be registered to the
* common fabrics library.
*
* API function that registers the type of specific transport fabric
* being implemented to the common NVMe fabrics library. Part of
* the overall init sequence of starting up a fabrics driver.
*/
void nvmf_register_transport(struct nvmf_transport_ops *ops)
{
mutex_lock(&nvmf_transports_mutex);
list_add_tail(&ops->entry, &nvmf_transports);
mutex_unlock(&nvmf_transports_mutex);
}
EXPORT_SYMBOL_GPL(nvmf_register_transport);
/**
* nvmf_unregister_transport() - NVMe Fabrics Library unregistration function.
* @ops: Transport ops instance to be unregistered from the
* common fabrics library.
*
* Fabrics API function that unregisters the type of specific transport
* fabric being implemented from the common NVMe fabrics library.
* Part of the overall exit sequence of unloading the implemented driver.
*/
void nvmf_unregister_transport(struct nvmf_transport_ops *ops)
{
mutex_lock(&nvmf_transports_mutex);
list_del(&ops->entry);
mutex_unlock(&nvmf_transports_mutex);
}
EXPORT_SYMBOL_GPL(nvmf_unregister_transport);
static struct nvmf_transport_ops *nvmf_lookup_transport(
struct nvmf_ctrl_options *opts)
{
struct nvmf_transport_ops *ops;
lockdep_assert_held(&nvmf_transports_mutex);
list_for_each_entry(ops, &nvmf_transports, entry) {
if (strcmp(ops->name, opts->transport) == 0)
return ops;
}
return NULL;
}
static const match_table_t opt_tokens = {
{ NVMF_OPT_TRANSPORT, "transport=%s" },
{ NVMF_OPT_TRADDR, "traddr=%s" },
{ NVMF_OPT_TRSVCID, "trsvcid=%s" },
{ NVMF_OPT_NQN, "nqn=%s" },
{ NVMF_OPT_QUEUE_SIZE, "queue_size=%d" },
{ NVMF_OPT_NR_IO_QUEUES, "nr_io_queues=%d" },
{ NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" },
{ NVMF_OPT_KATO, "keep_alive_tmo=%d" },
{ NVMF_OPT_HOSTNQN, "hostnqn=%s" },
{ NVMF_OPT_ERR, NULL }
};
static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
const char *buf)
{
substring_t args[MAX_OPT_ARGS];
char *options, *o, *p;
int token, ret = 0;
size_t nqnlen = 0;
/* Set defaults */
opts->queue_size = NVMF_DEF_QUEUE_SIZE;
opts->nr_io_queues = num_online_cpus();
opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY;
options = o = kstrdup(buf, GFP_KERNEL);
if (!options)
return -ENOMEM;
while ((p = strsep(&o, ",\n")) != NULL) {
if (!*p)
continue;
token = match_token(p, opt_tokens, args);
opts->mask |= token;
switch (token) {
case NVMF_OPT_TRANSPORT:
p = match_strdup(args);
if (!p) {
ret = -ENOMEM;
goto out;
}
opts->transport = p;
break;
case NVMF_OPT_NQN:
p = match_strdup(args);
if (!p) {
ret = -ENOMEM;
goto out;
}
opts->subsysnqn = p;
nqnlen = strlen(opts->subsysnqn);
if (nqnlen >= NVMF_NQN_SIZE) {
pr_err("%s needs to be < %d bytes\n",
opts->subsysnqn, NVMF_NQN_SIZE);
ret = -EINVAL;
goto out;
}
opts->discovery_nqn =
!(strcmp(opts->subsysnqn,
NVME_DISC_SUBSYS_NAME));
if (opts->discovery_nqn)
opts->nr_io_queues = 0;
break;
case NVMF_OPT_TRADDR:
p = match_strdup(args);
if (!p) {
ret = -ENOMEM;
goto out;
}
opts->traddr = p;
break;
case NVMF_OPT_TRSVCID:
p = match_strdup(args);
if (!p) {
ret = -ENOMEM;
goto out;
}
opts->trsvcid = p;
break;
case NVMF_OPT_QUEUE_SIZE:
if (match_int(args, &token)) {
ret = -EINVAL;
goto out;
}
if (token < NVMF_MIN_QUEUE_SIZE ||
token > NVMF_MAX_QUEUE_SIZE) {
pr_err("Invalid queue_size %d\n", token);
ret = -EINVAL;
goto out;
}
opts->queue_size = token;
break;
case NVMF_OPT_NR_IO_QUEUES:
if (match_int(args, &token)) {
ret = -EINVAL;
goto out;
}
if (token <= 0) {
pr_err("Invalid number of IOQs %d\n", token);
ret = -EINVAL;
goto out;
}
opts->nr_io_queues = min_t(unsigned int,
num_online_cpus(), token);
break;
case NVMF_OPT_KATO:
if (match_int(args, &token)) {
ret = -EINVAL;
goto out;
}
if (opts->discovery_nqn) {
pr_err("Discovery controllers cannot accept keep_alive_tmo != 0\n");
ret = -EINVAL;
goto out;
}
if (token < 0) {
pr_err("Invalid keep_alive_tmo %d\n", token);
ret = -EINVAL;
goto out;
} else if (token == 0) {
/* Allowed for debug */
pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n");
}
opts->kato = token;
break;
case NVMF_OPT_HOSTNQN:
if (opts->host) {
pr_err("hostnqn already user-assigned: %s\n",
opts->host->nqn);
ret = -EADDRINUSE;
goto out;
}
p = match_strdup(args);
if (!p) {
ret = -ENOMEM;
goto out;
}
nqnlen = strlen(p);
if (nqnlen >= NVMF_NQN_SIZE) {
pr_err("%s needs to be < %d bytes\n",
p, NVMF_NQN_SIZE);
ret = -EINVAL;
goto out;
}
opts->host = nvmf_host_add(p);
if (!opts->host) {
ret = -ENOMEM;
goto out;
}
break;
case NVMF_OPT_RECONNECT_DELAY:
if (match_int(args, &token)) {
ret = -EINVAL;
goto out;
}
if (token <= 0) {
pr_err("Invalid reconnect_delay %d\n", token);
ret = -EINVAL;
goto out;
}
opts->reconnect_delay = token;
break;
default:
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
p);
ret = -EINVAL;
goto out;
}
}
if (!opts->host) {
kref_get(&nvmf_default_host->ref);
opts->host = nvmf_default_host;
}
out:
if (!opts->discovery_nqn && !opts->kato)
opts->kato = NVME_DEFAULT_KATO;
kfree(options);
return ret;
}
static int nvmf_check_required_opts(struct nvmf_ctrl_options *opts,
unsigned int required_opts)
{
if ((opts->mask & required_opts) != required_opts) {
int i;
for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) {
if ((opt_tokens[i].token & required_opts) &&
!(opt_tokens[i].token & opts->mask)) {
pr_warn("missing parameter '%s'\n",
opt_tokens[i].pattern);
}
}
return -EINVAL;
}
return 0;
}
static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts,
unsigned int allowed_opts)
{
if (opts->mask & ~allowed_opts) {
int i;
for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) {
if (opt_tokens[i].token & ~allowed_opts) {
pr_warn("invalid parameter '%s'\n",
opt_tokens[i].pattern);
}
}
return -EINVAL;
}
return 0;
}
void nvmf_free_options(struct nvmf_ctrl_options *opts)
{
nvmf_host_put(opts->host);
kfree(opts->transport);
kfree(opts->traddr);
kfree(opts->trsvcid);
kfree(opts->subsysnqn);
kfree(opts);
}
EXPORT_SYMBOL_GPL(nvmf_free_options);
#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN)
#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
NVMF_OPT_KATO | NVMF_OPT_HOSTNQN)
static struct nvme_ctrl *
nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
{
struct nvmf_ctrl_options *opts;
struct nvmf_transport_ops *ops;
struct nvme_ctrl *ctrl;
int ret;
opts = kzalloc(sizeof(*opts), GFP_KERNEL);
if (!opts)
return ERR_PTR(-ENOMEM);
ret = nvmf_parse_options(opts, buf);
if (ret)
goto out_free_opts;
/*
* Check the generic options first as we need a valid transport for
* the lookup below. Then clear the generic flags so that transport
* drivers don't have to care about them.
*/
ret = nvmf_check_required_opts(opts, NVMF_REQUIRED_OPTS);
if (ret)
goto out_free_opts;
opts->mask &= ~NVMF_REQUIRED_OPTS;
mutex_lock(&nvmf_transports_mutex);
ops = nvmf_lookup_transport(opts);
if (!ops) {
pr_info("no handler found for transport %s.\n",
opts->transport);
ret = -EINVAL;
goto out_unlock;
}
ret = nvmf_check_required_opts(opts, ops->required_opts);
if (ret)
goto out_unlock;
ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS |
ops->allowed_opts | ops->required_opts);
if (ret)
goto out_unlock;
ctrl = ops->create_ctrl(dev, opts);
if (IS_ERR(ctrl)) {
ret = PTR_ERR(ctrl);
goto out_unlock;
}
mutex_unlock(&nvmf_transports_mutex);
return ctrl;
out_unlock:
mutex_unlock(&nvmf_transports_mutex);
out_free_opts:
nvmf_host_put(opts->host);
kfree(opts);
return ERR_PTR(ret);
}
static struct class *nvmf_class;
static struct device *nvmf_device;
static DEFINE_MUTEX(nvmf_dev_mutex);
static ssize_t nvmf_dev_write(struct file *file, const char __user *ubuf,
size_t count, loff_t *pos)
{
struct seq_file *seq_file = file->private_data;
struct nvme_ctrl *ctrl;
const char *buf;
int ret = 0;
if (count > PAGE_SIZE)
return -ENOMEM;
buf = memdup_user_nul(ubuf, count);
if (IS_ERR(buf))
return PTR_ERR(buf);
mutex_lock(&nvmf_dev_mutex);
if (seq_file->private) {
ret = -EINVAL;
goto out_unlock;
}
ctrl = nvmf_create_ctrl(nvmf_device, buf, count);
if (IS_ERR(ctrl)) {
ret = PTR_ERR(ctrl);
goto out_unlock;
}
seq_file->private = ctrl;
out_unlock:
mutex_unlock(&nvmf_dev_mutex);
kfree(buf);
return ret ? ret : count;
}
static int nvmf_dev_show(struct seq_file *seq_file, void *private)
{
struct nvme_ctrl *ctrl;
int ret = 0;
mutex_lock(&nvmf_dev_mutex);
ctrl = seq_file->private;
if (!ctrl) {
ret = -EINVAL;
goto out_unlock;
}
seq_printf(seq_file, "instance=%d,cntlid=%d\n",
ctrl->instance, ctrl->cntlid);
out_unlock:
mutex_unlock(&nvmf_dev_mutex);
return ret;
}
static int nvmf_dev_open(struct inode *inode, struct file *file)
{
/*
* The miscdevice code initializes file->private_data, but doesn't
* make use of it later.
*/
file->private_data = NULL;
return single_open(file, nvmf_dev_show, NULL);
}
static int nvmf_dev_release(struct inode *inode, struct file *file)
{
struct seq_file *seq_file = file->private_data;
struct nvme_ctrl *ctrl = seq_file->private;
if (ctrl)
nvme_put_ctrl(ctrl);
return single_release(inode, file);
}
static const struct file_operations nvmf_dev_fops = {
.owner = THIS_MODULE,
.write = nvmf_dev_write,
.read = seq_read,
.open = nvmf_dev_open,
.release = nvmf_dev_release,
};
static struct miscdevice nvmf_misc = {
.minor = MISC_DYNAMIC_MINOR,
.name = "nvme-fabrics",
.fops = &nvmf_dev_fops,
};
static int __init nvmf_init(void)
{
int ret;
nvmf_default_host = nvmf_host_default();
if (!nvmf_default_host)
return -ENOMEM;
nvmf_class = class_create(THIS_MODULE, "nvme-fabrics");
if (IS_ERR(nvmf_class)) {
pr_err("couldn't register class nvme-fabrics\n");
ret = PTR_ERR(nvmf_class);
goto out_free_host;
}
nvmf_device =
device_create(nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl");
if (IS_ERR(nvmf_device)) {
pr_err("couldn't create nvme-fabris device!\n");
ret = PTR_ERR(nvmf_device);
goto out_destroy_class;
}
ret = misc_register(&nvmf_misc);
if (ret) {
pr_err("couldn't register misc device: %d\n", ret);
goto out_destroy_device;
}
return 0;
out_destroy_device:
device_destroy(nvmf_class, MKDEV(0, 0));
out_destroy_class:
class_destroy(nvmf_class);
out_free_host:
nvmf_host_put(nvmf_default_host);
return ret;
}
static void __exit nvmf_exit(void)
{
misc_deregister(&nvmf_misc);
device_destroy(nvmf_class, MKDEV(0, 0));
class_destroy(nvmf_class);
nvmf_host_put(nvmf_default_host);
BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_connect_data) != 1024);
}
MODULE_LICENSE("GPL v2");
module_init(nvmf_init);
module_exit(nvmf_exit);

View File

@ -0,0 +1,132 @@
/*
* NVMe over Fabrics common host code.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#ifndef _NVME_FABRICS_H
#define _NVME_FABRICS_H 1
#include <linux/in.h>
#include <linux/inet.h>
#define NVMF_MIN_QUEUE_SIZE 16
#define NVMF_MAX_QUEUE_SIZE 1024
#define NVMF_DEF_QUEUE_SIZE 128
#define NVMF_DEF_RECONNECT_DELAY 10
/*
* Define a host as seen by the target. We allocate one at boot, but also
* allow the override it when creating controllers. This is both to provide
* persistence of the Host NQN over multiple boots, and to allow using
* multiple ones, for example in a container scenario. Because we must not
* use different Host NQNs with the same Host ID we generate a Host ID and
* use this structure to keep track of the relation between the two.
*/
struct nvmf_host {
struct kref ref;
struct list_head list;
char nqn[NVMF_NQN_SIZE];
uuid_le id;
};
/**
* enum nvmf_parsing_opts - used to define the sysfs parsing options used.
*/
enum {
NVMF_OPT_ERR = 0,
NVMF_OPT_TRANSPORT = 1 << 0,
NVMF_OPT_NQN = 1 << 1,
NVMF_OPT_TRADDR = 1 << 2,
NVMF_OPT_TRSVCID = 1 << 3,
NVMF_OPT_QUEUE_SIZE = 1 << 4,
NVMF_OPT_NR_IO_QUEUES = 1 << 5,
NVMF_OPT_TL_RETRY_COUNT = 1 << 6,
NVMF_OPT_KATO = 1 << 7,
NVMF_OPT_HOSTNQN = 1 << 8,
NVMF_OPT_RECONNECT_DELAY = 1 << 9,
};
/**
* struct nvmf_ctrl_options - Used to hold the options specified
* with the parsing opts enum.
* @mask: Used by the fabrics library to parse through sysfs options
* on adding a NVMe controller.
* @transport: Holds the fabric transport "technology name" (for a lack of
* better description) that will be used by an NVMe controller
* being added.
* @subsysnqn: Hold the fully qualified NQN subystem name (format defined
* in the NVMe specification, "NVMe Qualified Names").
* @traddr: network address that will be used by the host to communicate
* to the added NVMe controller.
* @trsvcid: network port used for host-controller communication.
* @queue_size: Number of IO queue elements.
* @nr_io_queues: Number of controller IO queues that will be established.
* @reconnect_delay: Time between two consecutive reconnect attempts.
* @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN.
* @kato: Keep-alive timeout.
* @host: Virtual NVMe host, contains the NQN and Host ID.
*/
struct nvmf_ctrl_options {
unsigned mask;
char *transport;
char *subsysnqn;
char *traddr;
char *trsvcid;
size_t queue_size;
unsigned int nr_io_queues;
unsigned int reconnect_delay;
bool discovery_nqn;
unsigned int kato;
struct nvmf_host *host;
};
/*
* struct nvmf_transport_ops - used to register a specific
* fabric implementation of NVMe fabrics.
* @entry: Used by the fabrics library to add the new
* registration entry to its linked-list internal tree.
* @name: Name of the NVMe fabric driver implementation.
* @required_opts: sysfs command-line options that must be specified
* when adding a new NVMe controller.
* @allowed_opts: sysfs command-line options that can be specified
* when adding a new NVMe controller.
* @create_ctrl(): function pointer that points to a non-NVMe
* implementation-specific fabric technology
* that would go into starting up that fabric
* for the purpose of conneciton to an NVMe controller
* using that fabric technology.
*
* Notes:
* 1. At minimum, 'required_opts' and 'allowed_opts' should
* be set to the same enum parsing options defined earlier.
* 2. create_ctrl() must be defined (even if it does nothing)
*/
struct nvmf_transport_ops {
struct list_head entry;
const char *name;
int required_opts;
int allowed_opts;
struct nvme_ctrl *(*create_ctrl)(struct device *dev,
struct nvmf_ctrl_options *opts);
};
int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl);
int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid);
void nvmf_register_transport(struct nvmf_transport_ops *ops);
void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
void nvmf_free_options(struct nvmf_ctrl_options *opts);
const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl);
int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
#endif /* _NVME_FABRICS_H */

View File

@ -156,7 +156,7 @@ struct nvme_nvm_completion {
#define NVME_NVM_LP_MLC_PAIRS 886
struct nvme_nvm_lp_mlc {
__u16 num_pairs;
__le16 num_pairs;
__u8 pairs[NVME_NVM_LP_MLC_PAIRS];
};
@ -500,7 +500,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
struct bio *bio = rqd->bio;
struct nvme_nvm_command *cmd;
rq = blk_mq_alloc_request(q, bio_rw(bio), 0);
rq = blk_mq_alloc_request(q, bio_data_dir(bio), 0);
if (IS_ERR(rq))
return -ENOMEM;

View File

@ -38,6 +38,11 @@ extern unsigned char admin_timeout;
extern unsigned char shutdown_timeout;
#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ)
#define NVME_DEFAULT_KATO 5
#define NVME_KATO_GRACE 10
extern unsigned int nvme_max_retries;
enum {
NVME_NS_LBA = 0,
NVME_NS_LIGHTNVM = 1,
@ -65,12 +70,26 @@ enum nvme_quirks {
* logical blocks.
*/
NVME_QUIRK_DISCARD_ZEROES = (1 << 2),
/*
* The controller needs a delay before starts checking the device
* readiness, which is done by reading the NVME_CSTS_RDY bit.
*/
NVME_QUIRK_DELAY_BEFORE_CHK_RDY = (1 << 3),
};
/* The below value is the specific amount of delay needed before checking
* readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the
* NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was
* found empirically.
*/
#define NVME_QUIRK_DELAY_AMOUNT 2000
enum nvme_ctrl_state {
NVME_CTRL_NEW,
NVME_CTRL_LIVE,
NVME_CTRL_RESETTING,
NVME_CTRL_RECONNECTING,
NVME_CTRL_DELETING,
NVME_CTRL_DEAD,
};
@ -80,6 +99,7 @@ struct nvme_ctrl {
spinlock_t lock;
const struct nvme_ctrl_ops *ops;
struct request_queue *admin_q;
struct request_queue *connect_q;
struct device *dev;
struct kref kref;
int instance;
@ -107,10 +127,22 @@ struct nvme_ctrl {
u8 event_limit;
u8 vwc;
u32 vs;
u32 sgls;
u16 kas;
unsigned int kato;
bool subsystem;
unsigned long quirks;
struct work_struct scan_work;
struct work_struct async_event_work;
struct delayed_work ka_work;
/* Fabrics only */
u16 sqsize;
u32 ioccsz;
u32 iorcsz;
u16 icdoff;
u16 maxcmd;
struct nvmf_ctrl_options *opts;
};
/*
@ -144,7 +176,9 @@ struct nvme_ns {
};
struct nvme_ctrl_ops {
const char *name;
struct module *module;
bool is_fabrics;
int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
@ -152,6 +186,9 @@ struct nvme_ctrl_ops {
void (*free_ctrl)(struct nvme_ctrl *ctrl);
void (*post_scan)(struct nvme_ctrl *ctrl);
void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx);
int (*delete_ctrl)(struct nvme_ctrl *ctrl);
const char *(*get_subsysnqn)(struct nvme_ctrl *ctrl);
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
};
static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl)
@ -204,9 +241,11 @@ static inline int nvme_error_status(u16 status)
static inline bool nvme_req_needs_retry(struct request *req, u16 status)
{
return !(status & NVME_SC_DNR || blk_noretry_request(req)) &&
(jiffies - req->start_time) < req->timeout;
(jiffies - req->start_time) < req->timeout &&
req->retries < nvme_max_retries;
}
void nvme_cancel_request(struct request *req, void *data, bool reserved);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
@ -230,8 +269,9 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl);
void nvme_start_queues(struct nvme_ctrl *ctrl);
void nvme_kill_queues(struct nvme_ctrl *ctrl);
#define NVME_QID_ANY -1
struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, unsigned int flags);
struct nvme_command *cmd, unsigned int flags, int qid);
void nvme_requeue_req(struct request *req);
int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmd);
@ -239,7 +279,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buf, unsigned bufflen);
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
struct nvme_completion *cqe, void *buffer, unsigned bufflen,
unsigned timeout);
unsigned timeout, int qid, int at_head, int flags);
int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
void __user *ubuffer, unsigned bufflen, u32 *result,
unsigned timeout);
@ -256,6 +296,8 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
dma_addr_t dma_addr, u32 *result);
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_start_keep_alive(struct nvme_ctrl *ctrl);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
struct sg_io_hdr;

View File

@ -310,6 +310,11 @@ static int nvme_init_iod(struct request *rq, unsigned size,
iod->npages = -1;
iod->nents = 0;
iod->length = size;
if (!(rq->cmd_flags & REQ_DONTPREP)) {
rq->retries = 0;
rq->cmd_flags |= REQ_DONTPREP;
}
return 0;
}
@ -520,8 +525,8 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req,
goto out_unmap;
}
cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
cmnd->rw.dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma);
if (blk_integrity_rq(req))
cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
return BLK_MQ_RQ_QUEUE_OK;
@ -623,6 +628,7 @@ static void nvme_complete_rq(struct request *req)
if (unlikely(req->errors)) {
if (nvme_req_needs_retry(req, req->errors)) {
req->retries++;
nvme_requeue_req(req);
return;
}
@ -901,7 +907,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
req->tag, nvmeq->qid);
abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
BLK_MQ_REQ_NOWAIT);
BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
if (IS_ERR(abort_req)) {
atomic_inc(&dev->ctrl.abort_limit);
return BLK_EH_RESET_TIMER;
@ -919,22 +925,6 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
return BLK_EH_RESET_TIMER;
}
static void nvme_cancel_io(struct request *req, void *data, bool reserved)
{
int status;
if (!blk_mq_request_started(req))
return;
dev_dbg_ratelimited(((struct nvme_dev *) data)->ctrl.device,
"Cancelling I/O %d", req->tag);
status = NVME_SC_ABORT_REQ;
if (blk_queue_dying(req->q))
status |= NVME_SC_DNR;
blk_mq_complete_request(req, status);
}
static void nvme_free_queue(struct nvme_queue *nvmeq)
{
dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
@ -1399,16 +1389,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
if (result < 0)
return result;
/*
* Degraded controllers might return an error when setting the queue
* count. We still want to be able to bring them online and offer
* access to the admin queue, as that might be only way to fix them up.
*/
if (result > 0) {
dev_err(dev->ctrl.device,
"Could not set queue count (%d)\n", result);
if (nr_io_queues == 0)
return 0;
}
if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
result = nvme_cmb_qdepth(dev, nr_io_queues,
@ -1536,7 +1518,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
cmd.delete_queue.opcode = opcode;
cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT);
req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
if (IS_ERR(req))
return PTR_ERR(req);
@ -1727,8 +1709,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
}
nvme_pci_disable(dev);
blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_io, dev);
blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_io, dev);
blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
mutex_unlock(&dev->shutdown_lock);
}
@ -1902,6 +1884,7 @@ static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl)
}
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name = "pcie",
.module = THIS_MODULE,
.reg_read32 = nvme_pci_reg_read32,
.reg_write32 = nvme_pci_reg_write32,
@ -1940,7 +1923,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
node = dev_to_node(&pdev->dev);
if (node == NUMA_NO_NODE)
set_dev_node(&pdev->dev, 0);
set_dev_node(&pdev->dev, first_memory_node);
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
if (!dev)
@ -2037,6 +2020,24 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_put_ctrl(&dev->ctrl);
}
static int nvme_pci_sriov_configure(struct pci_dev *pdev, int numvfs)
{
int ret = 0;
if (numvfs == 0) {
if (pci_vfs_assigned(pdev)) {
dev_warn(&pdev->dev,
"Cannot disable SR-IOV VFs while assigned\n");
return -EPERM;
}
pci_disable_sriov(pdev);
return 0;
}
ret = pci_enable_sriov(pdev, numvfs);
return ret ? ret : numvfs;
}
#ifdef CONFIG_PM_SLEEP
static int nvme_suspend(struct device *dev)
{
@ -2122,6 +2123,8 @@ static const struct pci_device_id nvme_id_table[] = {
NVME_QUIRK_DISCARD_ZEROES, },
{ PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */
.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
{ PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */
.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
{ 0, }
@ -2137,6 +2140,7 @@ static struct pci_driver nvme_driver = {
.driver = {
.pm = &nvme_dev_pm_ops,
},
.sriov_configure = nvme_pci_sriov_configure,
.err_handler = &nvme_err_handler,
};

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,36 @@
config NVME_TARGET
tristate "NVMe Target support"
depends on BLOCK
depends on CONFIGFS_FS
help
This enabled target side support for the NVMe protocol, that is
it allows the Linux kernel to implement NVMe subsystems and
controllers and export Linux block devices as NVMe namespaces.
You need to select at least one of the transports below to make this
functionality useful.
To configure the NVMe target you probably want to use the nvmetcli
tool from http://git.infradead.org/users/hch/nvmetcli.git.
config NVME_TARGET_LOOP
tristate "NVMe loopback device support"
depends on BLK_DEV_NVME
depends on NVME_TARGET
select NVME_FABRICS
select SG_POOL
help
This enables the NVMe loopback device support, which can be useful
to test NVMe host and target side features.
If unsure, say N.
config NVME_TARGET_RDMA
tristate "NVMe over Fabrics RDMA target support"
depends on INFINIBAND
depends on NVME_TARGET
help
This enables the NVMe RDMA target support, which allows exporting NVMe
devices over RDMA.
If unsure, say N.

View File

@ -0,0 +1,9 @@
obj-$(CONFIG_NVME_TARGET) += nvmet.o
obj-$(CONFIG_NVME_TARGET_LOOP) += nvme-loop.o
obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o
nvmet-y += core.o configfs.o admin-cmd.o io-cmd.o fabrics-cmd.o \
discovery.o
nvme-loop-y += loop.o
nvmet-rdma-y += rdma.o

View File

@ -0,0 +1,465 @@
/*
* NVMe admin command implementation.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/random.h>
#include <generated/utsrelease.h>
#include "nvmet.h"
u32 nvmet_get_log_page_len(struct nvme_command *cmd)
{
u32 len = le16_to_cpu(cmd->get_log_page.numdu);
len <<= 16;
len += le16_to_cpu(cmd->get_log_page.numdl);
/* NUMD is a 0's based value */
len += 1;
len *= sizeof(u32);
return len;
}
static void nvmet_execute_get_log_page(struct nvmet_req *req)
{
size_t data_len = nvmet_get_log_page_len(req->cmd);
void *buf;
u16 status = 0;
buf = kzalloc(data_len, GFP_KERNEL);
if (!buf) {
status = NVME_SC_INTERNAL;
goto out;
}
switch (req->cmd->get_log_page.lid) {
case 0x01:
/*
* We currently never set the More bit in the status field,
* so all error log entries are invalid and can be zeroed out.
* This is called a minum viable implementation (TM) of this
* mandatory log page.
*/
break;
case 0x02:
/*
* XXX: fill out actual smart log
*
* We might have a hard time coming up with useful values for
* many of the fields, and even when we have useful data
* available (e.g. units or commands read/written) those aren't
* persistent over power loss.
*/
break;
case 0x03:
/*
* We only support a single firmware slot which always is
* active, so we can zero out the whole firmware slot log and
* still claim to fully implement this mandatory log page.
*/
break;
default:
BUG();
}
status = nvmet_copy_to_sgl(req, 0, buf, data_len);
kfree(buf);
out:
nvmet_req_complete(req, status);
}
static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvme_id_ctrl *id;
u64 serial;
u16 status = 0;
id = kzalloc(sizeof(*id), GFP_KERNEL);
if (!id) {
status = NVME_SC_INTERNAL;
goto out;
}
/* XXX: figure out how to assign real vendors IDs. */
id->vid = 0;
id->ssvid = 0;
/* generate a random serial number as our controllers are ephemeral: */
get_random_bytes(&serial, sizeof(serial));
memset(id->sn, ' ', sizeof(id->sn));
snprintf(id->sn, sizeof(id->sn), "%llx", serial);
memset(id->mn, ' ', sizeof(id->mn));
strncpy((char *)id->mn, "Linux", sizeof(id->mn));
memset(id->fr, ' ', sizeof(id->fr));
strncpy((char *)id->fr, UTS_RELEASE, sizeof(id->fr));
id->rab = 6;
/*
* XXX: figure out how we can assign a IEEE OUI, but until then
* the safest is to leave it as zeroes.
*/
/* we support multiple ports and multiples hosts: */
id->mic = (1 << 0) | (1 << 1);
/* no limit on data transfer sizes for now */
id->mdts = 0;
id->cntlid = cpu_to_le16(ctrl->cntlid);
id->ver = cpu_to_le32(ctrl->subsys->ver);
/* XXX: figure out what to do about RTD3R/RTD3 */
id->oaes = cpu_to_le32(1 << 8);
id->ctratt = cpu_to_le32(1 << 0);
id->oacs = 0;
/*
* We don't really have a practical limit on the number of abort
* comands. But we don't do anything useful for abort either, so
* no point in allowing more abort commands than the spec requires.
*/
id->acl = 3;
id->aerl = NVMET_ASYNC_EVENTS - 1;
/* first slot is read-only, only one slot supported */
id->frmw = (1 << 0) | (1 << 1);
id->lpa = (1 << 0) | (1 << 2);
id->elpe = NVMET_ERROR_LOG_SLOTS - 1;
id->npss = 0;
/* We support keep-alive timeout in granularity of seconds */
id->kas = cpu_to_le16(NVMET_KAS);
id->sqes = (0x6 << 4) | 0x6;
id->cqes = (0x4 << 4) | 0x4;
/* no enforcement soft-limit for maxcmd - pick arbitrary high value */
id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
id->nn = cpu_to_le32(ctrl->subsys->max_nsid);
id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM);
/* XXX: don't report vwc if the underlying device is write through */
id->vwc = NVME_CTRL_VWC_PRESENT;
/*
* We can't support atomic writes bigger than a LBA without support
* from the backend device.
*/
id->awun = 0;
id->awupf = 0;
id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
if (ctrl->ops->has_keyed_sgls)
id->sgls |= cpu_to_le32(1 << 2);
if (ctrl->ops->sqe_inline_size)
id->sgls |= cpu_to_le32(1 << 20);
strcpy(id->subnqn, ctrl->subsys->subsysnqn);
/* Max command capsule size is sqe + single page of in-capsule data */
id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) +
ctrl->ops->sqe_inline_size) / 16);
/* Max response capsule size is cqe */
id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16);
id->msdbd = ctrl->ops->msdbd;
/*
* Meh, we don't really support any power state. Fake up the same
* values that qemu does.
*/
id->psd[0].max_power = cpu_to_le16(0x9c4);
id->psd[0].entry_lat = cpu_to_le32(0x10);
id->psd[0].exit_lat = cpu_to_le32(0x4);
status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
kfree(id);
out:
nvmet_req_complete(req, status);
}
static void nvmet_execute_identify_ns(struct nvmet_req *req)
{
struct nvmet_ns *ns;
struct nvme_id_ns *id;
u16 status = 0;
ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
if (!ns) {
status = NVME_SC_INVALID_NS | NVME_SC_DNR;
goto out;
}
id = kzalloc(sizeof(*id), GFP_KERNEL);
if (!id) {
status = NVME_SC_INTERNAL;
goto out_put_ns;
}
/*
* nuse = ncap = nsze isn't aways true, but we have no way to find
* that out from the underlying device.
*/
id->ncap = id->nuse = id->nsze =
cpu_to_le64(ns->size >> ns->blksize_shift);
/*
* We just provide a single LBA format that matches what the
* underlying device reports.
*/
id->nlbaf = 0;
id->flbas = 0;
/*
* Our namespace might always be shared. Not just with other
* controllers, but also with any other user of the block device.
*/
id->nmic = (1 << 0);
memcpy(&id->nguid, &ns->nguid, sizeof(uuid_le));
id->lbaf[0].ds = ns->blksize_shift;
status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
kfree(id);
out_put_ns:
nvmet_put_namespace(ns);
out:
nvmet_req_complete(req, status);
}
static void nvmet_execute_identify_nslist(struct nvmet_req *req)
{
static const int buf_size = 4096;
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvmet_ns *ns;
u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid);
__le32 *list;
u16 status = 0;
int i = 0;
list = kzalloc(buf_size, GFP_KERNEL);
if (!list) {
status = NVME_SC_INTERNAL;
goto out;
}
rcu_read_lock();
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
if (ns->nsid <= min_nsid)
continue;
list[i++] = cpu_to_le32(ns->nsid);
if (i == buf_size / sizeof(__le32))
break;
}
rcu_read_unlock();
status = nvmet_copy_to_sgl(req, 0, list, buf_size);
kfree(list);
out:
nvmet_req_complete(req, status);
}
/*
* A "mimimum viable" abort implementation: the command is mandatory in the
* spec, but we are not required to do any useful work. We couldn't really
* do a useful abort, so don't bother even with waiting for the command
* to be exectuted and return immediately telling the command to abort
* wasn't found.
*/
static void nvmet_execute_abort(struct nvmet_req *req)
{
nvmet_set_result(req, 1);
nvmet_req_complete(req, 0);
}
static void nvmet_execute_set_features(struct nvmet_req *req)
{
struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]);
u64 val;
u32 val32;
u16 status = 0;
switch (cdw10 & 0xf) {
case NVME_FEAT_NUM_QUEUES:
nvmet_set_result(req,
(subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16));
break;
case NVME_FEAT_KATO:
val = le64_to_cpu(req->cmd->prop_set.value);
val32 = val & 0xffff;
req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000);
nvmet_set_result(req, req->sq->ctrl->kato);
break;
default:
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
break;
}
nvmet_req_complete(req, status);
}
static void nvmet_execute_get_features(struct nvmet_req *req)
{
struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]);
u16 status = 0;
switch (cdw10 & 0xf) {
/*
* These features are mandatory in the spec, but we don't
* have a useful way to implement them. We'll eventually
* need to come up with some fake values for these.
*/
#if 0
case NVME_FEAT_ARBITRATION:
break;
case NVME_FEAT_POWER_MGMT:
break;
case NVME_FEAT_TEMP_THRESH:
break;
case NVME_FEAT_ERR_RECOVERY:
break;
case NVME_FEAT_IRQ_COALESCE:
break;
case NVME_FEAT_IRQ_CONFIG:
break;
case NVME_FEAT_WRITE_ATOMIC:
break;
case NVME_FEAT_ASYNC_EVENT:
break;
#endif
case NVME_FEAT_VOLATILE_WC:
nvmet_set_result(req, 1);
break;
case NVME_FEAT_NUM_QUEUES:
nvmet_set_result(req,
(subsys->max_qid-1) | ((subsys->max_qid-1) << 16));
break;
case NVME_FEAT_KATO:
nvmet_set_result(req, req->sq->ctrl->kato * 1000);
break;
default:
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
break;
}
nvmet_req_complete(req, status);
}
static void nvmet_execute_async_event(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
mutex_lock(&ctrl->lock);
if (ctrl->nr_async_event_cmds >= NVMET_ASYNC_EVENTS) {
mutex_unlock(&ctrl->lock);
nvmet_req_complete(req, NVME_SC_ASYNC_LIMIT | NVME_SC_DNR);
return;
}
ctrl->async_event_cmds[ctrl->nr_async_event_cmds++] = req;
mutex_unlock(&ctrl->lock);
schedule_work(&ctrl->async_event_work);
}
static void nvmet_execute_keep_alive(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
pr_debug("ctrl %d update keep-alive timer for %d secs\n",
ctrl->cntlid, ctrl->kato);
mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
nvmet_req_complete(req, 0);
}
int nvmet_parse_admin_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
req->ns = NULL;
if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
pr_err("nvmet: got admin cmd %d while CC.EN == 0\n",
cmd->common.opcode);
return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
}
if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
pr_err("nvmet: got admin cmd %d while CSTS.RDY == 0\n",
cmd->common.opcode);
return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
}
switch (cmd->common.opcode) {
case nvme_admin_get_log_page:
req->data_len = nvmet_get_log_page_len(cmd);
switch (cmd->get_log_page.lid) {
case 0x01:
case 0x02:
case 0x03:
req->execute = nvmet_execute_get_log_page;
return 0;
}
break;
case nvme_admin_identify:
req->data_len = 4096;
switch (le32_to_cpu(cmd->identify.cns)) {
case 0x00:
req->execute = nvmet_execute_identify_ns;
return 0;
case 0x01:
req->execute = nvmet_execute_identify_ctrl;
return 0;
case 0x02:
req->execute = nvmet_execute_identify_nslist;
return 0;
}
break;
case nvme_admin_abort_cmd:
req->execute = nvmet_execute_abort;
req->data_len = 0;
return 0;
case nvme_admin_set_features:
req->execute = nvmet_execute_set_features;
req->data_len = 0;
return 0;
case nvme_admin_get_features:
req->execute = nvmet_execute_get_features;
req->data_len = 0;
return 0;
case nvme_admin_async_event:
req->execute = nvmet_execute_async_event;
req->data_len = 0;
return 0;
case nvme_admin_keep_alive:
req->execute = nvmet_execute_keep_alive;
req->data_len = 0;
return 0;
}
pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}

View File

@ -0,0 +1,917 @@
/*
* Configfs interface for the NVMe target.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/ctype.h>
#include "nvmet.h"
static struct config_item_type nvmet_host_type;
static struct config_item_type nvmet_subsys_type;
/*
* nvmet_port Generic ConfigFS definitions.
* Used in any place in the ConfigFS tree that refers to an address.
*/
static ssize_t nvmet_addr_adrfam_show(struct config_item *item,
char *page)
{
switch (to_nvmet_port(item)->disc_addr.adrfam) {
case NVMF_ADDR_FAMILY_IP4:
return sprintf(page, "ipv4\n");
case NVMF_ADDR_FAMILY_IP6:
return sprintf(page, "ipv6\n");
case NVMF_ADDR_FAMILY_IB:
return sprintf(page, "ib\n");
default:
return sprintf(page, "\n");
}
}
static ssize_t nvmet_addr_adrfam_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_port *port = to_nvmet_port(item);
if (port->enabled) {
pr_err("Cannot modify address while enabled\n");
pr_err("Disable the address before modifying\n");
return -EACCES;
}
if (sysfs_streq(page, "ipv4")) {
port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IP4;
} else if (sysfs_streq(page, "ipv6")) {
port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IP6;
} else if (sysfs_streq(page, "ib")) {
port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IB;
} else {
pr_err("Invalid value '%s' for adrfam\n", page);
return -EINVAL;
}
return count;
}
CONFIGFS_ATTR(nvmet_, addr_adrfam);
static ssize_t nvmet_addr_portid_show(struct config_item *item,
char *page)
{
struct nvmet_port *port = to_nvmet_port(item);
return snprintf(page, PAGE_SIZE, "%d\n",
le16_to_cpu(port->disc_addr.portid));
}
static ssize_t nvmet_addr_portid_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_port *port = to_nvmet_port(item);
u16 portid = 0;
if (kstrtou16(page, 0, &portid)) {
pr_err("Invalid value '%s' for portid\n", page);
return -EINVAL;
}
if (port->enabled) {
pr_err("Cannot modify address while enabled\n");
pr_err("Disable the address before modifying\n");
return -EACCES;
}
port->disc_addr.portid = cpu_to_le16(portid);
return count;
}
CONFIGFS_ATTR(nvmet_, addr_portid);
static ssize_t nvmet_addr_traddr_show(struct config_item *item,
char *page)
{
struct nvmet_port *port = to_nvmet_port(item);
return snprintf(page, PAGE_SIZE, "%s\n",
port->disc_addr.traddr);
}
static ssize_t nvmet_addr_traddr_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_port *port = to_nvmet_port(item);
if (count > NVMF_TRADDR_SIZE) {
pr_err("Invalid value '%s' for traddr\n", page);
return -EINVAL;
}
if (port->enabled) {
pr_err("Cannot modify address while enabled\n");
pr_err("Disable the address before modifying\n");
return -EACCES;
}
return snprintf(port->disc_addr.traddr,
sizeof(port->disc_addr.traddr), "%s", page);
}
CONFIGFS_ATTR(nvmet_, addr_traddr);
static ssize_t nvmet_addr_treq_show(struct config_item *item,
char *page)
{
switch (to_nvmet_port(item)->disc_addr.treq) {
case NVMF_TREQ_NOT_SPECIFIED:
return sprintf(page, "not specified\n");
case NVMF_TREQ_REQUIRED:
return sprintf(page, "required\n");
case NVMF_TREQ_NOT_REQUIRED:
return sprintf(page, "not required\n");
default:
return sprintf(page, "\n");
}
}
static ssize_t nvmet_addr_treq_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_port *port = to_nvmet_port(item);
if (port->enabled) {
pr_err("Cannot modify address while enabled\n");
pr_err("Disable the address before modifying\n");
return -EACCES;
}
if (sysfs_streq(page, "not specified")) {
port->disc_addr.treq = NVMF_TREQ_NOT_SPECIFIED;
} else if (sysfs_streq(page, "required")) {
port->disc_addr.treq = NVMF_TREQ_REQUIRED;
} else if (sysfs_streq(page, "not required")) {
port->disc_addr.treq = NVMF_TREQ_NOT_REQUIRED;
} else {
pr_err("Invalid value '%s' for treq\n", page);
return -EINVAL;
}
return count;
}
CONFIGFS_ATTR(nvmet_, addr_treq);
static ssize_t nvmet_addr_trsvcid_show(struct config_item *item,
char *page)
{
struct nvmet_port *port = to_nvmet_port(item);
return snprintf(page, PAGE_SIZE, "%s\n",
port->disc_addr.trsvcid);
}
static ssize_t nvmet_addr_trsvcid_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_port *port = to_nvmet_port(item);
if (count > NVMF_TRSVCID_SIZE) {
pr_err("Invalid value '%s' for trsvcid\n", page);
return -EINVAL;
}
if (port->enabled) {
pr_err("Cannot modify address while enabled\n");
pr_err("Disable the address before modifying\n");
return -EACCES;
}
return snprintf(port->disc_addr.trsvcid,
sizeof(port->disc_addr.trsvcid), "%s", page);
}
CONFIGFS_ATTR(nvmet_, addr_trsvcid);
static ssize_t nvmet_addr_trtype_show(struct config_item *item,
char *page)
{
switch (to_nvmet_port(item)->disc_addr.trtype) {
case NVMF_TRTYPE_RDMA:
return sprintf(page, "rdma\n");
case NVMF_TRTYPE_LOOP:
return sprintf(page, "loop\n");
default:
return sprintf(page, "\n");
}
}
static void nvmet_port_init_tsas_rdma(struct nvmet_port *port)
{
port->disc_addr.trtype = NVMF_TRTYPE_RDMA;
memset(&port->disc_addr.tsas.rdma, 0, NVMF_TSAS_SIZE);
port->disc_addr.tsas.rdma.qptype = NVMF_RDMA_QPTYPE_CONNECTED;
port->disc_addr.tsas.rdma.prtype = NVMF_RDMA_PRTYPE_NOT_SPECIFIED;
port->disc_addr.tsas.rdma.cms = NVMF_RDMA_CMS_RDMA_CM;
}
static void nvmet_port_init_tsas_loop(struct nvmet_port *port)
{
port->disc_addr.trtype = NVMF_TRTYPE_LOOP;
memset(&port->disc_addr.tsas, 0, NVMF_TSAS_SIZE);
}
static ssize_t nvmet_addr_trtype_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_port *port = to_nvmet_port(item);
if (port->enabled) {
pr_err("Cannot modify address while enabled\n");
pr_err("Disable the address before modifying\n");
return -EACCES;
}
if (sysfs_streq(page, "rdma")) {
nvmet_port_init_tsas_rdma(port);
} else if (sysfs_streq(page, "loop")) {
nvmet_port_init_tsas_loop(port);
} else {
pr_err("Invalid value '%s' for trtype\n", page);
return -EINVAL;
}
return count;
}
CONFIGFS_ATTR(nvmet_, addr_trtype);
/*
* Namespace structures & file operation functions below
*/
static ssize_t nvmet_ns_device_path_show(struct config_item *item, char *page)
{
return sprintf(page, "%s\n", to_nvmet_ns(item)->device_path);
}
static ssize_t nvmet_ns_device_path_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_ns *ns = to_nvmet_ns(item);
struct nvmet_subsys *subsys = ns->subsys;
int ret;
mutex_lock(&subsys->lock);
ret = -EBUSY;
if (nvmet_ns_enabled(ns))
goto out_unlock;
kfree(ns->device_path);
ret = -ENOMEM;
ns->device_path = kstrdup(page, GFP_KERNEL);
if (!ns->device_path)
goto out_unlock;
mutex_unlock(&subsys->lock);
return count;
out_unlock:
mutex_unlock(&subsys->lock);
return ret;
}
CONFIGFS_ATTR(nvmet_ns_, device_path);
static ssize_t nvmet_ns_device_nguid_show(struct config_item *item, char *page)
{
return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->nguid);
}
static ssize_t nvmet_ns_device_nguid_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_ns *ns = to_nvmet_ns(item);
struct nvmet_subsys *subsys = ns->subsys;
u8 nguid[16];
const char *p = page;
int i;
int ret = 0;
mutex_lock(&subsys->lock);
if (nvmet_ns_enabled(ns)) {
ret = -EBUSY;
goto out_unlock;
}
for (i = 0; i < 16; i++) {
if (p + 2 > page + count) {
ret = -EINVAL;
goto out_unlock;
}
if (!isxdigit(p[0]) || !isxdigit(p[1])) {
ret = -EINVAL;
goto out_unlock;
}
nguid[i] = (hex_to_bin(p[0]) << 4) | hex_to_bin(p[1]);
p += 2;
if (*p == '-' || *p == ':')
p++;
}
memcpy(&ns->nguid, nguid, sizeof(nguid));
out_unlock:
mutex_unlock(&subsys->lock);
return ret ? ret : count;
}
CONFIGFS_ATTR(nvmet_ns_, device_nguid);
static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page)
{
return sprintf(page, "%d\n", nvmet_ns_enabled(to_nvmet_ns(item)));
}
static ssize_t nvmet_ns_enable_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_ns *ns = to_nvmet_ns(item);
bool enable;
int ret = 0;
if (strtobool(page, &enable))
return -EINVAL;
if (enable)
ret = nvmet_ns_enable(ns);
else
nvmet_ns_disable(ns);
return ret ? ret : count;
}
CONFIGFS_ATTR(nvmet_ns_, enable);
static struct configfs_attribute *nvmet_ns_attrs[] = {
&nvmet_ns_attr_device_path,
&nvmet_ns_attr_device_nguid,
&nvmet_ns_attr_enable,
NULL,
};
static void nvmet_ns_release(struct config_item *item)
{
struct nvmet_ns *ns = to_nvmet_ns(item);
nvmet_ns_free(ns);
}
static struct configfs_item_operations nvmet_ns_item_ops = {
.release = nvmet_ns_release,
};
static struct config_item_type nvmet_ns_type = {
.ct_item_ops = &nvmet_ns_item_ops,
.ct_attrs = nvmet_ns_attrs,
.ct_owner = THIS_MODULE,
};
static struct config_group *nvmet_ns_make(struct config_group *group,
const char *name)
{
struct nvmet_subsys *subsys = namespaces_to_subsys(&group->cg_item);
struct nvmet_ns *ns;
int ret;
u32 nsid;
ret = kstrtou32(name, 0, &nsid);
if (ret)
goto out;
ret = -EINVAL;
if (nsid == 0 || nsid == 0xffffffff)
goto out;
ret = -ENOMEM;
ns = nvmet_ns_alloc(subsys, nsid);
if (!ns)
goto out;
config_group_init_type_name(&ns->group, name, &nvmet_ns_type);
pr_info("adding nsid %d to subsystem %s\n", nsid, subsys->subsysnqn);
return &ns->group;
out:
return ERR_PTR(ret);
}
static struct configfs_group_operations nvmet_namespaces_group_ops = {
.make_group = nvmet_ns_make,
};
static struct config_item_type nvmet_namespaces_type = {
.ct_group_ops = &nvmet_namespaces_group_ops,
.ct_owner = THIS_MODULE,
};
static int nvmet_port_subsys_allow_link(struct config_item *parent,
struct config_item *target)
{
struct nvmet_port *port = to_nvmet_port(parent->ci_parent);
struct nvmet_subsys *subsys;
struct nvmet_subsys_link *link, *p;
int ret;
if (target->ci_type != &nvmet_subsys_type) {
pr_err("can only link subsystems into the subsystems dir.!\n");
return -EINVAL;
}
subsys = to_subsys(target);
link = kmalloc(sizeof(*link), GFP_KERNEL);
if (!link)
return -ENOMEM;
link->subsys = subsys;
down_write(&nvmet_config_sem);
ret = -EEXIST;
list_for_each_entry(p, &port->subsystems, entry) {
if (p->subsys == subsys)
goto out_free_link;
}
if (list_empty(&port->subsystems)) {
ret = nvmet_enable_port(port);
if (ret)
goto out_free_link;
}
list_add_tail(&link->entry, &port->subsystems);
nvmet_genctr++;
up_write(&nvmet_config_sem);
return 0;
out_free_link:
up_write(&nvmet_config_sem);
kfree(link);
return ret;
}
static int nvmet_port_subsys_drop_link(struct config_item *parent,
struct config_item *target)
{
struct nvmet_port *port = to_nvmet_port(parent->ci_parent);
struct nvmet_subsys *subsys = to_subsys(target);
struct nvmet_subsys_link *p;
down_write(&nvmet_config_sem);
list_for_each_entry(p, &port->subsystems, entry) {
if (p->subsys == subsys)
goto found;
}
up_write(&nvmet_config_sem);
return -EINVAL;
found:
list_del(&p->entry);
nvmet_genctr++;
if (list_empty(&port->subsystems))
nvmet_disable_port(port);
up_write(&nvmet_config_sem);
kfree(p);
return 0;
}
static struct configfs_item_operations nvmet_port_subsys_item_ops = {
.allow_link = nvmet_port_subsys_allow_link,
.drop_link = nvmet_port_subsys_drop_link,
};
static struct config_item_type nvmet_port_subsys_type = {
.ct_item_ops = &nvmet_port_subsys_item_ops,
.ct_owner = THIS_MODULE,
};
static int nvmet_allowed_hosts_allow_link(struct config_item *parent,
struct config_item *target)
{
struct nvmet_subsys *subsys = to_subsys(parent->ci_parent);
struct nvmet_host *host;
struct nvmet_host_link *link, *p;
int ret;
if (target->ci_type != &nvmet_host_type) {
pr_err("can only link hosts into the allowed_hosts directory!\n");
return -EINVAL;
}
host = to_host(target);
link = kmalloc(sizeof(*link), GFP_KERNEL);
if (!link)
return -ENOMEM;
link->host = host;
down_write(&nvmet_config_sem);
ret = -EINVAL;
if (subsys->allow_any_host) {
pr_err("can't add hosts when allow_any_host is set!\n");
goto out_free_link;
}
ret = -EEXIST;
list_for_each_entry(p, &subsys->hosts, entry) {
if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host)))
goto out_free_link;
}
list_add_tail(&link->entry, &subsys->hosts);
nvmet_genctr++;
up_write(&nvmet_config_sem);
return 0;
out_free_link:
up_write(&nvmet_config_sem);
kfree(link);
return ret;
}
static int nvmet_allowed_hosts_drop_link(struct config_item *parent,
struct config_item *target)
{
struct nvmet_subsys *subsys = to_subsys(parent->ci_parent);
struct nvmet_host *host = to_host(target);
struct nvmet_host_link *p;
down_write(&nvmet_config_sem);
list_for_each_entry(p, &subsys->hosts, entry) {
if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host)))
goto found;
}
up_write(&nvmet_config_sem);
return -EINVAL;
found:
list_del(&p->entry);
nvmet_genctr++;
up_write(&nvmet_config_sem);
kfree(p);
return 0;
}
static struct configfs_item_operations nvmet_allowed_hosts_item_ops = {
.allow_link = nvmet_allowed_hosts_allow_link,
.drop_link = nvmet_allowed_hosts_drop_link,
};
static struct config_item_type nvmet_allowed_hosts_type = {
.ct_item_ops = &nvmet_allowed_hosts_item_ops,
.ct_owner = THIS_MODULE,
};
static ssize_t nvmet_subsys_attr_allow_any_host_show(struct config_item *item,
char *page)
{
return snprintf(page, PAGE_SIZE, "%d\n",
to_subsys(item)->allow_any_host);
}
static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_subsys *subsys = to_subsys(item);
bool allow_any_host;
int ret = 0;
if (strtobool(page, &allow_any_host))
return -EINVAL;
down_write(&nvmet_config_sem);
if (allow_any_host && !list_empty(&subsys->hosts)) {
pr_err("Can't set allow_any_host when explicit hosts are set!\n");
ret = -EINVAL;
goto out_unlock;
}
subsys->allow_any_host = allow_any_host;
out_unlock:
up_write(&nvmet_config_sem);
return ret ? ret : count;
}
CONFIGFS_ATTR(nvmet_subsys_, attr_allow_any_host);
static struct configfs_attribute *nvmet_subsys_attrs[] = {
&nvmet_subsys_attr_attr_allow_any_host,
NULL,
};
/*
* Subsystem structures & folder operation functions below
*/
static void nvmet_subsys_release(struct config_item *item)
{
struct nvmet_subsys *subsys = to_subsys(item);
nvmet_subsys_put(subsys);
}
static struct configfs_item_operations nvmet_subsys_item_ops = {
.release = nvmet_subsys_release,
};
static struct config_item_type nvmet_subsys_type = {
.ct_item_ops = &nvmet_subsys_item_ops,
.ct_attrs = nvmet_subsys_attrs,
.ct_owner = THIS_MODULE,
};
static struct config_group *nvmet_subsys_make(struct config_group *group,
const char *name)
{
struct nvmet_subsys *subsys;
if (sysfs_streq(name, NVME_DISC_SUBSYS_NAME)) {
pr_err("can't create discovery subsystem through configfs\n");
return ERR_PTR(-EINVAL);
}
subsys = nvmet_subsys_alloc(name, NVME_NQN_NVME);
if (!subsys)
return ERR_PTR(-ENOMEM);
config_group_init_type_name(&subsys->group, name, &nvmet_subsys_type);
config_group_init_type_name(&subsys->namespaces_group,
"namespaces", &nvmet_namespaces_type);
configfs_add_default_group(&subsys->namespaces_group, &subsys->group);
config_group_init_type_name(&subsys->allowed_hosts_group,
"allowed_hosts", &nvmet_allowed_hosts_type);
configfs_add_default_group(&subsys->allowed_hosts_group,
&subsys->group);
return &subsys->group;
}
static struct configfs_group_operations nvmet_subsystems_group_ops = {
.make_group = nvmet_subsys_make,
};
static struct config_item_type nvmet_subsystems_type = {
.ct_group_ops = &nvmet_subsystems_group_ops,
.ct_owner = THIS_MODULE,
};
static ssize_t nvmet_referral_enable_show(struct config_item *item,
char *page)
{
return snprintf(page, PAGE_SIZE, "%d\n", to_nvmet_port(item)->enabled);
}
static ssize_t nvmet_referral_enable_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent);
struct nvmet_port *port = to_nvmet_port(item);
bool enable;
if (strtobool(page, &enable))
goto inval;
if (enable)
nvmet_referral_enable(parent, port);
else
nvmet_referral_disable(port);
return count;
inval:
pr_err("Invalid value '%s' for enable\n", page);
return -EINVAL;
}
CONFIGFS_ATTR(nvmet_referral_, enable);
/*
* Discovery Service subsystem definitions
*/
static struct configfs_attribute *nvmet_referral_attrs[] = {
&nvmet_attr_addr_adrfam,
&nvmet_attr_addr_portid,
&nvmet_attr_addr_treq,
&nvmet_attr_addr_traddr,
&nvmet_attr_addr_trsvcid,
&nvmet_attr_addr_trtype,
&nvmet_referral_attr_enable,
NULL,
};
static void nvmet_referral_release(struct config_item *item)
{
struct nvmet_port *port = to_nvmet_port(item);
nvmet_referral_disable(port);
kfree(port);
}
static struct configfs_item_operations nvmet_referral_item_ops = {
.release = nvmet_referral_release,
};
static struct config_item_type nvmet_referral_type = {
.ct_owner = THIS_MODULE,
.ct_attrs = nvmet_referral_attrs,
.ct_item_ops = &nvmet_referral_item_ops,
};
static struct config_group *nvmet_referral_make(
struct config_group *group, const char *name)
{
struct nvmet_port *port;
port = kzalloc(sizeof(*port), GFP_KERNEL);
if (!port)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&port->entry);
config_group_init_type_name(&port->group, name, &nvmet_referral_type);
return &port->group;
}
static struct configfs_group_operations nvmet_referral_group_ops = {
.make_group = nvmet_referral_make,
};
static struct config_item_type nvmet_referrals_type = {
.ct_owner = THIS_MODULE,
.ct_group_ops = &nvmet_referral_group_ops,
};
/*
* Ports definitions.
*/
static void nvmet_port_release(struct config_item *item)
{
struct nvmet_port *port = to_nvmet_port(item);
kfree(port);
}
static struct configfs_attribute *nvmet_port_attrs[] = {
&nvmet_attr_addr_adrfam,
&nvmet_attr_addr_treq,
&nvmet_attr_addr_traddr,
&nvmet_attr_addr_trsvcid,
&nvmet_attr_addr_trtype,
NULL,
};
static struct configfs_item_operations nvmet_port_item_ops = {
.release = nvmet_port_release,
};
static struct config_item_type nvmet_port_type = {
.ct_attrs = nvmet_port_attrs,
.ct_item_ops = &nvmet_port_item_ops,
.ct_owner = THIS_MODULE,
};
static struct config_group *nvmet_ports_make(struct config_group *group,
const char *name)
{
struct nvmet_port *port;
u16 portid;
if (kstrtou16(name, 0, &portid))
return ERR_PTR(-EINVAL);
port = kzalloc(sizeof(*port), GFP_KERNEL);
if (!port)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&port->entry);
INIT_LIST_HEAD(&port->subsystems);
INIT_LIST_HEAD(&port->referrals);
port->disc_addr.portid = cpu_to_le16(portid);
config_group_init_type_name(&port->group, name, &nvmet_port_type);
config_group_init_type_name(&port->subsys_group,
"subsystems", &nvmet_port_subsys_type);
configfs_add_default_group(&port->subsys_group, &port->group);
config_group_init_type_name(&port->referrals_group,
"referrals", &nvmet_referrals_type);
configfs_add_default_group(&port->referrals_group, &port->group);
return &port->group;
}
static struct configfs_group_operations nvmet_ports_group_ops = {
.make_group = nvmet_ports_make,
};
static struct config_item_type nvmet_ports_type = {
.ct_group_ops = &nvmet_ports_group_ops,
.ct_owner = THIS_MODULE,
};
static struct config_group nvmet_subsystems_group;
static struct config_group nvmet_ports_group;
static void nvmet_host_release(struct config_item *item)
{
struct nvmet_host *host = to_host(item);
kfree(host);
}
static struct configfs_item_operations nvmet_host_item_ops = {
.release = nvmet_host_release,
};
static struct config_item_type nvmet_host_type = {
.ct_item_ops = &nvmet_host_item_ops,
.ct_owner = THIS_MODULE,
};
static struct config_group *nvmet_hosts_make_group(struct config_group *group,
const char *name)
{
struct nvmet_host *host;
host = kzalloc(sizeof(*host), GFP_KERNEL);
if (!host)
return ERR_PTR(-ENOMEM);
config_group_init_type_name(&host->group, name, &nvmet_host_type);
return &host->group;
}
static struct configfs_group_operations nvmet_hosts_group_ops = {
.make_group = nvmet_hosts_make_group,
};
static struct config_item_type nvmet_hosts_type = {
.ct_group_ops = &nvmet_hosts_group_ops,
.ct_owner = THIS_MODULE,
};
static struct config_group nvmet_hosts_group;
static struct config_item_type nvmet_root_type = {
.ct_owner = THIS_MODULE,
};
static struct configfs_subsystem nvmet_configfs_subsystem = {
.su_group = {
.cg_item = {
.ci_namebuf = "nvmet",
.ci_type = &nvmet_root_type,
},
},
};
int __init nvmet_init_configfs(void)
{
int ret;
config_group_init(&nvmet_configfs_subsystem.su_group);
mutex_init(&nvmet_configfs_subsystem.su_mutex);
config_group_init_type_name(&nvmet_subsystems_group,
"subsystems", &nvmet_subsystems_type);
configfs_add_default_group(&nvmet_subsystems_group,
&nvmet_configfs_subsystem.su_group);
config_group_init_type_name(&nvmet_ports_group,
"ports", &nvmet_ports_type);
configfs_add_default_group(&nvmet_ports_group,
&nvmet_configfs_subsystem.su_group);
config_group_init_type_name(&nvmet_hosts_group,
"hosts", &nvmet_hosts_type);
configfs_add_default_group(&nvmet_hosts_group,
&nvmet_configfs_subsystem.su_group);
ret = configfs_register_subsystem(&nvmet_configfs_subsystem);
if (ret) {
pr_err("configfs_register_subsystem: %d\n", ret);
return ret;
}
return 0;
}
void __exit nvmet_exit_configfs(void)
{
configfs_unregister_subsystem(&nvmet_configfs_subsystem);
}

View File

@ -0,0 +1,964 @@
/*
* Common code for the NVMe target.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include "nvmet.h"
static struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
/*
* This read/write semaphore is used to synchronize access to configuration
* information on a target system that will result in discovery log page
* information change for at least one host.
* The full list of resources to protected by this semaphore is:
*
* - subsystems list
* - per-subsystem allowed hosts list
* - allow_any_host subsystem attribute
* - nvmet_genctr
* - the nvmet_transports array
*
* When updating any of those lists/structures write lock should be obtained,
* while when reading (popolating discovery log page or checking host-subsystem
* link) read lock is obtained to allow concurrent reads.
*/
DECLARE_RWSEM(nvmet_config_sem);
static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
const char *subsysnqn);
u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
size_t len)
{
if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len)
return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
return 0;
}
u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
{
if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len)
return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
return 0;
}
static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
{
return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
}
static void nvmet_async_events_free(struct nvmet_ctrl *ctrl)
{
struct nvmet_req *req;
while (1) {
mutex_lock(&ctrl->lock);
if (!ctrl->nr_async_event_cmds) {
mutex_unlock(&ctrl->lock);
return;
}
req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
mutex_unlock(&ctrl->lock);
nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_SC_DNR);
}
}
static void nvmet_async_event_work(struct work_struct *work)
{
struct nvmet_ctrl *ctrl =
container_of(work, struct nvmet_ctrl, async_event_work);
struct nvmet_async_event *aen;
struct nvmet_req *req;
while (1) {
mutex_lock(&ctrl->lock);
aen = list_first_entry_or_null(&ctrl->async_events,
struct nvmet_async_event, entry);
if (!aen || !ctrl->nr_async_event_cmds) {
mutex_unlock(&ctrl->lock);
return;
}
req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
nvmet_set_result(req, nvmet_async_event_result(aen));
list_del(&aen->entry);
kfree(aen);
mutex_unlock(&ctrl->lock);
nvmet_req_complete(req, 0);
}
}
static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
u8 event_info, u8 log_page)
{
struct nvmet_async_event *aen;
aen = kmalloc(sizeof(*aen), GFP_KERNEL);
if (!aen)
return;
aen->event_type = event_type;
aen->event_info = event_info;
aen->log_page = log_page;
mutex_lock(&ctrl->lock);
list_add_tail(&aen->entry, &ctrl->async_events);
mutex_unlock(&ctrl->lock);
schedule_work(&ctrl->async_event_work);
}
int nvmet_register_transport(struct nvmet_fabrics_ops *ops)
{
int ret = 0;
down_write(&nvmet_config_sem);
if (nvmet_transports[ops->type])
ret = -EINVAL;
else
nvmet_transports[ops->type] = ops;
up_write(&nvmet_config_sem);
return ret;
}
EXPORT_SYMBOL_GPL(nvmet_register_transport);
void nvmet_unregister_transport(struct nvmet_fabrics_ops *ops)
{
down_write(&nvmet_config_sem);
nvmet_transports[ops->type] = NULL;
up_write(&nvmet_config_sem);
}
EXPORT_SYMBOL_GPL(nvmet_unregister_transport);
int nvmet_enable_port(struct nvmet_port *port)
{
struct nvmet_fabrics_ops *ops;
int ret;
lockdep_assert_held(&nvmet_config_sem);
ops = nvmet_transports[port->disc_addr.trtype];
if (!ops) {
up_write(&nvmet_config_sem);
request_module("nvmet-transport-%d", port->disc_addr.trtype);
down_write(&nvmet_config_sem);
ops = nvmet_transports[port->disc_addr.trtype];
if (!ops) {
pr_err("transport type %d not supported\n",
port->disc_addr.trtype);
return -EINVAL;
}
}
if (!try_module_get(ops->owner))
return -EINVAL;
ret = ops->add_port(port);
if (ret) {
module_put(ops->owner);
return ret;
}
port->enabled = true;
return 0;
}
void nvmet_disable_port(struct nvmet_port *port)
{
struct nvmet_fabrics_ops *ops;
lockdep_assert_held(&nvmet_config_sem);
port->enabled = false;
ops = nvmet_transports[port->disc_addr.trtype];
ops->remove_port(port);
module_put(ops->owner);
}
static void nvmet_keep_alive_timer(struct work_struct *work)
{
struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work),
struct nvmet_ctrl, ka_work);
pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n",
ctrl->cntlid, ctrl->kato);
ctrl->ops->delete_ctrl(ctrl);
}
static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
{
pr_debug("ctrl %d start keep-alive timer for %d secs\n",
ctrl->cntlid, ctrl->kato);
INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer);
schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
}
static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
{
pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid);
cancel_delayed_work_sync(&ctrl->ka_work);
}
static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl,
__le32 nsid)
{
struct nvmet_ns *ns;
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
if (ns->nsid == le32_to_cpu(nsid))
return ns;
}
return NULL;
}
struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid)
{
struct nvmet_ns *ns;
rcu_read_lock();
ns = __nvmet_find_namespace(ctrl, nsid);
if (ns)
percpu_ref_get(&ns->ref);
rcu_read_unlock();
return ns;
}
static void nvmet_destroy_namespace(struct percpu_ref *ref)
{
struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref);
complete(&ns->disable_done);
}
void nvmet_put_namespace(struct nvmet_ns *ns)
{
percpu_ref_put(&ns->ref);
}
int nvmet_ns_enable(struct nvmet_ns *ns)
{
struct nvmet_subsys *subsys = ns->subsys;
struct nvmet_ctrl *ctrl;
int ret = 0;
mutex_lock(&subsys->lock);
if (!list_empty(&ns->dev_link))
goto out_unlock;
ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE,
NULL);
if (IS_ERR(ns->bdev)) {
pr_err("nvmet: failed to open block device %s: (%ld)\n",
ns->device_path, PTR_ERR(ns->bdev));
ret = PTR_ERR(ns->bdev);
ns->bdev = NULL;
goto out_unlock;
}
ns->size = i_size_read(ns->bdev->bd_inode);
ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));
ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
0, GFP_KERNEL);
if (ret)
goto out_blkdev_put;
if (ns->nsid > subsys->max_nsid)
subsys->max_nsid = ns->nsid;
/*
* The namespaces list needs to be sorted to simplify the implementation
* of the Identify Namepace List subcommand.
*/
if (list_empty(&subsys->namespaces)) {
list_add_tail_rcu(&ns->dev_link, &subsys->namespaces);
} else {
struct nvmet_ns *old;
list_for_each_entry_rcu(old, &subsys->namespaces, dev_link) {
BUG_ON(ns->nsid == old->nsid);
if (ns->nsid < old->nsid)
break;
}
list_add_tail_rcu(&ns->dev_link, &old->dev_link);
}
list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0);
ret = 0;
out_unlock:
mutex_unlock(&subsys->lock);
return ret;
out_blkdev_put:
blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ);
ns->bdev = NULL;
goto out_unlock;
}
void nvmet_ns_disable(struct nvmet_ns *ns)
{
struct nvmet_subsys *subsys = ns->subsys;
struct nvmet_ctrl *ctrl;
mutex_lock(&subsys->lock);
if (list_empty(&ns->dev_link)) {
mutex_unlock(&subsys->lock);
return;
}
list_del_init(&ns->dev_link);
mutex_unlock(&subsys->lock);
/*
* Now that we removed the namespaces from the lookup list, we
* can kill the per_cpu ref and wait for any remaining references
* to be dropped, as well as a RCU grace period for anyone only
* using the namepace under rcu_read_lock(). Note that we can't
* use call_rcu here as we need to ensure the namespaces have
* been fully destroyed before unloading the module.
*/
percpu_ref_kill(&ns->ref);
synchronize_rcu();
wait_for_completion(&ns->disable_done);
percpu_ref_exit(&ns->ref);
mutex_lock(&subsys->lock);
list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0);
if (ns->bdev)
blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ);
mutex_unlock(&subsys->lock);
}
void nvmet_ns_free(struct nvmet_ns *ns)
{
nvmet_ns_disable(ns);
kfree(ns->device_path);
kfree(ns);
}
struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
{
struct nvmet_ns *ns;
ns = kzalloc(sizeof(*ns), GFP_KERNEL);
if (!ns)
return NULL;
INIT_LIST_HEAD(&ns->dev_link);
init_completion(&ns->disable_done);
ns->nsid = nsid;
ns->subsys = subsys;
return ns;
}
static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
{
if (status)
nvmet_set_status(req, status);
/* XXX: need to fill in something useful for sq_head */
req->rsp->sq_head = 0;
if (likely(req->sq)) /* may happen during early failure */
req->rsp->sq_id = cpu_to_le16(req->sq->qid);
req->rsp->command_id = req->cmd->common.command_id;
if (req->ns)
nvmet_put_namespace(req->ns);
req->ops->queue_response(req);
}
void nvmet_req_complete(struct nvmet_req *req, u16 status)
{
__nvmet_req_complete(req, status);
percpu_ref_put(&req->sq->ref);
}
EXPORT_SYMBOL_GPL(nvmet_req_complete);
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
u16 qid, u16 size)
{
cq->qid = qid;
cq->size = size;
ctrl->cqs[qid] = cq;
}
void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
u16 qid, u16 size)
{
sq->qid = qid;
sq->size = size;
ctrl->sqs[qid] = sq;
}
void nvmet_sq_destroy(struct nvmet_sq *sq)
{
/*
* If this is the admin queue, complete all AERs so that our
* queue doesn't have outstanding requests on it.
*/
if (sq->ctrl && sq->ctrl->sqs && sq->ctrl->sqs[0] == sq)
nvmet_async_events_free(sq->ctrl);
percpu_ref_kill(&sq->ref);
wait_for_completion(&sq->free_done);
percpu_ref_exit(&sq->ref);
if (sq->ctrl) {
nvmet_ctrl_put(sq->ctrl);
sq->ctrl = NULL; /* allows reusing the queue later */
}
}
EXPORT_SYMBOL_GPL(nvmet_sq_destroy);
static void nvmet_sq_free(struct percpu_ref *ref)
{
struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
complete(&sq->free_done);
}
int nvmet_sq_init(struct nvmet_sq *sq)
{
int ret;
ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL);
if (ret) {
pr_err("percpu_ref init failed!\n");
return ret;
}
init_completion(&sq->free_done);
return 0;
}
EXPORT_SYMBOL_GPL(nvmet_sq_init);
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops)
{
u8 flags = req->cmd->common.flags;
u16 status;
req->cq = cq;
req->sq = sq;
req->ops = ops;
req->sg = NULL;
req->sg_cnt = 0;
req->rsp->status = 0;
/* no support for fused commands yet */
if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
goto fail;
}
/* either variant of SGLs is fine, as we don't support metadata */
if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF &&
(flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METASEG)) {
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
goto fail;
}
if (unlikely(!req->sq->ctrl))
/* will return an error for any Non-connect command: */
status = nvmet_parse_connect_cmd(req);
else if (likely(req->sq->qid != 0))
status = nvmet_parse_io_cmd(req);
else if (req->cmd->common.opcode == nvme_fabrics_command)
status = nvmet_parse_fabrics_cmd(req);
else if (req->sq->ctrl->subsys->type == NVME_NQN_DISC)
status = nvmet_parse_discovery_cmd(req);
else
status = nvmet_parse_admin_cmd(req);
if (status)
goto fail;
if (unlikely(!percpu_ref_tryget_live(&sq->ref))) {
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
goto fail;
}
return true;
fail:
__nvmet_req_complete(req, status);
return false;
}
EXPORT_SYMBOL_GPL(nvmet_req_init);
static inline bool nvmet_cc_en(u32 cc)
{
return cc & 0x1;
}
static inline u8 nvmet_cc_css(u32 cc)
{
return (cc >> 4) & 0x7;
}
static inline u8 nvmet_cc_mps(u32 cc)
{
return (cc >> 7) & 0xf;
}
static inline u8 nvmet_cc_ams(u32 cc)
{
return (cc >> 11) & 0x7;
}
static inline u8 nvmet_cc_shn(u32 cc)
{
return (cc >> 14) & 0x3;
}
static inline u8 nvmet_cc_iosqes(u32 cc)
{
return (cc >> 16) & 0xf;
}
static inline u8 nvmet_cc_iocqes(u32 cc)
{
return (cc >> 20) & 0xf;
}
static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
{
lockdep_assert_held(&ctrl->lock);
if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES ||
nvmet_cc_mps(ctrl->cc) != 0 ||
nvmet_cc_ams(ctrl->cc) != 0 ||
nvmet_cc_css(ctrl->cc) != 0) {
ctrl->csts = NVME_CSTS_CFS;
return;
}
ctrl->csts = NVME_CSTS_RDY;
}
static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
{
lockdep_assert_held(&ctrl->lock);
/* XXX: tear down queues? */
ctrl->csts &= ~NVME_CSTS_RDY;
ctrl->cc = 0;
}
void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new)
{
u32 old;
mutex_lock(&ctrl->lock);
old = ctrl->cc;
ctrl->cc = new;
if (nvmet_cc_en(new) && !nvmet_cc_en(old))
nvmet_start_ctrl(ctrl);
if (!nvmet_cc_en(new) && nvmet_cc_en(old))
nvmet_clear_ctrl(ctrl);
if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) {
nvmet_clear_ctrl(ctrl);
ctrl->csts |= NVME_CSTS_SHST_CMPLT;
}
if (!nvmet_cc_shn(new) && nvmet_cc_shn(old))
ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;
mutex_unlock(&ctrl->lock);
}
static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
{
/* command sets supported: NVMe command set: */
ctrl->cap = (1ULL << 37);
/* CC.EN timeout in 500msec units: */
ctrl->cap |= (15ULL << 24);
/* maximum queue entries supported: */
ctrl->cap |= NVMET_QUEUE_SIZE - 1;
}
u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
struct nvmet_req *req, struct nvmet_ctrl **ret)
{
struct nvmet_subsys *subsys;
struct nvmet_ctrl *ctrl;
u16 status = 0;
subsys = nvmet_find_get_subsys(req->port, subsysnqn);
if (!subsys) {
pr_warn("connect request for invalid subsystem %s!\n",
subsysnqn);
req->rsp->result = IPO_IATTR_CONNECT_DATA(subsysnqn);
return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
}
mutex_lock(&subsys->lock);
list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
if (ctrl->cntlid == cntlid) {
if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) {
pr_warn("hostnqn mismatch.\n");
continue;
}
if (!kref_get_unless_zero(&ctrl->ref))
continue;
*ret = ctrl;
goto out;
}
}
pr_warn("could not find controller %d for subsys %s / host %s\n",
cntlid, subsysnqn, hostnqn);
req->rsp->result = IPO_IATTR_CONNECT_DATA(cntlid);
status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
out:
mutex_unlock(&subsys->lock);
nvmet_subsys_put(subsys);
return status;
}
static bool __nvmet_host_allowed(struct nvmet_subsys *subsys,
const char *hostnqn)
{
struct nvmet_host_link *p;
if (subsys->allow_any_host)
return true;
list_for_each_entry(p, &subsys->hosts, entry) {
if (!strcmp(nvmet_host_name(p->host), hostnqn))
return true;
}
return false;
}
static bool nvmet_host_discovery_allowed(struct nvmet_req *req,
const char *hostnqn)
{
struct nvmet_subsys_link *s;
list_for_each_entry(s, &req->port->subsystems, entry) {
if (__nvmet_host_allowed(s->subsys, hostnqn))
return true;
}
return false;
}
bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
const char *hostnqn)
{
lockdep_assert_held(&nvmet_config_sem);
if (subsys->type == NVME_NQN_DISC)
return nvmet_host_discovery_allowed(req, hostnqn);
else
return __nvmet_host_allowed(subsys, hostnqn);
}
u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
{
struct nvmet_subsys *subsys;
struct nvmet_ctrl *ctrl;
int ret;
u16 status;
status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
subsys = nvmet_find_get_subsys(req->port, subsysnqn);
if (!subsys) {
pr_warn("connect request for invalid subsystem %s!\n",
subsysnqn);
req->rsp->result = IPO_IATTR_CONNECT_DATA(subsysnqn);
goto out;
}
status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
down_read(&nvmet_config_sem);
if (!nvmet_host_allowed(req, subsys, hostnqn)) {
pr_info("connect by host %s for subsystem %s not allowed\n",
hostnqn, subsysnqn);
req->rsp->result = IPO_IATTR_CONNECT_DATA(hostnqn);
up_read(&nvmet_config_sem);
goto out_put_subsystem;
}
up_read(&nvmet_config_sem);
status = NVME_SC_INTERNAL;
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
if (!ctrl)
goto out_put_subsystem;
mutex_init(&ctrl->lock);
nvmet_init_cap(ctrl);
INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
INIT_LIST_HEAD(&ctrl->async_events);
memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
kref_init(&ctrl->ref);
ctrl->subsys = subsys;
ctrl->cqs = kcalloc(subsys->max_qid + 1,
sizeof(struct nvmet_cq *),
GFP_KERNEL);
if (!ctrl->cqs)
goto out_free_ctrl;
ctrl->sqs = kcalloc(subsys->max_qid + 1,
sizeof(struct nvmet_sq *),
GFP_KERNEL);
if (!ctrl->sqs)
goto out_free_cqs;
ret = ida_simple_get(&subsys->cntlid_ida,
NVME_CNTLID_MIN, NVME_CNTLID_MAX,
GFP_KERNEL);
if (ret < 0) {
status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
goto out_free_sqs;
}
ctrl->cntlid = ret;
ctrl->ops = req->ops;
if (ctrl->subsys->type == NVME_NQN_DISC) {
/* Don't accept keep-alive timeout for discovery controllers */
if (kato) {
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
goto out_free_sqs;
}
/*
* Discovery controllers use some arbitrary high value in order
* to cleanup stale discovery sessions
*
* From the latest base diff RC:
* "The Keep Alive command is not supported by
* Discovery controllers. A transport may specify a
* fixed Discovery controller activity timeout value
* (e.g., 2 minutes). If no commands are received
* by a Discovery controller within that time
* period, the controller may perform the
* actions for Keep Alive Timer expiration".
*/
ctrl->kato = NVMET_DISC_KATO;
} else {
/* keep-alive timeout in seconds */
ctrl->kato = DIV_ROUND_UP(kato, 1000);
}
nvmet_start_keep_alive_timer(ctrl);
mutex_lock(&subsys->lock);
list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
mutex_unlock(&subsys->lock);
*ctrlp = ctrl;
return 0;
out_free_sqs:
kfree(ctrl->sqs);
out_free_cqs:
kfree(ctrl->cqs);
out_free_ctrl:
kfree(ctrl);
out_put_subsystem:
nvmet_subsys_put(subsys);
out:
return status;
}
static void nvmet_ctrl_free(struct kref *ref)
{
struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
struct nvmet_subsys *subsys = ctrl->subsys;
nvmet_stop_keep_alive_timer(ctrl);
mutex_lock(&subsys->lock);
list_del(&ctrl->subsys_entry);
mutex_unlock(&subsys->lock);
ida_simple_remove(&subsys->cntlid_ida, ctrl->cntlid);
nvmet_subsys_put(subsys);
kfree(ctrl->sqs);
kfree(ctrl->cqs);
kfree(ctrl);
}
void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
{
kref_put(&ctrl->ref, nvmet_ctrl_free);
}
static void nvmet_fatal_error_handler(struct work_struct *work)
{
struct nvmet_ctrl *ctrl =
container_of(work, struct nvmet_ctrl, fatal_err_work);
pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid);
ctrl->ops->delete_ctrl(ctrl);
}
void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl)
{
ctrl->csts |= NVME_CSTS_CFS;
INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler);
schedule_work(&ctrl->fatal_err_work);
}
EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error);
static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
const char *subsysnqn)
{
struct nvmet_subsys_link *p;
if (!port)
return NULL;
if (!strncmp(NVME_DISC_SUBSYS_NAME, subsysnqn,
NVMF_NQN_SIZE)) {
if (!kref_get_unless_zero(&nvmet_disc_subsys->ref))
return NULL;
return nvmet_disc_subsys;
}
down_read(&nvmet_config_sem);
list_for_each_entry(p, &port->subsystems, entry) {
if (!strncmp(p->subsys->subsysnqn, subsysnqn,
NVMF_NQN_SIZE)) {
if (!kref_get_unless_zero(&p->subsys->ref))
break;
up_read(&nvmet_config_sem);
return p->subsys;
}
}
up_read(&nvmet_config_sem);
return NULL;
}
struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
enum nvme_subsys_type type)
{
struct nvmet_subsys *subsys;
subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
if (!subsys)
return NULL;
subsys->ver = (1 << 16) | (2 << 8) | 1; /* NVMe 1.2.1 */
switch (type) {
case NVME_NQN_NVME:
subsys->max_qid = NVMET_NR_QUEUES;
break;
case NVME_NQN_DISC:
subsys->max_qid = 0;
break;
default:
pr_err("%s: Unknown Subsystem type - %d\n", __func__, type);
kfree(subsys);
return NULL;
}
subsys->type = type;
subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE,
GFP_KERNEL);
if (!subsys->subsysnqn) {
kfree(subsys);
return NULL;
}
kref_init(&subsys->ref);
mutex_init(&subsys->lock);
INIT_LIST_HEAD(&subsys->namespaces);
INIT_LIST_HEAD(&subsys->ctrls);
ida_init(&subsys->cntlid_ida);
INIT_LIST_HEAD(&subsys->hosts);
return subsys;
}
static void nvmet_subsys_free(struct kref *ref)
{
struct nvmet_subsys *subsys =
container_of(ref, struct nvmet_subsys, ref);
WARN_ON_ONCE(!list_empty(&subsys->namespaces));
ida_destroy(&subsys->cntlid_ida);
kfree(subsys->subsysnqn);
kfree(subsys);
}
void nvmet_subsys_put(struct nvmet_subsys *subsys)
{
kref_put(&subsys->ref, nvmet_subsys_free);
}
static int __init nvmet_init(void)
{
int error;
error = nvmet_init_discovery();
if (error)
goto out;
error = nvmet_init_configfs();
if (error)
goto out_exit_discovery;
return 0;
out_exit_discovery:
nvmet_exit_discovery();
out:
return error;
}
static void __exit nvmet_exit(void)
{
nvmet_exit_configfs();
nvmet_exit_discovery();
BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
}
module_init(nvmet_init);
module_exit(nvmet_exit);
MODULE_LICENSE("GPL v2");

View File

@ -0,0 +1,221 @@
/*
* Discovery service for the NVMe over Fabrics target.
* Copyright (C) 2016 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/slab.h>
#include <generated/utsrelease.h>
#include "nvmet.h"
struct nvmet_subsys *nvmet_disc_subsys;
u64 nvmet_genctr;
void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port)
{
down_write(&nvmet_config_sem);
if (list_empty(&port->entry)) {
list_add_tail(&port->entry, &parent->referrals);
port->enabled = true;
nvmet_genctr++;
}
up_write(&nvmet_config_sem);
}
void nvmet_referral_disable(struct nvmet_port *port)
{
down_write(&nvmet_config_sem);
if (!list_empty(&port->entry)) {
port->enabled = false;
list_del_init(&port->entry);
nvmet_genctr++;
}
up_write(&nvmet_config_sem);
}
static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr,
struct nvmet_port *port, char *subsys_nqn, u8 type, u32 numrec)
{
struct nvmf_disc_rsp_page_entry *e = &hdr->entries[numrec];
e->trtype = port->disc_addr.trtype;
e->adrfam = port->disc_addr.adrfam;
e->treq = port->disc_addr.treq;
e->portid = port->disc_addr.portid;
/* we support only dynamic controllers */
e->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC);
e->asqsz = cpu_to_le16(NVMF_AQ_DEPTH);
e->nqntype = type;
memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE);
memcpy(e->traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE);
memcpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);
}
static void nvmet_execute_get_disc_log_page(struct nvmet_req *req)
{
const int entry_size = sizeof(struct nvmf_disc_rsp_page_entry);
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvmf_disc_rsp_page_hdr *hdr;
size_t data_len = nvmet_get_log_page_len(req->cmd);
size_t alloc_len = max(data_len, sizeof(*hdr));
int residual_len = data_len - sizeof(*hdr);
struct nvmet_subsys_link *p;
struct nvmet_port *r;
u32 numrec = 0;
u16 status = 0;
/*
* Make sure we're passing at least a buffer of response header size.
* If host provided data len is less than the header size, only the
* number of bytes requested by host will be sent to host.
*/
hdr = kzalloc(alloc_len, GFP_KERNEL);
if (!hdr) {
status = NVME_SC_INTERNAL;
goto out;
}
down_read(&nvmet_config_sem);
list_for_each_entry(p, &req->port->subsystems, entry) {
if (!nvmet_host_allowed(req, p->subsys, ctrl->hostnqn))
continue;
if (residual_len >= entry_size) {
nvmet_format_discovery_entry(hdr, req->port,
p->subsys->subsysnqn,
NVME_NQN_NVME, numrec);
residual_len -= entry_size;
}
numrec++;
}
list_for_each_entry(r, &req->port->referrals, entry) {
if (residual_len >= entry_size) {
nvmet_format_discovery_entry(hdr, r,
NVME_DISC_SUBSYS_NAME,
NVME_NQN_DISC, numrec);
residual_len -= entry_size;
}
numrec++;
}
hdr->genctr = cpu_to_le64(nvmet_genctr);
hdr->numrec = cpu_to_le64(numrec);
hdr->recfmt = cpu_to_le16(0);
up_read(&nvmet_config_sem);
status = nvmet_copy_to_sgl(req, 0, hdr, data_len);
kfree(hdr);
out:
nvmet_req_complete(req, status);
}
static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvme_id_ctrl *id;
u16 status = 0;
id = kzalloc(sizeof(*id), GFP_KERNEL);
if (!id) {
status = NVME_SC_INTERNAL;
goto out;
}
memset(id->fr, ' ', sizeof(id->fr));
strncpy((char *)id->fr, UTS_RELEASE, sizeof(id->fr));
/* no limit on data transfer sizes for now */
id->mdts = 0;
id->cntlid = cpu_to_le16(ctrl->cntlid);
id->ver = cpu_to_le32(ctrl->subsys->ver);
id->lpa = (1 << 2);
/* no enforcement soft-limit for maxcmd - pick arbitrary high value */
id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
if (ctrl->ops->has_keyed_sgls)
id->sgls |= cpu_to_le32(1 << 2);
if (ctrl->ops->sqe_inline_size)
id->sgls |= cpu_to_le32(1 << 20);
strcpy(id->subnqn, ctrl->subsys->subsysnqn);
status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
kfree(id);
out:
nvmet_req_complete(req, status);
}
int nvmet_parse_discovery_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
req->ns = NULL;
if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
pr_err("nvmet: got cmd %d while not ready\n",
cmd->common.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
switch (cmd->common.opcode) {
case nvme_admin_get_log_page:
req->data_len = nvmet_get_log_page_len(cmd);
switch (cmd->get_log_page.lid) {
case NVME_LOG_DISC:
req->execute = nvmet_execute_get_disc_log_page;
return 0;
default:
pr_err("nvmet: unsupported get_log_page lid %d\n",
cmd->get_log_page.lid);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
case nvme_admin_identify:
req->data_len = 4096;
switch (le32_to_cpu(cmd->identify.cns)) {
case 0x01:
req->execute =
nvmet_execute_identify_disc_ctrl;
return 0;
default:
pr_err("nvmet: unsupported identify cns %d\n",
le32_to_cpu(cmd->identify.cns));
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
default:
pr_err("nvmet: unsupported cmd %d\n",
cmd->common.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
int __init nvmet_init_discovery(void)
{
nvmet_disc_subsys =
nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC);
if (!nvmet_disc_subsys)
return -ENOMEM;
return 0;
}
void nvmet_exit_discovery(void)
{
nvmet_subsys_put(nvmet_disc_subsys);
}

View File

@ -0,0 +1,240 @@
/*
* NVMe Fabrics command implementation.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/blkdev.h>
#include "nvmet.h"
static void nvmet_execute_prop_set(struct nvmet_req *req)
{
u16 status = 0;
if (!(req->cmd->prop_set.attrib & 1)) {
u64 val = le64_to_cpu(req->cmd->prop_set.value);
switch (le32_to_cpu(req->cmd->prop_set.offset)) {
case NVME_REG_CC:
nvmet_update_cc(req->sq->ctrl, val);
break;
default:
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
break;
}
} else {
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
}
nvmet_req_complete(req, status);
}
static void nvmet_execute_prop_get(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
u16 status = 0;
u64 val = 0;
if (req->cmd->prop_get.attrib & 1) {
switch (le32_to_cpu(req->cmd->prop_get.offset)) {
case NVME_REG_CAP:
val = ctrl->cap;
break;
default:
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
break;
}
} else {
switch (le32_to_cpu(req->cmd->prop_get.offset)) {
case NVME_REG_VS:
val = ctrl->subsys->ver;
break;
case NVME_REG_CC:
val = ctrl->cc;
break;
case NVME_REG_CSTS:
val = ctrl->csts;
break;
default:
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
break;
}
}
req->rsp->result64 = cpu_to_le64(val);
nvmet_req_complete(req, status);
}
int nvmet_parse_fabrics_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
req->ns = NULL;
switch (cmd->fabrics.fctype) {
case nvme_fabrics_type_property_set:
req->data_len = 0;
req->execute = nvmet_execute_prop_set;
break;
case nvme_fabrics_type_property_get:
req->data_len = 0;
req->execute = nvmet_execute_prop_get;
break;
default:
pr_err("received unknown capsule type 0x%x\n",
cmd->fabrics.fctype);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
return 0;
}
static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
{
struct nvmf_connect_command *c = &req->cmd->connect;
u16 qid = le16_to_cpu(c->qid);
u16 sqsize = le16_to_cpu(c->sqsize);
struct nvmet_ctrl *old;
old = cmpxchg(&req->sq->ctrl, NULL, ctrl);
if (old) {
pr_warn("queue already connected!\n");
return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
}
nvmet_cq_setup(ctrl, req->cq, qid, sqsize);
nvmet_sq_setup(ctrl, req->sq, qid, sqsize);
return 0;
}
static void nvmet_execute_admin_connect(struct nvmet_req *req)
{
struct nvmf_connect_command *c = &req->cmd->connect;
struct nvmf_connect_data *d;
struct nvmet_ctrl *ctrl = NULL;
u16 status = 0;
d = kmap(sg_page(req->sg)) + req->sg->offset;
/* zero out initial completion result, assign values as needed */
req->rsp->result = 0;
if (c->recfmt != 0) {
pr_warn("invalid connect version (%d).\n",
le16_to_cpu(c->recfmt));
status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR;
goto out;
}
if (unlikely(d->cntlid != cpu_to_le16(0xffff))) {
pr_warn("connect attempt for invalid controller ID %#x\n",
d->cntlid);
status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
req->rsp->result = IPO_IATTR_CONNECT_DATA(cntlid);
goto out;
}
status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req,
le32_to_cpu(c->kato), &ctrl);
if (status)
goto out;
status = nvmet_install_queue(ctrl, req);
if (status) {
nvmet_ctrl_put(ctrl);
goto out;
}
pr_info("creating controller %d for NQN %s.\n",
ctrl->cntlid, ctrl->hostnqn);
req->rsp->result16 = cpu_to_le16(ctrl->cntlid);
out:
kunmap(sg_page(req->sg));
nvmet_req_complete(req, status);
}
static void nvmet_execute_io_connect(struct nvmet_req *req)
{
struct nvmf_connect_command *c = &req->cmd->connect;
struct nvmf_connect_data *d;
struct nvmet_ctrl *ctrl = NULL;
u16 qid = le16_to_cpu(c->qid);
u16 status = 0;
d = kmap(sg_page(req->sg)) + req->sg->offset;
/* zero out initial completion result, assign values as needed */
req->rsp->result = 0;
if (c->recfmt != 0) {
pr_warn("invalid connect version (%d).\n",
le16_to_cpu(c->recfmt));
status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR;
goto out;
}
status = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn,
le16_to_cpu(d->cntlid),
req, &ctrl);
if (status)
goto out;
if (unlikely(qid > ctrl->subsys->max_qid)) {
pr_warn("invalid queue id (%d)\n", qid);
status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
req->rsp->result = IPO_IATTR_CONNECT_SQE(qid);
goto out_ctrl_put;
}
status = nvmet_install_queue(ctrl, req);
if (status) {
/* pass back cntlid that had the issue of installing queue */
req->rsp->result16 = cpu_to_le16(ctrl->cntlid);
goto out_ctrl_put;
}
pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
out:
kunmap(sg_page(req->sg));
nvmet_req_complete(req, status);
return;
out_ctrl_put:
nvmet_ctrl_put(ctrl);
goto out;
}
int nvmet_parse_connect_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
req->ns = NULL;
if (req->cmd->common.opcode != nvme_fabrics_command) {
pr_err("invalid command 0x%x on unconnected queue.\n",
cmd->fabrics.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
if (cmd->fabrics.fctype != nvme_fabrics_type_connect) {
pr_err("invalid capsule type 0x%x on unconnected queue.\n",
cmd->fabrics.fctype);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
req->data_len = sizeof(struct nvmf_connect_data);
if (cmd->connect.qid == 0)
req->execute = nvmet_execute_admin_connect;
else
req->execute = nvmet_execute_io_connect;
return 0;
}

View File

@ -0,0 +1,215 @@
/*
* NVMe I/O command implementation.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/blkdev.h>
#include <linux/module.h>
#include "nvmet.h"
static void nvmet_bio_done(struct bio *bio)
{
struct nvmet_req *req = bio->bi_private;
nvmet_req_complete(req,
bio->bi_error ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
if (bio != &req->inline_bio)
bio_put(bio);
}
static inline u32 nvmet_rw_len(struct nvmet_req *req)
{
return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) <<
req->ns->blksize_shift;
}
static void nvmet_inline_bio_init(struct nvmet_req *req)
{
struct bio *bio = &req->inline_bio;
bio_init(bio);
bio->bi_max_vecs = NVMET_MAX_INLINE_BIOVEC;
bio->bi_io_vec = req->inline_bvec;
}
static void nvmet_execute_rw(struct nvmet_req *req)
{
int sg_cnt = req->sg_cnt;
struct scatterlist *sg;
struct bio *bio;
sector_t sector;
blk_qc_t cookie;
int op, op_flags = 0, i;
if (!req->sg_cnt) {
nvmet_req_complete(req, 0);
return;
}
if (req->cmd->rw.opcode == nvme_cmd_write) {
op = REQ_OP_WRITE;
if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
op_flags |= REQ_FUA;
} else {
op = REQ_OP_READ;
}
sector = le64_to_cpu(req->cmd->rw.slba);
sector <<= (req->ns->blksize_shift - 9);
nvmet_inline_bio_init(req);
bio = &req->inline_bio;
bio->bi_bdev = req->ns->bdev;
bio->bi_iter.bi_sector = sector;
bio->bi_private = req;
bio->bi_end_io = nvmet_bio_done;
bio_set_op_attrs(bio, op, op_flags);
for_each_sg(req->sg, sg, req->sg_cnt, i) {
while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset)
!= sg->length) {
struct bio *prev = bio;
bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
bio->bi_bdev = req->ns->bdev;
bio->bi_iter.bi_sector = sector;
bio_set_op_attrs(bio, op, op_flags);
bio_chain(bio, prev);
cookie = submit_bio(prev);
}
sector += sg->length >> 9;
sg_cnt--;
}
cookie = submit_bio(bio);
blk_poll(bdev_get_queue(req->ns->bdev), cookie);
}
static void nvmet_execute_flush(struct nvmet_req *req)
{
struct bio *bio;
nvmet_inline_bio_init(req);
bio = &req->inline_bio;
bio->bi_bdev = req->ns->bdev;
bio->bi_private = req;
bio->bi_end_io = nvmet_bio_done;
bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
submit_bio(bio);
}
static u16 nvmet_discard_range(struct nvmet_ns *ns,
struct nvme_dsm_range *range, struct bio **bio)
{
if (__blkdev_issue_discard(ns->bdev,
le64_to_cpu(range->slba) << (ns->blksize_shift - 9),
le32_to_cpu(range->nlb) << (ns->blksize_shift - 9),
GFP_KERNEL, 0, bio))
return NVME_SC_INTERNAL | NVME_SC_DNR;
return 0;
}
static void nvmet_execute_discard(struct nvmet_req *req)
{
struct nvme_dsm_range range;
struct bio *bio = NULL;
int i;
u16 status;
for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) {
status = nvmet_copy_from_sgl(req, i * sizeof(range), &range,
sizeof(range));
if (status)
break;
status = nvmet_discard_range(req->ns, &range, &bio);
if (status)
break;
}
if (bio) {
bio->bi_private = req;
bio->bi_end_io = nvmet_bio_done;
if (status) {
bio->bi_error = -EIO;
bio_endio(bio);
} else {
submit_bio(bio);
}
} else {
nvmet_req_complete(req, status);
}
}
static void nvmet_execute_dsm(struct nvmet_req *req)
{
switch (le32_to_cpu(req->cmd->dsm.attributes)) {
case NVME_DSMGMT_AD:
nvmet_execute_discard(req);
return;
case NVME_DSMGMT_IDR:
case NVME_DSMGMT_IDW:
default:
/* Not supported yet */
nvmet_req_complete(req, 0);
return;
}
}
int nvmet_parse_io_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
pr_err("nvmet: got io cmd %d while CC.EN == 0\n",
cmd->common.opcode);
req->ns = NULL;
return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
}
if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
pr_err("nvmet: got io cmd %d while CSTS.RDY == 0\n",
cmd->common.opcode);
req->ns = NULL;
return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
}
req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
if (!req->ns)
return NVME_SC_INVALID_NS | NVME_SC_DNR;
switch (cmd->common.opcode) {
case nvme_cmd_read:
case nvme_cmd_write:
req->execute = nvmet_execute_rw;
req->data_len = nvmet_rw_len(req);
return 0;
case nvme_cmd_flush:
req->execute = nvmet_execute_flush;
req->data_len = 0;
return 0;
case nvme_cmd_dsm:
req->execute = nvmet_execute_dsm;
req->data_len = le32_to_cpu(cmd->dsm.nr) *
sizeof(struct nvme_dsm_range);
return 0;
default:
pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
}

View File

@ -0,0 +1,754 @@
/*
* NVMe over Fabrics loopback device.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/scatterlist.h>
#include <linux/delay.h>
#include <linux/blk-mq.h>
#include <linux/nvme.h>
#include <linux/module.h>
#include <linux/parser.h>
#include <linux/t10-pi.h>
#include "nvmet.h"
#include "../host/nvme.h"
#include "../host/fabrics.h"
#define NVME_LOOP_AQ_DEPTH 256
#define NVME_LOOP_MAX_SEGMENTS 256
/*
* We handle AEN commands ourselves and don't even let the
* block layer know about them.
*/
#define NVME_LOOP_NR_AEN_COMMANDS 1
#define NVME_LOOP_AQ_BLKMQ_DEPTH \
(NVME_LOOP_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS)
struct nvme_loop_iod {
struct nvme_command cmd;
struct nvme_completion rsp;
struct nvmet_req req;
struct nvme_loop_queue *queue;
struct work_struct work;
struct sg_table sg_table;
struct scatterlist first_sgl[];
};
struct nvme_loop_ctrl {
spinlock_t lock;
struct nvme_loop_queue *queues;
u32 queue_count;
struct blk_mq_tag_set admin_tag_set;
struct list_head list;
u64 cap;
struct blk_mq_tag_set tag_set;
struct nvme_loop_iod async_event_iod;
struct nvme_ctrl ctrl;
struct nvmet_ctrl *target_ctrl;
struct work_struct delete_work;
struct work_struct reset_work;
};
static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl)
{
return container_of(ctrl, struct nvme_loop_ctrl, ctrl);
}
struct nvme_loop_queue {
struct nvmet_cq nvme_cq;
struct nvmet_sq nvme_sq;
struct nvme_loop_ctrl *ctrl;
};
static struct nvmet_port *nvmet_loop_port;
static LIST_HEAD(nvme_loop_ctrl_list);
static DEFINE_MUTEX(nvme_loop_ctrl_mutex);
static void nvme_loop_queue_response(struct nvmet_req *nvme_req);
static void nvme_loop_delete_ctrl(struct nvmet_ctrl *ctrl);
static struct nvmet_fabrics_ops nvme_loop_ops;
static inline int nvme_loop_queue_idx(struct nvme_loop_queue *queue)
{
return queue - queue->ctrl->queues;
}
static void nvme_loop_complete_rq(struct request *req)
{
struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req);
int error = 0;
nvme_cleanup_cmd(req);
sg_free_table_chained(&iod->sg_table, true);
if (unlikely(req->errors)) {
if (nvme_req_needs_retry(req, req->errors)) {
nvme_requeue_req(req);
return;
}
if (req->cmd_type == REQ_TYPE_DRV_PRIV)
error = req->errors;
else
error = nvme_error_status(req->errors);
}
blk_mq_end_request(req, error);
}
static void nvme_loop_queue_response(struct nvmet_req *nvme_req)
{
struct nvme_loop_iod *iod =
container_of(nvme_req, struct nvme_loop_iod, req);
struct nvme_completion *cqe = &iod->rsp;
/*
* AEN requests are special as they don't time out and can
* survive any kind of queue freeze and often don't respond to
* aborts. We don't even bother to allocate a struct request
* for them but rather special case them here.
*/
if (unlikely(nvme_loop_queue_idx(iod->queue) == 0 &&
cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) {
nvme_complete_async_event(&iod->queue->ctrl->ctrl, cqe);
} else {
struct request *req = blk_mq_rq_from_pdu(iod);
if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special)
memcpy(req->special, cqe, sizeof(*cqe));
blk_mq_complete_request(req, le16_to_cpu(cqe->status) >> 1);
}
}
static void nvme_loop_execute_work(struct work_struct *work)
{
struct nvme_loop_iod *iod =
container_of(work, struct nvme_loop_iod, work);
iod->req.execute(&iod->req);
}
static enum blk_eh_timer_return
nvme_loop_timeout(struct request *rq, bool reserved)
{
struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(rq);
/* queue error recovery */
schedule_work(&iod->queue->ctrl->reset_work);
/* fail with DNR on admin cmd timeout */
rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
return BLK_EH_HANDLED;
}
static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct nvme_ns *ns = hctx->queue->queuedata;
struct nvme_loop_queue *queue = hctx->driver_data;
struct request *req = bd->rq;
struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req);
int ret;
ret = nvme_setup_cmd(ns, req, &iod->cmd);
if (ret)
return ret;
iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
iod->req.port = nvmet_loop_port;
if (!nvmet_req_init(&iod->req, &queue->nvme_cq,
&queue->nvme_sq, &nvme_loop_ops)) {
nvme_cleanup_cmd(req);
blk_mq_start_request(req);
nvme_loop_queue_response(&iod->req);
return 0;
}
if (blk_rq_bytes(req)) {
iod->sg_table.sgl = iod->first_sgl;
ret = sg_alloc_table_chained(&iod->sg_table,
req->nr_phys_segments, iod->sg_table.sgl);
if (ret)
return BLK_MQ_RQ_QUEUE_BUSY;
iod->req.sg = iod->sg_table.sgl;
iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl);
BUG_ON(iod->req.sg_cnt > req->nr_phys_segments);
}
iod->cmd.common.command_id = req->tag;
blk_mq_start_request(req);
schedule_work(&iod->work);
return 0;
}
static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
{
struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg);
struct nvme_loop_queue *queue = &ctrl->queues[0];
struct nvme_loop_iod *iod = &ctrl->async_event_iod;
memset(&iod->cmd, 0, sizeof(iod->cmd));
iod->cmd.common.opcode = nvme_admin_async_event;
iod->cmd.common.command_id = NVME_LOOP_AQ_BLKMQ_DEPTH;
iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq,
&nvme_loop_ops)) {
dev_err(ctrl->ctrl.device, "failed async event work\n");
return;
}
schedule_work(&iod->work);
}
static int nvme_loop_init_iod(struct nvme_loop_ctrl *ctrl,
struct nvme_loop_iod *iod, unsigned int queue_idx)
{
BUG_ON(queue_idx >= ctrl->queue_count);
iod->req.cmd = &iod->cmd;
iod->req.rsp = &iod->rsp;
iod->queue = &ctrl->queues[queue_idx];
INIT_WORK(&iod->work, nvme_loop_execute_work);
return 0;
}
static int nvme_loop_init_request(void *data, struct request *req,
unsigned int hctx_idx, unsigned int rq_idx,
unsigned int numa_node)
{
return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), hctx_idx + 1);
}
static int nvme_loop_init_admin_request(void *data, struct request *req,
unsigned int hctx_idx, unsigned int rq_idx,
unsigned int numa_node)
{
return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), 0);
}
static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_loop_ctrl *ctrl = data;
struct nvme_loop_queue *queue = &ctrl->queues[hctx_idx + 1];
BUG_ON(hctx_idx >= ctrl->queue_count);
hctx->driver_data = queue;
return 0;
}
static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_loop_ctrl *ctrl = data;
struct nvme_loop_queue *queue = &ctrl->queues[0];
BUG_ON(hctx_idx != 0);
hctx->driver_data = queue;
return 0;
}
static struct blk_mq_ops nvme_loop_mq_ops = {
.queue_rq = nvme_loop_queue_rq,
.complete = nvme_loop_complete_rq,
.map_queue = blk_mq_map_queue,
.init_request = nvme_loop_init_request,
.init_hctx = nvme_loop_init_hctx,
.timeout = nvme_loop_timeout,
};
static struct blk_mq_ops nvme_loop_admin_mq_ops = {
.queue_rq = nvme_loop_queue_rq,
.complete = nvme_loop_complete_rq,
.map_queue = blk_mq_map_queue,
.init_request = nvme_loop_init_admin_request,
.init_hctx = nvme_loop_init_admin_hctx,
.timeout = nvme_loop_timeout,
};
static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
{
blk_cleanup_queue(ctrl->ctrl.admin_q);
blk_mq_free_tag_set(&ctrl->admin_tag_set);
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
}
static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl);
if (list_empty(&ctrl->list))
goto free_ctrl;
mutex_lock(&nvme_loop_ctrl_mutex);
list_del(&ctrl->list);
mutex_unlock(&nvme_loop_ctrl_mutex);
if (nctrl->tagset) {
blk_cleanup_queue(ctrl->ctrl.connect_q);
blk_mq_free_tag_set(&ctrl->tag_set);
}
kfree(ctrl->queues);
nvmf_free_options(nctrl->opts);
free_ctrl:
kfree(ctrl);
}
static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
{
int error;
memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops;
ctrl->admin_tag_set.queue_depth = NVME_LOOP_AQ_BLKMQ_DEPTH;
ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */
ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) +
SG_CHUNK_SIZE * sizeof(struct scatterlist);
ctrl->admin_tag_set.driver_data = ctrl;
ctrl->admin_tag_set.nr_hw_queues = 1;
ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
ctrl->queues[0].ctrl = ctrl;
error = nvmet_sq_init(&ctrl->queues[0].nvme_sq);
if (error)
return error;
ctrl->queue_count = 1;
error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
if (error)
goto out_free_sq;
ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.admin_q)) {
error = PTR_ERR(ctrl->ctrl.admin_q);
goto out_free_tagset;
}
error = nvmf_connect_admin_queue(&ctrl->ctrl);
if (error)
goto out_cleanup_queue;
error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
if (error) {
dev_err(ctrl->ctrl.device,
"prop_get NVME_REG_CAP failed\n");
goto out_cleanup_queue;
}
ctrl->ctrl.sqsize =
min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize);
error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
if (error)
goto out_cleanup_queue;
ctrl->ctrl.max_hw_sectors =
(NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9);
error = nvme_init_identify(&ctrl->ctrl);
if (error)
goto out_cleanup_queue;
nvme_start_keep_alive(&ctrl->ctrl);
return 0;
out_cleanup_queue:
blk_cleanup_queue(ctrl->ctrl.admin_q);
out_free_tagset:
blk_mq_free_tag_set(&ctrl->admin_tag_set);
out_free_sq:
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
return error;
}
static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
{
int i;
nvme_stop_keep_alive(&ctrl->ctrl);
if (ctrl->queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_cancel_request, &ctrl->ctrl);
for (i = 1; i < ctrl->queue_count; i++)
nvmet_sq_destroy(&ctrl->queues[i].nvme_sq);
}
if (ctrl->ctrl.state == NVME_CTRL_LIVE)
nvme_shutdown_ctrl(&ctrl->ctrl);
blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_cancel_request, &ctrl->ctrl);
nvme_loop_destroy_admin_queue(ctrl);
}
static void nvme_loop_del_ctrl_work(struct work_struct *work)
{
struct nvme_loop_ctrl *ctrl = container_of(work,
struct nvme_loop_ctrl, delete_work);
nvme_remove_namespaces(&ctrl->ctrl);
nvme_loop_shutdown_ctrl(ctrl);
nvme_uninit_ctrl(&ctrl->ctrl);
nvme_put_ctrl(&ctrl->ctrl);
}
static int __nvme_loop_del_ctrl(struct nvme_loop_ctrl *ctrl)
{
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
return -EBUSY;
if (!schedule_work(&ctrl->delete_work))
return -EBUSY;
return 0;
}
static int nvme_loop_del_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl);
int ret;
ret = __nvme_loop_del_ctrl(ctrl);
if (ret)
return ret;
flush_work(&ctrl->delete_work);
return 0;
}
static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl)
{
struct nvme_loop_ctrl *ctrl;
mutex_lock(&nvme_loop_ctrl_mutex);
list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) {
if (ctrl->ctrl.cntlid == nctrl->cntlid)
__nvme_loop_del_ctrl(ctrl);
}
mutex_unlock(&nvme_loop_ctrl_mutex);
}
static void nvme_loop_reset_ctrl_work(struct work_struct *work)
{
struct nvme_loop_ctrl *ctrl = container_of(work,
struct nvme_loop_ctrl, reset_work);
bool changed;
int i, ret;
nvme_loop_shutdown_ctrl(ctrl);
ret = nvme_loop_configure_admin_queue(ctrl);
if (ret)
goto out_disable;
for (i = 1; i <= ctrl->ctrl.opts->nr_io_queues; i++) {
ctrl->queues[i].ctrl = ctrl;
ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq);
if (ret)
goto out_free_queues;
ctrl->queue_count++;
}
for (i = 1; i <= ctrl->ctrl.opts->nr_io_queues; i++) {
ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
if (ret)
goto out_free_queues;
}
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
nvme_queue_scan(&ctrl->ctrl);
nvme_queue_async_events(&ctrl->ctrl);
nvme_start_queues(&ctrl->ctrl);
return;
out_free_queues:
for (i = 1; i < ctrl->queue_count; i++)
nvmet_sq_destroy(&ctrl->queues[i].nvme_sq);
nvme_loop_destroy_admin_queue(ctrl);
out_disable:
dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
nvme_remove_namespaces(&ctrl->ctrl);
nvme_uninit_ctrl(&ctrl->ctrl);
nvme_put_ctrl(&ctrl->ctrl);
}
static int nvme_loop_reset_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
return -EBUSY;
if (!schedule_work(&ctrl->reset_work))
return -EBUSY;
flush_work(&ctrl->reset_work);
return 0;
}
static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = {
.name = "loop",
.module = THIS_MODULE,
.is_fabrics = true,
.reg_read32 = nvmf_reg_read32,
.reg_read64 = nvmf_reg_read64,
.reg_write32 = nvmf_reg_write32,
.reset_ctrl = nvme_loop_reset_ctrl,
.free_ctrl = nvme_loop_free_ctrl,
.submit_async_event = nvme_loop_submit_async_event,
.delete_ctrl = nvme_loop_del_ctrl,
.get_subsysnqn = nvmf_get_subsysnqn,
};
static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
{
struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
int ret, i;
ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues);
if (ret || !opts->nr_io_queues)
return ret;
dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n",
opts->nr_io_queues);
for (i = 1; i <= opts->nr_io_queues; i++) {
ctrl->queues[i].ctrl = ctrl;
ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq);
if (ret)
goto out_destroy_queues;
ctrl->queue_count++;
}
memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
ctrl->tag_set.ops = &nvme_loop_mq_ops;
ctrl->tag_set.queue_depth = ctrl->ctrl.sqsize;
ctrl->tag_set.reserved_tags = 1; /* fabric connect */
ctrl->tag_set.numa_node = NUMA_NO_NODE;
ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) +
SG_CHUNK_SIZE * sizeof(struct scatterlist);
ctrl->tag_set.driver_data = ctrl;
ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
ctrl->ctrl.tagset = &ctrl->tag_set;
ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
if (ret)
goto out_destroy_queues;
ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
if (IS_ERR(ctrl->ctrl.connect_q)) {
ret = PTR_ERR(ctrl->ctrl.connect_q);
goto out_free_tagset;
}
for (i = 1; i <= opts->nr_io_queues; i++) {
ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
if (ret)
goto out_cleanup_connect_q;
}
return 0;
out_cleanup_connect_q:
blk_cleanup_queue(ctrl->ctrl.connect_q);
out_free_tagset:
blk_mq_free_tag_set(&ctrl->tag_set);
out_destroy_queues:
for (i = 1; i < ctrl->queue_count; i++)
nvmet_sq_destroy(&ctrl->queues[i].nvme_sq);
return ret;
}
static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
struct nvmf_ctrl_options *opts)
{
struct nvme_loop_ctrl *ctrl;
bool changed;
int ret;
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
if (!ctrl)
return ERR_PTR(-ENOMEM);
ctrl->ctrl.opts = opts;
INIT_LIST_HEAD(&ctrl->list);
INIT_WORK(&ctrl->delete_work, nvme_loop_del_ctrl_work);
INIT_WORK(&ctrl->reset_work, nvme_loop_reset_ctrl_work);
ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops,
0 /* no quirks, we're perfect! */);
if (ret)
goto out_put_ctrl;
spin_lock_init(&ctrl->lock);
ret = -ENOMEM;
ctrl->ctrl.sqsize = opts->queue_size;
ctrl->ctrl.kato = opts->kato;
ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues),
GFP_KERNEL);
if (!ctrl->queues)
goto out_uninit_ctrl;
ret = nvme_loop_configure_admin_queue(ctrl);
if (ret)
goto out_free_queues;
if (opts->queue_size > ctrl->ctrl.maxcmd) {
/* warn if maxcmd is lower than queue_size */
dev_warn(ctrl->ctrl.device,
"queue_size %zu > ctrl maxcmd %u, clamping down\n",
opts->queue_size, ctrl->ctrl.maxcmd);
opts->queue_size = ctrl->ctrl.maxcmd;
}
if (opts->nr_io_queues) {
ret = nvme_loop_create_io_queues(ctrl);
if (ret)
goto out_remove_admin_queue;
}
nvme_loop_init_iod(ctrl, &ctrl->async_event_iod, 0);
dev_info(ctrl->ctrl.device,
"new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn);
kref_get(&ctrl->ctrl.kref);
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
mutex_lock(&nvme_loop_ctrl_mutex);
list_add_tail(&ctrl->list, &nvme_loop_ctrl_list);
mutex_unlock(&nvme_loop_ctrl_mutex);
if (opts->nr_io_queues) {
nvme_queue_scan(&ctrl->ctrl);
nvme_queue_async_events(&ctrl->ctrl);
}
return &ctrl->ctrl;
out_remove_admin_queue:
nvme_loop_destroy_admin_queue(ctrl);
out_free_queues:
kfree(ctrl->queues);
out_uninit_ctrl:
nvme_uninit_ctrl(&ctrl->ctrl);
out_put_ctrl:
nvme_put_ctrl(&ctrl->ctrl);
if (ret > 0)
ret = -EIO;
return ERR_PTR(ret);
}
static int nvme_loop_add_port(struct nvmet_port *port)
{
/*
* XXX: disalow adding more than one port so
* there is no connection rejections when a
* a subsystem is assigned to a port for which
* loop doesn't have a pointer.
* This scenario would be possible if we allowed
* more than one port to be added and a subsystem
* was assigned to a port other than nvmet_loop_port.
*/
if (nvmet_loop_port)
return -EPERM;
nvmet_loop_port = port;
return 0;
}
static void nvme_loop_remove_port(struct nvmet_port *port)
{
if (port == nvmet_loop_port)
nvmet_loop_port = NULL;
}
static struct nvmet_fabrics_ops nvme_loop_ops = {
.owner = THIS_MODULE,
.type = NVMF_TRTYPE_LOOP,
.add_port = nvme_loop_add_port,
.remove_port = nvme_loop_remove_port,
.queue_response = nvme_loop_queue_response,
.delete_ctrl = nvme_loop_delete_ctrl,
};
static struct nvmf_transport_ops nvme_loop_transport = {
.name = "loop",
.create_ctrl = nvme_loop_create_ctrl,
};
static int __init nvme_loop_init_module(void)
{
int ret;
ret = nvmet_register_transport(&nvme_loop_ops);
if (ret)
return ret;
nvmf_register_transport(&nvme_loop_transport);
return 0;
}
static void __exit nvme_loop_cleanup_module(void)
{
struct nvme_loop_ctrl *ctrl, *next;
nvmf_unregister_transport(&nvme_loop_transport);
nvmet_unregister_transport(&nvme_loop_ops);
mutex_lock(&nvme_loop_ctrl_mutex);
list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list)
__nvme_loop_del_ctrl(ctrl);
mutex_unlock(&nvme_loop_ctrl_mutex);
flush_scheduled_work();
}
module_init(nvme_loop_init_module);
module_exit(nvme_loop_cleanup_module);
MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-254"); /* 254 == NVMF_TRTYPE_LOOP */

View File

@ -0,0 +1,331 @@
/*
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#ifndef _NVMET_H
#define _NVMET_H
#include <linux/dma-mapping.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/kref.h>
#include <linux/percpu-refcount.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/nvme.h>
#include <linux/configfs.h>
#include <linux/rcupdate.h>
#include <linux/blkdev.h>
#define NVMET_ASYNC_EVENTS 4
#define NVMET_ERROR_LOG_SLOTS 128
/* Helper Macros when NVMe error is NVME_SC_CONNECT_INVALID_PARAM
* The 16 bit shift is to set IATTR bit to 1, which means offending
* offset starts in the data section of connect()
*/
#define IPO_IATTR_CONNECT_DATA(x) \
(cpu_to_le32((1 << 16) | (offsetof(struct nvmf_connect_data, x))))
#define IPO_IATTR_CONNECT_SQE(x) \
(cpu_to_le32(offsetof(struct nvmf_connect_command, x)))
struct nvmet_ns {
struct list_head dev_link;
struct percpu_ref ref;
struct block_device *bdev;
u32 nsid;
u32 blksize_shift;
loff_t size;
u8 nguid[16];
struct nvmet_subsys *subsys;
const char *device_path;
struct config_group device_group;
struct config_group group;
struct completion disable_done;
};
static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
{
return container_of(to_config_group(item), struct nvmet_ns, group);
}
static inline bool nvmet_ns_enabled(struct nvmet_ns *ns)
{
return !list_empty_careful(&ns->dev_link);
}
struct nvmet_cq {
u16 qid;
u16 size;
};
struct nvmet_sq {
struct nvmet_ctrl *ctrl;
struct percpu_ref ref;
u16 qid;
u16 size;
struct completion free_done;
};
/**
* struct nvmet_port - Common structure to keep port
* information for the target.
* @entry: List head for holding a list of these elements.
* @disc_addr: Address information is stored in a format defined
* for a discovery log page entry.
* @group: ConfigFS group for this element's folder.
* @priv: Private data for the transport.
*/
struct nvmet_port {
struct list_head entry;
struct nvmf_disc_rsp_page_entry disc_addr;
struct config_group group;
struct config_group subsys_group;
struct list_head subsystems;
struct config_group referrals_group;
struct list_head referrals;
void *priv;
bool enabled;
};
static inline struct nvmet_port *to_nvmet_port(struct config_item *item)
{
return container_of(to_config_group(item), struct nvmet_port,
group);
}
struct nvmet_ctrl {
struct nvmet_subsys *subsys;
struct nvmet_cq **cqs;
struct nvmet_sq **sqs;
struct mutex lock;
u64 cap;
u32 cc;
u32 csts;
u16 cntlid;
u32 kato;
struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS];
unsigned int nr_async_event_cmds;
struct list_head async_events;
struct work_struct async_event_work;
struct list_head subsys_entry;
struct kref ref;
struct delayed_work ka_work;
struct work_struct fatal_err_work;
struct nvmet_fabrics_ops *ops;
char subsysnqn[NVMF_NQN_FIELD_LEN];
char hostnqn[NVMF_NQN_FIELD_LEN];
};
struct nvmet_subsys {
enum nvme_subsys_type type;
struct mutex lock;
struct kref ref;
struct list_head namespaces;
unsigned int max_nsid;
struct list_head ctrls;
struct ida cntlid_ida;
struct list_head hosts;
bool allow_any_host;
u16 max_qid;
u64 ver;
char *subsysnqn;
struct config_group group;
struct config_group namespaces_group;
struct config_group allowed_hosts_group;
};
static inline struct nvmet_subsys *to_subsys(struct config_item *item)
{
return container_of(to_config_group(item), struct nvmet_subsys, group);
}
static inline struct nvmet_subsys *namespaces_to_subsys(
struct config_item *item)
{
return container_of(to_config_group(item), struct nvmet_subsys,
namespaces_group);
}
struct nvmet_host {
struct config_group group;
};
static inline struct nvmet_host *to_host(struct config_item *item)
{
return container_of(to_config_group(item), struct nvmet_host, group);
}
static inline char *nvmet_host_name(struct nvmet_host *host)
{
return config_item_name(&host->group.cg_item);
}
struct nvmet_host_link {
struct list_head entry;
struct nvmet_host *host;
};
struct nvmet_subsys_link {
struct list_head entry;
struct nvmet_subsys *subsys;
};
struct nvmet_req;
struct nvmet_fabrics_ops {
struct module *owner;
unsigned int type;
unsigned int sqe_inline_size;
unsigned int msdbd;
bool has_keyed_sgls : 1;
void (*queue_response)(struct nvmet_req *req);
int (*add_port)(struct nvmet_port *port);
void (*remove_port)(struct nvmet_port *port);
void (*delete_ctrl)(struct nvmet_ctrl *ctrl);
};
#define NVMET_MAX_INLINE_BIOVEC 8
struct nvmet_req {
struct nvme_command *cmd;
struct nvme_completion *rsp;
struct nvmet_sq *sq;
struct nvmet_cq *cq;
struct nvmet_ns *ns;
struct scatterlist *sg;
struct bio inline_bio;
struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC];
int sg_cnt;
size_t data_len;
struct nvmet_port *port;
void (*execute)(struct nvmet_req *req);
struct nvmet_fabrics_ops *ops;
};
static inline void nvmet_set_status(struct nvmet_req *req, u16 status)
{
req->rsp->status = cpu_to_le16(status << 1);
}
static inline void nvmet_set_result(struct nvmet_req *req, u32 result)
{
req->rsp->result = cpu_to_le32(result);
}
/*
* NVMe command writes actually are DMA reads for us on the target side.
*/
static inline enum dma_data_direction
nvmet_data_dir(struct nvmet_req *req)
{
return nvme_is_write(req->cmd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
}
struct nvmet_async_event {
struct list_head entry;
u8 event_type;
u8 event_info;
u8 log_page;
};
int nvmet_parse_connect_cmd(struct nvmet_req *req);
int nvmet_parse_io_cmd(struct nvmet_req *req);
int nvmet_parse_admin_cmd(struct nvmet_req *req);
int nvmet_parse_discovery_cmd(struct nvmet_req *req);
int nvmet_parse_fabrics_cmd(struct nvmet_req *req);
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops);
void nvmet_req_complete(struct nvmet_req *req, u16 status);
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
u16 size);
void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid,
u16 size);
void nvmet_sq_destroy(struct nvmet_sq *sq);
int nvmet_sq_init(struct nvmet_sq *sq);
void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl);
void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new);
u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp);
u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
struct nvmet_req *req, struct nvmet_ctrl **ret);
void nvmet_ctrl_put(struct nvmet_ctrl *ctrl);
struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
enum nvme_subsys_type type);
void nvmet_subsys_put(struct nvmet_subsys *subsys);
struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid);
void nvmet_put_namespace(struct nvmet_ns *ns);
int nvmet_ns_enable(struct nvmet_ns *ns);
void nvmet_ns_disable(struct nvmet_ns *ns);
struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid);
void nvmet_ns_free(struct nvmet_ns *ns);
int nvmet_register_transport(struct nvmet_fabrics_ops *ops);
void nvmet_unregister_transport(struct nvmet_fabrics_ops *ops);
int nvmet_enable_port(struct nvmet_port *port);
void nvmet_disable_port(struct nvmet_port *port);
void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port);
void nvmet_referral_disable(struct nvmet_port *port);
u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
size_t len);
u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf,
size_t len);
u32 nvmet_get_log_page_len(struct nvme_command *cmd);
#define NVMET_QUEUE_SIZE 1024
#define NVMET_NR_QUEUES 64
#define NVMET_MAX_CMD NVMET_QUEUE_SIZE
#define NVMET_KAS 10
#define NVMET_DISC_KATO 120
int __init nvmet_init_configfs(void);
void __exit nvmet_exit_configfs(void);
int __init nvmet_init_discovery(void);
void nvmet_exit_discovery(void);
extern struct nvmet_subsys *nvmet_disc_subsys;
extern u64 nvmet_genctr;
extern struct rw_semaphore nvmet_config_sem;
bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
const char *hostnqn);
#endif /* _NVMET_H */

File diff suppressed because it is too large Load Diff

View File

@ -45,7 +45,6 @@ int dasd_gendisk_alloc(struct dasd_block *block)
gdp->major = DASD_MAJOR;
gdp->first_minor = base->devindex << DASD_PARTN_BITS;
gdp->fops = &dasd_device_operations;
gdp->driverfs_dev = &base->cdev->dev;
/*
* Set device name.
@ -76,7 +75,7 @@ int dasd_gendisk_alloc(struct dasd_block *block)
gdp->queue = block->request_queue;
block->gdp = gdp;
set_capacity(block->gdp, 0);
add_disk(block->gdp);
device_add_disk(&base->cdev->dev, block->gdp);
return 0;
}

View File

@ -615,7 +615,6 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
dev_info->dcssblk_queue = blk_alloc_queue(GFP_KERNEL);
dev_info->gd->queue = dev_info->dcssblk_queue;
dev_info->gd->private_data = dev_info;
dev_info->gd->driverfs_dev = &dev_info->dev;
blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request);
blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096);
queue_flag_set_unlocked(QUEUE_FLAG_DAX, dev_info->dcssblk_queue);
@ -656,7 +655,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
goto put_dev;
get_device(&dev_info->dev);
add_disk(dev_info->gd);
device_add_disk(&dev_info->dev, dev_info->gd);
switch (dev_info->segment_type) {
case SEG_TYPE_SR:

Some files were not shown because too many files have changed in this diff Show More