diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index d6da611f8f63..4ed7b5ceeed2 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -89,6 +89,33 @@ Throttling/Upper Limit policy Limits for writes can be put using blkio.write_bps_device file. +Hierarchical Cgroups +==================== +- Currently none of the IO control policies support hierarchical groups. But + the cgroup interface does allow creation of hierarchical cgroups and internally + IO policies treat them as a flat hierarchy. + + So this patch will allow creation of a cgroup hierarchy but at the backend + everything will be treated as flat. So if somebody created a hierarchy + as follows. + + root + / \ + test1 test2 + | + test3 + + CFQ and throttling will practically treat all groups at the same level. + + pivot + / | \ \ + root test1 test2 test3 + + Down the line we can implement hierarchical accounting/control support + and also introduce a new cgroup file "use_hierarchy" which will control + whether cgroup hierarchy is viewed as flat or hierarchical by the policy. + This is also how the memory controller has implemented things. 
+ Various user visible config options =================================== CONFIG_BLK_CGROUP diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b1febd0f6d2a..455768a3eb9e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1452,10 +1452,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) goto done; } - /* Currently we do not support hierarchy deeper than two level (0,1) */ - if (parent != cgroup->top_cgroup) - return ERR_PTR(-EPERM); - blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); if (!blkcg) return ERR_PTR(-ENOMEM); diff --git a/block/blk-core.c b/block/blk-core.c index 4ce953f1b390..2f4002f79a24 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -33,7 +33,7 @@ #include "blk.h" -EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); @@ -64,13 +64,27 @@ static void drive_stat_acct(struct request *rq, int new_io) return; cpu = part_stat_lock(); - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!new_io) + if (!new_io) { + part = rq->part; part_stat_inc(cpu, part, merges[rw]); - else { + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + if (!hd_struct_try_get(part)) { + /* + * The partition is already being removed, + * the request will be accounted on the disk only + * + * We take a reference on disk->part0 although that + * partition will never be deleted, so we can treat + * it as any other partition. 
+ */ + part = &rq->rq_disk->part0; + hd_struct_get(part); + } part_round_stats(cpu, part); part_inc_in_flight(part, rw); + rq->part = part; } part_stat_unlock(); @@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); + rq->part = NULL; } EXPORT_SYMBOL(blk_rq_init); @@ -1329,9 +1344,9 @@ static inline void blk_partition_remap(struct bio *bio) bio->bi_sector += p->start_sect; bio->bi_bdev = bdev->bd_contains; - trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, - bdev->bd_dev, - bio->bi_sector - p->start_sect); + trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, + bdev->bd_dev, + bio->bi_sector - p->start_sect); } } @@ -1500,7 +1515,7 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; if (old_sector != -1) - trace_block_remap(q, bio, old_dev, old_sector); + trace_block_bio_remap(q, bio, old_dev, old_sector); old_sector = bio->bi_sector; old_dev = bio->bi_bdev->bd_dev; @@ -1776,7 +1791,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } @@ -1796,13 +1811,14 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); part_round_stats(cpu, part); part_dec_in_flight(part, rw); + hd_struct_put(part); part_stat_unlock(); } } @@ -2606,7 +2622,9 @@ int __init blk_dev_init(void) BUILD_BUG_ON(__REQ_NR_BITS > 8 * sizeof(((struct request *)0)->cmd_flags)); - kblockd_workqueue = create_workqueue("kblockd"); + /* used for unplugging and affects IO latency/throughput - HIGHPRI */ + kblockd_workqueue = alloc_workqueue("kblockd", + WQ_MEM_RECLAIM | 
WQ_HIGHPRI, 0); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 3c7a339fe381..b791022beef3 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -64,7 +64,7 @@ static void cfq_exit(struct io_context *ioc) rcu_read_unlock(); } -/* Called by the exitting task */ +/* Called by the exiting task */ void exit_io_context(struct task_struct *task) { struct io_context *ioc; @@ -74,10 +74,9 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); - if (atomic_dec_and_test(&ioc->nr_tasks)) { + if (atomic_dec_and_test(&ioc->nr_tasks)) cfq_exit(ioc); - } put_io_context(ioc); } diff --git a/block/blk-merge.c b/block/blk-merge.c index 74bc4a768f32..ea85e20d5e94 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -351,11 +351,12 @@ static void blk_account_io_merge(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); + hd_struct_put(part); part_stat_unlock(); } } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 78ee4b1d4e85..8427697c5437 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -87,7 +87,6 @@ struct cfq_rb_root { unsigned count; unsigned total_weight; u64 min_vdisktime; - struct rb_node *active; }; #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ .count = 0, .min_vdisktime = 0, } @@ -97,7 +96,7 @@ struct cfq_rb_root { */ struct cfq_queue { /* reference count */ - atomic_t ref; + int ref; /* various state flags, see below */ unsigned int flags; /* parent cfq_data */ @@ -180,7 +179,6 @@ struct cfq_group { /* group service_tree key */ u64 vdisktime; unsigned int weight; - bool on_st; /* number of cfqq currently on this group */ int nr_cfqq; @@ -209,7 +207,7 @@ struct cfq_group { struct blkio_group blkg; #ifdef CONFIG_CFQ_GROUP_IOSCHED struct hlist_node cfqd_node; - 
atomic_t ref; + int ref; #endif /* number of requests that are on the dispatch list or inside driver */ int dispatched; @@ -563,11 +561,6 @@ static void update_min_vdisktime(struct cfq_rb_root *st) u64 vdisktime = st->min_vdisktime; struct cfq_group *cfqg; - if (st->active) { - cfqg = rb_entry_cfqg(st->active); - vdisktime = cfqg->vdisktime; - } - if (st->left) { cfqg = rb_entry_cfqg(st->left); vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); @@ -646,11 +639,11 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) static inline bool cfq_slice_used(struct cfq_queue *cfqq) { if (cfq_cfqq_slice_new(cfqq)) - return 0; + return false; if (time_before(jiffies, cfqq->slice_end)) - return 0; + return false; - return 1; + return true; } /* @@ -869,7 +862,7 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) struct rb_node *n; cfqg->nr_cfqq++; - if (cfqg->on_st) + if (!RB_EMPTY_NODE(&cfqg->rb_node)) return; /* @@ -885,7 +878,6 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) cfqg->vdisktime = st->min_vdisktime; __cfq_group_service_tree_add(st, cfqg); - cfqg->on_st = true; st->total_weight += cfqg->weight; } @@ -894,9 +886,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) { struct cfq_rb_root *st = &cfqd->grp_service_tree; - if (st->active == &cfqg->rb_node) - st->active = NULL; - BUG_ON(cfqg->nr_cfqq < 1); cfqg->nr_cfqq--; @@ -905,7 +894,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) return; cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); - cfqg->on_st = false; st->total_weight -= cfqg->weight; if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); @@ -1026,7 +1014,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) * elevator which will be dropped by either elevator exit * or cgroup deletion path depending on who is exiting first. 
*/ - atomic_set(&cfqg->ref, 1); + cfqg->ref = 1; /* * Add group onto cgroup list. It might happen that bdi->dev is @@ -1071,7 +1059,7 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) { - atomic_inc(&cfqg->ref); + cfqg->ref++; return cfqg; } @@ -1083,7 +1071,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) cfqq->cfqg = cfqg; /* cfqq reference on cfqg */ - atomic_inc(&cfqq->cfqg->ref); + cfqq->cfqg->ref++; } static void cfq_put_cfqg(struct cfq_group *cfqg) @@ -1091,11 +1079,12 @@ static void cfq_put_cfqg(struct cfq_group *cfqg) struct cfq_rb_root *st; int i, j; - BUG_ON(atomic_read(&cfqg->ref) <= 0); - if (!atomic_dec_and_test(&cfqg->ref)) + BUG_ON(cfqg->ref <= 0); + cfqg->ref--; + if (cfqg->ref) return; for_each_cfqg_st(cfqg, i, j, st) - BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); + BUG_ON(!RB_EMPTY_ROOT(&st->rb)); kfree(cfqg); } @@ -1200,7 +1189,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_group_service_tree_del(cfqd, cfqq->cfqg); cfqq->orig_cfqg = cfqq->cfqg; cfqq->cfqg = &cfqd->root_group; - atomic_inc(&cfqd->root_group.ref); + cfqd->root_group.ref++; group_changed = 1; } else if (!cfqd->cfq_group_isolation && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { @@ -1687,9 +1676,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq == cfqd->active_queue) cfqd->active_queue = NULL; - if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) - cfqd->grp_service_tree.active = NULL; - if (cfqd->active_cic) { put_io_context(cfqd->active_cic->ioc); cfqd->active_cic = NULL; @@ -1901,10 +1887,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * in their service tree. */ if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) - return 1; + return true; cfq_log_cfqq(cfqd, cfqq, "Not idling. 
st->count:%d", service_tree->count); - return 0; + return false; } static void cfq_arm_slice_timer(struct cfq_data *cfqd) @@ -2040,7 +2026,7 @@ static int cfqq_process_refs(struct cfq_queue *cfqq) int process_refs, io_refs; io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; - process_refs = atomic_read(&cfqq->ref) - io_refs; + process_refs = cfqq->ref - io_refs; BUG_ON(process_refs < 0); return process_refs; } @@ -2080,10 +2066,10 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) */ if (new_process_refs >= process_refs) { cfqq->new_cfqq = new_cfqq; - atomic_add(process_refs, &new_cfqq->ref); + new_cfqq->ref += process_refs; } else { new_cfqq->new_cfqq = cfqq; - atomic_add(new_process_refs, &cfqq->ref); + cfqq->ref += new_process_refs; } } @@ -2116,12 +2102,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) unsigned count; struct cfq_rb_root *st; unsigned group_slice; - - if (!cfqg) { - cfqd->serving_prio = IDLE_WORKLOAD; - cfqd->workload_expires = jiffies + 1; - return; - } + enum wl_prio_t original_prio = cfqd->serving_prio; /* Choose next priority. 
RT > BE > IDLE */ if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) @@ -2134,6 +2115,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) return; } + if (original_prio != cfqd->serving_prio) + goto new_workload; + /* * For RT and BE, we have to choose also the type * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload @@ -2148,6 +2132,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) if (count && !time_after(jiffies, cfqd->workload_expires)) return; +new_workload: /* otherwise select new workload type */ cfqd->serving_type = cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); @@ -2199,7 +2184,6 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) if (RB_EMPTY_ROOT(&st->rb)) return NULL; cfqg = cfq_rb_first_group(st); - st->active = &cfqg->rb_node; update_min_vdisktime(st); return cfqg; } @@ -2293,6 +2277,17 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) goto keep_queue; } + /* + * This is a deep seek queue, but the device is much faster than + * the queue can deliver, don't idle + **/ + if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) && + (cfq_cfqq_slice_new(cfqq) || + (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) { + cfq_clear_cfqq_deep(cfqq); + cfq_clear_cfqq_idle_window(cfqq); + } + if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { cfqq = NULL; goto keep_queue; @@ -2367,12 +2362,12 @@ static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, { /* the queue hasn't finished any request, can't estimate */ if (cfq_cfqq_slice_new(cfqq)) - return 1; + return true; if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, cfqq->slice_end)) - return 1; + return true; - return 0; + return false; } static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) @@ -2538,9 +2533,10 @@ static void cfq_put_queue(struct cfq_queue *cfqq) struct cfq_data *cfqd = cfqq->cfqd; struct cfq_group *cfqg, *orig_cfqg; - 
BUG_ON(atomic_read(&cfqq->ref) <= 0); + BUG_ON(cfqq->ref <= 0); - if (!atomic_dec_and_test(&cfqq->ref)) + cfqq->ref--; + if (cfqq->ref) return; cfq_log_cfqq(cfqd, cfqq, "put_queue"); @@ -2843,7 +2839,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, RB_CLEAR_NODE(&cfqq->p_node); INIT_LIST_HEAD(&cfqq->fifo); - atomic_set(&cfqq->ref, 0); + cfqq->ref = 0; cfqq->cfqd = cfqd; cfq_mark_cfqq_prio_changed(cfqq); @@ -2979,11 +2975,11 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, * pin the queue now that it's allocated, scheduler exit will prune it */ if (!is_sync && !(*async_cfqq)) { - atomic_inc(&cfqq->ref); + cfqq->ref++; *async_cfqq = cfqq; } - atomic_inc(&cfqq->ref); + cfqq->ref++; return cfqq; } @@ -3265,6 +3261,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) return true; + /* An idle queue should not be idle now for some reason */ + if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) + return true; + if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) return false; @@ -3681,13 +3681,13 @@ new_queue: } cfqq->allocated[rw]++; - atomic_inc(&cfqq->ref); - - spin_unlock_irqrestore(q->queue_lock, flags); - + cfqq->ref++; rq->elevator_private = cic; rq->elevator_private2 = cfqq; rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); + + spin_unlock_irqrestore(q->queue_lock, flags); + return 0; queue_fail: @@ -3862,6 +3862,10 @@ static void *cfq_init_queue(struct request_queue *q) if (!cfqd) return NULL; + /* + * Don't need take queue_lock in the routine, since we are + * initializing the ioscheduler, and nobody is using cfqd + */ cfqd->cic_index = i; /* Init root service tree */ @@ -3881,7 +3885,7 @@ static void *cfq_init_queue(struct request_queue *q) * Take a reference to root group which we never drop. 
This is just * to make sure that cfq_put_cfqg() does not try to kfree root group */ - atomic_set(&cfqg->ref, 1); + cfqg->ref = 1; rcu_read_lock(); cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, 0); @@ -3901,7 +3905,7 @@ static void *cfq_init_queue(struct request_queue *q) * will not attempt to free it. */ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); - atomic_inc(&cfqd->oom_cfqq.ref); + cfqd->oom_cfqq.ref++; cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); INIT_LIST_HEAD(&cfqd->cic_list); diff --git a/block/genhd.c b/block/genhd.c index 5fa2b44a72ff..6a5b772aa201 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "blk.h" @@ -35,6 +36,10 @@ static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; +static void disk_add_events(struct gendisk *disk); +static void disk_del_events(struct gendisk *disk); +static void disk_release_events(struct gendisk *disk); + /** * disk_get_part - get partition * @disk: disk to look partition from @@ -239,7 +244,7 @@ static struct blk_major_name { } *major_names[BLKDEV_MAJOR_HASH_SIZE]; /* index in the above - for now: assume no multimajor ranges */ -static inline int major_to_index(int major) +static inline int major_to_index(unsigned major) { return major % BLKDEV_MAJOR_HASH_SIZE; } @@ -502,6 +507,64 @@ static int exact_lock(dev_t devt, void *data) return 0; } +void register_disk(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + struct block_device *bdev; + struct disk_part_iter piter; + struct hd_struct *part; + int err; + + ddev->parent = disk->driverfs_dev; + + dev_set_name(ddev, disk->disk_name); + + /* delay uevents, until we scanned partition table */ + dev_set_uevent_suppress(ddev, 1); + + if (device_add(ddev)) + return; + if (!sysfs_deprecated) { + err = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); + if (err) { + device_del(ddev); + return; + } + } + disk->part0.holder_dir = 
kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + + /* No minors to use for partitions */ + if (!disk_partitionable(disk)) + goto exit; + + /* No such device (e.g., media were just removed) */ + if (!get_capacity(disk)) + goto exit; + + bdev = bdget_disk(disk, 0); + if (!bdev) + goto exit; + + bdev->bd_invalidated = 1; + err = blkdev_get(bdev, FMODE_READ, NULL); + if (err < 0) + goto exit; + blkdev_put(bdev, FMODE_READ); + +exit: + /* announce disk after possible partitions are created */ + dev_set_uevent_suppress(ddev, 0); + kobject_uevent(&ddev->kobj, KOBJ_ADD); + + /* announce possible partitions */ + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); +} + /** * add_disk - add partitioning information to kernel list * @disk: per-device partitioning information @@ -551,18 +614,48 @@ void add_disk(struct gendisk *disk) retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); + + disk_add_events(disk); } - EXPORT_SYMBOL(add_disk); -EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ -void unlink_gendisk(struct gendisk *disk) +void del_gendisk(struct gendisk *disk) { + struct disk_part_iter piter; + struct hd_struct *part; + + disk_del_events(disk); + + /* invalidate stuff */ + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); + while ((part = disk_part_iter_next(&piter))) { + invalidate_partition(disk, part->partno); + delete_partition(disk, part->partno); + } + disk_part_iter_exit(&piter); + + invalidate_partition(disk, 0); + blk_free_devt(disk_to_dev(disk)->devt); + set_capacity(disk, 0); + disk->flags &= ~GENHD_FL_UP; + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); bdi_unregister(&disk->queue->backing_dev_info); blk_unregister_queue(disk); blk_unregister_region(disk_devt(disk), disk->minors); + + 
part_stat_set_all(&disk->part0, 0); + disk->part0.stamp = 0; + + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); + disk->driverfs_dev = NULL; + if (!sysfs_deprecated) + sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); + device_del(disk_to_dev(disk)); } +EXPORT_SYMBOL(del_gendisk); /** * get_gendisk - get partitioning information for a given device @@ -735,7 +828,7 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos) static void *p; p = disk_seqf_start(seqf, pos); - if (!IS_ERR(p) && p && !*pos) + if (!IS_ERR_OR_NULL(p) && !*pos) seq_puts(seqf, "major minor #blocks name\n\n"); return p; } @@ -1005,6 +1098,7 @@ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); @@ -1110,29 +1204,6 @@ static int __init proc_genhd_init(void) module_init(proc_genhd_init); #endif /* CONFIG_PROC_FS */ -static void media_change_notify_thread(struct work_struct *work) -{ - struct gendisk *gd = container_of(work, struct gendisk, async_notify); - char event[] = "MEDIA_CHANGE=1"; - char *envp[] = { event, NULL }; - - /* - * set enviroment vars to indicate which event this is for - * so that user space will know to go check the media status. 
- */ - kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); - put_device(gd->driverfs_dev); -} - -#if 0 -void genhd_media_change_notify(struct gendisk *disk) -{ - get_device(disk->driverfs_dev); - schedule_work(&disk->async_notify); -} -EXPORT_SYMBOL_GPL(genhd_media_change_notify); -#endif /* 0 */ - dev_t blk_lookup_devt(const char *name, int partno) { dev_t devt = MKDEV(0, 0); @@ -1193,13 +1264,13 @@ struct gendisk *alloc_disk_node(int minors, int node_id) } disk->part_tbl->part[0] = &disk->part0; + hd_ref_init(&disk->part0); + disk->minors = minors; rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); - INIT_WORK(&disk->async_notify, - media_change_notify_thread); } return disk; } @@ -1291,3 +1362,422 @@ int invalidate_partition(struct gendisk *disk, int partno) } EXPORT_SYMBOL(invalidate_partition); + +/* + * Disk events - monitor disk events like media change and eject request. + */ +struct disk_events { + struct list_head node; /* all disk_event's */ + struct gendisk *disk; /* the associated disk */ + spinlock_t lock; + + int block; /* event blocking depth */ + unsigned int pending; /* events already sent out */ + unsigned int clearing; /* events being cleared */ + + long poll_msecs; /* interval, -1 for default */ + struct delayed_work dwork; +}; + +static const char *disk_events_strs[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", +}; + +static char *disk_uevents[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", +}; + +/* list of all disk_events */ +static DEFINE_MUTEX(disk_events_mutex); +static LIST_HEAD(disk_events); + +/* disable in-kernel polling by default */ +static unsigned long disk_events_dfl_poll_msecs = 0; + +static unsigned long disk_events_poll_jiffies(struct gendisk *disk) +{ + struct disk_events *ev = 
disk->ev; + long intv_msecs = 0; + + /* + * If device-specific poll interval is set, always use it. If + * the default is being used, poll iff there are events which + * can't be monitored asynchronously. + */ + if (ev->poll_msecs >= 0) + intv_msecs = ev->poll_msecs; + else if (disk->events & ~disk->async_events) + intv_msecs = disk_events_dfl_poll_msecs; + + return msecs_to_jiffies(intv_msecs); +} + +static void __disk_block_events(struct gendisk *disk, bool sync) +{ + struct disk_events *ev = disk->ev; + unsigned long flags; + bool cancel; + + spin_lock_irqsave(&ev->lock, flags); + cancel = !ev->block++; + spin_unlock_irqrestore(&ev->lock, flags); + + if (cancel) { + if (sync) + cancel_delayed_work_sync(&disk->ev->dwork); + else + cancel_delayed_work(&disk->ev->dwork); + } +} + +static void __disk_unblock_events(struct gendisk *disk, bool check_now) +{ + struct disk_events *ev = disk->ev; + unsigned long intv; + unsigned long flags; + + spin_lock_irqsave(&ev->lock, flags); + + if (WARN_ON_ONCE(ev->block <= 0)) + goto out_unlock; + + if (--ev->block) + goto out_unlock; + + /* + * Not exactly a latency critical operation, set poll timer + * slack to 25% and kick event check. + */ + intv = disk_events_poll_jiffies(disk); + set_timer_slack(&ev->dwork.timer, intv / 4); + if (check_now) + queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + else if (intv) + queue_delayed_work(system_nrt_wq, &ev->dwork, intv); +out_unlock: + spin_unlock_irqrestore(&ev->lock, flags); +} + +/** + * disk_block_events - block and flush disk event checking + * @disk: disk to block events for + * + * On return from this function, it is guaranteed that event checking + * isn't in progress and won't happen until unblocked by + * disk_unblock_events(). Events blocking is counted and the actual + * unblocking happens after the matching number of unblocks are done. + * + * Note that this intentionally does not block event checking from + * disk_clear_events(). + * + * CONTEXT: + * Might sleep. 
+ */ +void disk_block_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_block_events(disk, true); +} + +/** + * disk_unblock_events - unblock disk event checking + * @disk: disk to unblock events for + * + * Undo disk_block_events(). When the block count reaches zero, it + * starts events polling if configured. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_unblock_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_unblock_events(disk, true); +} + +/** + * disk_check_events - schedule immediate event checking + * @disk: disk to check events for + * + * Schedule immediate event checking on @disk if not blocked. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_check_events(struct gendisk *disk) +{ + if (disk->ev) { + __disk_block_events(disk, false); + __disk_unblock_events(disk, true); + } +} +EXPORT_SYMBOL_GPL(disk_check_events); + +/** + * disk_clear_events - synchronously check, clear and return pending events + * @disk: disk to fetch and clear events from + * @mask: mask of events to be fetched and cleared + * + * Disk events are synchronously checked and pending events in @mask + * are cleared and returned. This ignores the block count. + * + * CONTEXT: + * Might sleep. 
+ */ +unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) +{ + const struct block_device_operations *bdops = disk->fops; + struct disk_events *ev = disk->ev; + unsigned int pending; + + if (!ev) { + /* for drivers still using the old ->media_changed method */ + if ((mask & DISK_EVENT_MEDIA_CHANGE) && + bdops->media_changed && bdops->media_changed(disk)) + return DISK_EVENT_MEDIA_CHANGE; + return 0; + } + + /* tell the workfn about the events being cleared */ + spin_lock_irq(&ev->lock); + ev->clearing |= mask; + spin_unlock_irq(&ev->lock); + + /* uncondtionally schedule event check and wait for it to finish */ + __disk_block_events(disk, true); + queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + flush_delayed_work(&ev->dwork); + __disk_unblock_events(disk, false); + + /* then, fetch and clear pending events */ + spin_lock_irq(&ev->lock); + WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */ + pending = ev->pending & mask; + ev->pending &= ~mask; + spin_unlock_irq(&ev->lock); + + return pending; +} + +static void disk_events_workfn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct disk_events *ev = container_of(dwork, struct disk_events, dwork); + struct gendisk *disk = ev->disk; + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; + unsigned int clearing = ev->clearing; + unsigned int events; + unsigned long intv; + int nr_events = 0, i; + + /* check events */ + events = disk->fops->check_events(disk, clearing); + + /* accumulate pending events and schedule next poll if necessary */ + spin_lock_irq(&ev->lock); + + events &= ~ev->pending; + ev->pending |= events; + ev->clearing &= ~clearing; + + intv = disk_events_poll_jiffies(disk); + if (!ev->block && intv) + queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + + spin_unlock_irq(&ev->lock); + + /* tell userland about new events */ + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) + if (events & (1 << i)) + envp[nr_events++] = disk_uevents[i]; + + 
if (nr_events) + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); +} + +/* + * A disk events enabled device has the following sysfs nodes under + * its /sys/block/X/ directory. + * + * events : list of all supported events + * events_async : list of events which can be detected w/o polling + * events_poll_msecs : polling interval, 0: disable, -1: system default + */ +static ssize_t __disk_events_show(unsigned int events, char *buf) +{ + const char *delim = ""; + ssize_t pos = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) + if (events & (1 << i)) { + pos += sprintf(buf + pos, "%s%s", + delim, disk_events_strs[i]); + delim = " "; + } + if (pos) + pos += sprintf(buf + pos, "\n"); + return pos; +} + +static ssize_t disk_events_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return __disk_events_show(disk->events, buf); +} + +static ssize_t disk_events_async_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return __disk_events_show(disk->async_events, buf); +} + +static ssize_t disk_events_poll_msecs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%ld\n", disk->ev->poll_msecs); +} + +static ssize_t disk_events_poll_msecs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + long intv; + + if (!count || !sscanf(buf, "%ld", &intv)) + return -EINVAL; + + if (intv < 0 && intv != -1) + return -EINVAL; + + __disk_block_events(disk, true); + disk->ev->poll_msecs = intv; + __disk_unblock_events(disk, true); + + return count; +} + +static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL); +static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL); +static const DEVICE_ATTR(events_poll_msecs, 
S_IRUGO|S_IWUSR, + disk_events_poll_msecs_show, + disk_events_poll_msecs_store); + +static const struct attribute *disk_events_attrs[] = { + &dev_attr_events.attr, + &dev_attr_events_async.attr, + &dev_attr_events_poll_msecs.attr, + NULL, +}; + +/* + * The default polling interval can be specified by the kernel + * parameter block.events_dfl_poll_msecs which defaults to 0 + * (disable). This can also be modified runtime by writing to + * /sys/module/block/events_dfl_poll_msecs. + */ +static int disk_events_set_dfl_poll_msecs(const char *val, + const struct kernel_param *kp) +{ + struct disk_events *ev; + int ret; + + ret = param_set_ulong(val, kp); + if (ret < 0) + return ret; + + mutex_lock(&disk_events_mutex); + + list_for_each_entry(ev, &disk_events, node) + disk_check_events(ev->disk); + + mutex_unlock(&disk_events_mutex); + + return 0; +} + +static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { + .set = disk_events_set_dfl_poll_msecs, + .get = param_get_ulong, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "block." + +module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, + &disk_events_dfl_poll_msecs, 0644); + +/* + * disk_{add|del|release}_events - initialize and destroy disk_events. 
+ */ +static void disk_add_events(struct gendisk *disk) +{ + struct disk_events *ev; + + if (!disk->fops->check_events || !(disk->events | disk->async_events)) + return; + + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) { + pr_warn("%s: failed to initialize events\n", disk->disk_name); + return; + } + + if (sysfs_create_files(&disk_to_dev(disk)->kobj, + disk_events_attrs) < 0) { + pr_warn("%s: failed to create sysfs files for events\n", + disk->disk_name); + kfree(ev); + return; + } + + disk->ev = ev; + + INIT_LIST_HEAD(&ev->node); + ev->disk = disk; + spin_lock_init(&ev->lock); + ev->block = 1; + ev->poll_msecs = -1; + INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); + + mutex_lock(&disk_events_mutex); + list_add_tail(&ev->node, &disk_events); + mutex_unlock(&disk_events_mutex); + + /* + * Block count is initialized to 1 and the following initial + * unblock kicks it into action. + */ + __disk_unblock_events(disk, true); +} + +static void disk_del_events(struct gendisk *disk) +{ + if (!disk->ev) + return; + + __disk_block_events(disk, true); + + mutex_lock(&disk_events_mutex); + list_del_init(&disk->ev->node); + mutex_unlock(&disk_events_mutex); + + sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); +} + +static void disk_release_events(struct gendisk *disk) +{ + /* the block count should be 1 from disk_del_events() */ + WARN_ON_ONCE(disk->ev && disk->ev->block != 1); + kfree(disk->ev); +} diff --git a/block/ioctl.c b/block/ioctl.c index a9a302eba01e..9049d460fa89 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -294,11 +294,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return -EINVAL; if (get_user(n, (int __user *) arg)) return -EFAULT; - if (!(mode & FMODE_EXCL) && bd_claim(bdev, &bdev) < 0) + if (!(mode & FMODE_EXCL) && + blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) return -EBUSY; ret = set_blocksize(bdev, n); if (!(mode & FMODE_EXCL)) - bd_release(bdev); + blkdev_put(bdev, mode | FMODE_EXCL); return ret; 
case BLKPG: ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 1ea1a34e78b2..3803a0348937 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -911,8 +911,6 @@ struct drbd_md { struct drbd_backing_dev { struct block_device *backing_bdev; struct block_device *md_bdev; - struct file *lo_file; - struct file *md_file; struct drbd_md md; struct disk_conf dc; /* The user provided config... */ sector_t known_size; /* last known size of that backing device */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 6be5401d0e88..29cd0dc9fe4f 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3372,11 +3372,8 @@ void drbd_free_bc(struct drbd_backing_dev *ldev) if (ldev == NULL) return; - bd_release(ldev->backing_bdev); - bd_release(ldev->md_bdev); - - fput(ldev->lo_file); - fput(ldev->md_file); + blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); kfree(ldev); } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 29e5c70e4e26..8cbfaa687d72 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -855,7 +855,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp sector_t max_possible_sectors; sector_t min_md_device_sectors; struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ - struct inode *inode, *inode2; + struct block_device *bdev; struct lru_cache *resync_lru = NULL; union drbd_state ns, os; unsigned int max_seg_s; @@ -907,46 +907,40 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp } } - nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0); - if (IS_ERR(nbc->lo_file)) { + bdev = blkdev_get_by_path(nbc->dc.backing_dev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev); + if 
(IS_ERR(bdev)) { dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, - PTR_ERR(nbc->lo_file)); - nbc->lo_file = NULL; + PTR_ERR(bdev)); retcode = ERR_OPEN_DISK; goto fail; } + nbc->backing_bdev = bdev; - inode = nbc->lo_file->f_dentry->d_inode; - - if (!S_ISBLK(inode->i_mode)) { - retcode = ERR_DISK_NOT_BDEV; - goto fail; - } - - nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0); - if (IS_ERR(nbc->md_file)) { + /* + * meta_dev_idx >= 0: external fixed size, possibly multiple + * drbd sharing one meta device. TODO in that case, paranoia + * check that [md_bdev, meta_dev_idx] is not yet used by some + * other drbd minor! (if you use drbd.conf + drbdadm, that + * should check it for you already; but if you don't, or + * someone fooled it, we need to double check here) + */ + bdev = blkdev_get_by_path(nbc->dc.meta_dev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, + (nbc->dc.meta_dev_idx < 0) ? + (void *)mdev : (void *)drbd_m_holder); + if (IS_ERR(bdev)) { dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, - PTR_ERR(nbc->md_file)); - nbc->md_file = NULL; + PTR_ERR(bdev)); retcode = ERR_OPEN_MD_DISK; goto fail; } + nbc->md_bdev = bdev; - inode2 = nbc->md_file->f_dentry->d_inode; - - if (!S_ISBLK(inode2->i_mode)) { - retcode = ERR_MD_NOT_BDEV; - goto fail; - } - - nbc->backing_bdev = inode->i_bdev; - if (bd_claim(nbc->backing_bdev, mdev)) { - printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n", - nbc->backing_bdev, mdev, - nbc->backing_bdev->bd_holder, - nbc->backing_bdev->bd_contains->bd_holder, - nbc->backing_bdev->bd_holders); - retcode = ERR_BDCLAIM_DISK; + if ((nbc->backing_bdev == nbc->md_bdev) != + (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { + retcode = ERR_MD_IDX_INVALID; goto fail; } @@ -955,28 +949,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp offsetof(struct bm_extent, lce)); if (!resync_lru) { retcode = ERR_NOMEM; - goto 
release_bdev_fail; - } - - /* meta_dev_idx >= 0: external fixed size, - * possibly multiple drbd sharing one meta device. - * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is - * not yet used by some other drbd minor! - * (if you use drbd.conf + drbdadm, - * that should check it for you already; but if you don't, or someone - * fooled it, we need to double check here) */ - nbc->md_bdev = inode2->i_bdev; - if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev - : (void *) drbd_m_holder)) { - retcode = ERR_BDCLAIM_MD_DISK; - goto release_bdev_fail; - } - - if ((nbc->backing_bdev == nbc->md_bdev) != - (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || - nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { - retcode = ERR_MD_IDX_INVALID; - goto release_bdev2_fail; + goto fail; } /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ @@ -987,7 +960,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp (unsigned long long) drbd_get_max_capacity(nbc), (unsigned long long) nbc->dc.disk_size); retcode = ERR_DISK_TO_SMALL; - goto release_bdev2_fail; + goto fail; } if (nbc->dc.meta_dev_idx < 0) { @@ -1004,7 +977,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp dev_warn(DEV, "refusing attach: md-device too small, " "at least %llu sectors needed for this meta-disk type\n", (unsigned long long) min_md_device_sectors); - goto release_bdev2_fail; + goto fail; } /* Make sure the new disk is big enough @@ -1012,7 +985,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp if (drbd_get_max_capacity(nbc) < drbd_get_capacity(mdev->this_bdev)) { retcode = ERR_DISK_TO_SMALL; - goto release_bdev2_fail; + goto fail; } nbc->known_size = drbd_get_capacity(nbc->backing_bdev); @@ -1035,7 +1008,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE); 
drbd_resume_io(mdev); if (retcode < SS_SUCCESS) - goto release_bdev2_fail; + goto fail; if (!get_ldev_if_state(mdev, D_ATTACHING)) goto force_diskless; @@ -1269,18 +1242,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp force_diskless: drbd_force_state(mdev, NS(disk, D_FAILED)); drbd_md_sync(mdev); - release_bdev2_fail: - if (nbc) - bd_release(nbc->md_bdev); - release_bdev_fail: - if (nbc) - bd_release(nbc->backing_bdev); fail: if (nbc) { - if (nbc->lo_file) - fput(nbc->lo_file); - if (nbc->md_file) - fput(nbc->md_file); + if (nbc->backing_bdev) + blkdev_put(nbc->backing_bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL); + if (nbc->md_bdev) + blkdev_put(nbc->md_bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL); kfree(nbc); } lc_destroy(resync_lru); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 7ea0bea2f7e3..44e18c073c44 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -395,11 +395,7 @@ lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct loop_device *lo = p->lo; struct page *page = buf->page; sector_t IV; - int size, ret; - - ret = buf->ops->confirm(pipe, buf); - if (unlikely(ret)) - return ret; + int size; IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) + (buf->offset >> 9); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 19b3568e9326..77d70eebb6b2 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2296,15 +2296,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) * so bdget() can't fail. 
*/ bdget(pd->bdev->bd_dev); - if ((ret = blkdev_get(pd->bdev, FMODE_READ))) + if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd))) goto out; - if ((ret = bd_claim(pd->bdev, pd))) - goto out_putdev; - if ((ret = pkt_get_last_written(pd, &lba))) { printk(DRIVER_NAME": pkt_get_last_written failed\n"); - goto out_unclaim; + goto out_putdev; } set_capacity(pd->disk, lba << 2); @@ -2314,7 +2311,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) q = bdev_get_queue(pd->bdev); if (write) { if ((ret = pkt_open_write(pd))) - goto out_unclaim; + goto out_putdev; /* * Some CDRW drives can not handle writes larger than one packet, * even if the size is a multiple of the packet size. @@ -2329,23 +2326,21 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) } if ((ret = pkt_set_segment_merging(pd, q))) - goto out_unclaim; + goto out_putdev; if (write) { if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { printk(DRIVER_NAME": not enough memory for buffers\n"); ret = -ENOMEM; - goto out_unclaim; + goto out_putdev; } printk(DRIVER_NAME": %lukB available on disc\n", lba << 1); } return 0; -out_unclaim: - bd_release(pd->bdev); out_putdev: - blkdev_put(pd->bdev, FMODE_READ); + blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL); out: return ret; } @@ -2362,8 +2357,7 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush) pkt_lock_door(pd, 0); pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); - bd_release(pd->bdev); - blkdev_put(pd->bdev, FMODE_READ); + blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL); pkt_shrink_pktlist(pd); } @@ -2733,7 +2727,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) bdev = bdget(dev); if (!bdev) return -ENOMEM; - ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY); + ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); if (ret) return ret; diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index af13c62dc473..14033a36bcd0 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ 
-1348,7 +1348,10 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot) if (!CDROM_CAN(CDC_SELECT_DISC)) return -EDRIVE_CANT_DO_THIS; - (void) cdi->ops->media_changed(cdi, slot); + if (cdi->ops->check_events) + cdi->ops->check_events(cdi, 0, slot); + else + cdi->ops->media_changed(cdi, slot); if (slot == CDSL_NONE) { /* set media changed bits, on both queues */ @@ -1392,6 +1395,42 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot) return slot; } +/* + * As cdrom implements an extra ioctl consumer for media changed + * event, it needs to buffer ->check_events() output, such that event + * is not lost for both the usual VFS and ioctl paths. + * cdi->{vfs|ioctl}_events are used to buffer pending events for each + * path. + * + * XXX: Locking is non-existent. cdi->ops->check_events() can be + * called in parallel and buffering fields are accessed without any + * exclusion. The original media_changed code had the same problem. + * It might be better to simply deprecate CDROM_MEDIA_CHANGED ioctl + * and remove this cruft altogether. It doesn't have much usefulness + * at this point. + */ +static void cdrom_update_events(struct cdrom_device_info *cdi, + unsigned int clearing) +{ + unsigned int events; + + events = cdi->ops->check_events(cdi, clearing, CDSL_CURRENT); + cdi->vfs_events |= events; + cdi->ioctl_events |= events; +} + +unsigned int cdrom_check_events(struct cdrom_device_info *cdi, + unsigned int clearing) +{ + unsigned int events; + + cdrom_update_events(cdi, clearing); + events = cdi->vfs_events; + cdi->vfs_events = 0; + return events; +} +EXPORT_SYMBOL(cdrom_check_events); + /* We want to make media_changed accessible to the user through an * ioctl. 
The main problem now is that we must double-buffer the * low-level implementation, to assure that the VFS and the user both @@ -1403,15 +1442,26 @@ int media_changed(struct cdrom_device_info *cdi, int queue) { unsigned int mask = (1 << (queue & 1)); int ret = !!(cdi->mc_flags & mask); + bool changed; if (!CDROM_CAN(CDC_MEDIA_CHANGED)) - return ret; + return ret; + /* changed since last call? */ - if (cdi->ops->media_changed(cdi, CDSL_CURRENT)) { + if (cdi->ops->check_events) { + BUG_ON(!queue); /* shouldn't be called from VFS path */ + cdrom_update_events(cdi, DISK_EVENT_MEDIA_CHANGE); + changed = cdi->ioctl_events & DISK_EVENT_MEDIA_CHANGE; + cdi->ioctl_events = 0; + } else + changed = cdi->ops->media_changed(cdi, CDSL_CURRENT); + + if (changed) { cdi->mc_flags = 0x3; /* set bit on both queues */ ret |= 1; cdi->media_written = 0; } + cdi->mc_flags &= ~mask; /* clear bit */ return ret; } diff --git a/drivers/char/raw.c b/drivers/char/raw.c index bfe25ea9766b..b4b9d5a47885 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -65,15 +65,12 @@ static int raw_open(struct inode *inode, struct file *filp) if (!bdev) goto out; igrab(bdev->bd_inode); - err = blkdev_get(bdev, filp->f_mode); + err = blkdev_get(bdev, filp->f_mode | FMODE_EXCL, raw_open); if (err) goto out; - err = bd_claim(bdev, raw_open); - if (err) - goto out1; err = set_blocksize(bdev, bdev_logical_block_size(bdev)); if (err) - goto out2; + goto out1; filp->f_flags |= O_DIRECT; filp->f_mapping = bdev->bd_inode->i_mapping; if (++raw_devices[minor].inuse == 1) @@ -83,10 +80,8 @@ static int raw_open(struct inode *inode, struct file *filp) mutex_unlock(&raw_mutex); return 0; -out2: - bd_release(bdev); out1: - blkdev_put(bdev, filp->f_mode); + blkdev_put(bdev, filp->f_mode | FMODE_EXCL); out: mutex_unlock(&raw_mutex); return err; @@ -110,8 +105,7 @@ static int raw_release(struct inode *inode, struct file *filp) } mutex_unlock(&raw_mutex); - bd_release(bdev); - blkdev_put(bdev, filp->f_mode); + 
blkdev_put(bdev, filp->f_mode | FMODE_EXCL); return 0; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 4d705cea0f8c..985c20a4f30e 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -325,15 +325,18 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev, BUG_ON(d->dm_dev.bdev); - bdev = open_by_devnum(dev, d->dm_dev.mode); + bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr); if (IS_ERR(bdev)) return PTR_ERR(bdev); - r = bd_claim_by_disk(bdev, _claim_ptr, dm_disk(md)); - if (r) - blkdev_put(bdev, d->dm_dev.mode); - else - d->dm_dev.bdev = bdev; - return r; + + r = bd_link_disk_holder(bdev, dm_disk(md)); + if (r) { + blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL); + return r; + } + + d->dm_dev.bdev = bdev; + return 0; } /* @@ -344,8 +347,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) if (!d->dm_dev.bdev) return; - bd_release_from_disk(d->dm_dev.bdev, dm_disk(md)); - blkdev_put(d->dm_dev.bdev, d->dm_dev.mode); + blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL); d->dm_dev.bdev = NULL; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7cb1352f7e7a..f48a2f359ac4 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -630,7 +630,7 @@ static void dec_pending(struct dm_io *io, int error) queue_io(md, bio); } else { /* done with normal IO or empty flush */ - trace_block_bio_complete(md->queue, bio); + trace_block_bio_complete(md->queue, bio, io_error); bio_endio(bio, io_error); } } @@ -990,8 +990,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, if (r == DM_MAPIO_REMAPPED) { /* the bio has been remapped so dispatch it */ - trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, - tio->io->bio->bi_bdev->bd_dev, sector); + trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, + tio->io->bio->bi_bdev->bd_dev, sector); generic_make_request(clone); } else if (r < 0 || r == DM_MAPIO_REQUEUE) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 
175c424f201f..7fc090ac9e28 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1879,7 +1879,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); list_add_rcu(&rdev->same_set, &mddev->disks); - bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); + bd_link_disk_holder(rdev->bdev, mddev->gendisk); /* May as well allow recovery to be retried once */ mddev->recovery_disabled = 0; @@ -1906,7 +1906,6 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) MD_BUG(); return; } - bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; @@ -1934,19 +1933,13 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + shared ? (mdk_rdev_t *)lock_rdev : rdev); if (IS_ERR(bdev)) { printk(KERN_ERR "md: could not open %s.\n", __bdevname(dev, b)); return PTR_ERR(bdev); } - err = bd_claim(bdev, shared ? 
(mdk_rdev_t *)lock_rdev : rdev); - if (err) { - printk(KERN_ERR "md: could not bd_claim %s.\n", - bdevname(bdev, b)); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); - return err; - } if (!shared) set_bit(AllReserved, &rdev->flags); rdev->bdev = bdev; @@ -1959,8 +1952,7 @@ static void unlock_rdev(mdk_rdev_t *rdev) rdev->bdev = NULL; if (!bdev) MD_BUG(); - bd_release(bdev); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } void md_autodetect_dev(dev_t dev); diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index 2cf0cc6a4189..f29a6f9df6e7 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -224,7 +224,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) if (dev->blkdev) { invalidate_mapping_pages(dev->blkdev->bd_inode->i_mapping, 0, -1); - close_bdev_exclusive(dev->blkdev, FMODE_READ|FMODE_WRITE); + blkdev_put(dev->blkdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } kfree(dev); @@ -234,6 +234,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) /* FIXME: ensure that mtd->size % erase_size == 0 */ static struct block2mtd_dev *add_device(char *devname, int erase_size) { + const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; struct block_device *bdev; struct block2mtd_dev *dev; char *name; @@ -246,7 +247,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size) return NULL; /* Get a handle on the device */ - bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, NULL); + bdev = blkdev_get_by_path(devname, mode, dev); #ifndef MODULE if (IS_ERR(bdev)) { @@ -254,9 +255,8 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size) to resolve the device name by other means. 
*/ dev_t devt = name_to_dev_t(devname); - if (devt) { - bdev = open_by_devnum(devt, FMODE_WRITE | FMODE_READ); - } + if (devt) + bdev = blkdev_get_by_dev(devt, mode, dev); } #endif diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 30a1ca3d08b7..5505bc07e1e7 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -103,7 +103,7 @@ int dasd_scan_partitions(struct dasd_block *block) struct block_device *bdev; bdev = bdget_disk(block->gdp, 0); - if (!bdev || blkdev_get(bdev, FMODE_READ) < 0) + if (!bdev || blkdev_get(bdev, FMODE_READ, NULL) < 0) return -ENODEV; /* * See fs/partition/check.c:register_disk,rescan_partitions diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 501f67bef719..9045c52abd25 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1977,8 +1977,7 @@ EXPORT_SYMBOL(scsi_mode_sense); * in. * * Returns zero if unsuccessful or an error if TUR failed. For - * removable media, a return of NOT_READY or UNIT_ATTENTION is - * translated to success, with the ->changed flag updated. + * removable media, UNIT_ATTENTION sets ->changed flag. 
**/ int scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries, @@ -2005,16 +2004,6 @@ scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries, } while (scsi_sense_valid(sshdr) && sshdr->sense_key == UNIT_ATTENTION && --retries); - if (!sshdr) - /* could not allocate sense buffer, so can't process it */ - return result; - - if (sdev->removable && scsi_sense_valid(sshdr) && - (sshdr->sense_key == UNIT_ATTENTION || - sshdr->sense_key == NOT_READY)) { - sdev->changed = 1; - result = 0; - } if (!sshdr_external) kfree(sshdr); return result; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 365024b0c407..b65e65aa07eb 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1043,15 +1043,7 @@ static int sd_media_changed(struct gendisk *disk) sshdr); } - /* - * Unable to test, unit probably not ready. This usually - * means there is no disc in the drive. Mark as changed, - * and we will figure it out later once the drive is - * available again. - */ - if (retval || (scsi_sense_valid(sshdr) && - /* 0x3a is medium not present */ - sshdr->asc == 0x3a)) { + if (retval) { set_media_not_present(sdkp); goto out; } diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index d7b383c96d5d..be6baf8ad704 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -104,14 +104,15 @@ static void sr_release(struct cdrom_device_info *); static void get_sectorsize(struct scsi_cd *); static void get_capabilities(struct scsi_cd *); -static int sr_media_change(struct cdrom_device_info *, int); +static unsigned int sr_check_events(struct cdrom_device_info *cdi, + unsigned int clearing, int slot); static int sr_packet(struct cdrom_device_info *, struct packet_command *); static struct cdrom_device_ops sr_dops = { .open = sr_open, .release = sr_release, .drive_status = sr_drive_status, - .media_changed = sr_media_change, + .check_events = sr_check_events, .tray_move = sr_tray_move, .lock_door = sr_lock_door, .select_speed = sr_select_speed, @@ -165,90 +166,96 
@@ static void scsi_cd_put(struct scsi_cd *cd) mutex_unlock(&sr_ref_mutex); } -/* identical to scsi_test_unit_ready except that it doesn't - * eat the NOT_READY returns for removable media */ -int sr_test_unit_ready(struct scsi_device *sdev, struct scsi_sense_hdr *sshdr) +static unsigned int sr_get_events(struct scsi_device *sdev) { - int retries = MAX_RETRIES; - int the_result; - u8 cmd[] = {TEST_UNIT_READY, 0, 0, 0, 0, 0 }; + u8 buf[8]; + u8 cmd[] = { GET_EVENT_STATUS_NOTIFICATION, + 1, /* polled */ + 0, 0, /* reserved */ + 1 << 4, /* notification class: media */ + 0, 0, /* reserved */ + 0, sizeof(buf), /* allocation length */ + 0, /* control */ + }; + struct event_header *eh = (void *)buf; + struct media_event_desc *med = (void *)(buf + 4); + struct scsi_sense_hdr sshdr; + int result; - /* issue TEST_UNIT_READY until the initial startup UNIT_ATTENTION - * conditions are gone, or a timeout happens - */ - do { - the_result = scsi_execute_req(sdev, cmd, DMA_NONE, NULL, - 0, sshdr, SR_TIMEOUT, - retries--, NULL); - if (scsi_sense_valid(sshdr) && - sshdr->sense_key == UNIT_ATTENTION) - sdev->changed = 1; + result = scsi_execute_req(sdev, cmd, DMA_FROM_DEVICE, buf, sizeof(buf), + &sshdr, SR_TIMEOUT, MAX_RETRIES, NULL); + if (scsi_sense_valid(&sshdr) && sshdr.sense_key == UNIT_ATTENTION) + return DISK_EVENT_MEDIA_CHANGE; - } while (retries > 0 && - (!scsi_status_is_good(the_result) || - (scsi_sense_valid(sshdr) && - sshdr->sense_key == UNIT_ATTENTION))); - return the_result; + if (result || be16_to_cpu(eh->data_len) < sizeof(*med)) + return 0; + + if (eh->nea || eh->notification_class != 0x4) + return 0; + + if (med->media_event_code == 1) + return DISK_EVENT_EJECT_REQUEST; + else if (med->media_event_code == 2) + return DISK_EVENT_MEDIA_CHANGE; + return 0; } /* - * This function checks to see if the media has been changed in the - * CDROM drive. It is possible that we have already sensed a change, - * or the drive may have sensed one and not yet reported it. 
We must - * be ready for either case. This function always reports the current - * value of the changed bit. If flag is 0, then the changed bit is reset. - * This function could be done as an ioctl, but we would need to have - * an inode for that to work, and we do not always have one. + * This function checks to see if the media has been changed or eject + * button has been pressed. It is possible that we have already + * sensed a change, or the drive may have sensed one and not yet + * reported it. The past events are accumulated in sdev->changed and + * returned together with the current state. */ - -static int sr_media_change(struct cdrom_device_info *cdi, int slot) +static unsigned int sr_check_events(struct cdrom_device_info *cdi, + unsigned int clearing, int slot) { struct scsi_cd *cd = cdi->handle; - int retval; - struct scsi_sense_hdr *sshdr; + bool last_present; + struct scsi_sense_hdr sshdr; + unsigned int events; + int ret; - if (CDSL_CURRENT != slot) { - /* no changer support */ - return -EINVAL; + /* no changer support */ + if (CDSL_CURRENT != slot) + return 0; + + events = sr_get_events(cd->device); + /* + * GET_EVENT_STATUS_NOTIFICATION is enough unless MEDIA_CHANGE + * is being cleared. Note that there are devices which hang + * if asked to execute TUR repeatedly. + */ + if (!(clearing & DISK_EVENT_MEDIA_CHANGE)) + goto skip_tur; + + /* let's see whether the media is there with TUR */ + last_present = cd->media_present; + ret = scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr); + + /* + * Media is considered to be present if TUR succeeds or fails with + * sense data indicating something other than media-not-present + * (ASC 0x3a). 
+ */ + cd->media_present = scsi_status_is_good(ret) || + (scsi_sense_valid(&sshdr) && sshdr.asc != 0x3a); + + if (last_present != cd->media_present) + events |= DISK_EVENT_MEDIA_CHANGE; +skip_tur: + if (cd->device->changed) { + events |= DISK_EVENT_MEDIA_CHANGE; + cd->device->changed = 0; } - sshdr = kzalloc(sizeof(*sshdr), GFP_KERNEL); - retval = sr_test_unit_ready(cd->device, sshdr); - if (retval || (scsi_sense_valid(sshdr) && - /* 0x3a is medium not present */ - sshdr->asc == 0x3a)) { - /* Media not present or unable to test, unit probably not - * ready. This usually means there is no disc in the drive. - * Mark as changed, and we will figure it out later once - * the drive is available again. - */ - cd->device->changed = 1; - /* This will force a flush, if called from check_disk_change */ - retval = 1; - goto out; - }; - - retval = cd->device->changed; - cd->device->changed = 0; - /* If the disk changed, the capacity will now be different, - * so we force a re-read of this information */ - if (retval) { - /* check multisession offset etc */ - sr_cd_check(cdi); - get_sectorsize(cd); - } - -out: - /* Notify userspace, that media has changed. */ - if (retval != cd->previous_state) + /* for backward compatibility */ + if (events & DISK_EVENT_MEDIA_CHANGE) sdev_evt_send_simple(cd->device, SDEV_EVT_MEDIA_CHANGE, GFP_KERNEL); - cd->previous_state = retval; - kfree(sshdr); - - return retval; + return events; } - + /* * sr_done is the interrupt routine for the device driver. 
* @@ -533,10 +540,25 @@ out: return ret; } -static int sr_block_media_changed(struct gendisk *disk) +static unsigned int sr_block_check_events(struct gendisk *disk, + unsigned int clearing) { struct scsi_cd *cd = scsi_cd(disk); - return cdrom_media_changed(&cd->cdi); + return cdrom_check_events(&cd->cdi, clearing); +} + +static int sr_block_revalidate_disk(struct gendisk *disk) +{ + struct scsi_cd *cd = scsi_cd(disk); + struct scsi_sense_hdr sshdr; + + /* if the unit is not ready, nothing more to do */ + if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr)) + return 0; + + sr_cd_check(&cd->cdi); + get_sectorsize(cd); + return 0; } static const struct block_device_operations sr_bdops = @@ -545,7 +567,8 @@ static const struct block_device_operations sr_bdops = .open = sr_block_open, .release = sr_block_release, .ioctl = sr_block_ioctl, - .media_changed = sr_block_media_changed, + .check_events = sr_block_check_events, + .revalidate_disk = sr_block_revalidate_disk, /* * No compat_ioctl for now because sr_block_ioctl never * seems to pass arbitary ioctls down to host drivers. 
@@ -618,6 +641,7 @@ static int sr_probe(struct device *dev) sprintf(disk->disk_name, "sr%d", minor); disk->fops = &sr_bdops; disk->flags = GENHD_FL_CD; + disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST; blk_queue_rq_timeout(sdev->request_queue, SR_TIMEOUT); @@ -627,7 +651,7 @@ static int sr_probe(struct device *dev) cd->disk = disk; cd->capacity = 0x1fffff; cd->device->changed = 1; /* force recheck CD type */ - cd->previous_state = 1; + cd->media_present = 1; cd->use = 1; cd->readcd_known = 0; cd->readcd_cdda = 0; @@ -780,7 +804,7 @@ static void get_capabilities(struct scsi_cd *cd) } /* eat unit attentions */ - sr_test_unit_ready(cd->device, &sshdr); + scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr); /* ask for mode page 0x2a */ rc = scsi_mode_sense(cd->device, 0, 0x2a, buffer, 128, diff --git a/drivers/scsi/sr.h b/drivers/scsi/sr.h index 1e144dfdbd4b..e036f1dc83c8 100644 --- a/drivers/scsi/sr.h +++ b/drivers/scsi/sr.h @@ -40,7 +40,7 @@ typedef struct scsi_cd { unsigned xa_flag:1; /* CD has XA sectors ? 
*/ unsigned readcd_known:1; /* drive supports READ_CD (0xbe) */ unsigned readcd_cdda:1; /* reading audio data using READ_CD */ - unsigned previous_state:1; /* media has changed */ + unsigned media_present:1; /* media is present */ struct cdrom_device_info cdi; /* We hold gendisk and scsi_device references on probe and use * the refs on this kref to decide when to release them */ @@ -61,7 +61,6 @@ int sr_select_speed(struct cdrom_device_info *cdi, int speed); int sr_audio_ioctl(struct cdrom_device_info *, unsigned int, void *); int sr_is_xa(Scsi_CD *); -int sr_test_unit_ready(struct scsi_device *sdev, struct scsi_sense_hdr *sshdr); /* sr_vendor.c */ void sr_vendor_init(Scsi_CD *); diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c index 3cd8ffbad577..8be30554119b 100644 --- a/drivers/scsi/sr_ioctl.c +++ b/drivers/scsi/sr_ioctl.c @@ -307,7 +307,7 @@ int sr_drive_status(struct cdrom_device_info *cdi, int slot) /* we have no changer support */ return -EINVAL; } - if (0 == sr_test_unit_ready(cd->device, &sshdr)) + if (!scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr)) return CDS_DISC_OK; /* SK/ASC/ASCQ of 2/4/1 means "unit is becoming ready" */ diff --git a/drivers/usb/gadget/storage_common.c b/drivers/usb/gadget/storage_common.c index 3b513bafaf2a..b015561fd602 100644 --- a/drivers/usb/gadget/storage_common.c +++ b/drivers/usb/gadget/storage_common.c @@ -543,7 +543,7 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename) ro = curlun->initially_ro; if (!ro) { filp = filp_open(filename, O_RDWR | O_LARGEFILE, 0); - if (-EROFS == PTR_ERR(filp)) + if (PTR_ERR(filp) == -EROFS || PTR_ERR(filp) == -EACCES) ro = 1; } if (ro) @@ -558,10 +558,7 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename) if (filp->f_path.dentry) inode = filp->f_path.dentry->d_inode; - if (inode && S_ISBLK(inode->i_mode)) { - if (bdev_read_only(inode->i_bdev)) - ro = 1; - } else if (!inode || !S_ISREG(inode->i_mode)) { + if (!inode || 
(!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))) { LINFO(curlun, "invalid file type: %s\n", filename); goto out; } diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 4d0ff5ee27b8..e49cce234c65 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -782,7 +782,12 @@ void __init bio_integrity_init(void) { unsigned int i; - kintegrityd_wq = create_workqueue("kintegrityd"); + /* + * kintegrityd won't block much but may burn a lot of CPU cycles. + * Make it highpri CPU intensive wq with max concurrency of 1. + */ + kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); if (!kintegrityd_wq) panic("Failed to create kintegrityd\n"); diff --git a/fs/block_dev.c b/fs/block_dev.c index 88da70355aa3..fe3f59c14a02 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -432,9 +432,6 @@ static void init_once(void *foo) mutex_init(&bdev->bd_mutex); INIT_LIST_HEAD(&bdev->bd_inodes); INIT_LIST_HEAD(&bdev->bd_list); -#ifdef CONFIG_SYSFS - INIT_LIST_HEAD(&bdev->bd_holder_list); -#endif inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. 
*/ mutex_init(&bdev->bd_fsfreeze_mutex); @@ -669,7 +666,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, else if (bdev->bd_contains == bdev) return true; /* is a whole device which isn't held */ - else if (whole->bd_holder == bd_claim) + else if (whole->bd_holder == bd_may_claim) return true; /* is a partition of a device that is being partitioned */ else if (whole->bd_holder != NULL) return false; /* is a partition of a held device */ @@ -781,440 +778,88 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, } } -/* releases bdev_lock */ -static void __bd_abort_claiming(struct block_device *whole, void *holder) -{ - BUG_ON(whole->bd_claiming != holder); - whole->bd_claiming = NULL; - wake_up_bit(&whole->bd_claiming, 0); - - spin_unlock(&bdev_lock); - bdput(whole); -} - -/** - * bd_abort_claiming - abort claiming a block device - * @whole: whole block device returned by bd_start_claiming() - * @holder: holder trying to claim @bdev - * - * Abort a claiming block started by bd_start_claiming(). Note that - * @whole is not the block device to be claimed but the whole device - * returned by bd_start_claiming(). - * - * CONTEXT: - * Grabs and releases bdev_lock. - */ -static void bd_abort_claiming(struct block_device *whole, void *holder) -{ - spin_lock(&bdev_lock); - __bd_abort_claiming(whole, holder); /* releases bdev_lock */ -} - -/* increment holders when we have a legitimate claim. 
requires bdev_lock */ -static void __bd_claim(struct block_device *bdev, struct block_device *whole, - void *holder) -{ - /* note that for a whole device bd_holders - * will be incremented twice, and bd_holder will - * be set to bd_claim before being set to holder - */ - whole->bd_holders++; - whole->bd_holder = bd_claim; - bdev->bd_holders++; - bdev->bd_holder = holder; -} - -/** - * bd_finish_claiming - finish claiming a block device - * @bdev: block device of interest (passed to bd_start_claiming()) - * @whole: whole block device returned by bd_start_claiming() - * @holder: holder trying to claim @bdev - * - * Finish a claiming block started by bd_start_claiming(). - * - * CONTEXT: - * Grabs and releases bdev_lock. - */ -static void bd_finish_claiming(struct block_device *bdev, - struct block_device *whole, void *holder) -{ - spin_lock(&bdev_lock); - BUG_ON(!bd_may_claim(bdev, whole, holder)); - __bd_claim(bdev, whole, holder); - __bd_abort_claiming(whole, holder); /* not actually an abort */ -} - -/** - * bd_claim - claim a block device - * @bdev: block device to claim - * @holder: holder trying to claim @bdev - * - * Try to claim @bdev which must have been opened successfully. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * 0 if successful, -EBUSY if @bdev is already claimed. 
- */ -int bd_claim(struct block_device *bdev, void *holder) -{ - struct block_device *whole = bdev->bd_contains; - int res; - - might_sleep(); - - spin_lock(&bdev_lock); - res = bd_prepare_to_claim(bdev, whole, holder); - if (res == 0) - __bd_claim(bdev, whole, holder); - spin_unlock(&bdev_lock); - - return res; -} -EXPORT_SYMBOL(bd_claim); - -void bd_release(struct block_device *bdev) -{ - spin_lock(&bdev_lock); - if (!--bdev->bd_contains->bd_holders) - bdev->bd_contains->bd_holder = NULL; - if (!--bdev->bd_holders) - bdev->bd_holder = NULL; - spin_unlock(&bdev_lock); -} - -EXPORT_SYMBOL(bd_release); - #ifdef CONFIG_SYSFS -/* - * Functions for bd_claim_by_kobject / bd_release_from_kobject - * - * If a kobject is passed to bd_claim_by_kobject() - * and the kobject has a parent directory, - * following symlinks are created: - * o from the kobject to the claimed bdev - * o from "holders" directory of the bdev to the parent of the kobject - * bd_release_from_kobject() removes these symlinks. - * - * Example: - * If /dev/dm-0 maps to /dev/sda, kobject corresponding to - * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then: - * /sys/block/dm-0/slaves/sda --> /sys/block/sda - * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 - */ - static int add_symlink(struct kobject *from, struct kobject *to) { - if (!from || !to) - return 0; return sysfs_create_link(from, to, kobject_name(to)); } static void del_symlink(struct kobject *from, struct kobject *to) { - if (!from || !to) - return; sysfs_remove_link(from, kobject_name(to)); } -/* - * 'struct bd_holder' contains pointers to kobjects symlinked by - * bd_claim_by_kobject. - * It's connected to bd_holder_list which is protected by bdev->bd_sem. - */ -struct bd_holder { - struct list_head list; /* chain of holders of the bdev */ - int count; /* references from the holder */ - struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */ - struct kobject *hdev; /* e.g. 
"/block/dm-0" */ - struct kobject *hdir; /* e.g. "/block/sda/holders" */ - struct kobject *sdev; /* e.g. "/block/sda" */ -}; - -/* - * Get references of related kobjects at once. - * Returns 1 on success. 0 on failure. - * - * Should call bd_holder_release_dirs() after successful use. - */ -static int bd_holder_grab_dirs(struct block_device *bdev, - struct bd_holder *bo) -{ - if (!bdev || !bo) - return 0; - - bo->sdir = kobject_get(bo->sdir); - if (!bo->sdir) - return 0; - - bo->hdev = kobject_get(bo->sdir->parent); - if (!bo->hdev) - goto fail_put_sdir; - - bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj); - if (!bo->sdev) - goto fail_put_hdev; - - bo->hdir = kobject_get(bdev->bd_part->holder_dir); - if (!bo->hdir) - goto fail_put_sdev; - - return 1; - -fail_put_sdev: - kobject_put(bo->sdev); -fail_put_hdev: - kobject_put(bo->hdev); -fail_put_sdir: - kobject_put(bo->sdir); - - return 0; -} - -/* Put references of related kobjects at once. */ -static void bd_holder_release_dirs(struct bd_holder *bo) -{ - kobject_put(bo->hdir); - kobject_put(bo->sdev); - kobject_put(bo->hdev); - kobject_put(bo->sdir); -} - -static struct bd_holder *alloc_bd_holder(struct kobject *kobj) -{ - struct bd_holder *bo; - - bo = kzalloc(sizeof(*bo), GFP_KERNEL); - if (!bo) - return NULL; - - bo->count = 1; - bo->sdir = kobj; - - return bo; -} - -static void free_bd_holder(struct bd_holder *bo) -{ - kfree(bo); -} - /** - * find_bd_holder - find matching struct bd_holder from the block device + * bd_link_disk_holder - create symlinks between holding disk and slave bdev + * @bdev: the claimed slave bdev + * @disk: the holding disk * - * @bdev: struct block device to be searched - * @bo: target struct bd_holder + * This function creates the following sysfs symlinks. * - * Returns matching entry with @bo in @bdev->bd_holder_list. - * If found, increment the reference count and return the pointer. - * If not found, returns NULL. 
+ * - from "slaves" directory of the holder @disk to the claimed @bdev + * - from "holders" directory of the @bdev to the holder @disk + * + * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is + * passed to bd_link_disk_holder(), then: + * + * /sys/block/dm-0/slaves/sda --> /sys/block/sda + * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 + * + * The caller must have claimed @bdev before calling this function and + * ensure that both @bdev and @disk are valid during the creation and + * lifetime of these symlinks. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. */ -static struct bd_holder *find_bd_holder(struct block_device *bdev, - struct bd_holder *bo) +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { - struct bd_holder *tmp; - - list_for_each_entry(tmp, &bdev->bd_holder_list, list) - if (tmp->sdir == bo->sdir) { - tmp->count++; - return tmp; - } - - return NULL; -} - -/** - * add_bd_holder - create sysfs symlinks for bd_claim() relationship - * - * @bdev: block device to be bd_claimed - * @bo: preallocated and initialized by alloc_bd_holder() - * - * Add @bo to @bdev->bd_holder_list, create symlinks. - * - * Returns 0 if symlinks are created. - * Returns -ve if something fails. 
- */ -static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) -{ - int err; - - if (!bo) - return -EINVAL; - - if (!bd_holder_grab_dirs(bdev, bo)) - return -EBUSY; - - err = add_symlink(bo->sdir, bo->sdev); - if (err) - return err; - - err = add_symlink(bo->hdir, bo->hdev); - if (err) { - del_symlink(bo->sdir, bo->sdev); - return err; - } - - list_add_tail(&bo->list, &bdev->bd_holder_list); - return 0; -} - -/** - * del_bd_holder - delete sysfs symlinks for bd_claim() relationship - * - * @bdev: block device to be bd_claimed - * @kobj: holder's kobject - * - * If there is matching entry with @kobj in @bdev->bd_holder_list - * and no other bd_claim() from the same kobject, - * remove the struct bd_holder from the list, delete symlinks for it. - * - * Returns a pointer to the struct bd_holder when it's removed from the list - * and ready to be freed. - * Returns NULL if matching claim isn't found or there is other bd_claim() - * by the same kobject. - */ -static struct bd_holder *del_bd_holder(struct block_device *bdev, - struct kobject *kobj) -{ - struct bd_holder *bo; - - list_for_each_entry(bo, &bdev->bd_holder_list, list) { - if (bo->sdir == kobj) { - bo->count--; - BUG_ON(bo->count < 0); - if (!bo->count) { - list_del(&bo->list); - del_symlink(bo->sdir, bo->sdev); - del_symlink(bo->hdir, bo->hdev); - bd_holder_release_dirs(bo); - return bo; - } - break; - } - } - - return NULL; -} - -/** - * bd_claim_by_kobject - bd_claim() with additional kobject signature - * - * @bdev: block device to be claimed - * @holder: holder's signature - * @kobj: holder's kobject - * - * Do bd_claim() and if it succeeds, create sysfs symlinks between - * the bdev and the holder's kobject. - * Use bd_release_from_kobject() when relesing the claimed bdev. - * - * Returns 0 on success. (same as bd_claim()) - * Returns errno on failure. 
- */ -static int bd_claim_by_kobject(struct block_device *bdev, void *holder, - struct kobject *kobj) -{ - int err; - struct bd_holder *bo, *found; - - if (!kobj) - return -EINVAL; - - bo = alloc_bd_holder(kobj); - if (!bo) - return -ENOMEM; + int ret = 0; mutex_lock(&bdev->bd_mutex); - err = bd_claim(bdev, holder); - if (err) - goto fail; + WARN_ON_ONCE(!bdev->bd_holder || bdev->bd_holder_disk); - found = find_bd_holder(bdev, bo); - if (found) - goto fail; + /* FIXME: remove the following once add_disk() handles errors */ + if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) + goto out_unlock; - err = add_bd_holder(bdev, bo); - if (err) - bd_release(bdev); - else - bo = NULL; -fail: + ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + if (ret) + goto out_unlock; + + ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + if (ret) { + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + goto out_unlock; + } + + bdev->bd_holder_disk = disk; +out_unlock: mutex_unlock(&bdev->bd_mutex); - free_bd_holder(bo); - return err; + return ret; } +EXPORT_SYMBOL_GPL(bd_link_disk_holder); -/** - * bd_release_from_kobject - bd_release() with additional kobject signature - * - * @bdev: block device to be released - * @kobj: holder's kobject - * - * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject(). 
- */ -static void bd_release_from_kobject(struct block_device *bdev, - struct kobject *kobj) +static void bd_unlink_disk_holder(struct block_device *bdev) { - if (!kobj) + struct gendisk *disk = bdev->bd_holder_disk; + + bdev->bd_holder_disk = NULL; + if (!disk) return; - mutex_lock(&bdev->bd_mutex); - bd_release(bdev); - free_bd_holder(del_bd_holder(bdev, kobj)); - mutex_unlock(&bdev->bd_mutex); + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); } - -/** - * bd_claim_by_disk - wrapper function for bd_claim_by_kobject() - * - * @bdev: block device to be claimed - * @holder: holder's signature - * @disk: holder's gendisk - * - * Call bd_claim_by_kobject() with getting @disk->slave_dir. - */ -int bd_claim_by_disk(struct block_device *bdev, void *holder, - struct gendisk *disk) -{ - return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir)); -} -EXPORT_SYMBOL_GPL(bd_claim_by_disk); - -/** - * bd_release_from_disk - wrapper function for bd_release_from_kobject() - * - * @bdev: block device to be claimed - * @disk: holder's gendisk - * - * Call bd_release_from_kobject() and put @disk->slave_dir. - */ -void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk) -{ - bd_release_from_kobject(bdev, disk->slave_dir); - kobject_put(disk->slave_dir); -} -EXPORT_SYMBOL_GPL(bd_release_from_disk); +#else +static inline void bd_unlink_disk_holder(struct block_device *bdev) +{ } #endif -/* - * Tries to open block device by device number. Use it ONLY if you - * really do not have anything better - i.e. when you are behind a - * truly sucky interface and all you are given is a device number. _Never_ - * to be used for internal purposes. If you ever need it - reconsider - * your API. - */ -struct block_device *open_by_devnum(dev_t dev, fmode_t mode) -{ - struct block_device *bdev = bdget(dev); - int err = -ENOMEM; - if (bdev) - err = blkdev_get(bdev, mode); - return err ? 
ERR_PTR(err) : bdev; -} - -EXPORT_SYMBOL(open_by_devnum); - /** * flush_disk - invalidates all buffer-cache entries on a disk * @@ -1309,10 +954,11 @@ int check_disk_change(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; const struct block_device_operations *bdops = disk->fops; + unsigned int events; - if (!bdops->media_changed) - return 0; - if (!bdops->media_changed(bdev->bd_disk)) + events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | + DISK_EVENT_EJECT_REQUEST); + if (!(events & DISK_EVENT_MEDIA_CHANGE)) return 0; flush_disk(bdev); @@ -1475,17 +1121,171 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) return ret; } -int blkdev_get(struct block_device *bdev, fmode_t mode) +/** + * blkdev_get - open a block device + * @bdev: block_device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is + * open with exclusive access. Specifying %FMODE_EXCL with %NULL + * @holder is invalid. Exclusive opens may nest for the same @holder. + * + * On success, the reference count of @bdev is unchanged. On failure, + * @bdev is put. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { - return __blkdev_get(bdev, mode, 0); + struct block_device *whole = NULL; + int res; + + WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); + + if ((mode & FMODE_EXCL) && holder) { + whole = bd_start_claiming(bdev, holder); + if (IS_ERR(whole)) { + bdput(bdev); + return PTR_ERR(whole); + } + } + + res = __blkdev_get(bdev, mode, 0); + + /* __blkdev_get() may alter read only status, check it afterwards */ + if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { + __blkdev_put(bdev, mode, 0); + res = -EACCES; + } + + if (whole) { + /* finish claiming */ + mutex_lock(&bdev->bd_mutex); + spin_lock(&bdev_lock); + + if (!res) { + BUG_ON(!bd_may_claim(bdev, whole, holder)); + /* + * Note that for a whole device bd_holders + * will be incremented twice, and bd_holder + * will be set to bd_may_claim before being + * set to holder + */ + whole->bd_holders++; + whole->bd_holder = bd_may_claim; + bdev->bd_holders++; + bdev->bd_holder = holder; + } + + /* tell others that we're done */ + BUG_ON(whole->bd_claiming != holder); + whole->bd_claiming = NULL; + wake_up_bit(&whole->bd_claiming, 0); + + spin_unlock(&bdev_lock); + + /* + * Block event polling for write claims. Any write + * holder makes the write_holder state stick until all + * are released. This is good enough and tracking + * individual writeable reference is too fragile given + * the way @mode is used in blkdev_get/put(). + */ + if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { + bdev->bd_write_holder = true; + disk_block_events(bdev->bd_disk); + } + + mutex_unlock(&bdev->bd_mutex); + bdput(whole); + } + + return res; } EXPORT_SYMBOL(blkdev_get); +/** + * blkdev_get_by_path - open a block device by name + * @path: path to the block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by the device file at @path. 
@mode + * and @holder are identical to blkdev_get(). + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, + void *holder) +{ + struct block_device *bdev; + int err; + + bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + return bdev; + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_path); + +/** + * blkdev_get_by_dev - open a block device by device number + * @dev: device number of block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by device number @dev. @mode and + * @holder are identical to blkdev_get(). + * + * Use it ONLY if you really do not have anything better - i.e. when + * you are behind a truly sucky interface and all you are given is a + * device number. _Never_ to be used for internal purposes. If you + * ever need it - reconsider your API. + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. 
+ */ +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) +{ + struct block_device *bdev; + int err; + + bdev = bdget(dev); + if (!bdev) + return ERR_PTR(-ENOMEM); + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_dev); + static int blkdev_open(struct inode * inode, struct file * filp) { - struct block_device *whole = NULL; struct block_device *bdev; - int res; /* * Preserve backwards compatibility and allow large file access @@ -1506,26 +1306,9 @@ static int blkdev_open(struct inode * inode, struct file * filp) if (bdev == NULL) return -ENOMEM; - if (filp->f_mode & FMODE_EXCL) { - whole = bd_start_claiming(bdev, filp); - if (IS_ERR(whole)) { - bdput(bdev); - return PTR_ERR(whole); - } - } - filp->f_mapping = bdev->bd_inode->i_mapping; - res = blkdev_get(bdev, filp->f_mode); - - if (whole) { - if (res == 0) - bd_finish_claiming(bdev, whole, filp); - else - bd_abort_claiming(whole, filp); - } - - return res; + return blkdev_get(bdev, filp->f_mode, filp); } static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) @@ -1539,6 +1322,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_part_count--; if (!--bdev->bd_openers) { + WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); } @@ -1569,6 +1353,45 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) int blkdev_put(struct block_device *bdev, fmode_t mode) { + if (mode & FMODE_EXCL) { + bool bdev_free; + + /* + * Release a claim on the device. The holder fields + * are protected with bdev_lock. bd_mutex is to + * synchronize disk_holder unlinking. 
+ */ + mutex_lock(&bdev->bd_mutex); + spin_lock(&bdev_lock); + + WARN_ON_ONCE(--bdev->bd_holders < 0); + WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); + + /* bd_contains might point to self, check in a separate step */ + if ((bdev_free = !bdev->bd_holders)) + bdev->bd_holder = NULL; + if (!bdev->bd_contains->bd_holders) + bdev->bd_contains->bd_holder = NULL; + + spin_unlock(&bdev_lock); + + /* + * If this was the last claim, remove holder link and + * unblock evpoll if it was a write holder. + */ + if (bdev_free) { + bd_unlink_disk_holder(bdev); + if (bdev->bd_write_holder) { + disk_unblock_events(bdev->bd_disk); + bdev->bd_write_holder = false; + } else + disk_check_events(bdev->bd_disk); + } + + mutex_unlock(&bdev->bd_mutex); + } else + disk_check_events(bdev->bd_disk); + return __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); @@ -1576,8 +1399,7 @@ EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); - if (bdev->bd_holder == filp) - bd_release(bdev); + return blkdev_put(bdev, filp->f_mode); } @@ -1722,67 +1544,6 @@ fail: } EXPORT_SYMBOL(lookup_bdev); -/** - * open_bdev_exclusive - open a block device by name and set it up for use - * - * @path: special file representing the block device - * @mode: FMODE_... combination to pass be used - * @holder: owner for exclusion - * - * Open the blockdevice described by the special file at @path, claim it - * for the @holder. 
- */ -struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) -{ - struct block_device *bdev, *whole; - int error; - - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) - return bdev; - - whole = bd_start_claiming(bdev, holder); - if (IS_ERR(whole)) { - bdput(bdev); - return whole; - } - - error = blkdev_get(bdev, mode); - if (error) - goto out_abort_claiming; - - error = -EACCES; - if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) - goto out_blkdev_put; - - bd_finish_claiming(bdev, whole, holder); - return bdev; - -out_blkdev_put: - blkdev_put(bdev, mode); -out_abort_claiming: - bd_abort_claiming(whole, holder); - return ERR_PTR(error); -} - -EXPORT_SYMBOL(open_bdev_exclusive); - -/** - * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive() - * - * @bdev: blockdevice to close - * @mode: mode, must match that used to open. - * - * This is the counterpart to open_bdev_exclusive(). - */ -void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) -{ - bd_release(bdev); - blkdev_put(bdev, mode); -} - -EXPORT_SYMBOL(close_bdev_exclusive); - int __invalidate_device(struct block_device *bdev) { struct super_block *sb = get_super(bdev); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6b9884507837..1718e1a5c320 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -493,7 +493,7 @@ again: continue; if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); device->bdev = NULL; fs_devices->open_devices--; } @@ -527,7 +527,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); fs_devices->open_devices--; } if (device->writeable) { @@ -584,13 +584,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int seeding = 1; int ret = 0; + flags 
|= FMODE_EXCL; + list_for_each_entry(device, head, dev_list) { if (device->bdev) continue; if (!device->name) continue; - bdev = open_bdev_exclusive(device->name, flags, holder); + bdev = blkdev_get_by_path(device->name, flags, holder); if (IS_ERR(bdev)) { printk(KERN_INFO "open %s failed\n", device->name); goto error; @@ -642,7 +644,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, error_brelse: brelse(bh); error_close: - close_bdev_exclusive(bdev, FMODE_READ); + blkdev_put(bdev, flags); error: continue; } @@ -688,7 +690,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, mutex_lock(&uuid_mutex); - bdev = open_bdev_exclusive(path, flags, holder); + flags |= FMODE_EXCL; + bdev = blkdev_get_by_path(path, flags, holder); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); @@ -720,7 +723,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, brelse(bh); error_close: - close_bdev_exclusive(bdev, flags); + blkdev_put(bdev, flags); error: mutex_unlock(&uuid_mutex); return ret; @@ -1183,8 +1186,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto out; } } else { - bdev = open_bdev_exclusive(device_path, FMODE_READ, - root->fs_info->bdev_holder); + bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); goto out; @@ -1251,7 +1254,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->latest_bdev = next_device->bdev; if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); device->bdev = NULL; device->fs_devices->open_devices--; } @@ -1294,7 +1297,7 @@ error_brelse: brelse(bh); error_close: if (bdev) - close_bdev_exclusive(bdev, FMODE_READ); + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); @@ -1446,7 +1449,8 @@ int btrfs_init_new_device(struct 
btrfs_root *root, char *device_path) if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) return -EINVAL; - bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); + bdev = blkdev_get_by_path(device_path, FMODE_EXCL, + root->fs_info->bdev_holder); if (IS_ERR(bdev)) return PTR_ERR(bdev); @@ -1572,7 +1576,7 @@ out: mutex_unlock(&root->fs_info->volume_mutex); return ret; error: - close_bdev_exclusive(bdev, 0); + blkdev_put(bdev, FMODE_EXCL); if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2740db49eb04..1be781079450 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -50,7 +50,7 @@ struct btrfs_device { struct block_device *bdev; - /* the mode sent to open_bdev_exclusive */ + /* the mode sent to blkdev_get */ fmode_t mode; char *name; diff --git a/fs/char_dev.c b/fs/char_dev.c index 6e99b9ddd4e9..dca9e5e0f73b 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -59,7 +59,7 @@ static struct char_device_struct { } *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; /* index in the above */ -static inline int major_to_index(int major) +static inline int major_to_index(unsigned major) { return major % CHRDEV_MAJOR_HASH_SIZE; } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index b7d0554631e4..7aa767d4f06f 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -364,7 +364,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; @@ -381,8 +381,7 @@ fail: */ static int ext3_blkdev_put(struct block_device *bdev) { - bd_release(bdev); - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int ext3_blkdev_remove(struct ext3_sb_info *sbi) @@ -2162,13 +2161,6 @@ static 
journal_t *ext3_get_dev_journal(struct super_block *sb, if (bdev == NULL) return NULL; - if (bd_claim(bdev, sb)) { - ext3_msg(sb, KERN_ERR, - "error: failed to claim external journal device"); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); - return NULL; - } - blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 29c80f6d8b27..cb10a06775e4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -657,7 +657,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; @@ -673,8 +673,7 @@ fail: */ static int ext4_blkdev_put(struct block_device *bdev) { - bd_release(bdev); - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int ext4_blkdev_remove(struct ext4_sb_info *sbi) @@ -3778,13 +3777,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if (bdev == NULL) return NULL; - if (bd_claim(bdev, sb)) { - ext4_msg(sb, KERN_ERR, - "failed to claim external journal device"); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); - return NULL; - } - blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 693f4470a2df..777927ce6f79 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1268,7 +1268,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; int error; struct gfs2_args args; struct gfs2_sbd *sdp; @@ -1276,7 +1276,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if (!(flags & 
MS_RDONLY)) mode |= FMODE_WRITE; - bdev = open_bdev_exclusive(dev_name, mode, fs_type); + bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, goto error_bdev; if (s->s_root) - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode); memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; @@ -1342,7 +1342,7 @@ error_super: deactivate_locked_super(s); return ERR_PTR(error); error_bdev: - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode); return ERR_PTR(error); } diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index e1b8493b9aaa..278e3fb40b71 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1120,16 +1120,13 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + log); if (IS_ERR(bdev)) { rc = -PTR_ERR(bdev); goto free; } - if ((rc = bd_claim(bdev, log))) { - goto close; - } - log->bdev = bdev; memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); @@ -1137,7 +1134,7 @@ int lmLogOpen(struct super_block *sb) * initialize log: */ if ((rc = lmLogInit(log))) - goto unclaim; + goto close; list_add(&log->journal_list, &jfs_external_logs); @@ -1163,11 +1160,8 @@ journal_found: list_del(&log->journal_list); lbmLogShutdown(log); - unclaim: - bd_release(bdev); - close: /* close external log device */ - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1512,8 +1506,7 @@ int lmLogClose(struct super_block *sb) bdev = log->bdev; rc = lmLogShutdown(log); - bd_release(bdev); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); kfree(log); diff --git a/fs/logfs/dev_bdev.c 
b/fs/logfs/dev_bdev.c index 92ca6fbe09bd..723bc5bca09a 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -300,7 +300,7 @@ static int bdev_write_sb(struct super_block *sb, struct page *page) static void bdev_put_device(struct logfs_super *s) { - close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int bdev_can_write_buf(struct super_block *sb, u64 ofs) @@ -325,13 +325,14 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type, { struct block_device *bdev; - bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type); + bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + type); if (IS_ERR(bdev)) return PTR_ERR(bdev); if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { int mtdnr = MINOR(bdev->bd_dev); - close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); return logfs_get_sb_mtd(p, mtdnr); } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 3a359023c9f7..230b79fbf005 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -845,11 +845,6 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct page **pp = rqstp->rq_respages + rqstp->rq_resused; struct page *page = buf->page; size_t size; - int ret; - - ret = buf->ops->confirm(pipe, buf); - if (unlikely(ret)) - return ret; size = sd->len; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 70dfdd532b83..0994f6a76c07 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1163,14 +1163,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags, { struct nilfs_super_data sd; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; struct dentry *root_dentry; int err, s_new = false; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; - sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); + sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(sd.bdev)) return 
ERR_CAST(sd.bdev); @@ -1249,7 +1249,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (!s_new) - close_bdev_exclusive(sd.bdev, mode); + blkdev_put(sd.bdev, mode); return root_dentry; @@ -1258,7 +1258,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, failed: if (!s_new) - close_bdev_exclusive(sd.bdev, mode); + blkdev_put(sd.bdev, mode); return ERR_PTR(err); } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a6cc05302e9f..b108e863d8f6 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1729,7 +1729,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, goto out; reg->hr_bdev = I_BDEV(filp->f_mapping->host); - ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ); + ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); if (ret) { reg->hr_bdev = NULL; goto out; diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 0a8b0ad0c7e2..9c21119512b9 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -237,6 +237,13 @@ ssize_t part_size_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); } +ssize_t part_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%d\n", p->policy ? 
1 : 0); +} + ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -312,6 +319,7 @@ ssize_t part_fail_store(struct device *dev, static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, NULL); @@ -326,6 +334,7 @@ static struct attribute *part_attrs[] = { &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, + &dev_attr_ro.attr, &dev_attr_alignment_offset.attr, &dev_attr_discard_alignment.attr, &dev_attr_stat.attr, @@ -372,6 +381,11 @@ static void delete_partition_rcu_cb(struct rcu_head *head) put_device(part_to_dev(part)); } +void __delete_partition(struct hd_struct *part) +{ + call_rcu(&part->rcu_head, delete_partition_rcu_cb); +} + void delete_partition(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = disk->part_tbl; @@ -390,7 +404,7 @@ void delete_partition(struct gendisk *disk, int partno) kobject_put(part->holder_dir); device_del(part_to_dev(part)); - call_rcu(&part->rcu_head, delete_partition_rcu_cb); + hd_struct_put(part); } static ssize_t whole_disk_show(struct device *dev, @@ -489,6 +503,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); + hd_ref_init(p); return p; out_free_info: @@ -507,65 +522,6 @@ out_put: return ERR_PTR(err); } -/* Not exported, helper to add_disk(). 
*/ -void register_disk(struct gendisk *disk) -{ - struct device *ddev = disk_to_dev(disk); - struct block_device *bdev; - struct disk_part_iter piter; - struct hd_struct *part; - int err; - - ddev->parent = disk->driverfs_dev; - - dev_set_name(ddev, disk->disk_name); - - /* delay uevents, until we scanned partition table */ - dev_set_uevent_suppress(ddev, 1); - - if (device_add(ddev)) - return; - if (!sysfs_deprecated) { - err = sysfs_create_link(block_depr, &ddev->kobj, - kobject_name(&ddev->kobj)); - if (err) { - device_del(ddev); - return; - } - } - disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); - disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - - /* No minors to use for partitions */ - if (!disk_partitionable(disk)) - goto exit; - - /* No such device (e.g., media were just removed) */ - if (!get_capacity(disk)) - goto exit; - - bdev = bdget_disk(disk, 0); - if (!bdev) - goto exit; - - bdev->bd_invalidated = 1; - err = blkdev_get(bdev, FMODE_READ); - if (err < 0) - goto exit; - blkdev_put(bdev, FMODE_READ); - -exit: - /* announce disk after possible partitions are created */ - dev_set_uevent_suppress(ddev, 0); - kobject_uevent(&ddev->kobj, KOBJ_ADD); - - /* announce possible partitions */ - disk_part_iter_init(&piter, disk, 0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); - disk_part_iter_exit(&piter); -} - static bool disk_unlock_native_capacity(struct gendisk *disk) { const struct block_device_operations *bdops = disk->fops; @@ -728,33 +684,3 @@ fail: } EXPORT_SYMBOL(read_dev_sector); - -void del_gendisk(struct gendisk *disk) -{ - struct disk_part_iter piter; - struct hd_struct *part; - - /* invalidate stuff */ - disk_part_iter_init(&piter, disk, - DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); - while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(disk, part->partno); - delete_partition(disk, part->partno); - } - disk_part_iter_exit(&piter); - - 
invalidate_partition(disk, 0); - blk_free_devt(disk_to_dev(disk)->devt); - set_capacity(disk, 0); - disk->flags &= ~GENHD_FL_UP; - unlink_gendisk(disk); - part_stat_set_all(&disk->part0, 0); - disk->part0.stamp = 0; - - kobject_put(disk->part0.holder_dir); - kobject_put(disk->slave_dir); - disk->driverfs_dev = NULL; - if (!sysfs_deprecated) - sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); - device_del(disk_to_dev(disk)); -} diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index d31bce1a9f90..3eea859e6990 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2551,8 +2551,6 @@ static int release_journal_dev(struct super_block *super, result = 0; if (journal->j_dev_bd != NULL) { - if (journal->j_dev_bd->bd_dev != super->s_dev) - bd_release(journal->j_dev_bd); result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode); journal->j_dev_bd = NULL; } @@ -2570,7 +2568,7 @@ static int journal_init_dev(struct super_block *super, { int result; dev_t jdev; - fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE; + fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; char b[BDEVNAME_SIZE]; result = 0; @@ -2584,7 +2582,10 @@ static int journal_init_dev(struct super_block *super, /* there is no "jdev" option and journal is on separate device */ if ((!jdev_name || !jdev_name[0])) { - journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode); + if (jdev == super->s_dev) + blkdev_mode &= ~FMODE_EXCL; + journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, + journal); journal->j_dev_mode = blkdev_mode; if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); @@ -2593,22 +2594,14 @@ static int journal_init_dev(struct super_block *super, "cannot init journal device '%s': %i", __bdevname(jdev, b), result); return result; - } else if (jdev != super->s_dev) { - result = bd_claim(journal->j_dev_bd, journal); - if (result) { - blkdev_put(journal->j_dev_bd, blkdev_mode); - return result; - } - + } else if (jdev != super->s_dev) 
set_blocksize(journal->j_dev_bd, super->s_blocksize); - } return 0; } journal->j_dev_mode = blkdev_mode; - journal->j_dev_bd = open_bdev_exclusive(jdev_name, - blkdev_mode, journal); + journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal); if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; diff --git a/fs/splice.c b/fs/splice.c index ce2f02579e35..50a5d978da16 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -682,19 +682,14 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, { struct file *file = sd->u.file; loff_t pos = sd->pos; - int ret, more; + int more; - ret = buf->ops->confirm(pipe, buf); - if (!ret) { - more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; - if (file->f_op && file->f_op->sendpage) - ret = file->f_op->sendpage(file, buf->page, buf->offset, - sd->len, &pos, more); - else - ret = -EINVAL; - } + if (!likely(file->f_op && file->f_op->sendpage)) + return -EINVAL; - return ret; + more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; + return file->f_op->sendpage(file, buf->page, buf->offset, + sd->len, &pos, more); } /* @@ -727,13 +722,6 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, void *fsdata; int ret; - /* - * make sure the data in this buffer is uptodate - */ - ret = buf->ops->confirm(pipe, buf); - if (unlikely(ret)) - return ret; - offset = sd->pos & ~PAGE_CACHE_MASK; this_len = sd->len; @@ -805,12 +793,17 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, if (sd->len > sd->total_len) sd->len = sd->total_len; - ret = actor(pipe, buf, sd); - if (ret <= 0) { + ret = buf->ops->confirm(pipe, buf); + if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; return ret; } + + ret = actor(pipe, buf, sd); + if (ret <= 0) + return ret; + buf->offset += ret; buf->len -= ret; @@ -1044,10 +1037,6 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, int ret; void *data; - ret = 
buf->ops->confirm(pipe, buf); - if (ret) - return ret; - data = buf->ops->map(pipe, buf, 0); ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); buf->ops->unmap(pipe, buf, data); @@ -1495,10 +1484,6 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, char *src; int ret; - ret = buf->ops->confirm(pipe, buf); - if (unlikely(ret)) - return ret; - /* * See if we can use the atomic maps, by prefaulting in the * pages and doing an atomic copy diff --git a/fs/super.c b/fs/super.c index 823e061faa87..4f6a3571a634 100644 --- a/fs/super.c +++ b/fs/super.c @@ -767,13 +767,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; int error = 0; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; - bdev = open_bdev_exclusive(dev_name, mode, fs_type); + bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -802,13 +802,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, /* * s_umount nests inside bd_mutex during - * __invalidate_device(). close_bdev_exclusive() - * acquires bd_mutex and can't be called under - * s_umount. Drop s_umount temporarily. This is safe - * as we're holding an active reference. + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. 
*/ up_write(&s->s_umount); - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode); down_write(&s->s_umount); } else { char b[BDEVNAME_SIZE]; @@ -832,7 +832,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, error_s: error = PTR_ERR(s); error_bdev: - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode); error: return ERR_PTR(error); } @@ -863,7 +863,8 @@ void kill_block_super(struct super_block *sb) bdev->bd_super = NULL; generic_shutdown_super(sb); sync_blockdev(bdev); - close_bdev_exclusive(bdev, mode); + WARN_ON_ONCE(!(mode & FMODE_EXCL)); + blkdev_put(bdev, mode | FMODE_EXCL); } EXPORT_SYMBOL(kill_block_super); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index a10f6416e563..bd07f7339366 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -606,7 +606,8 @@ xfs_blkdev_get( { int error = 0; - *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); + *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + mp); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); printk("XFS: Invalid device [%s], error=%d\n", name, error); @@ -620,7 +621,7 @@ xfs_blkdev_put( struct block_device *bdev) { if (bdev) - close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 36ab42c9bb99..4d18ff34670a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -115,6 +115,7 @@ struct request { void *elevator_private3; struct gendisk *rq_disk; + struct hd_struct *part; unsigned long start_time; #ifdef CONFIG_BLK_CGROUP unsigned long long start_time_ns; @@ -646,7 +647,6 @@ static inline void rq_flush_dcache_pages(struct request *rq) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); -extern void register_disk(struct gendisk *dev); extern void generic_make_request(struct bio *bio); extern void 
blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); @@ -1256,6 +1256,9 @@ struct block_device_operations { int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); + unsigned int (*check_events) (struct gendisk *disk, + unsigned int clearing); + /* ->media_changed() is DEPRECATED, use ->check_events() instead */ int (*media_changed) (struct gendisk *); void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 78e904796622..35eae4b67503 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -946,6 +946,8 @@ struct cdrom_device_info { /* device-related storage */ unsigned int options : 30; /* options flags */ unsigned mc_flags : 2; /* media change buffer flags */ + unsigned int vfs_events; /* cached events for vfs path */ + unsigned int ioctl_events; /* cached events for ioctl path */ int use_count; /* number of times device opened */ char name[20]; /* name of the device type */ /* per-device flags */ @@ -965,6 +967,8 @@ struct cdrom_device_ops { int (*open) (struct cdrom_device_info *, int); void (*release) (struct cdrom_device_info *); int (*drive_status) (struct cdrom_device_info *, int); + unsigned int (*check_events) (struct cdrom_device_info *cdi, + unsigned int clearing, int slot); int (*media_changed) (struct cdrom_device_info *, int); int (*tray_move) (struct cdrom_device_info *, int); int (*lock_door) (struct cdrom_device_info *, int); @@ -993,6 +997,8 @@ extern int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, extern void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode); extern int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); +extern unsigned int cdrom_check_events(struct 
cdrom_device_info *cdi, + unsigned int clearing); extern int cdrom_media_changed(struct cdrom_device_info *); extern int register_cdrom(struct cdrom_device_info *cdi); diff --git a/include/linux/fs.h b/include/linux/fs.h index c0701288d204..3984f2358d1f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -664,8 +664,9 @@ struct block_device { void * bd_claiming; void * bd_holder; int bd_holders; + bool bd_write_holder; #ifdef CONFIG_SYSFS - struct list_head bd_holder_list; + struct gendisk * bd_holder_disk; /* for sysfs slave linking */ #endif struct block_device * bd_contains; unsigned bd_block_size; @@ -2019,7 +2020,6 @@ extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); -extern struct block_device *open_by_devnum(dev_t, fmode_t); extern void invalidate_bdev(struct block_device *); extern int sync_blockdev(struct block_device *bdev); extern struct super_block *freeze_bdev(struct block_device *); @@ -2050,16 +2050,20 @@ extern const struct file_operations def_fifo_fops; extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); -extern int blkdev_get(struct block_device *, fmode_t); -extern int blkdev_put(struct block_device *, fmode_t); -extern int bd_claim(struct block_device *, void *); -extern void bd_release(struct block_device *); +extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); +extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, + void *holder); +extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, + void *holder); +extern int blkdev_put(struct block_device *bdev, fmode_t mode); #ifdef CONFIG_SYSFS -extern int bd_claim_by_disk(struct block_device *,
void *, struct gendisk *); -extern void bd_release_from_disk(struct block_device *, struct gendisk *); +extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); #else -#define bd_claim_by_disk(bdev, holder, disk) bd_claim(bdev, holder) -#define bd_release_from_disk(bdev, disk) bd_release(bdev) +static inline int bd_link_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ + return 0; +} #endif #endif @@ -2095,8 +2099,6 @@ static inline void unregister_chrdev(unsigned int major, const char *name) extern const char *__bdevname(dev_t, char *buffer); extern const char *bdevname(struct block_device *bdev, char *buffer); extern struct block_device *lookup_bdev(const char *); -extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *); -extern void close_bdev_exclusive(struct block_device *, fmode_t); extern void blkdev_show(struct seq_file *,off_t); #else diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7a7b9c1644e4..c0d5f6945c1e 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -115,6 +115,7 @@ struct hd_struct { #else struct disk_stats dkstats; #endif + atomic_t ref; struct rcu_head rcu_head; }; @@ -127,6 +128,11 @@ struct hd_struct { #define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ #define GENHD_FL_NATIVE_CAPACITY 128 +enum { + DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ + DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */ +}; + #define BLK_SCSI_MAX_CMDS (256) #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) @@ -143,6 +149,8 @@ struct disk_part_tbl { struct hd_struct __rcu *part[]; }; +struct disk_events; + struct gendisk { /* major, first_minor and minors are input parameters only, * don't use directly. Use disk_devt() and disk_max_parts(). 
@@ -154,6 +162,10 @@ struct gendisk { char disk_name[DISK_NAME_LEN]; /* name of major driver */ char *(*devnode)(struct gendisk *gd, mode_t *mode); + + unsigned int events; /* supported events */ + unsigned int async_events; /* async events, subset of all */ + /* Array of pointers to partitions indexed by partno. * Protected with matching bdev lock but stat and other * non-critical accesses use RCU. Always access through @@ -171,9 +183,8 @@ struct gendisk { struct kobject *slave_dir; struct timer_rand_state *random; - atomic_t sync_io; /* RAID */ - struct work_struct async_notify; + struct disk_events *ev; #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *integrity; #endif @@ -395,7 +406,6 @@ extern void part_round_stats(int cpu, struct hd_struct *part); /* block/genhd.c */ extern void add_disk(struct gendisk *disk); extern void del_gendisk(struct gendisk *gp); -extern void unlink_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); @@ -407,6 +417,11 @@ static inline int get_disk_ro(struct gendisk *disk) return disk->part0.policy; } +extern void disk_block_events(struct gendisk *disk); +extern void disk_unblock_events(struct gendisk *disk); +extern void disk_check_events(struct gendisk *disk); +extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); + /* drivers/char/random.c */ extern void add_disk_randomness(struct gendisk *disk); extern void rand_initialize_disk(struct gendisk *disk); @@ -583,6 +598,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk, sector_t len, int flags, struct partition_meta_info *info); +extern void __delete_partition(struct hd_struct *); extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); @@ -611,6 +627,29 @@ extern ssize_t part_fail_store(struct device *dev, const char *buf, size_t count); #endif /* CONFIG_FAIL_MAKE_REQUEST */ 
+static inline void hd_ref_init(struct hd_struct *part) +{ + atomic_set(&part->ref, 1); + smp_mb(); +} + +static inline void hd_struct_get(struct hd_struct *part) +{ + atomic_inc(&part->ref); + smp_mb__after_atomic_inc(); +} + +static inline int hd_struct_try_get(struct hd_struct *part) +{ + return atomic_inc_not_zero(&part->ref); +} + +static inline void hd_struct_put(struct hd_struct *part) +{ + if (atomic_dec_and_test(&part->ref)) + __delete_partition(part); +} + #else /* CONFIG_BLOCK */ static inline void printk_all_partitions(void) { } diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h index 1651fef18831..648d23358038 100644 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h @@ -104,6 +104,7 @@ struct scsi_cmnd; #define UNMAP 0x42 #define READ_TOC 0x43 #define READ_HEADER 0x44 +#define GET_EVENT_STATUS_NOTIFICATION 0x4a #define LOG_SELECT 0x4c #define LOG_SENSE 0x4d #define XDWRITEREAD_10 0x53 diff --git a/include/trace/events/block.h b/include/trace/events/block.h index d8ce278515c3..aba421d68f6f 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -206,15 +206,16 @@ TRACE_EVENT(block_bio_bounce, * block_bio_complete - completed all work on the block operation * @q: queue holding the block operation * @bio: block operation completed + * @error: io error value * * This tracepoint indicates there is no further work to do on this * block IO operation @bio. 
*/ TRACE_EVENT(block_bio_complete, - TP_PROTO(struct request_queue *q, struct bio *bio), + TP_PROTO(struct request_queue *q, struct bio *bio, int error), - TP_ARGS(q, bio), + TP_ARGS(q, bio, error), TP_STRUCT__entry( __field( dev_t, dev ) @@ -228,6 +229,7 @@ TRACE_EVENT(block_bio_complete, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_sector; __entry->nr_sector = bio->bi_size >> 9; + __entry->error = error; blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); ), @@ -486,16 +488,16 @@ TRACE_EVENT(block_split, ); /** - * block_remap - map request for a partition to the raw device + * block_bio_remap - map request for a logical device to the raw device * @q: queue holding the operation * @bio: revised operation * @dev: device for the operation * @from: original sector for the operation * - * An operation for a partition on a block device has been mapped to the + * An operation for a logical device has been mapped to the * raw block device. */ -TRACE_EVENT(block_remap, +TRACE_EVENT(block_bio_remap, TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, sector_t from), diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 69425889bd40..7c97c3a0eee3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -224,7 +224,7 @@ static int swsusp_swap_check(void) return res; root_swap = res; - res = blkdev_get(hib_resume_bdev, FMODE_WRITE); + res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); if (res) return res; @@ -930,7 +930,8 @@ int swsusp_check(void) { int error; - hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, + FMODE_READ, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7b8ec0281548..153562d0b93c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -758,53 +758,58 @@ static void 
blk_add_trace_rq_complete(void *ignore, * @q: queue the io is for * @bio: the source bio * @what: the action + * @error: error, if any * * Description: * Records an action against a bio. Will log the bio offset + size. * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what) + u32 what, int error) { struct blk_trace *bt = q->blk_trace; if (likely(!bt)) return; + if (!error && !bio_flagged(bio, BIO_UPTODATE)) + error = EIO; + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, - !bio_flagged(bio, BIO_UPTODATE), 0, NULL); + error, 0, NULL); } static void blk_add_trace_bio_bounce(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, - struct request_queue *q, struct bio *bio) + struct request_queue *q, struct bio *bio, + int error) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); } static void blk_add_trace_bio_backmerge(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); } static void blk_add_trace_bio_frontmerge(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); } static void blk_add_trace_bio_queue(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE); + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); } static void blk_add_trace_getrq(void *ignore, @@ -812,7 +817,7 @@ static void blk_add_trace_getrq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ); + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); else { struct blk_trace *bt = q->blk_trace; @@ -827,7 +832,7 @@ static void blk_add_trace_sleeprq(void *ignore, struct bio *bio, int rw) { 
if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); else { struct blk_trace *bt = q->blk_trace; @@ -887,7 +892,7 @@ static void blk_add_trace_split(void *ignore, } /** - * blk_add_trace_remap - Add a trace for a remap operation + * blk_add_trace_bio_remap - Add a trace for a bio-remap operation * @ignore: trace callback data parameter (not used) * @q: queue the io is for * @bio: the source bio @@ -899,9 +904,9 @@ static void blk_add_trace_split(void *ignore, * it spans a stripe (or similar). Add a trace for that action. * **/ -static void blk_add_trace_remap(void *ignore, - struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from) +static void blk_add_trace_bio_remap(void *ignore, + struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -1016,7 +1021,7 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); - ret = register_trace_block_remap(blk_add_trace_remap, NULL); + ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); WARN_ON(ret); ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); WARN_ON(ret); @@ -1025,7 +1030,7 @@ static void blk_register_tracepoints(void) static void blk_unregister_tracepoints(void) { unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); - unregister_trace_block_remap(blk_add_trace_remap, NULL); + unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); diff --git a/mm/swapfile.c b/mm/swapfile.c index 67ddaaf98c74..b6adcfbf6f48 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1677,7 +1677,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if 
(S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); set_blocksize(bdev, p->old_block_size); - bd_release(bdev); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { mutex_lock(&inode->i_mutex); inode->i_flags &= ~S_SWAPFILE; @@ -1939,7 +1939,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EINVAL; if (S_ISBLK(inode->i_mode)) { bdev = I_BDEV(inode); - error = bd_claim(bdev, sys_swapon); + error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, + sys_swapon); if (error < 0) { bdev = NULL; error = -EINVAL; @@ -2136,7 +2137,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) bad_swap: if (bdev) { set_blocksize(bdev, p->old_block_size); - bd_release(bdev); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } destroy_swap_extents(p); swap_cgroup_swapoff(type);