1
0
Fork 0

lightnvm: physical block device (pblk) target

This patch introduces pblk, a host-side translation layer for
Open-Channel SSDs to expose them like block devices. The translation
layer allows data placement decisions, and I/O scheduling to be
managed by the host, enabling users to optimize the SSD for their
specific workloads.

An open-channel SSD has a set of LUNs (parallel units) and a
collection of blocks. Each block can be read in any order, but
writes must be sequential. Writes may also fail, and if a block
requires it, must also be reset before new writes can be
applied.

To manage the constraints, pblk maintains a logical to
physical address (L2P) table, a write cache, garbage
collection logic, recovery scheme, and logic to rate-limit
user I/Os versus garbage collection I/Os.

The L2P table is fully-associative and manages sectors at a
4KB granularity. Pblk stores the L2P table in two places, in
the out-of-band area of the media and on the last page of a
line. In the case of a power failure, pblk will perform a
scan to recover the L2P table.

The user data is organized into lines. A line is data
striped across blocks and LUNs. The lines enable the host to
reduce the amount of metadata to maintain besides the user
data and makes it easier to implement RAID or erasure coding
in the future.

pblk implements multi-tenant support and can be instantiated
multiple times on the same drive. Each instance owns a
portion of the SSD - both regarding I/O bandwidth and
capacity - providing I/O isolation for each case.

Finally, pblk also exposes a sysfs interface that allows
user-space to peek into the internals of pblk. The interface
is available at /sys/block/*/pblk/ where * is the block
device name exposed.

This work also contains contributions from:
  Matias Bjørling <matias@cnexlabs.com>
  Simon A. F. Lund <slund@cnexlabs.com>
  Young Tack Jin <youngtack.jin@gmail.com>
  Huaicheng Li <huaicheng@cs.uchicago.edu>

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
zero-colors
Javier González 2017-04-15 20:55:50 +02:00 committed by Jens Axboe
parent 6eb082452d
commit a4bd217b43
15 changed files with 8044 additions and 0 deletions

View File

@ -0,0 +1,21 @@
pblk: Physical Block Device Target
==================================
pblk implements a fully associative, host-based FTL that exposes a traditional
block I/O interface. Its primary responsibilities are:
- Map logical addresses onto physical addresses (4KB granularity) in a
logical-to-physical (L2P) table.
- Maintain the integrity and consistency of the L2P table as well as its
recovery from normal tear down and power outage.
- Deal with controller- and media-specific constraints.
- Handle I/O errors.
- Implement garbage collection.
- Maintain consistency across the I/O stack during synchronization points.
For more information please refer to:
http://lightnvm.io
which maintains updated FAQs, manual pages, technical documentation, tools,
contacts, etc.

View File

@ -33,4 +33,13 @@ config NVM_RRPC
host. The target is implemented using a linear mapping table and
cost-based garbage collection. It is optimized for 4K IO sizes.
config NVM_PBLK
tristate "Physical Block Device Open-Channel SSD target"
---help---
Allows an open-channel SSD to be exposed as a block device to the
host. The target assumes the device exposes raw flash and must be
explicitly managed by the host.
Please note the disk format is considered EXPERIMENTAL for now.
endif # NVM

View File

@ -4,3 +4,8 @@
obj-$(CONFIG_NVM) := core.o
obj-$(CONFIG_NVM_RRPC) += rrpc.o
obj-$(CONFIG_NVM_PBLK) += pblk.o
pblk-y := pblk-init.o pblk-core.o pblk-rb.o \
pblk-write.o pblk-cache.o pblk-read.o \
pblk-gc.o pblk-recovery.o pblk-map.o \
pblk-rl.o pblk-sysfs.o

View File

@ -0,0 +1,114 @@
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-cache.c - pblk's write cache
*/
#include "pblk.h"
/*
 * pblk_write_to_cache - copy the payload of a user bio into the write buffer
 * @pblk:  target instance
 * @bio:   user write bio
 * @flags: I/O type flags stored in every buffer entry's write context
 *
 * Returns the last status reported by pblk_rb_may_write_user(). The write
 * thread is kicked in every case, including bios that carry no data.
 */
int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
{
	struct pblk_w_ctx w_ctx;
	sector_t first_lba = pblk_get_lba(bio);
	int nr_entries = pblk_get_secs(bio);
	unsigned int bpos, pos;
	int idx, ret;

	/* Update the write buffer head (mem) with the entries that we can
	 * write. The write in itself cannot fail, so there is no need to
	 * rollback from here on.
	 */
	for (;;) {
		ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries,
					     &bpos);
		if (ret != NVM_IO_REQUEUE)
			break;
		io_schedule();
	}

	if (likely(bio_has_data(bio))) {
		w_ctx.flags = flags;
		pblk_ppa_set_empty(&w_ctx.ppa);

		/* One buffer entry per exposed page, at consecutive lbas */
		for (idx = 0; idx < nr_entries; idx++) {
			void *data = bio_data(bio);

			w_ctx.lba = first_lba + idx;
			pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + idx);
			pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos);

			bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
		}

#ifdef CONFIG_NVM_DEBUG
		atomic_long_add(nr_entries, &pblk->inflight_writes);
		atomic_long_add(nr_entries, &pblk->req_writes);
#endif
	}

	pblk_write_should_kick(pblk);
	return ret;
}
/*
 * pblk_write_gc_to_cache - copy garbage-collected sectors into the write buffer
 * @pblk:           target instance
 * @data:           buffer holding the valid sectors, packed back to back
 * @lba_list:       lba of each candidate sector; ADDR_EMPTY entries are skipped
 * @nr_entries:     number of slots in @lba_list
 * @nr_rec_entries: number of buffer entries to reserve (expected valid count)
 * @gc_line:        line the sectors are being moved out of
 * @flags:          I/O type flags stored in every entry's write context
 *
 * On GC the incoming lbas are not necessarily sequential. Also, some of the
 * lbas might not be valid entries, which are marked as empty by the GC thread.
 * Always returns NVM_IO_OK.
 */
int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
			   unsigned int nr_entries, unsigned int nr_rec_entries,
			   struct pblk_line *gc_line, unsigned long flags)
{
	struct pblk_w_ctx w_ctx;
	unsigned int bpos, pos;
	int i, valid_entries = 0;

	/* Update the write buffer head (mem) with the entries that we can
	 * write. The write in itself cannot fail, so there is no need to
	 * rollback from here on.
	 */
	while (!pblk_rb_may_write_gc(&pblk->rwb, nr_rec_entries, &bpos))
		io_schedule();

	w_ctx.flags = flags;
	pblk_ppa_set_empty(&w_ctx.ppa);

	for (i = 0; i < nr_entries; i++) {
		if (lba_list[i] != ADDR_EMPTY) {
			w_ctx.lba = lba_list[i];

			pos = pblk_rb_wrap_pos(&pblk->rwb,
					       bpos + valid_entries);
			pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx,
					       gc_line, pos);

			/* @data only advances for sectors actually copied */
			data += PBLK_EXPOSED_PAGE_SIZE;
			valid_entries++;
		}
	}

	WARN_ONCE(nr_rec_entries != valid_entries,
					"pblk: inconsistent GC write\n");

#ifdef CONFIG_NVM_DEBUG
	atomic_long_add(valid_entries, &pblk->inflight_writes);
	atomic_long_add(valid_entries, &pblk->recov_gc_writes);
#endif

	pblk_write_should_kick(pblk);
	return NVM_IO_OK;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,555 @@
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-gc.c - pblk's garbage collector
*/
#include "pblk.h"
#include <linux/delay.h>
/* Release a GC request together with the lba list and data buffer it owns. */
static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq)
{
	kfree(gc_rq->lba_list);
	kfree(gc_rq->data);

	kfree(gc_rq);
}
static int pblk_gc_write(struct pblk *pblk)
{
struct pblk_gc *gc = &pblk->gc;
struct pblk_gc_rq *gc_rq, *tgc_rq;
LIST_HEAD(w_list);
spin_lock(&gc->w_lock);
if (list_empty(&gc->w_list)) {
spin_unlock(&gc->w_lock);
return 1;
}
list_for_each_entry_safe(gc_rq, tgc_rq, &gc->w_list, list) {
list_move_tail(&gc_rq->list, &w_list);
gc->w_entries--;
}
spin_unlock(&gc->w_lock);
list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) {
pblk_write_gc_to_cache(pblk, gc_rq->data, gc_rq->lba_list,
gc_rq->nr_secs, gc_rq->secs_to_gc,
gc_rq->line, PBLK_IOTYPE_GC);
kref_put(&gc_rq->line->ref, pblk_line_put);
list_del(&gc_rq->list);
pblk_gc_free_gc_rq(gc_rq);
}
return 0;
}
/* Wake the GC writer kthread so it drains the GC write list. */
static void pblk_gc_writer_kick(struct pblk_gc *gc)
{
	struct task_struct *writer = gc->gc_writer_ts;

	wake_up_process(writer);
}
/*
 * pblk_gc_move_valid_secs - read valid sectors of a line and queue them for
 * rewriting
 * @pblk:     target instance
 * @line:     GC victim line
 * @lba_list: lbas to move; ownership passes to this function
 * @nr_secs:  number of entries in @lba_list
 *
 * Responsible for managing all memory related to a gc request, also in case
 * of failure: @lba_list and the data buffer are either handed to the queued
 * request or freed here. A line reference is taken for each queued request.
 *
 * Returns NVM_IO_OK on success (including when the read reports nothing to
 * move) and NVM_IO_ERR on allocation or read failure.
 */
static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_line *line,
				   u64 *lba_list, unsigned int nr_secs)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_gc *gc = &pblk->gc;
	struct pblk_gc_rq *gc_rq;
	unsigned int secs_to_gc;
	void *data;
	int ret = NVM_IO_ERR;

	data = kmalloc(nr_secs * geo->sec_size, GFP_KERNEL);
	if (!data)
		goto free_lba_list;

	/* Read from GC victim block */
	if (pblk_submit_read_gc(pblk, lba_list, data, nr_secs,
				&secs_to_gc, line))
		goto free_data;

	/* Nothing left to move: not an error, just release the buffers */
	if (!secs_to_gc) {
		ret = NVM_IO_OK;
		goto free_data;
	}

	gc_rq = kmalloc(sizeof(*gc_rq), GFP_KERNEL);
	if (!gc_rq)
		goto free_data;

	gc_rq->line = line;
	gc_rq->data = data;
	gc_rq->lba_list = lba_list;
	gc_rq->nr_secs = nr_secs;
	gc_rq->secs_to_gc = secs_to_gc;

	kref_get(&line->ref);

	/* Back off while more than 256 requests are waiting for the writer */
	for (;;) {
		spin_lock(&gc->w_lock);
		if (gc->w_entries <= 256)
			break;
		spin_unlock(&gc->w_lock);
		usleep_range(256, 1024);
	}

	/* gc->w_lock is held here */
	gc->w_entries++;
	list_add_tail(&gc_rq->list, &gc->w_list);
	spin_unlock(&gc->w_lock);

	pblk_gc_writer_kick(gc);

	return NVM_IO_OK;

free_data:
	kfree(data);
free_lba_list:
	kfree(lba_list);

	return ret;
}
/*
 * Return a line that was picked for GC to the CLOSED state and re-insert it
 * into the GC list chosen by pblk_line_gc_list(), if that returns one.
 */
static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct list_head *target;

	spin_lock(&line->lock);
	WARN_ON(line->state != PBLK_LINESTATE_GC);
	line->state = PBLK_LINESTATE_CLOSED;
	target = pblk_line_gc_list(pblk, line);
	spin_unlock(&line->lock);

	if (!target)
		return;

	spin_lock(&l_mg->gc_lock);
	list_add_tail(&line->list, target);
	spin_unlock(&l_mg->gc_lock);
}
/*
 * Work item that collects the still-valid sectors of a GC victim line.
 *
 * Walks the zero bits of line->invalid_bitmap, gathers the matching lbas from
 * the list passed in line_ws->priv in batches of at most pblk->max_write_pgs,
 * and hands each batch to pblk_gc_move_valid_secs(), which takes ownership of
 * the batch array. On exit the line's emeta buffer and the work context are
 * released and gc.inflight_gc is decremented. The line reference is dropped
 * unless the line was handed back through pblk_put_line_back().
 */
static void pblk_gc_line_ws(struct work_struct *work)
{
struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
ws);
struct pblk *pblk = line_ws->pblk;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line *line = line_ws->line;
struct pblk_line_meta *lm = &pblk->lm;
__le64 *lba_list = line_ws->priv;
u64 *gc_list;
int sec_left;
int nr_ppas, bit;
/* Cleared on every path that returns the line via pblk_put_line_back() */
int put_line = 1;
pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id);
/* Snapshot the valid sector count under the line lock */
spin_lock(&line->lock);
sec_left = line->vsc;
if (!sec_left) {
/* Lines are erased before being used (l_mg->data_/log_next) */
spin_unlock(&line->lock);
goto out;
}
spin_unlock(&line->lock);
if (sec_left < 0) {
pr_err("pblk: corrupted GC line (%d)\n", line->id);
put_line = 0;
pblk_put_line_back(pblk, line);
goto out;
}
/* The bitmap scan below starts at bit + 1, i.e. at bit 0 */
bit = -1;
next_rq:
/* One lba array per batch; freed by pblk_gc_move_valid_secs() or below */
gc_list = kmalloc_array(pblk->max_write_pgs, sizeof(u64), GFP_KERNEL);
if (!gc_list) {
put_line = 0;
pblk_put_line_back(pblk, line);
goto out;
}
nr_ppas = 0;
do {
bit = find_next_zero_bit(line->invalid_bitmap, lm->sec_per_line,
bit + 1);
/* Stop once the scan moves past line->emeta_ssec */
if (bit > line->emeta_ssec)
break;
gc_list[nr_ppas++] = le64_to_cpu(lba_list[bit]);
} while (nr_ppas < pblk->max_write_pgs);
if (unlikely(!nr_ppas)) {
kfree(gc_list);
goto out;
}
if (pblk_gc_move_valid_secs(pblk, line, gc_list, nr_ppas)) {
/* NOTE(review): nr_ppas is printed twice in this message - confirm
 * whether a different counter was intended for one of the slots.
 */
pr_err("pblk: could not GC all sectors: line:%d (%d/%d/%d)\n",
line->id, line->vsc,
nr_ppas, nr_ppas);
put_line = 0;
pblk_put_line_back(pblk, line);
goto out;
}
sec_left -= nr_ppas;
if (sec_left > 0)
goto next_rq;
out:
pblk_mfree(line->emeta, l_mg->emeta_alloc_type);
mempool_free(line_ws, pblk->line_ws_pool);
atomic_dec(&pblk->gc.inflight_gc);
if (put_line)
kref_put(&line->ref, pblk_line_put);
}
/*
 * pblk_gc_line - prepare a victim line and queue it to the GC reader workqueue
 * @pblk: target instance
 * @line: line already marked PBLK_LINESTATE_GC and accounted in
 *        gc.inflight_gc by pblk_gc_run()
 *
 * Allocates and reads the line's emeta, extracts its lba list and queues
 * pblk_gc_line_ws() to move the valid sectors.
 *
 * Returns 0 when the work was queued. On failure returns 1 after putting the
 * line back on a GC list and dropping the inflight accounting taken for it.
 */
static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	struct pblk_line_ws *line_ws;
	__le64 *lba_list;
	int ret;

	line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL);
	line->emeta = pblk_malloc(lm->emeta_len, l_mg->emeta_alloc_type,
								GFP_KERNEL);
	if (!line->emeta) {
		pr_err("pblk: cannot use GC emeta\n");
		goto fail_free_ws;
	}

	ret = pblk_line_read_emeta(pblk, line);
	if (ret) {
		pr_err("pblk: line %d read emeta failed (%d)\n", line->id, ret);
		goto fail_free_emeta;
	}

	/* If this read fails, it means that emeta is corrupted. For now, leave
	 * the line untouched. TODO: Implement a recovery routine that scans and
	 * moves all sectors on the line.
	 */
	lba_list = pblk_recov_get_lba_list(pblk, line->emeta);
	if (!lba_list) {
		pr_err("pblk: could not interpret emeta (line %d)\n", line->id);
		goto fail_free_emeta;
	}

	line_ws->pblk = pblk;
	line_ws->line = line;
	line_ws->priv = lba_list;

	INIT_WORK(&line_ws->ws, pblk_gc_line_ws);
	queue_work(pblk->gc.gc_reader_wq, &line_ws->ws);

	return 0;

fail_free_emeta:
	pblk_mfree(line->emeta, l_mg->emeta_alloc_type);
fail_free_ws:
	mempool_free(line_ws, pblk->line_ws_pool);
	pblk_put_line_back(pblk, line);
	/* No worker will run for this line, so balance the increment done in
	 * pblk_gc_run(); otherwise inflight_gc only grows and GC throttles
	 * itself permanently.
	 */
	atomic_dec(&pblk->gc.inflight_gc);

	return 1;
}
/*
 * Start GC on every line of @gc_list (a private list built by the caller).
 *
 * Each line is unlinked from @gc_list *before* it is handed to
 * pblk_gc_line(): that call may re-link line->list into a shared GC list
 * (failure path, via pblk_put_line_back()) or queue a worker that does so
 * concurrently. Unlinking afterwards would remove the line from whichever
 * list it had just been added to, without holding that list's lock.
 */
static void pblk_gc_lines(struct pblk *pblk, struct list_head *gc_list)
{
	struct pblk_line *line, *tline;

	list_for_each_entry_safe(line, tline, gc_list, list) {
		list_del(&line->list);
		if (pblk_gc_line(pblk, line))
			pr_err("pblk: failed to GC line %d\n", line->id);
	}
}
/*
 * Lines with no valid sectors will be returned to the free list immediately. If
 * GC is activated - either because the free block count is under the determined
 * threshold, or because it is being forced from user space - only lines with a
 * high count of invalid sectors will be recycled.
 *
 * Victim lines are moved from the l_mg->gc_lists[] groups onto a private list
 * and handed to pblk_gc_lines(). gc->inflight_gc is incremented for every line
 * selected here.
 */
static void pblk_gc_run(struct pblk *pblk)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_gc *gc = &pblk->gc;
struct pblk_line *line, *tline;
unsigned int nr_blocks_free, nr_blocks_need;
struct list_head *group_list;
int run_gc, gc_group = 0;
int prev_gc = 0;
/* Local copy of the inflight counter, advanced alongside the atomic */
int inflight_gc = atomic_read(&gc->inflight_gc);
LIST_HEAD(gc_list);
/* Lines on gc_full_list: mark as GC, unlink and drop their reference */
spin_lock(&l_mg->gc_lock);
list_for_each_entry_safe(line, tline, &l_mg->gc_full_list, list) {
spin_lock(&line->lock);
WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
line->state = PBLK_LINESTATE_GC;
spin_unlock(&line->lock);
list_del(&line->list);
kref_put(&line->ref, pblk_line_put);
}
spin_unlock(&l_mg->gc_lock);
/* GC runs when free blocks are below the threshold or GC is forced */
nr_blocks_need = pblk_rl_gc_thrs(&pblk->rl);
nr_blocks_free = pblk_rl_nr_free_blks(&pblk->rl);
run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced);
next_gc_group:
/* gc_group is post-incremented: after this line it indexes the next group */
group_list = l_mg->gc_lists[gc_group++];
spin_lock(&l_mg->gc_lock);
while (run_gc && !list_empty(group_list)) {
/* No need to queue up more GC lines than we can handle */
if (!run_gc || inflight_gc > gc->gc_jobs_active) {
spin_unlock(&l_mg->gc_lock);
pblk_gc_lines(pblk, &gc_list);
return;
}
line = list_first_entry(group_list, struct pblk_line, list);
/* Count the blocks of the selected line as if they were already free */
nr_blocks_free += line->blk_in_line;
spin_lock(&line->lock);
WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
line->state = PBLK_LINESTATE_GC;
list_move_tail(&line->list, &gc_list);
atomic_inc(&gc->inflight_gc);
inflight_gc++;
spin_unlock(&line->lock);
prev_gc = 1;
run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced);
}
spin_unlock(&l_mg->gc_lock);
pblk_gc_lines(pblk, &gc_list);
/* Try the next group only if nothing has been selected so far and the
 * rate limiter state (rl.rb_state) is above the next group index.
 */
if (!prev_gc && pblk->rl.rb_state > gc_group &&
gc_group < PBLK_NR_GC_LISTS)
goto next_gc_group;
}
/*
 * Wake both GC kthreads (main and writer) and re-arm the GC timer so the next
 * kick happens GC_TIME_MSECS from now.
 */
static void pblk_gc_kick(struct pblk *pblk)
{
	struct pblk_gc *gc = &pblk->gc;
	unsigned long next_kick;

	wake_up_process(gc->gc_ts);
	wake_up_process(gc->gc_writer_ts);

	next_kick = jiffies + msecs_to_jiffies(GC_TIME_MSECS);
	mod_timer(&gc->gc_timer, next_kick);
}
/* GC timer callback; @data carries the struct pblk pointer set at timer setup. */
static void pblk_gc_timer(unsigned long data)
{
	pblk_gc_kick((struct pblk *)data);
}
static int pblk_gc_ts(void *data)
{
struct pblk *pblk = data;
while (!kthread_should_stop()) {
pblk_gc_run(pblk);
set_current_state(TASK_INTERRUPTIBLE);
io_schedule();
}
return 0;
}
static int pblk_gc_writer_ts(void *data)
{
struct pblk *pblk = data;
while (!kthread_should_stop()) {
if (!pblk_gc_write(pblk))
continue;
set_current_state(TASK_INTERRUPTIBLE);
io_schedule();
}
return 0;
}
static void pblk_gc_start(struct pblk *pblk)
{
pblk->gc.gc_active = 1;
pr_debug("pblk: gc start\n");
}
/* Return the current gc_active flag, read under gc->lock. */
int pblk_gc_status(struct pblk *pblk)
{
	struct pblk_gc *gc = &pblk->gc;
	int active;

	spin_lock(&gc->lock);
	active = gc->gc_active;
	spin_unlock(&gc->lock);

	return active;
}
/*
 * Activate GC if it is enabled and not already active.
 * The caller must hold gc->lock.
 */
static void __pblk_gc_should_start(struct pblk *pblk)
{
	struct pblk_gc *gc = &pblk->gc;

	lockdep_assert_held(&gc->lock);

	if (!gc->gc_enabled || gc->gc_active)
		return;

	pblk_gc_start(pblk);
}
void pblk_gc_should_start(struct pblk *pblk)
{
struct pblk_gc *gc = &pblk->gc;
spin_lock(&gc->lock);
__pblk_gc_should_start(pblk);
spin_unlock(&gc->lock);
}
/*
* If flush_wq == 1 then no lock should be held by the caller since
* flush_workqueue can sleep
*/
static void pblk_gc_stop(struct pblk *pblk, int flush_wq)
{
spin_lock(&pblk->gc.lock);
pblk->gc.gc_active = 0;
spin_unlock(&pblk->gc.lock);
pr_debug("pblk: gc stop\n");
}
/*
 * Deactivate GC when it is active and not being forced. The flags are read
 * here without taking gc->lock.
 */
void pblk_gc_should_stop(struct pblk *pblk)
{
	struct pblk_gc *gc = &pblk->gc;

	if (!gc->gc_active || gc->gc_forced)
		return;

	pblk_gc_stop(pblk, 0);
}
/*
 * Report the GC flags for sysfs.
 * @gc_enabled and @gc_active receive a snapshot of both flags taken under
 * gc->lock.
 */
void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
			      int *gc_active)
{
	struct pblk_gc *gc = &pblk->gc;
	int enabled, active;

	spin_lock(&gc->lock);
	enabled = gc->gc_enabled;
	active = gc->gc_active;
	*gc_enabled = enabled;
	*gc_active = active;
	spin_unlock(&gc->lock);
}
/*
 * Set or clear forced GC from sysfs.
 *
 * A non-zero @force enables GC and passes 64 to pblk_rl_set_gc_rsc(); zero
 * passes 0 and leaves gc_enabled untouched. In both cases gc_forced is
 * updated and GC is started if it is enabled and currently inactive.
 */
void pblk_gc_sysfs_force(struct pblk *pblk, int force)
{
	struct pblk_gc *gc = &pblk->gc;
	int rsv = force ? 64 : 0;

	spin_lock(&gc->lock);
	if (force)
		gc->gc_enabled = 1;
	pblk_rl_set_gc_rsc(&pblk->rl, rsv);
	gc->gc_forced = force;
	__pblk_gc_should_start(pblk);
	spin_unlock(&gc->lock);
}
/*
 * pblk_gc_init - set up GC state, kthreads, reader workqueue and timer
 * @pblk: target instance
 *
 * Locks, lists and counters are initialised first, and the timer is armed
 * last, so that the timer callback (which wakes both kthreads) can never
 * observe partially initialised state or outlive a failed init.
 *
 * Returns 0 on success or a negative errno; on failure everything created
 * here has been torn down again.
 */
int pblk_gc_init(struct pblk *pblk)
{
	struct pblk_gc *gc = &pblk->gc;
	int ret;

	spin_lock_init(&gc->lock);
	spin_lock_init(&gc->w_lock);
	INIT_LIST_HEAD(&gc->w_list);

	gc->gc_active = 0;
	gc->gc_forced = 0;
	gc->gc_enabled = 1;
	gc->gc_jobs_active = 8;
	gc->w_entries = 0;
	atomic_set(&gc->inflight_gc, 0);

	/* The kthreads are only created here; the first timer kick wakes them */
	gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts");
	if (IS_ERR(gc->gc_ts)) {
		pr_err("pblk: could not allocate GC main kthread\n");
		return PTR_ERR(gc->gc_ts);
	}

	gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk,
							"pblk-gc-writer-ts");
	if (IS_ERR(gc->gc_writer_ts)) {
		pr_err("pblk: could not allocate GC writer kthread\n");
		ret = PTR_ERR(gc->gc_writer_ts);
		goto fail_free_main_kthread;
	}

	gc->gc_reader_wq = alloc_workqueue("pblk-gc-reader-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, gc->gc_jobs_active);
	if (!gc->gc_reader_wq) {
		pr_err("pblk: could not allocate GC reader workqueue\n");
		ret = -ENOMEM;
		goto fail_free_writer_kthread;
	}

	setup_timer(&gc->gc_timer, pblk_gc_timer, (unsigned long)pblk);
	mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));

	return 0;

	/* Unwind in reverse order of creation */
fail_free_writer_kthread:
	kthread_stop(gc->gc_writer_ts);
fail_free_main_kthread:
	kthread_stop(gc->gc_ts);

	return ret;
}
/*
 * pblk_gc_exit - tear down the GC machinery created by pblk_gc_init()
 * @pblk: target instance
 *
 * Must be called from a context that may sleep.
 */
void pblk_gc_exit(struct pblk *pblk)
{
	struct pblk_gc *gc = &pblk->gc;

	if (gc->gc_reader_wq)
		flush_workqueue(gc->gc_reader_wq);

	/* The timer callback re-arms itself; the _sync variant waits for a
	 * running callback so nothing wakes the kthreads after they are gone.
	 */
	del_timer_sync(&gc->gc_timer);
	pblk_gc_stop(pblk, 1);

	if (gc->gc_ts)
		kthread_stop(gc->gc_ts);

	if (gc->gc_reader_wq)
		destroy_workqueue(gc->gc_reader_wq);

	if (gc->gc_writer_ts)
		kthread_stop(gc->gc_writer_ts);
}

View File

@ -0,0 +1,949 @@
/*
* Copyright (C) 2015 IT University of Copenhagen (rrpc.c)
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Implementation of a physical block-device target for Open-channel SSDs.
*
* pblk-init.c - pblk's initialization.
*/
#include "pblk.h"
static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_r_rq_cache,
*pblk_w_rq_cache, *pblk_line_meta_cache;
static DECLARE_RWSEM(pblk_lock);
/*
 * Dispatch a read or write bio.
 *
 * Reads are always split against the queue limits and submitted directly;
 * writes go through the write cache. Returns the NVM_IO_* status of the
 * chosen path.
 */
static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
			  struct bio *bio)
{
	int ret;

	if (bio_data_dir(bio) != READ) {
		/* Prevent deadlock in the case of a modest LUN configuration
		 * and large user I/Os. Unless stalled, the rate limiter
		 * leaves at least 256KB available for user I/O.
		 */
		if (unlikely(pblk_get_secs(bio) >=
					pblk_rl_sysfs_rate_show(&pblk->rl)))
			blk_queue_split(q, &bio, q->bio_split);

		return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
	}

	/* Read requests must be <= 256kb due to NVMe's 64 bit completion
	 * bitmap constraint. Writes can be of arbitrary size.
	 */
	blk_queue_split(q, &bio, q->bio_split);
	ret = pblk_submit_read(pblk, bio);

	/* A completed read on a cloned bio drops the clone's reference here */
	if (ret == NVM_IO_DONE && bio_flagged(bio, BIO_CLONED))
		bio_put(bio);

	return ret;
}
/*
 * make_request entry point of the pblk block device.
 *
 * Discards are handled first; a discard without REQ_PREFLUSH completes
 * immediately. Everything else goes through pblk_rw_io(), and the bio is
 * completed here when that reports NVM_IO_ERR or NVM_IO_DONE.
 */
static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio)
{
	struct pblk *pblk = q->queuedata;
	int status;

	if (bio_op(bio) == REQ_OP_DISCARD) {
		pblk_discard(pblk, bio);
		if (!(bio->bi_opf & REQ_PREFLUSH)) {
			bio_endio(bio);
			return BLK_QC_T_NONE;
		}
	}

	status = pblk_rw_io(q, pblk, bio);
	if (status == NVM_IO_ERR)
		bio_io_error(bio);
	else if (status == NVM_IO_DONE)
		bio_endio(bio);

	return BLK_QC_T_NONE;
}
/* Release the L2P translation map allocated by pblk_l2p_init(). */
static void pblk_l2p_free(struct pblk *pblk)
{
	void *map = pblk->trans_map;

	vfree(map);
}
static int pblk_l2p_init(struct pblk *pblk)
{
sector_t i;
struct ppa_addr ppa;
int entry_size = 8;
if (pblk->ppaf_bitsize < 32)
entry_size = 4;
pblk->trans_map = vmalloc(entry_size * pblk->rl.nr_secs);
if (!pblk->trans_map)
return -ENOMEM;
pblk_ppa_set_empty(&ppa);
for (i = 0; i < pblk->rl.nr_secs; i++)
pblk_trans_map_set(pblk, i, ppa);
return 0;
}
/*
 * Tear down the write buffer: report a failed tear-down check, then free the
 * buffer data and the entry array.
 */
static void pblk_rwb_free(struct pblk *pblk)
{
	struct pblk_rb *rwb = &pblk->rwb;

	if (pblk_rb_tear_down_check(rwb))
		pr_err("pblk: write buffer error on tear down\n");

	pblk_rb_data_free(rwb);
	vfree(pblk_rb_entries_ref(rwb));
}
/*
 * Allocate the write buffer entry array and initialise the ring buffer.
 *
 * The entry count comes from pblk_rb_calculate_size(); both the entry count
 * and the sector size are passed to pblk_rb_init() as powers of two.
 * Returns -ENOMEM if the entry array cannot be allocated, otherwise the
 * result of pblk_rb_init().
 *
 * NOTE(review): if pblk_rb_init() fails, `entries` is not freed here -
 * confirm that pblk_rb_init() or the caller releases it.
 */
static int pblk_rwb_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_rb_entry *entries;
	unsigned long nr_entries = pblk_rb_calculate_size(pblk->pgs_in_buffer);
	unsigned int entries_order, sec_order;

	entries = vzalloc(nr_entries * sizeof(struct pblk_rb_entry));
	if (!entries)
		return -ENOMEM;

	entries_order = get_count_order(nr_entries);
	sec_order = get_count_order(geo->sec_size);

	return pblk_rb_init(&pblk->rwb, entries, entries_order, sec_order);
}
/* Minimum pages needed within a lun */
#define PAGE_POOL_SIZE 16
#define ADDR_POOL_SIZE 64
/*
 * Build pblk's own physical address format from the device geometry.
 *
 * Channel and LUN field widths are recomputed from the number of channels and
 * LUNs per channel given to this target (both must be powers of two). Fields
 * are packed from bit 0 upwards in the order
 * sector, plane, channel, LUN, page, block.
 *
 * Returns 0, or -EINVAL for a non-power-of-two channel or LUN count.
 */
static int pblk_set_ppaf(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct nvm_addr_format ppaf = geo->ppaf;
	int ch_bits, lun_bits;

	/* Re-calculate channel and lun format to adapt to configuration */
	ch_bits = get_count_order(geo->nr_chnls);
	if (1 << ch_bits != geo->nr_chnls) {
		pr_err("pblk: supports only power-of-two channel config.\n");
		return -EINVAL;
	}
	ppaf.ch_len = ch_bits;

	lun_bits = get_count_order(geo->luns_per_chnl);
	if (1 << lun_bits != geo->luns_per_chnl) {
		pr_err("pblk: supports only power-of-two LUN config.\n");
		return -EINVAL;
	}
	ppaf.lun_len = lun_bits;

	/* Each field starts where the previous one ends */
	pblk->ppaf.sec_offset = 0;
	pblk->ppaf.sec_mask = (1ULL << ppaf.sect_len) - 1;

	pblk->ppaf.pln_offset = ppaf.sect_len;
	pblk->ppaf.pln_mask = ((1ULL << ppaf.pln_len) - 1) <<
							pblk->ppaf.pln_offset;

	pblk->ppaf.ch_offset = pblk->ppaf.pln_offset + ppaf.pln_len;
	pblk->ppaf.ch_mask = ((1ULL << ppaf.ch_len) - 1) <<
							pblk->ppaf.ch_offset;

	pblk->ppaf.lun_offset = pblk->ppaf.ch_offset + ppaf.ch_len;
	pblk->ppaf.lun_mask = ((1ULL << ppaf.lun_len) - 1) <<
							pblk->ppaf.lun_offset;

	pblk->ppaf.pg_offset = pblk->ppaf.lun_offset + ppaf.lun_len;
	pblk->ppaf.pg_mask = ((1ULL << ppaf.pg_len) - 1) <<
							pblk->ppaf.pg_offset;

	pblk->ppaf.blk_offset = pblk->ppaf.pg_offset + ppaf.pg_len;
	pblk->ppaf.blk_mask = ((1ULL << ppaf.blk_len) - 1) <<
							pblk->ppaf.blk_offset;

	/* Total width of the address format in bits */
	pblk->ppaf_bitsize = pblk->ppaf.blk_offset + ppaf.blk_len;

	return 0;
}
/*
 * pblk_init_global_caches - create the slab caches used by pblk
 * @pblk: target instance (its disk name and sector bitmap length size the
 *        line metadata cache)
 *
 * The caches are stored in file-scope pointers and created under pblk_lock.
 * On failure every cache created so far is destroyed and its pointer reset
 * to NULL, so a later kmem_cache_destroy() on it is a harmless no-op.
 *
 * Returns 0 or -ENOMEM.
 */
static int pblk_init_global_caches(struct pblk *pblk)
{
	char cache_name[PBLK_CACHE_NAME_LEN];

	down_write(&pblk_lock);

	pblk_blk_ws_cache = kmem_cache_create("pblk_blk_ws",
				sizeof(struct pblk_line_ws), 0, 0, NULL);
	if (!pblk_blk_ws_cache)
		goto fail_unlock;

	pblk_rec_cache = kmem_cache_create("pblk_rec",
				sizeof(struct pblk_rec_ctx), 0, 0, NULL);
	if (!pblk_rec_cache)
		goto fail_free_blk_ws_cache;

	pblk_r_rq_cache = kmem_cache_create("pblk_r_rq", pblk_r_rq_size,
				0, 0, NULL);
	if (!pblk_r_rq_cache)
		goto fail_free_rec_cache;

	pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size,
				0, 0, NULL);
	if (!pblk_w_rq_cache)
		goto fail_free_r_rq_cache;

	snprintf(cache_name, sizeof(cache_name), "pblk_line_m_%s",
							pblk->disk->disk_name);
	pblk_line_meta_cache = kmem_cache_create(cache_name,
				pblk->lm.sec_bitmap_len, 0, 0, NULL);
	if (!pblk_line_meta_cache)
		goto fail_free_w_rq_cache;

	up_write(&pblk_lock);
	return 0;

fail_free_w_rq_cache:
	kmem_cache_destroy(pblk_w_rq_cache);
	pblk_w_rq_cache = NULL;
fail_free_r_rq_cache:
	kmem_cache_destroy(pblk_r_rq_cache);
	pblk_r_rq_cache = NULL;
fail_free_rec_cache:
	kmem_cache_destroy(pblk_rec_cache);
	pblk_rec_cache = NULL;
fail_free_blk_ws_cache:
	kmem_cache_destroy(pblk_blk_ws_cache);
	pblk_blk_ws_cache = NULL;
fail_unlock:
	up_write(&pblk_lock);
	return -ENOMEM;
}
static int pblk_core_init(struct pblk *pblk)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
int max_write_ppas;
int mod;
pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
max_write_ppas = pblk->min_write_pgs * geo->nr_luns;
pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ?
max_write_ppas : nvm_max_phys_sects(dev);
pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
geo->nr_planes * geo->nr_luns;
if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
pr_err("pblk: cannot support device max_phys_sect\n");
return -EINVAL;
}
div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod);
if (mod) {
pr_err("pblk: bad configuration of sectors/pages\n");
return -EINVAL;
}
if (pblk_init_global_caches(pblk))
return -ENOMEM;
pblk->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0);
if (!pblk->page_pool)
return -ENOMEM;
pblk->line_ws_pool = mempool_create_slab_pool(geo->nr_luns,
pblk_blk_ws_cache);
if (!pblk->line_ws_pool)
goto free_page_pool;
pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache);
if (!pblk->rec_pool)
goto free_blk_ws_pool;
pblk->r_rq_pool = mempool_create_slab_pool(64, pblk_r_rq_cache);
if (!pblk->r_rq_pool)
goto free_rec_pool;
pblk->w_rq_pool = mempool_create_slab_pool(64, pblk_w_rq_cache);
if (!pblk->w_rq_pool)
goto free_r_rq_pool;
pblk->line_meta_pool =
mempool_create_slab_pool(16, pblk_line_meta_cache);
if (!pblk->line_meta_pool)
goto free_w_rq_pool;
pblk->kw_wq = alloc_workqueue("pblk-aux-wq",
WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
if (!pblk->kw_wq)
goto free_line_meta_pool;
if (pblk_set_ppaf(pblk))
goto free_kw_wq;
if (pblk_rwb_init(pblk))
goto free_kw_wq;
INIT_LIST_HEAD(&pblk->compl_list);
return 0;
free_kw_wq:
destroy_workqueue(pblk->kw_wq);
free_line_meta_pool:
mempool_destroy(pblk->line_meta_pool);
free_w_rq_pool:
mempool_destroy(pblk->w_rq_pool);
free_r_rq_pool:
mempool_destroy(pblk->r_rq_pool);
free_rec_pool:
mempool_destroy(pblk->rec_pool);
free_blk_ws_pool:
mempool_destroy(pblk->line_ws_pool);
free_page_pool:
mempool_destroy(pblk->page_pool);
return -ENOMEM;
}
/*
 * Release everything set up by pblk_core_init(): the aux workqueue (if any),
 * the mempools, and finally the slab caches.
 */
static void pblk_core_free(struct pblk *pblk)
{
	struct workqueue_struct *aux_wq = pblk->kw_wq;

	if (aux_wq)
		destroy_workqueue(aux_wq);

	/* Pools go first ... */
	mempool_destroy(pblk->page_pool);
	mempool_destroy(pblk->line_ws_pool);
	mempool_destroy(pblk->rec_pool);
	mempool_destroy(pblk->r_rq_pool);
	mempool_destroy(pblk->w_rq_pool);
	mempool_destroy(pblk->line_meta_pool);

	/* ... then the caches */
	kmem_cache_destroy(pblk_blk_ws_cache);
	kmem_cache_destroy(pblk_rec_cache);
	kmem_cache_destroy(pblk_r_rq_cache);
	kmem_cache_destroy(pblk_w_rq_cache);
	kmem_cache_destroy(pblk_line_meta_cache);
}
/* Free the LUN array allocated by pblk_luns_init(). */
static void pblk_luns_free(struct pblk *pblk)
{
	struct pblk_lun *luns = pblk->luns;

	kfree(luns);
}
/*
 * Release every line and its block/erase bitmaps. The whole walk is done with
 * l_mg->free_lock held.
 */
static void pblk_lines_free(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	int idx;

	spin_lock(&l_mg->free_lock);
	for (idx = 0; idx < l_mg->nr_lines; idx++) {
		struct pblk_line *line = &pblk->lines[idx];

		pblk_line_free(pblk, line);
		kfree(line->blk_bitmap);
		kfree(line->erase_bitmap);
	}
	spin_unlock(&l_mg->free_lock);
}
/*
 * Free the line management metadata: bad block templates, the smeta/emeta
 * buffers of each of the PBLK_DATA_LINES slots, and the line array itself.
 */
static void pblk_line_meta_free(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	int slot;

	kfree(l_mg->bb_template);
	kfree(l_mg->bb_aux);

	for (slot = 0; slot < PBLK_DATA_LINES; slot++) {
		void *smeta = l_mg->sline_meta[slot].meta;
		void *emeta = l_mg->eline_meta[slot].meta;

		pblk_mfree(smeta, l_mg->smeta_alloc_type);
		pblk_mfree(emeta, l_mg->emeta_alloc_type);
	}

	kfree(pblk->lines);
}
/*
 * pblk_bb_discovery - read the bad block table of one LUN
 * @dev:  target device
 * @rlun: LUN to query; on success rlun->bb_list owns the folded table
 *
 * Returns 0 on success or a negative error. On failure the temporary table is
 * freed and rlun->bb_list is left untouched, so no freed pointer is ever
 * published.
 */
static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun)
{
	struct nvm_geo *geo = &dev->geo;
	struct ppa_addr ppa;
	u8 *blks;
	int nr_blks, ret;

	nr_blks = geo->blks_per_lun * geo->plane_mode;
	blks = kmalloc(nr_blks, GFP_KERNEL);
	if (!blks)
		return -ENOMEM;

	/* Address the LUN by channel/lun only; all other fields are zero */
	ppa.ppa = 0;
	ppa.g.ch = rlun->bppa.g.ch;
	ppa.g.lun = rlun->bppa.g.lun;

	ret = nvm_get_tgt_bb_tbl(dev, ppa, blks);
	if (ret)
		goto fail_free_blks;

	nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks);
	if (nr_blks < 0) {
		ret = nr_blks;
		goto fail_free_blks;
	}

	rlun->bb_list = blks;
	return 0;

fail_free_blks:
	kfree(blks);
	return ret;
}
/*
 * Allocate the block and erase bitmaps of @line and mark in blk_bitmap every
 * block whose bad block list entry is not NVM_BLK_T_FREE.
 *
 * Returns the number of blocks marked, or -ENOMEM if a bitmap cannot be
 * allocated.
 */
static int pblk_bb_line(struct pblk *pblk, struct pblk_line *line)
{
	struct pblk_line_meta *lm = &pblk->lm;
	int nr_bad = 0;
	int blk;

	line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
	if (!line->blk_bitmap)
		return -ENOMEM;

	line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
	if (!line->erase_bitmap) {
		kfree(line->blk_bitmap);
		return -ENOMEM;
	}

	/* Block i of the line is looked up in LUN i, at index line->id */
	for (blk = 0; blk < lm->blk_per_line; blk++) {
		struct pblk_lun *rlun = &pblk->luns[blk];

		if (rlun->bb_list[line->id] != NVM_BLK_T_FREE) {
			set_bit(blk, line->blk_bitmap);
			nr_bad++;
		}
	}

	return nr_bad;
}
static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_lun *rlun;
int i, ret;
/* TODO: Implement unbalanced LUN support */
if (geo->luns_per_chnl < 0) {
pr_err("pblk: unbalanced LUN config.\n");
return -EINVAL;
}
pblk->luns = kcalloc(geo->nr_luns, sizeof(struct pblk_lun), GFP_KERNEL);
if (!pblk->luns)
return -ENOMEM;
for (i = 0; i < geo->nr_luns; i++) {
/* Stripe across channels */
int ch = i % geo->nr_chnls;
int lun_raw = i / geo->nr_chnls;
int lunid = lun_raw + ch * geo->luns_per_chnl;
rlun = &pblk->luns[i];
rlun->bppa = luns[lunid];
sema_init(&rlun->wr_sem, 1);
ret = pblk_bb_discovery(dev, rlun);
if (ret) {
while (--i >= 0)
kfree(pblk->luns[i].bb_list);
return ret;
}
}
return 0;
}
/*
 * Pick the first data line for the instance.
 *
 * Unless NVM_TARGET_FACTORY is set in @flags, the L2P table is recovered
 * first; if recovery yields a line nothing more is done. Otherwise a fresh
 * first data line is requested.
 *
 * Returns 0 on success, -EFAULT if recovery fails or no line is available.
 */
static int pblk_lines_configure(struct pblk *pblk, int flags)
{
	struct pblk_line *line;

	if (!(flags & NVM_TARGET_FACTORY)) {
		line = pblk_recov_l2p(pblk);
		if (IS_ERR(line)) {
			pr_err("pblk: could not recover l2p table\n");
			return -EFAULT;
		}

		/* Recovery already provided the line to continue on */
		if (line)
			return 0;
	}

	/* Configure next line for user data */
	line = pblk_line_get_first_data(pblk);
	if (!line) {
		pr_err("pblk: line list corrupted\n");
		return -EFAULT;
	}

	return 0;
}
/* See comment over struct line_emeta definition */
static unsigned int calc_emeta_len(struct pblk *pblk, struct pblk_line_meta *lm)
{
return (sizeof(struct line_emeta) +
((lm->sec_per_line - lm->emeta_sec) * sizeof(u64)) +
(pblk->l_mg.nr_lines * sizeof(u32)) +
lm->blk_bitmap_len);
}
/*
 * Derive capacity and rate-limiter totals from the number of free blocks.
 *
 * Over-provisioning is fixed at 20%: the exposed capacity covers the remaining
 * 80% of @nr_free_blks, while the rate limiter accounts for all of them.
 */
static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	sector_t user_blks = nr_free_blks;

	pblk->over_pct = 20;

	user_blks *= (100 - pblk->over_pct);
	sector_div(user_blks, 100);

	/* Internally pblk manages all free blocks, but all calculations based
	 * on user capacity consider only provisioned blocks
	 */
	pblk->rl.total_blocks = nr_free_blks;
	pblk->rl.nr_secs = nr_free_blks * geo->sec_per_blk;
	pblk->capacity = user_blks * geo->sec_per_blk;
	atomic_set(&pblk->rl.free_blocks, nr_free_blks);
}
static int pblk_lines_init(struct pblk *pblk)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_line *line;
unsigned int smeta_len, emeta_len;
long nr_bad_blks, nr_meta_blks, nr_free_blks;
int bb_distance;
int i;
int ret = 0;
lm->sec_per_line = geo->sec_per_blk * geo->nr_luns;
lm->blk_per_line = geo->nr_luns;
lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
lm->high_thrs = lm->sec_per_line / 2;
lm->mid_thrs = lm->sec_per_line / 4;
/* Calculate necessary pages for smeta. See comment over struct
* line_smeta definition
*/
lm->smeta_len = sizeof(struct line_smeta) +
PBLK_LINE_NR_LUN_BITMAP * lm->lun_bitmap_len;
i = 1;
add_smeta_page:
lm->smeta_sec = i * geo->sec_per_pl;
lm->smeta_len = lm->smeta_sec * geo->sec_size;
smeta_len = sizeof(struct line_smeta) +
PBLK_LINE_NR_LUN_BITMAP * lm->lun_bitmap_len;
if (smeta_len > lm->smeta_len) {
i++;
goto add_smeta_page;
}
/* Calculate necessary pages for emeta. See comment over struct
* line_emeta definition
*/
i = 1;
add_emeta_page:
lm->emeta_sec = i * geo->sec_per_pl;
lm->emeta_len = lm->emeta_sec * geo->sec_size;
emeta_len = calc_emeta_len(pblk, lm);
if (emeta_len > lm->emeta_len) {
i++;
goto add_emeta_page;
}
lm->emeta_bb = geo->nr_luns - i;
nr_meta_blks = (lm->smeta_sec + lm->emeta_sec +
(geo->sec_per_blk / 2)) / geo->sec_per_blk;
lm->min_blk_line = nr_meta_blks + 1;
l_mg->nr_lines = geo->blks_per_lun;
l_mg->log_line = l_mg->data_line = NULL;
l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
l_mg->nr_free_lines = 0;
bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);
/* smeta is always small enough to fit on a kmalloc memory allocation,
* emeta depends on the number of LUNs allocated to the pblk instance
*/
l_mg->smeta_alloc_type = PBLK_KMALLOC_META;
for (i = 0; i < PBLK_DATA_LINES; i++) {
l_mg->sline_meta[i].meta = kmalloc(lm->smeta_len, GFP_KERNEL);
if (!l_mg->sline_meta[i].meta)
while (--i >= 0) {
kfree(l_mg->sline_meta[i].meta);
ret = -ENOMEM;
goto fail;
}
}
if (lm->emeta_len > KMALLOC_MAX_CACHE_SIZE) {
l_mg->emeta_alloc_type = PBLK_VMALLOC_META;
for (i = 0; i < PBLK_DATA_LINES; i++) {
l_mg->eline_meta[i].meta = vmalloc(lm->emeta_len);
if (!l_mg->eline_meta[i].meta)
while (--i >= 0) {
vfree(l_mg->eline_meta[i].meta);
ret = -ENOMEM;
goto fail;
}
}
} else {
l_mg->emeta_alloc_type = PBLK_KMALLOC_META;
for (i = 0; i < PBLK_DATA_LINES; i++) {
l_mg->eline_meta[i].meta =
kmalloc(lm->emeta_len, GFP_KERNEL);
if (!l_mg->eline_meta[i].meta)
while (--i >= 0) {
kfree(l_mg->eline_meta[i].meta);
ret = -ENOMEM;
goto fail;
}
}
}
l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
if (!l_mg->bb_template)
goto fail_free_meta;
l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
if (!l_mg->bb_aux)
goto fail_free_bb_template;
bb_distance = (geo->nr_luns) * geo->sec_per_pl;
for (i = 0; i < lm->sec_per_line; i += bb_distance)
bitmap_set(l_mg->bb_template, i, geo->sec_per_pl);
INIT_LIST_HEAD(&l_mg->free_list);
INIT_LIST_HEAD(&l_mg->corrupt_list);
INIT_LIST_HEAD(&l_mg->bad_list);
INIT_LIST_HEAD(&l_mg->gc_full_list);
INIT_LIST_HEAD(&l_mg->gc_high_list);
INIT_LIST_HEAD(&l_mg->gc_mid_list);
INIT_LIST_HEAD(&l_mg->gc_low_list);
INIT_LIST_HEAD(&l_mg->gc_empty_list);
l_mg->gc_lists[0] = &l_mg->gc_high_list;
l_mg->gc_lists[1] = &l_mg->gc_mid_list;
l_mg->gc_lists[2] = &l_mg->gc_low_list;
spin_lock_init(&l_mg->free_lock);
spin_lock_init(&l_mg->gc_lock);
pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line),
GFP_KERNEL);
if (!pblk->lines)
goto fail_free_bb_aux;
nr_free_blks = 0;
for (i = 0; i < l_mg->nr_lines; i++) {
line = &pblk->lines[i];
line->pblk = pblk;
line->id = i;
line->type = PBLK_LINETYPE_FREE;
line->state = PBLK_LINESTATE_FREE;
line->gc_group = PBLK_LINEGC_NONE;
spin_lock_init(&line->lock);
nr_bad_blks = pblk_bb_line(pblk, line);
if (nr_bad_blks < 0 || nr_bad_blks > lm->blk_per_line)
goto fail_free_lines;
line->blk_in_line = lm->blk_per_line - nr_bad_blks;
if (line->blk_in_line < lm->min_blk_line) {
line->state = PBLK_LINESTATE_BAD;
list_add_tail(&line->list, &l_mg->bad_list);
continue;
}
nr_free_blks += line->blk_in_line;
l_mg->nr_free_lines++;
list_add_tail(&line->list, &l_mg->free_list);
}
pblk_set_provision(pblk, nr_free_blks);
sema_init(&pblk->erase_sem, 1);
/* Cleanup per-LUN bad block lists - managed within lines on run-time */
for (i = 0; i < geo->nr_luns; i++)
kfree(pblk->luns[i].bb_list);
return 0;
fail_free_lines:
kfree(pblk->lines);
fail_free_bb_aux:
kfree(l_mg->bb_aux);
fail_free_bb_template:
kfree(l_mg->bb_template);
fail_free_meta:
for (i = 0; i < PBLK_DATA_LINES; i++) {
pblk_mfree(l_mg->sline_meta[i].meta, l_mg->smeta_alloc_type);
pblk_mfree(l_mg->eline_meta[i].meta, l_mg->emeta_alloc_type);
}
fail:
for (i = 0; i < geo->nr_luns; i++)
kfree(pblk->luns[i].bb_list);
return ret;
}
static int pblk_writer_init(struct pblk *pblk)
{
setup_timer(&pblk->wtimer, pblk_write_timer_fn, (unsigned long)pblk);
mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
if (IS_ERR(pblk->writer_ts)) {
pr_err("pblk: could not allocate writer kthread\n");
return 1;
}
return 0;
}
static void pblk_writer_stop(struct pblk *pblk)
{
if (pblk->writer_ts)
kthread_stop(pblk->writer_ts);
del_timer(&pblk->wtimer);
}
/*
 * Release everything owned by a pblk instance: LUNs, lines, line metadata,
 * core resources and the L2P table, then the instance structure itself.
 * @pblk must not be used after this call.
 */
static void pblk_free(struct pblk *pblk)
{
pblk_luns_free(pblk);
pblk_lines_free(pblk);
pblk_line_meta_free(pblk);
pblk_core_free(pblk);
pblk_l2p_free(pblk);
/* Last: the structure that held all of the above */
kfree(pblk);
}
/*
 * Quiesce the write path before the instance is freed: flush the writer,
 * stop the write thread and timer, sync the L2P table for entries still in
 * the write buffer, run pblk_recov_pad(), and finally free the write buffer
 * and the rate limiter.
 */
static void pblk_tear_down(struct pblk *pblk)
{
pblk_flush_writer(pblk);
pblk_writer_stop(pblk);
/* Point remaining buffered entries in the L2P table at the device */
pblk_rb_sync_l2p(&pblk->rwb);
pblk_recov_pad(pblk);
pblk_rwb_free(pblk);
pblk_rl_free(&pblk->rl);
pr_debug("pblk: consistent tear down\n");
}
/*
 * Target exit callback (tt_pblk.exit). @private is the struct pblk returned
 * by pblk_init(). GC is shut down, the write path is torn down and the
 * instance is freed, all while holding pblk_lock for writing.
 */
static void pblk_exit(void *private)
{
	struct pblk *tgt = private;

	down_write(&pblk_lock);

	pblk_gc_exit(tgt);
	pblk_tear_down(tgt);
	pblk_free(tgt);

	up_write(&pblk_lock);
}
/*
 * Target capacity callback (tt_pblk.capacity): the instance capacity scaled
 * by NR_PHY_IN_LOG.
 */
static sector_t pblk_capacity(void *private)
{
	struct pblk *tgt = private;

	return tgt->capacity * NR_PHY_IN_LOG;
}
/*
 * Target init callback (tt_pblk.init): allocate and bring up a pblk instance
 * on @dev, exposed through @tdisk.
 *
 * Devices that advertise a device-side L2P table (NVM_RSP_L2P) are rejected
 * with -EINVAL. With NVM_TARGET_FACTORY set in @flags a fresh UUID is set up;
 * the same flags are handed to pblk_lines_configure().
 *
 * Returns the new instance on success, or ERR_PTR() of the failing step's
 * return code after unwinding the steps completed so far.
 * NOTE(review): ERR_PTR() only encodes negative errno values, so every init
 * helper called here must return a negative errno on failure.
 */
static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
int flags)
{
struct nvm_geo *geo = &dev->geo;
struct request_queue *bqueue = dev->q;
struct request_queue *tqueue = tdisk->queue;
struct pblk *pblk;
int ret;
if (dev->identity.dom & NVM_RSP_L2P) {
pr_err("pblk: device-side L2P table not supported. (%x)\n",
dev->identity.dom);
return ERR_PTR(-EINVAL);
}
pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
if (!pblk)
return ERR_PTR(-ENOMEM);
pblk->dev = dev;
pblk->disk = tdisk;
spin_lock_init(&pblk->trans_lock);
spin_lock_init(&pblk->lock);
if (flags & NVM_TARGET_FACTORY)
pblk_setup_uuid(pblk);
/* Debug-only statistics counters.
 * NOTE(review): recov_writes is reset twice below; the second store is
 * redundant.
 */
#ifdef CONFIG_NVM_DEBUG
atomic_long_set(&pblk->inflight_writes, 0);
atomic_long_set(&pblk->padded_writes, 0);
atomic_long_set(&pblk->padded_wb, 0);
atomic_long_set(&pblk->nr_flush, 0);
atomic_long_set(&pblk->req_writes, 0);
atomic_long_set(&pblk->sub_writes, 0);
atomic_long_set(&pblk->sync_writes, 0);
atomic_long_set(&pblk->compl_writes, 0);
atomic_long_set(&pblk->inflight_reads, 0);
atomic_long_set(&pblk->sync_reads, 0);
atomic_long_set(&pblk->recov_writes, 0);
atomic_long_set(&pblk->recov_writes, 0);
atomic_long_set(&pblk->recov_gc_writes, 0);
#endif
/* Error counters that exist in every configuration */
atomic_long_set(&pblk->read_failed, 0);
atomic_long_set(&pblk->read_empty, 0);
atomic_long_set(&pblk->read_high_ecc, 0);
atomic_long_set(&pblk->read_failed_gc, 0);
atomic_long_set(&pblk->write_failed, 0);
atomic_long_set(&pblk->erase_failed, 0);
/* Bring-up sequence; each failure jumps to the label that undoes the
 * steps completed before it.
 */
ret = pblk_luns_init(pblk, dev->luns);
if (ret) {
pr_err("pblk: could not initialize luns\n");
goto fail;
}
ret = pblk_lines_init(pblk);
if (ret) {
pr_err("pblk: could not initialize lines\n");
goto fail_free_luns;
}
ret = pblk_core_init(pblk);
if (ret) {
pr_err("pblk: could not initialize core\n");
goto fail_free_line_meta;
}
ret = pblk_l2p_init(pblk);
if (ret) {
pr_err("pblk: could not initialize maps\n");
goto fail_free_core;
}
ret = pblk_lines_configure(pblk, flags);
if (ret) {
pr_err("pblk: could not configure lines\n");
goto fail_free_l2p;
}
ret = pblk_writer_init(pblk);
if (ret) {
pr_err("pblk: could not initialize write thread\n");
goto fail_free_lines;
}
ret = pblk_gc_init(pblk);
if (ret) {
pr_err("pblk: could not initialize gc\n");
goto fail_stop_writer;
}
/* inherit the size from the underlying device */
blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));
blk_queue_write_cache(tqueue, true, false);
/* Discard is advertised at a granularity of pgs_per_blk * pfpg_size */
tqueue->limits.discard_granularity = geo->pgs_per_blk * geo->pfpg_size;
tqueue->limits.discard_alignment = 0;
blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue);
pr_info("pblk init: luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
geo->nr_luns, pblk->l_mg.nr_lines,
(unsigned long long)pblk->rl.nr_secs,
pblk->rwb.nr_entries);
/* The writer thread was only created so far; start it now */
wake_up_process(pblk->writer_ts);
return pblk;
/* Unwind labels fall through: entering at a label runs it and every label
 * below it.
 */
fail_stop_writer:
pblk_writer_stop(pblk);
fail_free_lines:
pblk_lines_free(pblk);
fail_free_l2p:
pblk_l2p_free(pblk);
fail_free_core:
pblk_core_free(pblk);
fail_free_line_meta:
pblk_line_meta_free(pblk);
fail_free_luns:
pblk_luns_free(pblk);
fail:
kfree(pblk);
return ERR_PTR(ret);
}
/*
 * physical block device target
 *
 * Target-type descriptor for pblk (name "pblk", version 1.0.0). It wires the
 * request, capacity, init/exit and sysfs callbacks, and is registered and
 * unregistered by pblk_module_init() / pblk_module_exit().
 */
static struct nvm_tgt_type tt_pblk = {
.name = "pblk",
.version = {1, 0, 0},
.make_rq = pblk_make_rq,
.capacity = pblk_capacity,
.init = pblk_init,
.exit = pblk_exit,
.sysfs_init = pblk_sysfs_init,
.sysfs_exit = pblk_sysfs_exit,
};
/* Module entry point: register the pblk target type; returns the result of
 * nvm_register_tgt_type().
 */
static int __init pblk_module_init(void)
{
return nvm_register_tgt_type(&tt_pblk);
}
/* Module exit point: unregister the pblk target type. */
static void pblk_module_exit(void)
{
nvm_unregister_tgt_type(&tt_pblk);
}
/* Module entry/exit hooks and module metadata */
module_init(pblk_module_init);
module_exit(pblk_module_exit);
MODULE_AUTHOR("Javier Gonzalez <javier@cnexlabs.com>");
MODULE_AUTHOR("Matias Bjorling <matias@cnexlabs.com>");
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs");

View File

@ -0,0 +1,136 @@
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-map.c - pblk's lba-ppa mapping strategy
*
*/
#include "pblk.h"
/*
 * Map one write unit (pblk->min_write_pgs sectors) of the current data line.
 *
 * @sentry:     write buffer position of the first sector of this unit
 * @ppa_list:   filled with the device address of every sector in the unit
 * @lun_bitmap: handed to pblk_down_rq() together with @ppa_list
 * @meta_list:  per-sector metadata; receives the lba of each sector
 * @valid_secs: number of leading sectors backed by write buffer entries; the
 *              remaining sectors of the unit are recorded as ADDR_EMPTY
 *
 * If the line becomes full it is replaced; when no replacement line is
 * available the function returns without calling pblk_down_rq().
 */
static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
			       struct ppa_addr *ppa_list,
			       unsigned long *lun_bitmap,
			       struct pblk_sec_meta *meta_list,
			       unsigned int valid_secs)
{
	struct pblk_line *line = pblk_line_get_data(pblk);
	struct line_emeta *emeta = line->emeta;
	__le64 *lba_list = pblk_line_emeta_to_lbas(emeta);
	int nr_secs = pblk->min_write_pgs;
	u64 paddr = pblk_alloc_page(pblk, line, nr_secs);
	struct pblk_w_ctx *w_ctx;
	int i;

	for (i = 0; i < nr_secs; i++, paddr++) {
		/* ppa to be sent to the device */
		ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);

		/* Sectors past the valid ones are padding */
		if (i >= valid_secs) {
			meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
			lba_list[paddr] = cpu_to_le64(ADDR_EMPTY);
			pblk_map_pad_invalidate(pblk, line, paddr);
			continue;
		}

		/* Write context for target bio completion on write buffer. Note
		 * that the write buffer is protected by the sync backpointer,
		 * and a single writer thread have access to each specific entry
		 * at a time. Thus, it is safe to modify the context for the
		 * entry we are setting up for submission without taking any
		 * lock or memory barrier.
		 */
		kref_get(&line->ref);
		w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i);
		w_ctx->ppa = ppa_list[i];
		meta_list[i].lba = cpu_to_le64(w_ctx->lba);
		lba_list[paddr] = cpu_to_le64(w_ctx->lba);
		le64_add_cpu(&line->emeta->nr_valid_lbas, 1);
	}

	if (pblk_line_is_full(line)) {
		line = pblk_line_replace_data(pblk);
		if (!line)
			return;
	}

	pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap);
}
/*
 * Map the sectors of @rqd, one write unit (pblk->min_write_pgs sectors) at a
 * time, starting at sector offset @off. The unit that crosses @valid_secs is
 * mapped with only (valid_secs % min_write_pgs) valid sectors.
 */
void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
		 unsigned long *lun_bitmap, unsigned int valid_secs,
		 unsigned int off)
{
	struct pblk_sec_meta *meta_list = rqd->meta_list;
	int min = pblk->min_write_pgs;
	unsigned int map_secs;
	int i;

	for (i = off; i < rqd->nr_ppas; i += min) {
		if (i + min > valid_secs)
			map_secs = valid_secs % min;
		else
			map_secs = min;

		pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
					lun_bitmap, &meta_list[i], map_secs);
	}
}
/*
 * Map @rqd like pblk_map_rq() and, along the way, pick one block of the next
 * data line to erase.
 *
 * After each write unit is mapped, the LUN it landed on is looked up in the
 * next line's erase bitmap. For the first LUN that is not yet marked and for
 * which the erase semaphore can be taken, the bit is set, @erase_ppa is
 * filled in and the rest of the request is mapped with pblk_map_rq().
 *
 * If the loop finishes and @erase_ppa is still empty, the first unmarked
 * block of the next line (if any) is chosen instead.
 *
 * only if erase_ppa is set, acquire erase semaphore
 */
void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
		       unsigned int sentry, unsigned long *lun_bitmap,
		       unsigned int valid_secs, struct ppa_addr *erase_ppa)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line *next_line = pblk_line_get_data_next(pblk);
	struct pblk_sec_meta *meta_list = rqd->meta_list;
	unsigned int map_secs;
	int min = pblk->min_write_pgs;
	int i, erase_lun;

	for (i = 0; i < rqd->nr_ppas; i += min) {
		if (i + min > valid_secs)
			map_secs = valid_secs % min;
		else
			map_secs = min;

		pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
					lun_bitmap, &meta_list[i], map_secs);

		erase_lun = rqd->ppa_list[i].g.lun * geo->nr_chnls +
							rqd->ppa_list[i].g.ch;

		if (test_bit(erase_lun, next_line->erase_bitmap))
			continue;

		if (down_trylock(&pblk->erase_sem))
			continue;

		set_bit(erase_lun, next_line->erase_bitmap);
		next_line->left_eblks--;
		*erase_ppa = rqd->ppa_list[i];
		erase_ppa->g.blk = next_line->id;

		/* Avoid evaluating e_line->left_eblks */
		pblk_map_rq(pblk, rqd, sentry, lun_bitmap, valid_secs,
								i + min);
		return;
	}

	/* Erase blocks that are bad in this line but might not be in next */
	if (unlikely(ppa_empty(*erase_ppa))) {
		struct pblk_line_meta *lm = &pblk->lm;

		i = find_first_zero_bit(next_line->erase_bitmap,
							lm->blk_per_line);
		if (i == lm->blk_per_line)
			return;

		set_bit(i, next_line->erase_bitmap);
		next_line->left_eblks--;
		*erase_ppa = pblk->luns[i].bppa; /* set ch and lun */
		erase_ppa->g.blk = next_line->id;
	}
}

View File

@ -0,0 +1,852 @@
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
*
* Based upon the circular ringbuffer.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-rb.c - pblk's write buffer
*/
#include <linux/circ_buf.h>
#include "pblk.h"
/* File-scope rwsem taken for writing by pblk_rb_init() and
 * pblk_rb_data_free() while a buffer's page list is built or torn down.
 */
static DECLARE_RWSEM(pblk_rb_lock);
/*
 * Release every page set on rb->pages together with its bookkeeping
 * structure. Takes pblk_rb_lock for writing, so it must not be called with
 * that lock already held.
 */
void pblk_rb_data_free(struct pblk_rb *rb)
{
	struct pblk_rb_pages *set, *next;

	down_write(&pblk_rb_lock);

	list_for_each_entry_safe(set, next, &rb->pages, list) {
		unsigned long kaddr = (unsigned long)page_address(set->pages);

		free_pages(kaddr, set->order);
		list_del(&set->list);
		kfree(set);
	}

	up_write(&pblk_rb_lock);
}
/*
 * Initialize ring buffer. The data and metadata buffers must be previously
 * allocated and their size must be a power of two
 * (Documentation/circular-buffers.txt)
 *
 * @rb_entry_base: caller-allocated array of (1 << @power_size) entries
 * @power_size:    log2 of the number of entries
 * @power_seg_sz:  log2 of the size in bytes of one entry's data segment
 *
 * Data pages are allocated in sets of at most order (MAX_ORDER - 1); every
 * entry is pointed at its segment within a set and marked writable with an
 * empty bio list.
 *
 * Returns 0 on success. On allocation failure every page set allocated so far
 * is released and -ENOMEM is returned.
 */
int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
		 unsigned int power_size, unsigned int power_seg_sz)
{
	struct pblk *pblk = container_of(rb, struct pblk, rwb);
	unsigned int init_entry = 0;
	unsigned int alloc_order = power_size;
	unsigned int max_order = MAX_ORDER - 1;
	unsigned int order, iter;

	down_write(&pblk_rb_lock);
	rb->entries = rb_entry_base;
	rb->seg_size = (1 << power_seg_sz);
	rb->nr_entries = (1 << power_size);
	rb->mem = rb->subm = rb->sync = rb->l2p_update = 0;
	rb->sync_point = EMPTY_ENTRY;

	spin_lock_init(&rb->w_lock);
	spin_lock_init(&rb->s_lock);

	INIT_LIST_HEAD(&rb->pages);

	/* Split the allocation into several sets when it exceeds max_order */
	if (alloc_order >= max_order) {
		order = max_order;
		iter = (1 << (alloc_order - max_order));
	} else {
		order = alloc_order;
		iter = 1;
	}

	do {
		struct pblk_rb_entry *entry;
		struct pblk_rb_pages *page_set;
		void *kaddr;
		unsigned long set_size;
		int i;

		page_set = kmalloc(sizeof(struct pblk_rb_pages), GFP_KERNEL);
		if (!page_set)
			goto fail_free_pages;

		page_set->order = order;
		page_set->pages = alloc_pages(GFP_KERNEL, order);
		if (!page_set->pages) {
			kfree(page_set);
			goto fail_free_pages;
		}
		kaddr = page_address(page_set->pages);

		/* Every entry of the set, including the first one, gets its
		 * segment, cacheline address, flags and an empty bio list.
		 */
		set_size = (1 << order);
		for (i = 0; i < set_size; i++) {
			entry = &rb->entries[init_entry];
			entry->cacheline = pblk_cacheline_to_addr(init_entry++);
			entry->data = kaddr + (i * rb->seg_size);
			entry->w_ctx.flags = PBLK_WRITABLE_ENTRY;
			bio_list_init(&entry->w_ctx.bios);
		}

		list_add_tail(&page_set->list, &rb->pages);
		iter--;
	} while (iter > 0);

	up_write(&pblk_rb_lock);

#ifdef CONFIG_NVM_DEBUG
	atomic_set(&rb->inflight_sync_point, 0);
#endif

	/*
	 * Initialize rate-limiter, which controls access to the write buffer
	 * by user and GC I/O
	 */
	pblk_rl_init(&pblk->rl, rb->nr_entries);

	return 0;

fail_free_pages:
	/* pblk_rb_data_free() takes pblk_rb_lock itself; calling it with the
	 * lock held would deadlock, so drop the lock first.
	 */
	up_write(&pblk_rb_lock);
	pblk_rb_data_free(rb);
	return -ENOMEM;
}
/*
 * pblk_rb_calculate_size -- calculate the size of the write buffer
 *
 * Rounds @nr_entries up to a power of two via get_count_order(), with a floor
 * of 128 (1 << 7) entries.
 */
unsigned int pblk_rb_calculate_size(unsigned int nr_entries)
{
	/* Alloc a write buffer that can at least fit 128 entries */
	int order = max(get_count_order(nr_entries), 7);

	return 1 << order;
}
/* Hand back the entry array that was passed to pblk_rb_init(). */
void *pblk_rb_entries_ref(struct pblk_rb *rb)
{
	void *entries = rb->entries;

	return entries;
}
/*
 * Return a write context to the writable state. Busy-waits until the entry
 * carries PBLK_SUBMITTED_ENTRY, then publishes PBLK_WRITABLE_ENTRY and clears
 * the ppa.
 */
static void clean_wctx(struct pblk_w_ctx *w_ctx)
{
	int flags;

	do {
		flags = READ_ONCE(w_ctx->flags);
	} while (!(flags & PBLK_SUBMITTED_ENTRY));

	/* Release flags on context. Protect from writes and reads */
	smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY);
	pblk_ppa_set_empty(&w_ctx->ppa);
}
/*
 * Thin wrappers over CIRC_CNT() / CIRC_SPACE() from <linux/circ_buf.h>.
 * The rb argument of pblk_rb_ring_space() is not used by the expansion.
 */
#define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size)
#define pblk_rb_ring_space(rb, head, tail, size) \
(CIRC_SPACE(head, tail, size))
/*
 * Free space in the buffer, measured from the write pointer (mem) to the
 * back pointer signaling entries already synchronized to the media (sync).
 */
static unsigned int pblk_rb_space(struct pblk_rb *rb)
{
	unsigned int head = READ_ONCE(rb->mem);
	unsigned int tail = READ_ONCE(rb->sync);
	unsigned int space;

	space = pblk_rb_ring_space(rb, head, tail, rb->nr_entries);
	return space;
}
/*
 * Number of entries available to send to the media: the distance from the
 * submission pointer (subm) to the write pointer (mem).
 */
unsigned int pblk_rb_read_count(struct pblk_rb *rb)
{
	unsigned int head = READ_ONCE(rb->mem);
	unsigned int tail = READ_ONCE(rb->subm);
	unsigned int pending;

	pending = pblk_rb_ring_count(head, tail, rb->nr_entries);
	return pending;
}
/*
 * Commit a read of @nr_entries entries by advancing the submission pointer,
 * wrapping at the buffer size. Returns the submission pointer as it was
 * before the advance.
 */
unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries)
{
	unsigned int old_subm = READ_ONCE(rb->subm);
	unsigned int new_subm = (old_subm + nr_entries) & (rb->nr_entries - 1);

	/* Commit read means updating submission pointer */
	smp_store_release(&rb->subm, new_subm);

	return old_subm;
}
/*
 * Move @to_update entries, starting at *@l2p_upd, from cache addresses to
 * device addresses in the L2P table. For each entry the map is updated, the
 * reference on the line its ppa belongs to is dropped and the write context
 * is cleaned. *@l2p_upd is advanced (with wrap-around) after every entry.
 *
 * Always returns 0.
 */
static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd,
				unsigned int to_update)
{
	struct pblk *pblk = container_of(rb, struct pblk, rwb);
	unsigned int done;

	for (done = 0; done < to_update; done++) {
		struct pblk_rb_entry *entry = &rb->entries[*l2p_upd];
		struct pblk_w_ctx *w_ctx = &entry->w_ctx;
		struct pblk_line *line;

		pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa,
							entry->cacheline);

		line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)];
		kref_put(&line->ref, pblk_line_put);
		clean_wctx(w_ctx);

		*l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1);
	}

	return 0;
}
/*
 * When we move the l2p_update pointer, we update the l2p table - lookups will
 * point to the physical address instead of to the cacheline in the write buffer
 * from this moment on.
 *
 * Only as many entries are updated as are about to be overwritten by an
 * incoming write of @nr_entries. Must be called with rb->w_lock held.
 * Returns the result of __pblk_rb_update_l2p(), or 0 if nothing had to move.
 */
static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries,
			      unsigned int mem, unsigned int sync)
{
	unsigned int space;

	lockdep_assert_held(&rb->w_lock);

	/* Update l2p only as buffer entries are being overwritten */
	space = pblk_rb_ring_space(rb, mem, rb->l2p_update, rb->nr_entries);
	if (space > nr_entries)
		return 0;

	/* l2p_update used exclusively under rb->w_lock */
	return __pblk_rb_update_l2p(rb, &rb->l2p_update, nr_entries - space);
}
/*
 * Update the l2p entry for all sectors stored on the write buffer. This means
 * that all future lookups to the l2p table will point to a device address, not
 * to the cacheline in the write buffer.
 *
 * Processes every entry between l2p_update and the sync pointer under
 * rb->w_lock.
 */
void pblk_rb_sync_l2p(struct pblk_rb *rb)
{
	unsigned int synced, pending;

	spin_lock(&rb->w_lock);

	/* Protect from reads and writes */
	synced = smp_load_acquire(&rb->sync);

	pending = pblk_rb_ring_count(synced, rb->l2p_update, rb->nr_entries);
	__pblk_rb_update_l2p(rb, &rb->l2p_update, pending);

	spin_unlock(&rb->w_lock);
}
/*
* Write @nr_entries to ring buffer from @data buffer if there is enough space.
* Typically, 4KB data chunks coming from a bio will be copied to the ring
* buffer, thus the write will fail if not all incoming data can be copied.
*
*/
static void __pblk_rb_write_entry(struct pblk_rb *rb, void *data,
struct pblk_w_ctx w_ctx,
struct pblk_rb_entry *entry)
{
memcpy(entry->data, data, rb->seg_size);
entry->w_ctx.lba = w_ctx.lba;
entry->w_ctx.ppa = w_ctx.ppa;
}
/*
 * Store one user sector in the buffer slot @ring_pos: copy the data, point
 * the L2P cache mapping of the lba at the slot's cacheline, then publish the
 * caller's flags with PBLK_WRITTEN_DATA added.
 */
void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
			      struct pblk_w_ctx w_ctx, unsigned int ring_pos)
{
	struct pblk *pblk = container_of(rb, struct pblk, rwb);
	struct pblk_rb_entry *slot = &rb->entries[ring_pos];
	int flags = READ_ONCE(slot->w_ctx.flags);

#ifdef CONFIG_NVM_DEBUG
	/* Caller must guarantee that the entry is free */
	BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
#endif

	__pblk_rb_write_entry(rb, data, w_ctx, slot);
	pblk_update_map_cache(pblk, w_ctx.lba, slot->cacheline);

	flags = w_ctx.flags | PBLK_WRITTEN_DATA;

	/* Release flags on write context. Protect from writes */
	smp_store_release(&slot->w_ctx.flags, flags);
}
/*
 * Store one GC sector in the buffer slot @ring_pos. Same as the user variant,
 * except that the map update goes through pblk_update_map_gc(); when that
 * returns 0 the slot's lba is set to ADDR_EMPTY.
 */
void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
			    struct pblk_w_ctx w_ctx, struct pblk_line *gc_line,
			    unsigned int ring_pos)
{
	struct pblk *pblk = container_of(rb, struct pblk, rwb);
	struct pblk_rb_entry *slot = &rb->entries[ring_pos];
	int flags = READ_ONCE(slot->w_ctx.flags);

#ifdef CONFIG_NVM_DEBUG
	/* Caller must guarantee that the entry is free */
	BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
#endif

	__pblk_rb_write_entry(rb, data, w_ctx, slot);

	if (!pblk_update_map_gc(pblk, w_ctx.lba, slot->cacheline, gc_line))
		slot->w_ctx.lba = ADDR_EMPTY;

	flags = w_ctx.flags | PBLK_WRITTEN_DATA;

	/* Release flags on write context. Protect from writes */
	smp_store_release(&slot->w_ctx.flags, flags);
}
/*
 * Attach a flush @bio to the entry just before @pos, making that entry the
 * buffer's sync point.
 *
 * Returns 0 without touching anything when @pos equals the submission
 * pointer. Otherwise the entry before @pos (wrapping to the last entry when
 * @pos is 0) is flagged PBLK_FLUSH_ENTRY, recorded in rb->sync_point, @bio is
 * queued on the entry's bio list under rb->s_lock, and 1 is returned.
 */
static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio,
				  unsigned int pos)
{
	struct pblk_rb_entry *entry;
	unsigned int subm, sync_point;
	int flags;

	subm = READ_ONCE(rb->subm);

	if (pos == subm)
		return 0;

#ifdef CONFIG_NVM_DEBUG
	/* Count only sync points that are actually set, so that the matching
	 * decrement in pblk_rb_read_to_bio() balances it.
	 */
	atomic_inc(&rb->inflight_sync_point);
#endif

	sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
	entry = &rb->entries[sync_point];

	flags = READ_ONCE(entry->w_ctx.flags);
	flags |= PBLK_FLUSH_ENTRY;

	/* Release flags on context. Protect from writes */
	smp_store_release(&entry->w_ctx.flags, flags);

	/* Protect syncs */
	smp_store_release(&rb->sync_point, sync_point);

	spin_lock_irq(&rb->s_lock);
	bio_list_add(&entry->w_ctx.bios, bio);
	spin_unlock_irq(&rb->s_lock);

	return 1;
}
/*
 * Check whether @nr_entries fit between the write pointer and the sync
 * pointer and, if so, move the L2P update pointer out of the way. On success
 * *@pos is set to the current write pointer and 1 is returned; otherwise 0.
 * The write pointer itself is not advanced here.
 */
static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
			       unsigned int *pos)
{
	unsigned int sync = READ_ONCE(rb->sync);
	unsigned int mem = READ_ONCE(rb->mem);
	unsigned int space;

	space = pblk_rb_ring_space(rb, mem, sync, rb->nr_entries);
	if (space < nr_entries)
		return 0;

	if (pblk_rb_update_l2p(rb, nr_entries, mem, sync))
		return 0;

	*pos = mem;
	return 1;
}
/*
 * Reserve @nr_entries in the buffer: on success *@pos holds the first
 * reserved position, the write pointer is advanced past the reservation and
 * 1 is returned. Returns 0 if there is no room.
 */
static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
			     unsigned int *pos)
{
	unsigned int new_mem;

	if (!__pblk_rb_may_write(rb, nr_entries, pos))
		return 0;

	new_mem = (*pos + nr_entries) & (rb->nr_entries - 1);

	/* Protect from read count */
	smp_store_release(&rb->mem, new_mem);
	return 1;
}
/*
 * Like pblk_rb_may_write(), but also handles REQ_PREFLUSH bios: a sync point
 * is set at the end of the reservation before the write pointer is published.
 *
 * *@io_ret is NVM_IO_DONE unless a sync point was set for @bio, in which
 * case it is NVM_IO_OK. It is only written when the reservation succeeds.
 * Returns 1 on success, 0 if there is no room.
 */
static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
				   unsigned int *pos, struct bio *bio,
				   int *io_ret)
{
	unsigned int new_mem;

	if (!__pblk_rb_may_write(rb, nr_entries, pos))
		return 0;

	new_mem = (*pos + nr_entries) & (rb->nr_entries - 1);
	*io_ret = NVM_IO_DONE;

	if (bio->bi_opf & REQ_PREFLUSH) {
#ifdef CONFIG_NVM_DEBUG
		struct pblk *pblk = container_of(rb, struct pblk, rwb);

		atomic_long_inc(&pblk->nr_flush);
#endif
		if (pblk_rb_sync_point_set(rb, bio, new_mem))
			*io_ret = NVM_IO_OK;
	}

	/* Protect from read count */
	smp_store_release(&rb->mem, new_mem);
	return 1;
}
/*
 * Atomically check that (i) there is space on the write buffer for the
 * incoming I/O, and (ii) the current I/O type has enough budget in the write
 * buffer (rate-limiter).
 *
 * Both checks and the reservation happen under rb->w_lock. Returns
 * NVM_IO_REQUEUE if either check fails; otherwise the user budget is charged
 * and the flush status reported by pblk_rb_may_write_flush() is returned.
 */
int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio,
			   unsigned int nr_entries, unsigned int *pos)
{
	struct pblk *pblk = container_of(rb, struct pblk, rwb);
	int io_ret = NVM_IO_REQUEUE;
	int flush_done;

	spin_lock(&rb->w_lock);

	if (pblk_rl_user_may_insert(&pblk->rl, nr_entries) &&
	    pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &flush_done)) {
		pblk_rl_user_in(&pblk->rl, nr_entries);
		io_ret = flush_done;
	}

	spin_unlock(&rb->w_lock);

	return io_ret;
}
/*
 * GC counterpart of pblk_rb_may_write_user(): under rb->w_lock, check the GC
 * rate-limiter budget and the buffer space, reserve @nr_entries and charge
 * the GC budget. Returns 1 on success, 0 otherwise.
 */
int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
			 unsigned int *pos)
{
	struct pblk *pblk = container_of(rb, struct pblk, rwb);
	int admitted = 0;

	spin_lock(&rb->w_lock);

	if (pblk_rl_gc_may_insert(&pblk->rl, nr_entries) &&
	    pblk_rb_may_write(rb, nr_entries, pos)) {
		pblk_rl_gc_in(&pblk->rl, nr_entries);
		admitted = 1;
	}

	spin_unlock(&rb->w_lock);

	return admitted;
}
/*
 * Add the data page of every entry on @list to @bio, unlinking each entry
 * from the list once it has been added. Stops early, with an error message,
 * when more than @max entries have been read or a page cannot be added.
 * Returns the number of entries added.
 *
 * The caller of this function must ensure that the backpointer will not
 * overwrite the entries passed on the list.
 */
unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
				      struct list_head *list,
				      unsigned int max)
{
	struct pblk_rb_entry *entry, *next;
	struct page *page;
	unsigned int added = 0;
	int ret;

	list_for_each_entry_safe(entry, next, list, index) {
		if (added > max) {
			pr_err("pblk: too many entries on list\n");
			break;
		}

		page = virt_to_page(entry->data);
		if (!page) {
			pr_err("pblk: could not allocate write bio page\n");
			break;
		}

		ret = bio_add_page(bio, page, rb->seg_size, 0);
		if (ret != rb->seg_size) {
			pr_err("pblk: could not add page to write bio\n");
			break;
		}

		list_del(&entry->index);
		added++;
	}

	return added;
}
/*
 * Read available entries on rb and add them to the given bio. To avoid a memory
 * copy, a page reference to the write buffer is used to be added to the bio.
 *
 * This function is used by the write thread to form the write bio that will
 * persist data on the write buffer to the media.
 *
 * @c_ctx:      completion context; receives the start position and the number
 *              of valid and padded entries
 * @pos:        buffer position of the first entry to read
 * @nr_entries: number of sectors the write request will carry
 * @count:      number of entries actually available; when smaller than
 *              @nr_entries the difference is accounted as padding
 *
 * Each entry is moved from PBLK_WRITTEN_DATA to PBLK_SUBMITTED_ENTRY once its
 * page is on the bio. Returns the number of entries read, or 0 if a page
 * could not be added (the failing entry is still marked submitted).
 */
unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio,
				 struct pblk_c_ctx *c_ctx,
				 unsigned int pos,
				 unsigned int nr_entries,
				 unsigned int count)
{
	struct pblk *pblk = container_of(rb, struct pblk, rwb);
	struct pblk_rb_entry *entry;
	struct page *page;
	unsigned int pad = 0, to_read = nr_entries;
	unsigned int user_io = 0, gc_io = 0;
	unsigned int i;
	int flags;
	int ret;

	if (count < nr_entries) {
		pad = nr_entries - count;
		to_read = count;
	}

	c_ctx->sentry = pos;
	c_ctx->nr_valid = to_read;
	c_ctx->nr_padded = pad;

	for (i = 0; i < to_read; i++) {
		entry = &rb->entries[pos];

		/* A write has been allowed into the buffer, but data is still
		 * being copied to it. It is ok to busy wait.
		 */
		do {
			flags = READ_ONCE(entry->w_ctx.flags);
		} while (!(flags & PBLK_WRITTEN_DATA));

		if (flags & PBLK_IOTYPE_USER)
			user_io++;
		else if (flags & PBLK_IOTYPE_GC)
			gc_io++;
		else
			WARN(1, "pblk: unknown IO type\n");

		page = virt_to_page(entry->data);
		if (!page) {
			pr_err("pblk: could not allocate write bio page\n");
			goto mark_submitted;
		}

		ret = bio_add_page(bio, page, rb->seg_size, 0);
		if (ret != rb->seg_size) {
			pr_err("pblk: could not add page to write bio\n");
			goto mark_submitted;
		}

		if (flags & PBLK_FLUSH_ENTRY) {
			unsigned int sync_point = READ_ONCE(rb->sync_point);

			if (sync_point == pos) {
				/* Protect syncs */
				smp_store_release(&rb->sync_point, EMPTY_ENTRY);
			}

			flags &= ~PBLK_FLUSH_ENTRY;
#ifdef CONFIG_NVM_DEBUG
			atomic_dec(&rb->inflight_sync_point);
#endif
		}

		flags &= ~PBLK_WRITTEN_DATA;
		flags |= PBLK_SUBMITTED_ENTRY;

		/* Release flags on context. Protect from writes */
		smp_store_release(&entry->w_ctx.flags, flags);

		pos = (pos + 1) & (rb->nr_entries - 1);
	}

	pblk_rl_out(&pblk->rl, user_io, gc_io);
#ifdef CONFIG_NVM_DEBUG
	atomic_long_add(pad, &pblk->padded_writes);
#endif
	return to_read;

mark_submitted:
	/* The failing entry is still handed over as submitted */
	flags &= ~PBLK_WRITTEN_DATA;
	flags |= PBLK_SUBMITTED_ENTRY;

	/* Release flags on context. Protect from writes */
	smp_store_release(&entry->w_ctx.flags, flags);
	return 0;
}
/*
 * Copy to bio only if the lba matches the one on the given cache entry.
 * Otherwise, it means that the entry has been overwritten, and the bio should
 * be directed to disk.
 *
 * Runs under rb->w_lock. Returns 1 if one segment was copied from the entry
 * at @pos into @bio, 0 if the entry no longer holds @lba or is writable.
 */
int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
			u64 pos, int bio_iter)
{
	struct pblk_rb_entry *entry;
	struct pblk_w_ctx *w_ctx;
	int flags;
	int copied = 0;

	spin_lock(&rb->w_lock);

#ifdef CONFIG_NVM_DEBUG
	/* Caller must ensure that the access will not cause an overflow */
	BUG_ON(pos >= rb->nr_entries);
#endif
	entry = &rb->entries[pos];
	w_ctx = &entry->w_ctx;
	flags = READ_ONCE(w_ctx->flags);

	/* Only a still-cached entry for this lba may be copied; anything else
	 * has been overwritten or is scheduled to be.
	 */
	if (w_ctx->lba == lba && !(flags & PBLK_WRITABLE_ENTRY)) {
		/* Only advance the bio if it hasn't been advanced already. If
		 * advanced, this bio is at least a partial bio (i.e., it has
		 * partially been filled with data from the cache). If part of
		 * the data resides on the media, we will read later on
		 */
		if (unlikely(!bio->bi_iter.bi_idx))
			bio_advance(bio, bio_iter * PBLK_EXPOSED_PAGE_SIZE);

		memcpy(bio_data(bio), entry->data, rb->seg_size);
		copied = 1;
	}

	spin_unlock(&rb->w_lock);
	return copied;
}
struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos)
{
unsigned int entry = pos & (rb->nr_entries - 1);
return &rb->entries[entry].w_ctx;
}
/*
 * Take rb->s_lock and return the current sync pointer. With a non-NULL
 * @flags the irq state is saved into it (irqsave variant); with NULL the
 * plain irq-disabling variant is used.
 */
unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags)
	__acquires(&rb->s_lock)
{
	if (!flags)
		spin_lock_irq(&rb->s_lock);
	else
		spin_lock_irqsave(&rb->s_lock, *flags);

	return rb->sync;
}
/*
 * Release rb->s_lock taken by pblk_rb_sync_init(). @flags must match what
 * was passed there: non-NULL restores the saved irq state, NULL re-enables
 * irqs.
 */
void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags)
	__releases(&rb->s_lock)
{
	lockdep_assert_held(&rb->s_lock);

	if (!flags)
		spin_unlock_irq(&rb->s_lock);
	else
		spin_unlock_irqrestore(&rb->s_lock, *flags);
}
/*
 * Advance the sync pointer by @nr_entries, wrapping at the buffer size, and
 * publish it. Must be called with rb->s_lock held. Returns the new sync
 * pointer.
 *
 * The buffer size is a power of two, so a single masked addition yields the
 * same result as stepping the pointer one entry at a time; this is the same
 * form pblk_rb_read_commit() uses for the submission pointer.
 */
unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
{
	unsigned int sync;

	lockdep_assert_held(&rb->s_lock);

	sync = READ_ONCE(rb->sync);
	sync = (sync + nr_entries) & (rb->nr_entries - 1);

	/* Protect from counts */
	smp_store_release(&rb->sync, sync);

	return sync;
}
/*
 * Number of entries from the submission pointer up to and including the
 * current sync point, or 0 when no sync point is set.
 */
unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb)
{
	unsigned int subm;
	/* Protect syncs */
	unsigned int sync_point = smp_load_acquire(&rb->sync_point);

	if (sync_point == EMPTY_ENTRY)
		return 0;

	subm = READ_ONCE(rb->subm);

	/* The sync point itself counts as a sector to sync */
	return pblk_rb_ring_count(sync_point, subm, rb->nr_entries) + 1;
}
/*
 * Scan from the current position of the sync pointer to find the entry that
 * corresponds to the given ppa. This is necessary since write requests can be
 * completed out of order. The assumption is that the ppa is close to the sync
 * pointer thus the search will not take long.
 *
 * The caller of this function must guarantee that the sync pointer will not
 * reach the entry while it is using the metadata associated with it. With this
 * assumption in mind, there is no need to take the sync lock.
 *
 * NOTE(review): as written the loop only walks the positions between the
 * sync and submission pointers; @ppa is never compared against any entry and
 * the function always returns NULL.
 */
struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
					      struct ppa_addr *ppa)
{
	unsigned int cursor = READ_ONCE(rb->sync);
	unsigned int subm = READ_ONCE(rb->subm);
	unsigned int remaining;

	remaining = pblk_rb_ring_count(subm, cursor, rb->nr_entries);

	while (remaining--)
		cursor = (cursor + 1) & (rb->nr_entries - 1);

	return NULL;
}
/*
 * Sanity check run with both rb->w_lock and rb->s_lock held.
 *
 * Returns 0 when all four ring pointers coincide and no sync point is set.
 * Otherwise returns 1 if the entry array is missing or any entry has no data
 * buffer, and 0 if every entry still has one.
 */
int pblk_rb_tear_down_check(struct pblk_rb *rb)
{
	int drained;
	int ret = 0;
	int i;

	spin_lock(&rb->w_lock);
	spin_lock_irq(&rb->s_lock);

	drained = (rb->mem == rb->subm) && (rb->subm == rb->sync) &&
		  (rb->sync == rb->l2p_update) &&
		  (rb->sync_point == EMPTY_ENTRY);

	if (!drained) {
		if (!rb->entries) {
			ret = 1;
		} else {
			for (i = 0; i < rb->nr_entries; i++) {
				if (!rb->entries[i].data) {
					ret = 1;
					break;
				}
			}
		}
	}

	spin_unlock(&rb->w_lock);
	spin_unlock_irq(&rb->s_lock);

	return ret;
}
/* Wrap @pos into the ring by masking it with (rb->nr_entries - 1). */
unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos)
{
	unsigned int mask = rb->nr_entries - 1;

	return pos & mask;
}
/* Return 1 if @pos lies at or beyond rb->nr_entries, 0 otherwise. */
int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos)
{
	if (pos < rb->nr_entries)
		return 0;

	return 1;
}
/*
 * Format a one-line snapshot of the ring buffer state into @buf.
 *
 * The line carries, in order: nr_entries, mem, subm, sync, l2p_update, the
 * in-flight sync point counter (0 unless CONFIG_NVM_DEBUG), the sync point
 * (or the literal "NULL" when none is set), then read count / free space /
 * sync point count, and finally the number of contexts queued on the
 * completion list. Only the completion list walk is done under rb->s_lock;
 * the remaining fields are read without it.
 *
 * Returns the number of characters written (at most PAGE_SIZE).
 */
ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
{
	struct pblk *pblk = container_of(rb, struct pblk, rwb);
	struct pblk_c_ctx *c;
	int queued_entries = 0;
	int inflight = 0;

	spin_lock_irq(&rb->s_lock);
	list_for_each_entry(c, &pblk->compl_list, list)
		queued_entries++;
	spin_unlock_irq(&rb->s_lock);

#ifdef CONFIG_NVM_DEBUG
	inflight = atomic_read(&rb->inflight_sync_point);
#endif

	if (rb->sync_point == EMPTY_ENTRY)
		return scnprintf(buf, PAGE_SIZE,
			"%u\t%u\t%u\t%u\t%u\t%u\tNULL - %u/%u/%u - %d\n",
			rb->nr_entries,
			rb->mem,
			rb->subm,
			rb->sync,
			rb->l2p_update,
			inflight,
			pblk_rb_read_count(rb),
			pblk_rb_space(rb),
			pblk_rb_sync_point_count(rb),
			queued_entries);

	return scnprintf(buf, PAGE_SIZE,
		"%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n",
		rb->nr_entries,
		rb->mem,
		rb->subm,
		rb->sync,
		rb->l2p_update,
		inflight,
		rb->sync_point,
		pblk_rb_read_count(rb),
		pblk_rb_space(rb),
		pblk_rb_sync_point_count(rb),
		queued_entries);
}

View File

@ -0,0 +1,529 @@
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-read.c - pblk's read path
*/
#include "pblk.h"
/*
 * Copy the sector mapped to @lba from the write buffer (cache) into @bio.
 *
 * There is no guarantee that the value read from cache has not been updated
 * and now resides at another location in the cache. We guarantee though that
 * if the value is read from the cache, it belongs to the mapped lba. In order
 * to guarantee ordering between writes and reads, a flush must be issued.
 *
 * @ppa must be a non-empty cache address; this is only enforced (BUG_ON) when
 * CONFIG_NVM_DEBUG is set. @bio_iter is forwarded untouched to
 * pblk_rb_copy_to_bio(), whose return value is returned as-is. Callers in
 * this file treat a zero return as "look the lba up again and retry".
 */
static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
sector_t lba, struct ppa_addr ppa,
int bio_iter)
{
#ifdef CONFIG_NVM_DEBUG
/* Callers must ensure that the ppa points to a cache address */
BUG_ON(pblk_ppa_empty(ppa));
BUG_ON(!pblk_addr_in_cache(ppa));
#endif
/* The cache line index is derived from the ppa itself */
return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba,
pblk_addr_to_cacheline(ppa), bio_iter);
}
/*
 * Resolve a multi-sector read against the L2P table.
 *
 * For each of the rqd->nr_ppas sectors starting at the bio's first lba:
 *  - an empty mapping is marked as done in @read_bitmap;
 *  - a cache mapping is copied from the write buffer and marked as done; if
 *    the copy reports failure (0) the lba is looked up again and retried;
 *  - anything else is appended to rqd->ppa_list to be read from the media.
 *
 * Once one sector has been served from the cache, the bio is advanced by one
 * exposed page for that sector and for every following sector that is not
 * skipped as empty.
 *
 * If the request crosses the exposed capacity (pblk->rl.nr_secs) a warning is
 * raised and nothing is filled in.
 */
static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
				 unsigned long *read_bitmap)
{
	struct bio *bio = rqd->bio;
	struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
	sector_t blba = pblk_get_lba(bio);
	int nr_secs = rqd->nr_ppas;
	int advanced_bio = 0;
	int i, j = 0;

	/* logic error: lba out-of-bounds. Ignore read request */
	if (!(blba + nr_secs < pblk->rl.nr_secs)) {
		/*
		 * WARN_ON() takes a condition, not a message: passing a string
		 * literal is always true and the text is never printed.
		 */
		WARN(1, "pblk: read lbas out of bounds\n");
		return;
	}

	pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs);

	for (i = 0; i < nr_secs; i++) {
		struct ppa_addr p = ppas[i];
		sector_t lba = blba + i;

retry:
		if (pblk_ppa_empty(p)) {
			WARN_ON(test_and_set_bit(i, read_bitmap));
			continue;
		}

		/* Try to read from write buffer. The address is later checked
		 * on the write buffer to prevent retrieving overwritten data.
		 */
		if (pblk_addr_in_cache(p)) {
			if (!pblk_read_from_cache(pblk, bio, lba, p, i)) {
				pblk_lookup_l2p_seq(pblk, &p, lba, 1);
				goto retry;
			}
			WARN_ON(test_and_set_bit(i, read_bitmap));
			advanced_bio = 1;
		} else {
			/* Read from media non-cached sectors */
			rqd->ppa_list[j++] = p;
		}

		if (advanced_bio)
			bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
	}

#ifdef CONFIG_NVM_DEBUG
	atomic_long_add(nr_secs, &pblk->inflight_reads);
#endif
}
/*
 * Set the read mode flags on @rqd and submit it.
 *
 * Any non-zero result from pblk_submit_io() is collapsed to NVM_IO_ERR;
 * success is reported as NVM_IO_OK.
 */
static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd)
{
	rqd->flags = pblk_set_read_mode(pblk);

	return pblk_submit_io(pblk, rqd) ? NVM_IO_ERR : NVM_IO_OK;
}
/*
 * Completion handler for read requests.
 *
 * Logs device errors, releases the ppa list for multi-sector requests, drops
 * the reference on rqd->bio and, if the request carries an original bio in
 * its read context, ends and releases that one too. The request itself is
 * freed last. rqd->private must point to the owning pblk instance.
 */
static void pblk_end_io_read(struct nvm_rq *rqd)
{
	struct pblk *pblk = rqd->private;
	struct nvm_tgt_dev *dev = pblk->dev;
	struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd);
	struct bio *bio = rqd->bio;
	struct bio *orig_bio;

	if (rqd->error)
		pblk_log_read_err(pblk, rqd);
#ifdef CONFIG_NVM_DEBUG
	else
		WARN_ONCE(bio->bi_error, "pblk: corrupted read error\n");
#endif

	/* A ppa list is only allocated for multi-sector requests */
	if (rqd->nr_ppas > 1)
		nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);

	bio_put(bio);

	orig_bio = r_ctx->orig_bio;
	if (orig_bio) {
#ifdef CONFIG_NVM_DEBUG
		WARN_ONCE(orig_bio->bi_error,
			  "pblk: corrupted read bio\n");
#endif
		bio_endio(orig_bio);
		bio_put(orig_bio);
	}

#ifdef CONFIG_NVM_DEBUG
	atomic_long_add(rqd->nr_ppas, &pblk->sync_reads);
	atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads);
#endif

	pblk_free_rqd(pblk, rqd, READ);
}
/*
 * Complete a read that was only partially served from the write buffer.
 *
 * The sectors still missing ("holes", the zero bits of @read_bitmap) are read
 * synchronously from the device into a temporary bio backed by pages from
 * pblk->page_pool, then copied into the matching bio_vecs of the original
 * bio, starting at @bio_init_idx. On success the original bio is ended and
 * the request is completed through pblk_end_io_read().
 *
 * On every failure path the temporary bio is released (when it exists), the
 * request is pointed back at the original bio with its original sector
 * count, and it is completed through pblk_end_io_read() so that neither the
 * request, its ppa list nor the caller's bio reference is leaked.
 *
 * Returns NVM_IO_OK on success, NVM_IO_ERR otherwise. The request must not be
 * touched by the caller after this function returns.
 */
static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
				      unsigned int bio_init_idx,
				      unsigned long *read_bitmap)
{
	struct bio *new_bio, *bio = rqd->bio;
	struct bio_vec src_bv, dst_bv;
	void *ppa_ptr = NULL;
	void *src_p, *dst_p;
	dma_addr_t dma_ppa_list = 0;
	int nr_secs = rqd->nr_ppas;
	int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
	int i, ret, hole;
	DECLARE_COMPLETION_ONSTACK(wait);

	new_bio = bio_alloc(GFP_KERNEL, nr_holes);
	if (!new_bio) {
		pr_err("pblk: could not alloc read bio\n");
		goto err_end_rq;
	}

	/*
	 * NOTE(review): if pblk_bio_add_pages() already releases the pages it
	 * added when it fails, this path should skip the page free below -
	 * confirm against its implementation.
	 */
	if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
		goto err;

	if (nr_holes != new_bio->bi_vcnt) {
		pr_err("pblk: malformed bio\n");
		goto err;
	}

	new_bio->bi_iter.bi_sector = 0; /* internal bio */
	bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
	new_bio->bi_private = &wait;
	new_bio->bi_end_io = pblk_end_bio_sync;

	/* Temporarily retarget the request at the holes only */
	rqd->bio = new_bio;
	rqd->nr_ppas = nr_holes;
	rqd->end_io = NULL;

	/* A single hole is addressed inline; keep the list to restore later */
	if (unlikely(nr_secs > 1 && nr_holes == 1)) {
		ppa_ptr = rqd->ppa_list;
		dma_ppa_list = rqd->dma_ppa_list;
		rqd->ppa_addr = rqd->ppa_list[0];
	}

	ret = pblk_submit_read_io(pblk, rqd);
	if (ret) {
		/* new_bio is released exactly once, in the error path below */
		pr_err("pblk: read IO submission failed\n");
		goto err;
	}

	if (!wait_for_completion_io_timeout(&wait,
				msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
		pr_err("pblk: partial read I/O timed out\n");
	}

	if (rqd->error) {
		atomic_long_inc(&pblk->read_failed);
#ifdef CONFIG_NVM_DEBUG
		pblk_print_failed_rqd(pblk, rqd, rqd->error);
#endif
	}

	if (unlikely(nr_secs > 1 && nr_holes == 1)) {
		rqd->ppa_list = ppa_ptr;
		rqd->dma_ppa_list = dma_ppa_list;
	}

	/* Fill the holes in the original bio */
	i = 0;
	hole = find_first_zero_bit(read_bitmap, nr_secs);
	do {
		src_bv = new_bio->bi_io_vec[i++];
		dst_bv = bio->bi_io_vec[bio_init_idx + hole];

		src_p = kmap_atomic(src_bv.bv_page);
		dst_p = kmap_atomic(dst_bv.bv_page);

		memcpy(dst_p + dst_bv.bv_offset,
			src_p + src_bv.bv_offset,
			PBLK_EXPOSED_PAGE_SIZE);

		/* Atomic kmaps nest: release them in reverse mapping order */
		kunmap_atomic(dst_p);
		kunmap_atomic(src_p);

		mempool_free(src_bv.bv_page, pblk->page_pool);

		hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1);
	} while (hole < nr_secs);

	bio_put(new_bio);

	/* Complete the original bio and associated request */
	rqd->bio = bio;
	rqd->nr_ppas = nr_secs;
	rqd->private = pblk;

	bio_endio(bio);
	pblk_end_io_read(rqd);
	return NVM_IO_OK;

err:
	/* Free allocated pages in new bio */
	pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt);
	bio_put(new_bio);

	/* Undo the temporary retargeting before ending the request */
	if (unlikely(nr_secs > 1 && nr_holes == 1) && ppa_ptr) {
		rqd->ppa_list = ppa_ptr;
		rqd->dma_ppa_list = dma_ppa_list;
	}
	rqd->bio = bio;
	rqd->nr_ppas = nr_secs;
err_end_rq:
	rqd->private = pblk;
	pblk_end_io_read(rqd);
	return NVM_IO_ERR;
}
/*
 * Resolve a single-sector read against the L2P table.
 *
 * An empty mapping, or a mapping that is successfully served from the write
 * buffer, sets bit 0 of @read_bitmap. A failed cache copy (0) triggers a new
 * lookup and a retry. Any other mapping is stored in rqd->ppa_addr to be
 * read from the media.
 *
 * If the lba is beyond the exposed capacity (pblk->rl.nr_secs) a warning is
 * raised and nothing is filled in.
 */
static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
			 unsigned long *read_bitmap)
{
	struct bio *bio = rqd->bio;
	struct ppa_addr ppa;
	sector_t lba = pblk_get_lba(bio);

	/* logic error: lba out-of-bounds. Ignore read request */
	if (!(lba < pblk->rl.nr_secs)) {
		/*
		 * WARN_ON() takes a condition, not a message: passing a string
		 * literal is always true and the text is never printed.
		 */
		WARN(1, "pblk: read lba out of bounds\n");
		return;
	}

	pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);

#ifdef CONFIG_NVM_DEBUG
	atomic_long_inc(&pblk->inflight_reads);
#endif

retry:
	if (pblk_ppa_empty(ppa)) {
		WARN_ON(test_and_set_bit(0, read_bitmap));
		return;
	}

	/* Try to read from write buffer. The address is later checked on the
	 * write buffer to prevent retrieving overwritten data.
	 */
	if (pblk_addr_in_cache(ppa)) {
		if (!pblk_read_from_cache(pblk, bio, lba, ppa, 0)) {
			pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
			goto retry;
		}
		WARN_ON(test_and_set_bit(0, read_bitmap));
	} else {
		rqd->ppa_addr = ppa;
	}
}
int pblk_submit_read(struct pblk *pblk, struct bio *bio)
{
struct nvm_tgt_dev *dev = pblk->dev;
int nr_secs = pblk_get_secs(bio);
struct nvm_rq *rqd;
unsigned long read_bitmap; /* Max 64 ppas per request */
unsigned int bio_init_idx;
int ret = NVM_IO_ERR;
if (nr_secs > PBLK_MAX_REQ_ADDRS)
return NVM_IO_ERR;
bitmap_zero(&read_bitmap, nr_secs);
rqd = pblk_alloc_rqd(pblk, READ);
if (IS_ERR(rqd)) {
pr_err_ratelimited("pblk: not able to alloc rqd");
return NVM_IO_ERR;
}
rqd->opcode = NVM_OP_PREAD;
rqd->bio = bio;
rqd->nr_ppas = nr_secs;
rqd->private = pblk;
rqd->end_io = pblk_end_io_read;
/* Save the index for this bio's start. This is needed in case
* we need to fill a partial read.
*/
bio_init_idx = pblk_get_bi_idx(bio);
if (nr_secs > 1) {
rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
&rqd->dma_ppa_list);
if (!rqd->ppa_list) {
pr_err("pblk: not able to allocate ppa list\n");
goto fail_rqd_free;
}
pblk_read_ppalist_rq(pblk, rqd, &read_bitmap);
} else {
pblk_read_rq(pblk, rqd, &read_bitmap);
}
bio_get(bio);
if (bitmap_full(&read_bitmap, nr_secs)) {
bio_endio(bio);
pblk_end_io_read(rqd);
return NVM_IO_OK;
}
/* All sectors are to be read from the device */
if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) {
struct bio *int_bio = NULL;
struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd);
/* Clone read bio to deal with read errors internally */
int_bio = bio_clone_bioset(bio, GFP_KERNEL, fs_bio_set);
if (!int_bio) {
pr_err("pblk: could not clone read bio\n");
return NVM_IO_ERR;
}
rqd->bio = int_bio;
r_ctx->orig_bio = bio;
ret = pblk_submit_read_io(pblk, rqd);
if (ret) {
pr_err("pblk: read IO submission failed\n");
if (int_bio)
bio_put(int_bio);
return ret;
}
return NVM_IO_OK;
}
/* The read bio request could be partially filled by the write buffer,
* but there are some holes that need to be read from the drive.
*/
ret = pblk_fill_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap);
if (ret) {
pr_err("pblk: failed to perform partial read\n");
return ret;
}
return NVM_IO_OK;
fail_rqd_free:
pblk_free_rqd(pblk, rqd, READ);
return ret;
}
/*
 * Build the ppa list for a multi-sector GC read.
 *
 * Looks up the current mapping of every lba in @lba_list. A mapping is kept
 * only if it is not in the write buffer, still points into @line and is not
 * empty; kept mappings are packed at the front of rqd->ppa_list. Every other
 * lba is overwritten with ADDR_EMPTY in @lba_list.
 *
 * Returns the number of mappings kept.
 */
static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
			      struct pblk_line *line, u64 *lba_list,
			      unsigned int nr_secs)
{
	struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
	int valid_secs = 0;
	int i;

	pblk_lookup_l2p_rand(pblk, ppas, lba_list, nr_secs);

	for (i = 0; i < nr_secs; i++) {
		struct ppa_addr *cur = &ppas[i];

		if (!pblk_addr_in_cache(*cur) && cur->g.blk == line->id &&
		    !pblk_ppa_empty(*cur)) {
			rqd->ppa_list[valid_secs++] = *cur;
			continue;
		}

		/* Stale, cached or unmapped: nothing to move for this lba */
		lba_list[i] = ADDR_EMPTY;
	}

#ifdef CONFIG_NVM_DEBUG
	atomic_long_add(valid_secs, &pblk->inflight_reads);
#endif

	return valid_secs;
}
/*
 * Prepare a single-sector GC read.
 *
 * Looks up the current mapping of @lba under pblk->trans_lock. The mapping is
 * used (stored in rqd->ppa_addr) only if it is not in the write buffer, still
 * points into @line and is not empty.
 *
 * Returns 1 if the sector must be read, 0 if there is nothing to do. An lba
 * of ADDR_EMPTY is silently skipped; any other lba at or beyond
 * pblk->rl.nr_secs raises a warning and is skipped too.
 */
static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
		      struct pblk_line *line, sector_t lba)
{
	struct ppa_addr ppa;
	int valid_secs = 0;

	/*
	 * Filter the "no lba" marker first: checked after the bounds test it
	 * would be reported as a logic error instead of being skipped
	 * (presumably ADDR_EMPTY lies beyond any valid lba - confirm).
	 */
	if (lba == ADDR_EMPTY)
		goto out;

	/* logic error: lba out-of-bounds */
	if (!(lba < pblk->rl.nr_secs)) {
		/* WARN_ON("...") is always true and never prints its text */
		WARN(1, "pblk: read lba out of bounds\n");
		goto out;
	}

	spin_lock(&pblk->trans_lock);
	ppa = pblk_trans_map_get(pblk, lba);
	spin_unlock(&pblk->trans_lock);

	/* Ignore updated values until the moment */
	if (pblk_addr_in_cache(ppa) || ppa.g.blk != line->id ||
	    pblk_ppa_empty(ppa))
		goto out;

	rqd->ppa_addr = ppa;
	valid_secs = 1;

#ifdef CONFIG_NVM_DEBUG
	atomic_long_inc(&pblk->inflight_reads);
#endif

out:
	return valid_secs;
}
/*
 * Synchronously read the still-valid sectors of @line for garbage collection.
 *
 * @lba_list holds @nr_secs candidate lbas; entries that are no longer valid
 * in @line are replaced by ADDR_EMPTY (multi-sector case). The surviving
 * sectors are read into @data and their count is stored in @secs_to_gc.
 *
 * Returns NVM_IO_OK when there was nothing to read or the read was issued and
 * waited for (device errors are only counted/logged), NVM_IO_ERR when the ppa
 * list or the bio could not be set up or the submission failed. The ppa list
 * is released on every path on which it is still allocated.
 */
int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data,
			unsigned int nr_secs, unsigned int *secs_to_gc,
			struct pblk_line *line)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct request_queue *q = dev->q;
	struct bio *bio;
	struct nvm_rq rqd;
	int ret, data_len;
	int free_ppa_list = 0;	/* set while rqd.ppa_list is owned here */
	DECLARE_COMPLETION_ONSTACK(wait);

	memset(&rqd, 0, sizeof(struct nvm_rq));

	if (nr_secs > 1) {
		rqd.ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
							&rqd.dma_ppa_list);
		if (!rqd.ppa_list)
			return NVM_IO_ERR;
		free_ppa_list = 1;

		*secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, line, lba_list,
								nr_secs);
		if (*secs_to_gc == 1) {
			struct ppa_addr ppa;

			/* A single sector is addressed inline, not by list */
			ppa = rqd.ppa_list[0];
			nvm_dev_dma_free(dev->parent, rqd.ppa_list,
							rqd.dma_ppa_list);
			free_ppa_list = 0;
			rqd.ppa_addr = ppa;
		}
	} else {
		*secs_to_gc = read_rq_gc(pblk, &rqd, line, lba_list[0]);
	}

	if (!(*secs_to_gc))
		goto out;

	data_len = (*secs_to_gc) * geo->sec_size;
	bio = bio_map_kern(q, data, data_len, GFP_KERNEL);
	if (IS_ERR(bio)) {
		/* PTR_ERR() yields a (negative) long */
		pr_err("pblk: could not allocate GC bio (%ld)\n", PTR_ERR(bio));
		goto err_free_dma;
	}

	bio->bi_iter.bi_sector = 0; /* internal bio */
	bio_set_op_attrs(bio, REQ_OP_READ, 0);

	rqd.opcode = NVM_OP_PREAD;
	rqd.end_io = pblk_end_io_sync;
	rqd.private = &wait;
	rqd.nr_ppas = *secs_to_gc;
	rqd.bio = bio;

	ret = pblk_submit_read_io(pblk, &rqd);
	if (ret) {
		bio_endio(bio);
		pr_err("pblk: GC read request failed\n");
		goto err_free_dma;
	}

	if (!wait_for_completion_io_timeout(&wait,
				msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
		pr_err("pblk: GC read I/O timed out\n");
	}

	if (rqd.error) {
		atomic_long_inc(&pblk->read_failed_gc);
#ifdef CONFIG_NVM_DEBUG
		pblk_print_failed_rqd(pblk, &rqd, rqd.error);
#endif
	}

#ifdef CONFIG_NVM_DEBUG
	atomic_long_add(*secs_to_gc, &pblk->sync_reads);
	atomic_long_add(*secs_to_gc, &pblk->recov_gc_reads);
	atomic_long_sub(*secs_to_gc, &pblk->inflight_reads);
#endif

out:
	/*
	 * Keyed on ownership rather than rqd.nr_ppas, which is still 0 when
	 * no sector turned out to be valid.
	 */
	if (free_ppa_list)
		nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list);
	return NVM_IO_OK;

err_free_dma:
	if (free_ppa_list)
		nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list);
	return NVM_IO_ERR;
}

View File

@ -0,0 +1,998 @@
/*
* Copyright (C) 2016 CNEX Labs
* Initial: Javier Gonzalez <javier@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-recovery.c - pblk's recovery path
*/
#include "pblk.h"
/*
 * Work handler that re-submits the sectors of a failed write.
 *
 * The recovery context embedding @work carries the request to reuse and the
 * list of failed entries. The number of sectors to rewrite is the number of
 * bits set in rqd->ppa_status. Their data is pulled from the write buffer
 * into a fresh bio, the request is set up as a recovery write and submitted.
 *
 * The recovery context is returned to pblk->rec_pool on every path. On
 * failure the request (and the bio, once allocated) is released as well,
 * since nothing else will complete it.
 */
void pblk_submit_rec(struct work_struct *work)
{
	struct pblk_rec_ctx *recovery =
			container_of(work, struct pblk_rec_ctx, ws_rec);
	struct pblk *pblk = recovery->pblk;
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_rq *rqd = recovery->rqd;
	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
	int max_secs = nvm_max_phys_sects(dev);
	struct bio *bio;
	unsigned int nr_rec_secs;
	unsigned int pgs_read;
	int ret;

	nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status,
								max_secs);

	bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
	if (!bio) {
		pr_err("pblk: not able to create recovery bio\n");
		goto err_free_rqd;
	}

	bio->bi_iter.bi_sector = 0;
	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
	rqd->bio = bio;
	rqd->nr_ppas = nr_rec_secs;

	pgs_read = pblk_rb_read_to_bio_list(&pblk->rwb, bio, &recovery->failed,
								nr_rec_secs);
	if (pgs_read != nr_rec_secs) {
		pr_err("pblk: could not read recovery entries\n");
		goto err;
	}

	if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) {
		pr_err("pblk: could not setup recovery request\n");
		goto err;
	}

#ifdef CONFIG_NVM_DEBUG
	atomic_long_add(nr_rec_secs, &pblk->recov_writes);
#endif

	ret = pblk_submit_io(pblk, rqd);
	if (ret) {
		pr_err("pblk: I/O submission failed: %d\n", ret);
		goto err;
	}

	mempool_free(recovery, pblk->rec_pool);
	return;

err:
	bio_put(bio);
err_free_rqd:
	pblk_free_rqd(pblk, rqd, WRITE);
	/* The context has no other owner once this work item has run */
	mempool_free(recovery, pblk->rec_pool);
}
/*
 * Split a partially completed write context into "done" and "to recover".
 *
 * A new write request is allocated for the entries that still have to be
 * rewritten. Its ppa_status is the caller's completion bitmap (@comp_bits)
 * shifted right by @comp, i.e. without the first @comp completed entries,
 * and its context starts @comp entries after c_ctx->sentry (wrapped to the
 * ring). The valid/padded counters are redistributed between @c_ctx (what
 * completed) and the new context (what is left).
 *
 * The new request and @pblk are stored in @recovery.
 *
 * Returns 0 on success, -ENOMEM if the request could not be allocated.
 */
int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
			struct pblk_rec_ctx *recovery, u64 *comp_bits,
			unsigned int comp)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	int max_secs = nvm_max_phys_sects(dev);
	int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded;
	struct pblk_c_ctx *rec_ctx;
	struct nvm_rq *rec_rqd;

	rec_rqd = pblk_alloc_rqd(pblk, WRITE);
	if (IS_ERR(rec_rqd)) {
		pr_err("pblk: could not create recovery req.\n");
		return -ENOMEM;
	}
	rec_ctx = nvm_rq_to_pdu(rec_rqd);

	/* Copy completion bitmap, but exclude the first X completed entries */
	bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status,
			   (unsigned long int *)comp_bits,
			   comp, max_secs);

	/* Save the context for the entries that need to be re-written and
	 * update current context with the completed entries.
	 */
	rec_ctx->sentry = pblk_rb_wrap_pos(&pblk->rwb, c_ctx->sentry + comp);

	if (comp < c_ctx->nr_valid) {
		/* Some valid entries are still pending */
		rec_ctx->nr_valid = c_ctx->nr_valid - comp;
		rec_ctx->nr_padded = c_ctx->nr_padded;

		c_ctx->nr_valid = comp;
		c_ctx->nr_padded = 0;
	} else {
		/* All valid entries completed; only padding is left */
		rec_ctx->nr_valid = 0;
		rec_ctx->nr_padded = nr_entries - comp;

		c_ctx->nr_padded = comp - c_ctx->nr_valid;
	}

	recovery->rqd = rec_rqd;
	recovery->pblk = pblk;

	return 0;
}
/*
 * Validate a line's end metadata and return its lba list.
 *
 * Returns NULL if the stored CRC does not match the computed one or if the
 * header identifier is not PBLK_MAGIC; otherwise returns the lba list
 * obtained through pblk_line_emeta_to_lbas().
 */
__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta)
{
	u32 crc = pblk_calc_emeta_crc(pblk, emeta);

	if (le32_to_cpu(emeta->crc) != crc ||
	    le32_to_cpu(emeta->header.identifier) != PBLK_MAGIC)
		return NULL;

	return pblk_line_emeta_to_lbas(emeta);
}
/*
 * Rebuild the L2P entries of a closed line from its end metadata.
 *
 * Walks the line's data sectors (from just after the start metadata up to
 * the start of the end metadata), skipping sectors that fall on bad blocks.
 * Sectors whose stored lba is ADDR_EMPTY are marked invalid and subtracted
 * from the line's valid sector count; all others are mapped through
 * pblk_update_map(). The walk stops early once emeta->nr_valid_lbas
 * mappings have been restored.
 *
 * Returns 1 if the end metadata fails validation (the caller has to fall
 * back to another recovery method), 0 otherwise. A mismatch between the
 * recorded and the recovered number of lbas is only logged.
 */
static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	struct line_emeta *emeta = line->emeta;
	__le64 *lba_list;
	int data_start;
	int nr_data_lbas, nr_valid_lbas, nr_lbas = 0;
	int i;

	lba_list = pblk_recov_get_lba_list(pblk, emeta);
	if (!lba_list)
		return 1;

	data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec;
	nr_data_lbas = lm->sec_per_line - lm->emeta_sec;
	nr_valid_lbas = le64_to_cpu(emeta->nr_valid_lbas);

	for (i = data_start; i < nr_data_lbas && nr_lbas < nr_valid_lbas; i++) {
		struct ppa_addr ppa;
		int pos;

		ppa = addr_to_pblk_ppa(pblk, i, line->id);
		pos = pblk_ppa_to_pos(geo, ppa);

		/* Do not update bad blocks */
		if (test_bit(pos, line->blk_bitmap))
			continue;

		if (le64_to_cpu(lba_list[i]) == ADDR_EMPTY) {
			spin_lock(&line->lock);
			if (test_and_set_bit(i, line->invalid_bitmap))
				/*
				 * WARN_ON_ONCE() takes a condition; a string
				 * literal there is never printed.
				 */
				WARN_ONCE(1, "pblk: rec. double invalidate:\n");
			else
				line->vsc--;
			spin_unlock(&line->lock);

			continue;
		}

		pblk_update_map(pblk, le64_to_cpu(lba_list[i]), ppa);
		nr_lbas++;
	}

	/* Report the CPU-endian count, not the raw little-endian field */
	if (nr_valid_lbas != nr_lbas)
		pr_err("pblk: line %d - inconsistent lba list(%d/%d)\n",
				line->id, nr_valid_lbas, nr_lbas);

	line->left_msecs = 0;

	return 0;
}
/*
 * Number of sectors in @line that can hold user data: the line's total
 * sectors minus start metadata, end metadata and every sector that belongs
 * to a block flagged in the line's bad block bitmap.
 */
static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	int bad_blks = bitmap_weight(line->blk_bitmap, lm->blk_per_line);

	return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec -
				bad_blks * geo->sec_per_blk;
}
/*
 * Bundle of pre-allocated resources shared by the OOB recovery helpers in
 * this file. It is filled once per line by the caller and passed by value,
 * so the helpers can reuse the same request and buffers for every I/O.
 */
struct pblk_recov_alloc {
/* ppa list handed to the request (rqd->ppa_list) */
struct ppa_addr *ppa_list;
/* per-sector OOB metadata handed to the request (rqd->meta_list) */
struct pblk_sec_meta *meta_list;
/* request that is reset and reused for each recovery I/O */
struct nvm_rq *rqd;
/* kernel buffer mapped into each recovery bio */
void *data;
/* DMA address matching ppa_list */
dma_addr_t dma_ppa_list;
/* DMA address matching meta_list */
dma_addr_t dma_meta_list;
};
/*
 * Read the sectors of @line between @r_ptr and the line's write pointer
 * (line->cur_sec) and rebuild their L2P entries from the lbas stored in the
 * out-of-band metadata.
 *
 * The range is read in chunks sized by pblk_calc_secs() (at least
 * pblk->min_write_pgs), skipping addresses that fall on bad blocks. Each
 * chunk is submitted and waited for synchronously, reusing the request and
 * buffers in @p.
 *
 * Returns 0 on success (or when there is nothing to read), a negative error
 * when the bio cannot be mapped or the submission fails, and -EINTR on
 * timeout or device error.
 */
static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line,
			       struct pblk_recov_alloc p, u64 r_ptr)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct ppa_addr *ppa_list;
	struct pblk_sec_meta *meta_list;
	struct nvm_rq *rqd;
	struct bio *bio;
	void *data;
	dma_addr_t dma_ppa_list, dma_meta_list;
	u64 r_ptr_int;
	int left_ppas;
	int rq_ppas, rq_len;
	int i, j;
	int ret = 0;
	DECLARE_COMPLETION_ONSTACK(wait);

	ppa_list = p.ppa_list;
	meta_list = p.meta_list;
	rqd = p.rqd;
	data = p.data;
	dma_ppa_list = p.dma_ppa_list;
	dma_meta_list = p.dma_meta_list;

	left_ppas = line->cur_sec - r_ptr;
	if (!left_ppas)
		return 0;

	r_ptr_int = r_ptr;

next_read_rq:
	memset(rqd, 0, pblk_r_rq_size);

	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
	if (!rq_ppas)
		rq_ppas = pblk->min_write_pgs;
	rq_len = rq_ppas * geo->sec_size;

	bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
	if (IS_ERR(bio))
		return PTR_ERR(bio);

	bio->bi_iter.bi_sector = 0; /* internal bio */
	bio_set_op_attrs(bio, REQ_OP_READ, 0);

	rqd->bio = bio;
	rqd->opcode = NVM_OP_PREAD;
	rqd->flags = pblk_set_read_mode(pblk);
	rqd->meta_list = meta_list;
	rqd->nr_ppas = rq_ppas;
	rqd->ppa_list = ppa_list;
	rqd->dma_ppa_list = dma_ppa_list;
	rqd->dma_meta_list = dma_meta_list;
	rqd->end_io = pblk_end_io_sync;
	rqd->private = &wait;

	for (i = 0; i < rqd->nr_ppas; ) {
		struct ppa_addr ppa;
		int pos;

		ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
		pos = pblk_dev_ppa_to_pos(geo, ppa);

		/* Skip over addresses that land on a bad block */
		while (test_bit(pos, line->blk_bitmap)) {
			r_ptr_int += pblk->min_write_pgs;
			ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
			pos = pblk_dev_ppa_to_pos(geo, ppa);
		}

		for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++)
			rqd->ppa_list[i] =
				addr_to_gen_ppa(pblk, r_ptr_int, line->id);
	}

	/* If read fails, more padding is needed */
	ret = pblk_submit_io(pblk, rqd);
	if (ret) {
		pr_err("pblk: I/O submission failed: %d\n", ret);
		/* The bio was never issued; drop it as the OOB scan does */
		bio_put(bio);
		return ret;
	}

	if (!wait_for_completion_io_timeout(&wait,
				msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
		pr_err("pblk: L2P recovery read timed out\n");
		return -EINTR;
	}

	reinit_completion(&wait);

	/* At this point, the read should not fail. If it does, it is a problem
	 * we cannot recover from here. Need FTL log.
	 */
	if (rqd->error) {
		pr_err("pblk: L2P recovery failed (%d)\n", rqd->error);
		return -EINTR;
	}

	for (i = 0; i < rqd->nr_ppas; i++) {
		u64 lba = le64_to_cpu(meta_list[i].lba);

		/* Valid lbas are strictly below the exposed capacity */
		if (lba == ADDR_EMPTY || lba >= pblk->rl.nr_secs)
			continue;

		pblk_update_map(pblk, lba, rqd->ppa_list[i]);
	}

	left_ppas -= rq_ppas;
	if (left_ppas > 0)
		goto next_read_rq;

	return 0;
}
static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
struct pblk_recov_alloc p, int left_ppas)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct ppa_addr *ppa_list;
struct pblk_sec_meta *meta_list;
struct nvm_rq *rqd;
struct bio *bio;
void *data;
dma_addr_t dma_ppa_list, dma_meta_list;
__le64 *lba_list = pblk_line_emeta_to_lbas(line->emeta);
u64 w_ptr = line->cur_sec;
int left_line_ppas = line->left_msecs;
int rq_ppas, rq_len;
int i, j;
int ret = 0;
DECLARE_COMPLETION_ONSTACK(wait);
ppa_list = p.ppa_list;
meta_list = p.meta_list;
rqd = p.rqd;
data = p.data;
dma_ppa_list = p.dma_ppa_list;
dma_meta_list = p.dma_meta_list;
next_pad_rq:
rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
if (!rq_ppas)
rq_ppas = pblk->min_write_pgs;
rq_len = rq_ppas * geo->sec_size;
bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
if (IS_ERR(bio))
return PTR_ERR(bio);
bio->bi_iter.bi_sector = 0; /* internal bio */
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
memset(rqd, 0, pblk_r_rq_size);
rqd->bio = bio;
rqd->opcode = NVM_OP_PWRITE;
rqd->flags = pblk_set_progr_mode(pblk, WRITE);
rqd->meta_list = meta_list;
rqd->nr_ppas = rq_ppas;
rqd->ppa_list = ppa_list;
rqd->dma_ppa_list = dma_ppa_list;
rqd->dma_meta_list = dma_meta_list;
rqd->end_io = pblk_end_io_sync;
rqd->private = &wait;
for (i = 0; i < rqd->nr_ppas; ) {
struct ppa_addr ppa;
int pos;
w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
while (test_bit(pos, line->blk_bitmap)) {
w_ptr += pblk->min_write_pgs;
ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
}
for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) {
struct ppa_addr dev_ppa;
dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
pblk_map_invalidate(pblk, dev_ppa);
meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
lba_list[w_ptr] = cpu_to_le64(ADDR_EMPTY);
rqd->ppa_list[i] = dev_ppa;
}
}
ret = pblk_submit_io(pblk, rqd);
if (ret) {
pr_err("pblk: I/O submission failed: %d\n", ret);
return ret;
}
if (!wait_for_completion_io_timeout(&wait,
msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
pr_err("pblk: L2P recovery write timed out\n");
}
reinit_completion(&wait);
left_line_ppas -= rq_ppas;
left_ppas -= rq_ppas;
if (left_ppas > 0 && left_line_ppas)
goto next_pad_rq;
return 0;
}
/* When this function is called, it means that not all upper pages have been
 * written in a page that contains valid data. In order to recover this data, we
 * first find the write pointer on the device, then we pad all necessary
 * sectors, and finally attempt to read the valid data.
 *
 * The line is read from its current write pointer onwards in chunks sized by
 * pblk_calc_secs(), reusing the request and buffers in @p. When a chunk comes
 * back with NVM_RSP_ERR_EMPTYPAGE the failed sectors are rolled back, the line
 * is padded (pblk_recov_pad_oob) and the pending range is read again
 * (pblk_recov_read_oob); failures of those two steps are logged but not
 * propagated.
 *
 * Returns a negative error when a bio cannot be mapped or a submission fails;
 * otherwise the (zero) result of the last submission.
 */
static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line,
				   struct pblk_recov_alloc p)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct ppa_addr *ppa_list;
	struct pblk_sec_meta *meta_list;
	struct nvm_rq *rqd;
	struct bio *bio;
	void *data;
	dma_addr_t dma_ppa_list, dma_meta_list;
	u64 w_ptr = 0, r_ptr;
	int rq_ppas, rq_len;
	int i, j;
	int ret = 0;
	int rec_round;
	int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec;
	DECLARE_COMPLETION_ONSTACK(wait);

	ppa_list = p.ppa_list;
	meta_list = p.meta_list;
	rqd = p.rqd;
	data = p.data;
	dma_ppa_list = p.dma_ppa_list;
	dma_meta_list = p.dma_meta_list;

	/* we could recover up until the line write pointer */
	r_ptr = line->cur_sec;
	rec_round = 0;

next_rq:
	memset(rqd, 0, pblk_r_rq_size);

	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
	if (!rq_ppas)
		rq_ppas = pblk->min_write_pgs;
	rq_len = rq_ppas * geo->sec_size;

	bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
	if (IS_ERR(bio))
		return PTR_ERR(bio);

	bio->bi_iter.bi_sector = 0; /* internal bio */
	bio_set_op_attrs(bio, REQ_OP_READ, 0);

	rqd->bio = bio;
	rqd->opcode = NVM_OP_PREAD;
	rqd->flags = pblk_set_read_mode(pblk);
	rqd->meta_list = meta_list;
	rqd->nr_ppas = rq_ppas;
	rqd->ppa_list = ppa_list;
	rqd->dma_ppa_list = dma_ppa_list;
	rqd->dma_meta_list = dma_meta_list;
	rqd->end_io = pblk_end_io_sync;
	rqd->private = &wait;

	for (i = 0; i < rqd->nr_ppas; ) {
		struct ppa_addr ppa;
		int pos;

		w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
		ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
		pos = pblk_dev_ppa_to_pos(geo, ppa);

		/* Skip over addresses that land on a bad block */
		while (test_bit(pos, line->blk_bitmap)) {
			w_ptr += pblk->min_write_pgs;
			ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
			pos = pblk_dev_ppa_to_pos(geo, ppa);
		}

		for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++)
			rqd->ppa_list[i] =
				addr_to_gen_ppa(pblk, w_ptr, line->id);
	}

	ret = pblk_submit_io(pblk, rqd);
	if (ret) {
		pr_err("pblk: I/O submission failed: %d\n", ret);
		/* The bio was never issued; drop it as the OOB scan does */
		bio_put(bio);
		return ret;
	}

	if (!wait_for_completion_io_timeout(&wait,
				msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
		pr_err("pblk: L2P recovery read timed out\n");
	}
	reinit_completion(&wait);

	/* This should not happen since the read failed during normal recovery,
	 * but the media works funny sometimes...
	 */
	if (!rec_round++ && !rqd->error) {
		rec_round = 0;
		for (i = 0; i < rqd->nr_ppas; i++, r_ptr++) {
			u64 lba = le64_to_cpu(meta_list[i].lba);

			/* Valid lbas are strictly below the exposed capacity */
			if (lba == ADDR_EMPTY || lba >= pblk->rl.nr_secs)
				continue;

			pblk_update_map(pblk, lba, rqd->ppa_list[i]);
		}
	}

	/* Reached the end of the written line */
	if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) {
		int pad_secs, nr_error_bits, bit;
		/* Kept separate from ret: these failures are only logged */
		int rec_ret;

		bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas);
		nr_error_bits = rqd->nr_ppas - bit;

		/* Roll back failed sectors */
		line->cur_sec -= nr_error_bits;
		line->left_msecs += nr_error_bits;
		bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits);

		pad_secs = pblk_pad_distance(pblk);
		if (pad_secs > line->left_msecs)
			pad_secs = line->left_msecs;

		rec_ret = pblk_recov_pad_oob(pblk, line, p, pad_secs);
		if (rec_ret)
			pr_err("pblk: OOB padding failed (err:%d)\n", rec_ret);

		rec_ret = pblk_recov_read_oob(pblk, line, p, r_ptr);
		if (rec_ret)
			pr_err("pblk: OOB read failed (err:%d)\n", rec_ret);

		line->left_ssecs = line->left_msecs;
		left_ppas = 0;
	}

	left_ppas -= rq_ppas;
	if (left_ppas > 0)
		goto next_rq;

	return ret;
}
/*
 * First-pass OOB scan of @line: read every usable sector from the start of
 * the line and rebuild the L2P entries from the lbas stored in the
 * out-of-band metadata.
 *
 * Sectors are allocated from the line in units of pblk->min_write_pgs,
 * skipping bad blocks, and read in chunks sized by pblk_calc_secs(), reusing
 * the request and buffers in @p. The scan stops at the first chunk that
 * returns an error: the sectors from the first failed one onwards are rolled
 * back in the line and only the sectors before it are mapped.
 *
 * @done is set to 1 on entry and cleared when the scan stopped on an error
 * other than NVM_RSP_ERR_EMPTYPAGE.
 *
 * Returns a negative error when a bio cannot be mapped or a submission
 * fails; otherwise the (zero) result of the last submission.
 */
static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
			       struct pblk_recov_alloc p, int *done)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct ppa_addr *ppa_list;
	struct pblk_sec_meta *meta_list;
	struct nvm_rq *rqd;
	struct bio *bio;
	void *data;
	dma_addr_t dma_ppa_list, dma_meta_list;
	u64 paddr;
	int rq_ppas, rq_len;
	int i, j;
	int ret = 0;
	int left_ppas = pblk_calc_sec_in_line(pblk, line);
	DECLARE_COMPLETION_ONSTACK(wait);

	ppa_list = p.ppa_list;
	meta_list = p.meta_list;
	rqd = p.rqd;
	data = p.data;
	dma_ppa_list = p.dma_ppa_list;
	dma_meta_list = p.dma_meta_list;

	*done = 1;

next_rq:
	memset(rqd, 0, pblk_r_rq_size);

	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
	if (!rq_ppas)
		rq_ppas = pblk->min_write_pgs;
	rq_len = rq_ppas * geo->sec_size;

	bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
	if (IS_ERR(bio))
		return PTR_ERR(bio);

	bio->bi_iter.bi_sector = 0; /* internal bio */
	bio_set_op_attrs(bio, REQ_OP_READ, 0);

	rqd->bio = bio;
	rqd->opcode = NVM_OP_PREAD;
	rqd->flags = pblk_set_read_mode(pblk);
	rqd->meta_list = meta_list;
	rqd->nr_ppas = rq_ppas;
	rqd->ppa_list = ppa_list;
	rqd->dma_ppa_list = dma_ppa_list;
	rqd->dma_meta_list = dma_meta_list;
	rqd->end_io = pblk_end_io_sync;
	rqd->private = &wait;

	for (i = 0; i < rqd->nr_ppas; ) {
		struct ppa_addr ppa;
		int pos;

		paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
		ppa = addr_to_gen_ppa(pblk, paddr, line->id);
		pos = pblk_dev_ppa_to_pos(geo, ppa);

		/* Skip over addresses that land on a bad block */
		while (test_bit(pos, line->blk_bitmap)) {
			paddr += pblk->min_write_pgs;
			ppa = addr_to_gen_ppa(pblk, paddr, line->id);
			pos = pblk_dev_ppa_to_pos(geo, ppa);
		}

		for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++)
			rqd->ppa_list[i] =
				addr_to_gen_ppa(pblk, paddr, line->id);
	}

	ret = pblk_submit_io(pblk, rqd);
	if (ret) {
		pr_err("pblk: I/O submission failed: %d\n", ret);
		bio_put(bio);
		return ret;
	}

	if (!wait_for_completion_io_timeout(&wait,
				msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
		pr_err("pblk: L2P recovery read timed out\n");
	}
	reinit_completion(&wait);

	/* Reached the end of the written line */
	if (rqd->error) {
		int nr_error_bits, bit;

		bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas);
		nr_error_bits = rqd->nr_ppas - bit;

		/* Roll back failed sectors */
		line->cur_sec -= nr_error_bits;
		line->left_msecs += nr_error_bits;
		line->left_ssecs = line->left_msecs;
		bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits);

		/* Only the sectors before the first failure are mapped below */
		left_ppas = 0;
		rqd->nr_ppas = bit;

		if (rqd->error != NVM_RSP_ERR_EMPTYPAGE)
			*done = 0;
	}

	for (i = 0; i < rqd->nr_ppas; i++) {
		u64 lba = le64_to_cpu(meta_list[i].lba);

		/* Valid lbas are strictly below the exposed capacity */
		if (lba == ADDR_EMPTY || lba >= pblk->rl.nr_secs)
			continue;

		pblk_update_map(pblk, lba, rqd->ppa_list[i]);
	}

	left_ppas -= rq_ppas;
	if (left_ppas > 0)
		goto next_rq;

	return ret;
}
/*
 * Recover the L2P entries of @line by scanning the lbas stored in the
 * out-of-band (OOB) area of its sectors.
 *
 * Allocates the resources shared by the scan helpers (a read request, one
 * DMA buffer holding the metadata list followed by the ppa list, and a data
 * buffer of max_write_pgs sectors), runs the regular OOB scan and, if that
 * one did not finish, the full scan. A line that ends up full is closed.
 *
 * Returns 0 on success or a negative error; all resources are released
 * before returning.
 */
static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_recov_alloc p;
	int done, ret;

	p.rqd = pblk_alloc_rqd(pblk, READ);
	if (IS_ERR(p.rqd))
		return PTR_ERR(p.rqd);

	p.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
					&p.dma_meta_list);
	if (!p.meta_list) {
		ret = -ENOMEM;
		goto free_rqd;
	}

	/* The ppa list lives in the same DMA buffer, after the metadata */
	p.ppa_list = (void *)(p.meta_list) + pblk_dma_meta_size;
	p.dma_ppa_list = p.dma_meta_list + pblk_dma_meta_size;

	p.data = kcalloc(pblk->max_write_pgs, geo->sec_size, GFP_KERNEL);
	if (!p.data) {
		ret = -ENOMEM;
		goto free_meta_list;
	}

	/* Fall back to the full scan only if the first pass did not finish */
	ret = pblk_recov_scan_oob(pblk, line, p, &done);
	if (!ret && !done)
		ret = pblk_recov_scan_all_oob(pblk, line, p);
	if (ret) {
		pr_err("pblk: could not recover L2P from OOB\n");
		goto out;
	}

	if (pblk_line_is_full(line))
		pblk_line_recov_close(pblk, line);

out:
	kfree(p.data);
free_meta_list:
	nvm_dev_dma_free(dev->parent, p.meta_list, p.dma_meta_list);
free_rqd:
	pblk_free_rqd(pblk, p.rqd, READ);

	return ret;
}
/* Insert lines ordered by sequence number (seq_num) on list */
static void pblk_recov_line_add_ordered(struct list_head *head,
struct pblk_line *line)
{
struct pblk_line *t = NULL;
list_for_each_entry(t, head, list)
if (t->seq_nr > line->seq_nr)
break;
__list_add(&line->list, t->list.prev, &t->list);
}
/*
 * Scan-based recovery of the L2P table.
 *
 * 1. Reserve one metadata slot (smeta/emeta buffers) from l_mg->meta_bitmap.
 * 2. Read the start metadata (smeta) of every line. Lines whose smeta cannot
 *    be read, fails the CRC/magic check, or carries a different instance
 *    uuid are skipped; the rest are queued on a local list ordered by
 *    sequence number.
 * 3. For every queued line, recover its mappings from the end metadata
 *    (emeta), falling back to the OOB scan when emeta cannot be read or used.
 *    Full lines are closed and moved to a GC list; a line that is not full is
 *    remembered as the open data line.
 * 4. If no open line was found the metadata slot is released and a new data
 *    line is set up; otherwise the next data line is pre-allocated and erased.
 *
 * Returns the open data line (NULL when none was found or nothing was
 * recovered) or ERR_PTR(-EINVAL) when a line with an incompatible on-media
 * version is encountered.
 */
struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line *line, *tline, *data_line = NULL;
	struct line_smeta *smeta;
	struct line_emeta *emeta;
	int found_lines = 0, recovered_lines = 0, open_lines = 0;
	int is_next = 0;
	int meta_line;
	int i, valid_uuid = 0;
	LIST_HEAD(recov_list);

	/* TODO: Implement FTL snapshot */

	/* Scan recovery - takes place when FTL snapshot fails */
	spin_lock(&l_mg->free_lock);
	/*
	 * NOTE(review): the result is not checked against PBLK_DATA_LINES;
	 * presumably a free slot always exists at this point - confirm.
	 */
	meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
	set_bit(meta_line, &l_mg->meta_bitmap);
	smeta = l_mg->sline_meta[meta_line].meta;
	emeta = l_mg->eline_meta[meta_line].meta;
	spin_unlock(&l_mg->free_lock);

	/* Order data lines using their sequence number */
	for (i = 0; i < l_mg->nr_lines; i++) {
		u32 crc;

		line = &pblk->lines[i];

		/* The same smeta buffer is reused for every line */
		memset(smeta, 0, lm->smeta_len);
		line->smeta = smeta;
		line->lun_bitmap = ((void *)(smeta)) +
						sizeof(struct line_smeta);

		/* Lines that cannot be read are assumed as not written here */
		if (pblk_line_read_smeta(pblk, line))
			continue;

		crc = pblk_calc_smeta_crc(pblk, smeta);
		if (le32_to_cpu(smeta->crc) != crc)
			continue;

		if (le32_to_cpu(smeta->header.identifier) != PBLK_MAGIC)
			continue;

		if (le16_to_cpu(smeta->header.version) != 1) {
			/* Convert before printing: the field is little-endian */
			pr_err("pblk: found incompatible line version %u\n",
					le16_to_cpu(smeta->header.version));
			/*
			 * NOTE(review): the reserved meta_line bit and lines
			 * already queued are not released on this path.
			 */
			return ERR_PTR(-EINVAL);
		}

		/* The first valid instance uuid is used for initialization */
		if (!valid_uuid) {
			memcpy(pblk->instance_uuid, smeta->header.uuid, 16);
			valid_uuid = 1;
		}

		if (memcmp(pblk->instance_uuid, smeta->header.uuid, 16)) {
			pr_debug("pblk: ignore line %u due to uuid mismatch\n",
					i);
			continue;
		}

		/* Update line metadata */
		spin_lock(&line->lock);
		line->id = le32_to_cpu(line->smeta->header.id);
		line->type = le16_to_cpu(line->smeta->header.type);
		line->seq_nr = le64_to_cpu(line->smeta->seq_nr);
		spin_unlock(&line->lock);

		/* Update general metadata */
		spin_lock(&l_mg->free_lock);
		if (line->seq_nr >= l_mg->d_seq_nr)
			l_mg->d_seq_nr = line->seq_nr + 1;
		l_mg->nr_free_lines--;
		spin_unlock(&l_mg->free_lock);

		if (pblk_line_recov_alloc(pblk, line))
			goto out;

		pblk_recov_line_add_ordered(&recov_list, line);
		found_lines++;
		/* Print the CPU-endian copy rather than the raw le64 field */
		pr_debug("pblk: recovering data line %d, seq:%llu\n",
				line->id, (unsigned long long)line->seq_nr);
	}

	if (!found_lines) {
		/* Nothing on media: start a fresh instance */
		pblk_setup_uuid(pblk);

		spin_lock(&l_mg->free_lock);
		WARN_ON_ONCE(!test_and_clear_bit(meta_line,
							&l_mg->meta_bitmap));
		spin_unlock(&l_mg->free_lock);

		goto out;
	}

	/* Verify closed blocks and recover this portion of L2P table*/
	list_for_each_entry_safe(line, tline, &recov_list, list) {
		int off, nr_bb;

		recovered_lines++;
		/* Calculate where emeta starts based on the line bb */
		off = lm->sec_per_line - lm->emeta_sec;
		nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
		off -= nr_bb * geo->sec_per_pl;

		memset(emeta, 0, lm->emeta_len);
		line->emeta = emeta;
		line->emeta_ssec = off;

		if (pblk_line_read_emeta(pblk, line)) {
			pblk_recov_l2p_from_oob(pblk, line);
			goto next;
		}

		if (pblk_recov_l2p_from_emeta(pblk, line))
			pblk_recov_l2p_from_oob(pblk, line);

next:
		if (pblk_line_is_full(line)) {
			struct list_head *move_list;

			spin_lock(&line->lock);
			line->state = PBLK_LINESTATE_CLOSED;
			move_list = pblk_line_gc_list(pblk, line);
			spin_unlock(&line->lock);

			spin_lock(&l_mg->gc_lock);
			list_move_tail(&line->list, move_list);
			spin_unlock(&l_mg->gc_lock);

			mempool_free(line->map_bitmap, pblk->line_meta_pool);
			line->map_bitmap = NULL;
			line->smeta = NULL;
			line->emeta = NULL;
		} else {
			/*
			 * NOTE(review): the counter is tested before being
			 * incremented, so the message is emitted from the
			 * third open line onwards - confirm this is intended.
			 */
			if (open_lines > 1)
				pr_err("pblk: failed to recover L2P\n");

			open_lines++;
			line->meta_line = meta_line;
			data_line = line;
		}
	}

	spin_lock(&l_mg->free_lock);
	if (!open_lines) {
		WARN_ON_ONCE(!test_and_clear_bit(meta_line,
							&l_mg->meta_bitmap));
		pblk_line_replace_data(pblk);
	} else {
		/* Allocate next line for preparation */
		l_mg->data_next = pblk_line_get(pblk);
		if (l_mg->data_next) {
			l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
			l_mg->data_next->type = PBLK_LINETYPE_DATA;
			is_next = 1;
		}
	}
	spin_unlock(&l_mg->free_lock);

	/* Erase outside of the free_lock critical section */
	if (is_next) {
		pblk_line_erase(pblk, l_mg->data_next);
		pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
	}

out:
	if (found_lines != recovered_lines)
		pr_err("pblk: failed to recover all found lines %d/%d\n",
						found_lines, recovered_lines);

	return data_line;
}
/*
 * Pad the current data line: the remaining sectors (line->left_msecs) are
 * handed to pblk_recov_pad_oob() and, when that succeeds, the line is closed.
 * Allocation failures make the function return silently; a padding failure
 * is logged.
 */
void pblk_recov_pad(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_recov_alloc p;
	struct pblk_sec_meta *meta_list;
	dma_addr_t dma_meta_list;
	struct pblk_line *line;
	struct nvm_rq *rqd;
	void *data;

	spin_lock(&l_mg->free_lock);
	line = l_mg->data_line;
	spin_unlock(&l_mg->free_lock);

	rqd = pblk_alloc_rqd(pblk, READ);
	if (IS_ERR(rqd))
		return;

	meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list);
	if (!meta_list)
		goto free_rqd;

	data = kcalloc(pblk->max_write_pgs, geo->sec_size, GFP_KERNEL);
	if (!data)
		goto free_meta_list;

	/* The ppa list shares the DMA buffer, placed after the metadata */
	p.ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
	p.meta_list = meta_list;
	p.rqd = rqd;
	p.data = data;
	p.dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
	p.dma_meta_list = dma_meta_list;

	if (!pblk_recov_pad_oob(pblk, line, p, line->left_msecs))
		pblk_line_close(pblk, line);
	else
		pr_err("pblk: Tear down padding failed\n");

	kfree(data);
free_meta_list:
	nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list);
free_rqd:
	pblk_free_rqd(pblk, rqd, READ);
}

View File

@ -0,0 +1,182 @@
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-rl.c - pblk's rate limiter for user I/O
*
*/
#include "pblk.h"
/* (Re)arm the user-activity timer so that it expires 5000 ms from now. */
static void pblk_rl_kick_u_timer(struct pblk_rl *rl)
{
	unsigned long expires = jiffies + msecs_to_jiffies(5000);

	mod_timer(&rl->u_timer, expires);
}
/*
 * Returns non-zero when adding @nr_entries user entries keeps the user
 * counter within rb_user_max, zero otherwise.
 */
int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries)
{
	int in_buffer = atomic_read(&rl->rb_user_cnt);

	return in_buffer + nr_entries <= rl->rb_user_max;
}
/*
 * Returns non-zero when GC may add @nr_entries to the write buffer: either
 * the GC counter stays within rb_gc_max, or no user I/O is currently flagged
 * as active, in which case GC is allowed to take over buffer space.
 */
int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries)
{
	int in_buffer = atomic_read(&rl->rb_gc_cnt);
	int user_active = READ_ONCE(rl->rb_user_active);

	return in_buffer + nr_entries <= rl->rb_gc_max || !user_active;
}
/*
 * Account @nr_entries user entries entering the write buffer, flag user I/O
 * as active (store-release) and re-arm the user-activity timer.
 */
void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries)
{
	atomic_add(nr_entries, &rl->rb_user_cnt);

	/* Mark user I/O as active; read by the GC admission check */
	smp_store_release(&rl->rb_user_active, 1);

	pblk_rl_kick_u_timer(rl);
}
void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries)
{
atomic_add(nr_entries, &rl->rb_gc_cnt);
}
void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc)
{
atomic_sub(nr_user, &rl->rb_user_cnt);
atomic_sub(nr_gc, &rl->rb_gc_cnt);
}
/* Returns the current value of the rate limiter's free block counter. */
unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl)
{
	unsigned long nr_free = atomic_read(&rl->free_blocks);

	return nr_free;
}
/*
 * Recompute the write buffer split between user and GC I/O from the total
 * number of free blocks in the instance.
 *
 * At or above the 'high' threshold GC only gets its reserved share and the
 * user gets the rest of @max. Below it, the user share shrinks with the
 * number of free blocks and GC receives what is left, but never less than
 * its reserve.
 *
 * Returns the resulting rate limiter state (PBLK_RL_HIGH/MID/LOW).
 */
static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max)
{
	unsigned long free_blocks = pblk_rl_nr_free_blks(rl);

	if (free_blocks >= rl->high) {
		rl->rb_user_max = max - rl->rb_gc_rsv;
		rl->rb_gc_max = rl->rb_gc_rsv;
		rl->rb_state = PBLK_RL_HIGH;
	} else {
		int shift = rl->high_pw - rl->rb_windows_pw;
		int user_windows = free_blocks >> shift;
		int user_max = user_windows << PBLK_MAX_REQ_ADDRS_PW;
		int gc_max;

		rl->rb_user_max = user_max;
		gc_max = max - rl->rb_user_max;
		/* 'max' the parameter vs. max() the macro: only the latter expands */
		rl->rb_gc_max = max(gc_max, rl->rb_gc_rsv);

		rl->rb_state = (free_blocks > rl->low) ? PBLK_RL_MID :
							 PBLK_RL_LOW;
	}

	return rl->rb_state;
}
/* Set both the GC reserve and the current GC maximum to @rsv. */
void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv)
{
	rl->rb_gc_max = rsv;
	rl->rb_gc_rsv = rl->rb_gc_max;
}
/*
 * Account the blocks of @line as free, refresh the user/GC rates and start
 * or stop GC according to the resulting rate limiter state.
 */
void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
{
	struct pblk *pblk = container_of(rl, struct pblk, rl);
	int ret;

	atomic_add(line->blk_in_line, &rl->free_blocks);
	/* Rates will not change that often - no need to lock update */
	ret = pblk_rl_update_rates(rl, rl->rb_budget);

	/*
	 * The state is a single value, so it must be compared against each
	 * candidate; comparing with (MID | LOW) can never match both.
	 */
	if (ret == PBLK_RL_MID || ret == PBLK_RL_LOW)
		pblk_gc_should_start(pblk);
	else
		pblk_gc_should_stop(pblk);
}
/*
 * Remove the blocks of @line from the free block count, refresh the user/GC
 * rates and start or stop GC according to the resulting rate limiter state.
 */
void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line)
{
	struct pblk *pblk = container_of(rl, struct pblk, rl);
	int ret;

	atomic_sub(line->blk_in_line, &rl->free_blocks);

	/* Rates will not change that often - no need to lock update */
	ret = pblk_rl_update_rates(rl, rl->rb_budget);

	/*
	 * The state is a single value, so it must be compared against each
	 * candidate; comparing with (MID | LOW) can never match both.
	 */
	if (ret == PBLK_RL_MID || ret == PBLK_RL_LOW)
		pblk_gc_should_start(pblk);
	else
		pblk_gc_should_stop(pblk);
}
/* Returns the 'high' free-block threshold of the rate limiter. */
int pblk_rl_gc_thrs(struct pblk_rl *rl)
{
	int thrs = rl->high;

	return thrs;
}
/* Returns the current maximum number of user entries (rb_user_max). */
int pblk_rl_sysfs_rate_show(struct pblk_rl *rl)
{
	int user_max = rl->rb_user_max;

	return user_max;
}
/*
 * Timer callback: @data carries the rate limiter. Clears the user-activity
 * flag (store-release) once the timer armed by user writes expires.
 */
static void pblk_rl_u_timer(unsigned long data)
{
	struct pblk_rl *rl = (struct pblk_rl *)data;

	/* No user write re-armed the timer: flag user I/O as idle */
	smp_store_release(&rl->rb_user_active, 0);
}
/*
 * Tear down the rate limiter by deleting its user-activity timer.
 * NOTE(review): del_timer() does not wait for a running callback - confirm
 * that is acceptable for the callers.
 */
void pblk_rl_free(struct pblk_rl *rl)
{
	struct timer_list *u_timer = &rl->u_timer;

	del_timer(u_timer);
}
/*
 * Initialise the rate limiter for a write buffer of @budget entries.
 * rl->total_blocks is read here and is therefore expected to be set by the
 * caller beforehand.
 */
void pblk_rl_init(struct pblk_rl *rl, int budget)
{
	unsigned int rb_windows;

	/* Free-block thresholds and the order of the high threshold */
	rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS;
	rl->low = rl->total_blocks / PBLK_USER_LOW_THRS;
	rl->high_pw = get_count_order(rl->high);

	/* This will always be a power-of-2 */
	rb_windows = budget / PBLK_MAX_REQ_ADDRS;
	rl->rb_windows_pw = get_count_order(rb_windows) + 1;

	/* To start with, all buffer is available to user I/O writers */
	rl->rb_budget = budget;
	rl->rb_user_max = budget;
	rl->rb_gc_max = 0;
	rl->rb_state = PBLK_RL_HIGH;

	atomic_set(&rl->rb_user_cnt, 0);
	atomic_set(&rl->rb_gc_cnt, 0);

	setup_timer(&rl->u_timer, pblk_rl_u_timer, (unsigned long)rl);
	rl->rb_user_active = 0;
}

View File

@ -0,0 +1,507 @@
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Implementation of a physical block-device target for Open-channel SSDs.
*
* pblk-sysfs.c - pblk's sysfs
*
*/
#include "pblk.h"
/*
 * sysfs: print one line per LUN with its position, channel, LUN id and an
 * "active" flag. A LUN is reported as inactive (0) when its write semaphore
 * can be taken without blocking, and as active (1) otherwise.
 *
 * Returns the number of bytes written to @page, never more than
 * PAGE_SIZE - 1; output that does not fit is truncated.
 */
static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_lun *rlun;
	ssize_t sz = 0;
	int i;

	for (i = 0; i < geo->nr_luns; i++) {
		size_t room = PAGE_SIZE - sz;
		int active = 1;
		int len;

		rlun = &pblk->luns[i];
		if (!down_trylock(&rlun->wr_sem)) {
			active = 0;
			up(&rlun->wr_sem);
		}

		len = snprintf(page + sz, room,
				"pblk: pos:%d, ch:%d, lun:%d - %d\n",
					i,
					rlun->bppa.g.ch,
					rlun->bppa.g.lun,
					active);

		/*
		 * snprintf() returns the length it would have written. Stop
		 * on truncation so that sz never exceeds the page and the
		 * next size argument cannot underflow.
		 */
		if ((size_t)len >= room) {
			sz = PAGE_SIZE - 1;
			break;
		}
		sz += len;
	}

	return sz;
}
static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
int free_blocks, total_blocks;
int rb_user_max, rb_user_cnt;
int rb_gc_max, rb_gc_rsv, rb_gc_cnt, rb_budget, rb_state;
free_blocks = atomic_read(&pblk->rl.free_blocks);
rb_user_max = pblk->rl.rb_user_max;
rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt);
rb_gc_max = pblk->rl.rb_gc_max;
rb_gc_rsv = pblk->rl.rb_gc_rsv;
rb_gc_cnt = atomic_read(&pblk->rl.rb_gc_cnt);
rb_budget = pblk->rl.rb_budget;
rb_state = pblk->rl.rb_state;
total_blocks = geo->blks_per_lun * geo->nr_luns;
return snprintf(page, PAGE_SIZE,
"u:%u/%u,gc:%u/%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n",
rb_user_cnt,
rb_user_max,
rb_gc_cnt,
rb_gc_max,
rb_gc_rsv,
rb_state,
rb_budget,
pblk->rl.low,
pblk->rl.high,
free_blocks,
total_blocks,
READ_ONCE(pblk->rl.rb_user_active));
}
static ssize_t pblk_sysfs_gc_state_show(struct pblk *pblk, char *page)
{
int gc_enabled, gc_active;
pblk_gc_sysfs_state_show(pblk, &gc_enabled, &gc_active);
return snprintf(page, PAGE_SIZE, "gc_enabled=%d, gc_active=%d\n",
gc_enabled, gc_active);
}
/* sysfs: print the read/write/erase error counters of the instance. */
static ssize_t pblk_sysfs_stats(struct pblk *pblk, char *page)
{
	return snprintf(page, PAGE_SIZE,
			"read_failed=%lu, read_high_ecc=%lu, read_empty=%lu, read_failed_gc=%lu, write_failed=%lu, erase_failed=%lu\n",
			atomic_long_read(&pblk->read_failed),
			atomic_long_read(&pblk->read_high_ecc),
			atomic_long_read(&pblk->read_empty),
			atomic_long_read(&pblk->read_failed_gc),
			atomic_long_read(&pblk->write_failed),
			atomic_long_read(&pblk->erase_failed));
}
/* sysfs: delegate to the ring buffer, which formats its own state. */
static ssize_t pblk_sysfs_write_buffer(struct pblk *pblk, char *page)
{
	struct pblk_rb *rb = &pblk->rwb;

	return pblk_rb_sysfs(rb, page);
}
/*
 * sysfs: print the physical address format as offset/length pairs per field.
 * The "g:" line uses pblk's own offsets (pblk->ppaf) with the device field
 * lengths; the "d:" line prints the device geometry format (geo->ppaf).
 */
static ssize_t pblk_sysfs_ppaf(struct pblk *pblk, char *page)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	ssize_t sz;

	sz = snprintf(page, PAGE_SIZE,
		"g:(b:%d)blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
		pblk->ppaf_bitsize,
		pblk->ppaf.blk_offset, geo->ppaf.blk_len,
		pblk->ppaf.pg_offset, geo->ppaf.pg_len,
		pblk->ppaf.lun_offset, geo->ppaf.lun_len,
		pblk->ppaf.ch_offset, geo->ppaf.ch_len,
		pblk->ppaf.pln_offset, geo->ppaf.pln_len,
		pblk->ppaf.sec_offset, geo->ppaf.sect_len);

	sz += snprintf(page + sz, PAGE_SIZE - sz,
		"d:blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
		geo->ppaf.blk_offset, geo->ppaf.blk_len,
		geo->ppaf.pg_offset, geo->ppaf.pg_len,
		geo->ppaf.lun_offset, geo->ppaf.lun_len,
		geo->ppaf.ch_offset, geo->ppaf.ch_len,
		geo->ppaf.pln_offset, geo->ppaf.pln_len,
		geo->ppaf.sect_offset, geo->ppaf.sect_len);

	return sz;
}
/*
 * sysfs: print line management statistics in four text lines:
 *   - line geometry (LUNs, blocks and sectors per line),
 *   - current data/log line ids and counts of free, bad, corrupt and closed
 *     lines (closed lines are also split by data/log type),
 *   - number of lines on each GC list plus the inflight GC counter,
 *   - write state of the current data line.
 *
 * Returns the number of bytes formatted into @page.
 */
static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line *line;
ssize_t sz = 0;
int nr_free_lines;
int cur_data, cur_log;
int free_line_cnt = 0, closed_line_cnt = 0;
int d_line_cnt = 0, l_line_cnt = 0;
int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0;
int free = 0, bad = 0, cor = 0;
int msecs = 0, ssecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0;
int map_weight = 0, meta_weight = 0;
/* Snapshot current line ids (-1 when unset) and count the free list */
spin_lock(&l_mg->free_lock);
cur_data = (l_mg->data_line) ? l_mg->data_line->id : -1;
cur_log = (l_mg->log_line) ? l_mg->log_line->id : -1;
nr_free_lines = l_mg->nr_free_lines;
list_for_each_entry(line, &l_mg->free_list, list)
free_line_cnt++;
spin_unlock(&l_mg->free_lock);
/* Every line found on a GC list counts as closed, and by its type */
spin_lock(&l_mg->gc_lock);
list_for_each_entry(line, &l_mg->gc_full_list, list) {
if (line->type == PBLK_LINETYPE_DATA)
d_line_cnt++;
else if (line->type == PBLK_LINETYPE_LOG)
l_line_cnt++;
closed_line_cnt++;
gc_full++;
}
list_for_each_entry(line, &l_mg->gc_high_list, list) {
if (line->type == PBLK_LINETYPE_DATA)
d_line_cnt++;
else if (line->type == PBLK_LINETYPE_LOG)
l_line_cnt++;
closed_line_cnt++;
gc_high++;
}
list_for_each_entry(line, &l_mg->gc_mid_list, list) {
if (line->type == PBLK_LINETYPE_DATA)
d_line_cnt++;
else if (line->type == PBLK_LINETYPE_LOG)
l_line_cnt++;
closed_line_cnt++;
gc_mid++;
}
list_for_each_entry(line, &l_mg->gc_low_list, list) {
if (line->type == PBLK_LINETYPE_DATA)
d_line_cnt++;
else if (line->type == PBLK_LINETYPE_LOG)
l_line_cnt++;
closed_line_cnt++;
gc_low++;
}
list_for_each_entry(line, &l_mg->gc_empty_list, list) {
if (line->type == PBLK_LINETYPE_DATA)
d_line_cnt++;
else if (line->type == PBLK_LINETYPE_LOG)
l_line_cnt++;
closed_line_cnt++;
gc_empty++;
}
/*
 * NOTE(review): the free list is walked a second time here, now under
 * gc_lock instead of free_lock - confirm which lock protects it and the
 * bad/corrupt lists.
 */
list_for_each_entry(line, &l_mg->free_list, list)
free++;
list_for_each_entry(line, &l_mg->bad_list, list)
bad++;
list_for_each_entry(line, &l_mg->corrupt_list, list)
cor++;
spin_unlock(&l_mg->gc_lock);
/* Write state of the current data line; values stay 0 when there is none */
spin_lock(&l_mg->free_lock);
if (l_mg->data_line) {
cur_sec = l_mg->data_line->cur_sec;
msecs = l_mg->data_line->left_msecs;
ssecs = l_mg->data_line->left_ssecs;
vsc = l_mg->data_line->vsc;
sec_in_line = l_mg->data_line->sec_in_line;
meta_weight = bitmap_weight(&l_mg->meta_bitmap,
PBLK_DATA_LINES);
map_weight = bitmap_weight(l_mg->data_line->map_bitmap,
lm->sec_per_line);
}
spin_unlock(&l_mg->free_lock);
/* The free counter and the free list length were sampled together */
if (nr_free_lines != free_line_cnt)
pr_err("pblk: corrupted free line list\n");
/* sz is still 0 here, so the first call gets the whole page */
sz = snprintf(page, PAGE_SIZE - sz,
"line: nluns:%d, nblks:%d, nsecs:%d\n",
geo->nr_luns, lm->blk_per_line, lm->sec_per_line);
sz += snprintf(page + sz, PAGE_SIZE - sz,
"lines:d:%d,l:%d-f:%d(%d),b:%d,co:%d,c:%d(d:%d,l:%d)t:%d\n",
cur_data, cur_log,
free, nr_free_lines, bad, cor,
closed_line_cnt,
d_line_cnt, l_line_cnt,
l_mg->nr_lines);
sz += snprintf(page + sz, PAGE_SIZE - sz,
"GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n",
gc_full, gc_high, gc_mid, gc_low, gc_empty,
atomic_read(&pblk->gc.inflight_gc));
sz += snprintf(page + sz, PAGE_SIZE - sz,
"data (%d) cur:%d, left:%d/%d, vsc:%d, s:%d, map:%d/%d (%d)\n",
cur_data, cur_sec, msecs, ssecs, vsc, sec_in_line,
map_weight, lm->sec_per_line, meta_weight);
return sz;
}
/*
 * sysfs: print the static line layout - smeta/emeta lengths and sector
 * counts, bitmap lengths, and blocks/sectors per line and per block.
 */
static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	ssize_t sz;

	sz = snprintf(page, PAGE_SIZE,
				"smeta - len:%d, secs:%d\n",
					lm->smeta_len, lm->smeta_sec);

	sz += snprintf(page + sz, PAGE_SIZE - sz,
				"emeta - len:%d, sec:%d, bb_start:%d\n",
					lm->emeta_len, lm->emeta_sec,
					lm->emeta_bb);

	sz += snprintf(page + sz, PAGE_SIZE - sz,
				"bitmap lengths: sec:%d, blk:%d, lun:%d\n",
					lm->sec_bitmap_len,
					lm->blk_bitmap_len,
					lm->lun_bitmap_len);

	sz += snprintf(page + sz, PAGE_SIZE - sz,
				"blk_line:%d, sec_line:%d, sec_blk:%d\n",
					lm->blk_per_line,
					lm->sec_per_line,
					geo->sec_per_blk);

	return sz;
}
#ifdef CONFIG_NVM_DEBUG
/*
 * sysfs (debug builds only): print the thirteen debug counters as one
 * tab-separated line, in the order of the argument list below.
 */
static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page)
{
	ssize_t sz;

	sz = snprintf(page, PAGE_SIZE,
		"%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\n",
			atomic_long_read(&pblk->inflight_writes),
			atomic_long_read(&pblk->inflight_reads),
			atomic_long_read(&pblk->req_writes),
			atomic_long_read(&pblk->nr_flush),
			atomic_long_read(&pblk->padded_writes),
			atomic_long_read(&pblk->padded_wb),
			atomic_long_read(&pblk->sub_writes),
			atomic_long_read(&pblk->sync_writes),
			atomic_long_read(&pblk->compl_writes),
			atomic_long_read(&pblk->recov_writes),
			atomic_long_read(&pblk->recov_gc_writes),
			atomic_long_read(&pblk->recov_gc_reads),
			atomic_long_read(&pblk->sync_reads));

	return sz;
}
#endif
/*
 * sysfs store: parse an unsigned number from @page and install it as the GC
 * reserve of the rate limiter (under the GC lock).
 *
 * The input must contain a newline within the first @len bytes. Returns @len
 * on success and -EINVAL when the input is malformed or the value does not
 * fit the int taken by pblk_rl_set_gc_rsc().
 */
static ssize_t pblk_sysfs_rate_store(struct pblk *pblk, const char *page,
				     size_t len)
{
	struct pblk_gc *gc = &pblk->gc;
	size_t c_len;
	unsigned int value;	/* matches the pointer type kstrtouint() expects */

	c_len = strcspn(page, "\n");
	if (c_len >= len)
		return -EINVAL;

	if (kstrtouint(page, 0, &value))
		return -EINVAL;

	/* Reject values that would turn negative once passed on as an int */
	if (value > INT_MAX)
		return -EINVAL;

	spin_lock(&gc->lock);
	pblk_rl_set_gc_rsc(&pblk->rl, value);
	spin_unlock(&gc->lock);

	return len;
}
/*
 * sysfs store: parse a 0/1 flag from @page and pass it to
 * pblk_gc_sysfs_force().
 *
 * The input must contain a newline within the first @len bytes. Returns @len
 * on success and -EINVAL for malformed input or any value other than 0 or 1.
 */
static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page,
				   size_t len)
{
	size_t c_len;
	unsigned int force;	/* matches the pointer type kstrtouint() expects */

	c_len = strcspn(page, "\n");
	if (c_len >= len)
		return -EINVAL;

	if (kstrtouint(page, 0, &force))
		return -EINVAL;

	if (force > 1)
		return -EINVAL;

	pblk_gc_sysfs_force(pblk, force);

	return len;
}
/*
 * sysfs attributes exposed by pblk. Attributes with mode 0444 are read-only
 * and served by pblk_sysfs_show(); those with mode 0200 are write-only and
 * handled by pblk_sysfs_store(). Dispatch is done on the .name string.
 */
static struct attribute sys_write_luns = {
.name = "write_luns",
.mode = 0444,
};
static struct attribute sys_rate_limiter_attr = {
.name = "rate_limiter",
.mode = 0444,
};
static struct attribute sys_gc_state = {
.name = "gc_state",
.mode = 0444,
};
static struct attribute sys_errors_attr = {
.name = "errors",
.mode = 0444,
};
static struct attribute sys_rb_attr = {
.name = "write_buffer",
.mode = 0444,
};
static struct attribute sys_stats_ppaf_attr = {
.name = "ppa_format",
.mode = 0444,
};
static struct attribute sys_lines_attr = {
.name = "lines",
.mode = 0444,
};
static struct attribute sys_lines_info_attr = {
.name = "lines_info",
.mode = 0444,
};
/* Write-only controls */
static struct attribute sys_gc_force = {
.name = "gc_force",
.mode = 0200,
};
static struct attribute sys_gc_rl_max = {
.name = "gc_rl_max",
.mode = 0200,
};
#ifdef CONFIG_NVM_DEBUG
/* Debug counters, only present in CONFIG_NVM_DEBUG builds */
static struct attribute sys_stats_debug_attr = {
.name = "stats",
.mode = 0444,
};
#endif
/* NULL-terminated attribute list registered through pblk_ktype */
static struct attribute *pblk_attrs[] = {
&sys_write_luns,
&sys_rate_limiter_attr,
&sys_errors_attr,
&sys_gc_state,
&sys_gc_force,
&sys_gc_rl_max,
&sys_rb_attr,
&sys_stats_ppaf_attr,
&sys_lines_attr,
&sys_lines_info_attr,
#ifdef CONFIG_NVM_DEBUG
&sys_stats_debug_attr,
#endif
NULL,
};
/*
 * sysfs show entry point: dispatch on the attribute name to the matching
 * formatter. Unknown attributes produce an empty read (0 bytes).
 */
static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr,
			       char *buf)
{
	struct pblk *pblk = container_of(kobj, struct pblk, kobj);
	const char *name = attr->name;

	if (!strcmp(name, "rate_limiter"))
		return pblk_sysfs_rate_limiter(pblk, buf);
	if (!strcmp(name, "write_luns"))
		return pblk_sysfs_luns_show(pblk, buf);
	if (!strcmp(name, "gc_state"))
		return pblk_sysfs_gc_state_show(pblk, buf);
	if (!strcmp(name, "errors"))
		return pblk_sysfs_stats(pblk, buf);
	if (!strcmp(name, "write_buffer"))
		return pblk_sysfs_write_buffer(pblk, buf);
	if (!strcmp(name, "ppa_format"))
		return pblk_sysfs_ppaf(pblk, buf);
	if (!strcmp(name, "lines"))
		return pblk_sysfs_lines(pblk, buf);
	if (!strcmp(name, "lines_info"))
		return pblk_sysfs_lines_info(pblk, buf);
#ifdef CONFIG_NVM_DEBUG
	if (!strcmp(name, "stats"))
		return pblk_sysfs_stats_debug(pblk, buf);
#endif

	return 0;
}
/*
 * sysfs store entry point: dispatch on the attribute name to the matching
 * handler. Unknown attributes return 0.
 */
static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr,
				const char *buf, size_t len)
{
	struct pblk *pblk = container_of(kobj, struct pblk, kobj);
	const char *name = attr->name;

	if (!strcmp(name, "gc_rl_max"))
		return pblk_sysfs_rate_store(pblk, buf, len);
	if (!strcmp(name, "gc_force"))
		return pblk_sysfs_gc_force(pblk, buf, len);

	return 0;
}
/* Route every attribute read/write through the name-based dispatchers */
static const struct sysfs_ops pblk_sysfs_ops = {
.show = pblk_sysfs_show,
.store = pblk_sysfs_store,
};
/*
 * kobject type of the per-instance "pblk" sysfs directory.
 * NOTE(review): no .release callback is set; presumably the kobject lifetime
 * is tied to the enclosing struct pblk - confirm.
 */
static struct kobj_type pblk_ktype = {
.sysfs_ops = &pblk_sysfs_ops,
.default_attrs = pblk_attrs,
};
/*
 * Register the "pblk" kobject below the device of the instance's disk and
 * announce it with a KOBJ_ADD uevent.
 *
 * Returns 0 on success or the error from kobject_init_and_add().
 * NOTE(review): on failure the kobject is not put and the extra reference
 * taken on the parent is not dropped - confirm against the kobject API
 * requirements.
 */
int pblk_sysfs_init(struct gendisk *tdisk)
{
	struct pblk *pblk = tdisk->private_data;
	struct device *parent_dev = disk_to_dev(pblk->disk);
	struct kobject *parent = kobject_get(&parent_dev->kobj);
	int ret;

	ret = kobject_init_and_add(&pblk->kobj, &pblk_ktype, parent,
				   "%s", "pblk");
	if (!ret) {
		kobject_uevent(&pblk->kobj, KOBJ_ADD);
		return 0;
	}

	pr_err("pblk: could not register %s/pblk\n",
					tdisk->disk_name);
	return ret;
}
/*
 * Remove the "pblk" sysfs directory: send KOBJ_REMOVE, delete the kobject
 * from the hierarchy and drop the reference held on it.
 */
void pblk_sysfs_exit(struct gendisk *tdisk)
{
	struct pblk *pblk = tdisk->private_data;
	struct kobject *kobj = &pblk->kobj;

	kobject_uevent(kobj, KOBJ_REMOVE);
	kobject_del(kobj);
	kobject_put(kobj);
}

View File

@ -0,0 +1,411 @@
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-write.c - pblk's write path from write buffer to media
*/
#include "pblk.h"
/*
 * Account one sector of @line as synced to media. When the last outstanding
 * sector completes, closing the line is deferred to pblk_line_close_ws via
 * pblk_line_run_ws().
 */
static void pblk_sync_line(struct pblk *pblk, struct pblk_line *line)
{
#ifdef CONFIG_NVM_DEBUG
	atomic_long_inc(&pblk->sync_writes);
#endif

	/* Counter protected by rb sync lock */
	if (!--line->left_ssecs)
		pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws);
}
static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
struct pblk_c_ctx *c_ctx)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct bio *original_bio;
unsigned long ret;
int i;
for (i = 0; i < c_ctx->nr_valid; i++) {
struct pblk_w_ctx *w_ctx;
struct ppa_addr p;
struct pblk_line *line;
w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i);
p = rqd->ppa_list[i];
line = &pblk->lines[pblk_dev_ppa_to_line(p)];
pblk_sync_line(pblk, line);
while ((original_bio = bio_list_pop(&w_ctx->bios)))
bio_endio(original_bio);
}
#ifdef CONFIG_NVM_DEBUG
atomic_long_add(c_ctx->nr_valid, &pblk->compl_writes);
#endif
ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid);
if (rqd->meta_list)
nvm_dev_dma_free(dev->parent, rqd->meta_list,
rqd->dma_meta_list);
bio_put(rqd->bio);
pblk_free_rqd(pblk, rqd, WRITE);
return ret;
}
/*
 * Complete a write that had been parked on the completion list: unlink its
 * context from the list, then finish it like any other write.
 */
static unsigned long pblk_end_queued_w_bio(struct pblk *pblk,
					   struct nvm_rq *rqd,
					   struct pblk_c_ctx *c_ctx)
{
	unsigned long pos;

	list_del(&c_ctx->list);
	pos = pblk_end_w_bio(pblk, rqd, c_ctx);

	return pos;
}
/*
 * Complete a write request in ring buffer order.
 *
 * If the request starts at the current sync position it is completed right
 * away, and the completion list is then rescanned from its head for as long
 * as a parked request starting at the new position is found. Otherwise the
 * request is parked on pblk->compl_list until its predecessors complete.
 * The whole operation runs between pblk_rb_sync_init() and
 * pblk_rb_sync_end().
 */
static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
				struct pblk_c_ctx *c_ctx)
{
	struct pblk_c_ctx *cur, *next;
	unsigned long flags;
	unsigned long pos;
	int advanced;

#ifdef CONFIG_NVM_DEBUG
	atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes);
#endif

	pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);

	pos = pblk_rb_sync_init(&pblk->rwb, &flags);
	if (pos != c_ctx->sentry) {
		/* Out of order: wait for the preceding writes */
		WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd);
		list_add_tail(&c_ctx->list, &pblk->compl_list);
		goto sync_end;
	}

	pos = pblk_end_w_bio(pblk, rqd, c_ctx);

	do {
		advanced = 0;

		list_for_each_entry_safe(cur, next, &pblk->compl_list, list) {
			rqd = nvm_rq_from_c_ctx(cur);
			if (cur->sentry != pos)
				continue;

			/* The list changed: restart the scan from its head */
			pos = pblk_end_queued_w_bio(pblk, rqd, cur);
			advanced = 1;
			break;
		}
	} while (advanced);

sync_end:
	pblk_rb_sync_end(&pblk->rwb, &flags);
}
/* When a write fails, we are not sure whether the block has grown bad or a page
 * range is more susceptible to write errors. If a high number of pages fail, we
 * assume that the block is bad and we mark it accordingly. In all cases, we
 * remap and resubmit the failed entries as fast as possible; if a flush is
 * waiting on a completion, the whole stack would stall otherwise.
 *
 * The failed sectors are identified by the bits set in rqd->ppa_status. Their
 * ring buffer entries are collected on a recovery context which is then
 * handed to pblk_submit_rec() on the pblk->kw_wq workqueue. The original
 * request is completed in all cases except when the recovery context cannot
 * be allocated. If recovery cannot be set up, the context is returned to its
 * mempool.
 */
static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
{
	void *comp_bits = &rqd->ppa_status;
	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
	struct pblk_rec_ctx *recovery;
	struct ppa_addr *ppa_list = rqd->ppa_list;
	int nr_ppas = rqd->nr_ppas;
	unsigned int c_entries;
	int bit, ret;

	/* Single-sector requests carry the address inline */
	if (unlikely(nr_ppas == 1))
		ppa_list = &rqd->ppa_addr;

	recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
	if (!recovery) {
		/*
		 * NOTE(review): the request is not completed on this path -
		 * confirm that nothing waits on it.
		 */
		pr_err("pblk: could not allocate recovery context\n");
		return;
	}

	INIT_LIST_HEAD(&recovery->failed);

	bit = -1;
	while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
		struct pblk_rb_entry *entry;
		struct ppa_addr ppa;

		/* Logic error */
		if (bit > c_ctx->nr_valid) {
			WARN_ONCE(1, "pblk: corrupted write request\n");
			goto free_recovery;
		}

		ppa = ppa_list[bit];
		entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa);
		if (!entry) {
			pr_err("pblk: could not scan entry on write failure\n");
			goto free_recovery;
		}

		/* The list is filled first and emptied afterwards. No need for
		 * protecting it with a lock
		 */
		list_add_tail(&entry->index, &recovery->failed);
	}

	/* Position of the first failed sector in the request */
	c_entries = find_first_bit(comp_bits, nr_ppas);
	ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries);
	if (ret) {
		pr_err("pblk: could not recover from write failure\n");
		goto free_recovery;
	}

	/* From here on the recovery context belongs to the work item */
	INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
	queue_work(pblk->kw_wq, &recovery->ws_rec);
	goto out;

free_recovery:
	/* Recovery was not queued: return the context to its pool */
	mempool_free(recovery, pblk->rec_pool);
out:
	pblk_complete_write(pblk, rqd, c_ctx);
}
/*
 * End-of-I/O callback for write requests. Failed requests are logged and
 * handed to the write-failure path; successful ones are completed in ring
 * buffer order.
 */
static void pblk_end_io_write(struct nvm_rq *rqd)
{
	struct pblk *pblk = rqd->private;
	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);

	if (rqd->error) {
		pblk_log_write_err(pblk, rqd);
		pblk_end_w_fail(pblk, rqd);
		return;
	}

#ifdef CONFIG_NVM_DEBUG
	/* The request reported success, so the bio must not carry an error */
	WARN_ONCE(rqd->bio->bi_error, "pblk: corrupted write error\n");
#endif

	pblk_complete_write(pblk, rqd, c_ctx);
}
/*
 * Fill in the generic fields of a write request for @nr_secs sectors and
 * allocate its metadata DMA buffer. For multi-sector requests the ppa list
 * is placed in the same buffer, pblk_dma_meta_size past its start.
 *
 * Returns 0 on success or -ENOMEM if the DMA buffer cannot be allocated.
 */
static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
			   unsigned int nr_secs)
{
	struct nvm_tgt_dev *dev = pblk->dev;

	/* Setup write request */
	rqd->opcode = NVM_OP_PWRITE;
	rqd->nr_ppas = nr_secs;
	rqd->flags = pblk_set_progr_mode(pblk, WRITE);
	rqd->private = pblk;
	rqd->end_io = pblk_end_io_write;

	rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
							&rqd->dma_meta_list);
	if (!rqd->meta_list)
		return -ENOMEM;

	/* A single sector needs no ppa list */
	if (unlikely(nr_secs == 1))
		return 0;

	rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
	rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;

	return 0;
}
static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
struct pblk_c_ctx *c_ctx)
{
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_line *e_line = pblk_line_get_data_next(pblk);
struct ppa_addr erase_ppa;
unsigned int valid = c_ctx->nr_valid;
unsigned int padded = c_ctx->nr_padded;
unsigned int nr_secs = valid + padded;
unsigned long *lun_bitmap;
int ret = 0;
lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
if (!lun_bitmap) {
ret = -ENOMEM;
goto out;
}
c_ctx->lun_bitmap = lun_bitmap;
ret = pblk_alloc_w_rq(pblk, rqd, nr_secs);
if (ret) {
kfree(lun_bitmap);
goto out;
}
ppa_set_empty(&erase_ppa);
if (likely(!e_line || !e_line->left_eblks))
pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0);
else
pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
valid, &erase_ppa);
out:
if (unlikely(e_line && !ppa_empty(erase_ppa))) {
if (pblk_blk_erase_async(pblk, erase_ppa)) {
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
int bit;
e_line->left_eblks++;
bit = erase_ppa.g.lun * geo->nr_chnls + erase_ppa.g.ch;
WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap));
up(&pblk->erase_sem);
}
}
return ret;
}
int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
struct pblk_c_ctx *c_ctx)
{
struct pblk_line_meta *lm = &pblk->lm;
unsigned long *lun_bitmap;
int ret;
lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
if (!lun_bitmap)
return -ENOMEM;
c_ctx->lun_bitmap = lun_bitmap;
ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas);
if (ret)
return ret;
pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0);
rqd->ppa_status = (u64)0;
rqd->flags = pblk_set_progr_mode(pblk, WRITE);
return ret;
}
/*
 * Wrapper around pblk_calc_secs() returning how many sectors to submit for
 * @secs_avail buffered sectors and @secs_to_flush sectors pending a flush.
 * Debug builds additionally log results that look inconsistent.
 */
static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
				  unsigned int secs_to_flush)
{
	int secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush);

#ifdef CONFIG_NVM_DEBUG
	if ((!secs_to_sync && secs_to_flush)
			|| (secs_to_sync < 0)
			|| (secs_to_sync > secs_avail && !secs_to_flush)) {
		pr_err("pblk: bad sector calculation (a:%d,s:%d,f:%d)\n",
				secs_avail, secs_to_sync, secs_to_flush);
	}
#endif

	return secs_to_sync;
}
/*
 * Take buffered sectors from the write buffer and submit them to the device
 * as one write request.
 *
 * Returns 0 when a request was submitted. Returns 1 when nothing was
 * submitted: the buffer is empty, it holds fewer than min_write_pgs sectors
 * with no flush pending, or any step of the setup/submission failed.
 */
static int pblk_submit_write(struct pblk *pblk)
{
struct bio *bio;
struct nvm_rq *rqd;
struct pblk_c_ctx *c_ctx;
unsigned int pgs_read;
unsigned int secs_avail, secs_to_sync, secs_to_com;
unsigned int secs_to_flush;
unsigned long pos;
int err;
/* If there are no sectors in the cache, flushes (bios without data)
* will be cleared on the cache threads
*/
secs_avail = pblk_rb_read_count(&pblk->rwb);
if (!secs_avail)
return 1;
/* Without a pending flush, wait until a minimum write unit is buffered */
secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb);
if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
return 1;
rqd = pblk_alloc_rqd(pblk, WRITE);
if (IS_ERR(rqd)) {
pr_err("pblk: cannot allocate write req.\n");
return 1;
}
/* The completion context is the request's private data unit */
c_ctx = nvm_rq_to_pdu(rqd);
bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs);
if (!bio) {
pr_err("pblk: cannot allocate write bio\n");
goto fail_free_rqd;
}
bio->bi_iter.bi_sector = 0; /* internal bio */
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
rqd->bio = bio;
/*
 * NOTE(review): the helper returns an int that is stored in an unsigned
 * variable; a negative result would be caught only by the range check below.
 */
secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
if (secs_to_sync > pblk->max_write_pgs) {
pr_err("pblk: bad buffer sync calculation\n");
goto fail_put_bio;
}
/* Never commit more than what is actually buffered */
secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
/*
 * NOTE(review): the failure paths below do not visibly undo this commit on
 * the ring buffer - confirm how the committed entries are handled on error.
 */
pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
pgs_read = pblk_rb_read_to_bio(&pblk->rwb, bio, c_ctx, pos,
secs_to_sync, secs_avail);
if (!pgs_read) {
pr_err("pblk: corrupted write bio\n");
goto fail_put_bio;
}
/* Back the padded sectors with pages of their own */
if (c_ctx->nr_padded)
if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, c_ctx->nr_padded))
goto fail_put_bio;
/* Assign lbas to ppas and populate request structure */
err = pblk_setup_w_rq(pblk, rqd, c_ctx);
if (err) {
pr_err("pblk: could not setup write request\n");
goto fail_free_bio;
}
/*
 * NOTE(review): if submission fails, the buffers set up by
 * pblk_setup_w_rq() do not appear to be released by the unwind path below -
 * confirm.
 */
err = pblk_submit_io(pblk, rqd);
if (err) {
pr_err("pblk: I/O submission failed: %d\n", err);
goto fail_free_bio;
}
#ifdef CONFIG_NVM_DEBUG
atomic_long_add(secs_to_sync, &pblk->sub_writes);
#endif
return 0;
/* Unwind in reverse order of acquisition */
fail_free_bio:
if (c_ctx->nr_padded)
pblk_bio_free_pages(pblk, bio, secs_to_sync, c_ctx->nr_padded);
fail_put_bio:
bio_put(bio);
fail_free_rqd:
pblk_free_rqd(pblk, rqd, WRITE);
return 1;
}
/*
 * Write thread main loop. Keeps submitting writes for as long as
 * pblk_submit_write() makes progress (returns 0); otherwise the thread goes
 * to interruptible sleep until it is woken up. Runs until the kthread is
 * asked to stop and then returns 0.
 */
int pblk_write_ts(void *data)
{
	struct pblk *pblk = data;

	while (!kthread_should_stop()) {
		if (pblk_submit_write(pblk)) {
			/* Nothing submitted: sleep until more work arrives */
			set_current_state(TASK_INTERRUPTIBLE);
			io_schedule();
		}
	}

	return 0;
}

File diff suppressed because it is too large Load Diff