Main batch of InfiniBand/RDMA changes for 3.19:

- On-demand paging support in core midlayer and mlx5 driver.  This lets
    userspace create non-pinned memory regions and have the adapter HW
    trigger page faults.
  - iSER and IPoIB updates and fixes.
  - Low-level HW driver updates for cxgb4, mlx4 and ocrdma.
  - Other miscellaneous fixes.
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1
 
 iQIcBAABCAAGBQJUk2pBAAoJEENa44ZhAt0hL18P/jdCGbWVXOJh25KvjzmKIfUV
 T3Bdixz5h/Xj2iU6ShsXLSZa8vkPXsiO5v3MIQcR5MuPn88vrxemTy/OmBjefJeL
 qKGnWfy9O8KxMqhYZAokTTIyl5ygtSITbJsCE5W0KHgRBgBtexbrHeFBcWsT3AZ5
 piGyRP4XWc2LtfjrFWdUUjRELz9m74L93uILy0P8lS58k3M8YIOvkjqVmGj5Ya3U
 /hadgk1HYWfxjw+z3v0keaP1IoqHpJferH+UyjCj8UsIB9swXabE8ap/SFrQPIpe
 p+Zwyi25292mavqEfm/neUmvn34xLF8c00XB6UKxr42Q9yd1mDxnO+ZxWpxW5klQ
 tKEZeySDbB/WplGrumCeNXPonFvdBpGOTguP3z5o0bcgj1UJ+yVk8KjNBlwSWhQw
 Mkh/Rb6gSJzeidB3pnQV3TKVkvcFr+Li6DgbG6a77f0W7ggQC2UaeTwEPY5FlMtK
 n2jQddhnXYsQXeOEpDcISbpAnCIx+qjDIRv7jYTajw0hg8A669ytcI/gi4b9qJeU
 l3epZDszbCkRwPACzOXCRfeZRiz1H6/USI+Vn/yIQZBlHEd7TcK6ph+KDO/btX+D
 PWKrirIgzorJsIsDyD4WBXHfJnNS1Imfoxl5s7/8kkwrIkwY+lGpU0zM1bu7cS8W
 c32iGI9+dgHSPTZt3RdL
 =MCO9
 -----END PGP SIGNATURE-----

Merge tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband

Pull infiniband updates from Roland Dreier:
 "Main batch of InfiniBand/RDMA changes for 3.19:

   - On-demand paging support in core midlayer and mlx5 driver.  This
     lets userspace create non-pinned memory regions and have the
     adapter HW trigger page faults.
   - iSER and IPoIB updates and fixes.
   - Low-level HW driver updates for cxgb4, mlx4 and ocrdma.
   - Other miscellaneous fixes"

* tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband: (56 commits)
  IB/mlx5: Implement on demand paging by adding support for MMU notifiers
  IB/mlx5: Add support for RDMA read/write responder page faults
  IB/mlx5: Handle page faults
  IB/mlx5: Page faults handling infrastructure
  IB/mlx5: Add mlx5_ib_update_mtt to update page tables after creation
  IB/mlx5: Changes in memory region creation to support on-demand paging
  IB/mlx5: Implement the ODP capability query verb
  mlx5_core: Add support for page faults events and low level handling
  mlx5_core: Re-add MLX5_DEV_CAP_FLAG_ON_DMND_PG flag
  IB/srp: Allow newline separator for connection string
  IB/core: Implement support for MMU notifiers regarding on demand paging regions
  IB/core: Add support for on demand paging regions
  IB/core: Add flags for on demand paging support
  IB/core: Add support for extended query device caps
  IB/mlx5: Add function to read WQE from user-space
  IB/core: Add umem function to read data from user-space
  IB/core: Replace ib_umem's offset field with a full address
  IB/mlx5: Enhance UMR support to allow partial page table update
  IB/mlx5: Remove per-MR pas and dma pointers
  RDMA/ocrdma: Always resolve destination mac from GRH for UD QPs
  ...
This commit is contained in:
Linus Torvalds 2014-12-18 20:10:44 -08:00
commit 4c929feed7
53 changed files with 3512 additions and 458 deletions

View file

@ -38,6 +38,17 @@ config INFINIBAND_USER_MEM
depends on INFINIBAND_USER_ACCESS != n
default y
config INFINIBAND_ON_DEMAND_PAGING
bool "InfiniBand on-demand paging support"
depends on INFINIBAND_USER_MEM
select MMU_NOTIFIER
default y
---help---
On demand paging support for the InfiniBand subsystem.
Together with driver support this allows registration of
memory regions without pinning their pages, fetching the
pages on demand instead.
config INFINIBAND_ADDR_TRANS
bool
depends on INFINIBAND

View file

@ -11,6 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
device.o fmr_pool.o cache.o netlink.o
ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
ib_mad-y := mad.o smi.o agent.o mad_rmpp.o

View file

@ -176,8 +176,8 @@ static void set_timeout(unsigned long time)
unsigned long delay;
delay = time - jiffies;
if ((long)delay <= 0)
delay = 1;
if ((long)delay < 0)
delay = 0;
mod_delayed_work(addr_wq, &work, delay);
}

View file

@ -525,17 +525,22 @@ static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
if (status)
process_join_error(group, status);
else {
int mgids_changed, is_mgid0;
ib_find_pkey(group->port->dev->device, group->port->port_num,
be16_to_cpu(rec->pkey), &pkey_index);
spin_lock_irq(&group->port->lock);
group->rec = *rec;
if (group->state == MCAST_BUSY &&
group->pkey_index == MCAST_INVALID_PKEY_INDEX)
group->pkey_index = pkey_index;
if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) {
mgids_changed = memcmp(&rec->mgid, &group->rec.mgid,
sizeof(group->rec.mgid));
group->rec = *rec;
if (mgids_changed) {
rb_erase(&group->node, &group->port->table);
mcast_insert(group->port, group, 1);
is_mgid0 = !memcmp(&mgid0, &group->rec.mgid,
sizeof(mgid0));
mcast_insert(group->port, group, is_mgid0);
}
spin_unlock_irq(&group->port->lock);
}

View file

@ -39,6 +39,7 @@
#include <linux/hugetlb.h>
#include <linux/dma-attrs.h>
#include <linux/slab.h>
#include <rdma/ib_umem_odp.h>
#include "uverbs.h"
@ -69,6 +70,10 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
/**
* ib_umem_get - Pin and DMA map userspace memory.
*
* If access flags indicate ODP memory, avoid pinning. Instead, stores
* the mm for future page fault handling in conjunction with MMU notifiers.
*
* @context: userspace context to pin memory for
* @addr: userspace virtual address to start at
* @size: length of region to pin
@ -103,17 +108,30 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
umem->context = context;
umem->length = size;
umem->offset = addr & ~PAGE_MASK;
umem->address = addr;
umem->page_size = PAGE_SIZE;
umem->pid = get_task_pid(current, PIDTYPE_PID);
/*
* We ask for writable memory if any access flags other than
* "remote read" are set. "Local write" and "remote write"
* We ask for writable memory if any of the following
* access flags are set. "Local write" and "remote write"
* obviously require write access. "Remote atomic" can do
* things like fetch and add, which will modify memory, and
* "MW bind" can change permissions by binding a window.
*/
umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);
umem->writable = !!(access &
(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));
if (access & IB_ACCESS_ON_DEMAND) {
ret = ib_umem_odp_get(context, umem);
if (ret) {
kfree(umem);
return ERR_PTR(ret);
}
return umem;
}
umem->odp_data = NULL;
/* We assume the memory is from hugetlb until proved otherwise */
umem->hugetlb = 1;
@ -132,7 +150,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
if (!vma_list)
umem->hugetlb = 0;
npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;
npages = ib_umem_num_pages(umem);
down_write(&current->mm->mmap_sem);
@ -235,6 +253,11 @@ void ib_umem_release(struct ib_umem *umem)
struct task_struct *task;
unsigned long diff;
if (umem->odp_data) {
ib_umem_odp_release(umem);
return;
}
__ib_umem_release(umem->context->device, umem, 1);
task = get_pid_task(umem->pid, PIDTYPE_PID);
@ -246,7 +269,7 @@ void ib_umem_release(struct ib_umem *umem)
if (!mm)
goto out;
diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
diff = ib_umem_num_pages(umem);
/*
* We may be called with the mm's mmap_sem already held. This
@ -283,6 +306,9 @@ int ib_umem_page_count(struct ib_umem *umem)
int n;
struct scatterlist *sg;
if (umem->odp_data)
return ib_umem_num_pages(umem);
shift = ilog2(umem->page_size);
n = 0;
@ -292,3 +318,37 @@ int ib_umem_page_count(struct ib_umem *umem)
return n;
}
EXPORT_SYMBOL(ib_umem_page_count);
/*
* Copy from the given ib_umem's pages to the given buffer.
*
* umem - the umem to copy from
* offset - offset to start copying from
* dst - destination buffer
* length - buffer length
*
* Returns 0 on success, or an error code.
*/
int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
size_t length)
{
size_t end = offset + length;
int ret;
if (offset > umem->length || length > umem->length - offset) {
pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n",
offset, umem->length, end);
return -EINVAL;
}
ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length,
offset + ib_umem_offset(umem));
if (ret < 0)
return ret;
else if (ret != length)
return -EINVAL;
else
return 0;
}
EXPORT_SYMBOL(ib_umem_copy_from);

View file

@ -0,0 +1,668 @@
/*
* Copyright (c) 2014 Mellanox Technologies. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/pid.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
static void ib_umem_notifier_start_account(struct ib_umem *item)
{
mutex_lock(&item->odp_data->umem_mutex);
/* Only update private counters for this umem if it has them.
* Otherwise skip it. All page faults will be delayed for this umem. */
if (item->odp_data->mn_counters_active) {
int notifiers_count = item->odp_data->notifiers_count++;
if (notifiers_count == 0)
/* Initialize the completion object for waiting on
* notifiers. Since notifier_count is zero, no one
* should be waiting right now. */
reinit_completion(&item->odp_data->notifier_completion);
}
mutex_unlock(&item->odp_data->umem_mutex);
}
static void ib_umem_notifier_end_account(struct ib_umem *item)
{
mutex_lock(&item->odp_data->umem_mutex);
/* Only update private counters for this umem if it has them.
* Otherwise skip it. All page faults will be delayed for this umem. */
if (item->odp_data->mn_counters_active) {
/*
* This sequence increase will notify the QP page fault that
* the page that is going to be mapped in the spte could have
* been freed.
*/
++item->odp_data->notifiers_seq;
if (--item->odp_data->notifiers_count == 0)
complete_all(&item->odp_data->notifier_completion);
}
mutex_unlock(&item->odp_data->umem_mutex);
}
/* Account for a new mmu notifier in an ib_ucontext. */
static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
{
atomic_inc(&context->notifier_count);
}
/* Account for a terminating mmu notifier in an ib_ucontext.
*
* Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
* the function takes the semaphore itself. */
static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
{
int zero_notifiers = atomic_dec_and_test(&context->notifier_count);
if (zero_notifiers &&
!list_empty(&context->no_private_counters)) {
/* No currently running mmu notifiers. Now is the chance to
* add private accounting to all previously added umems. */
struct ib_umem_odp *odp_data, *next;
/* Prevent concurrent mmu notifiers from working on the
* no_private_counters list. */
down_write(&context->umem_rwsem);
/* Read the notifier_count again, with the umem_rwsem
* semaphore taken for write. */
if (!atomic_read(&context->notifier_count)) {
list_for_each_entry_safe(odp_data, next,
&context->no_private_counters,
no_private_counters) {
mutex_lock(&odp_data->umem_mutex);
odp_data->mn_counters_active = true;
list_del(&odp_data->no_private_counters);
complete_all(&odp_data->notifier_completion);
mutex_unlock(&odp_data->umem_mutex);
}
}
up_write(&context->umem_rwsem);
}
}
static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
u64 end, void *cookie) {
/*
* Increase the number of notifiers running, to
* prevent any further fault handling on this MR.
*/
ib_umem_notifier_start_account(item);
item->odp_data->dying = 1;
/* Make sure that the fact the umem is dying is out before we release
* all pending page faults. */
smp_wmb();
complete_all(&item->odp_data->notifier_completion);
item->context->invalidate_range(item, ib_umem_start(item),
ib_umem_end(item));
return 0;
}
static void ib_umem_notifier_release(struct mmu_notifier *mn,
struct mm_struct *mm)
{
struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
if (!context->invalidate_range)
return;
ib_ucontext_notifier_start_account(context);
down_read(&context->umem_rwsem);
rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
ULLONG_MAX,
ib_umem_notifier_release_trampoline,
NULL);
up_read(&context->umem_rwsem);
}
static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
u64 end, void *cookie)
{
ib_umem_notifier_start_account(item);
item->context->invalidate_range(item, start, start + PAGE_SIZE);
ib_umem_notifier_end_account(item);
return 0;
}
static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address)
{
struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
if (!context->invalidate_range)
return;
ib_ucontext_notifier_start_account(context);
down_read(&context->umem_rwsem);
rbt_ib_umem_for_each_in_range(&context->umem_tree, address,
address + PAGE_SIZE,
invalidate_page_trampoline, NULL);
up_read(&context->umem_rwsem);
ib_ucontext_notifier_end_account(context);
}
static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
u64 end, void *cookie)
{
ib_umem_notifier_start_account(item);
item->context->invalidate_range(item, start, end);
return 0;
}
static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
if (!context->invalidate_range)
return;
ib_ucontext_notifier_start_account(context);
down_read(&context->umem_rwsem);
rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
end,
invalidate_range_start_trampoline, NULL);
up_read(&context->umem_rwsem);
}
static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
u64 end, void *cookie)
{
ib_umem_notifier_end_account(item);
return 0;
}
static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
if (!context->invalidate_range)
return;
down_read(&context->umem_rwsem);
rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
end,
invalidate_range_end_trampoline, NULL);
up_read(&context->umem_rwsem);
ib_ucontext_notifier_end_account(context);
}
static struct mmu_notifier_ops ib_umem_notifiers = {
.release = ib_umem_notifier_release,
.invalidate_page = ib_umem_notifier_invalidate_page,
.invalidate_range_start = ib_umem_notifier_invalidate_range_start,
.invalidate_range_end = ib_umem_notifier_invalidate_range_end,
};
int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
{
int ret_val;
struct pid *our_pid;
struct mm_struct *mm = get_task_mm(current);
if (!mm)
return -EINVAL;
/* Prevent creating ODP MRs in child processes */
rcu_read_lock();
our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
rcu_read_unlock();
put_pid(our_pid);
if (context->tgid != our_pid) {
ret_val = -EINVAL;
goto out_mm;
}
umem->hugetlb = 0;
umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
if (!umem->odp_data) {
ret_val = -ENOMEM;
goto out_mm;
}
umem->odp_data->umem = umem;
mutex_init(&umem->odp_data->umem_mutex);
init_completion(&umem->odp_data->notifier_completion);
umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) *
sizeof(*umem->odp_data->page_list));
if (!umem->odp_data->page_list) {
ret_val = -ENOMEM;
goto out_odp_data;
}
umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) *
sizeof(*umem->odp_data->dma_list));
if (!umem->odp_data->dma_list) {
ret_val = -ENOMEM;
goto out_page_list;
}
/*
* When using MMU notifiers, we will get a
* notification before the "current" task (and MM) is
* destroyed. We use the umem_rwsem semaphore to synchronize.
*/
down_write(&context->umem_rwsem);
context->odp_mrs_count++;
if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
rbt_ib_umem_insert(&umem->odp_data->interval_tree,
&context->umem_tree);
if (likely(!atomic_read(&context->notifier_count)))
umem->odp_data->mn_counters_active = true;
else
list_add(&umem->odp_data->no_private_counters,
&context->no_private_counters);
downgrade_write(&context->umem_rwsem);
if (context->odp_mrs_count == 1) {
/*
* Note that at this point, no MMU notifier is running
* for this context!
*/
atomic_set(&context->notifier_count, 0);
INIT_HLIST_NODE(&context->mn.hlist);
context->mn.ops = &ib_umem_notifiers;
/*
* Lock-dep detects a false positive for mmap_sem vs.
* umem_rwsem, due to not grasping downgrade_write correctly.
*/
lockdep_off();
ret_val = mmu_notifier_register(&context->mn, mm);
lockdep_on();
if (ret_val) {
pr_err("Failed to register mmu_notifier %d\n", ret_val);
ret_val = -EBUSY;
goto out_mutex;
}
}
up_read(&context->umem_rwsem);
/*
* Note that doing an mmput can cause a notifier for the relevant mm.
* If the notifier is called while we hold the umem_rwsem, this will
* cause a deadlock. Therefore, we release the reference only after we
* released the semaphore.
*/
mmput(mm);
return 0;
out_mutex:
up_read(&context->umem_rwsem);
vfree(umem->odp_data->dma_list);
out_page_list:
vfree(umem->odp_data->page_list);
out_odp_data:
kfree(umem->odp_data);
out_mm:
mmput(mm);
return ret_val;
}
void ib_umem_odp_release(struct ib_umem *umem)
{
struct ib_ucontext *context = umem->context;
/*
* Ensure that no more pages are mapped in the umem.
*
* It is the driver's responsibility to ensure, before calling us,
* that the hardware will not attempt to access the MR any more.
*/
ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
ib_umem_end(umem));
down_write(&context->umem_rwsem);
if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
rbt_ib_umem_remove(&umem->odp_data->interval_tree,
&context->umem_tree);
context->odp_mrs_count--;
if (!umem->odp_data->mn_counters_active) {
list_del(&umem->odp_data->no_private_counters);
complete_all(&umem->odp_data->notifier_completion);
}
/*
* Downgrade the lock to a read lock. This ensures that the notifiers
* (who lock the mutex for reading) will be able to finish, and we
* will be able to enventually obtain the mmu notifiers SRCU. Note
* that since we are doing it atomically, no other user could register
* and unregister while we do the check.
*/
downgrade_write(&context->umem_rwsem);
if (!context->odp_mrs_count) {
struct task_struct *owning_process = NULL;
struct mm_struct *owning_mm = NULL;
owning_process = get_pid_task(context->tgid,
PIDTYPE_PID);
if (owning_process == NULL)
/*
* The process is already dead, notifier were removed
* already.
*/
goto out;
owning_mm = get_task_mm(owning_process);
if (owning_mm == NULL)
/*
* The process' mm is already dead, notifier were
* removed already.
*/
goto out_put_task;
mmu_notifier_unregister(&context->mn, owning_mm);
mmput(owning_mm);
out_put_task:
put_task_struct(owning_process);
}
out:
up_read(&context->umem_rwsem);
vfree(umem->odp_data->dma_list);
vfree(umem->odp_data->page_list);
kfree(umem->odp_data);
kfree(umem);
}
/*
* Map for DMA and insert a single page into the on-demand paging page tables.
*
* @umem: the umem to insert the page to.
* @page_index: index in the umem to add the page to.
* @page: the page struct to map and add.
* @access_mask: access permissions needed for this page.
* @current_seq: sequence number for synchronization with invalidations.
* the sequence number is taken from
* umem->odp_data->notifiers_seq.
*
* The function returns -EFAULT if the DMA mapping operation fails. It returns
* -EAGAIN if a concurrent invalidation prevents us from updating the page.
*
* The page is released via put_page even if the operation failed. For
* on-demand pinning, the page is released whenever it isn't stored in the
* umem.
*/
static int ib_umem_odp_map_dma_single_page(
struct ib_umem *umem,
int page_index,
u64 base_virt_addr,
struct page *page,
u64 access_mask,
unsigned long current_seq)
{
struct ib_device *dev = umem->context->device;
dma_addr_t dma_addr;
int stored_page = 0;
int remove_existing_mapping = 0;
int ret = 0;
mutex_lock(&umem->odp_data->umem_mutex);
/*
* Note: we avoid writing if seq is different from the initial seq, to
* handle case of a racing notifier. This check also allows us to bail
* early if we have a notifier running in parallel with us.
*/
if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
ret = -EAGAIN;
goto out;
}
if (!(umem->odp_data->dma_list[page_index])) {
dma_addr = ib_dma_map_page(dev,
page,
0, PAGE_SIZE,
DMA_BIDIRECTIONAL);
if (ib_dma_mapping_error(dev, dma_addr)) {
ret = -EFAULT;
goto out;
}
umem->odp_data->dma_list[page_index] = dma_addr | access_mask;
umem->odp_data->page_list[page_index] = page;
stored_page = 1;
} else if (umem->odp_data->page_list[page_index] == page) {
umem->odp_data->dma_list[page_index] |= access_mask;
} else {
pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
umem->odp_data->page_list[page_index], page);
/* Better remove the mapping now, to prevent any further
* damage. */
remove_existing_mapping = 1;
}
out:
mutex_unlock(&umem->odp_data->umem_mutex);
/* On Demand Paging - avoid pinning the page */
if (umem->context->invalidate_range || !stored_page)
put_page(page);
if (remove_existing_mapping && umem->context->invalidate_range) {
invalidate_page_trampoline(
umem,
base_virt_addr + (page_index * PAGE_SIZE),
base_virt_addr + ((page_index+1)*PAGE_SIZE),
NULL);
ret = -EAGAIN;
}
return ret;
}
/**
* ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
*
* Pins the range of pages passed in the argument, and maps them to
* DMA addresses. The DMA addresses of the mapped pages is updated in
* umem->odp_data->dma_list.
*
* Returns the number of pages mapped in success, negative error code
* for failure.
* An -EAGAIN error code is returned when a concurrent mmu notifier prevents
* the function from completing its task.
*
* @umem: the umem to map and pin
* @user_virt: the address from which we need to map.
* @bcnt: the minimal number of bytes to pin and map. The mapping might be
* bigger due to alignment, and may also be smaller in case of an error
* pinning or mapping a page. The actual pages mapped is returned in
* the return value.
* @access_mask: bit mask of the requested access permissions for the given
* range.
* @current_seq: the MMU notifiers sequance value for synchronization with
* invalidations. the sequance number is read from
* umem->odp_data->notifiers_seq before calling this function
*/
int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
u64 access_mask, unsigned long current_seq)
{
struct task_struct *owning_process = NULL;
struct mm_struct *owning_mm = NULL;
struct page **local_page_list = NULL;
u64 off;
int j, k, ret = 0, start_idx, npages = 0;
u64 base_virt_addr;
if (access_mask == 0)
return -EINVAL;
if (user_virt < ib_umem_start(umem) ||
user_virt + bcnt > ib_umem_end(umem))
return -EFAULT;
local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
if (!local_page_list)
return -ENOMEM;
off = user_virt & (~PAGE_MASK);
user_virt = user_virt & PAGE_MASK;
base_virt_addr = user_virt;
bcnt += off; /* Charge for the first page offset as well. */
owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
if (owning_process == NULL) {
ret = -EINVAL;
goto out_no_task;
}
owning_mm = get_task_mm(owning_process);
if (owning_mm == NULL) {
ret = -EINVAL;
goto out_put_task;
}
start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT;
k = start_idx;
while (bcnt > 0) {
const size_t gup_num_pages =
min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE,
PAGE_SIZE / sizeof(struct page *));
down_read(&owning_mm->mmap_sem);
/*
* Note: this might result in redundent page getting. We can
* avoid this by checking dma_list to be 0 before calling
* get_user_pages. However, this make the code much more
* complex (and doesn't gain us much performance in most use
* cases).
*/
npages = get_user_pages(owning_process, owning_mm, user_virt,
gup_num_pages,
access_mask & ODP_WRITE_ALLOWED_BIT, 0,
local_page_list, NULL);
up_read(&owning_mm->mmap_sem);
if (npages < 0)
break;
bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
user_virt += npages << PAGE_SHIFT;
for (j = 0; j < npages; ++j) {
ret = ib_umem_odp_map_dma_single_page(
umem, k, base_virt_addr, local_page_list[j],
access_mask, current_seq);
if (ret < 0)
break;
k++;
}
if (ret < 0) {
/* Release left over pages when handling errors. */
for (++j; j < npages; ++j)
put_page(local_page_list[j]);
break;
}
}
if (ret >= 0) {
if (npages < 0 && k == start_idx)
ret = npages;
else
ret = k - start_idx;
}
mmput(owning_mm);
out_put_task:
put_task_struct(owning_process);
out_no_task:
free_page((unsigned long)local_page_list);
return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
u64 bound)
{
int idx;
u64 addr;
struct ib_device *dev = umem->context->device;
virt = max_t(u64, virt, ib_umem_start(umem));
bound = min_t(u64, bound, ib_umem_end(umem));
/* Note that during the run of this function, the
* notifiers_count of the MR is > 0, preventing any racing
* faults from completion. We might be racing with other
* invalidations, so we must make sure we free each page only
* once. */
for (addr = virt; addr < bound; addr += (u64)umem->page_size) {
idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
mutex_lock(&umem->odp_data->umem_mutex);
if (umem->odp_data->page_list[idx]) {
struct page *page = umem->odp_data->page_list[idx];
struct page *head_page = compound_head(page);
dma_addr_t dma = umem->odp_data->dma_list[idx];
dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;
WARN_ON(!dma_addr);
ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
DMA_BIDIRECTIONAL);
if (dma & ODP_WRITE_ALLOWED_BIT)
/*
* set_page_dirty prefers being called with
* the page lock. However, MMU notifiers are
* called sometimes with and sometimes without
* the lock. We rely on the umem_mutex instead
* to prevent other mmu notifiers from
* continuing and allowing the page mapping to
* be removed.
*/
set_page_dirty(head_page);
/* on demand pinning support */
if (!umem->context->invalidate_range)
put_page(page);
umem->odp_data->page_list[idx] = NULL;
umem->odp_data->dma_list[idx] = 0;
}
mutex_unlock(&umem->odp_data->umem_mutex);
}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);

View file

@ -0,0 +1,94 @@
/*
* Copyright (c) 2014 Mellanox Technologies. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/interval_tree_generic.h>
#include <linux/sched.h>
#include <linux/gfp.h>
#include <rdma/ib_umem_odp.h>
/*
* The ib_umem list keeps track of memory regions for which the HW
* device request to receive notification when the related memory
* mapping is changed.
*
* ib_umem_lock protects the list.
*/
static inline u64 node_start(struct umem_odp_node *n)
{
struct ib_umem_odp *umem_odp =
container_of(n, struct ib_umem_odp, interval_tree);
return ib_umem_start(umem_odp->umem);
}
/* Note that the representation of the intervals in the interval tree
* considers the ending point as contained in the interval, while the
* function ib_umem_end returns the first address which is not contained
* in the umem.
*/
static inline u64 node_last(struct umem_odp_node *n)
{
struct ib_umem_odp *umem_odp =
container_of(n, struct ib_umem_odp, interval_tree);
return ib_umem_end(umem_odp->umem) - 1;
}
INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
node_start, node_last, , rbt_ib_umem)
/* @last is not a part of the interval. See comment for function
* node_last.
*/
int rbt_ib_umem_for_each_in_range(struct rb_root *root,
u64 start, u64 last,
umem_call_back cb,
void *cookie)
{
int ret_val = 0;
struct umem_odp_node *node;
struct ib_umem_odp *umem;
if (unlikely(start == last))
return ret_val;
for (node = rbt_ib_umem_iter_first(root, start, last - 1); node;
node = rbt_ib_umem_iter_next(node, start, last - 1)) {
umem = container_of(node, struct ib_umem_odp, interval_tree);
ret_val = cb(umem->umem, start, last, cookie) || ret_val;
}
return ret_val;
}

View file

@ -258,5 +258,6 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);
IB_UVERBS_DECLARE_EX_CMD(create_flow);
IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
IB_UVERBS_DECLARE_EX_CMD(query_device);
#endif /* UVERBS_H */

View file

@ -36,6 +36,7 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <asm/uaccess.h>
@ -288,6 +289,9 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
struct ib_uverbs_get_context_resp resp;
struct ib_udata udata;
struct ib_device *ibdev = file->device->ib_dev;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_device_attr dev_attr;
#endif
struct ib_ucontext *ucontext;
struct file *filp;
int ret;
@ -325,8 +329,25 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&ucontext->ah_list);
INIT_LIST_HEAD(&ucontext->xrcd_list);
INIT_LIST_HEAD(&ucontext->rule_list);
rcu_read_lock();
ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
rcu_read_unlock();
ucontext->closing = 0;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
ucontext->umem_tree = RB_ROOT;
init_rwsem(&ucontext->umem_rwsem);
ucontext->odp_mrs_count = 0;
INIT_LIST_HEAD(&ucontext->no_private_counters);
ret = ib_query_device(ibdev, &dev_attr);
if (ret)
goto err_free;
if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
ucontext->invalidate_range = NULL;
#endif
resp.num_comp_vectors = file->device->num_comp_vectors;
ret = get_unused_fd_flags(O_CLOEXEC);
@ -371,6 +392,7 @@ err_fd:
put_unused_fd(resp.async_fd);
err_free:
put_pid(ucontext->tgid);
ibdev->dealloc_ucontext(ucontext);
err:
@ -378,6 +400,52 @@ err:
return ret;
}
static void copy_query_dev_fields(struct ib_uverbs_file *file,
struct ib_uverbs_query_device_resp *resp,
struct ib_device_attr *attr)
{
resp->fw_ver = attr->fw_ver;
resp->node_guid = file->device->ib_dev->node_guid;
resp->sys_image_guid = attr->sys_image_guid;
resp->max_mr_size = attr->max_mr_size;
resp->page_size_cap = attr->page_size_cap;
resp->vendor_id = attr->vendor_id;
resp->vendor_part_id = attr->vendor_part_id;
resp->hw_ver = attr->hw_ver;
resp->max_qp = attr->max_qp;
resp->max_qp_wr = attr->max_qp_wr;
resp->device_cap_flags = attr->device_cap_flags;
resp->max_sge = attr->max_sge;
resp->max_sge_rd = attr->max_sge_rd;
resp->max_cq = attr->max_cq;
resp->max_cqe = attr->max_cqe;
resp->max_mr = attr->max_mr;
resp->max_pd = attr->max_pd;
resp->max_qp_rd_atom = attr->max_qp_rd_atom;
resp->max_ee_rd_atom = attr->max_ee_rd_atom;
resp->max_res_rd_atom = attr->max_res_rd_atom;
resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom;
resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom;
resp->atomic_cap = attr->atomic_cap;
resp->max_ee = attr->max_ee;
resp->max_rdd = attr->max_rdd;
resp->max_mw = attr->max_mw;
resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp;
resp->max_raw_ethy_qp = attr->max_raw_ethy_qp;
resp->max_mcast_grp = attr->max_mcast_grp;
resp->max_mcast_qp_attach = attr->max_mcast_qp_attach;
resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach;
resp->max_ah = attr->max_ah;
resp->max_fmr = attr->max_fmr;
resp->max_map_per_fmr = attr->max_map_per_fmr;
resp->max_srq = attr->max_srq;
resp->max_srq_wr = attr->max_srq_wr;
resp->max_srq_sge = attr->max_srq_sge;
resp->max_pkeys = attr->max_pkeys;
resp->local_ca_ack_delay = attr->local_ca_ack_delay;
resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt;
}
ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
const char __user *buf,
int in_len, int out_len)
@ -398,47 +466,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
return ret;
memset(&resp, 0, sizeof resp);
resp.fw_ver = attr.fw_ver;
resp.node_guid = file->device->ib_dev->node_guid;
resp.sys_image_guid = attr.sys_image_guid;
resp.max_mr_size = attr.max_mr_size;
resp.page_size_cap = attr.page_size_cap;
resp.vendor_id = attr.vendor_id;
resp.vendor_part_id = attr.vendor_part_id;
resp.hw_ver = attr.hw_ver;
resp.max_qp = attr.max_qp;
resp.max_qp_wr = attr.max_qp_wr;
resp.device_cap_flags = attr.device_cap_flags;
resp.max_sge = attr.max_sge;
resp.max_sge_rd = attr.max_sge_rd;
resp.max_cq = attr.max_cq;
resp.max_cqe = attr.max_cqe;
resp.max_mr = attr.max_mr;
resp.max_pd = attr.max_pd;
resp.max_qp_rd_atom = attr.max_qp_rd_atom;
resp.max_ee_rd_atom = attr.max_ee_rd_atom;
resp.max_res_rd_atom = attr.max_res_rd_atom;
resp.max_qp_init_rd_atom = attr.max_qp_init_rd_atom;
resp.max_ee_init_rd_atom = attr.max_ee_init_rd_atom;
resp.atomic_cap = attr.atomic_cap;
resp.max_ee = attr.max_ee;
resp.max_rdd = attr.max_rdd;
resp.max_mw = attr.max_mw;
resp.max_raw_ipv6_qp = attr.max_raw_ipv6_qp;
resp.max_raw_ethy_qp = attr.max_raw_ethy_qp;
resp.max_mcast_grp = attr.max_mcast_grp;
resp.max_mcast_qp_attach = attr.max_mcast_qp_attach;
resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach;
resp.max_ah = attr.max_ah;
resp.max_fmr = attr.max_fmr;
resp.max_map_per_fmr = attr.max_map_per_fmr;
resp.max_srq = attr.max_srq;
resp.max_srq_wr = attr.max_srq_wr;
resp.max_srq_sge = attr.max_srq_sge;
resp.max_pkeys = attr.max_pkeys;
resp.local_ca_ack_delay = attr.local_ca_ack_delay;
resp.phys_port_cnt = file->device->ib_dev->phys_port_cnt;
copy_query_dev_fields(file, &resp, &attr);
if (copy_to_user((void __user *) (unsigned long) cmd.response,
&resp, sizeof resp))
@ -947,6 +975,18 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
goto err_free;
}
if (cmd.access_flags & IB_ACCESS_ON_DEMAND) {
struct ib_device_attr attr;
ret = ib_query_device(pd->device, &attr);
if (ret || !(attr.device_cap_flags &
IB_DEVICE_ON_DEMAND_PAGING)) {
pr_debug("ODP support not available\n");
ret = -EINVAL;
goto err_put;
}
}
mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
cmd.access_flags, &udata);
if (IS_ERR(mr)) {
@ -3253,3 +3293,52 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
return ret ? ret : in_len;
}
int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
struct ib_udata *ucore,
struct ib_udata *uhw)
{
struct ib_uverbs_ex_query_device_resp resp;
struct ib_uverbs_ex_query_device cmd;
struct ib_device_attr attr;
struct ib_device *device;
int err;
device = file->device->ib_dev;
if (ucore->inlen < sizeof(cmd))
return -EINVAL;
err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
if (err)
return err;
if (cmd.reserved)
return -EINVAL;
err = device->query_device(device, &attr);
if (err)
return err;
memset(&resp, 0, sizeof(resp));
copy_query_dev_fields(file, &resp.base, &attr);
resp.comp_mask = 0;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (cmd.comp_mask & IB_USER_VERBS_EX_QUERY_DEVICE_ODP) {
resp.odp_caps.general_caps = attr.odp_caps.general_caps;
resp.odp_caps.per_transport_caps.rc_odp_caps =
attr.odp_caps.per_transport_caps.rc_odp_caps;
resp.odp_caps.per_transport_caps.uc_odp_caps =
attr.odp_caps.per_transport_caps.uc_odp_caps;
resp.odp_caps.per_transport_caps.ud_odp_caps =
attr.odp_caps.per_transport_caps.ud_odp_caps;
resp.comp_mask |= IB_USER_VERBS_EX_QUERY_DEVICE_ODP;
}
#endif
err = ib_copy_to_udata(ucore, &resp, sizeof(resp));
if (err)
return err;
return 0;
}

View file

@ -122,7 +122,8 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
struct ib_udata *ucore,
struct ib_udata *uhw) = {
[IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow,
[IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow
[IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow,
[IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device
};
static void ib_uverbs_add_one(struct ib_device *device);
@ -296,6 +297,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
kfree(uobj);
}
put_pid(context->tgid);
return context->device->dealloc_ucontext(context);
}

View file

@ -879,7 +879,8 @@ int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) {
rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, qp_attr->ah_attr.dmac);
rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac);
qp_attr->vlan_id = rdma_get_vlan_id(&sgid);
if (!(*qp_attr_mask & IB_QP_VID))
qp_attr->vlan_id = rdma_get_vlan_id(&sgid);
} else {
ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid,
qp_attr->ah_attr.dmac, &qp_attr->vlan_id);

View file

@ -476,7 +476,7 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
c2mr->umem->page_size,
i,
length,
c2mr->umem->offset,
ib_umem_offset(c2mr->umem),
&kva,
c2_convert_access(acc),
c2mr);

View file

@ -1640,7 +1640,8 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
__state_set(&ep->com, MPA_REQ_RCVD);
/* drive upcall */
mutex_lock(&ep->parent_ep->com.mutex);
mutex_lock_nested(&ep->parent_ep->com.mutex,
SINGLE_DEPTH_NESTING);
if (ep->parent_ep->com.state != DEAD) {
if (connect_request_upcall(ep))
abort_connection(ep, skb, GFP_KERNEL);
@ -3126,6 +3127,8 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
err = c4iw_wait_for_reply(&ep->com.dev->rdev,
&ep->com.wr_wait,
0, 0, __func__);
else if (err > 0)
err = net_xmit_errno(err);
if (err)
pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n",
err, ep->stid,
@ -3159,6 +3162,8 @@ static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
err = c4iw_wait_for_reply(&ep->com.dev->rdev,
&ep->com.wr_wait,
0, 0, __func__);
else if (err > 0)
err = net_xmit_errno(err);
}
if (err)
pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n"

View file

@ -670,7 +670,7 @@ static int ep_open(struct inode *inode, struct file *file)
idr_for_each(&epd->devp->stid_idr, count_idrs, &count);
spin_unlock_irq(&epd->devp->lock);
epd->bufsize = count * 160;
epd->bufsize = count * 240;
epd->buf = vmalloc(epd->bufsize);
if (!epd->buf) {
ret = -ENOMEM;

View file

@ -50,6 +50,13 @@ static int inline_threshold = C4IW_INLINE_THRESHOLD;
module_param(inline_threshold, int, 0644);
MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)");
static int mr_exceeds_hw_limits(struct c4iw_dev *dev, u64 length)
{
return (is_t4(dev->rdev.lldi.adapter_type) ||
is_t5(dev->rdev.lldi.adapter_type)) &&
length >= 8*1024*1024*1024ULL;
}
static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
u32 len, dma_addr_t data, int wait)
{
@ -369,9 +376,11 @@ static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
int ret;
ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid,
FW_RI_STAG_NSMR, mhp->attr.perms,
FW_RI_STAG_NSMR, mhp->attr.len ?
mhp->attr.perms : 0,
mhp->attr.mw_bind_enable, mhp->attr.zbva,
mhp->attr.va_fbo, mhp->attr.len, shift - 12,
mhp->attr.va_fbo, mhp->attr.len ?
mhp->attr.len : -1, shift - 12,
mhp->attr.pbl_size, mhp->attr.pbl_addr);
if (ret)
return ret;
@ -536,6 +545,11 @@ int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask,
return ret;
}
if (mr_exceeds_hw_limits(rhp, total_size)) {
kfree(page_list);
return -EINVAL;
}
ret = reregister_mem(rhp, php, &mh, shift, npages);
kfree(page_list);
if (ret)
@ -596,6 +610,12 @@ struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd,
if (ret)
goto err;
if (mr_exceeds_hw_limits(rhp, total_size)) {
kfree(page_list);
ret = -EINVAL;
goto err;
}
ret = alloc_pbl(mhp, npages);
if (ret) {
kfree(page_list);
@ -699,6 +719,10 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
php = to_c4iw_pd(pd);
rhp = php->rhp;
if (mr_exceeds_hw_limits(rhp, length))
return ERR_PTR(-EINVAL);
mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
if (!mhp)
return ERR_PTR(-ENOMEM);

View file

@ -1538,9 +1538,9 @@ err:
set_state(qhp, C4IW_QP_STATE_ERROR);
free = 1;
abort = 1;
wake_up(&qhp->wait);
BUG_ON(!ep);
flush_qp(qhp);
wake_up(&qhp->wait);
out:
mutex_unlock(&qhp->mutex);

View file

@ -399,7 +399,7 @@ reg_user_mr_fallback:
pginfo.num_kpages = num_kpages;
pginfo.num_hwpages = num_hwpages;
pginfo.u.usr.region = e_mr->umem;
pginfo.next_hwpage = e_mr->umem->offset / hwpage_size;
pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size;
pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl;
ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,

View file

@ -214,7 +214,7 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mr->mr.user_base = start;
mr->mr.iova = virt_addr;
mr->mr.length = length;
mr->mr.offset = umem->offset;
mr->mr.offset = ib_umem_offset(umem);
mr->mr.access_flags = mr_access_flags;
mr->mr.max_segs = n;
mr->umem = umem;

View file

@ -223,7 +223,6 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
if (flags & IB_MR_REREG_TRANS) {
int shift;
int err;
int n;
mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);

View file

@ -1,3 +1,4 @@
obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o
mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o
mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o

View file

@ -244,6 +244,12 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
props->max_mcast_grp;
props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)
props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
props->odp_caps = dev->odp_caps;
#endif
out:
kfree(in_mad);
kfree(out_mad);
@ -568,6 +574,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
goto out_count;
}
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
#endif
INIT_LIST_HEAD(&context->db_page_list);
mutex_init(&context->db_page_mutex);
@ -858,7 +868,7 @@ static ssize_t show_reg_pages(struct device *device,
struct mlx5_ib_dev *dev =
container_of(device, struct mlx5_ib_dev, ib_dev.dev);
return sprintf(buf, "%d\n", dev->mdev->priv.reg_pages);
return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
}
static ssize_t show_hca(struct device *device, struct device_attribute *attr,
@ -1321,6 +1331,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) |
(1ull << IB_USER_VERBS_CMD_OPEN_QP);
dev->ib_dev.uverbs_ex_cmd_mask =
(1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE);
dev->ib_dev.query_device = mlx5_ib_query_device;
dev->ib_dev.query_port = mlx5_ib_query_port;
@ -1366,6 +1378,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list;
dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status;
mlx5_ib_internal_query_odp_caps(dev);
if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) {
dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
@ -1379,16 +1393,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
goto err_eqs;
mutex_init(&dev->cap_mask_mutex);
spin_lock_init(&dev->mr_lock);
err = create_dev_resources(&dev->devr);
if (err)
goto err_eqs;
err = ib_register_device(&dev->ib_dev, NULL);
err = mlx5_ib_odp_init_one(dev);
if (err)
goto err_rsrc;
err = ib_register_device(&dev->ib_dev, NULL);
if (err)
goto err_odp;
err = create_umr_res(dev);
if (err)
goto err_dev;
@ -1410,6 +1427,9 @@ err_umrc:
err_dev:
ib_unregister_device(&dev->ib_dev);
err_odp:
mlx5_ib_odp_remove_one(dev);
err_rsrc:
destroy_dev_resources(&dev->devr);
@ -1425,8 +1445,10 @@ err_dealloc:
static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
{
struct mlx5_ib_dev *dev = context;
ib_unregister_device(&dev->ib_dev);
destroy_umrc_res(dev);
mlx5_ib_odp_remove_one(dev);
destroy_dev_resources(&dev->devr);
free_comp_eqs(dev);
ib_dealloc_device(&dev->ib_dev);
@ -1440,15 +1462,30 @@ static struct mlx5_interface mlx5_ib_interface = {
static int __init mlx5_ib_init(void)
{
int err;
if (deprecated_prof_sel != 2)
pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
return mlx5_register_interface(&mlx5_ib_interface);
err = mlx5_ib_odp_init();
if (err)
return err;
err = mlx5_register_interface(&mlx5_ib_interface);
if (err)
goto clean_odp;
return err;
clean_odp:
mlx5_ib_odp_cleanup();
return err;
}
static void __exit mlx5_ib_cleanup(void)
{
mlx5_unregister_interface(&mlx5_ib_interface);
mlx5_ib_odp_cleanup();
}
module_init(mlx5_ib_init);

View file

@ -32,6 +32,7 @@
#include <linux/module.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include "mlx5_ib.h"
/* @umem: umem object to scan
@ -57,6 +58,17 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
int entry;
unsigned long page_shift = ilog2(umem->page_size);
/* With ODP we must always match OS page size. */
if (umem->odp_data) {
*count = ib_umem_page_count(umem);
*shift = PAGE_SHIFT;
*ncont = *count;
if (order)
*order = ilog2(roundup_pow_of_two(*count));
return;
}
addr = addr >> page_shift;
tmp = (unsigned long)addr;
m = find_first_bit(&tmp, sizeof(tmp));
@ -108,8 +120,36 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
*count = i;
}
void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
int page_shift, __be64 *pas, int umr)
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
{
u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
if (umem_dma & ODP_READ_ALLOWED_BIT)
mtt_entry |= MLX5_IB_MTT_READ;
if (umem_dma & ODP_WRITE_ALLOWED_BIT)
mtt_entry |= MLX5_IB_MTT_WRITE;
return mtt_entry;
}
#endif
/*
* Populate the given array with bus addresses from the umem.
*
* dev - mlx5_ib device
* umem - umem to use to fill the pages
* page_shift - determines the page size used in the resulting array
* offset - offset into the umem to start from,
* only implemented for ODP umems
* num_pages - total number of pages to fill
* pas - bus addresses array to fill
* access_flags - access flags to set on all present pages.
use enum mlx5_ib_mtt_access_flags for this.
*/
void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
int page_shift, size_t offset, size_t num_pages,
__be64 *pas, int access_flags)
{
unsigned long umem_page_shift = ilog2(umem->page_size);
int shift = page_shift - umem_page_shift;
@ -120,6 +160,21 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
int len;
struct scatterlist *sg;
int entry;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
const bool odp = umem->odp_data != NULL;
if (odp) {
WARN_ON(shift != 0);
WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE));
for (i = 0; i < num_pages; ++i) {
dma_addr_t pa = umem->odp_data->dma_list[offset + i];
pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
}
return;
}
#endif
i = 0;
for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
@ -128,8 +183,7 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
for (k = 0; k < len; k++) {
if (!(i & mask)) {
cur = base + (k << umem_page_shift);
if (umr)
cur |= 3;
cur |= access_flags;
pas[i >> shift] = cpu_to_be64(cur);
mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n",
@ -142,6 +196,13 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
}
}
void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
int page_shift, __be64 *pas, int access_flags)
{
return __mlx5_ib_populate_pas(dev, umem, page_shift, 0,
ib_umem_num_pages(umem), pas,
access_flags);
}
int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset)
{
u64 page_size;

View file

@ -111,6 +111,8 @@ struct mlx5_ib_pd {
*/
#define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START
#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1)
#define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2)
#define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1
#define MLX5_IB_WR_UMR IB_WR_RESERVED1
@ -147,6 +149,29 @@ enum {
MLX5_QP_EMPTY
};
/*
* Connect-IB can trigger up to four concurrent pagefaults
* per-QP.
*/
enum mlx5_ib_pagefault_context {
MLX5_IB_PAGEFAULT_RESPONDER_READ,
MLX5_IB_PAGEFAULT_REQUESTOR_READ,
MLX5_IB_PAGEFAULT_RESPONDER_WRITE,
MLX5_IB_PAGEFAULT_REQUESTOR_WRITE,
MLX5_IB_PAGEFAULT_CONTEXTS
};
static inline enum mlx5_ib_pagefault_context
mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault)
{
return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE);
}
struct mlx5_ib_pfault {
struct work_struct work;
struct mlx5_pagefault mpfault;
};
struct mlx5_ib_qp {
struct ib_qp ibqp;
struct mlx5_core_qp mqp;
@ -192,6 +217,21 @@ struct mlx5_ib_qp {
/* Store signature errors */
bool signature_en;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
/*
* A flag that is true for QP's that are in a state that doesn't
* allow page faults, and shouldn't schedule any more faults.
*/
int disable_page_faults;
/*
* The disable_page_faults_lock protects a QP's disable_page_faults
* field, allowing for a thread to atomically check whether the QP
* allows page faults, and if so schedule a page fault.
*/
spinlock_t disable_page_faults_lock;
struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
#endif
};
struct mlx5_ib_cq_buf {
@ -206,6 +246,19 @@ enum mlx5_ib_qp_flags {
MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1,
};
struct mlx5_umr_wr {
union {
u64 virt_addr;
u64 offset;
} target;
struct ib_pd *pd;
unsigned int page_shift;
unsigned int npages;
u32 length;
int access_flags;
u32 mkey;
};
struct mlx5_shared_mr_info {
int mr_id;
struct ib_umem *umem;
@ -253,6 +306,13 @@ struct mlx5_ib_xrcd {
u32 xrcdn;
};
enum mlx5_ib_mtt_access_flags {
MLX5_IB_MTT_READ = (1 << 0),
MLX5_IB_MTT_WRITE = (1 << 1),
};
#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)
struct mlx5_ib_mr {
struct ib_mr ibmr;
struct mlx5_core_mr mmr;
@ -261,12 +321,11 @@ struct mlx5_ib_mr {
struct list_head list;
int order;
int umred;
__be64 *pas;
dma_addr_t dma;
int npages;
struct mlx5_ib_dev *dev;
struct mlx5_create_mkey_mbox_out out;
struct mlx5_core_sig_ctx *sig;
int live;
};
struct mlx5_ib_fast_reg_page_list {
@ -372,11 +431,18 @@ struct mlx5_ib_dev {
struct umr_common umrc;
/* sync used page count stats
*/
spinlock_t mr_lock;
struct mlx5_ib_resources devr;
struct mlx5_mr_cache cache;
struct timer_list delay_timer;
int fill_delay;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_odp_caps odp_caps;
/*
* Sleepable RCU that prevents destruction of MRs while they are still
* being used by a page fault handler.
*/
struct srcu_struct mr_srcu;
#endif
};
static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@ -490,6 +556,8 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
struct ib_recv_wr **bad_wr);
void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n);
int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
void *buffer, u32 length);
struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries,
int vector, struct ib_ucontext *context,
struct ib_udata *udata);
@ -502,6 +570,8 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc);
struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int access_flags,
struct ib_udata *udata);
int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index,
int npages, int zap);
int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
int mlx5_ib_destroy_mr(struct ib_mr *ibmr);
struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
@ -533,8 +603,11 @@ int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev);
void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev);
void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
int *ncont, int *order);
void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
int page_shift, size_t offset, size_t num_pages,
__be64 *pas, int access_flags);
void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
int page_shift, __be64 *pas, int umr);
int page_shift, __be64 *pas, int access_flags);
void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq);
int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
@ -544,6 +617,38 @@ void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context);
int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
struct ib_mr_status *mr_status);
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
extern struct workqueue_struct *mlx5_ib_page_fault_wq;
int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev);
void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
struct mlx5_ib_pfault *pfault);
void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp);
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev);
int __init mlx5_ib_odp_init(void);
void mlx5_ib_odp_cleanup(void);
void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
unsigned long end);
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
{
return 0;
}
static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {}
static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {}
static inline int mlx5_ib_odp_init(void) { return 0; }
static inline void mlx5_ib_odp_cleanup(void) {}
static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {}
static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {}
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline void init_query_mad(struct ib_smp *mad)
{
mad->base_version = 1;
@ -561,4 +666,7 @@ static inline u8 convert_access(int acc)
MLX5_PERM_LOCAL_READ;
}
#define MLX5_MAX_UMR_SHIFT 16
#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT)
#endif /* MLX5_IB_H */

View file

@ -37,21 +37,34 @@
#include <linux/export.h>
#include <linux/delay.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <rdma/ib_verbs.h>
#include "mlx5_ib.h"
enum {
MAX_PENDING_REG_MR = 8,
};
enum {
MLX5_UMR_ALIGN = 2048
};
#define MLX5_UMR_ALIGN 2048
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
static __be64 mlx5_ib_update_mtt_emergency_buffer[
MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)]
__aligned(MLX5_UMR_ALIGN);
static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
#endif
static __be64 *mr_align(__be64 *ptr, int align)
static int clean_mr(struct mlx5_ib_mr *mr);
static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
unsigned long mask = align - 1;
int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
return (__be64 *)(((unsigned long)ptr + mask) & ~mask);
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
/* Wait until all page fault handlers using the mr complete. */
synchronize_srcu(&dev->mr_srcu);
#endif
return err;
}
static int order2idx(struct mlx5_ib_dev *dev, int order)
@ -146,7 +159,7 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
mr->order = ent->order;
mr->umred = 1;
mr->dev = dev;
in->seg.status = 1 << 6;
in->seg.status = MLX5_MKEY_STATUS_FREE;
in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2);
in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN;
@ -191,7 +204,7 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
ent->cur--;
ent->size--;
spin_unlock_irq(&ent->lock);
err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
err = destroy_mkey(dev, mr);
if (err)
mlx5_ib_warn(dev, "failed destroy mkey\n");
else
@ -482,7 +495,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
ent->cur--;
ent->size--;
spin_unlock_irq(&ent->lock);
err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
err = destroy_mkey(dev, mr);
if (err)
mlx5_ib_warn(dev, "failed destroy mkey\n");
else
@ -668,7 +681,7 @@ static int get_octo_len(u64 addr, u64 len, int page_size)
static int use_umr(int order)
{
return order <= 17;
return order <= MLX5_MAX_UMR_SHIFT;
}
static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
@ -678,6 +691,7 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct ib_mr *mr = dev->umrc.mr;
struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
sg->addr = dma;
sg->length = ALIGN(sizeof(u64) * n, 64);
@ -692,21 +706,24 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
wr->num_sge = 0;
wr->opcode = MLX5_IB_WR_UMR;
wr->wr.fast_reg.page_list_len = n;
wr->wr.fast_reg.page_shift = page_shift;
wr->wr.fast_reg.rkey = key;
wr->wr.fast_reg.iova_start = virt_addr;
wr->wr.fast_reg.length = len;
wr->wr.fast_reg.access_flags = access_flags;
wr->wr.fast_reg.page_list = (struct ib_fast_reg_page_list *)pd;
umrwr->npages = n;
umrwr->page_shift = page_shift;
umrwr->mkey = key;
umrwr->target.virt_addr = virt_addr;
umrwr->length = len;
umrwr->access_flags = access_flags;
umrwr->pd = pd;
}
static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
struct ib_send_wr *wr, u32 key)
{
wr->send_flags = MLX5_IB_SEND_UMR_UNREG;
struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE;
wr->opcode = MLX5_IB_WR_UMR;
wr->wr.fast_reg.rkey = key;
umrwr->mkey = key;
}
void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context)
@ -742,7 +759,10 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
struct ib_send_wr wr, *bad;
struct mlx5_ib_mr *mr;
struct ib_sge sg;
int size = sizeof(u64) * npages;
int size;
__be64 *mr_pas;
__be64 *pas;
dma_addr_t dma;
int err = 0;
int i;
@ -761,25 +781,31 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
if (!mr)
return ERR_PTR(-EAGAIN);
mr->pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
if (!mr->pas) {
/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes.
* To avoid copying garbage after the pas array, we allocate
* a little more. */
size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT);
mr_pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
if (!mr_pas) {
err = -ENOMEM;
goto free_mr;
}
mlx5_ib_populate_pas(dev, umem, page_shift,
mr_align(mr->pas, MLX5_UMR_ALIGN), 1);
pas = PTR_ALIGN(mr_pas, MLX5_UMR_ALIGN);
mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT);
/* Clear padding after the actual pages. */
memset(pas + npages, 0, size - npages * sizeof(u64));
mr->dma = dma_map_single(ddev, mr_align(mr->pas, MLX5_UMR_ALIGN), size,
DMA_TO_DEVICE);
if (dma_mapping_error(ddev, mr->dma)) {
dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
if (dma_mapping_error(ddev, dma)) {
err = -ENOMEM;
goto free_pas;
}
memset(&wr, 0, sizeof(wr));
wr.wr_id = (u64)(unsigned long)&umr_context;
prep_umr_reg_wqe(pd, &wr, &sg, mr->dma, npages, mr->mmr.key, page_shift, virt_addr, len, access_flags);
prep_umr_reg_wqe(pd, &wr, &sg, dma, npages, mr->mmr.key, page_shift,
virt_addr, len, access_flags);
mlx5_ib_init_umr_context(&umr_context);
down(&umrc->sem);
@ -799,12 +825,14 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
mr->mmr.size = len;
mr->mmr.pd = to_mpd(pd)->pdn;
mr->live = 1;
unmap_dma:
up(&umrc->sem);
dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE);
dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
free_pas:
kfree(mr->pas);
kfree(mr_pas);
free_mr:
if (err) {
@ -815,6 +843,128 @@ free_mr:
return mr;
}
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
int zap)
{
struct mlx5_ib_dev *dev = mr->dev;
struct device *ddev = dev->ib_dev.dma_device;
struct umr_common *umrc = &dev->umrc;
struct mlx5_ib_umr_context umr_context;
struct ib_umem *umem = mr->umem;
int size;
__be64 *pas;
dma_addr_t dma;
struct ib_send_wr wr, *bad;
struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr.wr.fast_reg;
struct ib_sge sg;
int err = 0;
const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64);
const int page_index_mask = page_index_alignment - 1;
size_t pages_mapped = 0;
size_t pages_to_map = 0;
size_t pages_iter = 0;
int use_emergency_buf = 0;
/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
* so we need to align the offset and length accordingly */
if (start_page_index & page_index_mask) {
npages += start_page_index & page_index_mask;
start_page_index &= ~page_index_mask;
}
pages_to_map = ALIGN(npages, page_index_alignment);
if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES)
return -EINVAL;
size = sizeof(u64) * pages_to_map;
size = min_t(int, PAGE_SIZE, size);
/* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim
* code, when we are called from an invalidation. The pas buffer must
* be 2k-aligned for Connect-IB. */
pas = (__be64 *)get_zeroed_page(GFP_ATOMIC);
if (!pas) {
mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n");
pas = mlx5_ib_update_mtt_emergency_buffer;
size = MLX5_UMR_MTT_MIN_CHUNK_SIZE;
use_emergency_buf = 1;
mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
memset(pas, 0, size);
}
pages_iter = size / sizeof(u64);
dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
if (dma_mapping_error(ddev, dma)) {
mlx5_ib_err(dev, "unable to map DMA during MTT update.\n");
err = -ENOMEM;
goto free_pas;
}
for (pages_mapped = 0;
pages_mapped < pages_to_map && !err;
pages_mapped += pages_iter, start_page_index += pages_iter) {
dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
npages = min_t(size_t,
pages_iter,
ib_umem_num_pages(umem) - start_page_index);
if (!zap) {
__mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT,
start_page_index, npages, pas,
MLX5_IB_MTT_PRESENT);
/* Clear padding after the pages brought from the
* umem. */
memset(pas + npages, 0, size - npages * sizeof(u64));
}
dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
memset(&wr, 0, sizeof(wr));
wr.wr_id = (u64)(unsigned long)&umr_context;
sg.addr = dma;
sg.length = ALIGN(npages * sizeof(u64),
MLX5_UMR_MTT_ALIGNMENT);
sg.lkey = dev->umrc.mr->lkey;
wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
MLX5_IB_SEND_UMR_UPDATE_MTT;
wr.sg_list = &sg;
wr.num_sge = 1;
wr.opcode = MLX5_IB_WR_UMR;
umrwr->npages = sg.length / sizeof(u64);
umrwr->page_shift = PAGE_SHIFT;
umrwr->mkey = mr->mmr.key;
umrwr->target.offset = start_page_index;
mlx5_ib_init_umr_context(&umr_context);
down(&umrc->sem);
err = ib_post_send(umrc->qp, &wr, &bad);
if (err) {
mlx5_ib_err(dev, "UMR post send failed, err %d\n", err);
} else {
wait_for_completion(&umr_context.done);
if (umr_context.status != IB_WC_SUCCESS) {
mlx5_ib_err(dev, "UMR completion failed, code %d\n",
umr_context.status);
err = -EFAULT;
}
}
up(&umrc->sem);
}
dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
free_pas:
if (!use_emergency_buf)
free_page((unsigned long)pas);
else
mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
return err;
}
#endif
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
u64 length, struct ib_umem *umem,
int npages, int page_shift,
@ -825,6 +975,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
struct mlx5_ib_mr *mr;
int inlen;
int err;
bool pg_cap = !!(dev->mdev->caps.gen.flags &
MLX5_DEV_CAP_FLAG_ON_DMND_PG);
mr = kzalloc(sizeof(*mr), GFP_KERNEL);
if (!mr)
@ -836,8 +988,12 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
err = -ENOMEM;
goto err_1;
}
mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, 0);
mlx5_ib_populate_pas(dev, umem, page_shift, in->pas,
pg_cap ? MLX5_IB_MTT_PRESENT : 0);
/* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags
* in the page list submitted with the command. */
in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0;
in->seg.flags = convert_access(access_flags) |
MLX5_ACCESS_MODE_MTT;
in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
@ -856,6 +1012,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
goto err_2;
}
mr->umem = umem;
mr->live = 1;
kvfree(in);
mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key);
@ -910,6 +1067,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mlx5_ib_dbg(dev, "cache empty for order %d", order);
mr = NULL;
}
} else if (access_flags & IB_ACCESS_ON_DEMAND) {
err = -EINVAL;
pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB");
goto error;
}
if (!mr)
@ -925,16 +1086,51 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mr->umem = umem;
mr->npages = npages;
spin_lock(&dev->mr_lock);
dev->mdev->priv.reg_pages += npages;
spin_unlock(&dev->mr_lock);
atomic_add(npages, &dev->mdev->priv.reg_pages);
mr->ibmr.lkey = mr->mmr.key;
mr->ibmr.rkey = mr->mmr.key;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (umem->odp_data) {
/*
* This barrier prevents the compiler from moving the
* setting of umem->odp_data->private to point to our
* MR, before reg_umr finished, to ensure that the MR
* initialization have finished before starting to
* handle invalidations.
*/
smp_wmb();
mr->umem->odp_data->private = mr;
/*
* Make sure we will see the new
* umem->odp_data->private value in the invalidation
* routines, before we can get page faults on the
* MR. Page faults can happen once we put the MR in
* the tree, below this line. Without the barrier,
* there can be a fault handling and an invalidation
* before umem->odp_data->private == mr is visible to
* the invalidation handler.
*/
smp_wmb();
}
#endif
return &mr->ibmr;
error:
/*
* Destroy the umem *before* destroying the MR, to ensure we
* will not have any in-flight notifiers when destroying the
* MR.
*
* As the MR is completely invalid to begin with, and this
* error path is only taken if we can't push the mr entry into
* the pagefault tree, this is safe.
*/
ib_umem_release(umem);
/* Kill the MR, and return an error code. */
clean_mr(mr);
return ERR_PTR(err);
}
@ -971,17 +1167,14 @@ error:
return err;
}
int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
static int clean_mr(struct mlx5_ib_mr *mr)
{
struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
struct mlx5_ib_mr *mr = to_mmr(ibmr);
struct ib_umem *umem = mr->umem;
int npages = mr->npages;
struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
int umred = mr->umred;
int err;
if (!umred) {
err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
err = destroy_mkey(dev, mr);
if (err) {
mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
mr->mmr.key, err);
@ -996,19 +1189,51 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
free_cached_mr(dev, mr);
}
if (umem) {
ib_umem_release(umem);
spin_lock(&dev->mr_lock);
dev->mdev->priv.reg_pages -= npages;
spin_unlock(&dev->mr_lock);
}
if (!umred)
kfree(mr);
return 0;
}
int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
struct mlx5_ib_mr *mr = to_mmr(ibmr);
int npages = mr->npages;
struct ib_umem *umem = mr->umem;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (umem && umem->odp_data) {
/* Prevent new page faults from succeeding */
mr->live = 0;
/* Wait for all running page-fault handlers to finish. */
synchronize_srcu(&dev->mr_srcu);
/* Destroy all page mappings */
mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
ib_umem_end(umem));
/*
* We kill the umem before the MR for ODP,
* so that there will not be any invalidations in
* flight, looking at the *mr struct.
*/
ib_umem_release(umem);
atomic_sub(npages, &dev->mdev->priv.reg_pages);
/* Avoid double-freeing the umem. */
umem = NULL;
}
#endif
clean_mr(mr);
if (umem) {
ib_umem_release(umem);
atomic_sub(npages, &dev->mdev->priv.reg_pages);
}
return 0;
}
struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
struct ib_mr_init_attr *mr_init_attr)
{
@ -1028,7 +1253,7 @@ struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
goto err_free;
}
in->seg.status = 1 << 6; /* free */
in->seg.status = MLX5_MKEY_STATUS_FREE;
in->seg.xlt_oct_size = cpu_to_be32(ndescs);
in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
@ -1113,7 +1338,7 @@ int mlx5_ib_destroy_mr(struct ib_mr *ibmr)
kfree(mr->sig);
}
err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
err = destroy_mkey(dev, mr);
if (err) {
mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
mr->mmr.key, err);
@ -1143,7 +1368,7 @@ struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
goto err_free;
}
in->seg.status = 1 << 6; /* free */
in->seg.status = MLX5_MKEY_STATUS_FREE;
in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2);
in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT;

View file

@ -0,0 +1,798 @@
/*
* Copyright (c) 2014 Mellanox Technologies. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include "mlx5_ib.h"
#define MAX_PREFETCH_LEN (4*1024*1024U)
/* Timeout in ms to wait for an active mmu notifier to complete when handling
* a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000
struct workqueue_struct *mlx5_ib_page_fault_wq;
void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
unsigned long end)
{
struct mlx5_ib_mr *mr;
const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
u64 idx = 0, blk_start_idx = 0;
int in_block = 0;
u64 addr;
if (!umem || !umem->odp_data) {
pr_err("invalidation called on NULL umem or non-ODP umem\n");
return;
}
mr = umem->odp_data->private;
if (!mr || !mr->ibmr.pd)
return;
start = max_t(u64, ib_umem_start(umem), start);
end = min_t(u64, ib_umem_end(umem), end);
/*
* Iteration one - zap the HW's MTTs. The notifiers_count ensures that
* while we are doing the invalidation, no page fault will attempt to
* overwrite the same MTTs. Concurent invalidations might race us,
* but they will write 0s as well, so no difference in the end result.
*/
for (addr = start; addr < end; addr += (u64)umem->page_size) {
idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
/*
* Strive to write the MTTs in chunks, but avoid overwriting
* non-existing MTTs. The huristic here can be improved to
* estimate the cost of another UMR vs. the cost of bigger
* UMR.
*/
if (umem->odp_data->dma_list[idx] &
(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
if (!in_block) {
blk_start_idx = idx;
in_block = 1;
}
} else {
u64 umr_offset = idx & umr_block_mask;
if (in_block && umr_offset == 0) {
mlx5_ib_update_mtt(mr, blk_start_idx,
idx - blk_start_idx, 1);
in_block = 0;
}
}
}
if (in_block)
mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1,
1);
/*
* We are now sure that the device will not access the
* memory. We can safely unmap it, and mark it as dirty if
* needed.
*/
ib_umem_odp_unmap_dma_pages(umem, start, end);
}
#define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do { \
if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name) \
ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name; \
} while (0)
int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
{
int err;
struct mlx5_odp_caps hw_caps;
struct ib_odp_caps *caps = &dev->odp_caps;
memset(caps, 0, sizeof(*caps));
if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG))
return 0;
err = mlx5_query_odp_caps(dev->mdev, &hw_caps);
if (err)
goto out;
caps->general_caps = IB_ODP_SUPPORT;
COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps,
SEND);
COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
SEND);
COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
RECV);
COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
WRITE);
COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
READ);
out:
return err;
}
static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
u32 key)
{
u32 base_key = mlx5_base_mkey(key);
struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key);
struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr);
if (!mmr || mmr->key != key || !mr->live)
return NULL;
return container_of(mmr, struct mlx5_ib_mr, mmr);
}
static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
struct mlx5_ib_pfault *pfault,
int error) {
struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn,
pfault->mpfault.flags,
error);
if (ret)
pr_err("Failed to resolve the page fault on QP 0x%x\n",
qp->mqp.qpn);
}
/*
* Handle a single data segment in a page-fault WQE.
*
* Returns number of pages retrieved on success. The caller will continue to
* the next data segment.
* Can return the following error codes:
* -EAGAIN to designate a temporary error. The caller will abort handling the
* page fault and resolve it.
* -EFAULT when there's an error mapping the requested pages. The caller will
* abort the page fault handling and possibly move the QP to an error state.
* On other errors the QP should also be closed with an error.
*/
static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
struct mlx5_ib_pfault *pfault,
u32 key, u64 io_virt, size_t bcnt,
u32 *bytes_mapped)
{
struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device);
int srcu_key;
unsigned int current_seq;
u64 start_idx;
int npages = 0, ret = 0;
struct mlx5_ib_mr *mr;
u64 access_mask = ODP_READ_ALLOWED_BIT;
srcu_key = srcu_read_lock(&mib_dev->mr_srcu);
mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key);
/*
* If we didn't find the MR, it means the MR was closed while we were
* handling the ODP event. In this case we return -EFAULT so that the
* QP will be closed.
*/
if (!mr || !mr->ibmr.pd) {
pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
key);
ret = -EFAULT;
goto srcu_unlock;
}
if (!mr->umem->odp_data) {
pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
key);
if (bytes_mapped)
*bytes_mapped +=
(bcnt - pfault->mpfault.bytes_committed);
goto srcu_unlock;
}
if (mr->ibmr.pd != qp->ibqp.pd) {
pr_err("Page-fault with different PDs for QP and MR.\n");
ret = -EFAULT;
goto srcu_unlock;
}
current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
/*
* Ensure the sequence number is valid for some time before we call
* gup.
*/
smp_rmb();
/*
* Avoid branches - this code will perform correctly
* in all iterations (in iteration 2 and above,
* bytes_committed == 0).
*/
io_virt += pfault->mpfault.bytes_committed;
bcnt -= pfault->mpfault.bytes_committed;
start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT;
if (mr->umem->writable)
access_mask |= ODP_WRITE_ALLOWED_BIT;
npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt,
access_mask, current_seq);
if (npages < 0) {
ret = npages;
goto srcu_unlock;
}
if (npages > 0) {
mutex_lock(&mr->umem->odp_data->umem_mutex);
if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
/*
* No need to check whether the MTTs really belong to
* this MR, since ib_umem_odp_map_dma_pages already
* checks this.
*/
ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
} else {
ret = -EAGAIN;
}
mutex_unlock(&mr->umem->odp_data->umem_mutex);
if (ret < 0) {
if (ret != -EAGAIN)
pr_err("Failed to update mkey page tables\n");
goto srcu_unlock;
}
if (bytes_mapped) {
u32 new_mappings = npages * PAGE_SIZE -
(io_virt - round_down(io_virt, PAGE_SIZE));
*bytes_mapped += min_t(u32, new_mappings, bcnt);
}
}
srcu_unlock:
if (ret == -EAGAIN) {
if (!mr->umem->odp_data->dying) {
struct ib_umem_odp *odp_data = mr->umem->odp_data;
unsigned long timeout =
msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
if (!wait_for_completion_timeout(
&odp_data->notifier_completion,
timeout)) {
pr_warn("timeout waiting for mmu notifier completion\n");
}
} else {
/* The MR is being killed, kill the QP as well. */
ret = -EFAULT;
}
}
srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
pfault->mpfault.bytes_committed = 0;
return ret ? ret : npages;
}
/**
* Parse a series of data segments for page fault handling.
*
* @qp the QP on which the fault occurred.
* @pfault contains page fault information.
* @wqe points at the first data segment in the WQE.
* @wqe_end points after the end of the WQE.
* @bytes_mapped receives the number of bytes that the function was able to
* map. This allows the caller to decide intelligently whether
* enough memory was mapped to resolve the page fault
* successfully (e.g. enough for the next MTU, or the entire
* WQE).
* @total_wqe_bytes receives the total data size of this WQE in bytes (minus
* the committed bytes).
*
* Returns the number of pages loaded if positive, zero for an empty WQE, or a
* negative error code.
*/
static int pagefault_data_segments(struct mlx5_ib_qp *qp,
struct mlx5_ib_pfault *pfault, void *wqe,
void *wqe_end, u32 *bytes_mapped,
u32 *total_wqe_bytes, int receive_queue)
{
int ret = 0, npages = 0;
u64 io_virt;
u32 key;
u32 byte_count;
size_t bcnt;
int inline_segment;
/* Skip SRQ next-WQE segment. */
if (receive_queue && qp->ibqp.srq)
wqe += sizeof(struct mlx5_wqe_srq_next_seg);
if (bytes_mapped)
*bytes_mapped = 0;
if (total_wqe_bytes)
*total_wqe_bytes = 0;
while (wqe < wqe_end) {
struct mlx5_wqe_data_seg *dseg = wqe;
io_virt = be64_to_cpu(dseg->addr);
key = be32_to_cpu(dseg->lkey);
byte_count = be32_to_cpu(dseg->byte_count);
inline_segment = !!(byte_count & MLX5_INLINE_SEG);
bcnt = byte_count & ~MLX5_INLINE_SEG;
if (inline_segment) {
bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
16);
} else {
wqe += sizeof(*dseg);
}
/* receive WQE end of sg list. */
if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
io_virt == 0)
break;
if (!inline_segment && total_wqe_bytes) {
*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
pfault->mpfault.bytes_committed);
}
/* A zero length data segment designates a length of 2GB. */
if (bcnt == 0)
bcnt = 1U << 31;
if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) {
pfault->mpfault.bytes_committed -=
min_t(size_t, bcnt,
pfault->mpfault.bytes_committed);
continue;
}
ret = pagefault_single_data_segment(qp, pfault, key, io_virt,
bcnt, bytes_mapped);
if (ret < 0)
break;
npages += ret;
}
return ret < 0 ? ret : npages;
}
/*
* Parse initiator WQE. Advances the wqe pointer to point at the
* scatter-gather list, and set wqe_end to the end of the WQE.
*/
static int mlx5_ib_mr_initiator_pfault_handler(
struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
void **wqe, void **wqe_end, int wqe_length)
{
struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
u16 wqe_index = pfault->mpfault.wqe.wqe_index;
unsigned ds, opcode;
#if defined(DEBUG)
u32 ctrl_wqe_index, ctrl_qpn;
#endif
ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
ds, wqe_length);
return -EFAULT;
}
if (ds == 0) {
mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
wqe_index, qp->mqp.qpn);
return -EFAULT;
}
#if defined(DEBUG)
ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
if (wqe_index != ctrl_wqe_index) {
mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
wqe_index, qp->mqp.qpn,
ctrl_wqe_index);
return -EFAULT;
}
ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
MLX5_WQE_CTRL_QPN_SHIFT;
if (qp->mqp.qpn != ctrl_qpn) {
mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
wqe_index, qp->mqp.qpn,
ctrl_qpn);
return -EFAULT;
}
#endif /* DEBUG */
*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
*wqe += sizeof(*ctrl);
opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
MLX5_WQE_CTRL_OPCODE_MASK;
switch (qp->ibqp.qp_type) {
case IB_QPT_RC:
switch (opcode) {
case MLX5_OPCODE_SEND:
case MLX5_OPCODE_SEND_IMM:
case MLX5_OPCODE_SEND_INVAL:
if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
IB_ODP_SUPPORT_SEND))
goto invalid_transport_or_opcode;
break;
case MLX5_OPCODE_RDMA_WRITE:
case MLX5_OPCODE_RDMA_WRITE_IMM:
if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
IB_ODP_SUPPORT_WRITE))
goto invalid_transport_or_opcode;
*wqe += sizeof(struct mlx5_wqe_raddr_seg);
break;
case MLX5_OPCODE_RDMA_READ:
if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
IB_ODP_SUPPORT_READ))
goto invalid_transport_or_opcode;
*wqe += sizeof(struct mlx5_wqe_raddr_seg);
break;
default:
goto invalid_transport_or_opcode;
}
break;
case IB_QPT_UD:
switch (opcode) {
case MLX5_OPCODE_SEND:
case MLX5_OPCODE_SEND_IMM:
if (!(dev->odp_caps.per_transport_caps.ud_odp_caps &
IB_ODP_SUPPORT_SEND))
goto invalid_transport_or_opcode;
*wqe += sizeof(struct mlx5_wqe_datagram_seg);
break;
default:
goto invalid_transport_or_opcode;
}
break;
default:
invalid_transport_or_opcode:
mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n",
qp->ibqp.qp_type, opcode);
return -EFAULT;
}
return 0;
}
/*
* Parse responder WQE. Advances the wqe pointer to point at the
* scatter-gather list, and set wqe_end to the end of the WQE.
*/
static int mlx5_ib_mr_responder_pfault_handler(
struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
void **wqe, void **wqe_end, int wqe_length)
{
struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
struct mlx5_ib_wq *wq = &qp->rq;
int wqe_size = 1 << wq->wqe_shift;
if (qp->ibqp.srq) {
mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
return -EFAULT;
}
if (qp->wq_sig) {
mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
return -EFAULT;
}
if (wqe_size > wqe_length) {
mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
return -EFAULT;
}
switch (qp->ibqp.qp_type) {
case IB_QPT_RC:
if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
IB_ODP_SUPPORT_RECV))
goto invalid_transport_or_opcode;
break;
default:
invalid_transport_or_opcode:
mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
qp->ibqp.qp_type);
return -EFAULT;
}
*wqe_end = *wqe + wqe_size;
return 0;
}
static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
struct mlx5_ib_pfault *pfault)
{
struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
int ret;
void *wqe, *wqe_end;
u32 bytes_mapped, total_wqe_bytes;
char *buffer = NULL;
int resume_with_error = 0;
u16 wqe_index = pfault->mpfault.wqe.wqe_index;
int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR;
buffer = (char *)__get_free_page(GFP_KERNEL);
if (!buffer) {
mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
resume_with_error = 1;
goto resolve_page_fault;
}
ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
PAGE_SIZE);
if (ret < 0) {
mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n",
-ret, wqe_index, qp->mqp.qpn);
resume_with_error = 1;
goto resolve_page_fault;
}
wqe = buffer;
if (requestor)
ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe,
&wqe_end, ret);
else
ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe,
&wqe_end, ret);
if (ret < 0) {
resume_with_error = 1;
goto resolve_page_fault;
}
if (wqe >= wqe_end) {
mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
resume_with_error = 1;
goto resolve_page_fault;
}
ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped,
&total_wqe_bytes, !requestor);
if (ret == -EAGAIN) {
goto resolve_page_fault;
} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n",
-ret);
resume_with_error = 1;
goto resolve_page_fault;
}
resolve_page_fault:
mlx5_ib_page_fault_resume(qp, pfault, resume_with_error);
mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n",
qp->mqp.qpn, resume_with_error, pfault->mpfault.flags);
free_page((unsigned long)buffer);
}
static int pages_in_range(u64 address, u32 length)
{
return (ALIGN(address + length, PAGE_SIZE) -
(address & PAGE_MASK)) >> PAGE_SHIFT;
}
static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp,
struct mlx5_ib_pfault *pfault)
{
struct mlx5_pagefault *mpfault = &pfault->mpfault;
u64 address;
u32 length;
u32 prefetch_len = mpfault->bytes_committed;
int prefetch_activated = 0;
u32 rkey = mpfault->rdma.r_key;
int ret;
/* The RDMA responder handler handles the page fault in two parts.
* First it brings the necessary pages for the current packet
* (and uses the pfault context), and then (after resuming the QP)
* prefetches more pages. The second operation cannot use the pfault
* context and therefore uses the dummy_pfault context allocated on
* the stack */
struct mlx5_ib_pfault dummy_pfault = {};
dummy_pfault.mpfault.bytes_committed = 0;
mpfault->rdma.rdma_va += mpfault->bytes_committed;
mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed,
mpfault->rdma.rdma_op_len);
mpfault->bytes_committed = 0;
address = mpfault->rdma.rdma_va;
length = mpfault->rdma.rdma_op_len;
/* For some operations, the hardware cannot tell the exact message
* length, and in those cases it reports zero. Use prefetch
* logic. */
if (length == 0) {
prefetch_activated = 1;
length = mpfault->rdma.packet_size;
prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
}
ret = pagefault_single_data_segment(qp, pfault, rkey, address, length,
NULL);
if (ret == -EAGAIN) {
/* We're racing with an invalidation, don't prefetch */
prefetch_activated = 0;
} else if (ret < 0 || pages_in_range(address, length) > ret) {
mlx5_ib_page_fault_resume(qp, pfault, 1);
return;
}
mlx5_ib_page_fault_resume(qp, pfault, 0);
/* At this point, there might be a new pagefault already arriving in
* the eq, switch to the dummy pagefault for the rest of the
* processing. We're still OK with the objects being alive as the
* work-queue is being fenced. */
if (prefetch_activated) {
ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey,
address,
prefetch_len,
NULL);
if (ret < 0) {
pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n",
ret, prefetch_activated,
qp->ibqp.qp_num, address, prefetch_len);
}
}
}
void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
struct mlx5_ib_pfault *pfault)
{
u8 event_subtype = pfault->mpfault.event_subtype;
switch (event_subtype) {
case MLX5_PFAULT_SUBTYPE_WQE:
mlx5_ib_mr_wqe_pfault_handler(qp, pfault);
break;
case MLX5_PFAULT_SUBTYPE_RDMA:
mlx5_ib_mr_rdma_pfault_handler(qp, pfault);
break;
default:
pr_warn("Invalid page fault event subtype: 0x%x\n",
event_subtype);
mlx5_ib_page_fault_resume(qp, pfault, 1);
break;
}
}
static void mlx5_ib_qp_pfault_action(struct work_struct *work)
{
struct mlx5_ib_pfault *pfault = container_of(work,
struct mlx5_ib_pfault,
work);
enum mlx5_ib_pagefault_context context =
mlx5_ib_get_pagefault_context(&pfault->mpfault);
struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp,
pagefaults[context]);
mlx5_ib_mr_pfault_handler(qp, pfault);
}
void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp)
{
unsigned long flags;
spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
qp->disable_page_faults = 1;
spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
/*
* Note that at this point, we are guarenteed that no more
* work queue elements will be posted to the work queue with
* the QP we are closing.
*/
flush_workqueue(mlx5_ib_page_fault_wq);
}
void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)
{
unsigned long flags;
spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
qp->disable_page_faults = 0;
spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
}
static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp,
struct mlx5_pagefault *pfault)
{
/*
* Note that we will only get one fault event per QP per context
* (responder/initiator, read/write), until we resolve the page fault
* with the mlx5_ib_page_fault_resume command. Since this function is
* called from within the work element, there is no risk of missing
* events.
*/
struct mlx5_ib_qp *mibqp = to_mibqp(qp);
enum mlx5_ib_pagefault_context context =
mlx5_ib_get_pagefault_context(pfault);
struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context];
qp_pfault->mpfault = *pfault;
/* No need to stop interrupts here since we are in an interrupt */
spin_lock(&mibqp->disable_page_faults_lock);
if (!mibqp->disable_page_faults)
queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work);
spin_unlock(&mibqp->disable_page_faults_lock);
}
void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
{
int i;
qp->disable_page_faults = 1;
spin_lock_init(&qp->disable_page_faults_lock);
qp->mqp.pfault_handler = mlx5_ib_pfault_handler;
for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
}
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
{
int ret;
ret = init_srcu_struct(&ibdev->mr_srcu);
if (ret)
return ret;
return 0;
}
void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
{
cleanup_srcu_struct(&ibdev->mr_srcu);
}
int __init mlx5_ib_odp_init(void)
{
mlx5_ib_page_fault_wq =
create_singlethread_workqueue("mlx5_ib_page_faults");
if (!mlx5_ib_page_fault_wq)
return -ENOMEM;
return 0;
}
void mlx5_ib_odp_cleanup(void)
{
destroy_workqueue(mlx5_ib_page_fault_wq);
}

View file

@ -70,15 +70,6 @@ static const u32 mlx5_ib_opcode[] = {
[MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR,
};
struct umr_wr {
u64 virt_addr;
struct ib_pd *pd;
unsigned int page_shift;
unsigned int npages;
u32 length;
int access_flags;
u32 mkey;
};
static int is_qp0(enum ib_qp_type qp_type)
{
@ -110,6 +101,77 @@ void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n)
return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE));
}
/**
* mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space.
*
* @qp: QP to copy from.
* @send: copy from the send queue when non-zero, use the receive queue
* otherwise.
* @wqe_index: index to start copying from. For send work queues, the
* wqe_index is in units of MLX5_SEND_WQE_BB.
* For receive work queue, it is the number of work queue
* element in the queue.
* @buffer: destination buffer.
* @length: maximum number of bytes to copy.
*
* Copies at least a single WQE, but may copy more data.
*
* Return: the number of bytes copied, or an error code.
*/
int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
void *buffer, u32 length)
{
struct ib_device *ibdev = qp->ibqp.device;
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq;
size_t offset;
size_t wq_end;
struct ib_umem *umem = qp->umem;
u32 first_copy_length;
int wqe_length;
int ret;
if (wq->wqe_cnt == 0) {
mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n",
qp->ibqp.qp_type);
return -EINVAL;
}
offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift);
wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift);
if (send && length < sizeof(struct mlx5_wqe_ctrl_seg))
return -EINVAL;
if (offset > umem->length ||
(send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length))
return -EINVAL;
first_copy_length = min_t(u32, offset + length, wq_end) - offset;
ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length);
if (ret)
return ret;
if (send) {
struct mlx5_wqe_ctrl_seg *ctrl = buffer;
int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
wqe_length = ds * MLX5_WQE_DS_UNITS;
} else {
wqe_length = 1 << wq->wqe_shift;
}
if (wqe_length <= first_copy_length)
return first_copy_length;
ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset,
wqe_length - first_copy_length);
if (ret)
return ret;
return wqe_length;
}
static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
{
struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
@ -814,6 +876,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
int inlen = sizeof(*in);
int err;
mlx5_ib_odp_create_qp(qp);
gen = &dev->mdev->caps.gen;
mutex_init(&qp->mutex);
spin_lock_init(&qp->sq.lock);
@ -1098,11 +1162,13 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
in = kzalloc(sizeof(*in), GFP_KERNEL);
if (!in)
return;
if (qp->state != IB_QPS_RESET)
if (qp->state != IB_QPS_RESET) {
mlx5_ib_qp_disable_pagefaults(qp);
if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state),
MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp))
mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n",
qp->mqp.qpn);
}
get_cqs(qp, &send_cq, &recv_cq);
@ -1650,6 +1716,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
if (mlx5_st < 0)
goto out;
/* If moving to a reset or error state, we must disable page faults on
* this QP and flush all current page faults. Otherwise a stale page
* fault may attempt to work on this QP after it is reset and moved
* again to RTS, and may cause the driver and the device to get out of
* sync. */
if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
(new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
mlx5_ib_qp_disable_pagefaults(qp);
optpar = ib_mask_to_mlx5_opt(attr_mask);
optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
in->optparam = cpu_to_be32(optpar);
@ -1659,6 +1734,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
if (err)
goto out;
if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
mlx5_ib_qp_enable_pagefaults(qp);
qp->state = new_state;
if (attr_mask & IB_QP_ACCESS_FLAGS)
@ -1848,37 +1926,70 @@ static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
umr->mkey_mask = frwr_mkey_mask();
}
static __be64 get_umr_reg_mr_mask(void)
{
u64 result;
result = MLX5_MKEY_MASK_LEN |
MLX5_MKEY_MASK_PAGE_SIZE |
MLX5_MKEY_MASK_START_ADDR |
MLX5_MKEY_MASK_PD |
MLX5_MKEY_MASK_LR |
MLX5_MKEY_MASK_LW |
MLX5_MKEY_MASK_KEY |
MLX5_MKEY_MASK_RR |
MLX5_MKEY_MASK_RW |
MLX5_MKEY_MASK_A |
MLX5_MKEY_MASK_FREE;
return cpu_to_be64(result);
}
static __be64 get_umr_unreg_mr_mask(void)
{
u64 result;
result = MLX5_MKEY_MASK_FREE;
return cpu_to_be64(result);
}
static __be64 get_umr_update_mtt_mask(void)
{
u64 result;
result = MLX5_MKEY_MASK_FREE;
return cpu_to_be64(result);
}
static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
struct ib_send_wr *wr)
{
struct umr_wr *umrwr = (struct umr_wr *)&wr->wr.fast_reg;
u64 mask;
struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
memset(umr, 0, sizeof(*umr));
if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE)
umr->flags = MLX5_UMR_CHECK_FREE; /* fail if free */
else
umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */
if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) {
umr->flags = 1 << 5; /* fail if not free */
umr->klm_octowords = get_klm_octo(umrwr->npages);
mask = MLX5_MKEY_MASK_LEN |
MLX5_MKEY_MASK_PAGE_SIZE |
MLX5_MKEY_MASK_START_ADDR |
MLX5_MKEY_MASK_PD |
MLX5_MKEY_MASK_LR |
MLX5_MKEY_MASK_LW |
MLX5_MKEY_MASK_KEY |
MLX5_MKEY_MASK_RR |
MLX5_MKEY_MASK_RW |
MLX5_MKEY_MASK_A |
MLX5_MKEY_MASK_FREE;
umr->mkey_mask = cpu_to_be64(mask);
if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) {
umr->mkey_mask = get_umr_update_mtt_mask();
umr->bsf_octowords = get_klm_octo(umrwr->target.offset);
umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
} else {
umr->mkey_mask = get_umr_reg_mr_mask();
}
} else {
umr->flags = 2 << 5; /* fail if free */
mask = MLX5_MKEY_MASK_FREE;
umr->mkey_mask = cpu_to_be64(mask);
umr->mkey_mask = get_umr_unreg_mr_mask();
}
if (!wr->num_sge)
umr->flags |= (1 << 7); /* inline */
umr->flags |= MLX5_UMR_INLINE;
}
static u8 get_umr_flags(int acc)
@ -1895,7 +2006,7 @@ static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr,
{
memset(seg, 0, sizeof(*seg));
if (li) {
seg->status = 1 << 6;
seg->status = MLX5_MKEY_STATUS_FREE;
return;
}
@ -1912,19 +2023,23 @@ static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr,
static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr)
{
struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
memset(seg, 0, sizeof(*seg));
if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) {
seg->status = 1 << 6;
seg->status = MLX5_MKEY_STATUS_FREE;
return;
}
seg->flags = convert_access(wr->wr.fast_reg.access_flags);
seg->flags_pd = cpu_to_be32(to_mpd((struct ib_pd *)wr->wr.fast_reg.page_list)->pdn);
seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
seg->len = cpu_to_be64(wr->wr.fast_reg.length);
seg->log2_page_size = wr->wr.fast_reg.page_shift;
seg->flags = convert_access(umrwr->access_flags);
if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) {
seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn);
seg->start_addr = cpu_to_be64(umrwr->target.virt_addr);
}
seg->len = cpu_to_be64(umrwr->length);
seg->log2_page_size = umrwr->page_shift;
seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 |
mlx5_mkey_variant(wr->wr.fast_reg.rkey));
mlx5_mkey_variant(umrwr->mkey));
}
static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg,
@ -2927,6 +3042,14 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr
int mlx5_state;
int err = 0;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
/*
* Wait for any outstanding page faults, in case the user frees memory
* based upon this query's result.
*/
flush_workqueue(mlx5_ib_page_fault_wq);
#endif
mutex_lock(&qp->mutex);
outb = kzalloc(sizeof(*outb), GFP_KERNEL);
if (!outb) {

View file

@ -2341,9 +2341,9 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u,"
" offset = %u, page size = %u.\n",
(unsigned long int)start, (unsigned long int)virt, (u32)length,
region->offset, region->page_size);
ib_umem_offset(region), region->page_size);
skip_pages = ((u32)region->offset) >> 12;
skip_pages = ((u32)ib_umem_offset(region)) >> 12;
if (ib_copy_from_udata(&req, udata, sizeof(req))) {
ib_umem_release(region);
@ -2408,7 +2408,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
region_length -= skip_pages << 12;
for (page_index = skip_pages; page_index < chunk_pages; page_index++) {
skip_pages = 0;
if ((page_count != 0) && (page_count<<12)-(region->offset&(4096-1)) >= region->length)
if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length)
goto enough_pages;
if ((page_count&0x01FF) == 0) {
if (page_count >= 1024 * 512) {

View file

@ -96,7 +96,6 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
union ib_gid sgid;
u8 zmac[ETH_ALEN];
if (!(attr->ah_flags & IB_AH_GRH))
return ERR_PTR(-EINVAL);
@ -118,9 +117,7 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
goto av_conf_err;
}
memset(&zmac, 0, ETH_ALEN);
if (pd->uctx &&
memcmp(attr->dmac, &zmac, ETH_ALEN)) {
if (pd->uctx) {
status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid,
attr->dmac, &attr->vlan_id);
if (status) {

View file

@ -805,7 +805,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
goto umem_err;
mr->hwmr.pbe_size = mr->umem->page_size;
mr->hwmr.fbo = mr->umem->offset;
mr->hwmr.fbo = ib_umem_offset(mr->umem);
mr->hwmr.va = usr_addr;
mr->hwmr.len = len;
mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
@ -1410,6 +1410,8 @@ int ocrdma_query_qp(struct ib_qp *ibqp,
mutex_unlock(&dev->dev_lock);
if (status)
goto mbx_err;
if (qp->qp_type == IB_QPT_UD)
qp_attr->qkey = params.qkey;
qp_attr->qp_state = get_ibqp_state(IB_QPS_INIT);
qp_attr->cur_qp_state = get_ibqp_state(IB_QPS_INIT);
qp_attr->path_mtu =

View file

@ -258,7 +258,7 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mr->mr.user_base = start;
mr->mr.iova = virt_addr;
mr->mr.length = length;
mr->mr.offset = umem->offset;
mr->mr.offset = ib_umem_offset(umem);
mr->mr.access_flags = mr_access_flags;
mr->umem = umem;

View file

@ -98,9 +98,15 @@ enum {
IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */
IPOIB_MCAST_FLAG_SENDONLY = 1,
IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */
/*
* For IPOIB_MCAST_FLAG_BUSY
* When set, in flight join and mcast->mc is unreliable
* When clear and mcast->mc IS_ERR_OR_NULL, need to restart or
* haven't started yet
* When clear and mcast->mc is valid pointer, join was successful
*/
IPOIB_MCAST_FLAG_BUSY = 2,
IPOIB_MCAST_FLAG_ATTACHED = 3,
IPOIB_MCAST_JOIN_STARTED = 4,
MAX_SEND_CQE = 16,
IPOIB_CM_COPYBREAK = 256,
@ -317,6 +323,7 @@ struct ipoib_dev_priv {
struct list_head multicast_list;
struct rb_root multicast_tree;
struct workqueue_struct *wq;
struct delayed_work mcast_task;
struct work_struct carrier_on_task;
struct work_struct flush_light;
@ -477,10 +484,10 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work);
void ipoib_pkey_event(struct work_struct *work);
void ipoib_ib_dev_cleanup(struct net_device *dev);
int ipoib_ib_dev_open(struct net_device *dev, int flush);
int ipoib_ib_dev_open(struct net_device *dev);
int ipoib_ib_dev_up(struct net_device *dev);
int ipoib_ib_dev_down(struct net_device *dev, int flush);
int ipoib_ib_dev_stop(struct net_device *dev, int flush);
int ipoib_ib_dev_down(struct net_device *dev);
int ipoib_ib_dev_stop(struct net_device *dev);
void ipoib_pkey_dev_check_presence(struct net_device *dev);
int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
@ -492,7 +499,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb);
void ipoib_mcast_restart_task(struct work_struct *work);
int ipoib_mcast_start_thread(struct net_device *dev);
int ipoib_mcast_stop_thread(struct net_device *dev, int flush);
int ipoib_mcast_stop_thread(struct net_device *dev);
void ipoib_mcast_dev_down(struct net_device *dev);
void ipoib_mcast_dev_flush(struct net_device *dev);

View file

@ -474,7 +474,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
}
spin_lock_irq(&priv->lock);
queue_delayed_work(ipoib_workqueue,
queue_delayed_work(priv->wq,
&priv->cm.stale_task, IPOIB_CM_RX_DELAY);
/* Add this entry to passive ids list head, but do not re-add it
* if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
@ -576,7 +576,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
spin_lock_irqsave(&priv->lock, flags);
list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
ipoib_cm_start_rx_drain(priv);
queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
queue_work(priv->wq, &priv->cm.rx_reap_task);
spin_unlock_irqrestore(&priv->lock, flags);
} else
ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
@ -603,7 +603,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
spin_lock_irqsave(&priv->lock, flags);
list_move(&p->list, &priv->cm.rx_reap_list);
spin_unlock_irqrestore(&priv->lock, flags);
queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
queue_work(priv->wq, &priv->cm.rx_reap_task);
}
return;
}
@ -827,7 +827,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
list_move(&tx->list, &priv->cm.reap_list);
queue_work(ipoib_workqueue, &priv->cm.reap_task);
queue_work(priv->wq, &priv->cm.reap_task);
}
clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
@ -1255,7 +1255,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
list_move(&tx->list, &priv->cm.reap_list);
queue_work(ipoib_workqueue, &priv->cm.reap_task);
queue_work(priv->wq, &priv->cm.reap_task);
}
spin_unlock_irqrestore(&priv->lock, flags);
@ -1284,7 +1284,7 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path
tx->dev = dev;
list_add(&tx->list, &priv->cm.start_list);
set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
queue_work(ipoib_workqueue, &priv->cm.start_task);
queue_work(priv->wq, &priv->cm.start_task);
return tx;
}
@ -1295,7 +1295,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
spin_lock_irqsave(&priv->lock, flags);
list_move(&tx->list, &priv->cm.reap_list);
queue_work(ipoib_workqueue, &priv->cm.reap_task);
queue_work(priv->wq, &priv->cm.reap_task);
ipoib_dbg(priv, "Reap connection for gid %pI6\n",
tx->neigh->daddr + 4);
tx->neigh = NULL;
@ -1417,7 +1417,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
skb_queue_tail(&priv->cm.skb_queue, skb);
if (e)
queue_work(ipoib_workqueue, &priv->cm.skb_task);
queue_work(priv->wq, &priv->cm.skb_task);
}
static void ipoib_cm_rx_reap(struct work_struct *work)
@ -1450,7 +1450,7 @@ static void ipoib_cm_stale_task(struct work_struct *work)
}
if (!list_empty(&priv->cm.passive_ids))
queue_delayed_work(ipoib_workqueue,
queue_delayed_work(priv->wq,
&priv->cm.stale_task, IPOIB_CM_RX_DELAY);
spin_unlock_irq(&priv->lock);
}

View file

@ -655,7 +655,7 @@ void ipoib_reap_ah(struct work_struct *work)
__ipoib_reap_ah(dev);
if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
queue_delayed_work(priv->wq, &priv->ah_reap_task,
round_jiffies_relative(HZ));
}
@ -664,7 +664,7 @@ static void ipoib_ib_tx_timer_func(unsigned long ctx)
drain_tx_cq((struct net_device *)ctx);
}
int ipoib_ib_dev_open(struct net_device *dev, int flush)
int ipoib_ib_dev_open(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
int ret;
@ -696,7 +696,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush)
}
clear_bit(IPOIB_STOP_REAPER, &priv->flags);
queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
queue_delayed_work(priv->wq, &priv->ah_reap_task,
round_jiffies_relative(HZ));
if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
@ -706,7 +706,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush)
dev_stop:
if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
napi_enable(&priv->napi);
ipoib_ib_dev_stop(dev, flush);
ipoib_ib_dev_stop(dev);
return -1;
}
@ -738,7 +738,7 @@ int ipoib_ib_dev_up(struct net_device *dev)
return ipoib_mcast_start_thread(dev);
}
int ipoib_ib_dev_down(struct net_device *dev, int flush)
int ipoib_ib_dev_down(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
@ -747,7 +747,7 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush)
clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
netif_carrier_off(dev);
ipoib_mcast_stop_thread(dev, flush);
ipoib_mcast_stop_thread(dev);
ipoib_mcast_dev_flush(dev);
ipoib_flush_paths(dev);
@ -807,7 +807,7 @@ void ipoib_drain_cq(struct net_device *dev)
local_bh_enable();
}
int ipoib_ib_dev_stop(struct net_device *dev, int flush)
int ipoib_ib_dev_stop(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ib_qp_attr qp_attr;
@ -880,8 +880,7 @@ timeout:
/* Wait for all AHs to be reaped */
set_bit(IPOIB_STOP_REAPER, &priv->flags);
cancel_delayed_work(&priv->ah_reap_task);
if (flush)
flush_workqueue(ipoib_workqueue);
flush_workqueue(priv->wq);
begin = jiffies;
@ -918,7 +917,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
(unsigned long) dev);
if (dev->flags & IFF_UP) {
if (ipoib_ib_dev_open(dev, 1)) {
if (ipoib_ib_dev_open(dev)) {
ipoib_transport_dev_cleanup(dev);
return -ENODEV;
}
@ -1040,12 +1039,12 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
}
if (level >= IPOIB_FLUSH_NORMAL)
ipoib_ib_dev_down(dev, 0);
ipoib_ib_dev_down(dev);
if (level == IPOIB_FLUSH_HEAVY) {
if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
ipoib_ib_dev_stop(dev, 0);
if (ipoib_ib_dev_open(dev, 0) != 0)
ipoib_ib_dev_stop(dev);
if (ipoib_ib_dev_open(dev) != 0)
return;
if (netif_queue_stopped(dev))
netif_start_queue(dev);
@ -1097,7 +1096,7 @@ void ipoib_ib_dev_cleanup(struct net_device *dev)
*/
ipoib_flush_paths(dev);
ipoib_mcast_stop_thread(dev, 1);
ipoib_mcast_stop_thread(dev);
ipoib_mcast_dev_flush(dev);
ipoib_transport_dev_cleanup(dev);

View file

@ -108,7 +108,7 @@ int ipoib_open(struct net_device *dev)
set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
if (ipoib_ib_dev_open(dev, 1)) {
if (ipoib_ib_dev_open(dev)) {
if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
return 0;
goto err_disable;
@ -139,7 +139,7 @@ int ipoib_open(struct net_device *dev)
return 0;
err_stop:
ipoib_ib_dev_stop(dev, 1);
ipoib_ib_dev_stop(dev);
err_disable:
clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
@ -157,8 +157,8 @@ static int ipoib_stop(struct net_device *dev)
netif_stop_queue(dev);
ipoib_ib_dev_down(dev, 1);
ipoib_ib_dev_stop(dev, 0);
ipoib_ib_dev_down(dev);
ipoib_ib_dev_stop(dev);
if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
struct ipoib_dev_priv *cpriv;
@ -839,7 +839,7 @@ static void ipoib_set_mcast_list(struct net_device *dev)
return;
}
queue_work(ipoib_workqueue, &priv->restart_task);
queue_work(priv->wq, &priv->restart_task);
}
static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
@ -954,7 +954,7 @@ static void ipoib_reap_neigh(struct work_struct *work)
__ipoib_reap_neigh(priv);
if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
queue_delayed_work(priv->wq, &priv->neigh_reap_task,
arp_tbl.gc_interval);
}
@ -1133,7 +1133,7 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
/* start garbage collection */
clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
queue_delayed_work(priv->wq, &priv->neigh_reap_task,
arp_tbl.gc_interval);
return 0;
@ -1262,15 +1262,13 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
if (ipoib_neigh_hash_init(priv) < 0)
goto out;
/* Allocate RX/TX "rings" to hold queued skbs */
priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
GFP_KERNEL);
if (!priv->rx_ring) {
printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
ca->name, ipoib_recvq_size);
goto out_neigh_hash_cleanup;
goto out;
}
priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
@ -1285,16 +1283,24 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
if (ipoib_ib_dev_init(dev, ca, port))
goto out_tx_ring_cleanup;
/*
* Must be after ipoib_ib_dev_init so we can allocate a per
* device wq there and use it here
*/
if (ipoib_neigh_hash_init(priv) < 0)
goto out_dev_uninit;
return 0;
out_dev_uninit:
ipoib_ib_dev_cleanup(dev);
out_tx_ring_cleanup:
vfree(priv->tx_ring);
out_rx_ring_cleanup:
kfree(priv->rx_ring);
out_neigh_hash_cleanup:
ipoib_neigh_hash_uninit(dev);
out:
return -ENOMEM;
}
@ -1317,6 +1323,12 @@ void ipoib_dev_cleanup(struct net_device *dev)
}
unregister_netdevice_many(&head);
/*
* Must be before ipoib_ib_dev_cleanup or we delete an in use
* work queue
*/
ipoib_neigh_hash_uninit(dev);
ipoib_ib_dev_cleanup(dev);
kfree(priv->rx_ring);
@ -1324,8 +1336,6 @@ void ipoib_dev_cleanup(struct net_device *dev)
priv->rx_ring = NULL;
priv->tx_ring = NULL;
ipoib_neigh_hash_uninit(dev);
}
static const struct header_ops ipoib_header_ops = {
@ -1636,7 +1646,7 @@ register_failed:
/* Stop GC if started before flush */
set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
cancel_delayed_work(&priv->neigh_reap_task);
flush_workqueue(ipoib_workqueue);
flush_workqueue(priv->wq);
event_failed:
ipoib_dev_cleanup(priv->dev);
@ -1707,7 +1717,7 @@ static void ipoib_remove_one(struct ib_device *device)
/* Stop GC */
set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
cancel_delayed_work(&priv->neigh_reap_task);
flush_workqueue(ipoib_workqueue);
flush_workqueue(priv->wq);
unregister_netdev(priv->dev);
free_netdev(priv->dev);
@ -1748,8 +1758,13 @@ static int __init ipoib_init_module(void)
* unregister_netdev() and linkwatch_event take the rtnl lock,
* so flush_scheduled_work() can deadlock during device
* removal.
*
* In addition, bringing one device up and another down at the
* same time can deadlock a single workqueue, so we have this
* global fallback workqueue, but we also attempt to open a
* per device workqueue each time we bring an interface up
*/
ipoib_workqueue = create_singlethread_workqueue("ipoib");
ipoib_workqueue = create_singlethread_workqueue("ipoib_flush");
if (!ipoib_workqueue) {
ret = -ENOMEM;
goto err_fs;

View file

@ -190,12 +190,6 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
spin_unlock_irq(&priv->lock);
priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
set_qkey = 1;
if (!ipoib_cm_admin_enabled(dev)) {
rtnl_lock();
dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu));
rtnl_unlock();
}
}
if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
@ -277,16 +271,27 @@ ipoib_mcast_sendonly_join_complete(int status,
struct ipoib_mcast *mcast = multicast->context;
struct net_device *dev = mcast->dev;
/*
* We have to take the mutex to force mcast_sendonly_join to
* return from ib_sa_multicast_join and set mcast->mc to a
* valid value. Otherwise we were racing with ourselves in
* that we might fail here, but get a valid return from
* ib_sa_multicast_join after we had cleared mcast->mc here,
* resulting in mis-matched joins and leaves and a deadlock
*/
mutex_lock(&mcast_mutex);
/* We trap for port events ourselves. */
if (status == -ENETRESET)
return 0;
goto out;
if (!status)
status = ipoib_mcast_join_finish(mcast, &multicast->rec);
if (status) {
if (mcast->logcount++ < 20)
ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n",
ipoib_dbg_mcast(netdev_priv(dev), "sendonly multicast "
"join failed for %pI6, status %d\n",
mcast->mcmember.mgid.raw, status);
/* Flush out any queued packets */
@ -296,11 +301,15 @@ ipoib_mcast_sendonly_join_complete(int status,
dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
}
netif_tx_unlock_bh(dev);
/* Clear the busy flag so we try again */
status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY,
&mcast->flags);
}
out:
clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
if (status)
mcast->mc = NULL;
complete(&mcast->done);
if (status == -ENETRESET)
status = 0;
mutex_unlock(&mcast_mutex);
return status;
}
@ -318,12 +327,14 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
int ret = 0;
if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n");
ipoib_dbg_mcast(priv, "device shutting down, no sendonly "
"multicast joins\n");
return -ENODEV;
}
if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n");
if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
ipoib_dbg_mcast(priv, "multicast entry busy, skipping "
"sendonly join\n");
return -EBUSY;
}
@ -331,6 +342,9 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
rec.port_gid = priv->local_gid;
rec.pkey = cpu_to_be16(priv->pkey);
mutex_lock(&mcast_mutex);
init_completion(&mcast->done);
set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
priv->port, &rec,
IB_SA_MCMEMBER_REC_MGID |
@ -343,12 +357,14 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
if (IS_ERR(mcast->mc)) {
ret = PTR_ERR(mcast->mc);
clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n",
ret);
complete(&mcast->done);
ipoib_warn(priv, "ib_sa_join_multicast for sendonly join "
"failed (ret = %d)\n", ret);
} else {
ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n",
mcast->mcmember.mgid.raw);
ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting "
"sendonly join\n", mcast->mcmember.mgid.raw);
}
mutex_unlock(&mcast_mutex);
return ret;
}
@ -359,18 +375,29 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work)
carrier_on_task);
struct ib_port_attr attr;
/*
* Take rtnl_lock to avoid racing with ipoib_stop() and
* turning the carrier back on while a device is being
* removed.
*/
if (ib_query_port(priv->ca, priv->port, &attr) ||
attr.state != IB_PORT_ACTIVE) {
ipoib_dbg(priv, "Keeping carrier off until IB port is active\n");
return;
}
rtnl_lock();
/*
* Take rtnl_lock to avoid racing with ipoib_stop() and
* turning the carrier back on while a device is being
* removed. However, ipoib_stop() will attempt to flush
* the workqueue while holding the rtnl lock, so loop
* on trylock until either we get the lock or we see
* FLAG_ADMIN_UP go away as that signals that we are bailing
* and can safely ignore the carrier on work.
*/
while (!rtnl_trylock()) {
if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
return;
else
msleep(20);
}
if (!ipoib_cm_admin_enabled(priv->dev))
dev_set_mtu(priv->dev, min(priv->mcast_mtu, priv->admin_mtu));
netif_carrier_on(priv->dev);
rtnl_unlock();
}
@ -385,60 +412,63 @@ static int ipoib_mcast_join_complete(int status,
ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n",
mcast->mcmember.mgid.raw, status);
/*
* We have to take the mutex to force mcast_join to
* return from ib_sa_multicast_join and set mcast->mc to a
* valid value. Otherwise we were racing with ourselves in
* that we might fail here, but get a valid return from
* ib_sa_multicast_join after we had cleared mcast->mc here,
* resulting in mis-matched joins and leaves and a deadlock
*/
mutex_lock(&mcast_mutex);
/* We trap for port events ourselves. */
if (status == -ENETRESET) {
status = 0;
if (status == -ENETRESET)
goto out;
}
if (!status)
status = ipoib_mcast_join_finish(mcast, &multicast->rec);
if (!status) {
mcast->backoff = 1;
mutex_lock(&mcast_mutex);
if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
queue_delayed_work(ipoib_workqueue,
&priv->mcast_task, 0);
mutex_unlock(&mcast_mutex);
queue_delayed_work(priv->wq, &priv->mcast_task, 0);
/*
* Defer carrier on work to ipoib_workqueue to avoid a
* Defer carrier on work to priv->wq to avoid a
* deadlock on rtnl_lock here.
*/
if (mcast == priv->broadcast)
queue_work(ipoib_workqueue, &priv->carrier_on_task);
status = 0;
goto out;
}
if (mcast->logcount++ < 20) {
if (status == -ETIMEDOUT || status == -EAGAIN) {
ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n",
mcast->mcmember.mgid.raw, status);
} else {
ipoib_warn(priv, "multicast join failed for %pI6, status %d\n",
mcast->mcmember.mgid.raw, status);
queue_work(priv->wq, &priv->carrier_on_task);
} else {
if (mcast->logcount++ < 20) {
if (status == -ETIMEDOUT || status == -EAGAIN) {
ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n",
mcast->mcmember.mgid.raw, status);
} else {
ipoib_warn(priv, "multicast join failed for %pI6, status %d\n",
mcast->mcmember.mgid.raw, status);
}
}
mcast->backoff *= 2;
if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
}
mcast->backoff *= 2;
if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
/* Clear the busy flag so we try again */
status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
mutex_lock(&mcast_mutex);
out:
spin_lock_irq(&priv->lock);
if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
if (status)
mcast->mc = NULL;
complete(&mcast->done);
if (status == -ENETRESET)
status = 0;
if (status && test_bit(IPOIB_MCAST_RUN, &priv->flags))
queue_delayed_work(priv->wq, &priv->mcast_task,
mcast->backoff * HZ);
spin_unlock_irq(&priv->lock);
mutex_unlock(&mcast_mutex);
out:
complete(&mcast->done);
return status;
}
@ -487,10 +517,9 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
rec.hop_limit = priv->broadcast->mcmember.hop_limit;
}
set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
mutex_lock(&mcast_mutex);
init_completion(&mcast->done);
set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags);
set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
&rec, comp_mask, GFP_KERNEL,
ipoib_mcast_join_complete, mcast);
@ -504,13 +533,11 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
mutex_lock(&mcast_mutex);
if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
queue_delayed_work(ipoib_workqueue,
&priv->mcast_task,
queue_delayed_work(priv->wq, &priv->mcast_task,
mcast->backoff * HZ);
mutex_unlock(&mcast_mutex);
}
mutex_unlock(&mcast_mutex);
}
void ipoib_mcast_join_task(struct work_struct *work)
@ -547,8 +574,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
ipoib_warn(priv, "failed to allocate broadcast group\n");
mutex_lock(&mcast_mutex);
if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
queue_delayed_work(ipoib_workqueue,
&priv->mcast_task, HZ);
queue_delayed_work(priv->wq, &priv->mcast_task,
HZ);
mutex_unlock(&mcast_mutex);
return;
}
@ -563,7 +590,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
}
if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
if (IS_ERR_OR_NULL(priv->broadcast->mc) &&
!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
ipoib_mcast_join(dev, priv->broadcast, 0);
return;
}
@ -571,23 +599,33 @@ void ipoib_mcast_join_task(struct work_struct *work)
while (1) {
struct ipoib_mcast *mcast = NULL;
/*
* Need the mutex so our flags are consistent, need the
* priv->lock so we don't race with list removals in either
* mcast_dev_flush or mcast_restart_task
*/
mutex_lock(&mcast_mutex);
spin_lock_irq(&priv->lock);
list_for_each_entry(mcast, &priv->multicast_list, list) {
if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)
&& !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)
&& !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
if (IS_ERR_OR_NULL(mcast->mc) &&
!test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) &&
!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
/* Found the next unjoined group */
break;
}
}
spin_unlock_irq(&priv->lock);
mutex_unlock(&mcast_mutex);
if (&mcast->list == &priv->multicast_list) {
/* All done */
break;
}
ipoib_mcast_join(dev, mcast, 1);
if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
ipoib_mcast_sendonly_join(mcast);
else
ipoib_mcast_join(dev, mcast, 1);
return;
}
@ -604,13 +642,13 @@ int ipoib_mcast_start_thread(struct net_device *dev)
mutex_lock(&mcast_mutex);
if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
queue_delayed_work(priv->wq, &priv->mcast_task, 0);
mutex_unlock(&mcast_mutex);
return 0;
}
int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
int ipoib_mcast_stop_thread(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
@ -621,8 +659,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
cancel_delayed_work(&priv->mcast_task);
mutex_unlock(&mcast_mutex);
if (flush)
flush_workqueue(ipoib_workqueue);
flush_workqueue(priv->wq);
return 0;
}
@ -633,6 +670,9 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
int ret = 0;
if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n");
if (!IS_ERR_OR_NULL(mcast->mc))
ib_sa_free_multicast(mcast->mc);
if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
@ -685,6 +725,8 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid));
__ipoib_mcast_add(dev, mcast);
list_add_tail(&mcast->list, &priv->multicast_list);
if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
queue_delayed_work(priv->wq, &priv->mcast_task, 0);
}
if (!mcast->ah) {
@ -698,8 +740,6 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
ipoib_dbg_mcast(priv, "no address vector, "
"but multicast join already started\n");
else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
ipoib_mcast_sendonly_join(mcast);
/*
* If lookup completes between here and out:, don't
@ -759,9 +799,12 @@ void ipoib_mcast_dev_flush(struct net_device *dev)
spin_unlock_irqrestore(&priv->lock, flags);
/* seperate between the wait to the leave*/
/*
* make sure the in-flight joins have finished before we attempt
* to leave
*/
list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags))
if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
wait_for_completion(&mcast->done);
list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
@ -794,8 +837,6 @@ void ipoib_mcast_restart_task(struct work_struct *work)
ipoib_dbg_mcast(priv, "restarting multicast task\n");
ipoib_mcast_stop_thread(dev, 0);
local_irq_save(flags);
netif_addr_lock(dev);
spin_lock(&priv->lock);
@ -880,14 +921,38 @@ void ipoib_mcast_restart_task(struct work_struct *work)
netif_addr_unlock(dev);
local_irq_restore(flags);
/* We have to cancel outside of the spinlock */
/*
* make sure the in-flight joins have finished before we attempt
* to leave
*/
list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
wait_for_completion(&mcast->done);
/*
* We have to cancel outside of the spinlock, but we have to
* take the rtnl lock or else we race with the removal of
* entries from the remove list in mcast_dev_flush as part
* of ipoib_stop(). We detect the drop of the ADMIN_UP flag
* to signal that we have hit this particular race, and we
* return since we know we don't need to do anything else
* anyway.
*/
while (!rtnl_trylock()) {
if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
return;
else
msleep(20);
}
list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
ipoib_mcast_leave(mcast->dev, mcast);
ipoib_mcast_free(mcast);
}
if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
ipoib_mcast_start_thread(dev);
/*
* Restart our join task if needed
*/
ipoib_mcast_start_thread(dev);
rtnl_unlock();
}
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG

View file

@ -145,10 +145,20 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
int ret, size;
int i;
/*
* the various IPoIB tasks assume they will never race against
* themselves, so always use a single thread workqueue
*/
priv->wq = create_singlethread_workqueue("ipoib_wq");
if (!priv->wq) {
printk(KERN_WARNING "ipoib: failed to allocate device WQ\n");
return -ENODEV;
}
priv->pd = ib_alloc_pd(priv->ca);
if (IS_ERR(priv->pd)) {
printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name);
return -ENODEV;
goto out_free_wq;
}
priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
@ -242,6 +252,10 @@ out_free_mr:
out_free_pd:
ib_dealloc_pd(priv->pd);
out_free_wq:
destroy_workqueue(priv->wq);
priv->wq = NULL;
return -ENODEV;
}
@ -270,6 +284,12 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
if (ib_dealloc_pd(priv->pd))
ipoib_warn(priv, "ib_dealloc_pd failed\n");
if (priv->wq) {
flush_workqueue(priv->wq);
destroy_workqueue(priv->wq);
priv->wq = NULL;
}
}
void ipoib_event(struct ib_event_handler *handler,

View file

@ -97,7 +97,7 @@ module_param_named(pi_enable, iser_pi_enable, bool, 0644);
MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)");
module_param_named(pi_guard, iser_pi_guard, int, 0644);
MODULE_PARM_DESC(pi_guard, "T10-PI guard_type, 0:CRC|1:IP_CSUM (default:IP_CSUM)");
MODULE_PARM_DESC(pi_guard, "T10-PI guard_type [deprecated]");
static struct workqueue_struct *release_wq;
struct iser_global ig;
@ -164,18 +164,42 @@ iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode)
return 0;
}
int iser_initialize_task_headers(struct iscsi_task *task,
struct iser_tx_desc *tx_desc)
/**
* iser_initialize_task_headers() - Initialize task headers
* @task: iscsi task
* @tx_desc: iser tx descriptor
*
* Notes:
* This routine may race with iser teardown flow for scsi
* error handling TMFs. So for TMF we should acquire the
* state mutex to avoid dereferencing the IB device which
* may have already been terminated.
*/
int
iser_initialize_task_headers(struct iscsi_task *task,
struct iser_tx_desc *tx_desc)
{
struct iser_conn *iser_conn = task->conn->dd_data;
struct iser_conn *iser_conn = task->conn->dd_data;
struct iser_device *device = iser_conn->ib_conn.device;
struct iscsi_iser_task *iser_task = task->dd_data;
u64 dma_addr;
const bool mgmt_task = !task->sc && !in_interrupt();
int ret = 0;
if (unlikely(mgmt_task))
mutex_lock(&iser_conn->state_mutex);
if (unlikely(iser_conn->state != ISER_CONN_UP)) {
ret = -ENODEV;
goto out;
}
dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc,
ISER_HEADERS_LEN, DMA_TO_DEVICE);
if (ib_dma_mapping_error(device->ib_device, dma_addr))
return -ENOMEM;
if (ib_dma_mapping_error(device->ib_device, dma_addr)) {
ret = -ENOMEM;
goto out;
}
tx_desc->dma_addr = dma_addr;
tx_desc->tx_sg[0].addr = tx_desc->dma_addr;
@ -183,7 +207,11 @@ int iser_initialize_task_headers(struct iscsi_task *task,
tx_desc->tx_sg[0].lkey = device->mr->lkey;
iser_task->iser_conn = iser_conn;
return 0;
out:
if (unlikely(mgmt_task))
mutex_unlock(&iser_conn->state_mutex);
return ret;
}
/**
@ -199,9 +227,14 @@ static int
iscsi_iser_task_init(struct iscsi_task *task)
{
struct iscsi_iser_task *iser_task = task->dd_data;
int ret;
if (iser_initialize_task_headers(task, &iser_task->desc))
return -ENOMEM;
ret = iser_initialize_task_headers(task, &iser_task->desc);
if (ret) {
iser_err("Failed to init task %p, err = %d\n",
iser_task, ret);
return ret;
}
/* mgmt task */
if (!task->sc)
@ -508,8 +541,8 @@ iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)
*/
if (iser_conn) {
mutex_lock(&iser_conn->state_mutex);
iscsi_conn_stop(cls_conn, flag);
iser_conn_terminate(iser_conn);
iscsi_conn_stop(cls_conn, flag);
/* unbind */
iser_conn->iscsi_conn = NULL;
@ -541,12 +574,13 @@ iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session)
static inline unsigned int
iser_dif_prot_caps(int prot_caps)
{
return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ? SHOST_DIF_TYPE1_PROTECTION |
SHOST_DIX_TYPE1_PROTECTION : 0) |
((prot_caps & IB_PROT_T10DIF_TYPE_2) ? SHOST_DIF_TYPE2_PROTECTION |
SHOST_DIX_TYPE2_PROTECTION : 0) |
((prot_caps & IB_PROT_T10DIF_TYPE_3) ? SHOST_DIF_TYPE3_PROTECTION |
SHOST_DIX_TYPE3_PROTECTION : 0);
return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ?
SHOST_DIF_TYPE1_PROTECTION | SHOST_DIX_TYPE0_PROTECTION |
SHOST_DIX_TYPE1_PROTECTION : 0) |
((prot_caps & IB_PROT_T10DIF_TYPE_2) ?
SHOST_DIF_TYPE2_PROTECTION | SHOST_DIX_TYPE2_PROTECTION : 0) |
((prot_caps & IB_PROT_T10DIF_TYPE_3) ?
SHOST_DIF_TYPE3_PROTECTION | SHOST_DIX_TYPE3_PROTECTION : 0);
}
/**
@ -569,6 +603,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
struct Scsi_Host *shost;
struct iser_conn *iser_conn = NULL;
struct ib_conn *ib_conn;
u16 max_cmds;
shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0);
if (!shost)
@ -586,26 +621,41 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
*/
if (ep) {
iser_conn = ep->dd_data;
max_cmds = iser_conn->max_cmds;
mutex_lock(&iser_conn->state_mutex);
if (iser_conn->state != ISER_CONN_UP) {
iser_err("iser conn %p already started teardown\n",
iser_conn);
mutex_unlock(&iser_conn->state_mutex);
goto free_host;
}
ib_conn = &iser_conn->ib_conn;
if (ib_conn->pi_support) {
u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap;
scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps));
if (iser_pi_guard)
scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP);
else
scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC);
scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP |
SHOST_DIX_GUARD_CRC);
}
if (iscsi_host_add(shost,
ib_conn->device->ib_device->dma_device)) {
mutex_unlock(&iser_conn->state_mutex);
goto free_host;
}
mutex_unlock(&iser_conn->state_mutex);
} else {
max_cmds = ISER_DEF_XMIT_CMDS_MAX;
if (iscsi_host_add(shost, NULL))
goto free_host;
}
if (iscsi_host_add(shost, ep ?
ib_conn->device->ib_device->dma_device : NULL))
goto free_host;
if (cmds_max > ISER_DEF_XMIT_CMDS_MAX) {
if (cmds_max > max_cmds) {
iser_info("cmds_max changed from %u to %u\n",
cmds_max, ISER_DEF_XMIT_CMDS_MAX);
cmds_max = ISER_DEF_XMIT_CMDS_MAX;
cmds_max, max_cmds);
cmds_max = max_cmds;
}
cls_session = iscsi_session_setup(&iscsi_iser_transport, shost,

View file

@ -69,34 +69,31 @@
#define DRV_NAME "iser"
#define PFX DRV_NAME ": "
#define DRV_VER "1.4.8"
#define DRV_VER "1.5"
#define iser_dbg(fmt, arg...) \
do { \
if (iser_debug_level > 2) \
if (unlikely(iser_debug_level > 2)) \
printk(KERN_DEBUG PFX "%s: " fmt,\
__func__ , ## arg); \
} while (0)
#define iser_warn(fmt, arg...) \
do { \
if (iser_debug_level > 0) \
if (unlikely(iser_debug_level > 0)) \
pr_warn(PFX "%s: " fmt, \
__func__ , ## arg); \
} while (0)
#define iser_info(fmt, arg...) \
do { \
if (iser_debug_level > 1) \
if (unlikely(iser_debug_level > 1)) \
pr_info(PFX "%s: " fmt, \
__func__ , ## arg); \
} while (0)
#define iser_err(fmt, arg...) \
do { \
printk(KERN_ERR PFX "%s: " fmt, \
__func__ , ## arg); \
} while (0)
#define iser_err(fmt, arg...) \
pr_err(PFX "%s: " fmt, __func__ , ## arg)
#define SHIFT_4K 12
#define SIZE_4K (1ULL << SHIFT_4K)
@ -144,6 +141,11 @@
ISER_MAX_TX_MISC_PDUS + \
ISER_MAX_RX_MISC_PDUS)
#define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr \
- ISER_MAX_TX_MISC_PDUS \
- ISER_MAX_RX_MISC_PDUS) / \
(1 + ISER_INFLIGHT_DATAOUTS))
#define ISER_WC_BATCH_COUNT 16
#define ISER_SIGNAL_CMD_COUNT 32
@ -247,7 +249,6 @@ struct iscsi_endpoint;
* @va: MR start address (buffer va)
* @len: MR length
* @mem_h: pointer to registration context (FMR/Fastreg)
* @is_mr: indicates weather we registered the buffer
*/
struct iser_mem_reg {
u32 lkey;
@ -255,7 +256,6 @@ struct iser_mem_reg {
u64 va;
u64 len;
void *mem_h;
int is_mr;
};
/**
@ -323,8 +323,6 @@ struct iser_rx_desc {
char pad[ISER_RX_PAD_SIZE];
} __attribute__((packed));
#define ISER_MAX_CQ 4
struct iser_conn;
struct ib_conn;
struct iscsi_iser_task;
@ -375,7 +373,7 @@ struct iser_device {
struct list_head ig_list;
int refcount;
int comps_used;
struct iser_comp comps[ISER_MAX_CQ];
struct iser_comp *comps;
int (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn,
unsigned cmds_max);
void (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn);
@ -432,6 +430,7 @@ struct fast_reg_descriptor {
* @cma_id: rdma_cm connection maneger handle
* @qp: Connection Queue-pair
* @post_recv_buf_count: post receive counter
* @sig_count: send work request signal count
* @rx_wr: receive work request for batch posts
* @device: reference to iser device
* @comp: iser completion context
@ -452,6 +451,7 @@ struct ib_conn {
struct rdma_cm_id *cma_id;
struct ib_qp *qp;
int post_recv_buf_count;
u8 sig_count;
struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX];
struct iser_device *device;
struct iser_comp *comp;
@ -482,6 +482,7 @@ struct ib_conn {
* to max number of post recvs
* @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1)
* @min_posted_rx: (qp_max_recv_dtos >> 2)
* @max_cmds: maximum cmds allowed for this connection
* @name: connection peer portal
* @release_work: deffered work for release job
* @state_mutex: protects iser onnection state
@ -507,6 +508,7 @@ struct iser_conn {
unsigned qp_max_recv_dtos;
unsigned qp_max_recv_dtos_mask;
unsigned min_posted_rx;
u16 max_cmds;
char name[ISER_OBJECT_NAME_SIZE];
struct work_struct release_work;
struct mutex state_mutex;

View file

@ -369,7 +369,7 @@ static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req)
return 0;
}
static inline bool iser_signal_comp(int sig_count)
static inline bool iser_signal_comp(u8 sig_count)
{
return ((sig_count % ISER_SIGNAL_CMD_COUNT) == 0);
}
@ -388,7 +388,7 @@ int iser_send_command(struct iscsi_conn *conn,
struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr;
struct scsi_cmnd *sc = task->sc;
struct iser_tx_desc *tx_desc = &iser_task->desc;
static unsigned sig_count;
u8 sig_count = ++iser_conn->ib_conn.sig_count;
edtl = ntohl(hdr->data_length);
@ -435,7 +435,7 @@ int iser_send_command(struct iscsi_conn *conn,
iser_task->status = ISER_TASK_STATUS_STARTED;
err = iser_post_send(&iser_conn->ib_conn, tx_desc,
iser_signal_comp(++sig_count));
iser_signal_comp(sig_count));
if (!err)
return 0;

View file

@ -73,7 +73,6 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
if (cmd_dir == ISER_DIR_OUT) {
/* copy the unaligned sg the buffer which is used for RDMA */
int i;
char *p, *from;
sgl = (struct scatterlist *)data->buf;
@ -409,7 +408,6 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
regd_buf->reg.rkey = device->mr->rkey;
regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]);
regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]);
regd_buf->reg.is_mr = 0;
iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X "
"va: 0x%08lX sz: %ld]\n",
@ -440,13 +438,13 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
return 0;
}
static inline void
static void
iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
struct ib_sig_domain *domain)
{
domain->sig_type = IB_SIG_TYPE_T10_DIF;
domain->sig.dif.pi_interval = sc->device->sector_size;
domain->sig.dif.ref_tag = scsi_get_lba(sc) & 0xffffffff;
domain->sig.dif.pi_interval = scsi_prot_interval(sc);
domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc);
/*
* At the moment we hard code those, but in the future
* we will take them from sc.
@ -454,8 +452,7 @@ iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
domain->sig.dif.apptag_check_mask = 0xffff;
domain->sig.dif.app_escape = true;
domain->sig.dif.ref_escape = true;
if (scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE1 ||
scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE2)
if (sc->prot_flags & SCSI_PROT_REF_INCREMENT)
domain->sig.dif.ref_remap = true;
};
@ -473,26 +470,16 @@ iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs)
case SCSI_PROT_WRITE_STRIP:
sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
/*
* At the moment we use this modparam to tell what is
* the memory bg_type, in the future we will take it
* from sc.
*/
sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM :
IB_T10DIF_CRC;
sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ?
IB_T10DIF_CSUM : IB_T10DIF_CRC;
break;
case SCSI_PROT_READ_PASS:
case SCSI_PROT_WRITE_PASS:
iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire);
sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
/*
* At the moment we use this modparam to tell what is
* the memory bg_type, in the future we will take it
* from sc.
*/
sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM :
IB_T10DIF_CRC;
sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ?
IB_T10DIF_CSUM : IB_T10DIF_CRC;
break;
default:
iser_err("Unsupported PI operation %d\n",
@ -503,26 +490,28 @@ iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs)
return 0;
}
static int
static inline void
iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask)
{
switch (scsi_get_prot_type(sc)) {
case SCSI_PROT_DIF_TYPE0:
break;
case SCSI_PROT_DIF_TYPE1:
case SCSI_PROT_DIF_TYPE2:
*mask = ISER_CHECK_GUARD | ISER_CHECK_REFTAG;
break;
case SCSI_PROT_DIF_TYPE3:
*mask = ISER_CHECK_GUARD;
break;
default:
iser_err("Unsupported protection type %d\n",
scsi_get_prot_type(sc));
return -EINVAL;
}
*mask = 0;
if (sc->prot_flags & SCSI_PROT_REF_CHECK)
*mask |= ISER_CHECK_REFTAG;
if (sc->prot_flags & SCSI_PROT_GUARD_CHECK)
*mask |= ISER_CHECK_GUARD;
}
return 0;
static void
iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
{
u32 rkey;
memset(inv_wr, 0, sizeof(*inv_wr));
inv_wr->opcode = IB_WR_LOCAL_INV;
inv_wr->wr_id = ISER_FASTREG_LI_WRID;
inv_wr->ex.invalidate_rkey = mr->rkey;
rkey = ib_inc_rkey(mr->rkey);
ib_update_fast_reg_key(mr, rkey);
}
static int
@ -536,26 +525,17 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
struct ib_send_wr *bad_wr, *wr = NULL;
struct ib_sig_attrs sig_attrs;
int ret;
u32 key;
memset(&sig_attrs, 0, sizeof(sig_attrs));
ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs);
if (ret)
goto err;
ret = iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask);
if (ret)
goto err;
iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask);
if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) {
memset(&inv_wr, 0, sizeof(inv_wr));
inv_wr.opcode = IB_WR_LOCAL_INV;
inv_wr.wr_id = ISER_FASTREG_LI_WRID;
inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey;
iser_inv_rkey(&inv_wr, pi_ctx->sig_mr);
wr = &inv_wr;
/* Bump the key */
key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF);
ib_update_fast_reg_key(pi_ctx->sig_mr, ++key);
}
memset(&sig_wr, 0, sizeof(sig_wr));
@ -585,12 +565,7 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
sig_sge->lkey = pi_ctx->sig_mr->lkey;
sig_sge->addr = 0;
sig_sge->length = data_sge->length + prot_sge->length;
if (scsi_get_prot_op(iser_task->sc) == SCSI_PROT_WRITE_INSERT ||
scsi_get_prot_op(iser_task->sc) == SCSI_PROT_READ_STRIP) {
sig_sge->length += (data_sge->length /
iser_task->sc->device->sector_size) * 8;
}
sig_sge->length = scsi_transfer_length(iser_task->sc);
iser_dbg("sig_sge: addr: 0x%llx length: %u lkey: 0x%x\n",
sig_sge->addr, sig_sge->length,
@ -613,7 +588,6 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
struct ib_fast_reg_page_list *frpl;
struct ib_send_wr fastreg_wr, inv_wr;
struct ib_send_wr *bad_wr, *wr = NULL;
u8 key;
int ret, offset, size, plen;
/* if there a single dma entry, dma mr suffices */
@ -645,14 +619,8 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
}
if (!(desc->reg_indicators & ind)) {
memset(&inv_wr, 0, sizeof(inv_wr));
inv_wr.wr_id = ISER_FASTREG_LI_WRID;
inv_wr.opcode = IB_WR_LOCAL_INV;
inv_wr.ex.invalidate_rkey = mr->rkey;
iser_inv_rkey(&inv_wr, mr);
wr = &inv_wr;
/* Bump the key */
key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key);
}
/* Prepare FASTREG WR */
@ -770,15 +738,11 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey;
regd_buf->reg.va = sig_sge.addr;
regd_buf->reg.len = sig_sge.length;
regd_buf->reg.is_mr = 1;
} else {
if (desc) {
if (desc)
regd_buf->reg.rkey = desc->data_mr->rkey;
regd_buf->reg.is_mr = 1;
} else {
else
regd_buf->reg.rkey = device->mr->rkey;
regd_buf->reg.is_mr = 0;
}
regd_buf->reg.lkey = data_sge.lkey;
regd_buf->reg.va = data_sge.addr;

View file

@ -76,7 +76,7 @@ static void iser_event_handler(struct ib_event_handler *handler,
static int iser_create_device_ib_res(struct iser_device *device)
{
struct ib_device_attr *dev_attr = &device->dev_attr;
int ret, i;
int ret, i, max_cqe;
ret = ib_query_device(device->ib_device, dev_attr);
if (ret) {
@ -104,11 +104,19 @@ static int iser_create_device_ib_res(struct iser_device *device)
return -1;
}
device->comps_used = min(ISER_MAX_CQ,
device->comps_used = min_t(int, num_online_cpus(),
device->ib_device->num_comp_vectors);
iser_info("using %d CQs, device %s supports %d vectors\n",
device->comps = kcalloc(device->comps_used, sizeof(*device->comps),
GFP_KERNEL);
if (!device->comps)
goto comps_err;
max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe);
iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n",
device->comps_used, device->ib_device->name,
device->ib_device->num_comp_vectors);
device->ib_device->num_comp_vectors, max_cqe);
device->pd = ib_alloc_pd(device->ib_device);
if (IS_ERR(device->pd))
@ -122,7 +130,7 @@ static int iser_create_device_ib_res(struct iser_device *device)
iser_cq_callback,
iser_cq_event_callback,
(void *)comp,
ISER_MAX_CQ_LEN, i);
max_cqe, i);
if (IS_ERR(comp->cq)) {
comp->cq = NULL;
goto cq_err;
@ -162,6 +170,8 @@ cq_err:
}
ib_dealloc_pd(device->pd);
pd_err:
kfree(device->comps);
comps_err:
iser_err("failed to allocate an IB resource\n");
return -1;
}
@ -187,6 +197,9 @@ static void iser_free_device_ib_res(struct iser_device *device)
(void)ib_dereg_mr(device->mr);
(void)ib_dealloc_pd(device->pd);
kfree(device->comps);
device->comps = NULL;
device->mr = NULL;
device->pd = NULL;
}
@ -425,7 +438,10 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn)
*/
static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
{
struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
ib_conn);
struct iser_device *device;
struct ib_device_attr *dev_attr;
struct ib_qp_init_attr init_attr;
int ret = -ENOMEM;
int index, min_index = 0;
@ -433,6 +449,7 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
BUG_ON(ib_conn->device == NULL);
device = ib_conn->device;
dev_attr = &device->dev_attr;
memset(&init_attr, 0, sizeof init_attr);
@ -460,8 +477,20 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
if (ib_conn->pi_support) {
init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1;
init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN;
iser_conn->max_cmds =
ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS);
} else {
init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1;
if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1;
iser_conn->max_cmds =
ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS);
} else {
init_attr.cap.max_send_wr = dev_attr->max_qp_wr;
iser_conn->max_cmds =
ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr);
iser_dbg("device %s supports max_send_wr %d\n",
device->ib_device->name, dev_attr->max_qp_wr);
}
}
ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
@ -475,7 +504,11 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
return ret;
out_err:
mutex_lock(&ig.connlist_mutex);
ib_conn->comp->active_qps--;
mutex_unlock(&ig.connlist_mutex);
iser_err("unable to alloc mem or create resource, err %d\n", ret);
return ret;
}
@ -610,9 +643,11 @@ void iser_conn_release(struct iser_conn *iser_conn)
mutex_unlock(&ig.connlist_mutex);
mutex_lock(&iser_conn->state_mutex);
if (iser_conn->state != ISER_CONN_DOWN)
if (iser_conn->state != ISER_CONN_DOWN) {
iser_warn("iser conn %p state %d, expected state down.\n",
iser_conn, iser_conn->state);
iser_conn->state = ISER_CONN_DOWN;
}
/*
* In case we never got to bind stage, we still need to
* release IB resources (which is safe to call more than once).
@ -662,8 +697,10 @@ int iser_conn_terminate(struct iser_conn *iser_conn)
/* post an indication that all flush errors were consumed */
err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr);
if (err)
if (err) {
iser_err("conn %p failed to post beacon", ib_conn);
return 1;
}
wait_for_completion(&ib_conn->flush_comp);
}
@ -846,20 +883,21 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve
break;
case RDMA_CM_EVENT_DISCONNECTED:
case RDMA_CM_EVENT_ADDR_CHANGE:
iser_disconnected_handler(cma_id);
case RDMA_CM_EVENT_TIMEWAIT_EXIT:
iser_cleanup_handler(cma_id, false);
break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
/*
* we *must* destroy the device as we cannot rely
* on iscsid to be around to initiate error handling.
* also implicitly destroy the cma_id.
* also if we are not in state DOWN implicitly destroy
* the cma_id.
*/
iser_cleanup_handler(cma_id, true);
iser_conn->ib_conn.cma_id = NULL;
ret = 1;
break;
case RDMA_CM_EVENT_TIMEWAIT_EXIT:
iser_cleanup_handler(cma_id, false);
if (iser_conn->state != ISER_CONN_DOWN) {
iser_conn->ib_conn.cma_id = NULL;
ret = 1;
}
break;
default:
iser_err("Unexpected RDMA CM event (%d)\n", event->event);
@ -981,7 +1019,6 @@ int iser_reg_page_vec(struct ib_conn *ib_conn,
mem_reg->rkey = mem->fmr->rkey;
mem_reg->len = page_vec->length * SIZE_4K;
mem_reg->va = io_addr;
mem_reg->is_mr = 1;
mem_reg->mem_h = (void *)mem;
mem_reg->va += page_vec->offset;
@ -1008,7 +1045,7 @@ void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;
int ret;
if (!reg->is_mr)
if (!reg->mem_h)
return;
iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h);
@ -1028,11 +1065,10 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
struct ib_conn *ib_conn = &iser_conn->ib_conn;
struct fast_reg_descriptor *desc = reg->mem_h;
if (!reg->is_mr)
if (!desc)
return;
reg->mem_h = NULL;
reg->is_mr = 0;
spin_lock_bh(&ib_conn->lock);
list_add_tail(&desc->list, &ib_conn->fastreg.pool);
spin_unlock_bh(&ib_conn->lock);
@ -1049,7 +1085,7 @@ int iser_post_recvl(struct iser_conn *iser_conn)
sge.length = ISER_RX_LOGIN_SIZE;
sge.lkey = ib_conn->device->mr->lkey;
rx_wr.wr_id = (unsigned long)iser_conn->login_resp_buf;
rx_wr.wr_id = (uintptr_t)iser_conn->login_resp_buf;
rx_wr.sg_list = &sge;
rx_wr.num_sge = 1;
rx_wr.next = NULL;
@ -1073,7 +1109,7 @@ int iser_post_recvm(struct iser_conn *iser_conn, int count)
for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
rx_desc = &iser_conn->rx_descs[my_rx_head];
rx_wr->wr_id = (unsigned long)rx_desc;
rx_wr->wr_id = (uintptr_t)rx_desc;
rx_wr->sg_list = &rx_desc->rx_sg;
rx_wr->num_sge = 1;
rx_wr->next = rx_wr + 1;
@ -1110,7 +1146,7 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
DMA_TO_DEVICE);
send_wr.next = NULL;
send_wr.wr_id = (unsigned long)tx_desc;
send_wr.wr_id = (uintptr_t)tx_desc;
send_wr.sg_list = tx_desc->tx_sg;
send_wr.num_sge = tx_desc->num_sge;
send_wr.opcode = IB_WR_SEND;
@ -1160,6 +1196,7 @@ static void
iser_handle_comp_error(struct ib_conn *ib_conn,
struct ib_wc *wc)
{
void *wr_id = (void *)(uintptr_t)wc->wr_id;
struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
ib_conn);
@ -1168,8 +1205,8 @@ iser_handle_comp_error(struct ib_conn *ib_conn,
iscsi_conn_failure(iser_conn->iscsi_conn,
ISCSI_ERR_CONN_FAILED);
if (is_iser_tx_desc(iser_conn, (void *)wc->wr_id)) {
struct iser_tx_desc *desc = (struct iser_tx_desc *)wc->wr_id;
if (is_iser_tx_desc(iser_conn, wr_id)) {
struct iser_tx_desc *desc = wr_id;
if (desc->type == ISCSI_TX_DATAOUT)
kmem_cache_free(ig.desc_cache, desc);
@ -1193,14 +1230,14 @@ static void iser_handle_wc(struct ib_wc *wc)
struct iser_rx_desc *rx_desc;
ib_conn = wc->qp->qp_context;
if (wc->status == IB_WC_SUCCESS) {
if (likely(wc->status == IB_WC_SUCCESS)) {
if (wc->opcode == IB_WC_RECV) {
rx_desc = (struct iser_rx_desc *)wc->wr_id;
rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id;
iser_rcv_completion(rx_desc, wc->byte_len,
ib_conn);
} else
if (wc->opcode == IB_WC_SEND) {
tx_desc = (struct iser_tx_desc *)wc->wr_id;
tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id;
iser_snd_completion(tx_desc, ib_conn);
} else {
iser_err("Unknown wc opcode %d\n", wc->opcode);

View file

@ -2929,7 +2929,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target)
return -ENOMEM;
sep_opt = options;
while ((p = strsep(&sep_opt, ",")) != NULL) {
while ((p = strsep(&sep_opt, ",\n")) != NULL) {
if (!*p)
continue;

View file

@ -171,9 +171,9 @@ int mlx4_check_port_params(struct mlx4_dev *dev,
{
int i;
for (i = 0; i < dev->caps.num_ports - 1; i++) {
if (port_type[i] != port_type[i + 1]) {
if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
for (i = 0; i < dev->caps.num_ports - 1; i++) {
if (port_type[i] != port_type[i + 1]) {
mlx4_err(dev, "Only same port types supported on this HCA, aborting\n");
return -EINVAL;
}

View file

@ -157,6 +157,8 @@ static const char *eqe_type_str(u8 type)
return "MLX5_EVENT_TYPE_CMD";
case MLX5_EVENT_TYPE_PAGE_REQUEST:
return "MLX5_EVENT_TYPE_PAGE_REQUEST";
case MLX5_EVENT_TYPE_PAGE_FAULT:
return "MLX5_EVENT_TYPE_PAGE_FAULT";
default:
return "Unrecognized event";
}
@ -279,6 +281,11 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
}
break;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
case MLX5_EVENT_TYPE_PAGE_FAULT:
mlx5_eq_pagefault(dev, eqe);
break;
#endif
default:
mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n",
@ -446,8 +453,12 @@ void mlx5_eq_cleanup(struct mlx5_core_dev *dev)
int mlx5_start_eqs(struct mlx5_core_dev *dev)
{
struct mlx5_eq_table *table = &dev->priv.eq_table;
u32 async_event_mask = MLX5_ASYNC_EVENT_MASK;
int err;
if (dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)
async_event_mask |= (1ull << MLX5_EVENT_TYPE_PAGE_FAULT);
err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
"mlx5_cmd_eq", &dev->priv.uuari.uars[0]);
@ -459,7 +470,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
mlx5_cmd_use_events(dev);
err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC,
MLX5_NUM_ASYNC_EQE, MLX5_ASYNC_EVENT_MASK,
MLX5_NUM_ASYNC_EQE, async_event_mask,
"mlx5_async_eq", &dev->priv.uuari.uars[0]);
if (err) {
mlx5_core_warn(dev, "failed to create async EQ %d\n", err);

View file

@ -69,6 +69,46 @@ int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, struct mlx5_caps *caps)
return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR);
}
int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *caps)
{
u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)];
int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
void *out;
int err;
if (!(dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG))
return -ENOTSUPP;
memset(in, 0, sizeof(in));
out = kzalloc(out_sz, GFP_KERNEL);
if (!out)
return -ENOMEM;
MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
MLX5_SET(query_hca_cap_in, in, op_mod, HCA_CAP_OPMOD_GET_ODP_CUR);
err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz);
if (err)
goto out;
err = mlx5_cmd_status_to_err_v2(out);
if (err) {
mlx5_core_warn(dev, "query cur hca ODP caps failed, %d\n", err);
goto out;
}
memcpy(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability_struct),
sizeof(*caps));
mlx5_core_dbg(dev, "on-demand paging capabilities:\nrc: %08x\nuc: %08x\nud: %08x\n",
be32_to_cpu(caps->per_transport_caps.rc_odp_caps),
be32_to_cpu(caps->per_transport_caps.uc_odp_caps),
be32_to_cpu(caps->per_transport_caps.ud_odp_caps));
out:
kfree(out);
return err;
}
EXPORT_SYMBOL(mlx5_query_odp_caps);
int mlx5_cmd_init_hca(struct mlx5_core_dev *dev)
{
struct mlx5_cmd_init_hca_mbox_in in;

View file

@ -88,6 +88,95 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type)
mlx5_core_put_rsc(common);
}
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe)
{
struct mlx5_eqe_page_fault *pf_eqe = &eqe->data.page_fault;
int qpn = be32_to_cpu(pf_eqe->flags_qpn) & MLX5_QPN_MASK;
struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, qpn);
struct mlx5_core_qp *qp =
container_of(common, struct mlx5_core_qp, common);
struct mlx5_pagefault pfault;
if (!qp) {
mlx5_core_warn(dev, "ODP event for non-existent QP %06x\n",
qpn);
return;
}
pfault.event_subtype = eqe->sub_type;
pfault.flags = (be32_to_cpu(pf_eqe->flags_qpn) >> MLX5_QPN_BITS) &
(MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE | MLX5_PFAULT_RDMA);
pfault.bytes_committed = be32_to_cpu(
pf_eqe->bytes_committed);
mlx5_core_dbg(dev,
"PAGE_FAULT: subtype: 0x%02x, flags: 0x%02x,\n",
eqe->sub_type, pfault.flags);
switch (eqe->sub_type) {
case MLX5_PFAULT_SUBTYPE_RDMA:
/* RDMA based event */
pfault.rdma.r_key =
be32_to_cpu(pf_eqe->rdma.r_key);
pfault.rdma.packet_size =
be16_to_cpu(pf_eqe->rdma.packet_length);
pfault.rdma.rdma_op_len =
be32_to_cpu(pf_eqe->rdma.rdma_op_len);
pfault.rdma.rdma_va =
be64_to_cpu(pf_eqe->rdma.rdma_va);
mlx5_core_dbg(dev,
"PAGE_FAULT: qpn: 0x%06x, r_key: 0x%08x,\n",
qpn, pfault.rdma.r_key);
mlx5_core_dbg(dev,
"PAGE_FAULT: rdma_op_len: 0x%08x,\n",
pfault.rdma.rdma_op_len);
mlx5_core_dbg(dev,
"PAGE_FAULT: rdma_va: 0x%016llx,\n",
pfault.rdma.rdma_va);
mlx5_core_dbg(dev,
"PAGE_FAULT: bytes_committed: 0x%06x\n",
pfault.bytes_committed);
break;
case MLX5_PFAULT_SUBTYPE_WQE:
/* WQE based event */
pfault.wqe.wqe_index =
be16_to_cpu(pf_eqe->wqe.wqe_index);
pfault.wqe.packet_size =
be16_to_cpu(pf_eqe->wqe.packet_length);
mlx5_core_dbg(dev,
"PAGE_FAULT: qpn: 0x%06x, wqe_index: 0x%04x,\n",
qpn, pfault.wqe.wqe_index);
mlx5_core_dbg(dev,
"PAGE_FAULT: bytes_committed: 0x%06x\n",
pfault.bytes_committed);
break;
default:
mlx5_core_warn(dev,
"Unsupported page fault event sub-type: 0x%02hhx, QP %06x\n",
eqe->sub_type, qpn);
/* Unsupported page faults should still be resolved by the
* page fault handler
*/
}
if (qp->pfault_handler) {
qp->pfault_handler(qp, &pfault);
} else {
mlx5_core_err(dev,
"ODP event for QP %08x, without a fault handler in QP\n",
qpn);
/* Page fault will remain unresolved. QP will hang until it is
* destroyed
*/
}
mlx5_core_put_rsc(common);
}
#endif
int mlx5_core_create_qp(struct mlx5_core_dev *dev,
struct mlx5_core_qp *qp,
struct mlx5_create_qp_mbox_in *in,
@ -322,3 +411,33 @@ int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn)
return err;
}
EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc);
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
u8 flags, int error)
{
struct mlx5_page_fault_resume_mbox_in in;
struct mlx5_page_fault_resume_mbox_out out;
int err;
memset(&in, 0, sizeof(in));
memset(&out, 0, sizeof(out));
in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_PAGE_FAULT_RESUME);
in.hdr.opmod = 0;
flags &= (MLX5_PAGE_FAULT_RESUME_REQUESTOR |
MLX5_PAGE_FAULT_RESUME_WRITE |
MLX5_PAGE_FAULT_RESUME_RDMA);
flags |= (error ? MLX5_PAGE_FAULT_RESUME_ERROR : 0);
in.flags_qpn = cpu_to_be32((qpn & MLX5_QPN_MASK) |
(flags << MLX5_QPN_BITS));
err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
if (err)
return err;
if (out.hdr.status)
err = mlx5_cmd_status_to_err(&out.hdr);
return err;
}
EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume);
#endif

View file

@ -119,6 +119,15 @@ enum {
MLX5_MAX_LOG_PKEY_TABLE = 5,
};
enum {
MLX5_MKEY_INBOX_PG_ACCESS = 1 << 31
};
enum {
MLX5_PFAULT_SUBTYPE_WQE = 0,
MLX5_PFAULT_SUBTYPE_RDMA = 1,
};
enum {
MLX5_PERM_LOCAL_READ = 1 << 2,
MLX5_PERM_LOCAL_WRITE = 1 << 3,
@ -180,6 +189,19 @@ enum {
MLX5_MKEY_MASK_FREE = 1ull << 29,
};
enum {
MLX5_UMR_TRANSLATION_OFFSET_EN = (1 << 4),
MLX5_UMR_CHECK_NOT_FREE = (1 << 5),
MLX5_UMR_CHECK_FREE = (2 << 5),
MLX5_UMR_INLINE = (1 << 7),
};
#define MLX5_UMR_MTT_ALIGNMENT 0x40
#define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1)
#define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT
enum mlx5_event {
MLX5_EVENT_TYPE_COMP = 0x0,
@ -206,6 +228,8 @@ enum mlx5_event {
MLX5_EVENT_TYPE_CMD = 0x0a,
MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb,
MLX5_EVENT_TYPE_PAGE_FAULT = 0xc,
};
enum {
@ -225,6 +249,7 @@ enum {
MLX5_DEV_CAP_FLAG_APM = 1LL << 17,
MLX5_DEV_CAP_FLAG_ATOMIC = 1LL << 18,
MLX5_DEV_CAP_FLAG_BLOCK_MCAST = 1LL << 23,
MLX5_DEV_CAP_FLAG_ON_DMND_PG = 1LL << 24,
MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29,
MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30,
MLX5_DEV_CAP_FLAG_DCT = 1LL << 37,
@ -290,6 +315,8 @@ enum {
enum {
HCA_CAP_OPMOD_GET_MAX = 0,
HCA_CAP_OPMOD_GET_CUR = 1,
HCA_CAP_OPMOD_GET_ODP_MAX = 4,
HCA_CAP_OPMOD_GET_ODP_CUR = 5
};
struct mlx5_inbox_hdr {
@ -319,6 +346,23 @@ struct mlx5_cmd_query_adapter_mbox_out {
u8 vsd_psid[16];
};
enum mlx5_odp_transport_cap_bits {
MLX5_ODP_SUPPORT_SEND = 1 << 31,
MLX5_ODP_SUPPORT_RECV = 1 << 30,
MLX5_ODP_SUPPORT_WRITE = 1 << 29,
MLX5_ODP_SUPPORT_READ = 1 << 28,
};
struct mlx5_odp_caps {
char reserved[0x10];
struct {
__be32 rc_odp_caps;
__be32 uc_odp_caps;
__be32 ud_odp_caps;
} per_transport_caps;
char reserved2[0xe4];
};
struct mlx5_cmd_init_hca_mbox_in {
struct mlx5_inbox_hdr hdr;
u8 rsvd0[2];
@ -439,6 +483,27 @@ struct mlx5_eqe_page_req {
__be32 rsvd1[5];
};
struct mlx5_eqe_page_fault {
__be32 bytes_committed;
union {
struct {
u16 reserved1;
__be16 wqe_index;
u16 reserved2;
__be16 packet_length;
u8 reserved3[12];
} __packed wqe;
struct {
__be32 r_key;
u16 reserved1;
__be16 packet_length;
__be32 rdma_op_len;
__be64 rdma_va;
} __packed rdma;
} __packed;
__be32 flags_qpn;
} __packed;
union ev_data {
__be32 raw[7];
struct mlx5_eqe_cmd cmd;
@ -450,6 +515,7 @@ union ev_data {
struct mlx5_eqe_congestion cong;
struct mlx5_eqe_stall_vl stall_vl;
struct mlx5_eqe_page_req req_pages;
struct mlx5_eqe_page_fault page_fault;
} __packed;
struct mlx5_eqe {
@ -776,6 +842,10 @@ struct mlx5_query_eq_mbox_out {
struct mlx5_eq_context ctx;
};
enum {
MLX5_MKEY_STATUS_FREE = 1 << 6,
};
struct mlx5_mkey_seg {
/* This is a two bit field occupying bits 31-30.
* bit 31 is always 0,
@ -812,7 +882,7 @@ struct mlx5_query_special_ctxs_mbox_out {
struct mlx5_create_mkey_mbox_in {
struct mlx5_inbox_hdr hdr;
__be32 input_mkey_index;
u8 rsvd0[4];
__be32 flags;
struct mlx5_mkey_seg seg;
u8 rsvd1[16];
__be32 xlat_oct_act_size;

View file

@ -113,6 +113,13 @@ enum {
MLX5_REG_HOST_ENDIANNESS = 0x7004,
};
enum mlx5_page_fault_resume_flags {
MLX5_PAGE_FAULT_RESUME_REQUESTOR = 1 << 0,
MLX5_PAGE_FAULT_RESUME_WRITE = 1 << 1,
MLX5_PAGE_FAULT_RESUME_RDMA = 1 << 2,
MLX5_PAGE_FAULT_RESUME_ERROR = 1 << 7,
};
enum dbg_rsc_type {
MLX5_DBG_RSC_QP,
MLX5_DBG_RSC_EQ,
@ -467,7 +474,7 @@ struct mlx5_priv {
struct workqueue_struct *pg_wq;
struct rb_root page_root;
int fw_pages;
int reg_pages;
atomic_t reg_pages;
struct list_head free_list;
struct mlx5_core_health health;
@ -703,6 +710,9 @@ void mlx5_eq_cleanup(struct mlx5_core_dev *dev);
void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas);
void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn);
void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type);
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe);
#endif
void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type);
struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn);
void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector);
@ -740,6 +750,8 @@ int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn,
int npsvs, u32 *sig_index);
int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num);
void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common);
int mlx5_query_odp_caps(struct mlx5_core_dev *dev,
struct mlx5_odp_caps *odp_caps);
static inline u32 mlx5_mkey_to_idx(u32 mkey)
{

View file

@ -50,6 +50,9 @@
#define MLX5_BSF_APPTAG_ESCAPE 0x1
#define MLX5_BSF_APPREF_ESCAPE 0x2
#define MLX5_QPN_BITS 24
#define MLX5_QPN_MASK ((1 << MLX5_QPN_BITS) - 1)
enum mlx5_qp_optpar {
MLX5_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0,
MLX5_QP_OPTPAR_RRE = 1 << 1,
@ -189,6 +192,14 @@ struct mlx5_wqe_ctrl_seg {
__be32 imm;
};
#define MLX5_WQE_CTRL_DS_MASK 0x3f
#define MLX5_WQE_CTRL_QPN_MASK 0xffffff00
#define MLX5_WQE_CTRL_QPN_SHIFT 8
#define MLX5_WQE_DS_UNITS 16
#define MLX5_WQE_CTRL_OPCODE_MASK 0xff
#define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00
#define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8
struct mlx5_wqe_xrc_seg {
__be32 xrc_srqn;
u8 rsvd[12];
@ -292,6 +303,8 @@ struct mlx5_wqe_signature_seg {
u8 rsvd1[11];
};
#define MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK 0x3ff
struct mlx5_wqe_inline_seg {
__be32 byte_count;
};
@ -360,9 +373,46 @@ struct mlx5_stride_block_ctrl_seg {
__be16 num_entries;
};
enum mlx5_pagefault_flags {
MLX5_PFAULT_REQUESTOR = 1 << 0,
MLX5_PFAULT_WRITE = 1 << 1,
MLX5_PFAULT_RDMA = 1 << 2,
};
/* Contains the details of a pagefault. */
struct mlx5_pagefault {
u32 bytes_committed;
u8 event_subtype;
enum mlx5_pagefault_flags flags;
union {
/* Initiator or send message responder pagefault details. */
struct {
/* Received packet size, only valid for responders. */
u32 packet_size;
/*
* WQE index. Refers to either the send queue or
* receive queue, according to event_subtype.
*/
u16 wqe_index;
} wqe;
/* RDMA responder pagefault details */
struct {
u32 r_key;
/*
* Received packet size, minimal size page fault
* resolution required for forward progress.
*/
u32 packet_size;
u32 rdma_op_len;
u64 rdma_va;
} rdma;
};
};
struct mlx5_core_qp {
struct mlx5_core_rsc_common common; /* must be first */
void (*event) (struct mlx5_core_qp *, int);
void (*pfault_handler)(struct mlx5_core_qp *, struct mlx5_pagefault *);
int qpn;
struct mlx5_rsc_debug *dbg;
int pid;
@ -530,6 +580,17 @@ static inline struct mlx5_core_mr *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u
return radix_tree_lookup(&dev->priv.mr_table.tree, key);
}
struct mlx5_page_fault_resume_mbox_in {
struct mlx5_inbox_hdr hdr;
__be32 flags_qpn;
u8 reserved[4];
};
struct mlx5_page_fault_resume_mbox_out {
struct mlx5_outbox_hdr hdr;
u8 rsvd[8];
};
int mlx5_core_create_qp(struct mlx5_core_dev *dev,
struct mlx5_core_qp *qp,
struct mlx5_create_qp_mbox_in *in,
@ -549,6 +610,10 @@ void mlx5_init_qp_table(struct mlx5_core_dev *dev);
void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev);
int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
u8 context, int error);
#endif
static inline const char *mlx5_qp_type_str(int type)
{

View file

@ -38,11 +38,12 @@
#include <linux/workqueue.h>
struct ib_ucontext;
struct ib_umem_odp;
struct ib_umem {
struct ib_ucontext *context;
size_t length;
int offset;
unsigned long address;
int page_size;
int writable;
int hugetlb;
@ -50,17 +51,43 @@ struct ib_umem {
struct pid *pid;
struct mm_struct *mm;
unsigned long diff;
struct ib_umem_odp *odp_data;
struct sg_table sg_head;
int nmap;
int npages;
};
/* Returns the offset of the umem start relative to the first page. */
static inline int ib_umem_offset(struct ib_umem *umem)
{
return umem->address & ((unsigned long)umem->page_size - 1);
}
/* Returns the first page of an ODP umem. */
static inline unsigned long ib_umem_start(struct ib_umem *umem)
{
return umem->address - ib_umem_offset(umem);
}
/* Returns the address of the page after the last one of an ODP umem. */
static inline unsigned long ib_umem_end(struct ib_umem *umem)
{
return PAGE_ALIGN(umem->address + umem->length);
}
static inline size_t ib_umem_num_pages(struct ib_umem *umem)
{
return (ib_umem_end(umem) - ib_umem_start(umem)) >> PAGE_SHIFT;
}
#ifdef CONFIG_INFINIBAND_USER_MEM
struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
size_t size, int access, int dmasync);
void ib_umem_release(struct ib_umem *umem);
int ib_umem_page_count(struct ib_umem *umem);
int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
size_t length);
#else /* CONFIG_INFINIBAND_USER_MEM */
@ -73,7 +100,10 @@ static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context,
}
static inline void ib_umem_release(struct ib_umem *umem) { }
static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; }
static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
size_t length) {
return -EINVAL;
}
#endif /* CONFIG_INFINIBAND_USER_MEM */
#endif /* IB_UMEM_H */

160
include/rdma/ib_umem_odp.h Normal file
View file

@ -0,0 +1,160 @@
/*
* Copyright (c) 2014 Mellanox Technologies. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef IB_UMEM_ODP_H
#define IB_UMEM_ODP_H
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>
#include <linux/interval_tree.h>
struct umem_odp_node {
u64 __subtree_last;
struct rb_node rb;
};
struct ib_umem_odp {
/*
* An array of the pages included in the on-demand paging umem.
* Indices of pages that are currently not mapped into the device will
* contain NULL.
*/
struct page **page_list;
/*
* An array of the same size as page_list, with DMA addresses mapped
* for pages the pages in page_list. The lower two bits designate
* access permissions. See ODP_READ_ALLOWED_BIT and
* ODP_WRITE_ALLOWED_BIT.
*/
dma_addr_t *dma_list;
/*
* The umem_mutex protects the page_list and dma_list fields of an ODP
* umem, allowing only a single thread to map/unmap pages. The mutex
* also protects access to the mmu notifier counters.
*/
struct mutex umem_mutex;
void *private; /* for the HW driver to use. */
/* When false, use the notifier counter in the ucontext struct. */
bool mn_counters_active;
int notifiers_seq;
int notifiers_count;
/* A linked list of umems that don't have private mmu notifier
* counters yet. */
struct list_head no_private_counters;
struct ib_umem *umem;
/* Tree tracking */
struct umem_odp_node interval_tree;
struct completion notifier_completion;
int dying;
};
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem);
void ib_umem_odp_release(struct ib_umem *umem);
/*
* The lower 2 bits of the DMA address signal the R/W permissions for
* the entry. To upgrade the permissions, provide the appropriate
* bitmask to the map_dma_pages function.
*
* Be aware that upgrading a mapped address might result in change of
* the DMA address for the page.
*/
#define ODP_READ_ALLOWED_BIT (1<<0ULL)
#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt,
u64 access_mask, unsigned long current_seq);
void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset,
u64 bound);
void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root);
void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root);
typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end,
void *cookie);
/*
* Call the callback on each ib_umem in the range. Returns the logical or of
* the return values of the functions called.
*/
int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end,
umem_call_back cb, void *cookie);
struct umem_odp_node *rbt_ib_umem_iter_first(struct rb_root *root,
u64 start, u64 last);
struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node,
u64 start, u64 last);
static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item,
unsigned long mmu_seq)
{
/*
* This code is strongly based on the KVM code from
* mmu_notifier_retry. Should be called with
* the relevant locks taken (item->odp_data->umem_mutex
* and the ucontext umem_mutex semaphore locked for read).
*/
/* Do not allow page faults while the new ib_umem hasn't seen a state
* with zero notifiers yet, and doesn't have its own valid set of
* private counters. */
if (!item->odp_data->mn_counters_active)
return 1;
if (unlikely(item->odp_data->notifiers_count))
return 1;
if (item->odp_data->notifiers_seq != mmu_seq)
return 1;
return 0;
}
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline int ib_umem_odp_get(struct ib_ucontext *context,
struct ib_umem *umem)
{
return -EINVAL;
}
static inline void ib_umem_odp_release(struct ib_umem *umem) {}
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
#endif /* IB_UMEM_ODP_H */

View file

@ -51,6 +51,7 @@
#include <uapi/linux/if_ether.h>
#include <linux/atomic.h>
#include <linux/mmu_notifier.h>
#include <asm/uaccess.h>
extern struct workqueue_struct *ib_wq;
@ -123,7 +124,8 @@ enum ib_device_cap_flags {
IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23),
IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24),
IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29),
IB_DEVICE_SIGNATURE_HANDOVER = (1<<30)
IB_DEVICE_SIGNATURE_HANDOVER = (1<<30),
IB_DEVICE_ON_DEMAND_PAGING = (1<<31),
};
enum ib_signature_prot_cap {
@ -143,6 +145,27 @@ enum ib_atomic_cap {
IB_ATOMIC_GLOB
};
enum ib_odp_general_cap_bits {
IB_ODP_SUPPORT = 1 << 0,
};
enum ib_odp_transport_cap_bits {
IB_ODP_SUPPORT_SEND = 1 << 0,
IB_ODP_SUPPORT_RECV = 1 << 1,
IB_ODP_SUPPORT_WRITE = 1 << 2,
IB_ODP_SUPPORT_READ = 1 << 3,
IB_ODP_SUPPORT_ATOMIC = 1 << 4,
};
struct ib_odp_caps {
uint64_t general_caps;
struct {
uint32_t rc_odp_caps;
uint32_t uc_odp_caps;
uint32_t ud_odp_caps;
} per_transport_caps;
};
struct ib_device_attr {
u64 fw_ver;
__be64 sys_image_guid;
@ -186,6 +209,7 @@ struct ib_device_attr {
u8 local_ca_ack_delay;
int sig_prot_cap;
int sig_guard_cap;
struct ib_odp_caps odp_caps;
};
enum ib_mtu {
@ -1073,7 +1097,8 @@ enum ib_access_flags {
IB_ACCESS_REMOTE_READ = (1<<2),
IB_ACCESS_REMOTE_ATOMIC = (1<<3),
IB_ACCESS_MW_BIND = (1<<4),
IB_ZERO_BASED = (1<<5)
IB_ZERO_BASED = (1<<5),
IB_ACCESS_ON_DEMAND = (1<<6),
};
struct ib_phys_buf {
@ -1115,6 +1140,8 @@ struct ib_fmr_attr {
u8 page_shift;
};
struct ib_umem;
struct ib_ucontext {
struct ib_device *device;
struct list_head pd_list;
@ -1127,6 +1154,24 @@ struct ib_ucontext {
struct list_head xrcd_list;
struct list_head rule_list;
int closing;
struct pid *tgid;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct rb_root umem_tree;
/*
* Protects .umem_rbroot and tree, as well as odp_mrs_count and
* mmu notifiers registration.
*/
struct rw_semaphore umem_rwsem;
void (*invalidate_range)(struct ib_umem *umem,
unsigned long start, unsigned long end);
struct mmu_notifier mn;
atomic_t notifier_count;
/* A list of umems that don't have private mmu notifier counters yet. */
struct list_head no_private_counters;
int odp_mrs_count;
#endif
};
struct ib_uobject {
@ -1662,7 +1707,10 @@ static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t
static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len)
{
return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0;
size_t copy_sz;
copy_sz = min_t(size_t, len, udata->outlen);
return copy_to_user(udata->outbuf, src, copy_sz) ? -EFAULT : 0;
}
/**

View file

@ -90,8 +90,9 @@ enum {
};
enum {
IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE,
IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD,
IB_USER_VERBS_EX_CMD_DESTROY_FLOW
IB_USER_VERBS_EX_CMD_DESTROY_FLOW,
};
/*
@ -201,6 +202,32 @@ struct ib_uverbs_query_device_resp {
__u8 reserved[4];
};
enum {
IB_USER_VERBS_EX_QUERY_DEVICE_ODP = 1ULL << 0,
};
struct ib_uverbs_ex_query_device {
__u32 comp_mask;
__u32 reserved;
};
struct ib_uverbs_odp_caps {
__u64 general_caps;
struct {
__u32 rc_odp_caps;
__u32 uc_odp_caps;
__u32 ud_odp_caps;
} per_transport_caps;
__u32 reserved;
};
struct ib_uverbs_ex_query_device_resp {
struct ib_uverbs_query_device_resp base;
__u32 comp_mask;
__u32 reserved;
struct ib_uverbs_odp_caps odp_caps;
};
struct ib_uverbs_query_port {
__u64 response;
__u8 port_num;