From 8c4037b501acd2ec3abc7925e66af8af40a2da9d Mon Sep 17 00:00:00 2001 From: David Dillow Date: Fri, 14 Jan 2011 17:02:23 -0500 Subject: [PATCH 01/10] IB/srp: always avoid non-zero offsets into an FMR It is unclear exactly how this code works around Mellanox SRP targets, or if the problem is on the target side or in the HCA itself. In an abundance of caution, we should always enable the workaround. Signed-off-by: David Dillow --- drivers/infiniband/ulp/srp/ib_srp.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 83664ed2804f..197e26cc2b0b 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -72,12 +72,6 @@ module_param(topspin_workarounds, int, 0444); MODULE_PARM_DESC(topspin_workarounds, "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); -static int mellanox_workarounds = 1; - -module_param(mellanox_workarounds, int, 0444); -MODULE_PARM_DESC(mellanox_workarounds, - "Enable workarounds for Mellanox SRP target bugs if != 0"); - static void srp_add_one(struct ib_device *device); static void srp_remove_one(struct ib_device *device); static void srp_recv_completion(struct ib_cq *cq, void *target_ptr); @@ -114,14 +108,6 @@ static int srp_target_is_topspin(struct srp_target_port *target) !memcmp(&target->ioc_guid, cisco_oui, sizeof cisco_oui)); } -static int srp_target_is_mellanox(struct srp_target_port *target) -{ - static const u8 mellanox_oui[3] = { 0x00, 0x02, 0xc9 }; - - return mellanox_workarounds && - !memcmp(&target->ioc_guid, mellanox_oui, sizeof mellanox_oui); -} - static struct srp_iu *srp_alloc_iu(struct srp_host *host, size_t size, gfp_t gfp_mask, enum dma_data_direction direction) @@ -662,8 +648,7 @@ static int srp_map_fmr(struct srp_target_port *target, struct scatterlist *scat, if (!dev->fmr_pool) return -ENODEV; - if (srp_target_is_mellanox(target) && - (ib_sg_dma_address(ibdev, &scat[0]) & ~dev->fmr_page_mask)) + if (ib_sg_dma_address(ibdev, &scat[0]) & ~dev->fmr_page_mask) return -EINVAL; len = page_cnt = 0; From 961e0be89a5120a1409ebc525cca6f603615a8a8 Mon Sep 17 00:00:00 2001 From: David Dillow Date: Fri, 14 Jan 2011 17:32:07 -0500 Subject: [PATCH 02/10] IB/srp: move IB CM setup completion into its own function This is to clean up prior to further changes. Signed-off-by: David Dillow --- drivers/infiniband/ulp/srp/ib_srp.c | 144 ++++++++++++++-------------- 1 file changed, 73 insertions(+), 71 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 197e26cc2b0b..060e6a84f18f 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1213,6 +1213,78 @@ err: return -ENOMEM; } +static void srp_cm_rep_handler(struct ib_cm_id *cm_id, + struct srp_login_rsp *lrsp, + struct srp_target_port *target) +{ + struct ib_qp_attr *qp_attr = NULL; + int attr_mask = 0; + int ret; + int i; + + if (lrsp->opcode == SRP_LOGIN_RSP) { + target->max_ti_iu_len = be32_to_cpu(lrsp->max_ti_iu_len); + target->req_lim = be32_to_cpu(lrsp->req_lim_delta); + + /* + * Reserve credits for task management so we don't + * bounce requests back to the SCSI mid-layer. + */ + target->scsi_host->can_queue + = min(target->req_lim - SRP_TSK_MGMT_SQ_SIZE, + target->scsi_host->can_queue); + } else { + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled RSP opcode %#x\n", lrsp->opcode); + ret = -ECONNRESET; + goto error; + } + + if (!target->rx_ring[0]) { + ret = srp_alloc_iu_bufs(target); + if (ret) + goto error; + } + + ret = -ENOMEM; + qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); + if (!qp_attr) + goto error; + + qp_attr->qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (ret) + goto error_free; + + ret = ib_modify_qp(target->qp, qp_attr, attr_mask); + if (ret) + goto error_free; + + for (i = 0; i < SRP_RQ_SIZE; i++) { + struct srp_iu *iu = target->rx_ring[i]; + ret = srp_post_recv(target, iu); + if (ret) + goto error_free; + } + + qp_attr->qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (ret) + goto error_free; + + ret = ib_modify_qp(target->qp, qp_attr, attr_mask); + if (ret) + goto error_free; + + ret = ib_send_cm_rtu(cm_id, NULL, 0); + +error_free: + kfree(qp_attr); + +error: + target->status = ret; +} + static void srp_cm_rej_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event, struct srp_target_port *target) @@ -1296,11 +1368,7 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct srp_target_port *target = cm_id->context; - struct ib_qp_attr *qp_attr = NULL; - int attr_mask = 0; int comp = 0; - int opcode = 0; - int i; switch (event->event) { case IB_CM_REQ_ERROR: @@ -1312,71 +1380,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) case IB_CM_REP_RECEIVED: comp = 1; - opcode = *(u8 *) event->private_data; - - if (opcode == SRP_LOGIN_RSP) { - struct srp_login_rsp *rsp = event->private_data; - - target->max_ti_iu_len = be32_to_cpu(rsp->max_ti_iu_len); - target->req_lim = be32_to_cpu(rsp->req_lim_delta); - - /* - * Reserve credits for task management so we don't - * bounce requests back to the SCSI mid-layer. - */ - target->scsi_host->can_queue - = min(target->req_lim - SRP_TSK_MGMT_SQ_SIZE, - target->scsi_host->can_queue); - } else { - shost_printk(KERN_WARNING, target->scsi_host, - PFX "Unhandled RSP opcode %#x\n", opcode); - target->status = -ECONNRESET; - break; - } - - if (!target->rx_ring[0]) { - target->status = srp_alloc_iu_bufs(target); - if (target->status) - break; - } - - qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); - if (!qp_attr) { - target->status = -ENOMEM; - break; - } - - qp_attr->qp_state = IB_QPS_RTR; - target->status = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); - if (target->status) - break; - - target->status = ib_modify_qp(target->qp, qp_attr, attr_mask); - if (target->status) - break; - - for (i = 0; i < SRP_RQ_SIZE; i++) { - struct srp_iu *iu = target->rx_ring[i]; - target->status = srp_post_recv(target, iu); - if (target->status) - break; - } - if (target->status) - break; - - qp_attr->qp_state = IB_QPS_RTS; - target->status = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); - if (target->status) - break; - - target->status = ib_modify_qp(target->qp, qp_attr, attr_mask); - if (target->status) - break; - - target->status = ib_send_cm_rtu(cm_id, NULL, 0); - if (target->status) - break; - + srp_cm_rep_handler(cm_id, event->private_data, target); break; case IB_CM_REJ_RECEIVED: @@ -1416,8 +1420,6 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) if (comp) complete(&target->done); - kfree(qp_attr); - return 0; } From 4924864404d0ce2c32a6d20b27b5b6fcb31e481d Mon Sep 17 00:00:00 2001 From: David Dillow Date: Fri, 14 Jan 2011 18:23:24 -0500 Subject: [PATCH 03/10] IB/srp: allow sg_tablesize to be set for each target Different configurations of target software allow differing max sizes of the command IU. Allowing this to be changed per-target allows all targets on an initiator to get an optimal setting. We deprecate srp_sg_tablesize and replace it with cmd_sg_entries in preparation for allowing more indirect descriptors than can fit in the IU. Signed-off-by: David Dillow --- drivers/infiniband/ulp/srp/ib_srp.c | 83 ++++++++++++++++++++--------- drivers/infiniband/ulp/srp/ib_srp.h | 2 + 2 files changed, 60 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 060e6a84f18f..6f8ee0c7ef5f 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -59,15 +59,17 @@ MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator " "v" DRV_VERSION " (" DRV_RELDATE ")"); MODULE_LICENSE("Dual BSD/GPL"); -static int srp_sg_tablesize = SRP_DEF_SG_TABLESIZE; -static int srp_max_iu_len; - -module_param(srp_sg_tablesize, int, 0444); -MODULE_PARM_DESC(srp_sg_tablesize, - "Max number of gather/scatter entries per I/O (default is 12, max 255)"); - +static unsigned int srp_sg_tablesize; +static unsigned int cmd_sg_entries; static int topspin_workarounds = 1; +module_param(srp_sg_tablesize, uint, 0444); +MODULE_PARM_DESC(srp_sg_tablesize, "Deprecated name for cmd_sg_entries"); + +module_param(cmd_sg_entries, uint, 0444); +MODULE_PARM_DESC(cmd_sg_entries, + "Default number of gather/scatter entries in the SRP command (default is 12, max 255)"); + module_param(topspin_workarounds, int, 0444); MODULE_PARM_DESC(topspin_workarounds, "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); @@ -364,7 +366,7 @@ static int srp_send_req(struct srp_target_port *target) req->priv.opcode = SRP_LOGIN_REQ; req->priv.tag = 0; - req->priv.req_it_iu_len = cpu_to_be32(srp_max_iu_len); + req->priv.req_it_iu_len = cpu_to_be32(target->max_iu_len); req->priv.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | SRP_BUF_FORMAT_INDIRECT); /* @@ -1125,7 +1127,7 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) spin_unlock_irqrestore(&target->lock, flags); dev = target->srp_host->srp_dev->dev; - ib_dma_sync_single_for_cpu(dev, iu->dma, srp_max_iu_len, + ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_iu_len, DMA_TO_DEVICE); scmnd->result = 0; @@ -1149,7 +1151,7 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) goto err_iu; } - ib_dma_sync_single_for_device(dev, iu->dma, srp_max_iu_len, + ib_dma_sync_single_for_device(dev, iu->dma, target->max_iu_len, DMA_TO_DEVICE); if (srp_post_send(target, iu, len)) { @@ -1189,7 +1191,7 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target) for (i = 0; i < SRP_SQ_SIZE; ++i) { target->tx_ring[i] = srp_alloc_iu(target->srp_host, - srp_max_iu_len, + target->max_iu_len, GFP_KERNEL, DMA_TO_DEVICE); if (!target->tx_ring[i]) goto err; @@ -1645,6 +1647,14 @@ static ssize_t show_local_ib_device(struct device *dev, return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name); } +static ssize_t show_cmd_sg_entries(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%u\n", target->cmd_sg_cnt); +} + static DEVICE_ATTR(id_ext, S_IRUGO, show_id_ext, NULL); static DEVICE_ATTR(ioc_guid, S_IRUGO, show_ioc_guid, NULL); static DEVICE_ATTR(service_id, S_IRUGO, show_service_id, NULL); @@ -1655,6 +1665,7 @@ static DEVICE_ATTR(req_lim, S_IRUGO, show_req_lim, NULL); static DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL); static DEVICE_ATTR(local_ib_port, S_IRUGO, show_local_ib_port, NULL); static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL); +static DEVICE_ATTR(cmd_sg_entries, S_IRUGO, show_cmd_sg_entries, NULL); static struct device_attribute *srp_host_attrs[] = { &dev_attr_id_ext, @@ -1667,6 +1678,7 @@ static struct device_attribute *srp_host_attrs[] = { &dev_attr_zero_req_lim, &dev_attr_local_ib_port, &dev_attr_local_ib_device, + &dev_attr_cmd_sg_entries, NULL }; @@ -1679,6 +1691,7 @@ static struct scsi_host_template srp_template = { .eh_abort_handler = srp_abort, .eh_device_reset_handler = srp_reset_device, .eh_host_reset_handler = srp_reset_host, + .sg_tablesize = SRP_DEF_SG_TABLESIZE, .can_queue = SRP_CMD_SQ_SIZE, .this_id = -1, .cmd_per_lun = SRP_CMD_SQ_SIZE, @@ -1750,6 +1763,7 @@ enum { SRP_OPT_MAX_CMD_PER_LUN = 1 << 6, SRP_OPT_IO_CLASS = 1 << 7, SRP_OPT_INITIATOR_EXT = 1 << 8, + SRP_OPT_CMD_SG_ENTRIES = 1 << 9, SRP_OPT_ALL = (SRP_OPT_ID_EXT | SRP_OPT_IOC_GUID | SRP_OPT_DGID | @@ -1767,6 +1781,7 @@ static const match_table_t srp_opt_tokens = { { SRP_OPT_MAX_CMD_PER_LUN, "max_cmd_per_lun=%d" }, { SRP_OPT_IO_CLASS, "io_class=%x" }, { SRP_OPT_INITIATOR_EXT, "initiator_ext=%s" }, + { SRP_OPT_CMD_SG_ENTRIES, "cmd_sg_entries=%u" }, { SRP_OPT_ERR, NULL } }; @@ -1894,6 +1909,14 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) kfree(p); break; + case SRP_OPT_CMD_SG_ENTRIES: + if (match_int(args, &token) || token < 1 || token > 255) { + printk(KERN_WARNING PFX "bad max cmd_sg_entries parameter '%s'\n", p); + goto out; + } + target->cmd_sg_cnt = token; + break; + default: printk(KERN_WARNING PFX "unknown parameter or missing value " "'%s' in target creation request\n", p); @@ -1932,17 +1955,18 @@ static ssize_t srp_create_target(struct device *dev, if (!target_host) return -ENOMEM; - target_host->transportt = ib_srp_transport_template; + target_host->transportt = ib_srp_transport_template; target_host->max_lun = SRP_MAX_LUN; target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb; target = host_to_target(target_host); - target->io_class = SRP_REV16A_IB_IO_CLASS; - target->scsi_host = target_host; - target->srp_host = host; - target->lkey = host->srp_dev->mr->lkey; - target->rkey = host->srp_dev->mr->rkey; + target->io_class = SRP_REV16A_IB_IO_CLASS; + target->scsi_host = target_host; + target->srp_host = host; + target->lkey = host->srp_dev->mr->lkey; + target->rkey = host->srp_dev->mr->rkey; + target->cmd_sg_cnt = cmd_sg_entries; spin_lock_init(&target->lock); INIT_LIST_HEAD(&target->free_tx); @@ -1956,6 +1980,11 @@ static ssize_t srp_create_target(struct device *dev, if (ret) goto err; + target_host->sg_tablesize = target->cmd_sg_cnt; + target->max_iu_len = sizeof (struct srp_cmd) + + sizeof (struct srp_indirect_buf) + + target->cmd_sg_cnt * sizeof (struct srp_direct_buf); + ib_query_gid(host->srp_dev->dev, host->port, 0, &target->path.sgid); shost_printk(KERN_DEBUG, target->scsi_host, PFX @@ -2217,9 +2246,18 @@ static int __init srp_init_module(void) BUILD_BUG_ON(FIELD_SIZEOF(struct ib_wc, wr_id) < sizeof(void *)); - if (srp_sg_tablesize > 255) { - printk(KERN_WARNING PFX "Clamping srp_sg_tablesize to 255\n"); - srp_sg_tablesize = 255; + if (srp_sg_tablesize) { + printk(KERN_WARNING PFX "srp_sg_tablesize is deprecated, please use cmd_sg_entries\n"); + if (!cmd_sg_entries) + cmd_sg_entries = srp_sg_tablesize; + } + + if (!cmd_sg_entries) + cmd_sg_entries = SRP_DEF_SG_TABLESIZE; + + if (cmd_sg_entries > 255) { + printk(KERN_WARNING PFX "Clamping cmd_sg_entries to 255\n"); + cmd_sg_entries = 255; } ib_srp_transport_template = @@ -2227,11 +2265,6 @@ static int __init srp_init_module(void) if (!ib_srp_transport_template) return -ENOMEM; - srp_template.sg_tablesize = srp_sg_tablesize; - srp_max_iu_len = (sizeof (struct srp_cmd) + - sizeof (struct srp_indirect_buf) + - srp_sg_tablesize * 16); - ret = class_register(&srp_class); if (ret) { printk(KERN_ERR PFX "couldn't register class infiniband_srp\n"); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 9dc6fc3fd894..db39dbf76216 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -130,6 +130,8 @@ struct srp_target_port { u32 lkey; u32 rkey; enum srp_target_state state; + unsigned int max_iu_len; + unsigned int cmd_sg_cnt; /* Everything above this point is used in the hot path of * command processing. Try to keep them packed into cachelines. From 8f26c9ff9cd0317ad867bce972f69e0c6c2cbe3c Mon Sep 17 00:00:00 2001 From: David Dillow Date: Fri, 14 Jan 2011 19:45:50 -0500 Subject: [PATCH 04/10] IB/srp: rework mapping engine to use multiple FMR entries Instead of forcing all of the S/G entries to fit in one FMR, and falling back to indirect descriptors if that fails, allow the use of as many FMRs as needed to map the request. This lays the groundwork for allowing indirect descriptor tables that are larger than can fit in the command IU, but should marginally improve performance now by reducing the number of indirect descriptors needed. We increase the minimum page size for the FMR pool to 4K, as larger pages help increase the coverage of each FMR, and it is rare that the kernel would send down a request with scattered 512 byte fragments. This patch also move some of the target initialization code afte the parsing of options, to keep it together with the new code that needs to allocate memory based on the options given. Signed-off-by: David Dillow --- drivers/infiniband/ulp/srp/ib_srp.c | 383 ++++++++++++++++++---------- drivers/infiniband/ulp/srp/ib_srp.h | 28 +- 2 files changed, 274 insertions(+), 137 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 6f8ee0c7ef5f..9ce129ab3beb 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -444,6 +444,17 @@ static bool srp_change_state(struct srp_target_port *target, return changed; } +static void srp_free_req_data(struct srp_target_port *target) +{ + struct srp_request *req; + int i; + + for (i = 0, req = target->req_ring; i < SRP_CMD_SQ_SIZE; ++i, ++req) { + kfree(req->fmr_list); + kfree(req->map_page); + } +} + static void srp_remove_work(struct work_struct *work) { struct srp_target_port *target = @@ -460,6 +471,7 @@ static void srp_remove_work(struct work_struct *work) scsi_remove_host(target->scsi_host); ib_destroy_cm_id(target->cm_id); srp_free_target_ib(target); + srp_free_req_data(target); scsi_host_put(target->scsi_host); } @@ -523,18 +535,20 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, struct srp_request *req) { + struct ib_device *ibdev = target->srp_host->srp_dev->dev; + struct ib_pool_fmr **pfmr; + if (!scsi_sglist(scmnd) || (scmnd->sc_data_direction != DMA_TO_DEVICE && scmnd->sc_data_direction != DMA_FROM_DEVICE)) return; - if (req->fmr) { - ib_fmr_pool_unmap(req->fmr); - req->fmr = NULL; - } + pfmr = req->fmr_list; + while (req->nfmr--) + ib_fmr_pool_unmap(*pfmr++); - ib_dma_unmap_sg(target->srp_host->srp_dev->dev, scsi_sglist(scmnd), - scsi_sg_count(scmnd), scmnd->sc_data_direction); + ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd), + scmnd->sc_data_direction); } static void srp_remove_req(struct srp_target_port *target, @@ -633,95 +647,152 @@ err: return ret; } -static int srp_map_fmr(struct srp_target_port *target, struct scatterlist *scat, - int sg_cnt, struct srp_request *req, - struct srp_direct_buf *buf) +static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr, + unsigned int dma_len, u32 rkey) { + struct srp_direct_buf *desc = state->desc; + + desc->va = cpu_to_be64(dma_addr); + desc->key = cpu_to_be32(rkey); + desc->len = cpu_to_be32(dma_len); + + state->total_len += dma_len; + state->desc++; + state->ndesc++; +} + +static int srp_map_finish_fmr(struct srp_map_state *state, + struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_pool_fmr *fmr; u64 io_addr = 0; - u64 *dma_pages; - u32 len; - int page_cnt; - int i, j; - int ret; + + if (!state->npages) + return 0; + + if (state->npages == 1) { + srp_map_desc(state, state->base_dma_addr, state->fmr_len, + target->rkey); + state->npages = state->fmr_len = 0; + return 0; + } + + fmr = ib_fmr_pool_map_phys(dev->fmr_pool, state->pages, + state->npages, io_addr); + if (IS_ERR(fmr)) + return PTR_ERR(fmr); + + *state->next_fmr++ = fmr; + state->nfmr++; + + srp_map_desc(state, 0, state->fmr_len, fmr->fmr->rkey); + state->npages = state->fmr_len = 0; + return 0; +} + +static void srp_map_update_start(struct srp_map_state *state, + struct scatterlist *sg, int sg_index, + dma_addr_t dma_addr) +{ + state->unmapped_sg = sg; + state->unmapped_index = sg_index; + state->unmapped_addr = dma_addr; +} + +static int srp_map_sg_entry(struct srp_map_state *state, + struct srp_target_port *target, + struct scatterlist *sg, int sg_index, + int use_fmr) +{ struct srp_device *dev = target->srp_host->srp_dev; struct ib_device *ibdev = dev->dev; - struct scatterlist *sg; + dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg); + unsigned int dma_len = ib_sg_dma_len(ibdev, sg); + unsigned int len; + int ret; - if (!dev->fmr_pool) - return -ENODEV; + if (!dma_len) + return 0; - if (ib_sg_dma_address(ibdev, &scat[0]) & ~dev->fmr_page_mask) - return -EINVAL; + if (use_fmr == SRP_MAP_NO_FMR) { + /* Once we're in direct map mode for a request, we don't + * go back to FMR mode, so no need to update anything + * other than the descriptor. + */ + srp_map_desc(state, dma_addr, dma_len, target->rkey); + return 0; + } - len = page_cnt = 0; - scsi_for_each_sg(req->scmnd, sg, sg_cnt, i) { - unsigned int dma_len = ib_sg_dma_len(ibdev, sg); + /* If we start at an offset into the FMR page, don't merge into + * the current FMR. Finish it out, and use the kernel's MR for this + * sg entry. This is to avoid potential bugs on some SRP targets + * that were never quite defined, but went away when the initiator + * avoided using FMR on such page fragments. + */ + if (dma_addr & ~dev->fmr_page_mask || dma_len > dev->fmr_max_size) { + ret = srp_map_finish_fmr(state, target); + if (ret) + return ret; - if (ib_sg_dma_address(ibdev, sg) & ~dev->fmr_page_mask) { - if (i > 0) - return -EINVAL; - else - ++page_cnt; - } - if ((ib_sg_dma_address(ibdev, sg) + dma_len) & - ~dev->fmr_page_mask) { - if (i < sg_cnt - 1) - return -EINVAL; - else - ++page_cnt; + srp_map_desc(state, dma_addr, dma_len, target->rkey); + srp_map_update_start(state, NULL, 0, 0); + return 0; + } + + /* If this is the first sg to go into the FMR, save our position. + * We need to know the first unmapped entry, its index, and the + * first unmapped address within that entry to be able to restart + * mapping after an error. + */ + if (!state->unmapped_sg) + srp_map_update_start(state, sg, sg_index, dma_addr); + + while (dma_len) { + if (state->npages == SRP_FMR_SIZE) { + ret = srp_map_finish_fmr(state, target); + if (ret) + return ret; + + srp_map_update_start(state, sg, sg_index, dma_addr); } - len += dma_len; + len = min_t(unsigned int, dma_len, dev->fmr_page_size); + + if (!state->npages) + state->base_dma_addr = dma_addr; + state->pages[state->npages++] = dma_addr; + state->fmr_len += len; + dma_addr += len; + dma_len -= len; } - page_cnt += len >> dev->fmr_page_shift; - if (page_cnt > SRP_FMR_SIZE) - return -ENOMEM; - - dma_pages = kmalloc(sizeof (u64) * page_cnt, GFP_ATOMIC); - if (!dma_pages) - return -ENOMEM; - - page_cnt = 0; - scsi_for_each_sg(req->scmnd, sg, sg_cnt, i) { - unsigned int dma_len = ib_sg_dma_len(ibdev, sg); - - for (j = 0; j < dma_len; j += dev->fmr_page_size) - dma_pages[page_cnt++] = - (ib_sg_dma_address(ibdev, sg) & - dev->fmr_page_mask) + j; - } - - req->fmr = ib_fmr_pool_map_phys(dev->fmr_pool, - dma_pages, page_cnt, io_addr); - if (IS_ERR(req->fmr)) { - ret = PTR_ERR(req->fmr); - req->fmr = NULL; - goto out; - } - - buf->va = cpu_to_be64(ib_sg_dma_address(ibdev, &scat[0]) & - ~dev->fmr_page_mask); - buf->key = cpu_to_be32(req->fmr->fmr->rkey); - buf->len = cpu_to_be32(len); - + /* If the last entry of the FMR wasn't a full page, then we need to + * close it out and start a new one -- we can only merge at page + * boundries. + */ ret = 0; - -out: - kfree(dma_pages); - + if (len != dev->fmr_page_size) { + ret = srp_map_finish_fmr(state, target); + if (!ret) + srp_map_update_start(state, NULL, 0, 0); + } return ret; } static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, struct srp_request *req) { - struct scatterlist *scat; + struct scatterlist *scat, *sg; struct srp_cmd *cmd = req->cmd->buf; - int len, nents, count; - u8 fmt = SRP_DATA_DESC_DIRECT; + int i, len, nents, count, use_fmr; struct srp_device *dev; struct ib_device *ibdev; + struct srp_map_state state; + struct srp_indirect_buf *indirect_hdr; + dma_addr_t indirect_addr; + u32 table_len; + u8 fmt; if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE) return sizeof (struct srp_cmd); @@ -741,6 +812,8 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, ibdev = dev->dev; count = ib_dma_map_sg(ibdev, scat, nents, scmnd->sc_data_direction); + if (unlikely(count == 0)) + return -EIO; fmt = SRP_DATA_DESC_DIRECT; len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf); @@ -757,49 +830,80 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, buf->va = cpu_to_be64(ib_sg_dma_address(ibdev, scat)); buf->key = cpu_to_be32(target->rkey); buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat)); - } else if (srp_map_fmr(target, scat, count, req, - (void *) cmd->add_data)) { - /* - * FMR mapping failed, and the scatterlist has more - * than one entry. Generate an indirect memory - * descriptor. - */ - struct srp_indirect_buf *buf = (void *) cmd->add_data; - struct scatterlist *sg; - u32 datalen = 0; - int i; - fmt = SRP_DATA_DESC_INDIRECT; - len = sizeof (struct srp_cmd) + - sizeof (struct srp_indirect_buf) + - count * sizeof (struct srp_direct_buf); - - scsi_for_each_sg(scmnd, sg, count, i) { - unsigned int dma_len = ib_sg_dma_len(ibdev, sg); - - buf->desc_list[i].va = - cpu_to_be64(ib_sg_dma_address(ibdev, sg)); - buf->desc_list[i].key = - cpu_to_be32(target->rkey); - buf->desc_list[i].len = cpu_to_be32(dma_len); - datalen += dma_len; - } - - if (scmnd->sc_data_direction == DMA_TO_DEVICE) - cmd->data_out_desc_cnt = count; - else - cmd->data_in_desc_cnt = count; - - buf->table_desc.va = - cpu_to_be64(req->cmd->dma + sizeof *cmd + sizeof *buf); - buf->table_desc.key = - cpu_to_be32(target->rkey); - buf->table_desc.len = - cpu_to_be32(count * sizeof (struct srp_direct_buf)); - - buf->len = cpu_to_be32(datalen); + req->nfmr = 0; + goto map_complete; } + /* We have more than one scatter/gather entry, so build our indirect + * descriptor table, trying to merge as many entries with FMR as we + * can. + */ + indirect_hdr = (void *) cmd->add_data; + + memset(&state, 0, sizeof(state)); + state.desc = indirect_hdr->desc_list; + state.pages = req->map_page; + state.next_fmr = req->fmr_list; + + use_fmr = dev->fmr_pool ? SRP_MAP_ALLOW_FMR : SRP_MAP_NO_FMR; + + for_each_sg(scat, sg, count, i) { + if (srp_map_sg_entry(&state, target, sg, i, use_fmr)) { + /* FMR mapping failed, so backtrack to the first + * unmapped entry and continue on without using FMR. + */ + dma_addr_t dma_addr; + unsigned int dma_len; + +backtrack: + sg = state.unmapped_sg; + i = state.unmapped_index; + + dma_addr = ib_sg_dma_address(ibdev, sg); + dma_len = ib_sg_dma_len(ibdev, sg); + dma_len -= (state.unmapped_addr - dma_addr); + dma_addr = state.unmapped_addr; + use_fmr = SRP_MAP_NO_FMR; + srp_map_desc(&state, dma_addr, dma_len, target->rkey); + } + } + + if (use_fmr == SRP_MAP_ALLOW_FMR && srp_map_finish_fmr(&state, target)) + goto backtrack; + + /* We've mapped the request, fill in the command buffer. + */ + req->nfmr = state.nfmr; + if (state.ndesc == 1) { + /* FMR mapping was able to collapse this to one entry, + * so use a direct descriptor. + */ + struct srp_direct_buf *buf = (void *) cmd->add_data; + + *buf = indirect_hdr->desc_list[0]; + goto map_complete; + } + + table_len = state.ndesc * sizeof (struct srp_direct_buf); + + fmt = SRP_DATA_DESC_INDIRECT; + len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf); + len += table_len; + + indirect_addr = req->cmd->dma + sizeof *cmd + sizeof *indirect_hdr; + + indirect_hdr->table_desc.va = cpu_to_be64(indirect_addr); + indirect_hdr->table_desc.key = cpu_to_be32(target->rkey); + indirect_hdr->table_desc.len = cpu_to_be32(table_len); + indirect_hdr->len = cpu_to_be32(state.total_len); + + if (scmnd->sc_data_direction == DMA_TO_DEVICE) + cmd->data_out_desc_cnt = state.ndesc; + else + cmd->data_in_desc_cnt = state.ndesc; + +map_complete: if (scmnd->sc_data_direction == DMA_TO_DEVICE) cmd->buf_fmt = fmt << 4; else @@ -1947,8 +2051,7 @@ static ssize_t srp_create_target(struct device *dev, container_of(dev, struct srp_host, dev); struct Scsi_Host *target_host; struct srp_target_port *target; - int ret; - int i; + int i, ret; target_host = scsi_host_alloc(&srp_template, sizeof (struct srp_target_port)); @@ -1968,14 +2071,6 @@ static ssize_t srp_create_target(struct device *dev, target->rkey = host->srp_dev->mr->rkey; target->cmd_sg_cnt = cmd_sg_entries; - spin_lock_init(&target->lock); - INIT_LIST_HEAD(&target->free_tx); - INIT_LIST_HEAD(&target->free_reqs); - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { - target->req_ring[i].index = i; - list_add_tail(&target->req_ring[i].list, &target->free_reqs); - } - ret = srp_parse_options(buf, target); if (ret) goto err; @@ -1985,6 +2080,23 @@ static ssize_t srp_create_target(struct device *dev, sizeof (struct srp_indirect_buf) + target->cmd_sg_cnt * sizeof (struct srp_direct_buf); + spin_lock_init(&target->lock); + INIT_LIST_HEAD(&target->free_tx); + INIT_LIST_HEAD(&target->free_reqs); + for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { + struct srp_request *req = &target->req_ring[i]; + + req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof (void *), + GFP_KERNEL); + req->map_page = kmalloc(SRP_FMR_SIZE * sizeof (void *), + GFP_KERNEL); + if (!req->fmr_list || !req->map_page) + goto err_free_mem; + + req->index = i; + list_add_tail(&req->list, &target->free_reqs); + } + ib_query_gid(host->srp_dev->dev, host->port, 0, &target->path.sgid); shost_printk(KERN_DEBUG, target->scsi_host, PFX @@ -1998,11 +2110,11 @@ static ssize_t srp_create_target(struct device *dev, ret = srp_create_target_ib(target); if (ret) - goto err; + goto err_free_mem; ret = srp_new_cm_id(target); if (ret) - goto err_free; + goto err_free_ib; target->qp_in_error = 0; ret = srp_connect_target(target); @@ -2024,9 +2136,12 @@ err_disconnect: err_cm_id: ib_destroy_cm_id(target->cm_id); -err_free: +err_free_ib: srp_free_target_ib(target); +err_free_mem: + srp_free_req_data(target); + err: scsi_host_put(target_host); @@ -2099,7 +2214,7 @@ static void srp_add_one(struct ib_device *device) struct ib_device_attr *dev_attr; struct ib_fmr_pool_param fmr_param; struct srp_host *host; - int s, e, p; + int fmr_page_shift, s, e, p; dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); if (!dev_attr) @@ -2117,12 +2232,13 @@ static void srp_add_one(struct ib_device *device) /* * Use the smallest page size supported by the HCA, down to a - * minimum of 512 bytes (which is the smallest sector that a - * SCSI command will ever carry). + * minimum of 4096 bytes. We're unlikely to build large sglists + * out of smaller entries. */ - srp_dev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1); - srp_dev->fmr_page_size = 1 << srp_dev->fmr_page_shift; - srp_dev->fmr_page_mask = ~((u64) srp_dev->fmr_page_size - 1); + fmr_page_shift = max(12, ffs(dev_attr->page_size_cap) - 1); + srp_dev->fmr_page_size = 1 << fmr_page_shift; + srp_dev->fmr_page_mask = ~((u64) srp_dev->fmr_page_size - 1); + srp_dev->fmr_max_size = srp_dev->fmr_page_size * SRP_FMR_SIZE; INIT_LIST_HEAD(&srp_dev->dev_list); @@ -2143,7 +2259,7 @@ static void srp_add_one(struct ib_device *device) fmr_param.dirty_watermark = SRP_FMR_DIRTY_SIZE; fmr_param.cache = 1; fmr_param.max_pages_per_fmr = SRP_FMR_SIZE; - fmr_param.page_shift = srp_dev->fmr_page_shift; + fmr_param.page_shift = fmr_page_shift; fmr_param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ); @@ -2223,6 +2339,7 @@ static void srp_remove_one(struct ib_device *device) srp_disconnect_target(target); ib_destroy_cm_id(target->cm_id); srp_free_target_ib(target); + srp_free_req_data(target); scsi_host_put(target->scsi_host); } diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index db39dbf76216..b43b5e7acbde 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -71,7 +71,10 @@ enum { SRP_FMR_SIZE = 256, SRP_FMR_POOL_SIZE = 1024, - SRP_FMR_DIRTY_SIZE = SRP_FMR_POOL_SIZE / 4 + SRP_FMR_DIRTY_SIZE = SRP_FMR_POOL_SIZE / 4, + + SRP_MAP_ALLOW_FMR = 0, + SRP_MAP_NO_FMR = 1, }; enum srp_target_state { @@ -93,9 +96,9 @@ struct srp_device { struct ib_pd *pd; struct ib_mr *mr; struct ib_fmr_pool *fmr_pool; - int fmr_page_shift; - int fmr_page_size; u64 fmr_page_mask; + int fmr_page_size; + int fmr_max_size; }; struct srp_host { @@ -112,7 +115,9 @@ struct srp_request { struct list_head list; struct scsi_cmnd *scmnd; struct srp_iu *cmd; - struct ib_pool_fmr *fmr; + struct ib_pool_fmr **fmr_list; + u64 *map_page; + short nfmr; short index; }; @@ -181,4 +186,19 @@ struct srp_iu { enum dma_data_direction direction; }; +struct srp_map_state { + struct ib_pool_fmr **next_fmr; + struct srp_direct_buf *desc; + u64 *pages; + dma_addr_t base_dma_addr; + u32 fmr_len; + u32 total_len; + unsigned int npages; + unsigned int nfmr; + unsigned int ndesc; + struct scatterlist *unmapped_sg; + int unmapped_index; + dma_addr_t unmapped_addr; +}; + #endif /* IB_SRP_H */ From c07d424d6118d528ef71b22b7424bfc359c307a5 Mon Sep 17 00:00:00 2001 From: David Dillow Date: Sun, 16 Jan 2011 13:57:10 -0500 Subject: [PATCH 05/10] IB/srp: add support for indirect tables that don't fit in SRP_CMD This allows us to guarantee the ability to submit up to 8 MB requests based on the current value of SCSI_MAX_SG_CHAIN_SEGMENTS. While FMR will usually condense the requests into 8 SG entries, it is imperative that the target support external tables in case the FMR mapping fails or is not supported. We add a safety valve to allow targets without the needed support to reap the benefits of the large tables, but fail in a manner that lets the user know that the data didn't make it to the device. The user must add "allow_ext_sg=1" to the target parameters to indicate that the target has the needed support. If indirect_sg_entries is not specified in the modules options, then the sg_tablesize for the target will default to cmd_sg_entries unless overridden by the target options. Signed-off-by: David Dillow --- drivers/infiniband/ulp/srp/ib_srp.c | 117 +++++++++++++++++++++++++--- drivers/infiniband/ulp/srp/ib_srp.h | 5 ++ 2 files changed, 110 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 9ce129ab3beb..4ec7dddbbf49 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -61,6 +61,8 @@ MODULE_LICENSE("Dual BSD/GPL"); static unsigned int srp_sg_tablesize; static unsigned int cmd_sg_entries; +static unsigned int indirect_sg_entries; +static bool allow_ext_sg; static int topspin_workarounds = 1; module_param(srp_sg_tablesize, uint, 0444); @@ -70,6 +72,14 @@ module_param(cmd_sg_entries, uint, 0444); MODULE_PARM_DESC(cmd_sg_entries, "Default number of gather/scatter entries in the SRP command (default is 12, max 255)"); +module_param(indirect_sg_entries, uint, 0444); +MODULE_PARM_DESC(indirect_sg_entries, + "Default max number of gather/scatter entries (default is 12, max is " __stringify(SCSI_MAX_SG_CHAIN_SEGMENTS) ")"); + +module_param(allow_ext_sg, bool, 0444); +MODULE_PARM_DESC(allow_ext_sg, + "Default behavior when there are more than cmd_sg_entries S/G entries after mapping; fails the request when false (default false)"); + module_param(topspin_workarounds, int, 0444); MODULE_PARM_DESC(topspin_workarounds, "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); @@ -446,12 +456,19 @@ static bool srp_change_state(struct srp_target_port *target, static void srp_free_req_data(struct srp_target_port *target) { + struct ib_device *ibdev = target->srp_host->srp_dev->dev; struct srp_request *req; int i; for (i = 0, req = target->req_ring; i < SRP_CMD_SQ_SIZE; ++i, ++req) { kfree(req->fmr_list); kfree(req->map_page); + if (req->indirect_dma_addr) { + ib_dma_unmap_single(ibdev, req->indirect_dma_addr, + target->indirect_size, + DMA_TO_DEVICE); + } + kfree(req->indirect_desc); } } @@ -790,7 +807,6 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, struct ib_device *ibdev; struct srp_map_state state; struct srp_indirect_buf *indirect_hdr; - dma_addr_t indirect_addr; u32 table_len; u8 fmt; @@ -841,8 +857,11 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, */ indirect_hdr = (void *) cmd->add_data; + ib_dma_sync_single_for_cpu(ibdev, req->indirect_dma_addr, + target->indirect_size, DMA_TO_DEVICE); + memset(&state, 0, sizeof(state)); - state.desc = indirect_hdr->desc_list; + state.desc = req->indirect_desc; state.pages = req->map_page; state.next_fmr = req->fmr_list; @@ -872,7 +891,11 @@ backtrack: if (use_fmr == SRP_MAP_ALLOW_FMR && srp_map_finish_fmr(&state, target)) goto backtrack; - /* We've mapped the request, fill in the command buffer. + /* We've mapped the request, now pull as much of the indirect + * descriptor table as we can into the command buffer. If this + * target is not using an external indirect table, we are + * guaranteed to fit into the command, as the SCSI layer won't + * give us more S/G entries than we allow. */ req->nfmr = state.nfmr; if (state.ndesc == 1) { @@ -881,27 +904,39 @@ backtrack: */ struct srp_direct_buf *buf = (void *) cmd->add_data; - *buf = indirect_hdr->desc_list[0]; + *buf = req->indirect_desc[0]; goto map_complete; } + if (unlikely(target->cmd_sg_cnt < state.ndesc && + !target->allow_ext_sg)) { + shost_printk(KERN_ERR, target->scsi_host, + "Could not fit S/G list into SRP_CMD\n"); + return -EIO; + } + + count = min(state.ndesc, target->cmd_sg_cnt); table_len = state.ndesc * sizeof (struct srp_direct_buf); fmt = SRP_DATA_DESC_INDIRECT; len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf); - len += table_len; + len += count * sizeof (struct srp_direct_buf); - indirect_addr = req->cmd->dma + sizeof *cmd + sizeof *indirect_hdr; + memcpy(indirect_hdr->desc_list, req->indirect_desc, + count * sizeof (struct srp_direct_buf)); - indirect_hdr->table_desc.va = cpu_to_be64(indirect_addr); + indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr); indirect_hdr->table_desc.key = cpu_to_be32(target->rkey); indirect_hdr->table_desc.len = cpu_to_be32(table_len); indirect_hdr->len = cpu_to_be32(state.total_len); if (scmnd->sc_data_direction == DMA_TO_DEVICE) - cmd->data_out_desc_cnt = state.ndesc; + cmd->data_out_desc_cnt = count; else - cmd->data_in_desc_cnt = state.ndesc; + cmd->data_in_desc_cnt = count; + + ib_dma_sync_single_for_device(ibdev, req->indirect_dma_addr, table_len, + DMA_TO_DEVICE); map_complete: if (scmnd->sc_data_direction == DMA_TO_DEVICE) @@ -1759,6 +1794,14 @@ static ssize_t show_cmd_sg_entries(struct device *dev, return sprintf(buf, "%u\n", target->cmd_sg_cnt); } +static ssize_t show_allow_ext_sg(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%s\n", target->allow_ext_sg ? "true" : "false"); +} + static DEVICE_ATTR(id_ext, S_IRUGO, show_id_ext, NULL); static DEVICE_ATTR(ioc_guid, S_IRUGO, show_ioc_guid, NULL); static DEVICE_ATTR(service_id, S_IRUGO, show_service_id, NULL); @@ -1770,6 +1813,7 @@ static DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL); static DEVICE_ATTR(local_ib_port, S_IRUGO, show_local_ib_port, NULL); static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL); static DEVICE_ATTR(cmd_sg_entries, S_IRUGO, show_cmd_sg_entries, NULL); +static DEVICE_ATTR(allow_ext_sg, S_IRUGO, show_allow_ext_sg, NULL); static struct device_attribute *srp_host_attrs[] = { &dev_attr_id_ext, @@ -1783,6 +1827,7 @@ static struct device_attribute *srp_host_attrs[] = { &dev_attr_local_ib_port, &dev_attr_local_ib_device, &dev_attr_cmd_sg_entries, + &dev_attr_allow_ext_sg, NULL }; @@ -1868,6 +1913,8 @@ enum { SRP_OPT_IO_CLASS = 1 << 7, SRP_OPT_INITIATOR_EXT = 1 << 8, SRP_OPT_CMD_SG_ENTRIES = 1 << 9, + SRP_OPT_ALLOW_EXT_SG = 1 << 10, + SRP_OPT_SG_TABLESIZE = 1 << 11, SRP_OPT_ALL = (SRP_OPT_ID_EXT | SRP_OPT_IOC_GUID | SRP_OPT_DGID | @@ -1886,6 +1933,8 @@ static const match_table_t srp_opt_tokens = { { SRP_OPT_IO_CLASS, "io_class=%x" }, { SRP_OPT_INITIATOR_EXT, "initiator_ext=%s" }, { SRP_OPT_CMD_SG_ENTRIES, "cmd_sg_entries=%u" }, + { SRP_OPT_ALLOW_EXT_SG, "allow_ext_sg=%u" }, + { SRP_OPT_SG_TABLESIZE, "sg_tablesize=%u" }, { SRP_OPT_ERR, NULL } }; @@ -2021,6 +2070,23 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) target->cmd_sg_cnt = token; break; + case SRP_OPT_ALLOW_EXT_SG: + if (match_int(args, &token)) { + printk(KERN_WARNING PFX "bad allow_ext_sg parameter '%s'\n", p); + goto out; + } + target->allow_ext_sg = !!token; + break; + + case SRP_OPT_SG_TABLESIZE: + if (match_int(args, &token) || token < 1 || + token > SCSI_MAX_SG_CHAIN_SEGMENTS) { + printk(KERN_WARNING PFX "bad max sg_tablesize parameter '%s'\n", p); + goto out; + } + target->sg_tablesize = token; + break; + default: printk(KERN_WARNING PFX "unknown parameter or missing value " "'%s' in target creation request\n", p); @@ -2051,6 +2117,8 @@ static ssize_t srp_create_target(struct device *dev, container_of(dev, struct srp_host, dev); struct Scsi_Host *target_host; struct srp_target_port *target; + struct ib_device *ibdev = host->srp_dev->dev; + dma_addr_t dma_addr; int i, ret; target_host = scsi_host_alloc(&srp_template, @@ -2070,12 +2138,22 @@ static ssize_t srp_create_target(struct device *dev, target->lkey = host->srp_dev->mr->lkey; target->rkey = host->srp_dev->mr->rkey; target->cmd_sg_cnt = cmd_sg_entries; + target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries; + target->allow_ext_sg = allow_ext_sg; ret = srp_parse_options(buf, target); if (ret) goto err; - target_host->sg_tablesize = target->cmd_sg_cnt; + if (!host->srp_dev->fmr_pool && !target->allow_ext_sg && + target->cmd_sg_cnt < target->sg_tablesize) { + printk(KERN_WARNING PFX "No FMR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); + target->sg_tablesize = target->cmd_sg_cnt; + } + + target_host->sg_tablesize = target->sg_tablesize; + target->indirect_size = target->sg_tablesize * + sizeof (struct srp_direct_buf); target->max_iu_len = sizeof (struct srp_cmd) + sizeof (struct srp_indirect_buf) + target->cmd_sg_cnt * sizeof (struct srp_direct_buf); @@ -2090,14 +2168,22 @@ static ssize_t srp_create_target(struct device *dev, GFP_KERNEL); req->map_page = kmalloc(SRP_FMR_SIZE * sizeof (void *), GFP_KERNEL); - if (!req->fmr_list || !req->map_page) + req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); + if (!req->fmr_list || !req->map_page || !req->indirect_desc) goto err_free_mem; + dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, + target->indirect_size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, dma_addr)) + goto err_free_mem; + + req->indirect_dma_addr = dma_addr; req->index = i; list_add_tail(&req->list, &target->free_reqs); } - ib_query_gid(host->srp_dev->dev, host->port, 0, &target->path.sgid); + ib_query_gid(ibdev, host->port, 0, &target->path.sgid); shost_printk(KERN_DEBUG, target->scsi_host, PFX "new target: id_ext %016llx ioc_guid %016llx pkey %04x " @@ -2377,6 +2463,13 @@ static int __init srp_init_module(void) cmd_sg_entries = 255; } + if (!indirect_sg_entries) + indirect_sg_entries = cmd_sg_entries; + else if (indirect_sg_entries < cmd_sg_entries) { + printk(KERN_WARNING PFX "Bumping up indirect_sg_entries to match cmd_sg_entries (%u)\n", cmd_sg_entries); + indirect_sg_entries = cmd_sg_entries; + } + ib_srp_transport_template = srp_attach_transport(&ib_srp_transport_functions); if (!ib_srp_transport_template) diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index b43b5e7acbde..cf696218eeeb 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -117,6 +117,8 @@ struct srp_request { struct srp_iu *cmd; struct ib_pool_fmr **fmr_list; u64 *map_page; + struct srp_direct_buf *indirect_desc; + dma_addr_t indirect_dma_addr; short nfmr; short index; }; @@ -137,6 +139,8 @@ struct srp_target_port { enum srp_target_state state; unsigned int max_iu_len; unsigned int cmd_sg_cnt; + unsigned int indirect_size; + bool allow_ext_sg; /* Everything above this point is used in the hot path of * command processing. Try to keep them packed into cachelines. @@ -151,6 +155,7 @@ struct srp_target_port { struct Scsi_Host *scsi_host; char target_name[32]; unsigned int scsi_id; + unsigned int sg_tablesize; struct ib_sa_path_rec path; __be16 orig_dgid[8]; From be8b981453a4904399cb090c1660618e250092d8 Mon Sep 17 00:00:00 2001 From: David Dillow Date: Tue, 18 Jan 2011 21:58:09 -0500 Subject: [PATCH 06/10] IB/srp: try to use larger FMR sizes to cover our mappings Now that we can get larger SG lists, we can take advantage of HCAs that allow us to use larger FMR sizes. In many cases, we can use up to 512 entries, so start there and work our way down. Signed-off-by: David Dillow --- drivers/infiniband/ulp/srp/ib_srp.c | 29 ++++++++++++++++++----------- drivers/infiniband/ulp/srp/ib_srp.h | 3 ++- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 4ec7dddbbf49..376d640487d2 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -2300,7 +2300,7 @@ static void srp_add_one(struct ib_device *device) struct ib_device_attr *dev_attr; struct ib_fmr_pool_param fmr_param; struct srp_host *host; - int fmr_page_shift, s, e, p; + int max_pages_per_fmr, fmr_page_shift, s, e, p; dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); if (!dev_attr) @@ -2340,17 +2340,24 @@ static void srp_add_one(struct ib_device *device) if (IS_ERR(srp_dev->mr)) goto err_pd; - memset(&fmr_param, 0, sizeof fmr_param); - fmr_param.pool_size = SRP_FMR_POOL_SIZE; - fmr_param.dirty_watermark = SRP_FMR_DIRTY_SIZE; - fmr_param.cache = 1; - fmr_param.max_pages_per_fmr = SRP_FMR_SIZE; - fmr_param.page_shift = fmr_page_shift; - fmr_param.access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ); + for (max_pages_per_fmr = SRP_FMR_SIZE; + max_pages_per_fmr >= SRP_FMR_MIN_SIZE; + max_pages_per_fmr /= 2, srp_dev->fmr_max_size /= 2) { + memset(&fmr_param, 0, sizeof fmr_param); + fmr_param.pool_size = SRP_FMR_POOL_SIZE; + fmr_param.dirty_watermark = SRP_FMR_DIRTY_SIZE; + fmr_param.cache = 1; + fmr_param.max_pages_per_fmr = max_pages_per_fmr; + fmr_param.page_shift = fmr_page_shift; + fmr_param.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + srp_dev->fmr_pool = ib_create_fmr_pool(srp_dev->pd, &fmr_param); + if (!IS_ERR(srp_dev->fmr_pool)) + break; + } - srp_dev->fmr_pool = ib_create_fmr_pool(srp_dev->pd, &fmr_param); if (IS_ERR(srp_dev->fmr_pool)) srp_dev->fmr_pool = NULL; diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index cf696218eeeb..020caf0c3789 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -69,7 +69,8 @@ enum { SRP_TAG_NO_REQ = ~0U, SRP_TAG_TSK_MGMT = 1U << 31, - SRP_FMR_SIZE = 256, + SRP_FMR_SIZE = 512, + SRP_FMR_MIN_SIZE = 128, SRP_FMR_POOL_SIZE = 1024, SRP_FMR_DIRTY_SIZE = SRP_FMR_POOL_SIZE / 4, From 1bdd6384c2b43cac9be8d6f2c298bcf39f50cd07 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 17 Mar 2011 23:35:39 +0000 Subject: [PATCH 07/10] RDMA/addr: Fix return of uninitialized ret value Commit b23dd4fe42b4 ("ipv4: Make output route lookup return rtable directly") resulted in leaving ret uninitialized, where it may later be returned. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index e0ef5fdc361e..4ffc224faa7f 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -204,7 +204,7 @@ static int addr4_resolve(struct sockaddr_in *src_in, /* If the device does ARP internally, return 'done' */ if (rt->dst.dev->flags & IFF_NOARP) { - rdma_copy_addr(addr, rt->dst.dev, NULL); + ret = rdma_copy_addr(addr, rt->dst.dev, NULL); goto put; } From 748bfd9c1ded8419e7fdba4ca6f0c1dc1642df63 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Fri, 18 Mar 2011 08:52:30 -0700 Subject: [PATCH 08/10] RDMA/nes: Don't print success message at level KERN_ERR There's no reason to print "NetEffect RNIC driver successfully loaded" at level KERN_ERR (where it will uglify the console on a quiet boot). Change it to KERN_INFO. Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index 3d7f3664b67b..13de1192927c 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -694,7 +694,7 @@ static int __devinit nes_probe(struct pci_dev *pcidev, const struct pci_device_i nesdev->netdev_count++; nesdev->nesadapter->netdev_count++; - printk(KERN_ERR PFX "%s: NetEffect RNIC driver successfully loaded.\n", + printk(KERN_INFO PFX "%s: NetEffect RNIC driver successfully loaded.\n", pci_name(pcidev)); return 0; From 1eba843dd7678b9033a0d1ccff21c21b857de77b Mon Sep 17 00:00:00 2001 From: Michael Heinz Date: Fri, 11 Feb 2011 21:44:30 +0000 Subject: [PATCH 09/10] IB/mad: Improve an error message so error code is included Signed-off-by: Michael Heinz Signed-off-by: Roland Dreier --- drivers/infiniband/core/agent.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index 91916a8d5de4..2bc7f5af64f4 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -101,7 +101,8 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, agent = port_priv->agent[qpn]; ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); if (IS_ERR(ah)) { - printk(KERN_ERR SPFX "ib_create_ah_from_wc error\n"); + printk(KERN_ERR SPFX "ib_create_ah_from_wc error %ld\n", + PTR_ERR(ah)); return; } From 7f9e5c48c1078507747434d4c182ab10925bf98a Mon Sep 17 00:00:00 2001 From: David Dillow Date: Mon, 17 Jan 2011 02:09:44 +0000 Subject: [PATCH 10/10] IB: Increase DMA max_segment_size on Mellanox hardware By default, each device is assumed to be able only handle 64 KB chunks during DMA. By giving the segment size a larger value, the block layer will coalesce more S/G entries together for SRP, allowing larger requests with the same sg_tablesize setting. The block layer is the only direct user of it, though a few IOMMU drivers reference it as well for their *_map_sg coalescing code. pci-gart_64 on x86, and a smattering on on sparc, powerpc, and ia64. Since other IB protocols could potentially see larger segments with this, let's check those: - iSER is fine, because you limit your maximum request size to 512 KB, so we'll never overrun the page vector in struct iser_page_vec (128 entries currently). It is independent of the DMA segment size, and handles multi-page segments already. - IPoIB is fine, as it maps each page individually, and doesn't use ib_dma_map_sg(). - RDS appears to do the right thing and has no dependencies on DMA segment size, but I don't claim to have done a complete audit. - NFSoRDMA and 9p are OK -- they do not use ib_dma_map_sg(), so they doesn't care about the coalescing. - Lustre's ko2iblnd does not care about coalescing -- it properly walks the returned sg list. This patch ups the value on Mellanox hardware to 1 GB, which matches reported firmware limits on mlx4. Signed-off-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_main.c | 3 +++ drivers/net/mlx4/main.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 8a40cd539ab1..f24b79b805f2 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -1043,6 +1043,9 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type) } } + /* We can handle large RDMA requests, so allow larger segments. */ + dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); + mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev); if (!mdev) { dev_err(&pdev->dev, "Device struct alloc failed, " diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 2765a3ce9c24..c83501122d77 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -1109,6 +1109,9 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) } } + /* Allow large DMA segments, up to the firmware limit of 1 GB */ + dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); + priv = kzalloc(sizeof *priv, GFP_KERNEL); if (!priv) { dev_err(&pdev->dev, "Device struct alloc failed, "