diff options
author | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-04-27 09:39:27 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-04-27 09:39:27 -0700 |
commit | afc2e82c0851317931a9bfdb98271253371825c6 (patch) | |
tree | 3f1c119559bd94402d0574f786851bd34bbc048f /drivers/infiniband/hw | |
parent | 0278ef8b484a71917bd4f03a763285cdaac10954 (diff) | |
parent | 1912ffbb88efe872eb8fa8113dfb3cb0b7238764 (diff) |
Merge branch 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband
* 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband: (49 commits)
IB: Set class_dev->dev in core for nice device symlink
IB/ehca: Implement modify_port
IB/umad: Clarify documentation of transaction ID
IPoIB/cm: spin_lock_irqsave() -> spin_lock_irq() replacements
IB/mad: Change SMI to use enums rather than magic return codes
IB/umad: Implement GRH handling for sent/received MADs
IB/ipoib: Use ib_init_ah_from_path to initialize ah_attr
IB/sa: Set src_path_bits correctly in ib_init_ah_from_path()
IB/ucm: Simplify ib_ucm_event()
RDMA/ucma: Simplify ucma_get_event()
IB/mthca: Simplify CQ cleaning in mthca_free_qp()
IB/mthca: Fix mthca_write_mtt() on HCAs with hidden memory
IB/mthca: Update HCA firmware revisions
IB/ipath: Fix WC format drift between user and kernel space
IB/ipath: Check that a UD work request's address handle is valid
IB/ipath: Remove duplicate stuff from ipath_verbs.h
IB/ipath: Check reserved memory keys
IB/ipath: Fix unit selection when all CPU affinity bits set
IB/ipath: Don't allow QPs 0 and 1 to be opened multiple times
IB/ipath: Disable IB link earlier in shutdown sequence
...
Diffstat (limited to 'drivers/infiniband/hw')
34 files changed, 1444 insertions, 841 deletions
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c index fef972752912..607c09bf764c 100644 --- a/drivers/infiniband/hw/amso1100/c2_provider.c +++ b/drivers/infiniband/hw/amso1100/c2_provider.c @@ -796,7 +796,6 @@ int c2_register_device(struct c2_dev *dev) memcpy(&dev->ibdev.node_guid, dev->pseudo_netdev->dev_addr, 6); dev->ibdev.phys_port_cnt = 1; dev->ibdev.dma_device = &dev->pcidev->dev; - dev->ibdev.class_dev.dev = &dev->pcidev->dev; dev->ibdev.query_device = c2_query_device; dev->ibdev.query_port = c2_query_port; dev->ibdev.modify_port = c2_modify_port; diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 24e0df04f7db..af28a317016d 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1108,7 +1108,6 @@ int iwch_register_device(struct iwch_dev *dev) memcpy(dev->ibdev.node_desc, IWCH_NODE_DESC, sizeof(IWCH_NODE_DESC)); dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports; dev->ibdev.dma_device = &(dev->rdev.rnic_info.pdev->dev); - dev->ibdev.class_dev.dev = &(dev->rdev.rnic_info.pdev->dev); dev->ibdev.query_device = iwch_query_device; dev->ibdev.query_port = iwch_query_port; dev->ibdev.modify_port = iwch_modify_port; diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index 82ded44c6cee..10fb8fbafa0c 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -106,6 +106,7 @@ struct ehca_shca { struct ehca_mr *maxmr; struct ehca_pd *pd; struct h_galpas galpas; + struct mutex modify_mutex; }; struct ehca_pd { diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c index 30eb45df9f0b..32b55a4f0e5b 100644 --- a/drivers/infiniband/hw/ehca/ehca_hca.c +++ b/drivers/infiniband/hw/ehca/ehca_hca.c @@ -147,6 +147,7 @@ int ehca_query_port(struct ib_device *ibdev, break; } + props->port_cap_flags = rblock->capability_mask; props->gid_tbl_len = rblock->gid_tbl_len; props->max_msg_sz = rblock->max_msg_sz; props->bad_pkey_cntr = rblock->bad_pkey_cntr; @@ -236,10 +237,60 @@ query_gid1: return ret; } +const u32 allowed_port_caps = ( + IB_PORT_SM | IB_PORT_LED_INFO_SUP | IB_PORT_CM_SUP | + IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP); + int ehca_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask, struct ib_port_modify *props) { - /* Not implemented yet */ - return -EFAULT; + int ret = 0; + struct ehca_shca *shca = container_of(ibdev, struct ehca_shca, ib_device); + struct hipz_query_port *rblock; + u32 cap; + u64 hret; + + if ((props->set_port_cap_mask | props->clr_port_cap_mask) + & ~allowed_port_caps) { + ehca_err(&shca->ib_device, "Non-changeable bits set in masks " + "set=%x clr=%x allowed=%x", props->set_port_cap_mask, + props->clr_port_cap_mask, allowed_port_caps); + return -EINVAL; + } + + if (mutex_lock_interruptible(&shca->modify_mutex)) + return -ERESTARTSYS; + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + ret = -ENOMEM; + goto modify_port1; + } + + if (hipz_h_query_port(shca->ipz_hca_handle, port, rblock) != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto modify_port2; + } + + cap = (rblock->capability_mask | props->set_port_cap_mask) + & ~props->clr_port_cap_mask; + + hret = hipz_h_modify_port(shca->ipz_hca_handle, port, + cap, props->init_type, port_modify_mask); + if (hret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Modify port failed hret=%lx", hret); + ret = -EINVAL; + } + +modify_port2: + ehca_free_fw_ctrlblock(rblock); + +modify_port1: + mutex_unlock(&shca->modify_mutex); + + return ret; } diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index 059da9628bb5..3b23d677cb86 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -587,6 +587,7 @@ static int __devinit ehca_probe(struct ibmebus_dev *dev, ehca_gen_err("Cannot allocate shca memory."); return -ENOMEM; } + mutex_init(&shca->modify_mutex); shca->ibmebus_dev = dev; shca->ipz_hca_handle.handle = *handle; diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c index 3fb46e67df87..b564fcd3b282 100644 --- a/drivers/infiniband/hw/ehca/hcp_if.c +++ b/drivers/infiniband/hw/ehca/hcp_if.c @@ -70,6 +70,10 @@ #define H_ALL_RES_QP_SQUEUE_SIZE_PAGES EHCA_BMASK_IBM(0, 31) #define H_ALL_RES_QP_RQUEUE_SIZE_PAGES EHCA_BMASK_IBM(32, 63) +#define H_MP_INIT_TYPE EHCA_BMASK_IBM(44, 47) +#define H_MP_SHUTDOWN EHCA_BMASK_IBM(48, 48) +#define H_MP_RESET_QKEY_CTR EHCA_BMASK_IBM(49, 49) + /* direct access qp controls */ #define DAQP_CTRL_ENABLE 0x01 #define DAQP_CTRL_SEND_COMP 0x20 @@ -364,6 +368,26 @@ u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle, return ret; } +u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle, + const u8 port_id, const u32 port_cap, + const u8 init_type, const int modify_mask) +{ + u64 port_attributes = port_cap; + + if (modify_mask & IB_PORT_SHUTDOWN) + port_attributes |= EHCA_BMASK_SET(H_MP_SHUTDOWN, 1); + if (modify_mask & IB_PORT_INIT_TYPE) + port_attributes |= EHCA_BMASK_SET(H_MP_INIT_TYPE, init_type); + if (modify_mask & IB_PORT_RESET_QKEY_CNTR) + port_attributes |= EHCA_BMASK_SET(H_MP_RESET_QKEY_CTR, 1); + + return ehca_plpar_hcall_norets(H_MODIFY_PORT, + adapter_handle.handle, /* r4 */ + port_id, /* r5 */ + port_attributes, /* r6 */ + 0, 0, 0, 0); +} + u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle, struct hipz_query_hca *query_hca_rblock) { diff --git a/drivers/infiniband/hw/ehca/hcp_if.h b/drivers/infiniband/hw/ehca/hcp_if.h index 587ebd470959..2869f7dd6196 100644 --- a/drivers/infiniband/hw/ehca/hcp_if.h +++ b/drivers/infiniband/hw/ehca/hcp_if.h @@ -85,6 +85,10 @@ u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle, const u8 port_id, struct hipz_query_port *query_port_response_block); +u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle, + const u8 port_id, const u32 port_cap, + const u8 init_type, const int modify_mask); + u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle, struct hipz_query_hca *query_hca_rblock); diff --git a/drivers/infiniband/hw/ipath/ipath_common.h b/drivers/infiniband/hw/ipath/ipath_common.h index 54139d398181..10c008f22ba6 100644 --- a/drivers/infiniband/hw/ipath/ipath_common.h +++ b/drivers/infiniband/hw/ipath/ipath_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -78,6 +78,8 @@ #define IPATH_IB_LINKINIT 3 #define IPATH_IB_LINKDOWN_SLEEP 4 #define IPATH_IB_LINKDOWN_DISABLE 5 +#define IPATH_IB_LINK_LOOPBACK 6 /* enable local loopback */ +#define IPATH_IB_LINK_EXTERNAL 7 /* normal, disable local loopback */ /* * stats maintained by the driver. For now, at least, this is global @@ -316,11 +318,17 @@ struct ipath_base_info { /* address of readonly memory copy of the rcvhdrq tail register. */ __u64 spi_rcvhdr_tailaddr; - /* shared memory pages for subports if IPATH_RUNTIME_MASTER is set */ + /* shared memory pages for subports if port is shared */ __u64 spi_subport_uregbase; __u64 spi_subport_rcvegrbuf; __u64 spi_subport_rcvhdr_base; + /* shared memory page for hardware port if it is shared */ + __u64 spi_port_uregbase; + __u64 spi_port_rcvegrbuf; + __u64 spi_port_rcvhdr_base; + __u64 spi_port_rcvhdr_tailaddr; + } __attribute__ ((aligned(8))); @@ -344,7 +352,7 @@ struct ipath_base_info { * may not be implemented; the user code must deal with this if it * cares, or it must abort after initialization reports the difference. */ -#define IPATH_USER_SWMINOR 3 +#define IPATH_USER_SWMINOR 5 #define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR) @@ -418,11 +426,14 @@ struct ipath_user_info { #define IPATH_CMD_TID_UPDATE 19 /* update expected TID entries */ #define IPATH_CMD_TID_FREE 20 /* free expected TID entries */ #define IPATH_CMD_SET_PART_KEY 21 /* add partition key */ -#define IPATH_CMD_SLAVE_INFO 22 /* return info on slave processes */ +#define __IPATH_CMD_SLAVE_INFO 22 /* return info on slave processes (for old user code) */ #define IPATH_CMD_ASSIGN_PORT 23 /* allocate HCA and port */ #define IPATH_CMD_USER_INIT 24 /* set up userspace */ +#define IPATH_CMD_UNUSED_1 25 +#define IPATH_CMD_UNUSED_2 26 +#define IPATH_CMD_PIOAVAILUPD 27 /* force an update of PIOAvail reg */ -#define IPATH_CMD_MAX 24 +#define IPATH_CMD_MAX 27 struct ipath_port_info { __u32 num_active; /* number of active units */ @@ -430,7 +441,7 @@ struct ipath_port_info { __u16 port; /* port on unit assigned to caller */ __u16 subport; /* subport on unit assigned to caller */ __u16 num_ports; /* number of ports available on unit */ - __u16 num_subports; /* number of subport slaves opened on port */ + __u16 num_subports; /* number of subports opened on port */ }; struct ipath_tid_info { diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c index 87462e0cb4d2..ea78e6dddc90 100644 --- a/drivers/infiniband/hw/ipath/ipath_cq.c +++ b/drivers/infiniband/hw/ipath/ipath_cq.c @@ -76,7 +76,20 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited) } return; } - wc->queue[head] = *entry; + wc->queue[head].wr_id = entry->wr_id; + wc->queue[head].status = entry->status; + wc->queue[head].opcode = entry->opcode; + wc->queue[head].vendor_err = entry->vendor_err; + wc->queue[head].byte_len = entry->byte_len; + wc->queue[head].imm_data = (__u32 __force)entry->imm_data; + wc->queue[head].qp_num = entry->qp->qp_num; + wc->queue[head].src_qp = entry->src_qp; + wc->queue[head].wc_flags = entry->wc_flags; + wc->queue[head].pkey_index = entry->pkey_index; + wc->queue[head].slid = entry->slid; + wc->queue[head].sl = entry->sl; + wc->queue[head].dlid_path_bits = entry->dlid_path_bits; + wc->queue[head].port_num = entry->port_num; wc->head = next; if (cq->notify == IB_CQ_NEXT_COMP || @@ -122,9 +135,30 @@ int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) if (tail > (u32) cq->ibcq.cqe) tail = (u32) cq->ibcq.cqe; for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { + struct ipath_qp *qp; + if (tail == wc->head) break; - *entry = wc->queue[tail]; + + qp = ipath_lookup_qpn(&to_idev(cq->ibcq.device)->qp_table, + wc->queue[tail].qp_num); + entry->qp = &qp->ibqp; + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + + entry->wr_id = wc->queue[tail].wr_id; + entry->status = wc->queue[tail].status; + entry->opcode = wc->queue[tail].opcode; + entry->vendor_err = wc->queue[tail].vendor_err; + entry->byte_len = wc->queue[tail].byte_len; + entry->imm_data = wc->queue[tail].imm_data; + entry->src_qp = wc->queue[tail].src_qp; + entry->wc_flags = wc->queue[tail].wc_flags; + entry->pkey_index = wc->queue[tail].pkey_index; + entry->slid = wc->queue[tail].slid; + entry->sl = wc->queue[tail].sl; + entry->dlid_path_bits = wc->queue[tail].dlid_path_bits; + entry->port_num = wc->queue[tail].port_num; if (tail >= cq->ibcq.cqe) tail = 0; else diff --git a/drivers/infiniband/hw/ipath/ipath_debug.h b/drivers/infiniband/hw/ipath/ipath_debug.h index df69f0d80b8b..42bfbdb0d3e6 100644 --- a/drivers/infiniband/hw/ipath/ipath_debug.h +++ b/drivers/infiniband/hw/ipath/ipath_debug.h @@ -57,6 +57,7 @@ #define __IPATH_PROCDBG 0x100 /* print mmap/nopage stuff, not using VDBG any more */ #define __IPATH_MMDBG 0x200 +#define __IPATH_ERRPKTDBG 0x400 #define __IPATH_USER_SEND 0x1000 /* use user mode send */ #define __IPATH_KERNEL_SEND 0x2000 /* use kernel mode send */ #define __IPATH_EPKTDBG 0x4000 /* print ethernet packet data */ diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c index 0f13a2182cc7..63e8368b0e95 100644 --- a/drivers/infiniband/hw/ipath/ipath_diag.c +++ b/drivers/infiniband/hw/ipath/ipath_diag.c @@ -296,7 +296,7 @@ static int ipath_diag_open(struct inode *in, struct file *fp) } fp->private_data = dd; - ipath_diag_inuse = 1; + ipath_diag_inuse = -2; diag_set_link = 0; ret = 0; @@ -461,6 +461,8 @@ static ssize_t ipath_diag_read(struct file *fp, char __user *data, else if ((count % 4) || (*off % 4)) /* address or length is not 32-bit aligned, hence invalid */ ret = -EINVAL; + else if (ipath_diag_inuse < 1 && (*off || count != 8)) + ret = -EINVAL; /* prevent cat /dev/ipath_diag* */ else if ((count % 8) || (*off % 8)) /* address or length not 64-bit aligned; do 32-bit reads */ ret = ipath_read_umem32(dd, data, kreg_base + *off, count); @@ -470,6 +472,8 @@ static ssize_t ipath_diag_read(struct file *fp, char __user *data, if (ret >= 0) { *off += count; ret = count; + if (ipath_diag_inuse == -2) + ipath_diag_inuse++; } return ret; @@ -489,6 +493,9 @@ static ssize_t ipath_diag_write(struct file *fp, const char __user *data, else if ((count % 4) || (*off % 4)) /* address or length is not 32-bit aligned, hence invalid */ ret = -EINVAL; + else if ((ipath_diag_inuse == -1 && (*off || count != 8)) || + ipath_diag_inuse == -2) /* read qw off 0, write qw off 0 */ + ret = -EINVAL; /* before any other write allowed */ else if ((count % 8) || (*off % 8)) /* address or length not 64-bit aligned; do 32-bit writes */ ret = ipath_write_umem32(dd, kreg_base + *off, data, count); @@ -498,6 +505,8 @@ static ssize_t ipath_diag_write(struct file *fp, const char __user *data, if (ret >= 0) { *off += count; ret = count; + if (ipath_diag_inuse == -1) + ipath_diag_inuse = 1; /* all read/write OK now */ } return ret; diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index ae7f21a0cdc0..e3a223209710 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -390,15 +390,23 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, /* setup the chip-specific functions, as early as possible. */ switch (ent->device) { -#ifdef CONFIG_HT_IRQ case PCI_DEVICE_ID_INFINIPATH_HT: +#ifdef CONFIG_HT_IRQ ipath_init_iba6110_funcs(dd); break; +#else + ipath_dev_err(dd, "QLogic HT device 0x%x cannot work if " + "CONFIG_HT_IRQ is not enabled\n", ent->device); + return -ENODEV; #endif -#ifdef CONFIG_PCI_MSI case PCI_DEVICE_ID_INFINIPATH_PE800: +#ifdef CONFIG_PCI_MSI ipath_init_iba6120_funcs(dd); break; +#else + ipath_dev_err(dd, "QLogic PCIE device 0x%x cannot work if " + "CONFIG_PCI_MSI is not enabled\n", ent->device); + return -ENODEV; #endif default: ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, " @@ -486,7 +494,7 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, ret = ipath_init_chip(dd, 0); /* do the chip-specific init */ if (ret) - goto bail_iounmap; + goto bail_irqsetup; ret = ipath_enable_wc(dd); @@ -505,6 +513,9 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, goto bail; +bail_irqsetup: + if (pdev->irq) free_irq(pdev->irq, dd); + bail_iounmap: iounmap((volatile void __iomem *) dd->ipath_kregbase); @@ -525,8 +536,6 @@ static void __devexit cleanup_device(struct ipath_devdata *dd) { int port; - ipath_shutdown_device(dd); - if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) { /* can't do anything more with chip; needs re-init */ *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT; @@ -594,8 +603,9 @@ static void __devexit cleanup_device(struct ipath_devdata *dd) ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n", dd->ipath_pageshadow); - vfree(dd->ipath_pageshadow); + tmpp = dd->ipath_pageshadow; dd->ipath_pageshadow = NULL; + vfree(tmpp); } /* @@ -622,6 +632,12 @@ static void __devexit ipath_remove_one(struct pci_dev *pdev) ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd); + /* + * disable the IB link early, to be sure no new packets arrive, which + * complicates the shutdown process + */ + ipath_shutdown_device(dd); + if (dd->verbs_dev) ipath_unregister_ib_device(dd->verbs_dev); @@ -754,9 +770,42 @@ static int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT; } -void ipath_decode_err(char *buf, size_t blen, ipath_err_t err) +/* + * Decode the error status into strings, deciding whether to always + * print * it or not depending on "normal packet errors" vs everything + * else. Return 1 if "real" errors, otherwise 0 if only packet + * errors, so caller can decide what to print with the string. + */ +int ipath_decode_err(char *buf, size_t blen, ipath_err_t err) { + int iserr = 1; *buf = '\0'; + if (err & INFINIPATH_E_PKTERRS) { + if (!(err & ~INFINIPATH_E_PKTERRS)) + iserr = 0; // if only packet errors. + if (ipath_debug & __IPATH_ERRPKTDBG) { + if (err & INFINIPATH_E_REBP) + strlcat(buf, "EBP ", blen); + if (err & INFINIPATH_E_RVCRC) + strlcat(buf, "VCRC ", blen); + if (err & INFINIPATH_E_RICRC) { + strlcat(buf, "CRC ", blen); + // clear for check below, so only once + err &= INFINIPATH_E_RICRC; + } + if (err & INFINIPATH_E_RSHORTPKTLEN) + strlcat(buf, "rshortpktlen ", blen); + if (err & INFINIPATH_E_SDROPPEDDATAPKT) + strlcat(buf, "sdroppeddatapkt ", blen); + if (err & INFINIPATH_E_SPKTLEN) + strlcat(buf, "spktlen ", blen); + } + if ((err & INFINIPATH_E_RICRC) && + !(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP))) + strlcat(buf, "CRC ", blen); + if (!iserr) + goto done; + } if (err & INFINIPATH_E_RHDRLEN) strlcat(buf, "rhdrlen ", blen); if (err & INFINIPATH_E_RBADTID) @@ -767,12 +816,12 @@ void ipath_decode_err(char *buf, size_t blen, ipath_err_t err) strlcat(buf, "rhdr ", blen); if (err & INFINIPATH_E_RLONGPKTLEN) strlcat(buf, "rlongpktlen ", blen); - if (err & INFINIPATH_E_RSHORTPKTLEN) - strlcat(buf, "rshortpktlen ", blen); if (err & INFINIPATH_E_RMAXPKTLEN) strlcat(buf, "rmaxpktlen ", blen); if (err & INFINIPATH_E_RMINPKTLEN) strlcat(buf, "rminpktlen ", blen); + if (err & INFINIPATH_E_SMINPKTLEN) + strlcat(buf, "sminpktlen ", blen); if (err & INFINIPATH_E_RFORMATERR) strlcat(buf, "rformaterr ", blen); if (err & INFINIPATH_E_RUNSUPVL) @@ -781,32 +830,20 @@ void ipath_decode_err(char *buf, size_t blen, ipath_err_t err) strlcat(buf, "runexpchar ", blen); if (err & INFINIPATH_E_RIBFLOW) strlcat(buf, "ribflow ", blen); - if (err & INFINIPATH_E_REBP) - strlcat(buf, "EBP ", blen); if (err & INFINIPATH_E_SUNDERRUN) strlcat(buf, "sunderrun ", blen); if (err & INFINIPATH_E_SPIOARMLAUNCH) strlcat(buf, "spioarmlaunch ", blen); if (err & INFINIPATH_E_SUNEXPERRPKTNUM) strlcat(buf, "sunexperrpktnum ", blen); - if (err & INFINIPATH_E_SDROPPEDDATAPKT) - strlcat(buf, "sdroppeddatapkt ", blen); if (err & INFINIPATH_E_SDROPPEDSMPPKT) strlcat(buf, "sdroppedsmppkt ", blen); if (err & INFINIPATH_E_SMAXPKTLEN) strlcat(buf, "smaxpktlen ", blen); - if (err & INFINIPATH_E_SMINPKTLEN) - strlcat(buf, "sminpktlen ", blen); if (err & INFINIPATH_E_SUNSUPVL) strlcat(buf, "sunsupVL ", blen); - if (err & INFINIPATH_E_SPKTLEN) - strlcat(buf, "spktlen ", blen); if (err & INFINIPATH_E_INVALIDADDR) strlcat(buf, "invalidaddr ", blen); - if (err & INFINIPATH_E_RICRC) - strlcat(buf, "CRC ", blen); - if (err & INFINIPATH_E_RVCRC) - strlcat(buf, "VCRC ", blen); if (err & INFINIPATH_E_RRCVEGRFULL) strlcat(buf, "rcvegrfull ", blen); if (err & INFINIPATH_E_RRCVHDRFULL) @@ -819,6 +856,8 @@ void ipath_decode_err(char *buf, size_t blen, ipath_err_t err) strlcat(buf, "hardware ", blen); if (err & INFINIPATH_E_RESET) strlcat(buf, "reset ", blen); +done: + return iserr; } /** @@ -1662,6 +1701,22 @@ int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate) lstate = IPATH_LINKACTIVE; break; + case IPATH_IB_LINK_LOOPBACK: + dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n"); + dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + ret = 0; + goto bail; // no state change to wait for + + case IPATH_IB_LINK_EXTERNAL: + dev_info(&dd->pcidev->dev, "Disabling IB local loopback (normal)\n"); + dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + ret = 0; + goto bail; // no state change to wait for + default: ipath_dbg("Invalid linkstate 0x%x requested\n", newstate); ret = -EINVAL; @@ -1765,29 +1820,6 @@ int ipath_set_lid(struct ipath_devdata *dd, u32 arg, u8 lmc) return 0; } -/** - * ipath_read_kreg64_port - read a device's per-port 64-bit kernel register - * @dd: the infinipath device - * @regno: the register number to read - * @port: the port containing the register - * - * Registers that vary with the chip implementation constants (port) - * use this routine. - */ -u64 ipath_read_kreg64_port(const struct ipath_devdata *dd, ipath_kreg regno, - unsigned port) -{ - u16 where; - - if (port < dd->ipath_portcnt && - (regno == dd->ipath_kregs->kr_rcvhdraddr || - regno == dd->ipath_kregs->kr_rcvhdrtailaddr)) - where = regno + port; - else - where = -1; - - return ipath_read_kreg64(dd, where); -} /** * ipath_write_kreg_port - write a device's per-port 64-bit kernel register @@ -1973,7 +2005,8 @@ static int __init infinipath_init(void) { int ret; - ipath_dbg(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version); + if (ipath_debug & __IPATH_DBG) + printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version); /* * These must be called before the driver is registered with diff --git a/drivers/infiniband/hw/ipath/ipath_eeprom.c b/drivers/infiniband/hw/ipath/ipath_eeprom.c index a4019a6b7560..030185f90ee2 100644 --- a/drivers/infiniband/hw/ipath/ipath_eeprom.c +++ b/drivers/infiniband/hw/ipath/ipath_eeprom.c @@ -626,6 +626,10 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd) } else memcpy(dd->ipath_serial, ifp->if_serial, sizeof ifp->if_serial); + if (!strstr(ifp->if_comment, "Tested successfully")) + ipath_dev_err(dd, "Board SN %s did not pass functional " + "test: %s\n", dd->ipath_serial, + ifp->if_comment); ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n", (unsigned long long) be64_to_cpu(dd->ipath_guid)); diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 5d64ff875297..1272aaf2a785 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -41,12 +41,6 @@ #include "ipath_kernel.h" #include "ipath_common.h" -/* - * mmap64 doesn't allow all 64 bits for 32-bit applications - * so only use the low 43 bits. - */ -#define MMAP64_MASK 0x7FFFFFFFFFFUL - static int ipath_open(struct inode *, struct file *); static int ipath_close(struct inode *, struct file *); static ssize_t ipath_write(struct file *, const char __user *, size_t, @@ -63,6 +57,24 @@ static const struct file_operations ipath_file_ops = { .mmap = ipath_mmap }; +/* + * Convert kernel virtual addresses to physical addresses so they don't + * potentially conflict with the chip addresses used as mmap offsets. + * It doesn't really matter what mmap offset we use as long as we can + * interpret it correctly. + */ +static u64 cvt_kvaddr(void *p) +{ + struct page *page; + u64 paddr = 0; + + page = vmalloc_to_page(p); + if (page) + paddr = page_to_pfn(page) << PAGE_SHIFT; + + return paddr; +} + static int ipath_get_base_info(struct file *fp, void __user *ubase, size_t ubase_size) { @@ -87,7 +99,7 @@ static int ipath_get_base_info(struct file *fp, sz = sizeof(*kinfo); /* If port sharing is not requested, allow the old size structure */ if (!shared) - sz -= 3 * sizeof(u64); + sz -= 7 * sizeof(u64); if (ubase_size < sz) { ipath_cdbg(PROC, "Base size %zu, need %zu (version mismatch?)\n", @@ -165,24 +177,41 @@ static int ipath_get_base_info(struct file *fp, kinfo->spi_piobufbase = (u64) pd->port_piobufs + dd->ipath_palign * (dd->ipath_pbufsport - kinfo->spi_piocnt); - kinfo->__spi_uregbase = (u64) dd->ipath_uregbase + - dd->ipath_palign * pd->port_port; } else { unsigned slave = subport_fp(fp) - 1; kinfo->spi_piocnt = dd->ipath_pbufsport / subport_cnt; kinfo->spi_piobufbase = (u64) pd->port_piobufs + dd->ipath_palign * kinfo->spi_piocnt * slave; - kinfo->__spi_uregbase = ((u64) pd->subport_uregbase + - PAGE_SIZE * slave) & MMAP64_MASK; + } + if (shared) { + kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase + + dd->ipath_palign * pd->port_port; + kinfo->spi_port_rcvegrbuf = kinfo->spi_rcv_egrbufs; + kinfo->spi_port_rcvhdr_base = kinfo->spi_rcvhdr_base; + kinfo->spi_port_rcvhdr_tailaddr = kinfo->spi_rcvhdr_tailaddr; - kinfo->spi_rcvhdr_base = ((u64) pd->subport_rcvhdr_base + - pd->port_rcvhdrq_size * slave) & MMAP64_MASK; - kinfo->spi_rcvhdr_tailaddr = - (u64) pd->port_rcvhdrqtailaddr_phys & MMAP64_MASK; - kinfo->spi_rcv_egrbufs = ((u64) pd->subport_rcvegrbuf + - dd->ipath_rcvegrcnt * dd->ipath_rcvegrbufsize * slave) & - MMAP64_MASK; + kinfo->__spi_uregbase = cvt_kvaddr(pd->subport_uregbase + + PAGE_SIZE * subport_fp(fp)); + + kinfo->spi_rcvhdr_base = cvt_kvaddr(pd->subport_rcvhdr_base + + pd->port_rcvhdrq_size * subport_fp(fp)); + kinfo->spi_rcvhdr_tailaddr = 0; + kinfo->spi_rcv_egrbufs = cvt_kvaddr(pd->subport_rcvegrbuf + + pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size * + subport_fp(fp)); + + kinfo->spi_subport_uregbase = + cvt_kvaddr(pd->subport_uregbase); + kinfo->spi_subport_rcvegrbuf = + cvt_kvaddr(pd->subport_rcvegrbuf); + kinfo->spi_subport_rcvhdr_base = + cvt_kvaddr(pd->subport_rcvhdr_base); + ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n", + kinfo->spi_port, kinfo->spi_runtime_flags, + (unsigned long long) kinfo->spi_subport_uregbase, + (unsigned long long) kinfo->spi_subport_rcvegrbuf, + (unsigned long long) kinfo->spi_subport_rcvhdr_base); } kinfo->spi_pioindex = (kinfo->spi_piobufbase - dd->ipath_piobufbase) / @@ -199,20 +228,10 @@ static int ipath_get_base_info(struct file *fp, if (master) { kinfo->spi_runtime_flags |= IPATH_RUNTIME_MASTER; - kinfo->spi_subport_uregbase = - (u64) pd->subport_uregbase & MMAP64_MASK; - kinfo->spi_subport_rcvegrbuf = - (u64) pd->subport_rcvegrbuf & MMAP64_MASK; - kinfo->spi_subport_rcvhdr_base = - (u64) pd->subport_rcvhdr_base & MMAP64_MASK; - ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n", - kinfo->spi_port, kinfo->spi_runtime_flags, - (unsigned long long) kinfo->spi_subport_uregbase, - (unsigned long long) kinfo->spi_subport_rcvegrbuf, - (unsigned long long) kinfo->spi_subport_rcvhdr_base); } - if (copy_to_user(ubase, kinfo, sizeof(*kinfo))) + sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo); + if (copy_to_user(ubase, kinfo, sz)) ret = -EFAULT; bail: @@ -1132,67 +1151,55 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, struct ipath_devdata *dd; void *addr; size_t size; - int ret; + int ret = 0; /* If the port is not shared, all addresses should be physical */ - if (!pd->port_subport_cnt) { - ret = -EINVAL; + if (!pd->port_subport_cnt) goto bail; - } dd = pd->port_dd; size = pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size; /* - * Master has all the slave uregbase, rcvhdrq, and - * rcvegrbufs mmapped. + * Each process has all the subport uregbase, rcvhdrq, and + * rcvegrbufs mmapped - as an array for all the processes, + * and also separately for this process. */ - if (subport == 0) { - unsigned num_slaves = pd->port_subport_cnt - 1; - - if (pgaddr == ((u64) pd->subport_uregbase & MMAP64_MASK)) { - addr = pd->subport_uregbase; - size = PAGE_SIZE * num_slaves; - } else if (pgaddr == ((u64) pd->subport_rcvhdr_base & - MMAP64_MASK)) { - addr = pd->subport_rcvhdr_base; - size = pd->port_rcvhdrq_size * num_slaves; - } else if (pgaddr == ((u64) pd->subport_rcvegrbuf & - MMAP64_MASK)) { - addr = pd->subport_rcvegrbuf; - size *= num_slaves; - } else { - ret = -EINVAL; - goto bail; - } - } else if (pgaddr == (((u64) pd->subport_uregbase + - PAGE_SIZE * (subport - 1)) & MMAP64_MASK)) { - addr = pd->subport_uregbase + PAGE_SIZE * (subport - 1); - size = PAGE_SIZE; - } else if (pgaddr == (((u64) pd->subport_rcvhdr_base + - pd->port_rcvhdrq_size * (subport - 1)) & - MMAP64_MASK)) { - addr = pd->subport_rcvhdr_base + - pd->port_rcvhdrq_size * (subport - 1); - size = pd->port_rcvhdrq_size; - } else if (pgaddr == (((u64) pd->subport_rcvegrbuf + - size * (subport - 1)) & MMAP64_MASK)) { - addr = pd->subport_rcvegrbuf + size * (subport - 1); - /* rcvegrbufs are read-only on the slave */ - if (vma->vm_flags & VM_WRITE) { - dev_info(&dd->pcidev->dev, - "Can't map eager buffers as " - "writable (flags=%lx)\n", vma->vm_flags); - ret = -EPERM; - goto bail; - } - /* - * Don't allow permission to later change to writeable - * with mprotect. - */ - vma->vm_flags &= ~VM_MAYWRITE; + if (pgaddr == cvt_kvaddr(pd->subport_uregbase)) { + addr = pd->subport_uregbase; + size = PAGE_SIZE * pd->port_subport_cnt; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base)) { + addr = pd->subport_rcvhdr_base; + size = pd->port_rcvhdrq_size * pd->port_subport_cnt; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf)) { + addr = pd->subport_rcvegrbuf; + size *= pd->port_subport_cnt; + } else if (pgaddr == cvt_kvaddr(pd->subport_uregbase + + PAGE_SIZE * subport)) { + addr = pd->subport_uregbase + PAGE_SIZE * subport; + size = PAGE_SIZE; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base + + pd->port_rcvhdrq_size * subport)) { + addr = pd->subport_rcvhdr_base + + pd->port_rcvhdrq_size * subport; + size = pd->port_rcvhdrq_size; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf + + size * subport)) { + addr = pd->subport_rcvegrbuf + size * subport; + /* rcvegrbufs are read-only on the slave */ + if (vma->vm_flags & VM_WRITE) { + dev_info(&dd->pcidev->dev, + "Can't map eager buffers as " + "writable (flags=%lx)\n", vma->vm_flags); + ret = -EPERM; + goto bail; + } + /* + * Don't allow permission to later change to writeable + * with mprotect. + */ + vma->vm_flags &= ~VM_MAYWRITE; } else { - ret = -EINVAL; goto bail; } len = vma->vm_end - vma->vm_start; @@ -1205,7 +1212,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; vma->vm_ops = &ipath_file_vm_ops; vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; - ret = 0; + ret = 1; bail: return ret; @@ -1265,19 +1272,20 @@ static int ipath_mmap(struct file *fp, struct vm_area_struct *vma) * Check for kernel virtual addresses first, anything else must * match a HW or memory address. */ - if (pgaddr >= (1ULL<<40)) { - ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp)); + ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp)); + if (ret) { + if (ret > 0) + ret = 0; goto bail; } + ureg = dd->ipath_uregbase + dd->ipath_palign * pd->port_port; if (!pd->port_subport_cnt) { /* port is not shared */ - ureg = dd->ipath_uregbase + dd->ipath_palign * pd->port_port; piocnt = dd->ipath_pbufsport; piobufs = pd->port_piobufs; } else if (!subport_fp(fp)) { /* caller is the master */ - ureg = dd->ipath_uregbase + dd->ipath_palign * pd->port_port; piocnt = (dd->ipath_pbufsport / pd->port_subport_cnt) + (dd->ipath_pbufsport % pd->port_subport_cnt); piobufs = pd->port_piobufs + @@ -1286,7 +1294,6 @@ static int ipath_mmap(struct file *fp, struct vm_area_struct *vma) unsigned slave = subport_fp(fp) - 1; /* caller is a slave */ - ureg = 0; piocnt = dd->ipath_pbufsport / pd->port_subport_cnt; piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave; } @@ -1300,9 +1307,6 @@ static int ipath_mmap(struct file *fp, struct vm_area_struct *vma) ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0, (void *) dd->ipath_pioavailregs_dma, "pioavail registers"); - else if (subport_fp(fp)) - /* Subports don't mmap the physical receive buffers */ - ret = -EINVAL; else if (pgaddr == pd->port_rcvegr_phys) ret = mmap_rcvegrbufs(vma, pd); else if (pgaddr == (u64) pd->port_rcvhdrq_phys) @@ -1400,32 +1404,41 @@ static int init_subports(struct ipath_devdata *dd, const struct ipath_user_info *uinfo) { int ret = 0; - unsigned num_slaves; + unsigned num_subports; size_t size; - /* Old user binaries don't know about subports */ - if ((uinfo->spu_userversion & 0xffff) != IPATH_USER_SWMINOR) - goto bail; /* * If the user is requesting zero or one port, * skip the subport allocation. */ if (uinfo->spu_subport_cnt <= 1) goto bail; - if (uinfo->spu_subport_cnt > 4) { + + /* Old user binaries don't know about new subport implementation */ + if ((uinfo->spu_userversion & 0xffff) != IPATH_USER_SWMINOR) { + dev_info(&dd->pcidev->dev, + "Mismatched user minor version (%d) and driver " + "minor version (%d) while port sharing. Ensure " + "that driver and library are from the same " + "release.\n", + (int) (uinfo->spu_userversion & 0xffff), + IPATH_USER_SWMINOR); + goto bail; + } + if (uinfo->spu_subport_cnt > INFINIPATH_MAX_SUBPORT) { ret = -EINVAL; goto bail; } - num_slaves = uinfo->spu_subport_cnt - 1; - pd->subport_uregbase = vmalloc(PAGE_SIZE * num_slaves); + num_subports = uinfo->spu_subport_cnt; + pd->subport_uregbase = vmalloc(PAGE_SIZE * num_subports); if (!pd->subport_uregbase) { ret = -ENOMEM; goto bail; } /* Note: pd->port_rcvhdrq_size isn't initialized yet. */ size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize * - sizeof(u32), PAGE_SIZE) * num_slaves; + sizeof(u32), PAGE_SIZE) * num_subports; pd->subport_rcvhdr_base = vmalloc(size); if (!pd->subport_rcvhdr_base) { ret = -ENOMEM; @@ -1434,7 +1447,7 @@ static int init_subports(struct ipath_devdata *dd, pd->subport_rcvegrbuf = vmalloc(pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size * - num_slaves); + num_subports); if (!pd->subport_rcvegrbuf) { ret = -ENOMEM; goto bail_rhdr; @@ -1443,6 +1456,12 @@ static int init_subports(struct ipath_devdata *dd, pd->port_subport_cnt = uinfo->spu_subport_cnt; pd->port_subport_id = uinfo->spu_subport_id; pd->active_slaves = 1; + set_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag); + memset(pd->subport_uregbase, 0, PAGE_SIZE * num_subports); + memset(pd->subport_rcvhdr_base, 0, size); + memset(pd->subport_rcvegrbuf, 0, pd->port_rcvegrbuf_chunks * + pd->port_rcvegrbuf_size * + num_subports); goto bail; bail_rhdr: @@ -1573,18 +1592,19 @@ static int find_best_unit(struct file *fp, */ if (!cpus_empty(current->cpus_allowed) && !cpus_full(current->cpus_allowed)) { - int ncpus = num_online_cpus(), curcpu = -1; + int ncpus = num_online_cpus(), curcpu = -1, nset = 0; for (i = 0; i < ncpus; i++) if (cpu_isset(i, current->cpus_allowed)) { ipath_cdbg(PROC, "%s[%u] affinity set for " - "cpu %d\n", current->comm, - current->pid, i); + "cpu %d/%d\n", current->comm, + current->pid, i, ncpus); curcpu = i; + nset++; } - if (curcpu != -1) { + if (curcpu != -1 && nset != ncpus) { if (npresent) { prefunit = curcpu / (ncpus / npresent); - ipath_dbg("%s[%u] %d chips, %d cpus, " + ipath_cdbg(PROC,"%s[%u] %d chips, %d cpus, " "%d cpus/chip, select unit %d\n", current->comm, current->pid, npresent, ncpus, ncpus / npresent, @@ -1764,11 +1784,17 @@ static int ipath_do_user_init(struct file *fp, const struct ipath_user_info *uinfo) { int ret; - struct ipath_portdata *pd; + struct ipath_portdata *pd = port_fp(fp); struct ipath_devdata *dd; u32 head32; - pd = port_fp(fp); + /* Subports don't need to initialize anything since master did it. */ + if (subport_fp(fp)) { + ret = wait_event_interruptible(pd->port_wait, + !test_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag)); + goto done; + } + dd = pd->port_dd; if (uinfo->spu_rcvhdrsize) { @@ -1826,6 +1852,11 @@ static int ipath_do_user_init(struct file *fp, dd->ipath_rcvctrl & ~INFINIPATH_R_TAILUPD); ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, dd->ipath_rcvctrl); + /* Notify any waiting slaves */ + if (pd->port_subport_cnt) { + clear_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag); + wake_up(&pd->port_wait); + } done: return ret; } @@ -2017,6 +2048,17 @@ static int ipath_get_slave_info(struct ipath_portdata *pd, return ret; } +static int ipath_force_pio_avail_update(struct ipath_devdata *dd) +{ + u64 reg = dd->ipath_sendctrl; + + clear_bit(IPATH_S_PIOBUFAVAILUPD, ®); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, reg); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + + return 0; +} + static ssize_t ipath_write(struct file *fp, const char __user *data, size_t count, loff_t *off) { @@ -2071,27 +2113,35 @@ static ssize_t ipath_write(struct file *fp, const char __user *data, dest = &cmd.cmd.part_key; src = &ucmd->cmd.part_key; break; - case IPATH_CMD_SLAVE_INFO: + case __IPATH_CMD_SLAVE_INFO: copy = sizeof(cmd.cmd.slave_mask_addr); dest = &cmd.cmd.slave_mask_addr; src = &ucmd->cmd.slave_mask_addr; break; + case IPATH_CMD_PIOAVAILUPD: // force an update of PIOAvail reg + copy = 0; + src = NULL; + dest = NULL; + break; default: ret = -EINVAL; goto bail; } - if ((count - consumed) < copy) { - ret = -EINVAL; - goto bail; - } + if (copy) { + if ((count - consumed) < copy) { + ret = -EINVAL; + goto bail; + } - if (copy_from_user(dest, src, copy)) { - ret = -EFAULT; - goto bail; + if (copy_from_user(dest, src, copy)) { + ret = -EFAULT; + goto bail; + } + + consumed += copy; } - consumed += copy; pd = port_fp(fp); if (!pd && cmd.type != __IPATH_CMD_USER_INIT && cmd.type != IPATH_CMD_ASSIGN_PORT) { @@ -2137,11 +2187,14 @@ static ssize_t ipath_write(struct file *fp, const char __user *data, case IPATH_CMD_SET_PART_KEY: ret = ipath_set_part_key(pd, cmd.cmd.part_key); break; - case IPATH_CMD_SLAVE_INFO: + case __IPATH_CMD_SLAVE_INFO: ret = ipath_get_slave_info(pd, (void __user *) (unsigned long) cmd.cmd.slave_mask_addr); break; + case IPATH_CMD_PIOAVAILUPD: + ret = ipath_force_pio_avail_update(pd->port_dd); + break; } if (ret >= 0) diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c index 993482545021..4171198fc202 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6110.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c @@ -43,6 +43,9 @@ #include "ipath_kernel.h" #include "ipath_registers.h" +static void ipath_setup_ht_setextled(struct ipath_devdata *, u64, u64); + + /* * This lists the InfiniPath registers, in the actual chip layout. * This structure should never be directly accessed. @@ -208,8 +211,8 @@ static const struct ipath_kregs ipath_ht_kregs = { .kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus), .kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig), /* - * These should not be used directly via ipath_read_kreg64(), - * use them with ipath_read_kreg64_port(), + * These should not be used directly via ipath_write_kreg64(), + * use them with ipath_write_kreg64_port(), */ .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0), .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0) @@ -284,6 +287,14 @@ static const struct ipath_cregs ipath_ht_cregs = { #define INFINIPATH_EXTS_MEMBIST_ENDTEST 0x0000000000004000 #define INFINIPATH_EXTS_MEMBIST_CORRECT 0x0000000000008000 + +/* TID entries (memory), HT-only */ +#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL /* 40 bits valid */ +#define INFINIPATH_RT_VALID 0x8000000000000000ULL +#define INFINIPATH_RT_ADDR_SHIFT 0 +#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL +#define INFINIPATH_RT_BUFSIZE_SHIFT 48 + /* * masks and bits that are different in different chips, or present only * in one @@ -402,6 +413,14 @@ static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = { INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"), }; +#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \ + INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \ + << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) +#define RXE_EAGER_PARITY (INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID \ + << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) + +static int ipath_ht_txe_recover(struct ipath_devdata *); + /** * ipath_ht_handle_hwerrors - display hardware errors. * @dd: the infinipath device @@ -450,13 +469,12 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, /* * make sure we get this much out, unless told to be quiet, + * it's a parity error we may recover from, * or it's occurred within the last 5 seconds */ - if ((hwerrs & ~(dd->ipath_lasthwerror | - ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | - INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) - << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) || - (ipath_debug & __IPATH_VERBDBG)) + if ((hwerrs & ~(dd->ipath_lasthwerror | TXE_PIO_PARITY | + RXE_EAGER_PARITY)) || + (ipath_debug & __IPATH_VERBDBG)) dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx " "(cleared)\n", (unsigned long long) hwerrs); dd->ipath_lasthwerror |= hwerrs; @@ -467,7 +485,7 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, (hwerrs & ~dd->ipath_hwe_bitsextant)); ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); - if (ctrl & INFINIPATH_C_FREEZEMODE) { + if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) { /* * parity errors in send memory are recoverable, * just cancel the send (if indicated in * sendbuffererror), @@ -476,50 +494,14 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, * occur if a processor speculative read is done to the PIO * buffer while we are sending a packet, for example. */ - if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | - INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) - << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) { - ipath_stats.sps_txeparity++; - ipath_dbg("Recovering from TXE parity error (%llu), " - "hwerrstatus=%llx\n", - (unsigned long long) ipath_stats.sps_txeparity, - (unsigned long long) hwerrs); - ipath_disarm_senderrbufs(dd); - hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | - INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) - << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT); - if (!hwerrs) { /* else leave in freeze mode */ - ipath_write_kreg(dd, - dd->ipath_kregs->kr_control, - dd->ipath_control); - return; - } - } - if (hwerrs) { - /* - * if any set that we aren't ignoring; only - * make the complaint once, in case it's stuck - * or recurring, and we get here multiple - * times. - */ - if (dd->ipath_flags & IPATH_INITTED) { - ipath_dev_err(dd, "Fatal Hardware Error (freeze " - "mode), no longer usable, SN %.16s\n", - dd->ipath_serial); - isfatal = 1; - } - *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; - /* mark as having had error */ - *dd->ipath_statusp |= IPATH_STATUS_HWERROR; - /* - * mark as not usable, at a minimum until driver - * is reloaded, probably until reboot, since no - * other reset is possible. - */ - dd->ipath_flags &= ~IPATH_INITTED; - } else { - ipath_dbg("Clearing freezemode on ignored hardware " - "error\n"); + if ((hwerrs & TXE_PIO_PARITY) && ipath_ht_txe_recover(dd)) + hwerrs &= ~TXE_PIO_PARITY; + if (hwerrs & RXE_EAGER_PARITY) + ipath_dev_err(dd, "RXE parity, Eager TID error is not " + "recoverable\n"); + if (!hwerrs) { + ipath_dbg("Clearing freezemode on ignored or " + "recovered hardware error\n"); ctrl &= ~INFINIPATH_C_FREEZEMODE; ipath_write_kreg(dd, dd->ipath_kregs->kr_control, ctrl); @@ -587,7 +569,39 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, dd->ipath_hwerrmask); } - ipath_dev_err(dd, "%s hardware error\n", msg); + if (hwerrs) { + /* + * if any set that we aren't ignoring; only + * make the complaint once, in case it's stuck + * or recurring, and we get here multiple + * times. + * force link down, so switch knows, and + * LEDs are turned off + */ + if (dd->ipath_flags & IPATH_INITTED) { + ipath_set_linkstate(dd, IPATH_IB_LINKDOWN); + ipath_setup_ht_setextled(dd, + INFINIPATH_IBCS_L_STATE_DOWN, + INFINIPATH_IBCS_LT_STATE_DISABLED); + ipath_dev_err(dd, "Fatal Hardware Error (freeze " + "mode), no longer usable, SN %.16s\n", + dd->ipath_serial); + isfatal = 1; + } + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + /* mark as having had error */ + *dd->ipath_statusp |= IPATH_STATUS_HWERROR; + /* + * mark as not usable, at a minimum until driver + * is reloaded, probably until reboot, since no + * other reset is possible. + */ + dd->ipath_flags &= ~IPATH_INITTED; + } + else + *msg = 0; /* recovered from all of them */ + if (*msg) + ipath_dev_err(dd, "%s hardware error\n", msg); if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg) /* * for status file; if no trailing brace is copied, @@ -658,7 +672,8 @@ static int ipath_ht_boardname(struct ipath_devdata *dd, char *name, if (n) snprintf(name, namelen, "%s", n); - if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 || dd->ipath_minrev > 3)) { + if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 || + dd->ipath_minrev > 3)) { /* * This version of the driver only supports Rev 3.2 and 3.3 */ @@ -1163,6 +1178,8 @@ static void ipath_ht_init_hwerrors(struct ipath_devdata *dd) if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST)) ipath_dev_err(dd, "MemBIST did not complete!\n"); + if (extsval & INFINIPATH_EXTS_MEMBIST_CORRECT) + ipath_dbg("MemBIST corrected\n"); ipath_check_htlink(dd); @@ -1366,6 +1383,9 @@ static void ipath_ht_put_tid(struct ipath_devdata *dd, u64 __iomem *tidptr, u32 type, unsigned long pa) { + if (!dd->ipath_kregbase) + return; + if (pa != dd->ipath_tidinvalid) { if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) { dev_info(&dd->pcidev->dev, @@ -1382,10 +1402,10 @@ static void ipath_ht_put_tid(struct ipath_devdata *dd, pa |= lenvalid | INFINIPATH_RT_VALID; } } - if (dd->ipath_kregbase) - writeq(pa, tidptr); + writeq(pa, tidptr); } + /** * ipath_ht_clear_tid - clear all TID entries for a port, expected and eager * @dd: the infinipath device @@ -1515,7 +1535,7 @@ static int ipath_ht_early_init(struct ipath_devdata *dd) INFINIPATH_S_ABORT); ipath_get_eeprom_info(dd); - if(dd->ipath_boardrev == 5 && dd->ipath_serial[0] == '1' && + if (dd->ipath_boardrev == 5 && dd->ipath_serial[0] == '1' && dd->ipath_serial[1] == '2' && dd->ipath_serial[2] == '8') { /* * Later production QHT7040 has same changes as QHT7140, so @@ -1528,6 +1548,24 @@ static int ipath_ht_early_init(struct ipath_devdata *dd) return 0; } + +static int ipath_ht_txe_recover(struct ipath_devdata *dd) +{ + int cnt = ++ipath_stats.sps_txeparity; + if (cnt >= IPATH_MAX_PARITY_ATTEMPTS) { + if (cnt == IPATH_MAX_PARITY_ATTEMPTS) + ipath_dev_err(dd, + "Too many attempts to recover from " + "TXE parity, giving up\n"); + return 0; + } + dev_info(&dd->pcidev->dev, + "Recovering from TXE PIO parity error\n"); + ipath_disarm_senderrbufs(dd, 1); + return 1; +} + + /** * ipath_init_ht_get_base_info - set chip-specific flags for user code * @dd: the infinipath device diff --git a/drivers/infiniband/hw/ipath/ipath_iba6120.c b/drivers/infiniband/hw/ipath/ipath_iba6120.c index 05918e1e7c36..1b9c30857754 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6120.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6120.c @@ -43,6 +43,8 @@ #include "ipath_kernel.h" #include "ipath_registers.h" +static void ipath_setup_pe_setextled(struct ipath_devdata *, u64, u64); + /* * This file contains all the chip-specific register information and * access functions for the QLogic InfiniPath PCI-Express chip. @@ -207,8 +209,8 @@ static const struct ipath_kregs ipath_pe_kregs = { .kr_ibpllcfg = IPATH_KREG_OFFSET(IBPLLCfg), /* - * These should not be used directly via ipath_read_kreg64(), - * use them with ipath_read_kreg64_port() + * These should not be used directly via ipath_write_kreg64(), + * use them with ipath_write_kreg64_port(), */ .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0), .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0), @@ -321,6 +323,12 @@ static const struct ipath_hwerror_msgs ipath_6120_hwerror_msgs[] = { INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"), }; +#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \ + INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \ + << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) + +static int ipath_pe_txe_recover(struct ipath_devdata *); + /** * ipath_pe_handle_hwerrors - display hardware errors. * @dd: the infinipath device @@ -394,32 +402,21 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg, * occur if a processor speculative read is done to the PIO * buffer while we are sending a packet, for example. */ - if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | - INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) - << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) { - ipath_stats.sps_txeparity++; - ipath_dbg("Recovering from TXE parity error (%llu), " - "hwerrstatus=%llx\n", - (unsigned long long) ipath_stats.sps_txeparity, - (unsigned long long) hwerrs); - ipath_disarm_senderrbufs(dd); - hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | - INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) - << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT); - if (!hwerrs) { /* else leave in freeze mode */ - ipath_write_kreg(dd, - dd->ipath_kregs->kr_control, - dd->ipath_control); - return; - } - } + if ((hwerrs & TXE_PIO_PARITY) && ipath_pe_txe_recover(dd)) + hwerrs &= ~TXE_PIO_PARITY; if (hwerrs) { /* * if any set that we aren't ignoring only make the * complaint once, in case it's stuck or recurring, * and we get here multiple times + * Force link down, so switch knows, and + * LEDs are turned off */ if (dd->ipath_flags & IPATH_INITTED) { + ipath_set_linkstate(dd, IPATH_IB_LINKDOWN); + ipath_setup_pe_setextled(dd, + INFINIPATH_IBCS_L_STATE_DOWN, + INFINIPATH_IBCS_LT_STATE_DISABLED); ipath_dev_err(dd, "Fatal Hardware Error (freeze " "mode), no longer usable, SN %.16s\n", dd->ipath_serial); @@ -493,7 +490,8 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg, dd->ipath_hwerrmask); } - ipath_dev_err(dd, "%s hardware error\n", msg); + if (*msg) + ipath_dev_err(dd, "%s hardware error\n", msg); if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg) { /* * for /sys status file ; if no trailing } is copied, we'll @@ -581,6 +579,8 @@ static void ipath_pe_init_hwerrors(struct ipath_devdata *dd) if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST)) ipath_dev_err(dd, "MemBIST did not complete!\n"); + if (extsval & INFINIPATH_EXTS_MEMBIST_FOUND) + ipath_dbg("MemBIST corrected\n"); val = ~0ULL; /* barring bugs, all hwerrors become interrupts, */ @@ -1330,6 +1330,35 @@ static void ipath_pe_free_irq(struct ipath_devdata *dd) dd->ipath_irq = 0; } +/* + * On platforms using this chip, and not having ordered WC stores, we + * can get TXE parity errors due to speculative reads to the PIO buffers, + * and this, due to a chip bug can result in (many) false parity error + * reports. So it's a debug print on those, and an info print on systems + * where the speculative reads don't occur. + * Because we can get lots of false errors, we have no upper limit + * on recovery attempts on those platforms. + */ +static int ipath_pe_txe_recover(struct ipath_devdata *dd) +{ + if (ipath_unordered_wc()) + ipath_dbg("Recovering from TXE PIO parity error\n"); + else { + int cnt = ++ipath_stats.sps_txeparity; + if (cnt >= IPATH_MAX_PARITY_ATTEMPTS) { + if (cnt == IPATH_MAX_PARITY_ATTEMPTS) + ipath_dev_err(dd, + "Too many attempts to recover from " + "TXE parity, giving up\n"); + return 0; + } + dev_info(&dd->pcidev->dev, + "Recovering from TXE PIO parity error\n"); + } + ipath_disarm_senderrbufs(dd, 1); + return 1; +} + /** * ipath_init_iba6120_funcs - set up the chip-specific function pointers * @dd: the infinipath device diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c index d4f6b5239ef8..7045ba689494 100644 --- a/drivers/infiniband/hw/ipath/ipath_init_chip.c +++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c @@ -216,6 +216,20 @@ static int bringup_link(struct ipath_devdata *dd) return ret; } +static struct ipath_portdata *create_portdata0(struct ipath_devdata *dd) +{ + struct ipath_portdata *pd = NULL; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (pd) { + pd->port_dd = dd; + pd->port_cnt = 1; + /* The port 0 pkey table is used by the layer interface. */ + pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY; + } + return pd; +} + static int init_chip_first(struct ipath_devdata *dd, struct ipath_portdata **pdp) { @@ -271,20 +285,16 @@ static int init_chip_first(struct ipath_devdata *dd, goto done; } - dd->ipath_pd[0] = kzalloc(sizeof(*pd), GFP_KERNEL); + pd = create_portdata0(dd); - if (!dd->ipath_pd[0]) { + if (!pd) { ipath_dev_err(dd, "Unable to allocate portdata for port " "0, failing\n"); ret = -ENOMEM; goto done; } - pd = dd->ipath_pd[0]; - pd->port_dd = dd; - pd->port_port = 0; - pd->port_cnt = 1; - /* The port 0 pkey table is used by the layer interface. */ - pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY; + dd->ipath_pd[0] = pd; + dd->ipath_rcvtidcnt = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt); dd->ipath_rcvtidbase = @@ -590,6 +600,10 @@ static int init_housekeeping(struct ipath_devdata *dd, goto done; } + + /* clear diagctrl register, in case diags were running and crashed */ + ipath_write_kreg (dd, dd->ipath_kregs->kr_hwdiagctrl, 0); + /* clear the initial reset flag, in case first driver load */ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, INFINIPATH_E_RESET); @@ -668,6 +682,7 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) { int ret = 0, i; u32 val32, kpiobufs; + u32 piobufs, uports; u64 val; struct ipath_portdata *pd = NULL; /* keep gcc4 happy */ gfp_t gfp_flags = GFP_USER | __GFP_COMP; @@ -702,16 +717,17 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) * the in memory DMA'ed copies of the registers. This has to * be done early, before we calculate lastport, etc. */ - val = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + piobufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; /* * calc number of pioavail registers, and save it; we have 2 * bits per buffer. */ - dd->ipath_pioavregs = ALIGN(val, sizeof(u64) * BITS_PER_BYTE / 2) + dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) / (sizeof(u64) * BITS_PER_BYTE / 2); + uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0; if (ipath_kpiobufs == 0) { /* not set by user (this is default) */ - if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) > 128) + if (piobufs >= (uports * IPATH_MIN_USER_PORT_BUFCNT) + 32) kpiobufs = 32; else kpiobufs = 16; @@ -719,31 +735,25 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) else kpiobufs = ipath_kpiobufs; - if (kpiobufs > - (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - - (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT))) { - i = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - - (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT); + if (kpiobufs + (uports * IPATH_MIN_USER_PORT_BUFCNT) > piobufs) { + i = (int) piobufs - + (int) (uports * IPATH_MIN_USER_PORT_BUFCNT); if (i < 0) i = 0; - dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs for " - "kernel leaves too few for %d user ports " + dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of " + "%d for kernel leaves too few for %d user ports " "(%d each); using %u\n", kpiobufs, - dd->ipath_cfgports - 1, - IPATH_MIN_USER_PORT_BUFCNT, i); + piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i); /* * shouldn't change ipath_kpiobufs, because could be * different for different devices... */ kpiobufs = i; } - dd->ipath_lastport_piobuf = - dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - kpiobufs; - dd->ipath_pbufsport = dd->ipath_cfgports > 1 - ? dd->ipath_lastport_piobuf / (dd->ipath_cfgports - 1) - : 0; - val32 = dd->ipath_lastport_piobuf - - (dd->ipath_pbufsport * (dd->ipath_cfgports - 1)); + dd->ipath_lastport_piobuf = piobufs - kpiobufs; + dd->ipath_pbufsport = + uports ? dd->ipath_lastport_piobuf / uports : 0; + val32 = dd->ipath_lastport_piobuf - (dd->ipath_pbufsport * uports); if (val32 > 0) { ipath_dbg("allocating %u pbufs/port leaves %u unused, " "add to kernel\n", dd->ipath_pbufsport, val32); @@ -754,8 +764,7 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) dd->ipath_lastpioindex = dd->ipath_lastport_piobuf; ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u " "each for %u user ports\n", kpiobufs, - dd->ipath_piobcnt2k + dd->ipath_piobcnt4k, - dd->ipath_pbufsport, dd->ipath_cfgports - 1); + piobufs, dd->ipath_pbufsport, uports); dd->ipath_f_early_init(dd); @@ -839,11 +848,24 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) * Set up the port 0 (kernel) rcvhdr q and egr TIDs. If doing * re-init, the simplest way to handle this is to free * existing, and re-allocate. + * Need to re-create rest of port 0 portdata as well. */ if (reinit) { - struct ipath_portdata *pd = dd->ipath_pd[0]; - dd->ipath_pd[0] = NULL; - ipath_free_pddata(dd, pd); + /* Alloc and init new ipath_portdata for port0, + * Then free old pd. Could lead to fragmentation, but also + * makes later support for hot-swap easier. + */ + struct ipath_portdata *npd; + npd = create_portdata0(dd); + if (npd) { + ipath_free_pddata(dd, pd); + dd->ipath_pd[0] = pd = npd; + } else { + ipath_dev_err(dd, "Unable to allocate portdata for" + " port 0, failing\n"); + ret = -ENOMEM; + goto done; + } } dd->ipath_f_tidtemplate(dd); ret = ipath_create_rcvhdrq(dd, pd); diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 72b9e279d19d..45d033169c6e 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -38,10 +38,39 @@ #include "ipath_common.h" /* + * clear (write) a pio buffer, to clear a parity error. This routine + * should only be called when in freeze mode, and the buffer should be + * canceled afterwards. + */ +static void ipath_clrpiobuf(struct ipath_devdata *dd, u32 pnum) +{ + u32 __iomem *pbuf; + u32 dwcnt; /* dword count to write */ + if (pnum < dd->ipath_piobcnt2k) { + pbuf = (u32 __iomem *) (dd->ipath_pio2kbase + pnum * + dd->ipath_palign); + dwcnt = dd->ipath_piosize2k >> 2; + } + else { + pbuf = (u32 __iomem *) (dd->ipath_pio4kbase + + (pnum - dd->ipath_piobcnt2k) * dd->ipath_4kalign); + dwcnt = dd->ipath_piosize4k >> 2; + } + dev_info(&dd->pcidev->dev, + "Rewrite PIO buffer %u, to recover from parity error\n", + pnum); + *pbuf = dwcnt+1; /* no flush required, since already in freeze */ + while(--dwcnt) + *pbuf++ = 0; +} + +/* * Called when we might have an error that is specific to a particular * PIO buffer, and may need to cancel that buffer, so it can be re-used. + * If rewrite is true, and bits are set in the sendbufferror registers, + * we'll write to the buffer, for error recovery on parity errors. */ -void ipath_disarm_senderrbufs(struct ipath_devdata *dd) +void ipath_disarm_senderrbufs(struct ipath_devdata *dd, int rewrite) { u32 piobcnt; unsigned long sbuf[4]; @@ -74,8 +103,11 @@ void ipath_disarm_senderrbufs(struct ipath_devdata *dd) } for (i = 0; i < piobcnt; i++) - if (test_bit(i, sbuf)) + if (test_bit(i, sbuf)) { + if (rewrite) + ipath_clrpiobuf(dd, i); ipath_disarm_piobufs(dd, i, 1); + } dd->ipath_lastcancel = jiffies+3; /* no armlaunch for a bit */ } } @@ -114,7 +146,7 @@ static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) { u64 ignore_this_time = 0; - ipath_disarm_senderrbufs(dd); + ipath_disarm_senderrbufs(dd, 0); if ((errs & E_SUM_LINK_PKTERRS) && !(dd->ipath_flags & IPATH_LINKACTIVE)) { /* @@ -403,10 +435,13 @@ static void handle_supp_msgs(struct ipath_devdata *dd, * happens so often we never want to count it. */ if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) { - ipath_decode_err(msg, sizeof msg, dd->ipath_lasterror & - ~INFINIPATH_E_IBSTATUSCHANGED); + int iserr; + iserr = ipath_decode_err(msg, sizeof msg, + dd->ipath_lasterror & + ~INFINIPATH_E_IBSTATUSCHANGED); if (dd->ipath_lasterror & - ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL)) + ~(INFINIPATH_E_RRCVEGRFULL | + INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS)) ipath_dev_err(dd, "Suppressed %u messages for " "fast-repeating errors (%s) (%llx)\n", supp_msgs, msg, @@ -420,8 +455,13 @@ static void handle_supp_msgs(struct ipath_devdata *dd, * them. So only complain about these at debug * level. */ - ipath_dbg("Suppressed %u messages for %s\n", - supp_msgs, msg); + if (iserr) + ipath_dbg("Suppressed %u messages for %s\n", + supp_msgs, msg); + else + ipath_cdbg(ERRPKT, + "Suppressed %u messages for %s\n", + supp_msgs, msg); } } } @@ -462,7 +502,7 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) { char msg[512]; u64 ignore_this_time = 0; - int i; + int i, iserr = 0; int chkerrpkts = 0, noprint = 0; unsigned supp_msgs; @@ -502,6 +542,7 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) } if (supp_msgs == 250000) { + int s_iserr; /* * It's not entirely reasonable assuming that the errors set * in the last clear period are all responsible for the @@ -511,17 +552,17 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) dd->ipath_maskederrs |= dd->ipath_lasterror | errs; ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, ~dd->ipath_maskederrs); - ipath_decode_err(msg, sizeof msg, + s_iserr = ipath_decode_err(msg, sizeof msg, (dd->ipath_maskederrs & ~dd-> ipath_ignorederrs)); if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) & - ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL)) - ipath_dev_err(dd, "Disabling error(s) %llx because " - "occurring too frequently (%s)\n", - (unsigned long long) - (dd->ipath_maskederrs & - ~dd->ipath_ignorederrs), msg); + ~(INFINIPATH_E_RRCVEGRFULL | + INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS)) + ipath_dev_err(dd, "Temporarily disabling " + "error(s) %llx reporting; too frequent (%s)\n", + (unsigned long long) (dd->ipath_maskederrs & + ~dd->ipath_ignorederrs), msg); else { /* * rcvegrfull and rcvhdrqfull are "normal", @@ -530,8 +571,15 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) * processing them. So only complain about * these at debug level. */ - ipath_dbg("Disabling frequent queue full errors " - "(%s)\n", msg); + if (s_iserr) + ipath_dbg("Temporarily disabling reporting " + "too frequent queue full errors (%s)\n", + msg); + else + ipath_cdbg(ERRPKT, + "Temporarily disabling reporting too" + " frequent packet errors (%s)\n", + msg); } /* @@ -589,6 +637,8 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) ipath_stats.sps_crcerrs++; chkerrpkts = 1; } + iserr = errs & ~(E_SUM_PKTERRS | INFINIPATH_E_PKTERRS); + /* * We don't want to print these two as they happen, or we can make @@ -677,8 +727,13 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) *dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF; } - if (!noprint && *msg) - ipath_dev_err(dd, "%s error\n", msg); + if (!noprint && *msg) { + if (iserr) + ipath_dev_err(dd, "%s error\n", msg); + else + dev_info(&dd->pcidev->dev, "%s packet problems\n", + msg); + } if (dd->ipath_state_wanted & dd->ipath_flags) { ipath_cdbg(VERBOSE, "driver wanted state %x, iflags now %x, " "waking\n", dd->ipath_state_wanted, @@ -819,11 +874,10 @@ static void handle_urcv(struct ipath_devdata *dd, u32 istat) struct ipath_portdata *pd = dd->ipath_pd[i]; if (portr & (1 << i) && pd && pd->port_cnt && test_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag)) { - int rcbit; clear_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag); - rcbit = i + INFINIPATH_R_INTRAVAIL_SHIFT; - clear_bit(1UL << rcbit, &dd->ipath_rcvctrl); + clear_bit(i + INFINIPATH_R_INTRAVAIL_SHIFT, + &dd->ipath_rcvctrl); wake_up_interruptible(&pd->port_wait); rcvdint = 1; } diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index 6d8d05fb5999..e900c2593f44 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -590,7 +590,6 @@ int ipath_enable_wc(struct ipath_devdata *dd); void ipath_disable_wc(struct ipath_devdata *dd); int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp); void ipath_shutdown_device(struct ipath_devdata *); -void ipath_disarm_senderrbufs(struct ipath_devdata *); struct file_operations; int ipath_cdev_init(int minor, char *name, const struct file_operations *fops, @@ -611,7 +610,7 @@ struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, gfp_t); extern int ipath_diag_inuse; irqreturn_t ipath_intr(int irq, void *devid); -void ipath_decode_err(char *buf, size_t blen, ipath_err_t err); +int ipath_decode_err(char *buf, size_t blen, ipath_err_t err); #if __IPATH_INFO || __IPATH_DBG extern const char *ipath_ibcstatus_str[]; #endif @@ -701,6 +700,8 @@ int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv); #define IPATH_PORT_WAITING_RCV 2 /* waiting for a PIO buffer to be available */ #define IPATH_PORT_WAITING_PIO 3 + /* master has not finished initializing */ +#define IPATH_PORT_MASTER_UNINIT 4 /* free up any allocated data at closes */ void ipath_free_data(struct ipath_portdata *dd); @@ -711,6 +712,7 @@ void ipath_init_iba6120_funcs(struct ipath_devdata *); void ipath_init_iba6110_funcs(struct ipath_devdata *); void ipath_get_eeprom_info(struct ipath_devdata *); u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg); +void ipath_disarm_senderrbufs(struct ipath_devdata *, int); /* * number of words used for protocol header if not set by ipath_userinit(); @@ -754,8 +756,6 @@ int ipath_eeprom_write(struct ipath_devdata *, u8, const void *, int); /* these are used for the registers that vary with port */ void ipath_write_kreg_port(const struct ipath_devdata *, ipath_kreg, unsigned, u64); -u64 ipath_read_kreg64_port(const struct ipath_devdata *, ipath_kreg, - unsigned); /* * We could have a single register get/put routine, that takes a group type, @@ -897,6 +897,8 @@ dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int); extern unsigned ipath_debug; /* debugging bit mask */ +#define IPATH_MAX_PARITY_ATTEMPTS 10000 /* max times to try recovery */ + const char *ipath_get_unit_name(int unit); extern struct mutex ipath_mutex; diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/infiniband/hw/ipath/ipath_keys.c index 851763d7d2db..dd487c100f5b 100644 --- a/drivers/infiniband/hw/ipath/ipath_keys.c +++ b/drivers/infiniband/hw/ipath/ipath_keys.c @@ -61,7 +61,7 @@ int ipath_alloc_lkey(struct ipath_lkey_table *rkt, struct ipath_mregion *mr) r = (r + 1) & (rkt->max - 1); if (r == n) { spin_unlock_irqrestore(&rkt->lock, flags); - ipath_dbg(KERN_INFO "LKEY table full\n"); + ipath_dbg("LKEY table full\n"); ret = 0; goto bail; } @@ -133,6 +133,12 @@ int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge, * being reversible by calling bus_to_virt(). */ if (sge->lkey == 0) { + struct ipath_pd *pd = to_ipd(qp->ibqp.pd); + + if (pd->user) { + ret = 0; + goto bail; + } isge->mr = NULL; isge->vaddr = (void *) sge->addr; isge->length = sge->length; @@ -206,6 +212,12 @@ int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss, * (see ipath_get_dma_mr and ipath_dma.c). */ if (rkey == 0) { + struct ipath_pd *pd = to_ipd(qp->ibqp.pd); + + if (pd->user) { + ret = 0; + goto bail; + } sge->mr = NULL; sge->vaddr = (void *) vaddr; sge->length = len; diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c index 8cc8598d6c69..31e70732e369 100644 --- a/drivers/infiniband/hw/ipath/ipath_mr.c +++ b/drivers/infiniband/hw/ipath/ipath_mr.c @@ -210,9 +210,15 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, m = 0; n = 0; list_for_each_entry(chunk, ®ion->chunk_list, list) { - for (i = 0; i < chunk->nmap; i++) { - mr->mr.map[m]->segs[n].vaddr = - page_address(chunk->page_list[i].page); + for (i = 0; i < chunk->nents; i++) { + void *vaddr; + + vaddr = page_address(chunk->page_list[i].page); + if (!vaddr) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + mr->mr.map[m]->segs[n].vaddr = vaddr; mr->mr.map[m]->segs[n].length = region->page_size; n++; if (n == IPATH_SEGSZ) { diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c index 64f07b19349f..16db9ac0b402 100644 --- a/drivers/infiniband/hw/ipath/ipath_qp.c +++ b/drivers/infiniband/hw/ipath/ipath_qp.c @@ -81,11 +81,51 @@ static u32 credit_table[31] = { 32768 /* 1E */ }; -static u32 alloc_qpn(struct ipath_qp_table *qpt) + +static void get_map_page(struct ipath_qp_table *qpt, struct qpn_map *map) +{ + unsigned long page = get_zeroed_page(GFP_KERNEL); + unsigned long flags; + + /* + * Free the page if someone raced with us installing it. + */ + + spin_lock_irqsave(&qpt->lock, flags); + if (map->page) + free_page(page); + else + map->page = (void *)page; + spin_unlock_irqrestore(&qpt->lock, flags); +} + + +static int alloc_qpn(struct ipath_qp_table *qpt, enum ib_qp_type type) { u32 i, offset, max_scan, qpn; struct qpn_map *map; - u32 ret; + u32 ret = -1; + + if (type == IB_QPT_SMI) + ret = 0; + else if (type == IB_QPT_GSI) + ret = 1; + + if (ret != -1) { + map = &qpt->map[0]; + if (unlikely(!map->page)) { + get_map_page(qpt, map); + if (unlikely(!map->page)) { + ret = -ENOMEM; + goto bail; + } + } + if (!test_and_set_bit(ret, map->page)) + atomic_dec(&map->n_free); + else + ret = -EBUSY; + goto bail; + } qpn = qpt->last + 1; if (qpn >= QPN_MAX) @@ -95,19 +135,7 @@ static u32 alloc_qpn(struct ipath_qp_table *qpt) max_scan = qpt->nmaps - !offset; for (i = 0;;) { if (unlikely(!map->page)) { - unsigned long page = get_zeroed_page(GFP_KERNEL); - unsigned long flags; - - /* - * Free the page if someone raced with us - * installing it: - */ - spin_lock_irqsave(&qpt->lock, flags); - if (map->page) - free_page(page); - else - map->page = (void *)page; - spin_unlock_irqrestore(&qpt->lock, flags); + get_map_page(qpt, map); if (unlikely(!map->page)) break; } @@ -151,7 +179,7 @@ static u32 alloc_qpn(struct ipath_qp_table *qpt) qpn = mk_qpn(qpt, map, offset); } - ret = 0; + ret = -ENOMEM; bail: return ret; @@ -180,29 +208,19 @@ static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp, enum ib_qp_type type) { unsigned long flags; - u32 qpn; int ret; - if (type == IB_QPT_SMI) - qpn = 0; - else if (type == IB_QPT_GSI) - qpn = 1; - else { - /* Allocate the next available QPN */ - qpn = alloc_qpn(qpt); - if (qpn == 0) { - ret = -ENOMEM; - goto bail; - } - } - qp->ibqp.qp_num = qpn; + ret = alloc_qpn(qpt, type); + if (ret < 0) + goto bail; + qp->ibqp.qp_num = ret; /* Add the QP to the hash table. */ spin_lock_irqsave(&qpt->lock, flags); - qpn %= qpt->max; - qp->next = qpt->table[qpn]; - qpt->table[qpn] = qp; + ret %= qpt->max; + qp->next = qpt->table[ret]; + qpt->table[ret] = qp; atomic_inc(&qp->refcount); spin_unlock_irqrestore(&qpt->lock, flags); @@ -245,9 +263,7 @@ static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp) if (!fnd) return; - /* If QPN is not reserved, mark QPN free in the bitmap. */ - if (qp->ibqp.qp_num > 1) - free_qpn(qpt, qp->ibqp.qp_num); + free_qpn(qpt, qp->ibqp.qp_num); wait_event(qp->wait, !atomic_read(&qp->refcount)); } @@ -270,11 +286,10 @@ void ipath_free_all_qps(struct ipath_qp_table *qpt) while (qp) { nqp = qp->next; - if (qp->ibqp.qp_num > 1) - free_qpn(qpt, qp->ibqp.qp_num); + free_qpn(qpt, qp->ibqp.qp_num); if (!atomic_dec_and_test(&qp->refcount) || !ipath_destroy_qp(&qp->ibqp)) - ipath_dbg(KERN_INFO "QP memory leak!\n"); + ipath_dbg("QP memory leak!\n"); qp = nqp; } } @@ -320,7 +335,8 @@ static void ipath_reset_qp(struct ipath_qp *qp) qp->remote_qpn = 0; qp->qkey = 0; qp->qp_access_flags = 0; - clear_bit(IPATH_S_BUSY, &qp->s_flags); + qp->s_busy = 0; + qp->s_flags &= ~IPATH_S_SIGNAL_REQ_WR; qp->s_hdrwords = 0; qp->s_psn = 0; qp->r_psn = 0; @@ -333,7 +349,6 @@ static void ipath_reset_qp(struct ipath_qp *qp) qp->r_state = IB_OPCODE_UC_SEND_LAST; } qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; - qp->r_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; qp->r_nak_state = 0; qp->r_wrid_valid = 0; qp->s_rnr_timeout = 0; @@ -344,6 +359,10 @@ static void ipath_reset_qp(struct ipath_qp *qp) qp->s_ssn = 1; qp->s_lsn = 0; qp->s_wait_credit = 0; + memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue)); + qp->r_head_ack_queue = 0; + qp->s_tail_ack_queue = 0; + qp->s_num_rd_atomic = 0; if (qp->r_rq.wq) { qp->r_rq.wq->head = 0; qp->r_rq.wq->tail = 0; @@ -357,7 +376,7 @@ static void ipath_reset_qp(struct ipath_qp *qp) * @err: the receive completion error to signal if a RWQE is active * * Flushes both send and receive work queues. - * QP s_lock should be held and interrupts disabled. + * The QP s_lock should be held and interrupts disabled. */ void ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err) @@ -365,7 +384,7 @@ void ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err) struct ipath_ibdev *dev = to_idev(qp->ibqp.device); struct ib_wc wc; - ipath_dbg(KERN_INFO "QP%d/%d in error state\n", + ipath_dbg("QP%d/%d in error state\n", qp->ibqp.qp_num, qp->remote_qpn); spin_lock(&dev->pending_lock); @@ -389,6 +408,8 @@ void ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err) wc.port_num = 0; if (qp->r_wrid_valid) { qp->r_wrid_valid = 0; + wc.wr_id = qp->r_wr_id; + wc.opcode = IB_WC_RECV; wc.status = err; ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 1); } @@ -503,13 +524,17 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, attr->path_mig_state != IB_MIG_REARM) goto inval; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + if (attr->max_dest_rd_atomic > IPATH_MAX_RDMA_ATOMIC) + goto inval; + switch (new_state) { case IB_QPS_RESET: ipath_reset_qp(qp); break; case IB_QPS_ERR: - ipath_error_qp(qp, IB_WC_GENERAL_ERR); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); break; default: @@ -559,6 +584,12 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr_mask & IB_QP_QKEY) qp->qkey = attr->qkey; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->r_max_rd_atomic = attr->max_dest_rd_atomic; + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) + qp->s_max_rd_atomic = attr->max_rd_atomic; + qp->state = new_state; spin_unlock_irqrestore(&qp->s_lock, flags); @@ -598,8 +629,8 @@ int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, attr->alt_pkey_index = 0; attr->en_sqd_async_notify = 0; attr->sq_draining = 0; - attr->max_rd_atomic = 1; - attr->max_dest_rd_atomic = 1; + attr->max_rd_atomic = qp->s_max_rd_atomic; + attr->max_dest_rd_atomic = qp->r_max_rd_atomic; attr->min_rnr_timer = qp->r_min_rnr_timer; attr->port_num = 1; attr->timeout = qp->timeout; @@ -614,7 +645,7 @@ int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, init_attr->recv_cq = qp->ibqp.recv_cq; init_attr->srq = qp->ibqp.srq; init_attr->cap = attr->cap; - if (qp->s_flags & (1 << IPATH_S_SIGNAL_REQ_WR)) + if (qp->s_flags & IPATH_S_SIGNAL_REQ_WR) init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; else init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; @@ -786,7 +817,7 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, qp->s_size = init_attr->cap.max_send_wr + 1; qp->s_max_sge = init_attr->cap.max_send_sge; if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) - qp->s_flags = 1 << IPATH_S_SIGNAL_REQ_WR; + qp->s_flags = IPATH_S_SIGNAL_REQ_WR; else qp->s_flags = 0; dev = to_idev(ibpd->device); @@ -958,7 +989,7 @@ bail: * @wc: the WC responsible for putting the QP in this state * * Flushes the send work queue. - * The QP s_lock should be held. + * The QP s_lock should be held and interrupts disabled. */ void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc) @@ -966,7 +997,7 @@ void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc) struct ipath_ibdev *dev = to_idev(qp->ibqp.device); struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last); - ipath_dbg(KERN_INFO "Send queue error on QP%d/%d: err: %d\n", + ipath_dbg("Send queue error on QP%d/%d: err: %d\n", qp->ibqp.qp_num, qp->remote_qpn, wc->status); spin_lock(&dev->pending_lock); @@ -984,12 +1015,12 @@ void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc) wc->status = IB_WC_WR_FLUSH_ERR; while (qp->s_last != qp->s_head) { + wqe = get_swqe_ptr(qp, qp->s_last); wc->wr_id = wqe->wr.wr_id; wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 1); if (++qp->s_last >= qp->s_size) qp->s_last = 0; - wqe = get_swqe_ptr(qp, qp->s_last); } qp->s_cur = qp->s_tail = qp->s_head; qp->state = IB_QPS_SQE; diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c index 5ff20cb04494..b4b88d0b53f5 100644 --- a/drivers/infiniband/hw/ipath/ipath_rc.c +++ b/drivers/infiniband/hw/ipath/ipath_rc.c @@ -37,6 +37,19 @@ /* cut down ridiculously long IB macro names */ #define OP(x) IB_OPCODE_RC_##x +static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu; + ss->sge = wqe->sg_list[0]; + ss->sg_list = wqe->sg_list + 1; + ss->num_sge = wqe->wr.num_sge; + ipath_skip_sge(ss, len); + return wqe->length - len; +} + /** * ipath_init_restart- initialize the qp->s_sge after a restart * @qp: the QP who's SGE we're restarting @@ -47,15 +60,9 @@ static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe) { struct ipath_ibdev *dev; - u32 len; - len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * - ib_mtu_enum_to_int(qp->path_mtu); - qp->s_sge.sge = wqe->sg_list[0]; - qp->s_sge.sg_list = wqe->sg_list + 1; - qp->s_sge.num_sge = wqe->wr.num_sge; - ipath_skip_sge(&qp->s_sge, len); - qp->s_len = wqe->length - len; + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, + ib_mtu_enum_to_int(qp->path_mtu)); dev = to_idev(qp->ibqp.device); spin_lock(&dev->pending_lock); if (list_empty(&qp->timerwait)) @@ -70,107 +77,123 @@ static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe) * @ohdr: a pointer to the IB header being constructed * @pmtu: the path MTU * - * Return bth0 if constructed; otherwise, return 0. + * Return 1 if constructed; otherwise, return 0. + * Note that we are in the responder's side of the QP context. * Note the QP s_lock must be held. */ -u32 ipath_make_rc_ack(struct ipath_qp *qp, - struct ipath_other_headers *ohdr, - u32 pmtu) +static int ipath_make_rc_ack(struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 pmtu, u32 *bth0p, u32 *bth2p) { + struct ipath_ack_entry *e; u32 hwords; u32 len; u32 bth0; + u32 bth2; /* header size in 32-bit words LRH+BTH = (8+12)/4. */ hwords = 5; - /* - * Send a response. Note that we are in the responder's - * side of the QP context. - */ switch (qp->s_ack_state) { - case OP(RDMA_READ_REQUEST): - qp->s_cur_sge = &qp->s_rdma_sge; - len = qp->s_rdma_len; - if (len > pmtu) { - len = pmtu; - qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); - } else - qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); - qp->s_rdma_len -= len; + case OP(RDMA_READ_RESPONSE_LAST): + case OP(RDMA_READ_RESPONSE_ONLY): + case OP(ATOMIC_ACKNOWLEDGE): + qp->s_ack_state = OP(ACKNOWLEDGE); + /* FALLTHROUGH */ + case OP(ACKNOWLEDGE): + /* Check for no next entry in the queue. */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { + if (qp->s_flags & IPATH_S_ACK_PENDING) + goto normal; + goto bail; + } + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST)) { + /* Copy SGE state in case we need to resend */ + qp->s_ack_rdma_sge = e->rdma_sge; + qp->s_cur_sge = &qp->s_ack_rdma_sge; + len = e->rdma_sge.sge.sge_length; + if (len > pmtu) { + len = pmtu; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); + } else { + qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); + if (++qp->s_tail_ack_queue > + IPATH_MAX_RDMA_ATOMIC) + qp->s_tail_ack_queue = 0; + } + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + qp->s_ack_rdma_psn = e->psn; + bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK; + } else { + /* COMPARE_SWAP or FETCH_ADD */ + qp->s_cur_sge = NULL; + len = 0; + qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); + ohdr->u.at.aeth = ipath_compute_aeth(qp); + ohdr->u.at.atomic_ack_eth[0] = + cpu_to_be32(e->atomic_data >> 32); + ohdr->u.at.atomic_ack_eth[1] = + cpu_to_be32(e->atomic_data); + hwords += sizeof(ohdr->u.at) / sizeof(u32); + bth2 = e->psn; + if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC) + qp->s_tail_ack_queue = 0; + } bth0 = qp->s_ack_state << 24; - ohdr->u.aeth = ipath_compute_aeth(qp); - hwords++; break; case OP(RDMA_READ_RESPONSE_FIRST): qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); /* FALLTHROUGH */ case OP(RDMA_READ_RESPONSE_MIDDLE): - qp->s_cur_sge = &qp->s_rdma_sge; - len = qp->s_rdma_len; + len = qp->s_ack_rdma_sge.sge.sge_length; if (len > pmtu) len = pmtu; else { ohdr->u.aeth = ipath_compute_aeth(qp); hwords++; qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC) + qp->s_tail_ack_queue = 0; } - qp->s_rdma_len -= len; bth0 = qp->s_ack_state << 24; - break; - - case OP(RDMA_READ_RESPONSE_LAST): - case OP(RDMA_READ_RESPONSE_ONLY): - /* - * We have to prevent new requests from changing - * the r_sge state while a ipath_verbs_send() - * is in progress. - */ - qp->s_ack_state = OP(ACKNOWLEDGE); - bth0 = 0; - goto bail; - - case OP(COMPARE_SWAP): - case OP(FETCH_ADD): - qp->s_cur_sge = NULL; - len = 0; - /* - * Set the s_ack_state so the receive interrupt handler - * won't try to send an ACK (out of order) until this one - * is actually sent. - */ - qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); - bth0 = OP(ATOMIC_ACKNOWLEDGE) << 24; - ohdr->u.at.aeth = ipath_compute_aeth(qp); - ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->r_atomic_data); - hwords += sizeof(ohdr->u.at) / 4; + bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK; break; default: - /* Send a regular ACK. */ - qp->s_cur_sge = NULL; - len = 0; + normal: /* - * Set the s_ack_state so the receive interrupt handler - * won't try to send an ACK (out of order) until this one - * is actually sent. + * Send a regular ACK. + * Set the s_ack_state so we wait until after sending + * the ACK before setting s_ack_state to ACKNOWLEDGE + * (see above). */ - qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); - bth0 = OP(ACKNOWLEDGE) << 24; + qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); + qp->s_flags &= ~IPATH_S_ACK_PENDING; + qp->s_cur_sge = NULL; if (qp->s_nak_state) - ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) | - (qp->s_nak_state << - IPATH_AETH_CREDIT_SHIFT)); + ohdr->u.aeth = + cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) | + (qp->s_nak_state << + IPATH_AETH_CREDIT_SHIFT)); else ohdr->u.aeth = ipath_compute_aeth(qp); hwords++; + len = 0; + bth0 = OP(ACKNOWLEDGE) << 24; + bth2 = qp->s_ack_psn & IPATH_PSN_MASK; } qp->s_hdrwords = hwords; qp->s_cur_size = len; + *bth0p = bth0; + *bth2p = bth2; + return 1; bail: - return bth0; + return 0; } /** @@ -197,9 +220,16 @@ int ipath_make_rc_req(struct ipath_qp *qp, u32 bth2; char newreq; + /* Sending responses has higher priority over sending requests. */ + if ((qp->r_head_ack_queue != qp->s_tail_ack_queue || + (qp->s_flags & IPATH_S_ACK_PENDING) || + qp->s_ack_state != IB_OPCODE_RC_ACKNOWLEDGE) && + ipath_make_rc_ack(qp, ohdr, pmtu, bth0p, bth2p)) + goto done; + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) || qp->s_rnr_timeout) - goto done; + goto bail; /* Limit the number of packets sent without an ACK. */ if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT) > 0) { @@ -210,7 +240,7 @@ int ipath_make_rc_req(struct ipath_qp *qp, list_add_tail(&qp->timerwait, &dev->pending[dev->pending_index]); spin_unlock(&dev->pending_lock); - goto done; + goto bail; } /* header size in 32-bit words LRH+BTH = (8+12)/4. */ @@ -232,7 +262,16 @@ int ipath_make_rc_req(struct ipath_qp *qp, if (qp->s_cur == qp->s_tail) { /* Check if send work queue is empty. */ if (qp->s_tail == qp->s_head) - goto done; + goto bail; + /* + * If a fence is requested, wait for previous + * RDMA read and atomic operations to finish. + */ + if ((wqe->wr.send_flags & IB_SEND_FENCE) && + qp->s_num_rd_atomic) { + qp->s_flags |= IPATH_S_FENCE_PENDING; + goto bail; + } wqe->psn = qp->s_next_psn; newreq = 1; } @@ -250,7 +289,7 @@ int ipath_make_rc_req(struct ipath_qp *qp, /* If no credit, return. */ if (qp->s_lsn != (u32) -1 && ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) - goto done; + goto bail; wqe->lpsn = wqe->psn; if (len > pmtu) { wqe->lpsn += (len - 1) / pmtu; @@ -281,13 +320,13 @@ int ipath_make_rc_req(struct ipath_qp *qp, /* If no credit, return. */ if (qp->s_lsn != (u32) -1 && ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) - goto done; + goto bail; ohdr->u.rc.reth.vaddr = cpu_to_be64(wqe->wr.wr.rdma.remote_addr); ohdr->u.rc.reth.rkey = cpu_to_be32(wqe->wr.wr.rdma.rkey); ohdr->u.rc.reth.length = cpu_to_be32(len); - hwords += sizeof(struct ib_reth) / 4; + hwords += sizeof(struct ib_reth) / sizeof(u32); wqe->lpsn = wqe->psn; if (len > pmtu) { wqe->lpsn += (len - 1) / pmtu; @@ -312,14 +351,17 @@ int ipath_make_rc_req(struct ipath_qp *qp, break; case IB_WR_RDMA_READ: - ohdr->u.rc.reth.vaddr = - cpu_to_be64(wqe->wr.wr.rdma.remote_addr); - ohdr->u.rc.reth.rkey = - cpu_to_be32(wqe->wr.wr.rdma.rkey); - ohdr->u.rc.reth.length = cpu_to_be32(len); - qp->s_state = OP(RDMA_READ_REQUEST); - hwords += sizeof(ohdr->u.rc.reth) / 4; + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= IPATH_S_RDMAR_PENDING; + goto bail; + } + qp->s_num_rd_atomic++; if (qp->s_lsn != (u32) -1) qp->s_lsn++; /* @@ -330,6 +372,13 @@ int ipath_make_rc_req(struct ipath_qp *qp, qp->s_next_psn += (len - 1) / pmtu; wqe->lpsn = qp->s_next_psn++; } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); ss = NULL; len = 0; if (++qp->s_cur == qp->s_size) @@ -338,32 +387,48 @@ int ipath_make_rc_req(struct ipath_qp *qp, case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: - if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) - qp->s_state = OP(COMPARE_SWAP); - else - qp->s_state = OP(FETCH_ADD); - ohdr->u.atomic_eth.vaddr = cpu_to_be64( - wqe->wr.wr.atomic.remote_addr); - ohdr->u.atomic_eth.rkey = cpu_to_be32( - wqe->wr.wr.atomic.rkey); - ohdr->u.atomic_eth.swap_data = cpu_to_be64( - wqe->wr.wr.atomic.swap); - ohdr->u.atomic_eth.compare_data = cpu_to_be64( - wqe->wr.wr.atomic.compare_add); - hwords += sizeof(struct ib_atomic_eth) / 4; + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= IPATH_S_RDMAR_PENDING; + goto bail; + } + qp->s_num_rd_atomic++; if (qp->s_lsn != (u32) -1) qp->s_lsn++; wqe->lpsn = wqe->psn; } - if (++qp->s_cur == qp->s_size) - qp->s_cur = 0; + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + qp->s_state = OP(COMPARE_SWAP); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.swap); + ohdr->u.atomic_eth.compare_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + } else { + qp->s_state = OP(FETCH_ADD); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + ohdr->u.atomic_eth.compare_data = 0; + } + ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr >> 32); + ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr); + ohdr->u.atomic_eth.rkey = cpu_to_be32( + wqe->wr.wr.atomic.rkey); + hwords += sizeof(struct ib_atomic_eth) / sizeof(u32); ss = NULL; len = 0; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; break; default: - goto done; + goto bail; } qp->s_sge.sge = wqe->sg_list[0]; qp->s_sge.sg_list = wqe->sg_list + 1; @@ -379,7 +444,7 @@ int ipath_make_rc_req(struct ipath_qp *qp, qp->s_psn = wqe->lpsn + 1; else { qp->s_psn++; - if ((int)(qp->s_psn - qp->s_next_psn) > 0) + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) qp->s_next_psn = qp->s_psn; } /* @@ -406,7 +471,7 @@ int ipath_make_rc_req(struct ipath_qp *qp, /* FALLTHROUGH */ case OP(SEND_MIDDLE): bth2 = qp->s_psn++ & IPATH_PSN_MASK; - if ((int)(qp->s_psn - qp->s_next_psn) > 0) + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) qp->s_next_psn = qp->s_psn; ss = &qp->s_sge; len = qp->s_len; @@ -442,7 +507,7 @@ int ipath_make_rc_req(struct ipath_qp *qp, /* FALLTHROUGH */ case OP(RDMA_WRITE_MIDDLE): bth2 = qp->s_psn++ & IPATH_PSN_MASK; - if ((int)(qp->s_psn - qp->s_next_psn) > 0) + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) qp->s_next_psn = qp->s_psn; ss = &qp->s_sge; len = qp->s_len; @@ -479,9 +544,9 @@ int ipath_make_rc_req(struct ipath_qp *qp, cpu_to_be32(wqe->wr.wr.rdma.rkey); ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len); qp->s_state = OP(RDMA_READ_REQUEST); - hwords += sizeof(ohdr->u.rc.reth) / 4; + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); bth2 = qp->s_psn++ & IPATH_PSN_MASK; - if ((int)(qp->s_psn - qp->s_next_psn) > 0) + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) qp->s_next_psn = qp->s_psn; ss = NULL; len = 0; @@ -489,20 +554,6 @@ int ipath_make_rc_req(struct ipath_qp *qp, if (qp->s_cur == qp->s_size) qp->s_cur = 0; break; - - case OP(RDMA_READ_REQUEST): - case OP(COMPARE_SWAP): - case OP(FETCH_ADD): - /* - * We shouldn't start anything new until this request is - * finished. The ACK will handle rescheduling us. XXX The - * number of outstanding ones is negotiated at connection - * setup time (see pg. 258,289)? XXX Also, if we support - * multiple outstanding requests, we need to check the WQE - * IB_SEND_FENCE flag and not send a new request if a RDMA - * read or atomic is pending. - */ - goto done; } if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0) bth2 |= 1 << 31; /* Request ACK. */ @@ -512,9 +563,10 @@ int ipath_make_rc_req(struct ipath_qp *qp, qp->s_cur_size = len; *bth0p = bth0 | (qp->s_state << 24); *bth2p = bth2; +done: return 1; -done: +bail: return 0; } @@ -524,7 +576,8 @@ done: * * This is called from ipath_rc_rcv() and only uses the receive * side QP state. - * Note that RDMA reads are handled in the send side QP state and tasklet. + * Note that RDMA reads and atomics are handled in the + * send side QP state and tasklet. */ static void send_rc_ack(struct ipath_qp *qp) { @@ -535,6 +588,10 @@ static void send_rc_ack(struct ipath_qp *qp) struct ipath_ib_header hdr; struct ipath_other_headers *ohdr; + /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ + if (qp->r_head_ack_queue != qp->s_tail_ack_queue) + goto queue_ack; + /* Construct the header. */ ohdr = &hdr.u.oth; lrh0 = IPATH_LRH_BTH; @@ -548,19 +605,14 @@ static void send_rc_ack(struct ipath_qp *qp) lrh0 = IPATH_LRH_GRH; } /* read pkey_index w/o lock (its atomic) */ - bth0 = ipath_get_pkey(dev->dd, qp->s_pkey_index); + bth0 = ipath_get_pkey(dev->dd, qp->s_pkey_index) | + OP(ACKNOWLEDGE) << 24; if (qp->r_nak_state) ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) | (qp->r_nak_state << IPATH_AETH_CREDIT_SHIFT)); else ohdr->u.aeth = ipath_compute_aeth(qp); - if (qp->r_ack_state >= OP(COMPARE_SWAP)) { - bth0 |= OP(ATOMIC_ACKNOWLEDGE) << 24; - ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->r_atomic_data); - hwords += sizeof(ohdr->u.at.atomic_ack_eth) / 4; - } else - bth0 |= OP(ACKNOWLEDGE) << 24; lrh0 |= qp->remote_ah_attr.sl << 4; hdr.lrh[0] = cpu_to_be16(lrh0); hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); @@ -574,31 +626,31 @@ static void send_rc_ack(struct ipath_qp *qp) * If we can send the ACK, clear the ACK state. */ if (ipath_verbs_send(dev->dd, hwords, (u32 *) &hdr, 0, NULL) == 0) { - qp->r_ack_state = OP(ACKNOWLEDGE); dev->n_unicast_xmit++; - } else { - /* - * We are out of PIO buffers at the moment. - * Pass responsibility for sending the ACK to the - * send tasklet so that when a PIO buffer becomes - * available, the ACK is sent ahead of other outgoing - * packets. - */ - dev->n_rc_qacks++; - spin_lock_irq(&qp->s_lock); - /* Don't coalesce if a RDMA read or atomic is pending. */ - if (qp->s_ack_state == OP(ACKNOWLEDGE) || - qp->s_ack_state < OP(RDMA_READ_REQUEST)) { - qp->s_ack_state = qp->r_ack_state; - qp->s_nak_state = qp->r_nak_state; - qp->s_ack_psn = qp->r_ack_psn; - qp->r_ack_state = OP(ACKNOWLEDGE); - } - spin_unlock_irq(&qp->s_lock); - - /* Call ipath_do_rc_send() in another thread. */ - tasklet_hi_schedule(&qp->s_task); + goto done; } + + /* + * We are out of PIO buffers at the moment. + * Pass responsibility for sending the ACK to the + * send tasklet so that when a PIO buffer becomes + * available, the ACK is sent ahead of other outgoing + * packets. + */ + dev->n_rc_qacks++; + +queue_ack: + spin_lock_irq(&qp->s_lock); + qp->s_flags |= IPATH_S_ACK_PENDING; + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + spin_unlock_irq(&qp->s_lock); + + /* Call ipath_do_rc_send() in another thread. */ + tasklet_hi_schedule(&qp->s_task); + +done: + return; } /** @@ -727,7 +779,7 @@ void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc) if (wqe->wr.opcode == IB_WR_RDMA_READ) dev->n_rc_resends++; else - dev->n_rc_resends += (int)qp->s_psn - (int)psn; + dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK; reset_psn(qp, psn); tasklet_hi_schedule(&qp->s_task); @@ -775,10 +827,6 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) list_del_init(&qp->timerwait); spin_unlock(&dev->pending_lock); - /* Nothing is pending to ACK/NAK. */ - if (unlikely(qp->s_last == qp->s_tail)) - goto bail; - /* * Note that NAKs implicitly ACK outstanding SEND and RDMA write * requests and implicitly NAK RDMA read and atomic requests issued @@ -806,7 +854,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) */ if ((wqe->wr.opcode == IB_WR_RDMA_READ && (opcode != OP(RDMA_READ_RESPONSE_LAST) || - ipath_cmp24(ack_psn, wqe->lpsn) != 0)) || + ipath_cmp24(ack_psn, wqe->lpsn) != 0)) || ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && (opcode != OP(ATOMIC_ACKNOWLEDGE) || @@ -824,20 +872,33 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) */ goto bail; } - if (wqe->wr.opcode == IB_WR_RDMA_READ || - wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || - wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) - tasklet_hi_schedule(&qp->s_task); + if (qp->s_num_rd_atomic && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { + qp->s_num_rd_atomic--; + /* Restart sending task if fence is complete */ + if ((qp->s_flags & IPATH_S_FENCE_PENDING) && + !qp->s_num_rd_atomic) { + qp->s_flags &= ~IPATH_S_FENCE_PENDING; + tasklet_hi_schedule(&qp->s_task); + } else if (qp->s_flags & IPATH_S_RDMAR_PENDING) { + qp->s_flags &= ~IPATH_S_RDMAR_PENDING; + tasklet_hi_schedule(&qp->s_task); + } + } /* Post a send completion queue entry if requested. */ - if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) || + if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || (wqe->wr.send_flags & IB_SEND_SIGNALED)) { wc.wr_id = wqe->wr.wr_id; wc.status = IB_WC_SUCCESS; wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; wc.vendor_err = 0; wc.byte_len = wqe->length; + wc.imm_data = 0; wc.qp = &qp->ibqp; wc.src_qp = qp->remote_qpn; + wc.wc_flags = 0; wc.pkey_index = 0; wc.slid = qp->remote_ah_attr.dlid; wc.sl = qp->remote_ah_attr.sl; @@ -854,15 +915,19 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) if (qp->s_last == qp->s_cur) { if (++qp->s_cur >= qp->s_size) qp->s_cur = 0; + qp->s_last = qp->s_cur; + if (qp->s_last == qp->s_tail) + break; wqe = get_swqe_ptr(qp, qp->s_cur); qp->s_state = OP(SEND_LAST); qp->s_psn = wqe->psn; + } else { + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + if (qp->s_last == qp->s_tail) + break; + wqe = get_swqe_ptr(qp, qp->s_last); } - if (++qp->s_last >= qp->s_size) - qp->s_last = 0; - wqe = get_swqe_ptr(qp, qp->s_last); - if (qp->s_last == qp->s_tail) - break; } switch (aeth >> 29) { @@ -874,6 +939,18 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) list_add_tail(&qp->timerwait, &dev->pending[dev->pending_index]); spin_unlock(&dev->pending_lock); + /* + * If we get a partial ACK for a resent operation, + * we can stop resending the earlier packets and + * continue with the next packet the receiver wants. + */ + if (ipath_cmp24(qp->s_psn, psn) <= 0) { + reset_psn(qp, psn + 1); + tasklet_hi_schedule(&qp->s_task); + } + } else if (ipath_cmp24(qp->s_psn, psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = psn + 1; } ipath_get_credit(qp, aeth); qp->s_rnr_retry = qp->s_rnr_retry_cnt; @@ -884,22 +961,23 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) case 1: /* RNR NAK */ dev->n_rnr_naks++; + if (qp->s_last == qp->s_tail) + goto bail; if (qp->s_rnr_retry == 0) { - if (qp->s_last == qp->s_tail) - goto bail; - wc.status = IB_WC_RNR_RETRY_EXC_ERR; goto class_b; } if (qp->s_rnr_retry_cnt < 7) qp->s_rnr_retry--; - if (qp->s_last == qp->s_tail) - goto bail; /* The last valid PSN is the previous PSN. */ update_last_psn(qp, psn - 1); - dev->n_rc_resends += (int)qp->s_psn - (int)psn; + if (wqe->wr.opcode == IB_WR_RDMA_READ) + dev->n_rc_resends++; + else + dev->n_rc_resends += + (qp->s_psn - psn) & IPATH_PSN_MASK; reset_psn(qp, psn); @@ -910,26 +988,20 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) goto bail; case 3: /* NAK */ - /* The last valid PSN seen is the previous request's. */ - if (qp->s_last != qp->s_tail) - update_last_psn(qp, wqe->psn - 1); + if (qp->s_last == qp->s_tail) + goto bail; + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK) { case 0: /* PSN sequence error */ dev->n_seq_naks++; /* - * Back up to the responder's expected PSN. XXX + * Back up to the responder's expected PSN. * Note that we might get a NAK in the middle of an * RDMA READ response which terminates the RDMA * READ. */ - if (qp->s_last == qp->s_tail) - break; - - if (ipath_cmp24(psn, wqe->psn) < 0) - break; - - /* Retry the request. */ ipath_restart_rc(qp, psn, &wc); break; @@ -1003,6 +1075,7 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, u32 psn, u32 hdrsize, u32 pmtu, int header_in_data) { + struct ipath_swqe *wqe; unsigned long flags; struct ib_wc wc; int diff; @@ -1032,6 +1105,10 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, goto ack_done; } + if (unlikely(qp->s_last == qp->s_tail)) + goto ack_done; + wqe = get_swqe_ptr(qp, qp->s_last); + switch (opcode) { case OP(ACKNOWLEDGE): case OP(ATOMIC_ACKNOWLEDGE): @@ -1042,38 +1119,49 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, aeth = be32_to_cpu(((__be32 *) data)[0]); data += sizeof(__be32); } - if (opcode == OP(ATOMIC_ACKNOWLEDGE)) - *(u64 *) qp->s_sge.sge.vaddr = *(u64 *) data; + if (opcode == OP(ATOMIC_ACKNOWLEDGE)) { + u64 val; + + if (!header_in_data) { + __be32 *p = ohdr->u.at.atomic_ack_eth; + + val = ((u64) be32_to_cpu(p[0]) << 32) | + be32_to_cpu(p[1]); + } else + val = be64_to_cpu(((__be64 *) data)[0]); + *(u64 *) wqe->sg_list[0].vaddr = val; + } if (!do_rc_ack(qp, aeth, psn, opcode) || opcode != OP(RDMA_READ_RESPONSE_FIRST)) goto ack_done; hdrsize += 4; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; /* - * do_rc_ack() has already checked the PSN so skip - * the sequence check. + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. */ - goto rdma_read; + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_middle; case OP(RDMA_READ_RESPONSE_MIDDLE): /* no AETH, no ACK */ if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) { dev->n_rdma_seq++; - if (qp->s_last != qp->s_tail) - ipath_restart_rc(qp, qp->s_last_psn + 1, &wc); + ipath_restart_rc(qp, qp->s_last_psn + 1, &wc); goto ack_done; } - rdma_read: - if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST))) - goto ack_done; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + read_middle: if (unlikely(tlen != (hdrsize + pmtu + 4))) - goto ack_done; - if (unlikely(pmtu >= qp->s_len)) - goto ack_done; + goto ack_len_err; + if (unlikely(pmtu >= qp->s_rdma_read_len)) + goto ack_len_err; + /* We got a response so update the timeout. */ - if (unlikely(qp->s_last == qp->s_tail || - get_swqe_ptr(qp, qp->s_last)->wr.opcode != - IB_WR_RDMA_READ)) - goto ack_done; spin_lock(&dev->pending_lock); if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait)) list_move_tail(&qp->timerwait, @@ -1082,67 +1170,97 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, /* * Update the RDMA receive state but do the copy w/o * holding the locks and blocking interrupts. - * XXX Yet another place that affects relaxed RDMA order - * since we don't want s_sge modified. */ - qp->s_len -= pmtu; + qp->s_rdma_read_len -= pmtu; update_last_psn(qp, psn); spin_unlock_irqrestore(&qp->s_lock, flags); - ipath_copy_sge(&qp->s_sge, data, pmtu); + ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu); goto bail; - case OP(RDMA_READ_RESPONSE_LAST): - /* ACKs READ req. */ + case OP(RDMA_READ_RESPONSE_ONLY): if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) { dev->n_rdma_seq++; - if (qp->s_last != qp->s_tail) - ipath_restart_rc(qp, qp->s_last_psn + 1, &wc); + ipath_restart_rc(qp, qp->s_last_psn + 1, &wc); goto ack_done; } - /* FALLTHROUGH */ - case OP(RDMA_READ_RESPONSE_ONLY): - if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST))) - goto ack_done; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 0 && <= pmtu. + * Remember to account for the AETH header (4) and + * ICRC (4). + */ + if (unlikely(tlen < (hdrsize + pad + 8))) + goto ack_len_err; /* - * Get the number of bytes the message was padded by. + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. */ + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_last; + + case OP(RDMA_READ_RESPONSE_LAST): + /* ACKs READ req. */ + if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) { + dev->n_rdma_seq++; + ipath_restart_rc(qp, qp->s_last_psn + 1, &wc); + goto ack_done; + } + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* Get the number of bytes the message was padded by. */ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; /* * Check that the data size is >= 1 && <= pmtu. * Remember to account for the AETH header (4) and * ICRC (4). */ - if (unlikely(tlen <= (hdrsize + pad + 8))) { - /* XXX Need to generate an error CQ entry. */ - goto ack_done; - } + if (unlikely(tlen <= (hdrsize + pad + 8))) + goto ack_len_err; + read_last: tlen -= hdrsize + pad + 8; - if (unlikely(tlen != qp->s_len)) { - /* XXX Need to generate an error CQ entry. */ - goto ack_done; - } + if (unlikely(tlen != qp->s_rdma_read_len)) + goto ack_len_err; if (!header_in_data) aeth = be32_to_cpu(ohdr->u.aeth); else { aeth = be32_to_cpu(((__be32 *) data)[0]); data += sizeof(__be32); } - ipath_copy_sge(&qp->s_sge, data, tlen); - if (do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST))) { - /* - * Change the state so we contimue - * processing new requests and wake up the - * tasklet if there are posted sends. - */ - qp->s_state = OP(SEND_LAST); - if (qp->s_tail != qp->s_head) - tasklet_hi_schedule(&qp->s_task); - } + ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen); + (void) do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST)); goto ack_done; } ack_done: spin_unlock_irqrestore(&qp->s_lock, flags); + goto bail; + +ack_op_err: + wc.status = IB_WC_LOC_QP_OP_ERR; + goto ack_err; + +ack_len_err: + wc.status = IB_WC_LOC_LEN_ERR; +ack_err: + wc.wr_id = wqe->wr.wr_id; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc.vendor_err = 0; + wc.byte_len = 0; + wc.imm_data = 0; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.wc_flags = 0; + wc.pkey_index = 0; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + wc.dlid_path_bits = 0; + wc.port_num = 0; + ipath_sqerror_qp(qp, &wc); bail: return; } @@ -1162,7 +1280,7 @@ bail: * incoming RC packet for the given QP. * Called at interrupt level. * Return 1 if no more processing is needed; otherwise return 0 to - * schedule a response to be sent and the s_lock unlocked. + * schedule a response to be sent. */ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, struct ipath_other_headers *ohdr, @@ -1173,25 +1291,23 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, int diff, int header_in_data) { - struct ib_reth *reth; + struct ipath_ack_entry *e; + u8 i, prev; + int old_req; if (diff > 0) { /* * Packet sequence error. * A NAK will ACK earlier sends and RDMA writes. - * Don't queue the NAK if a RDMA read, atomic, or - * NAK is pending though. + * Don't queue the NAK if we already sent one. */ - if (qp->s_ack_state != OP(ACKNOWLEDGE) || - qp->r_nak_state != 0) - goto done; - if (qp->r_ack_state < OP(COMPARE_SWAP)) { - qp->r_ack_state = OP(SEND_ONLY); + if (!qp->r_nak_state) { qp->r_nak_state = IB_NAK_PSN_ERROR; /* Use the expected PSN. */ qp->r_ack_psn = qp->r_psn; + goto send_ack; } - goto send_ack; + goto done; } /* @@ -1204,8 +1320,46 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, * can coalesce an outstanding duplicate ACK. We have to * send the earliest so that RDMA reads can be restarted at * the requester's expected PSN. + * + * First, find where this duplicate PSN falls within the + * ACKs previously sent. */ - if (opcode == OP(RDMA_READ_REQUEST)) { + psn &= IPATH_PSN_MASK; + e = NULL; + old_req = 1; + spin_lock_irq(&qp->s_lock); + for (i = qp->r_head_ack_queue; ; i = prev) { + if (i == qp->s_tail_ack_queue) + old_req = 0; + if (i) + prev = i - 1; + else + prev = IPATH_MAX_RDMA_ATOMIC; + if (prev == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[prev]; + if (!e->opcode) { + e = NULL; + break; + } + if (ipath_cmp24(psn, e->psn) >= 0) + break; + } + switch (opcode) { + case OP(RDMA_READ_REQUEST): { + struct ib_reth *reth; + u32 offset; + u32 len; + + /* + * If we didn't find the RDMA read request in the ack queue, + * or the send tasklet is already backed up to send an + * earlier entry, we can ignore this request. + */ + if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req) + goto unlock_done; /* RETH comes after BTH */ if (!header_in_data) reth = &ohdr->u.rc.reth; @@ -1214,88 +1368,87 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, data += sizeof(*reth); } /* - * If we receive a duplicate RDMA request, it means the - * requester saw a sequence error and needs to restart - * from an earlier point. We can abort the current - * RDMA read send in that case. + * Address range must be a subset of the original + * request and start on pmtu boundaries. + * We reuse the old ack_queue slot since the requester + * should not back up and request an earlier PSN for the + * same request. */ - spin_lock_irq(&qp->s_lock); - if (qp->s_ack_state != OP(ACKNOWLEDGE) && - (qp->s_hdrwords || ipath_cmp24(psn, qp->s_ack_psn) >= 0)) { - /* - * We are already sending earlier requested data. - * Don't abort it to send later out of sequence data. - */ - spin_unlock_irq(&qp->s_lock); - goto done; - } - qp->s_rdma_len = be32_to_cpu(reth->length); - if (qp->s_rdma_len != 0) { + offset = ((psn - e->psn) & IPATH_PSN_MASK) * + ib_mtu_enum_to_int(qp->path_mtu); + len = be32_to_cpu(reth->length); + if (unlikely(offset + len > e->rdma_sge.sge.sge_length)) + goto unlock_done; + if (len != 0) { u32 rkey = be32_to_cpu(reth->rkey); u64 vaddr = be64_to_cpu(reth->vaddr); int ok; - /* - * Address range must be a subset of the original - * request and start on pmtu boundaries. - */ - ok = ipath_rkey_ok(qp, &qp->s_rdma_sge, - qp->s_rdma_len, vaddr, rkey, + ok = ipath_rkey_ok(qp, &e->rdma_sge, + len, vaddr, rkey, IB_ACCESS_REMOTE_READ); - if (unlikely(!ok)) { - spin_unlock_irq(&qp->s_lock); - goto done; - } + if (unlikely(!ok)) + goto unlock_done; } else { - qp->s_rdma_sge.sg_list = NULL; - qp->s_rdma_sge.num_sge = 0; - qp->s_rdma_sge.sge.mr = NULL; - qp->s_rdma_sge.sge.vaddr = NULL; - qp->s_rdma_sge.sge.length = 0; - qp->s_rdma_sge.sge.sge_length = 0; + e->rdma_sge.sg_list = NULL; + e->rdma_sge.num_sge = 0; + e->rdma_sge.sge.mr = NULL; + e->rdma_sge.sge.vaddr = NULL; + e->rdma_sge.sge.length = 0; + e->rdma_sge.sge.sge_length = 0; } - qp->s_ack_state = opcode; - qp->s_ack_psn = psn; - spin_unlock_irq(&qp->s_lock); - tasklet_hi_schedule(&qp->s_task); - goto send_ack; + e->psn = psn; + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_tail_ack_queue = prev; + break; } - /* - * A pending RDMA read will ACK anything before it so - * ignore earlier duplicate requests. - */ - if (qp->s_ack_state != OP(ACKNOWLEDGE)) - goto done; - - /* - * If an ACK is pending, don't replace the pending ACK - * with an earlier one since the later one will ACK the earlier. - * Also, if we already have a pending atomic, send it. - */ - if (qp->r_ack_state != OP(ACKNOWLEDGE) && - (ipath_cmp24(psn, qp->r_ack_psn) <= 0 || - qp->r_ack_state >= OP(COMPARE_SWAP))) - goto send_ack; - switch (opcode) { case OP(COMPARE_SWAP): - case OP(FETCH_ADD): + case OP(FETCH_ADD): { /* - * Check for the PSN of the last atomic operation - * performed and resend the result if found. + * If we didn't find the atomic request in the ack queue + * or the send tasklet is already backed up to send an + * earlier entry, we can ignore this request. */ - if ((psn & IPATH_PSN_MASK) != qp->r_atomic_psn) - goto done; + if (!e || e->opcode != (u8) opcode || old_req) + goto unlock_done; + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_tail_ack_queue = prev; + break; + } + + default: + if (old_req) + goto unlock_done; + /* + * Resend the most recent ACK if this request is + * after all the previous RDMA reads and atomics. + */ + if (i == qp->r_head_ack_queue) { + spin_unlock_irq(&qp->s_lock); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->r_psn - 1; + goto send_ack; + } + /* + * Resend the RDMA read or atomic op which + * ACKs this duplicate request. + */ + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_tail_ack_queue = i; break; } - qp->r_ack_state = opcode; qp->r_nak_state = 0; - qp->r_ack_psn = psn; -send_ack: - return 0; + spin_unlock_irq(&qp->s_lock); + tasklet_hi_schedule(&qp->s_task); +unlock_done: + spin_unlock_irq(&qp->s_lock); done: return 1; + +send_ack: + return 0; } static void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err) @@ -1391,15 +1544,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, opcode == OP(SEND_LAST_WITH_IMMEDIATE)) break; nack_inv: - /* - * A NAK will ACK earlier sends and RDMA writes. - * Don't queue the NAK if a RDMA read, atomic, or NAK - * is pending though. - */ - if (qp->r_ack_state >= OP(COMPARE_SWAP)) - goto send_ack; ipath_rc_error(qp, IB_WC_REM_INV_REQ_ERR); - qp->r_ack_state = OP(SEND_ONLY); qp->r_nak_state = IB_NAK_INVALID_REQUEST; qp->r_ack_psn = qp->r_psn; goto send_ack; @@ -1441,9 +1586,8 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, * Don't queue the NAK if a RDMA read or atomic * is pending though. */ - if (qp->r_ack_state >= OP(COMPARE_SWAP)) - goto send_ack; - qp->r_ack_state = OP(SEND_ONLY); + if (qp->r_nak_state) + goto done; qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer; qp->r_ack_psn = qp->r_psn; goto send_ack; @@ -1567,7 +1711,19 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, goto rnr_nak; goto send_last_imm; - case OP(RDMA_READ_REQUEST): + case OP(RDMA_READ_REQUEST): { + struct ipath_ack_entry *e; + u32 len; + u8 next; + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto nack_acc; + next = qp->r_head_ack_queue + 1; + if (next > IPATH_MAX_RDMA_ATOMIC) + next = 0; + if (unlikely(next == qp->s_tail_ack_queue)) + goto nack_inv; + e = &qp->s_ack_queue[qp->r_head_ack_queue]; /* RETH comes after BTH */ if (!header_in_data) reth = &ohdr->u.rc.reth; @@ -1575,72 +1731,75 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, reth = (struct ib_reth *)data; data += sizeof(*reth); } - if (unlikely(!(qp->qp_access_flags & - IB_ACCESS_REMOTE_READ))) - goto nack_acc; - spin_lock_irq(&qp->s_lock); - qp->s_rdma_len = be32_to_cpu(reth->length); - if (qp->s_rdma_len != 0) { + len = be32_to_cpu(reth->length); + if (len) { u32 rkey = be32_to_cpu(reth->rkey); u64 vaddr = be64_to_cpu(reth->vaddr); int ok; /* Check rkey & NAK */ - ok = ipath_rkey_ok(qp, &qp->s_rdma_sge, - qp->s_rdma_len, vaddr, rkey, - IB_ACCESS_REMOTE_READ); - if (unlikely(!ok)) { - spin_unlock_irq(&qp->s_lock); + ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr, + rkey, IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) goto nack_acc; - } /* * Update the next expected PSN. We add 1 later * below, so only add the remainder here. */ - if (qp->s_rdma_len > pmtu) - qp->r_psn += (qp->s_rdma_len - 1) / pmtu; + if (len > pmtu) + qp->r_psn += (len - 1) / pmtu; } else { - qp->s_rdma_sge.sg_list = NULL; - qp->s_rdma_sge.num_sge = 0; - qp->s_rdma_sge.sge.mr = NULL; - qp->s_rdma_sge.sge.vaddr = NULL; - qp->s_rdma_sge.sge.length = 0; - qp->s_rdma_sge.sge.sge_length = 0; + e->rdma_sge.sg_list = NULL; + e->rdma_sge.num_sge = 0; + e->rdma_sge.sge.mr = NULL; + e->rdma_sge.sge.vaddr = NULL; + e->rdma_sge.sge.length = 0; + e->rdma_sge.sge.sge_length = 0; } + e->opcode = opcode; + e->psn = psn; /* * We need to increment the MSN here instead of when we * finish sending the result since a duplicate request would * increment it more than once. */ qp->r_msn++; - - qp->s_ack_state = opcode; - qp->s_ack_psn = psn; - spin_unlock_irq(&qp->s_lock); - qp->r_psn++; qp->r_state = opcode; qp->r_nak_state = 0; + barrier(); + qp->r_head_ack_queue = next; /* Call ipath_do_rc_send() in another thread. */ tasklet_hi_schedule(&qp->s_task); goto done; + } case OP(COMPARE_SWAP): case OP(FETCH_ADD): { struct ib_atomic_eth *ateth; + struct ipath_ack_entry *e; u64 vaddr; + atomic64_t *maddr; u64 sdata; u32 rkey; + u8 next; + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_acc; + next = qp->r_head_ack_queue + 1; + if (next > IPATH_MAX_RDMA_ATOMIC) + next = 0; + if (unlikely(next == qp->s_tail_ack_queue)) + goto nack_inv; if (!header_in_data) ateth = &ohdr->u.atomic_eth; - else { + else ateth = (struct ib_atomic_eth *)data; - data += sizeof(*ateth); - } - vaddr = be64_to_cpu(ateth->vaddr); + vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) | + be32_to_cpu(ateth->vaddr[1]); if (unlikely(vaddr & (sizeof(u64) - 1))) goto nack_inv; rkey = be32_to_cpu(ateth->rkey); @@ -1649,63 +1808,50 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, sizeof(u64), vaddr, rkey, IB_ACCESS_REMOTE_ATOMIC))) goto nack_acc; - if (unlikely(!(qp->qp_access_flags & - IB_ACCESS_REMOTE_ATOMIC))) - goto nack_acc; /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; sdata = be64_to_cpu(ateth->swap_data); - spin_lock_irq(&dev->pending_lock); - qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr; - if (opcode == OP(FETCH_ADD)) - *(u64 *) qp->r_sge.sge.vaddr = - qp->r_atomic_data + sdata; - else if (qp->r_atomic_data == - be64_to_cpu(ateth->compare_data)) - *(u64 *) qp->r_sge.sge.vaddr = sdata; - spin_unlock_irq(&dev->pending_lock); + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + e->atomic_data = (opcode == OP(FETCH_ADD)) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + be64_to_cpu(ateth->compare_data), + sdata); + e->opcode = opcode; + e->psn = psn & IPATH_PSN_MASK; qp->r_msn++; - qp->r_atomic_psn = psn & IPATH_PSN_MASK; - psn |= 1 << 31; - break; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + barrier(); + qp->r_head_ack_queue = next; + + /* Call ipath_do_rc_send() in another thread. */ + tasklet_hi_schedule(&qp->s_task); + + goto done; } default: - /* Drop packet for unknown opcodes. */ - goto done; + /* NAK unknown opcodes. */ + goto nack_inv; } qp->r_psn++; qp->r_state = opcode; + qp->r_ack_psn = psn; qp->r_nak_state = 0; /* Send an ACK if requested or required. */ - if (psn & (1 << 31)) { - /* - * Coalesce ACKs unless there is a RDMA READ or - * ATOMIC pending. - */ - if (qp->r_ack_state < OP(COMPARE_SWAP)) { - qp->r_ack_state = opcode; - qp->r_ack_psn = psn; - } + if (psn & (1 << 31)) goto send_ack; - } goto done; nack_acc: - /* - * A NAK will ACK earlier sends and RDMA writes. - * Don't queue the NAK if a RDMA read, atomic, or NAK - * is pending though. - */ - if (qp->r_ack_state < OP(COMPARE_SWAP)) { - ipath_rc_error(qp, IB_WC_REM_ACCESS_ERR); - qp->r_ack_state = OP(RDMA_WRITE_ONLY); - qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; - qp->r_ack_psn = qp->r_psn; - } + ipath_rc_error(qp, IB_WC_REM_ACCESS_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; + send_ack: - /* Send ACK right away unless the send tasklet has a pending ACK. */ - if (qp->s_ack_state == OP(ACKNOWLEDGE)) - send_rc_ack(qp); + send_rc_ack(qp); done: return; diff --git a/drivers/infiniband/hw/ipath/ipath_registers.h b/drivers/infiniband/hw/ipath/ipath_registers.h index dffc76016d3c..c182bcd62098 100644 --- a/drivers/infiniband/hw/ipath/ipath_registers.h +++ b/drivers/infiniband/hw/ipath/ipath_registers.h @@ -126,9 +126,18 @@ #define INFINIPATH_E_RESET 0x0004000000000000ULL #define INFINIPATH_E_HARDWARE 0x0008000000000000ULL +/* + * this is used to print "common" packet errors only when the + * __IPATH_ERRPKTDBG bit is set in ipath_debug. + */ +#define INFINIPATH_E_PKTERRS ( INFINIPATH_E_SPKTLEN \ + | INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_RVCRC \ + | INFINIPATH_E_RICRC | INFINIPATH_E_RSHORTPKTLEN \ + | INFINIPATH_E_REBP ) + /* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ /* TXEMEMPARITYERR bit 0: PIObuf, 1: PIOpbc, 2: launchfifo - * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2: eagerTID, 3: expTID + * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2: expTID, 3: eagerTID * bit 4: flag buffer, 5: datainfo, 6: header info */ #define INFINIPATH_HWE_TXEMEMPARITYERR_MASK 0xFULL #define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40 @@ -143,8 +152,8 @@ /* rxe mem parity errors (shift by INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) */ #define INFINIPATH_HWE_RXEMEMPARITYERR_RCVBUF 0x01ULL #define INFINIPATH_HWE_RXEMEMPARITYERR_LOOKUPQ 0x02ULL -#define INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID 0x04ULL -#define INFINIPATH_HWE_RXEMEMPARITYERR_EXPTID 0x08ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_EXPTID 0x04ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID 0x08ULL #define INFINIPATH_HWE_RXEMEMPARITYERR_FLAGBUF 0x10ULL #define INFINIPATH_HWE_RXEMEMPARITYERR_DATAINFO 0x20ULL #define INFINIPATH_HWE_RXEMEMPARITYERR_HDRINFO 0x40ULL @@ -299,13 +308,6 @@ #define INFINIPATH_XGXS_RX_POL_SHIFT 19 #define INFINIPATH_XGXS_RX_POL_MASK 0xfULL -#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL /* 40 bits valid */ - -/* TID entries (memory), HT-only */ -#define INFINIPATH_RT_VALID 0x8000000000000000ULL -#define INFINIPATH_RT_ADDR_SHIFT 0 -#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFF -#define INFINIPATH_RT_BUFSIZE_SHIFT 48 /* * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c index e86cb171872e..d9c2a9b15d86 100644 --- a/drivers/infiniband/hw/ipath/ipath_ruc.c +++ b/drivers/infiniband/hw/ipath/ipath_ruc.c @@ -202,6 +202,7 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) wq->tail = tail; ret = 1; + qp->r_wrid_valid = 1; if (handler) { u32 n; @@ -229,7 +230,6 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) } } spin_unlock_irqrestore(&rq->lock, flags); - qp->r_wrid_valid = 1; bail: return ret; @@ -255,6 +255,7 @@ static void ipath_ruc_loopback(struct ipath_qp *sqp) unsigned long flags; struct ib_wc wc; u64 sdata; + atomic64_t *maddr; qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn); if (!qp) { @@ -265,7 +266,8 @@ static void ipath_ruc_loopback(struct ipath_qp *sqp) again: spin_lock_irqsave(&sqp->s_lock, flags); - if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_SEND_OK)) { + if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_SEND_OK) || + qp->s_rnr_timeout) { spin_unlock_irqrestore(&sqp->s_lock, flags); goto done; } @@ -310,7 +312,7 @@ again: sqp->s_rnr_retry--; dev->n_rnr_naks++; sqp->s_rnr_timeout = - ib_ipath_rnr_table[sqp->r_min_rnr_timer]; + ib_ipath_rnr_table[qp->r_min_rnr_timer]; ipath_insert_rnr_queue(sqp); goto done; } @@ -343,20 +345,22 @@ again: wc.sl = sqp->remote_ah_attr.sl; wc.dlid_path_bits = 0; wc.port_num = 0; + spin_lock_irqsave(&sqp->s_lock, flags); ipath_sqerror_qp(sqp, &wc); + spin_unlock_irqrestore(&sqp->s_lock, flags); goto done; } break; case IB_WR_RDMA_READ: + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_READ))) + goto acc_err; if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length, wqe->wr.wr.rdma.remote_addr, wqe->wr.wr.rdma.rkey, IB_ACCESS_REMOTE_READ))) goto acc_err; - if (unlikely(!(qp->qp_access_flags & - IB_ACCESS_REMOTE_READ))) - goto acc_err; qp->r_sge.sge = wqe->sg_list[0]; qp->r_sge.sg_list = wqe->sg_list + 1; qp->r_sge.num_sge = wqe->wr.num_sge; @@ -364,22 +368,22 @@ again: case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_ATOMIC))) + goto acc_err; if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64), - wqe->wr.wr.rdma.remote_addr, - wqe->wr.wr.rdma.rkey, + wqe->wr.wr.atomic.remote_addr, + wqe->wr.wr.atomic.rkey, IB_ACCESS_REMOTE_ATOMIC))) goto acc_err; /* Perform atomic OP and save result. */ - sdata = wqe->wr.wr.atomic.swap; - spin_lock_irqsave(&dev->pending_lock, flags); - qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr; - if (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) - *(u64 *) qp->r_sge.sge.vaddr = - qp->r_atomic_data + sdata; - else if (qp->r_atomic_data == wqe->wr.wr.atomic.compare_add) - *(u64 *) qp->r_sge.sge.vaddr = sdata; - spin_unlock_irqrestore(&dev->pending_lock, flags); - *(u64 *) sqp->s_sge.sge.vaddr = qp->r_atomic_data; + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = wqe->wr.wr.atomic.compare_add; + *(u64 *) sqp->s_sge.sge.vaddr = + (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + sdata, wqe->wr.wr.atomic.swap); goto send_comp; default: @@ -440,7 +444,7 @@ again: send_comp: sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; - if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &sqp->s_flags) || + if (!(sqp->s_flags & IPATH_S_SIGNAL_REQ_WR) || (wqe->wr.send_flags & IB_SEND_SIGNALED)) { wc.wr_id = wqe->wr.wr_id; wc.status = IB_WC_SUCCESS; @@ -502,7 +506,7 @@ void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev) * We clear the tasklet flag now since we are committing to return * from the tasklet function. */ - clear_bit(IPATH_S_BUSY, &qp->s_flags); + clear_bit(IPATH_S_BUSY, &qp->s_busy); tasklet_unlock(&qp->s_task); want_buffer(dev->dd); dev->n_piowait++; @@ -541,6 +545,9 @@ int ipath_post_ruc_send(struct ipath_qp *qp, struct ib_send_wr *wr) wr->sg_list[0].addr & (sizeof(u64) - 1))) { ret = -EINVAL; goto bail; + } else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) { + ret = -EINVAL; + goto bail; } /* IB spec says that num_sge == 0 is OK. */ if (wr->num_sge > qp->s_max_sge) { @@ -647,7 +654,7 @@ void ipath_do_ruc_send(unsigned long data) u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); struct ipath_other_headers *ohdr; - if (test_and_set_bit(IPATH_S_BUSY, &qp->s_flags)) + if (test_and_set_bit(IPATH_S_BUSY, &qp->s_busy)) goto bail; if (unlikely(qp->remote_ah_attr.dlid == dev->dd->ipath_lid)) { @@ -683,19 +690,15 @@ again: */ spin_lock_irqsave(&qp->s_lock, flags); - /* Sending responses has higher priority over sending requests. */ - if (qp->s_ack_state != IB_OPCODE_RC_ACKNOWLEDGE && - (bth0 = ipath_make_rc_ack(qp, ohdr, pmtu)) != 0) - bth2 = qp->s_ack_psn++ & IPATH_PSN_MASK; - else if (!((qp->ibqp.qp_type == IB_QPT_RC) ? - ipath_make_rc_req(qp, ohdr, pmtu, &bth0, &bth2) : - ipath_make_uc_req(qp, ohdr, pmtu, &bth0, &bth2))) { + if (!((qp->ibqp.qp_type == IB_QPT_RC) ? + ipath_make_rc_req(qp, ohdr, pmtu, &bth0, &bth2) : + ipath_make_uc_req(qp, ohdr, pmtu, &bth0, &bth2))) { /* * Clear the busy bit before unlocking to avoid races with * adding new work queue items and then failing to process * them. */ - clear_bit(IPATH_S_BUSY, &qp->s_flags); + clear_bit(IPATH_S_BUSY, &qp->s_busy); spin_unlock_irqrestore(&qp->s_lock, flags); goto bail; } @@ -728,7 +731,7 @@ again: goto again; clear: - clear_bit(IPATH_S_BUSY, &qp->s_flags); + clear_bit(IPATH_S_BUSY, &qp->s_busy); bail: return; } diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/infiniband/hw/ipath/ipath_stats.c index 30a825928fcf..9307f7187ca5 100644 --- a/drivers/infiniband/hw/ipath/ipath_stats.c +++ b/drivers/infiniband/hw/ipath/ipath_stats.c @@ -207,7 +207,7 @@ void ipath_get_faststats(unsigned long opaque) * don't access the chip while running diags, or memory diags can * fail */ - if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT) || + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_INITTED) || ipath_diag_inuse) /* but re-arm the timer, for diags case; won't hurt other */ goto done; @@ -237,11 +237,13 @@ void ipath_get_faststats(unsigned long opaque) if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) && time_after(jiffies, dd->ipath_unmasktime)) { char ebuf[256]; - ipath_decode_err(ebuf, sizeof ebuf, + int iserr; + iserr = ipath_decode_err(ebuf, sizeof ebuf, (dd->ipath_maskederrs & ~dd-> ipath_ignorederrs)); if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) & - ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL)) + ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_PKTERRS )) ipath_dev_err(dd, "Re-enabling masked errors " "(%s)\n", ebuf); else { @@ -252,8 +254,12 @@ void ipath_get_faststats(unsigned long opaque) * them. So only complain about these at debug * level. */ - ipath_dbg("Disabling frequent queue full errors " - "(%s)\n", ebuf); + if (iserr) + ipath_dbg("Re-enabling queue full errors (%s)\n", + ebuf); + else + ipath_cdbg(ERRPKT, "Re-enabling packet" + " problem interrupt (%s)\n", ebuf); } dd->ipath_maskederrs = dd->ipath_ignorederrs; ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c index 325d6634ff53..1c2b03c2ef5e 100644 --- a/drivers/infiniband/hw/ipath/ipath_uc.c +++ b/drivers/infiniband/hw/ipath/ipath_uc.c @@ -42,7 +42,7 @@ static void complete_last_send(struct ipath_qp *qp, struct ipath_swqe *wqe, { if (++qp->s_last == qp->s_size) qp->s_last = 0; - if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) || + if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || (wqe->wr.send_flags & IB_SEND_SIGNALED)) { wc->wr_id = wqe->wr.wr_id; wc->status = IB_WC_SUCCESS; @@ -344,13 +344,13 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, send_first: if (qp->r_reuse_sge) { qp->r_reuse_sge = 0; - qp->r_sge = qp->s_rdma_sge; + qp->r_sge = qp->s_rdma_read_sge; } else if (!ipath_get_rwqe(qp, 0)) { dev->n_pkt_drops++; goto done; } /* Save the WQE so we can reuse it in case of an error. */ - qp->s_rdma_sge = qp->r_sge; + qp->s_rdma_read_sge = qp->r_sge; qp->r_rcv_len = 0; if (opcode == OP(SEND_ONLY)) goto send_last; diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c index 9a3e54664ee4..a518f7c8fa83 100644 --- a/drivers/infiniband/hw/ipath/ipath_ud.c +++ b/drivers/infiniband/hw/ipath/ipath_ud.c @@ -308,6 +308,11 @@ int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr) goto bail; } + if (wr->wr.ud.ah->pd != qp->ibqp.pd) { + ret = -EPERM; + goto bail; + } + /* IB spec says that num_sge == 0 is OK. */ if (wr->num_sge > qp->s_max_sge) { ret = -EINVAL; @@ -467,7 +472,7 @@ int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr) done: /* Queue the completion status entry. */ - if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) || + if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || (wr->send_flags & IB_SEND_SIGNALED)) { wc.wr_id = wr->wr_id; wc.status = IB_WC_SUCCESS; @@ -647,6 +652,7 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh)); ipath_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh)); + qp->r_wrid_valid = 0; wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; wc.opcode = IB_WC_RECV; diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 2aaacdb7e52a..18c6df2052c2 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -438,6 +438,10 @@ void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data, struct ipath_mcast *mcast; struct ipath_mcast_qp *p; + if (lnh != IPATH_LRH_GRH) { + dev->n_pkt_drops++; + goto bail; + } mcast = ipath_mcast_find(&hdr->u.l.grh.dgid); if (mcast == NULL) { dev->n_pkt_drops++; @@ -445,8 +449,7 @@ void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data, } dev->n_multicast_rcv++; list_for_each_entry_rcu(p, &mcast->qp_list, list) - ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data, - tlen, p->qp); + ipath_qp_rcv(dev, hdr, 1, data, tlen, p->qp); /* * Notify ipath_multicast_detach() if it is waiting for us * to finish. @@ -773,7 +776,6 @@ int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords, /* +1 is for the qword padding of pbc */ plen = hdrwords + ((len + 3) >> 2) + 1; if (unlikely((plen << 2) > dd->ipath_ibmaxlen)) { - ipath_dbg("packet len 0x%x too long, failing\n", plen); ret = -EINVAL; goto bail; } @@ -980,14 +982,14 @@ static int ipath_query_device(struct ib_device *ibdev, props->max_cqe = ib_ipath_max_cqes; props->max_mr = dev->lk_table.max; props->max_pd = ib_ipath_max_pds; - props->max_qp_rd_atom = 1; - props->max_qp_init_rd_atom = 1; + props->max_qp_rd_atom = IPATH_MAX_RDMA_ATOMIC; + props->max_qp_init_rd_atom = 255; /* props->max_res_rd_atom */ props->max_srq = ib_ipath_max_srqs; props->max_srq_wr = ib_ipath_max_srq_wrs; props->max_srq_sge = ib_ipath_max_srq_sges; /* props->local_ca_ack_delay */ - props->atomic_cap = IB_ATOMIC_HCA; + props->atomic_cap = IB_ATOMIC_GLOB; props->max_pkeys = ipath_get_npkeys(dev->dd); props->max_mcast_grp = ib_ipath_max_mcast_grps; props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached; @@ -1557,7 +1559,6 @@ int ipath_register_ib_device(struct ipath_devdata *dd) dev->node_type = RDMA_NODE_IB_CA; dev->phys_port_cnt = 1; dev->dma_device = &dd->pcidev->dev; - dev->class_dev.dev = dev->dma_device; dev->query_device = ipath_query_device; dev->modify_device = ipath_modify_device; dev->query_port = ipath_query_port; diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h index c0c8d5b24a7d..7c4929f1cb5b 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.h +++ b/drivers/infiniband/hw/ipath/ipath_verbs.h @@ -40,9 +40,12 @@ #include <linux/interrupt.h> #include <linux/kref.h> #include <rdma/ib_pack.h> +#include <rdma/ib_user_verbs.h> #include "ipath_layer.h" +#define IPATH_MAX_RDMA_ATOMIC 4 + #define QPN_MAX (1 << 24) #define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) @@ -89,7 +92,7 @@ struct ib_reth { } __attribute__ ((packed)); struct ib_atomic_eth { - __be64 vaddr; + __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */ __be32 rkey; __be64 swap_data; __be64 compare_data; @@ -108,7 +111,7 @@ struct ipath_other_headers { } rc; struct { __be32 aeth; - __be64 atomic_ack_eth; + __be32 atomic_ack_eth[2]; } at; __be32 imm_data; __be32 aeth; @@ -186,7 +189,7 @@ struct ipath_mmap_info { struct ipath_cq_wc { u32 head; /* index of next entry to fill */ u32 tail; /* index of next ib_poll_cq() entry */ - struct ib_wc queue[1]; /* this is actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc queue[1]; /* this is actually size ibcq.cqe + 1 */ }; /* @@ -312,6 +315,19 @@ struct ipath_sge_state { }; /* + * This structure holds the information that the send tasklet needs + * to send a RDMA read response or atomic operation. + */ +struct ipath_ack_entry { + u8 opcode; + u32 psn; + union { + struct ipath_sge_state rdma_sge; + u64 atomic_data; + }; +}; + +/* * Variables prefixed with s_ are for the requester (sender). * Variables prefixed with r_ are for the responder (receiver). * Variables prefixed with ack_ are for responder replies. @@ -333,24 +349,24 @@ struct ipath_qp { struct ipath_mmap_info *ip; struct ipath_sge_state *s_cur_sge; struct ipath_sge_state s_sge; /* current send request data */ - /* current RDMA read send data */ - struct ipath_sge_state s_rdma_sge; + struct ipath_ack_entry s_ack_queue[IPATH_MAX_RDMA_ATOMIC + 1]; + struct ipath_sge_state s_ack_rdma_sge; + struct ipath_sge_state s_rdma_read_sge; struct ipath_sge_state r_sge; /* current receive data */ spinlock_t s_lock; - unsigned long s_flags; + unsigned long s_busy; u32 s_hdrwords; /* size of s_hdr in 32 bit words */ u32 s_cur_size; /* size of send packet in bytes */ u32 s_len; /* total length of s_sge */ - u32 s_rdma_len; /* total length of s_rdma_sge */ + u32 s_rdma_read_len; /* total length of s_rdma_read_sge */ u32 s_next_psn; /* PSN for next request */ u32 s_last_psn; /* last response PSN processed */ u32 s_psn; /* current packet sequence number */ - u32 s_ack_psn; /* PSN for RDMA_READ */ + u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */ + u32 s_ack_psn; /* PSN for acking sends and RDMA writes */ u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */ u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ u64 r_wr_id; /* ID for current receive WQE */ - u64 r_atomic_data; /* data for last atomic op */ - u32 r_atomic_psn; /* PSN of last atomic op */ u32 r_len; /* total length of r_sge */ u32 r_rcv_len; /* receive data len processed */ u32 r_psn; /* expected rcv packet sequence number */ @@ -360,12 +376,13 @@ struct ipath_qp { u8 s_ack_state; /* opcode of packet to ACK */ u8 s_nak_state; /* non-zero if NAK is pending */ u8 r_state; /* opcode of last packet received */ - u8 r_ack_state; /* opcode of packet to ACK */ u8 r_nak_state; /* non-zero if NAK is pending */ u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ u8 r_reuse_sge; /* for UC receive errors */ u8 r_sge_inx; /* current index into sg_list */ u8 r_wrid_valid; /* r_wrid set but CQ entry not yet made */ + u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */ + u8 r_head_ack_queue; /* index into s_ack_queue[] */ u8 qp_access_flags; u8 s_max_sge; /* size of s_wq->sg_list */ u8 s_retry_cnt; /* number of times to retry */ @@ -374,6 +391,10 @@ struct ipath_qp { u8 s_rnr_retry; /* requester RNR retry counter */ u8 s_wait_credit; /* limit number of unacked packets sent */ u8 s_pkey_index; /* PKEY index to use */ + u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */ + u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ + u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + u8 s_flags; u8 timeout; /* Timeout for this QP */ enum ib_mtu path_mtu; u32 remote_qpn; @@ -390,11 +411,16 @@ struct ipath_qp { struct ipath_sge r_sg_list[0]; /* verified SGEs */ }; +/* Bit definition for s_busy. */ +#define IPATH_S_BUSY 0 + /* * Bit definitions for s_flags. */ -#define IPATH_S_BUSY 0 -#define IPATH_S_SIGNAL_REQ_WR 1 +#define IPATH_S_SIGNAL_REQ_WR 0x01 +#define IPATH_S_FENCE_PENDING 0x02 +#define IPATH_S_RDMAR_PENDING 0x04 +#define IPATH_S_ACK_PENDING 0x08 #define IPATH_PSN_CREDIT 2048 @@ -706,8 +732,6 @@ int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); int ipath_destroy_srq(struct ib_srq *ibsrq); -void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig); - int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, @@ -757,9 +781,6 @@ u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr, void ipath_do_ruc_send(unsigned long data); -u32 ipath_make_rc_ack(struct ipath_qp *qp, struct ipath_other_headers *ohdr, - u32 pmtu); - int ipath_make_rc_req(struct ipath_qp *qp, struct ipath_other_headers *ohdr, u32 pmtu, u32 *bth0p, u32 *bth2p); diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 0d9b7d06bbc2..773145e29947 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -1013,14 +1013,14 @@ static struct { u64 latest_fw; u32 flags; } mthca_hca_table[] = { - [TAVOR] = { .latest_fw = MTHCA_FW_VER(3, 4, 0), + [TAVOR] = { .latest_fw = MTHCA_FW_VER(3, 5, 0), .flags = 0 }, - [ARBEL_COMPAT] = { .latest_fw = MTHCA_FW_VER(4, 7, 600), + [ARBEL_COMPAT] = { .latest_fw = MTHCA_FW_VER(4, 8, 200), .flags = MTHCA_FLAG_PCIE }, - [ARBEL_NATIVE] = { .latest_fw = MTHCA_FW_VER(5, 1, 400), + [ARBEL_NATIVE] = { .latest_fw = MTHCA_FW_VER(5, 2, 0), .flags = MTHCA_FLAG_MEMFREE | MTHCA_FLAG_PCIE }, - [SINAI] = { .latest_fw = MTHCA_FW_VER(1, 1, 0), + [SINAI] = { .latest_fw = MTHCA_FW_VER(1, 2, 0), .flags = MTHCA_FLAG_MEMFREE | MTHCA_FLAG_PCIE | MTHCA_FLAG_SINAI_OPT } @@ -1135,7 +1135,7 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type) goto err_cmd; if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) { - mthca_warn(mdev, "HCA FW version %d.%d.%d is old (%d.%d.%d is current).\n", + mthca_warn(mdev, "HCA FW version %d.%d.%3d is old (%d.%d.%3d is current).\n", (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff, (int) (mdev->fw_ver & 0xffff), (int) (mthca_hca_table[hca_type].latest_fw >> 32), diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c index ee561c569d5f..aa6c70a6a36f 100644 --- a/drivers/infiniband/hw/mthca/mthca_mr.c +++ b/drivers/infiniband/hw/mthca/mthca_mr.c @@ -297,7 +297,8 @@ out: int mthca_write_mtt_size(struct mthca_dev *dev) { - if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy) + if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy || + !(dev->mthca_flags & MTHCA_FLAG_FMR)) /* * Be friendly to WRITE_MTT command * and leave two empty slots for the @@ -355,7 +356,8 @@ int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, int size = mthca_write_mtt_size(dev); int chunk; - if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy) + if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy || + !(dev->mthca_flags & MTHCA_FLAG_FMR)) return __mthca_write_mtt(dev, mtt, start_index, buffer_list, list_len); while (list_len > 0) { diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 0725ad7ad9bf..47e6fd46d9c2 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1293,7 +1293,6 @@ int mthca_register_device(struct mthca_dev *dev) dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.phys_port_cnt = dev->limits.num_ports; dev->ib_dev.dma_device = &dev->pdev->dev; - dev->ib_dev.class_dev.dev = &dev->pdev->dev; dev->ib_dev.query_device = mthca_query_device; dev->ib_dev.query_port = mthca_query_port; dev->ib_dev.modify_device = mthca_modify_device; diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 1c6b63aca268..8fe6fee7a97a 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -1419,11 +1419,10 @@ void mthca_free_qp(struct mthca_dev *dev, * unref the mem-free tables and free the QPN in our table. */ if (!qp->ibqp.uobject) { - mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq), qp->qpn, + mthca_cq_clean(dev, recv_cq, qp->qpn, qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); - if (qp->ibqp.send_cq != qp->ibqp.recv_cq) - mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq), qp->qpn, - qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); + if (send_cq != recv_cq) + mthca_cq_clean(dev, send_cq, qp->qpn, NULL); mthca_free_memfree(dev, qp); mthca_free_wqe_buf(dev, qp); |