diff options
-rw-r--r-- | drivers/infiniband/hw/hfi1/init.c | 6 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/rc.c | 10 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/ruc.c | 3 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/uc.c | 10 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/ud.c | 18 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/verbs.c | 226 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/verbs.h | 25 | ||||
-rw-r--r-- | drivers/infiniband/hw/qib/qib_rc.c | 10 | ||||
-rw-r--r-- | drivers/infiniband/hw/qib/qib_ruc.c | 2 | ||||
-rw-r--r-- | drivers/infiniband/hw/qib/qib_uc.c | 10 | ||||
-rw-r--r-- | drivers/infiniband/hw/qib/qib_ud.c | 13 | ||||
-rw-r--r-- | drivers/infiniband/hw/qib/qib_verbs.c | 22 | ||||
-rw-r--r-- | drivers/infiniband/hw/qib/qib_verbs.h | 3 | ||||
-rw-r--r-- | drivers/infiniband/sw/rdmavt/Kconfig | 2 | ||||
-rw-r--r-- | drivers/infiniband/sw/rdmavt/qp.c | 258 | ||||
-rw-r--r-- | drivers/infiniband/sw/rdmavt/qp.h | 2 | ||||
-rw-r--r-- | drivers/infiniband/sw/rdmavt/vt.c | 12 | ||||
-rw-r--r-- | include/rdma/rdma_vt.h | 22 | ||||
-rw-r--r-- | include/rdma/rdmavt_qp.h | 4 |
19 files changed, 344 insertions, 314 deletions
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 1e770a133779..09044905284f 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1504,9 +1504,6 @@ static int __init hfi1_mod_init(void) idr_init(&hfi1_unit_table); hfi1_dbg_init(); - ret = hfi1_wss_init(); - if (ret < 0) - goto bail_wss; ret = pci_register_driver(&hfi1_pci_driver); if (ret < 0) { pr_err("Unable to register driver: error %d\n", -ret); @@ -1515,8 +1512,6 @@ static int __init hfi1_mod_init(void) goto bail; /* all OK */ bail_dev: - hfi1_wss_exit(); -bail_wss: hfi1_dbg_exit(); idr_destroy(&hfi1_unit_table); dev_cleanup(); @@ -1533,7 +1528,6 @@ static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); node_affinity_destroy_all(); - hfi1_wss_exit(); hfi1_dbg_exit(); idr_destroy(&hfi1_unit_table); diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 9bd63abb2dfe..673b31ebf0ac 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -1644,7 +1644,8 @@ read_middle: qp->s_rdma_read_len -= pmtu; update_last_psn(qp, psn); spin_unlock_irqrestore(&qp->s_lock, flags); - hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, false, false); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, pmtu, false, false); goto bail; case OP(RDMA_READ_RESPONSE_ONLY): @@ -1684,7 +1685,8 @@ read_last: if (unlikely(tlen != qp->s_rdma_read_len)) goto ack_len_err; aeth = be32_to_cpu(ohdr->u.aeth); - hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, false, false); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, tlen, false, false); WARN_ON(qp->s_rdma_read_sge.num_sge); (void)do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST), 0, rcd); @@ -2144,7 +2146,7 @@ send_middle: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto nack_inv; - hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -2200,7 +2202,7 @@ send_last: wc.byte_len = tlen + qp->r_rcv_len; if (unlikely(wc.byte_len > qp->r_len)) goto nack_inv; - hfi1_copy_sge(&qp->r_sge, data, tlen, true, copy_last); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last); rvt_put_ss(&qp->r_sge); qp->r_msn++; if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 17b49b4309a3..223eaf184934 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -361,7 +361,8 @@ do_write: if (len > sge->sge_length) len = sge->sge_length; WARN_ON_ONCE(len == 0); - hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, + len, release, copy_last); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index e254dcec6f64..48a320c01552 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -426,7 +426,7 @@ send_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto rewind; - hfi1_copy_sge(&qp->r_sge, data, pmtu, false, false); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false); break; case OP(SEND_LAST_WITH_IMMEDIATE): @@ -449,7 +449,7 @@ send_last: if (unlikely(wc.byte_len > qp->r_len)) goto rewind; wc.opcode = IB_WC_RECV; - hfi1_copy_sge(&qp->r_sge, data, tlen, false, false); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false); rvt_put_ss(&qp->s_rdma_read_sge); last_imm: wc.wr_id = qp->r_wr_id; @@ -523,7 +523,7 @@ rdma_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto drop; - hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -550,7 +550,7 @@ rdma_last_imm: } wc.byte_len = qp->r_len; wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - hfi1_copy_sge(&qp->r_sge, data, tlen, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); goto last_imm; @@ -564,7 +564,7 @@ rdma_last: tlen -= (hdrsize + extra_bytes); if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) goto drop; - hfi1_copy_sge(&qp->r_sge, data, tlen, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); break; diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 70d39fc450a1..e55bc4280d58 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -210,8 +210,8 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) } hfi1_make_grh(ibp, &grh, &grd, 0, 0); - hfi1_copy_sge(&qp->r_sge, &grh, - sizeof(grh), true, false); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(grh), true, false); wc.wc_flags |= IB_WC_GRH; } else { rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); @@ -228,7 +228,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (len > sge->sge_length) len = sge->sge_length; WARN_ON_ONCE(len == 0); - hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, true, false); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; @@ -1019,8 +1019,8 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) goto drop; } if (packet->grh) { - hfi1_copy_sge(&qp->r_sge, packet->grh, - sizeof(struct ib_grh), true, false); + rvt_copy_sge(qp, &qp->r_sge, packet->grh, + sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else if (packet->etype == RHF_RCV_TYPE_BYPASS) { struct ib_grh grh; @@ -1030,14 +1030,14 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) * out when creating 16B, add back the GRH here. */ hfi1_make_ext_grh(packet, &grh, slid, dlid); - hfi1_copy_sge(&qp->r_sge, &grh, - sizeof(struct ib_grh), true, false); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else { rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); } - hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), - true, false); + rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), + true, false); rvt_put_ss(&qp->r_sge); if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) return; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 16b88948383b..0a47b46f979e 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -129,8 +129,6 @@ unsigned short piothreshold = 256; module_param(piothreshold, ushort, S_IRUGO); MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio"); -#define COPY_CACHELESS 1 -#define COPY_ADAPTIVE 2 static unsigned int sge_copy_mode; module_param(sge_copy_mode, uint, S_IRUGO); MODULE_PARM_DESC(sge_copy_mode, @@ -151,159 +149,13 @@ static int pio_wait(struct rvt_qp *qp, /* 16B trailing buffer */ static const u8 trail_buf[MAX_16B_PADDING]; -static uint wss_threshold; +static uint wss_threshold = 80; module_param(wss_threshold, uint, S_IRUGO); MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy"); static uint wss_clean_period = 256; module_param(wss_clean_period, uint, S_IRUGO); MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned"); -/* memory working set size */ -struct hfi1_wss { - unsigned long *entries; - atomic_t total_count; - atomic_t clean_counter; - atomic_t clean_entry; - - int threshold; - int num_entries; - long pages_mask; -}; - -static struct hfi1_wss wss; - -int hfi1_wss_init(void) -{ - long llc_size; - long llc_bits; - long table_size; - long table_bits; - - /* check for a valid percent range - default to 80 if none or invalid */ - if (wss_threshold < 1 || wss_threshold > 100) - wss_threshold = 80; - /* reject a wildly large period */ - if (wss_clean_period > 1000000) - wss_clean_period = 256; - /* reject a zero period */ - if (wss_clean_period == 0) - wss_clean_period = 1; - - /* - * Calculate the table size - the next power of 2 larger than the - * LLC size. LLC size is in KiB. - */ - llc_size = wss_llc_size() * 1024; - table_size = roundup_pow_of_two(llc_size); - - /* one bit per page in rounded up table */ - llc_bits = llc_size / PAGE_SIZE; - table_bits = table_size / PAGE_SIZE; - wss.pages_mask = table_bits - 1; - wss.num_entries = table_bits / BITS_PER_LONG; - - wss.threshold = (llc_bits * wss_threshold) / 100; - if (wss.threshold == 0) - wss.threshold = 1; - - atomic_set(&wss.clean_counter, wss_clean_period); - - wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries), - GFP_KERNEL); - if (!wss.entries) { - hfi1_wss_exit(); - return -ENOMEM; - } - - return 0; -} - -void hfi1_wss_exit(void) -{ - /* coded to handle partially initialized and repeat callers */ - kfree(wss.entries); - wss.entries = NULL; -} - -/* - * Advance the clean counter. When the clean period has expired, - * clean an entry. - * - * This is implemented in atomics to avoid locking. Because multiple - * variables are involved, it can be racy which can lead to slightly - * inaccurate information. Since this is only a heuristic, this is - * OK. Any innaccuracies will clean themselves out as the counter - * advances. That said, it is unlikely the entry clean operation will - * race - the next possible racer will not start until the next clean - * period. - * - * The clean counter is implemented as a decrement to zero. When zero - * is reached an entry is cleaned. - */ -static void wss_advance_clean_counter(void) -{ - int entry; - int weight; - unsigned long bits; - - /* become the cleaner if we decrement the counter to zero */ - if (atomic_dec_and_test(&wss.clean_counter)) { - /* - * Set, not add, the clean period. This avoids an issue - * where the counter could decrement below the clean period. - * Doing a set can result in lost decrements, slowing the - * clean advance. Since this a heuristic, this possible - * slowdown is OK. - * - * An alternative is to loop, advancing the counter by a - * clean period until the result is > 0. However, this could - * lead to several threads keeping another in the clean loop. - * This could be mitigated by limiting the number of times - * we stay in the loop. - */ - atomic_set(&wss.clean_counter, wss_clean_period); - - /* - * Uniquely grab the entry to clean and move to next. - * The current entry is always the lower bits of - * wss.clean_entry. The table size, wss.num_entries, - * is always a power-of-2. - */ - entry = (atomic_inc_return(&wss.clean_entry) - 1) - & (wss.num_entries - 1); - - /* clear the entry and count the bits */ - bits = xchg(&wss.entries[entry], 0); - weight = hweight64((u64)bits); - /* only adjust the contended total count if needed */ - if (weight) - atomic_sub(weight, &wss.total_count); - } -} - -/* - * Insert the given address into the working set array. - */ -static void wss_insert(void *address) -{ - u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask; - u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ - u32 nr = page & (BITS_PER_LONG - 1); - - if (!test_and_set_bit(nr, &wss.entries[entry])) - atomic_inc(&wss.total_count); - - wss_advance_clean_counter(); -} - -/* - * Is the working set larger than the threshold? - */ -static inline bool wss_exceeds_threshold(void) -{ - return atomic_read(&wss.total_count) >= wss.threshold; -} - /* * Translate ib_wr_opcode into ib_wc_opcode. */ @@ -438,79 +290,6 @@ static const u32 pio_opmask[BIT(3)] = { */ __be64 ib_hfi1_sys_image_guid; -/** - * hfi1_copy_sge - copy data to SGE memory - * @ss: the SGE state - * @data: the data to copy - * @length: the length of the data - * @release: boolean to release MR - * @copy_last: do a separate copy of the last 8 bytes - */ -void hfi1_copy_sge( - struct rvt_sge_state *ss, - void *data, u32 length, - bool release, - bool copy_last) -{ - struct rvt_sge *sge = &ss->sge; - int i; - bool in_last = false; - bool cacheless_copy = false; - - if (sge_copy_mode == COPY_CACHELESS) { - cacheless_copy = length >= PAGE_SIZE; - } else if (sge_copy_mode == COPY_ADAPTIVE) { - if (length >= PAGE_SIZE) { - /* - * NOTE: this *assumes*: - * o The first vaddr is the dest. - * o If multiple pages, then vaddr is sequential. - */ - wss_insert(sge->vaddr); - if (length >= (2 * PAGE_SIZE)) - wss_insert(sge->vaddr + PAGE_SIZE); - - cacheless_copy = wss_exceeds_threshold(); - } else { - wss_advance_clean_counter(); - } - } - if (copy_last) { - if (length > 8) { - length -= 8; - } else { - copy_last = false; - in_last = true; - } - } - -again: - while (length) { - u32 len = rvt_get_sge_length(sge, length); - - WARN_ON_ONCE(len == 0); - if (unlikely(in_last)) { - /* enforce byte transfer ordering */ - for (i = 0; i < len; i++) - ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; - } else if (cacheless_copy) { - cacheless_memcpy(sge->vaddr, data, len); - } else { - memcpy(sge->vaddr, data, len); - } - rvt_update_sge(ss, len, release); - data += len; - length -= len; - } - - if (copy_last) { - copy_last = false; - in_last = true; - length = 8; - goto again; - } -} - /* * Make sure the QP is ready and able to accept the given opcode. */ @@ -1949,6 +1728,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size; dd->verbs_dev.rdi.dparms.nports = dd->num_pports; dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd); + dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode; + dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; + dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; /* post send table */ dd->verbs_dev.rdi.post_parms = hfi1_post_parms; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index d4164114396e..eb99e8df6251 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -315,9 +315,6 @@ void hfi1_put_txreq(struct verbs_txreq *tx); int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps); -void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, - bool release, bool copy_last); - void hfi1_cnp_rcv(struct hfi1_packet *packet); void hfi1_uc_rcv(struct hfi1_packet *packet); @@ -393,28 +390,6 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc); -int hfi1_wss_init(void); -void hfi1_wss_exit(void); - -/* platform specific: return the lowest level cache (llc) size, in KiB */ -static inline int wss_llc_size(void) -{ - /* assume that the boot CPU value is universal for all CPUs */ - return boot_cpu_data.x86_cache_size; -} - -/* platform specific: cacheless copy */ -static inline void cacheless_memcpy(void *dst, void *src, size_t n) -{ - /* - * Use the only available X64 cacheless copy. Add a __user cast - * to quiet sparse. The src agument is already in the kernel so - * there are no security issues. The extra fault recovery machinery - * is not invoked. - */ - __copy_user_nocache(dst, (void __user *)src, n, 0); -} - static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr) { return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ); diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index f35fdeb14347..034b9729f991 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -1425,7 +1425,8 @@ read_middle: qp->s_rdma_read_len -= pmtu; update_last_psn(qp, psn); spin_unlock_irqrestore(&qp->s_lock, flags); - qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, pmtu, false, false); goto bail; case OP(RDMA_READ_RESPONSE_ONLY): @@ -1471,7 +1472,8 @@ read_last: if (unlikely(tlen != qp->s_rdma_read_len)) goto ack_len_err; aeth = be32_to_cpu(ohdr->u.aeth); - qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, tlen, false, false); WARN_ON(qp->s_rdma_read_sge.num_sge); (void) do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST), 0, rcd); @@ -1844,7 +1846,7 @@ send_middle: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto nack_inv; - qib_copy_sge(&qp->r_sge, data, pmtu, 1); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -1890,7 +1892,7 @@ send_last: wc.byte_len = tlen + qp->r_rcv_len; if (unlikely(wc.byte_len > qp->r_len)) goto nack_inv; - qib_copy_sge(&qp->r_sge, data, tlen, 1); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); qp->r_msn++; if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index f8a7de795beb..bc2a9e208d18 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -354,7 +354,7 @@ again: if (len > sge->sge_length) len = sge->sge_length; BUG_ON(len == 0); - qib_copy_sge(&qp->r_sge, sge->vaddr, len, release); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, release, false); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 3e54bc11e0ae..0a090569148c 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -359,7 +359,7 @@ send_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto rewind; - qib_copy_sge(&qp->r_sge, data, pmtu, 0); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false); break; case OP(SEND_LAST_WITH_IMMEDIATE): @@ -385,7 +385,7 @@ send_last: if (unlikely(wc.byte_len > qp->r_len)) goto rewind; wc.opcode = IB_WC_RECV; - qib_copy_sge(&qp->r_sge, data, tlen, 0); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false); rvt_put_ss(&qp->s_rdma_read_sge); last_imm: wc.wr_id = qp->r_wr_id; @@ -449,7 +449,7 @@ rdma_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto drop; - qib_copy_sge(&qp->r_sge, data, pmtu, 1); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -479,7 +479,7 @@ rdma_last_imm: } wc.byte_len = qp->r_len; wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - qib_copy_sge(&qp->r_sge, data, tlen, 1); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); goto last_imm; @@ -495,7 +495,7 @@ rdma_last: tlen -= (hdrsize + pad + 4); if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) goto drop; - qib_copy_sge(&qp->r_sge, data, tlen, 1); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); break; diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index f8d029a2390f..b12b9c3a6b5c 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -162,8 +162,8 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) const struct ib_global_route *grd = rdma_ah_read_grh(ah_attr); qib_make_grh(ibp, &grh, grd, 0, 0); - qib_copy_sge(&qp->r_sge, &grh, - sizeof(grh), 1); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(grh), true, false); wc.wc_flags |= IB_WC_GRH; } else rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); @@ -179,7 +179,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (len > sge->sge_length) len = sge->sge_length; BUG_ON(len == 0); - qib_copy_sge(&qp->r_sge, sge->vaddr, len, 1); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; @@ -551,12 +551,13 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, goto drop; } if (has_grh) { - qib_copy_sge(&qp->r_sge, &hdr->u.l.grh, - sizeof(struct ib_grh), 1); + rvt_copy_sge(qp, &qp->r_sge, &hdr->u.l.grh, + sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); - qib_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1); + rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), + true, false); rvt_put_ss(&qp->r_sge); if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) return; diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 26ab78e5aaa7..ae6d42cc9651 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -131,27 +131,6 @@ const enum ib_wc_opcode ib_qib_wc_opcode[] = { */ __be64 ib_qib_sys_image_guid; -/** - * qib_copy_sge - copy data to SGE memory - * @ss: the SGE state - * @data: the data to copy - * @length: the length of the data - */ -void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, int release) -{ - struct rvt_sge *sge = &ss->sge; - - while (length) { - u32 len = rvt_get_sge_length(sge, length); - - WARN_ON_ONCE(len == 0); - memcpy(sge->vaddr, data, len); - rvt_update_sge(ss, len, release); - data += len; - length -= len; - } -} - /* * Count the number of DMA descriptors needed to send length bytes of data. * Don't modify the qib_sge_state to get the count. @@ -1631,6 +1610,7 @@ int qib_register_ib_device(struct qib_devdata *dd) dd->verbs_dev.rdi.dparms.node = dd->assigned_node_id; dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_IBA_IB; dd->verbs_dev.rdi.dparms.max_mad_size = IB_MGMT_MAD_SIZE; + dd->verbs_dev.rdi.dparms.sge_copy_mode = RVT_SGE_COPY_MEMCPY; qib_fill_device_attr(dd); diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index df90a7a41534..0c5e623ec70c 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -292,9 +292,6 @@ void qib_put_txreq(struct qib_verbs_txreq *tx); int qib_verbs_send(struct rvt_qp *qp, struct ib_header *hdr, u32 hdrwords, struct rvt_sge_state *ss, u32 len); -void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, - int release); - void qib_uc_rcv(struct qib_ibport *ibp, struct ib_header *hdr, int has_grh, void *data, u32 tlen, struct rvt_qp *qp); diff --git a/drivers/infiniband/sw/rdmavt/Kconfig b/drivers/infiniband/sw/rdmavt/Kconfig index 98e798007f75..7df896a18d38 100644 --- a/drivers/infiniband/sw/rdmavt/Kconfig +++ b/drivers/infiniband/sw/rdmavt/Kconfig @@ -1,6 +1,6 @@ config INFINIBAND_RDMAVT tristate "RDMA verbs transport library" - depends on 64BIT && ARCH_DMA_ADDR_T_64BIT + depends on X86_64 && ARCH_DMA_ADDR_T_64BIT depends on PCI select DMA_VIRT_OPS ---help--- diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index a036a5368103..d969b0803e6f 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -118,6 +118,187 @@ const int ib_rvt_state_ops[IB_QPS_ERR + 1] = { }; EXPORT_SYMBOL(ib_rvt_state_ops); +/* platform specific: return the last level cache (llc) size, in KiB */ +static int rvt_wss_llc_size(void) +{ + /* assume that the boot CPU value is universal for all CPUs */ + return boot_cpu_data.x86_cache_size; +} + +/* platform specific: cacheless copy */ +static void cacheless_memcpy(void *dst, void *src, size_t n) +{ + /* + * Use the only available X64 cacheless copy. Add a __user cast + * to quiet sparse. The src agument is already in the kernel so + * there are no security issues. The extra fault recovery machinery + * is not invoked. + */ + __copy_user_nocache(dst, (void __user *)src, n, 0); +} + +void rvt_wss_exit(struct rvt_dev_info *rdi) +{ + struct rvt_wss *wss = rdi->wss; + + if (!wss) + return; + + /* coded to handle partially initialized and repeat callers */ + kfree(wss->entries); + wss->entries = NULL; + kfree(rdi->wss); + rdi->wss = NULL; +} + +/** + * rvt_wss_init - Init wss data structures + * + * Return: 0 on success + */ +int rvt_wss_init(struct rvt_dev_info *rdi) +{ + unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode; + unsigned int wss_threshold = rdi->dparms.wss_threshold; + unsigned int wss_clean_period = rdi->dparms.wss_clean_period; + long llc_size; + long llc_bits; + long table_size; + long table_bits; + struct rvt_wss *wss; + int node = rdi->dparms.node; + + if (sge_copy_mode != RVT_SGE_COPY_ADAPTIVE) { + rdi->wss = NULL; + return 0; + } + + rdi->wss = kzalloc_node(sizeof(*rdi->wss), GFP_KERNEL, node); + if (!rdi->wss) + return -ENOMEM; + wss = rdi->wss; + + /* check for a valid percent range - default to 80 if none or invalid */ + if (wss_threshold < 1 || wss_threshold > 100) + wss_threshold = 80; + + /* reject a wildly large period */ + if (wss_clean_period > 1000000) + wss_clean_period = 256; + + /* reject a zero period */ + if (wss_clean_period == 0) + wss_clean_period = 1; + + /* + * Calculate the table size - the next power of 2 larger than the + * LLC size. LLC size is in KiB. + */ + llc_size = rvt_wss_llc_size() * 1024; + table_size = roundup_pow_of_two(llc_size); + + /* one bit per page in rounded up table */ + llc_bits = llc_size / PAGE_SIZE; + table_bits = table_size / PAGE_SIZE; + wss->pages_mask = table_bits - 1; + wss->num_entries = table_bits / BITS_PER_LONG; + + wss->threshold = (llc_bits * wss_threshold) / 100; + if (wss->threshold == 0) + wss->threshold = 1; + + wss->clean_period = wss_clean_period; + atomic_set(&wss->clean_counter, wss_clean_period); + + wss->entries = kcalloc_node(wss->num_entries, sizeof(*wss->entries), + GFP_KERNEL, node); + if (!wss->entries) { + rvt_wss_exit(rdi); + return -ENOMEM; + } + + return 0; +} + +/* + * Advance the clean counter. When the clean period has expired, + * clean an entry. + * + * This is implemented in atomics to avoid locking. Because multiple + * variables are involved, it can be racy which can lead to slightly + * inaccurate information. Since this is only a heuristic, this is + * OK. Any innaccuracies will clean themselves out as the counter + * advances. That said, it is unlikely the entry clean operation will + * race - the next possible racer will not start until the next clean + * period. + * + * The clean counter is implemented as a decrement to zero. When zero + * is reached an entry is cleaned. + */ +static void wss_advance_clean_counter(struct rvt_wss *wss) +{ + int entry; + int weight; + unsigned long bits; + + /* become the cleaner if we decrement the counter to zero */ + if (atomic_dec_and_test(&wss->clean_counter)) { + /* + * Set, not add, the clean period. This avoids an issue + * where the counter could decrement below the clean period. + * Doing a set can result in lost decrements, slowing the + * clean advance. Since this a heuristic, this possible + * slowdown is OK. + * + * An alternative is to loop, advancing the counter by a + * clean period until the result is > 0. However, this could + * lead to several threads keeping another in the clean loop. + * This could be mitigated by limiting the number of times + * we stay in the loop. + */ + atomic_set(&wss->clean_counter, wss->clean_period); + + /* + * Uniquely grab the entry to clean and move to next. + * The current entry is always the lower bits of + * wss.clean_entry. The table size, wss.num_entries, + * is always a power-of-2. + */ + entry = (atomic_inc_return(&wss->clean_entry) - 1) + & (wss->num_entries - 1); + + /* clear the entry and count the bits */ + bits = xchg(&wss->entries[entry], 0); + weight = hweight64((u64)bits); + /* only adjust the contended total count if needed */ + if (weight) + atomic_sub(weight, &wss->total_count); + } +} + +/* + * Insert the given address into the working set array. + */ +static void wss_insert(struct rvt_wss *wss, void *address) +{ + u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss->pages_mask; + u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ + u32 nr = page & (BITS_PER_LONG - 1); + + if (!test_and_set_bit(nr, &wss->entries[entry])) + atomic_inc(&wss->total_count); + + wss_advance_clean_counter(wss); +} + +/* + * Is the working set larger than the threshold? + */ +static inline bool wss_exceeds_threshold(struct rvt_wss *wss) +{ + return atomic_read(&wss->total_count) >= wss->threshold; +} + static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map) { @@ -2476,3 +2657,80 @@ void rvt_qp_iter(struct rvt_dev_info *rdi, rcu_read_unlock(); } EXPORT_SYMBOL(rvt_qp_iter); + +/** + * rvt_copy_sge - copy data to SGE memory + * @qp: associated QP + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + * @release: boolean to release MR + * @copy_last: do a separate copy of the last 8 bytes + */ +void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, + void *data, u32 length, + bool release, bool copy_last) +{ + struct rvt_sge *sge = &ss->sge; + int i; + bool in_last = false; + bool cacheless_copy = false; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + struct rvt_wss *wss = rdi->wss; + unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode; + + if (sge_copy_mode == RVT_SGE_COPY_CACHELESS) { + cacheless_copy = length >= PAGE_SIZE; + } else if (sge_copy_mode == RVT_SGE_COPY_ADAPTIVE) { + if (length >= PAGE_SIZE) { + /* + * NOTE: this *assumes*: + * o The first vaddr is the dest. + * o If multiple pages, then vaddr is sequential. + */ + wss_insert(wss, sge->vaddr); + if (length >= (2 * PAGE_SIZE)) + wss_insert(wss, (sge->vaddr + PAGE_SIZE)); + + cacheless_copy = wss_exceeds_threshold(wss); + } else { + wss_advance_clean_counter(wss); + } + } + + if (copy_last) { + if (length > 8) { + length -= 8; + } else { + copy_last = false; + in_last = true; + } + } + +again: + while (length) { + u32 len = rvt_get_sge_length(sge, length); + + WARN_ON_ONCE(len == 0); + if (unlikely(in_last)) { + /* enforce byte transfer ordering */ + for (i = 0; i < len; i++) + ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; + } else if (cacheless_copy) { + cacheless_memcpy(sge->vaddr, data, len); + } else { + memcpy(sge->vaddr, data, len); + } + rvt_update_sge(ss, len, release); + data += len; + length -= len; + } + + if (copy_last) { + copy_last = false; + in_last = true; + length = 8; + goto again; + } +} +EXPORT_SYMBOL(rvt_copy_sge); diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h index 264811fdc530..6d883972e0b8 100644 --- a/drivers/infiniband/sw/rdmavt/qp.h +++ b/drivers/infiniband/sw/rdmavt/qp.h @@ -66,4 +66,6 @@ int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr); int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); +int rvt_wss_init(struct rvt_dev_info *rdi); +void rvt_wss_exit(struct rvt_dev_info *rdi); #endif /* DEF_RVTQP_H */ diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index e3249d46bcef..723d3daf2eba 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -774,6 +774,13 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) goto bail_no_mr; } + /* Memory Working Set Size */ + ret = rvt_wss_init(rdi); + if (ret) { + rvt_pr_err(rdi, "Error in WSS init.\n"); + goto bail_mr; + } + /* Completion queues */ spin_lock_init(&rdi->n_cqs_lock); @@ -832,7 +839,7 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rdi->driver_f.port_callback); if (ret) { rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); - goto bail_mr; + goto bail_wss; } rvt_create_mad_agents(rdi); @@ -840,6 +847,8 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rvt_pr_info(rdi, "Registration with rdmavt done.\n"); return ret; +bail_wss: + rvt_wss_exit(rdi); bail_mr: rvt_mr_exit(rdi); @@ -863,6 +872,7 @@ void rvt_unregister_device(struct rvt_dev_info *rdi) rvt_free_mad_agents(rdi); ib_unregister_device(&rdi->ibdev); + rvt_wss_exit(rdi); rvt_mr_exit(rdi); rvt_qp_exit(rdi); } diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 0a888a9ecc96..7fa2f2d46a3c 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -149,6 +149,10 @@ struct rvt_ibport { #define RVT_CQN_MAX 16 /* maximum length of cq name */ +#define RVT_SGE_COPY_MEMCPY 0 +#define RVT_SGE_COPY_CACHELESS 1 +#define RVT_SGE_COPY_ADAPTIVE 2 + /* * Things that are driver specific, module parameters in hfi1 and qib */ @@ -161,6 +165,9 @@ struct rvt_driver_params { */ unsigned int lkey_table_size; unsigned int qp_table_size; + unsigned int sge_copy_mode; + unsigned int wss_threshold; + unsigned int wss_clean_period; int qpn_start; int qpn_inc; int qpn_res_start; @@ -193,6 +200,19 @@ struct rvt_ah { u8 log_pmtu; }; +/* memory working set size */ +struct rvt_wss { + unsigned long *entries; + atomic_t total_count; + atomic_t clean_counter; + atomic_t clean_entry; + + int threshold; + int num_entries; + long pages_mask; + unsigned int clean_period; +}; + struct rvt_dev_info; struct rvt_swqe; struct rvt_driver_provided { @@ -418,6 +438,8 @@ struct rvt_dev_info { u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ spinlock_t n_mcast_grps_lock; + /* Memory Working Set Size */ + struct rvt_wss *wss; }; /** diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 927f6d5b6d0f..eaf2593ca822 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -678,6 +678,10 @@ void rvt_del_timers_sync(struct rvt_qp *qp); void rvt_stop_rc_timers(struct rvt_qp *qp); void rvt_add_retry_timer(struct rvt_qp *qp); +void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, + void *data, u32 length, + bool release, bool copy_last); + /** * struct rvt_qp_iter - the iterator for QPs * @qp - the current QP |