author     Herbert Xu <herbert@gondor.apana.org.au>    2015-06-19 22:07:07 +0800
committer  Herbert Xu <herbert@gondor.apana.org.au>    2015-06-19 22:07:07 +0800
commit     c0b59fafe31bf91f589736be304d739b13952fdd
tree       0088a41c6b68132739294643be06734e3af67677 /drivers/infiniband/ulp/ipoib
parent     28bceeaaf81140d69647acd0eb7dc9312f27844a
parent     bfa1ce5f38938cc9e6c7f2d1011f88eba2b9e2b2
Merge branch 'mvebu/drivers' of git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc
Merge the mvebu/drivers branch of the arm-soc tree which contains
just a single patch bfa1ce5f38938cc9e6c7f2d1011f88eba2b9e2b2 ("bus:
mvebu-mbus: add mv_mbus_dram_info_nooverlap()") that happens to be
a prerequisite of the new marvell/cesa crypto driver.
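For context, the prerequisite patch exports mv_mbus_dram_info_nooverlap(), which hands a driver the DRAM window layout that is guaranteed not to overlap the MBus I/O windows. The following is only a rough sketch of how a DMA-capable platform driver (such as the marvell/cesa driver) might consume it; the struct layout follows include/linux/mbus.h, while the driver name and probe body are purely illustrative and are not taken from the cesa code.

/*
 * Illustrative consumer of mv_mbus_dram_info_nooverlap(); not actual
 * marvell/cesa code.  Everything except the mbus.h API is hypothetical.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mbus.h>
#include <linux/platform_device.h>

static int example_dma_probe(struct platform_device *pdev)
{
	const struct mbus_dram_target_info *dram;
	int i;

	/* DRAM windows that do not overlap the MBus I/O windows */
	dram = mv_mbus_dram_info_nooverlap();
	if (!dram)
		return -ENODEV;

	for (i = 0; i < dram->num_cs; i++) {
		const struct mbus_dram_window *cs = dram->cs + i;

		dev_info(&pdev->dev, "CS%d: base 0x%llx size 0x%llx\n",
			 cs->cs_index,
			 (unsigned long long)cs->base,
			 (unsigned long long)cs->size);
	}

	/* a real driver would program its DMA address windows here */
	return 0;
}

static struct platform_driver example_dma_driver = {
	.probe	= example_dma_probe,
	.driver	= {
		.name = "example-dma",
	},
};
module_platform_driver(example_dma_driver);

MODULE_LICENSE("GPL");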
Diffstat (limited to 'drivers/infiniband/ulp/ipoib')
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib.h            |  31
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_cm.c         |  18
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_ib.c         | 195
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_main.c       |  78
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_multicast.c  | 520
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_verbs.c      |  44
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_vlan.c       |   3
7 files changed, 498 insertions(+), 391 deletions(-)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index d7562beb5423..bd94b0a6e9e5 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -87,7 +87,6 @@ enum { IPOIB_FLAG_ADMIN_UP = 2, IPOIB_PKEY_ASSIGNED = 3, IPOIB_FLAG_SUBINTERFACE = 5, - IPOIB_MCAST_RUN = 6, IPOIB_STOP_REAPER = 7, IPOIB_FLAG_ADMIN_CM = 9, IPOIB_FLAG_UMCAST = 10, @@ -98,9 +97,15 @@ enum { IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ IPOIB_MCAST_FLAG_SENDONLY = 1, - IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ + /* + * For IPOIB_MCAST_FLAG_BUSY + * When set, in flight join and mcast->mc is unreliable + * When clear and mcast->mc IS_ERR_OR_NULL, need to restart or + * haven't started yet + * When clear and mcast->mc is valid pointer, join was successful + */ + IPOIB_MCAST_FLAG_BUSY = 2, IPOIB_MCAST_FLAG_ATTACHED = 3, - IPOIB_MCAST_JOIN_STARTED = 4, MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, @@ -148,6 +153,7 @@ struct ipoib_mcast { unsigned long created; unsigned long backoff; + unsigned long delay_until; unsigned long flags; unsigned char logcount; @@ -292,6 +298,11 @@ struct ipoib_neigh_table { struct completion deleted; }; +struct ipoib_qp_state_validate { + struct work_struct work; + struct ipoib_dev_priv *priv; +}; + /* * Device private locking: network stack tx_lock protects members used * in TX fast path, lock protects everything else. lock nests inside @@ -317,6 +328,7 @@ struct ipoib_dev_priv { struct list_head multicast_list; struct rb_root multicast_tree; + struct workqueue_struct *wq; struct delayed_work mcast_task; struct work_struct carrier_on_task; struct work_struct flush_light; @@ -426,11 +438,6 @@ struct ipoib_neigh { #define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) #define IPOIB_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) -static inline int ipoib_ud_need_sg(unsigned int ib_mtu) -{ - return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE; -} - void ipoib_neigh_dtor(struct ipoib_neigh *neigh); static inline void ipoib_neigh_put(struct ipoib_neigh *neigh) { @@ -477,10 +484,10 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work); void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct net_device *dev); -int ipoib_ib_dev_open(struct net_device *dev, int flush); +int ipoib_ib_dev_open(struct net_device *dev); int ipoib_ib_dev_up(struct net_device *dev); -int ipoib_ib_dev_down(struct net_device *dev, int flush); -int ipoib_ib_dev_stop(struct net_device *dev, int flush); +int ipoib_ib_dev_down(struct net_device *dev); +int ipoib_ib_dev_stop(struct net_device *dev); void ipoib_pkey_dev_check_presence(struct net_device *dev); int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); @@ -492,7 +499,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); void ipoib_mcast_restart_task(struct work_struct *work); int ipoib_mcast_start_thread(struct net_device *dev); -int ipoib_mcast_stop_thread(struct net_device *dev, int flush); +int ipoib_mcast_stop_thread(struct net_device *dev); void ipoib_mcast_dev_down(struct net_device *dev); void ipoib_mcast_dev_flush(struct net_device *dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 933efcea0d03..56959adb6c7d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -474,7 +474,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even } spin_lock_irq(&priv->lock); - 
queue_delayed_work(ipoib_workqueue, + queue_delayed_work(priv->wq, &priv->cm.stale_task, IPOIB_CM_RX_DELAY); /* Add this entry to passive ids list head, but do not re-add it * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ @@ -576,7 +576,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); ipoib_cm_start_rx_drain(priv); - queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + queue_work(priv->wq, &priv->cm.rx_reap_task); spin_unlock_irqrestore(&priv->lock, flags); } else ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", @@ -603,7 +603,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) spin_lock_irqsave(&priv->lock, flags); list_move(&p->list, &priv->cm.rx_reap_list); spin_unlock_irqrestore(&priv->lock, flags); - queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + queue_work(priv->wq, &priv->cm.rx_reap_task); } return; } @@ -827,7 +827,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { list_move(&tx->list, &priv->cm.reap_list); - queue_work(ipoib_workqueue, &priv->cm.reap_task); + queue_work(priv->wq, &priv->cm.reap_task); } clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); @@ -1255,7 +1255,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { list_move(&tx->list, &priv->cm.reap_list); - queue_work(ipoib_workqueue, &priv->cm.reap_task); + queue_work(priv->wq, &priv->cm.reap_task); } spin_unlock_irqrestore(&priv->lock, flags); @@ -1284,7 +1284,7 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path tx->dev = dev; list_add(&tx->list, &priv->cm.start_list); set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); - queue_work(ipoib_workqueue, &priv->cm.start_task); + queue_work(priv->wq, &priv->cm.start_task); return tx; } @@ -1295,7 +1295,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { spin_lock_irqsave(&priv->lock, flags); list_move(&tx->list, &priv->cm.reap_list); - queue_work(ipoib_workqueue, &priv->cm.reap_task); + queue_work(priv->wq, &priv->cm.reap_task); ipoib_dbg(priv, "Reap connection for gid %pI6\n", tx->neigh->daddr + 4); tx->neigh = NULL; @@ -1417,7 +1417,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, skb_queue_tail(&priv->cm.skb_queue, skb); if (e) - queue_work(ipoib_workqueue, &priv->cm.skb_task); + queue_work(priv->wq, &priv->cm.skb_task); } static void ipoib_cm_rx_reap(struct work_struct *work) @@ -1450,7 +1450,7 @@ static void ipoib_cm_stale_task(struct work_struct *work) } if (!list_empty(&priv->cm.passive_ids)) - queue_delayed_work(ipoib_workqueue, + queue_delayed_work(priv->wq, &priv->cm.stale_task, IPOIB_CM_RX_DELAY); spin_unlock_irq(&priv->lock); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 72626c348174..63b92cbb29ad 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -94,39 +94,9 @@ void ipoib_free_ah(struct kref *kref) static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv, u64 mapping[IPOIB_UD_RX_SG]) { - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE, - DMA_FROM_DEVICE); - ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE, - DMA_FROM_DEVICE); - } else - 
ib_dma_unmap_single(priv->ca, mapping[0], - IPOIB_UD_BUF_SIZE(priv->max_ib_mtu), - DMA_FROM_DEVICE); -} - -static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv, - struct sk_buff *skb, - unsigned int length) -{ - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[0]; - unsigned int size; - /* - * There is only two buffers needed for max_payload = 4K, - * first buf size is IPOIB_UD_HEAD_SIZE - */ - skb->tail += IPOIB_UD_HEAD_SIZE; - skb->len += length; - - size = length - IPOIB_UD_HEAD_SIZE; - - skb_frag_size_set(frag, size); - skb->data_len += size; - skb->truesize += PAGE_SIZE; - } else - skb_put(skb, length); - + ib_dma_unmap_single(priv->ca, mapping[0], + IPOIB_UD_BUF_SIZE(priv->max_ib_mtu), + DMA_FROM_DEVICE); } static int ipoib_ib_post_receive(struct net_device *dev, int id) @@ -156,18 +126,11 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) struct ipoib_dev_priv *priv = netdev_priv(dev); struct sk_buff *skb; int buf_size; - int tailroom; u64 *mapping; - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - buf_size = IPOIB_UD_HEAD_SIZE; - tailroom = 128; /* reserve some tailroom for IP/TCP headers */ - } else { - buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); - tailroom = 0; - } + buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); - skb = dev_alloc_skb(buf_size + tailroom + 4); + skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN); if (unlikely(!skb)) return NULL; @@ -184,23 +147,8 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) goto error; - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - struct page *page = alloc_page(GFP_ATOMIC); - if (!page) - goto partial_error; - skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE); - mapping[1] = - ib_dma_map_page(priv->ca, page, - 0, PAGE_SIZE, DMA_FROM_DEVICE); - if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1]))) - goto partial_error; - } - priv->rx_ring[id].skb = skb; return skb; - -partial_error: - ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE); error: dev_kfree_skb_any(skb); return NULL; @@ -278,7 +226,8 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) wc->byte_len, wc->slid); ipoib_ud_dma_unmap_rx(priv, mapping); - ipoib_ud_skb_put_frags(priv, skb, wc->byte_len); + + skb_put(skb, wc->byte_len); /* First byte of dgid signals multicast when 0xff */ dgid = &((struct ib_grh *)skb->data)->dgid; @@ -296,6 +245,8 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) skb_reset_mac_header(skb); skb_pull(skb, IPOIB_ENCAP_LEN); + skb->truesize = SKB_TRUESIZE(skb->len); + ++dev->stats.rx_packets; dev->stats.rx_bytes += skb->len; @@ -376,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, } } +/* + * As the result of a completion error the QP Can be transferred to SQE states. + * The function checks if the (send)QP is in SQE state and + * moves it back to RTS state, that in order to have it functional again. 
+ */ +static void ipoib_qp_state_validate_work(struct work_struct *work) +{ + struct ipoib_qp_state_validate *qp_work = + container_of(work, struct ipoib_qp_state_validate, work); + + struct ipoib_dev_priv *priv = qp_work->priv; + struct ib_qp_attr qp_attr; + struct ib_qp_init_attr query_init_attr; + int ret; + + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); + if (ret) { + ipoib_warn(priv, "%s: Failed to query QP ret: %d\n", + __func__, ret); + goto free_res; + } + pr_info("%s: QP: 0x%x is in state: %d\n", + __func__, priv->qp->qp_num, qp_attr.qp_state); + + /* currently support only in SQE->RTS transition*/ + if (qp_attr.qp_state == IB_QPS_SQE) { + qp_attr.qp_state = IB_QPS_RTS; + + ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE); + if (ret) { + pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n", + ret, priv->qp->qp_num); + goto free_res; + } + pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n", + __func__, priv->qp->qp_num); + } else { + pr_warn("QP (%d) will stay in state: %d\n", + priv->qp->qp_num, qp_attr.qp_state); + } + +free_res: + kfree(qp_work); +} + static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -407,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) netif_wake_queue(dev); if (wc->status != IB_WC_SUCCESS && - wc->status != IB_WC_WR_FLUSH_ERR) + wc->status != IB_WC_WR_FLUSH_ERR) { + struct ipoib_qp_state_validate *qp_work; ipoib_warn(priv, "failed send event " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); + qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC); + if (!qp_work) { + ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n", + __func__, priv->qp->qp_num); + return; + } + + INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work); + qp_work->priv = priv; + queue_work(priv->wq, &qp_work->work); + } } static int poll_tx(struct ipoib_dev_priv *priv) @@ -655,16 +663,33 @@ void ipoib_reap_ah(struct work_struct *work) __ipoib_reap_ah(dev); if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + queue_delayed_work(priv->wq, &priv->ah_reap_task, round_jiffies_relative(HZ)); } +static void ipoib_flush_ah(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + cancel_delayed_work(&priv->ah_reap_task); + flush_workqueue(priv->wq); + ipoib_reap_ah(&priv->ah_reap_task.work); +} + +static void ipoib_stop_ah(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + set_bit(IPOIB_STOP_REAPER, &priv->flags); + ipoib_flush_ah(dev); +} + static void ipoib_ib_tx_timer_func(unsigned long ctx) { drain_tx_cq((struct net_device *)ctx); } -int ipoib_ib_dev_open(struct net_device *dev, int flush) +int ipoib_ib_dev_open(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; @@ -696,7 +721,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush) } clear_bit(IPOIB_STOP_REAPER, &priv->flags); - queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + queue_delayed_work(priv->wq, &priv->ah_reap_task, round_jiffies_relative(HZ)); if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) @@ -706,7 +731,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush) dev_stop: if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) napi_enable(&priv->napi); - ipoib_ib_dev_stop(dev, flush); + ipoib_ib_dev_stop(dev); return -1; } @@ -738,7 +763,7 @@ int ipoib_ib_dev_up(struct 
net_device *dev) return ipoib_mcast_start_thread(dev); } -int ipoib_ib_dev_down(struct net_device *dev, int flush) +int ipoib_ib_dev_down(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -747,7 +772,7 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); netif_carrier_off(dev); - ipoib_mcast_stop_thread(dev, flush); + ipoib_mcast_stop_thread(dev); ipoib_mcast_dev_flush(dev); ipoib_flush_paths(dev); @@ -807,7 +832,7 @@ void ipoib_drain_cq(struct net_device *dev) local_bh_enable(); } -int ipoib_ib_dev_stop(struct net_device *dev, int flush) +int ipoib_ib_dev_stop(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_attr qp_attr; @@ -877,24 +902,7 @@ timeout: if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) ipoib_warn(priv, "Failed to modify QP to RESET state\n"); - /* Wait for all AHs to be reaped */ - set_bit(IPOIB_STOP_REAPER, &priv->flags); - cancel_delayed_work(&priv->ah_reap_task); - if (flush) - flush_workqueue(ipoib_workqueue); - - begin = jiffies; - - while (!list_empty(&priv->dead_ahs)) { - __ipoib_reap_ah(dev); - - if (time_after(jiffies, begin + HZ)) { - ipoib_warn(priv, "timing out; will leak address handles\n"); - break; - } - - msleep(1); - } + ipoib_flush_ah(dev); ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); @@ -918,7 +926,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) (unsigned long) dev); if (dev->flags & IFF_UP) { - if (ipoib_ib_dev_open(dev, 1)) { + if (ipoib_ib_dev_open(dev)) { ipoib_transport_dev_cleanup(dev); return -ENODEV; } @@ -1037,15 +1045,16 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, if (level == IPOIB_FLUSH_LIGHT) { ipoib_mark_paths_invalid(dev); ipoib_mcast_dev_flush(dev); + ipoib_flush_ah(dev); } if (level >= IPOIB_FLUSH_NORMAL) - ipoib_ib_dev_down(dev, 0); + ipoib_ib_dev_down(dev); if (level == IPOIB_FLUSH_HEAVY) { if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) - ipoib_ib_dev_stop(dev, 0); - if (ipoib_ib_dev_open(dev, 0) != 0) + ipoib_ib_dev_stop(dev); + if (ipoib_ib_dev_open(dev) != 0) return; if (netif_queue_stopped(dev)) netif_start_queue(dev); @@ -1097,9 +1106,17 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) */ ipoib_flush_paths(dev); - ipoib_mcast_stop_thread(dev, 1); + ipoib_mcast_stop_thread(dev); ipoib_mcast_dev_flush(dev); + /* + * All of our ah references aren't free until after + * ipoib_mcast_dev_flush(), ipoib_flush_paths, and + * the neighbor garbage collection is stopped and reaped. + * That should all be done now, so make a final ah flush. 
+ */ + ipoib_stop_ah(dev); + ipoib_transport_dev_cleanup(dev); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 657b89b1d291..9e1b203d756d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -108,7 +108,7 @@ int ipoib_open(struct net_device *dev) set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); - if (ipoib_ib_dev_open(dev, 1)) { + if (ipoib_ib_dev_open(dev)) { if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return 0; goto err_disable; @@ -139,7 +139,7 @@ int ipoib_open(struct net_device *dev) return 0; err_stop: - ipoib_ib_dev_stop(dev, 1); + ipoib_ib_dev_stop(dev); err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); @@ -157,8 +157,8 @@ static int ipoib_stop(struct net_device *dev) netif_stop_queue(dev); - ipoib_ib_dev_down(dev, 1); - ipoib_ib_dev_stop(dev, 0); + ipoib_ib_dev_down(dev); + ipoib_ib_dev_stop(dev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; @@ -640,8 +640,10 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, if (!path->query && path_rec_start(dev, path)) goto err_path; - - __skb_queue_tail(&neigh->queue, skb); + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) + __skb_queue_tail(&neigh->queue, skb); + else + goto err_drop; } spin_unlock_irqrestore(&priv->lock, flags); @@ -676,7 +678,12 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, new_path = 1; } if (path) { - __skb_queue_tail(&path->queue, skb); + if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + __skb_queue_tail(&path->queue, skb); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } if (!path->query && path_rec_start(dev, path)) { spin_unlock_irqrestore(&priv->lock, flags); @@ -839,13 +846,18 @@ static void ipoib_set_mcast_list(struct net_device *dev) return; } - queue_work(ipoib_workqueue, &priv->restart_task); + queue_work(priv->wq, &priv->restart_task); } static int ipoib_get_iflink(const struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); + /* parent interface */ + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) + return dev->ifindex; + + /* child/vlan interface */ return priv->parent->ifindex; } @@ -961,7 +973,7 @@ static void ipoib_reap_neigh(struct work_struct *work) __ipoib_reap_neigh(priv); if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); } @@ -1140,7 +1152,7 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) /* start garbage collection */ clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); - queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); return 0; @@ -1269,15 +1281,13 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) { struct ipoib_dev_priv *priv = netdev_priv(dev); - if (ipoib_neigh_hash_init(priv) < 0) - goto out; /* Allocate RX/TX "rings" to hold queued skbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", ca->name, ipoib_recvq_size); - goto out_neigh_hash_cleanup; + goto out; } priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); @@ -1292,16 +1302,24 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) if 
(ipoib_ib_dev_init(dev, ca, port)) goto out_tx_ring_cleanup; + /* + * Must be after ipoib_ib_dev_init so we can allocate a per + * device wq there and use it here + */ + if (ipoib_neigh_hash_init(priv) < 0) + goto out_dev_uninit; + return 0; +out_dev_uninit: + ipoib_ib_dev_cleanup(dev); + out_tx_ring_cleanup: vfree(priv->tx_ring); out_rx_ring_cleanup: kfree(priv->rx_ring); -out_neigh_hash_cleanup: - ipoib_neigh_hash_uninit(dev); out: return -ENOMEM; } @@ -1324,6 +1342,12 @@ void ipoib_dev_cleanup(struct net_device *dev) } unregister_netdevice_many(&head); + /* + * Must be before ipoib_ib_dev_cleanup or we delete an in use + * work queue + */ + ipoib_neigh_hash_uninit(dev); + ipoib_ib_dev_cleanup(dev); kfree(priv->rx_ring); @@ -1331,8 +1355,6 @@ void ipoib_dev_cleanup(struct net_device *dev) priv->rx_ring = NULL; priv->tx_ring = NULL; - - ipoib_neigh_hash_uninit(dev); } static const struct header_ops ipoib_header_ops = { @@ -1641,10 +1663,11 @@ sysfs_failed: register_failed: ib_unregister_event_handler(&priv->event_handler); + flush_workqueue(ipoib_workqueue); /* Stop GC if started before flush */ set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); cancel_delayed_work(&priv->neigh_reap_task); - flush_workqueue(ipoib_workqueue); + flush_workqueue(priv->wq); event_failed: ipoib_dev_cleanup(priv->dev); @@ -1707,6 +1730,7 @@ static void ipoib_remove_one(struct ib_device *device) list_for_each_entry_safe(priv, tmp, dev_list, list) { ib_unregister_event_handler(&priv->event_handler); + flush_workqueue(ipoib_workqueue); rtnl_lock(); dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); @@ -1715,7 +1739,7 @@ static void ipoib_remove_one(struct ib_device *device) /* Stop GC */ set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); cancel_delayed_work(&priv->neigh_reap_task); - flush_workqueue(ipoib_workqueue); + flush_workqueue(priv->wq); unregister_netdev(priv->dev); free_netdev(priv->dev); @@ -1750,14 +1774,16 @@ static int __init ipoib_init_module(void) return ret; /* - * We create our own workqueue mainly because we want to be - * able to flush it when devices are being removed. We can't - * use schedule_work()/flush_scheduled_work() because both - * unregister_netdev() and linkwatch_event take the rtnl lock, - * so flush_scheduled_work() can deadlock during device - * removal. + * We create a global workqueue here that is used for all flush + * operations. However, if you attempt to flush a workqueue + * from a task on that same workqueue, it deadlocks the system. + * We want to be able to flush the tasks associated with a + * specific net device, so we also create a workqueue for each + * netdevice. We queue up the tasks for that device only on + * its private workqueue, and we only queue up flush events + * on our global flush workqueue. This avoids the deadlocks. 
*/ - ipoib_workqueue = create_singlethread_workqueue("ipoib"); + ipoib_workqueue = create_singlethread_workqueue("ipoib_flush"); if (!ipoib_workqueue) { ret = -ENOMEM; goto err_fs; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index ffb83b5f7e80..0d23e0568deb 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -55,8 +55,6 @@ MODULE_PARM_DESC(mcast_debug_level, "Enable multicast debug tracing if > 0"); #endif -static DEFINE_MUTEX(mcast_mutex); - struct ipoib_mcast_iter { struct net_device *dev; union ib_gid mgid; @@ -66,6 +64,48 @@ struct ipoib_mcast_iter { unsigned int send_only; }; +/* + * This should be called with the priv->lock held + */ +static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv, + struct ipoib_mcast *mcast, + bool delay) +{ + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + return; + + /* + * We will be scheduling *something*, so cancel whatever is + * currently scheduled first + */ + cancel_delayed_work(&priv->mcast_task); + if (mcast && delay) { + /* + * We had a failure and want to schedule a retry later + */ + mcast->backoff *= 2; + if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) + mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; + mcast->delay_until = jiffies + (mcast->backoff * HZ); + /* + * Mark this mcast for its delay, but restart the + * task immediately. The join task will make sure to + * clear out all entries without delays, and then + * schedule itself to run again when the earliest + * delay expires + */ + queue_delayed_work(priv->wq, &priv->mcast_task, 0); + } else if (delay) { + /* + * Special case of retrying after a failure to + * allocate the broadcast multicast group, wait + * 1 second and try again + */ + queue_delayed_work(priv->wq, &priv->mcast_task, HZ); + } else + queue_delayed_work(priv->wq, &priv->mcast_task, 0); +} + static void ipoib_mcast_free(struct ipoib_mcast *mcast) { struct net_device *dev = mcast->dev; @@ -103,6 +143,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, mcast->dev = dev; mcast->created = jiffies; + mcast->delay_until = jiffies; mcast->backoff = 1; INIT_LIST_HEAD(&mcast->list); @@ -185,17 +226,27 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, spin_unlock_irq(&priv->lock); return -EAGAIN; } - priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + /*update priv member according to the new mcast*/ + priv->broadcast->mcmember.qkey = mcmember->qkey; + priv->broadcast->mcmember.mtu = mcmember->mtu; + priv->broadcast->mcmember.traffic_class = mcmember->traffic_class; + priv->broadcast->mcmember.rate = mcmember->rate; + priv->broadcast->mcmember.sl = mcmember->sl; + priv->broadcast->mcmember.flow_label = mcmember->flow_label; + priv->broadcast->mcmember.hop_limit = mcmember->hop_limit; + /* assume if the admin and the mcast are the same both can be changed */ + if (priv->mcast_mtu == priv->admin_mtu) + priv->admin_mtu = + priv->mcast_mtu = + IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + else + priv->mcast_mtu = + IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); spin_unlock_irq(&priv->lock); priv->tx_wr.wr.ud.remote_qkey = priv->qkey; set_qkey = 1; - - if (!ipoib_cm_admin_enabled(dev)) { - rtnl_lock(); - dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); - rtnl_unlock(); - } } if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, 
&mcast->flags)) { @@ -270,107 +321,35 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, return 0; } -static int -ipoib_mcast_sendonly_join_complete(int status, - struct ib_sa_multicast *multicast) -{ - struct ipoib_mcast *mcast = multicast->context; - struct net_device *dev = mcast->dev; - - /* We trap for port events ourselves. */ - if (status == -ENETRESET) - return 0; - - if (!status) - status = ipoib_mcast_join_finish(mcast, &multicast->rec); - - if (status) { - if (mcast->logcount++ < 20) - ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n", - mcast->mcmember.mgid.raw, status); - - /* Flush out any queued packets */ - netif_tx_lock_bh(dev); - while (!skb_queue_empty(&mcast->pkt_queue)) { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); - } - netif_tx_unlock_bh(dev); - - /* Clear the busy flag so we try again */ - status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, - &mcast->flags); - } - return status; -} - -static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) -{ - struct net_device *dev = mcast->dev; - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ib_sa_mcmember_rec rec = { -#if 0 /* Some SMs don't support send-only yet */ - .join_state = 4 -#else - .join_state = 1 -#endif - }; - int ret = 0; - - if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { - ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n"); - return -ENODEV; - } - - if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { - ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n"); - return -EBUSY; - } - - rec.mgid = mcast->mcmember.mgid; - rec.port_gid = priv->local_gid; - rec.pkey = cpu_to_be16(priv->pkey); - - mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, - priv->port, &rec, - IB_SA_MCMEMBER_REC_MGID | - IB_SA_MCMEMBER_REC_PORT_GID | - IB_SA_MCMEMBER_REC_PKEY | - IB_SA_MCMEMBER_REC_JOIN_STATE, - GFP_ATOMIC, - ipoib_mcast_sendonly_join_complete, - mcast); - if (IS_ERR(mcast->mc)) { - ret = PTR_ERR(mcast->mc); - clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); - ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", - ret); - } else { - ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n", - mcast->mcmember.mgid.raw); - } - - return ret; -} - void ipoib_mcast_carrier_on_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, carrier_on_task); struct ib_port_attr attr; - /* - * Take rtnl_lock to avoid racing with ipoib_stop() and - * turning the carrier back on while a device is being - * removed. - */ if (ib_query_port(priv->ca, priv->port, &attr) || attr.state != IB_PORT_ACTIVE) { ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); return; } - rtnl_lock(); + /* + * Take rtnl_lock to avoid racing with ipoib_stop() and + * turning the carrier back on while a device is being + * removed. However, ipoib_stop() will attempt to flush + * the workqueue while holding the rtnl lock, so loop + * on trylock until either we get the lock or we see + * FLAG_OPER_UP go away as that signals that we are bailing + * and can safely ignore the carrier on work. 
+ */ + while (!rtnl_trylock()) { + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + return; + else + msleep(20); + } + if (!ipoib_cm_admin_enabled(priv->dev)) + dev_set_mtu(priv->dev, min(priv->mcast_mtu, priv->admin_mtu)); netif_carrier_on(priv->dev); rtnl_unlock(); } @@ -382,7 +361,9 @@ static int ipoib_mcast_join_complete(int status, struct net_device *dev = mcast->dev; struct ipoib_dev_priv *priv = netdev_priv(dev); - ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n", + ipoib_dbg_mcast(priv, "%sjoin completion for %pI6 (status %d)\n", + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? + "sendonly " : "", mcast->mcmember.mgid.raw, status); /* We trap for port events ourselves. */ @@ -396,49 +377,74 @@ static int ipoib_mcast_join_complete(int status, if (!status) { mcast->backoff = 1; - mutex_lock(&mcast_mutex); - if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, 0); - mutex_unlock(&mcast_mutex); + mcast->delay_until = jiffies; /* - * Defer carrier on work to ipoib_workqueue to avoid a - * deadlock on rtnl_lock here. + * Defer carrier on work to priv->wq to avoid a + * deadlock on rtnl_lock here. Requeue our multicast + * work too, which will end up happening right after + * our carrier on task work and will allow us to + * send out all of the non-broadcast joins */ - if (mcast == priv->broadcast) - queue_work(ipoib_workqueue, &priv->carrier_on_task); - - status = 0; - goto out; - } + if (mcast == priv->broadcast) { + spin_lock_irq(&priv->lock); + queue_work(priv->wq, &priv->carrier_on_task); + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + goto out_locked; + } + } else { + if (mcast->logcount++ < 20) { + if (status == -ETIMEDOUT || status == -EAGAIN) { + ipoib_dbg_mcast(priv, "%smulticast join failed for %pI6, status %d\n", + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "", + mcast->mcmember.mgid.raw, status); + } else { + ipoib_warn(priv, "%smulticast join failed for %pI6, status %d\n", + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "", + mcast->mcmember.mgid.raw, status); + } + } - if (mcast->logcount++ < 20) { - if (status == -ETIMEDOUT || status == -EAGAIN) { - ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", - mcast->mcmember.mgid.raw, status); + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && + mcast->backoff >= 2) { + /* + * We only retry sendonly joins once before we drop + * the packet and quit trying to deal with the + * group. However, we leave the group in the + * mcast list as an unjoined group. If we want to + * try joining again, we simply queue up a packet + * and restart the join thread. The empty queue + * is why the join thread ignores this group. 
+ */ + mcast->backoff = 1; + netif_tx_lock_bh(dev); + while (!skb_queue_empty(&mcast->pkt_queue)) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); + } + netif_tx_unlock_bh(dev); } else { - ipoib_warn(priv, "multicast join failed for %pI6, status %d\n", - mcast->mcmember.mgid.raw, status); + spin_lock_irq(&priv->lock); + /* Requeue this join task with a backoff delay */ + __ipoib_mcast_schedule_join_thread(priv, mcast, 1); + goto out_locked; } } - - mcast->backoff *= 2; - if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) - mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; - - /* Clear the busy flag so we try again */ - status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); - - mutex_lock(&mcast_mutex); +out: spin_lock_irq(&priv->lock); - if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->mcast_task, - mcast->backoff * HZ); +out_locked: + /* + * Make sure to set mcast->mc before we clear the busy flag to avoid + * racing with code that checks for BUSY before checking mcast->mc + */ + if (status) + mcast->mc = NULL; + else + mcast->mc = multicast; + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); spin_unlock_irq(&priv->lock); - mutex_unlock(&mcast_mutex); -out: complete(&mcast->done); + return status; } @@ -446,6 +452,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, int create) { struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_sa_multicast *multicast; struct ib_sa_mcmember_rec rec = { .join_state = 1 }; @@ -487,29 +494,18 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, rec.hop_limit = priv->broadcast->mcmember.hop_limit; } - set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); - init_completion(&mcast->done); - set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); - - mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, + multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, comp_mask, GFP_KERNEL, ipoib_mcast_join_complete, mcast); - if (IS_ERR(mcast->mc)) { + if (IS_ERR(multicast)) { + ret = PTR_ERR(multicast); + ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); + spin_lock_irq(&priv->lock); + /* Requeue this join task with a backoff delay */ + __ipoib_mcast_schedule_join_thread(priv, mcast, 1); clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + spin_unlock_irq(&priv->lock); complete(&mcast->done); - ret = PTR_ERR(mcast->mc); - ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); - - mcast->backoff *= 2; - if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) - mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; - - mutex_lock(&mcast_mutex); - if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, - mcast->backoff * HZ); - mutex_unlock(&mcast_mutex); } } @@ -519,8 +515,11 @@ void ipoib_mcast_join_task(struct work_struct *work) container_of(work, struct ipoib_dev_priv, mcast_task.work); struct net_device *dev = priv->dev; struct ib_port_attr port_attr; + unsigned long delay_until = 0; + struct ipoib_mcast *mcast = NULL; + int create = 1; - if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) return; if (ib_query_port(priv->ca, priv->port, &port_attr) || @@ -536,93 +535,118 @@ void ipoib_mcast_join_task(struct work_struct *work) else memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + spin_lock_irq(&priv->lock); + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + 
goto out; + if (!priv->broadcast) { struct ipoib_mcast *broadcast; - if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) - return; - - broadcast = ipoib_mcast_alloc(dev, 1); + broadcast = ipoib_mcast_alloc(dev, 0); if (!broadcast) { ipoib_warn(priv, "failed to allocate broadcast group\n"); - mutex_lock(&mcast_mutex); - if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, HZ); - mutex_unlock(&mcast_mutex); - return; + /* + * Restart us after a 1 second delay to retry + * creating our broadcast group and attaching to + * it. Until this succeeds, this ipoib dev is + * completely stalled (multicast wise). + */ + __ipoib_mcast_schedule_join_thread(priv, NULL, 1); + goto out; } - spin_lock_irq(&priv->lock); memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, sizeof (union ib_gid)); priv->broadcast = broadcast; __ipoib_mcast_add(dev, priv->broadcast); - spin_unlock_irq(&priv->lock); } if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { - if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) - ipoib_mcast_join(dev, priv->broadcast, 0); - return; - } - - while (1) { - struct ipoib_mcast *mcast = NULL; - - spin_lock_irq(&priv->lock); - list_for_each_entry(mcast, &priv->multicast_list, list) { - if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) - && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) - && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { - /* Found the next unjoined group */ - break; + if (IS_ERR_OR_NULL(priv->broadcast->mc) && + !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) { + mcast = priv->broadcast; + create = 0; + if (mcast->backoff > 1 && + time_before(jiffies, mcast->delay_until)) { + delay_until = mcast->delay_until; + mcast = NULL; } } - spin_unlock_irq(&priv->lock); + goto out; + } - if (&mcast->list == &priv->multicast_list) { - /* All done */ - break; + /* + * We'll never get here until the broadcast group is both allocated + * and attached + */ + list_for_each_entry(mcast, &priv->multicast_list, list) { + if (IS_ERR_OR_NULL(mcast->mc) && + !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) && + (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) || + !skb_queue_empty(&mcast->pkt_queue))) { + if (mcast->backoff == 1 || + time_after_eq(jiffies, mcast->delay_until)) { + /* Found the next unjoined group */ + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) + create = 0; + else + create = 1; + spin_unlock_irq(&priv->lock); + ipoib_mcast_join(dev, mcast, create); + spin_lock_irq(&priv->lock); + } else if (!delay_until || + time_before(mcast->delay_until, delay_until)) + delay_until = mcast->delay_until; } - - ipoib_mcast_join(dev, mcast, 1); - return; } - ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); + mcast = NULL; + ipoib_dbg_mcast(priv, "successfully started all multicast joins\n"); - clear_bit(IPOIB_MCAST_RUN, &priv->flags); +out: + if (delay_until) { + cancel_delayed_work(&priv->mcast_task); + queue_delayed_work(priv->wq, &priv->mcast_task, + delay_until - jiffies); + } + if (mcast) { + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + } + spin_unlock_irq(&priv->lock); + if (mcast) + ipoib_mcast_join(dev, mcast, create); } int ipoib_mcast_start_thread(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned long flags; ipoib_dbg_mcast(priv, "starting multicast thread\n"); - mutex_lock(&mcast_mutex); - 
if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); - mutex_unlock(&mcast_mutex); + spin_lock_irqsave(&priv->lock, flags); + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + spin_unlock_irqrestore(&priv->lock, flags); return 0; } -int ipoib_mcast_stop_thread(struct net_device *dev, int flush) +int ipoib_mcast_stop_thread(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned long flags; ipoib_dbg_mcast(priv, "stopping multicast thread\n"); - mutex_lock(&mcast_mutex); - clear_bit(IPOIB_MCAST_RUN, &priv->flags); + spin_lock_irqsave(&priv->lock, flags); cancel_delayed_work(&priv->mcast_task); - mutex_unlock(&mcast_mutex); + spin_unlock_irqrestore(&priv->lock, flags); - if (flush) - flush_workqueue(ipoib_workqueue); + flush_workqueue(priv->wq); return 0; } @@ -633,6 +657,9 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) int ret = 0; if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n"); + + if (!IS_ERR_OR_NULL(mcast->mc)) ib_sa_free_multicast(mcast->mc); if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { @@ -644,7 +671,9 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) be16_to_cpu(mcast->mcmember.mlid)); if (ret) ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); - } + } else if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) + ipoib_dbg(priv, "leaving with no mcmember but not a " + "SENDONLY join\n"); return 0; } @@ -667,49 +696,37 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) } mcast = __ipoib_mcast_find(dev, mgid); - if (!mcast) { - /* Let's create a new send only group now */ - ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n", - mgid); - - mcast = ipoib_mcast_alloc(dev, 0); + if (!mcast || !mcast->ah) { if (!mcast) { - ipoib_warn(priv, "unable to allocate memory for " - "multicast structure\n"); - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); - goto out; - } - - set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); - memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); - __ipoib_mcast_add(dev, mcast); - list_add_tail(&mcast->list, &priv->multicast_list); - } + /* Let's create a new send only group now */ + ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n", + mgid); + + mcast = ipoib_mcast_alloc(dev, 0); + if (!mcast) { + ipoib_warn(priv, "unable to allocate memory " + "for multicast structure\n"); + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + goto unlock; + } - if (!mcast->ah) { + set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); + memcpy(mcast->mcmember.mgid.raw, mgid, + sizeof (union ib_gid)); + __ipoib_mcast_add(dev, mcast); + list_add_tail(&mcast->list, &priv->multicast_list); + } if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) skb_queue_tail(&mcast->pkt_queue, skb); else { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } - - if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) - ipoib_dbg_mcast(priv, "no address vector, " - "but multicast join already started\n"); - else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) - ipoib_mcast_sendonly_join(mcast); - - /* - * If lookup completes between here and out:, don't - * want to send packet twice. 
- */ - mcast = NULL; - } - -out: - if (mcast && mcast->ah) { + if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + } + } else { struct ipoib_neigh *neigh; spin_unlock_irqrestore(&priv->lock, flags); @@ -759,9 +776,12 @@ void ipoib_mcast_dev_flush(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); - /* seperate between the wait to the leave*/ + /* + * make sure the in-flight joins have finished before we attempt + * to leave + */ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) - if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) wait_for_completion(&mcast->done); list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { @@ -792,9 +812,14 @@ void ipoib_mcast_restart_task(struct work_struct *work) unsigned long flags; struct ib_sa_mcmember_rec rec; - ipoib_dbg_mcast(priv, "restarting multicast task\n"); + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + /* + * shortcut...on shutdown flush is called next, just + * let it do all the work + */ + return; - ipoib_mcast_stop_thread(dev, 0); + ipoib_dbg_mcast(priv, "restarting multicast task\n"); local_irq_save(flags); netif_addr_lock(dev); @@ -880,14 +905,27 @@ void ipoib_mcast_restart_task(struct work_struct *work) netif_addr_unlock(dev); local_irq_restore(flags); - /* We have to cancel outside of the spinlock */ + /* + * make sure the in-flight joins have finished before we attempt + * to leave + */ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + wait_for_completion(&mcast->done); + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(mcast->dev, mcast); ipoib_mcast_free(mcast); } - if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) - ipoib_mcast_start_thread(dev); + /* + * Double check that we are still up + */ + if (test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { + spin_lock_irqsave(&priv->lock, flags); + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + spin_unlock_irqrestore(&priv->lock, flags); + } } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index c56d5d44c53b..e5cc43074196 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -157,6 +157,16 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) goto out_free_pd; } + /* + * the various IPoIB tasks assume they will never race against + * themselves, so always use a single thread workqueue + */ + priv->wq = create_singlethread_workqueue("ipoib_wq"); + if (!priv->wq) { + printk(KERN_WARNING "ipoib: failed to allocate device WQ\n"); + goto out_free_mr; + } + size = ipoib_recvq_size + 1; ret = ipoib_cm_dev_init(dev); if (!ret) { @@ -165,12 +175,13 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */ else size += ipoib_recvq_size * ipoib_max_conn_qp; - } + } else + goto out_free_wq; priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0); if (IS_ERR(priv->recv_cq)) { printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name); - goto out_free_mr; + goto out_cm_dev_cleanup; } priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL, @@ -216,15 +227,10 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) priv->tx_wr.send_flags = 
IB_SEND_SIGNALED; priv->rx_sge[0].lkey = priv->mr->lkey; - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE; - priv->rx_sge[1].length = PAGE_SIZE; - priv->rx_sge[1].lkey = priv->mr->lkey; - priv->rx_wr.num_sge = IPOIB_UD_RX_SG; - } else { - priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); - priv->rx_wr.num_sge = 1; - } + + priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + priv->rx_wr.num_sge = 1; + priv->rx_wr.next = NULL; priv->rx_wr.sg_list = priv->rx_sge; @@ -236,12 +242,19 @@ out_free_send_cq: out_free_recv_cq: ib_destroy_cq(priv->recv_cq); +out_cm_dev_cleanup: + ipoib_cm_dev_cleanup(dev); + +out_free_wq: + destroy_workqueue(priv->wq); + priv->wq = NULL; + out_free_mr: ib_dereg_mr(priv->mr); - ipoib_cm_dev_cleanup(dev); out_free_pd: ib_dealloc_pd(priv->pd); + return -ENODEV; } @@ -265,11 +278,18 @@ void ipoib_transport_dev_cleanup(struct net_device *dev) ipoib_cm_dev_cleanup(dev); + if (priv->wq) { + flush_workqueue(priv->wq); + destroy_workqueue(priv->wq); + priv->wq = NULL; + } + if (ib_dereg_mr(priv->mr)) ipoib_warn(priv, "ib_dereg_mr failed\n"); if (ib_dealloc_pd(priv->pd)) ipoib_warn(priv, "ib_dealloc_pd failed\n"); + } void ipoib_event(struct ib_event_handler *handler, diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 4dd1313056a4..fca1a882de27 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -58,6 +58,7 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, /* MTU will be reset when mcast join happens */ priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; + priv->parent = ppriv->dev; set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); result = ipoib_set_dev_features(priv, ppriv->ca); @@ -84,8 +85,6 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, goto register_failed; } - priv->parent = ppriv->dev; - ipoib_create_debug_files(priv->dev); /* RTNL childs don't need proprietary sysfs entries */ |