From 616d37a070bb33ea387d0e93343acd8336a30886 Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Date: Tue, 18 Jun 2019 22:12:05 -0700
Subject: rds: fix reordering with composite message notification

RDS composite message(rdma + control) user notification needs to be
triggered once the full message is delivered and such a fix was
added as part of commit 941f8d55f6d61 ("RDS: RDMA: Fix the composite
message user notification"). But rds_send_remove_from_sock is missing
data part notify check and hence at times the user don't get
notification which isn't desirable.

One way is to fix the rds_send_remove_from_sock to check of that case
but considering the ordering complexity with completion handler and
rdma + control messages are always dispatched back to back in same send
context, just delaying the signaled completion on rmda work request also
gets the desired behaviour. i.e Notifying application only after
RDMA + control message send completes. So patch updates the earlier
fix with this approach. The delay signaling completions of rdma op
till the control message send completes fix was done by Venkat
Venkatsubra in downstream kernel.

Reviewed-and-tested-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Reviewed-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib_send.c | 29 +++++++++++++----------------
 net/rds/rdma.c    | 10 ----------
 net/rds/rds.h     |  1 -
 net/rds/send.c    |  4 +---
 4 files changed, 14 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 18f2341202f8..dfe6237dafe2 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -69,6 +69,16 @@ static void rds_ib_send_complete(struct rds_message *rm,
 	complete(rm, notify_status);
 }
 
+static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
+				   struct rm_data_op *op,
+				   int wc_status)
+{
+	if (op->op_nents)
+		ib_dma_unmap_sg(ic->i_cm_id->device,
+				op->op_sg, op->op_nents,
+				DMA_TO_DEVICE);
+}
+
 static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
 				   struct rm_rdma_op *op,
 				   int wc_status)
@@ -129,21 +139,6 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
 		rds_ib_stats_inc(s_ib_atomic_fadd);
 }
 
-static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
-				   struct rm_data_op *op,
-				   int wc_status)
-{
-	struct rds_message *rm = container_of(op, struct rds_message, data);
-
-	if (op->op_nents)
-		ib_dma_unmap_sg(ic->i_cm_id->device,
-				op->op_sg, op->op_nents,
-				DMA_TO_DEVICE);
-
-	if (rm->rdma.op_active && rm->data.op_notify)
-		rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status);
-}
-
 /*
  * Unmap the resources associated with a struct send_work.
  *
@@ -902,7 +897,9 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 		send->s_queued = jiffies;
 		send->s_op = NULL;
 
-		nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+		if (!op->op_notify)
+			nr_sig += rds_ib_set_wr_signal_state(ic, send,
+							     op->op_notify);
 
 		send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
 		send->s_rdma_wr.remote_addr = remote_addr;
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index b340ed4fc43a..916f5ec373d8 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -641,16 +641,6 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 		}
 		op->op_notifier->n_user_token = args->user_token;
 		op->op_notifier->n_status = RDS_RDMA_SUCCESS;
-
-		/* Enable rmda notification on data operation for composite
-		 * rds messages and make sure notification is enabled only
-		 * for the data operation which follows it so that application
-		 * gets notified only after full message gets delivered.
-		 */
-		if (rm->data.op_sg) {
-			rm->rdma.op_notify = 0;
-			rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
-		}
 	}
 
 	/* The cookie contains the R_Key of the remote memory region, and
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 0d8f67cadd74..f0066d168499 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -476,7 +476,6 @@ struct rds_message {
 		} rdma;
 		struct rm_data_op {
 			unsigned int		op_active:1;
-			unsigned int		op_notify:1;
 			unsigned int		op_nents;
 			unsigned int		op_count;
 			unsigned int		op_dmasg;
diff --git a/net/rds/send.c b/net/rds/send.c
index 166dd578c1cc..031b1e97a466 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -491,14 +491,12 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
 	struct rm_rdma_op *ro;
 	struct rds_notifier *notifier;
 	unsigned long flags;
-	unsigned int notify = 0;
 
 	spin_lock_irqsave(&rm->m_rs_lock, flags);
 
-	notify =  rm->rdma.op_notify | rm->data.op_notify;
 	ro = &rm->rdma;
 	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
-	    ro->op_active && notify && ro->op_notifier) {
+	    ro->op_active && ro->op_notify && ro->op_notifier) {
 		notifier = ro->op_notifier;
 		rs = rm->m_rs;
 		sock_hold(rds_rs_to_sk(rs));
-- 
cgit v1.2.3


From a55207884708bf1d5b8c87a3c504502de77a5416 Mon Sep 17 00:00:00 2001
From: Gerd Rausch <gerd.rausch@oracle.com>
Date: Fri, 28 Jun 2019 17:31:19 -0700
Subject: Revert "RDS: IB: split the mr registration and invalidation path"

This reverts commit 56012459310a1dbcc55c2dbf5500a9f7571402cb.

RDS kept spinning inside function "rds_ib_post_reg_frmr", waiting for
"i_fastreg_wrs" to become incremented:
         while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
                 atomic_inc(&ibmr->ic->i_fastreg_wrs);
                 cpu_relax();
         }

Looking at the original commit:

commit 56012459310a ("RDS: IB: split the mr registration and
invalidation path")

In there, the "rds_ib_mr_cqe_handler" was changed in the following
way:

 void rds_ib_mr_cqe_handler(struct
 rds_ib_connection *ic,
 struct ib_wc *wc)
        if (frmr->fr_inv) {
                  frmr->fr_state = FRMR_IS_FREE;
                  frmr->fr_inv = false;
                atomic_inc(&ic->i_fastreg_wrs);
        } else {
                atomic_inc(&ic->i_fastunreg_wrs);
        }

It looks like it's got it exactly backwards:

Function "rds_ib_post_reg_frmr" keeps track of the outstanding
requests via "i_fastreg_wrs".

Function "rds_ib_post_inv" keeps track of the outstanding requests
via "i_fastunreg_wrs" (post original commit). It also sets:
         frmr->fr_inv = true;

However the completion handler "rds_ib_mr_cqe_handler" adjusts
"i_fastreg_wrs" when "fr_inv" had been true, and adjusts
"i_fastunreg_wrs" otherwise.

The original commit was done in the name of performance:
to remove the performance bottleneck

No performance benefit could be observed with a fixed-up version
of the original commit measured between two Oracle X7 servers,
both equipped with Mellanox Connect-X5 HCAs.

The prudent course of action is to revert this commit.

Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib.h      |  4 +---
 net/rds/ib_cm.c   |  9 ++-------
 net/rds/ib_frmr.c | 11 +++++------
 3 files changed, 8 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 67a715b076ca..66c03c7665b2 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -15,8 +15,7 @@
 
 #define RDS_IB_DEFAULT_RECV_WR		1024
 #define RDS_IB_DEFAULT_SEND_WR		256
-#define RDS_IB_DEFAULT_FR_WR		256
-#define RDS_IB_DEFAULT_FR_INV_WR	256
+#define RDS_IB_DEFAULT_FR_WR		512
 
 #define RDS_IB_DEFAULT_RETRY_COUNT	1
 
@@ -157,7 +156,6 @@ struct rds_ib_connection {
 
 	/* To control the number of wrs from fastreg */
 	atomic_t		i_fastreg_wrs;
-	atomic_t		i_fastunreg_wrs;
 
 	/* interrupt handling */
 	struct tasklet_struct	i_send_tasklet;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 66c6eb56072b..8891822eba4f 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -460,10 +460,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	 * completion queue and send queue. This extra space is used for FRMR
 	 * registration and invalidation work requests
 	 */
-	fr_queue_space = rds_ibdev->use_fastreg ?
-			 (RDS_IB_DEFAULT_FR_WR + 1) +
-			 (RDS_IB_DEFAULT_FR_INV_WR + 1)
-			 : 0;
+	fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0);
 
 	/* add the conn now so that connection establishment has the dev */
 	rds_ib_add_conn(rds_ibdev, conn);
@@ -530,7 +527,6 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	attr.send_cq = ic->i_send_cq;
 	attr.recv_cq = ic->i_recv_cq;
 	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
-	atomic_set(&ic->i_fastunreg_wrs, RDS_IB_DEFAULT_FR_INV_WR);
 
 	/*
 	 * XXX this can fail if max_*_wr is too large?  Are we supposed
@@ -1009,8 +1005,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 		wait_event(rds_ib_ring_empty_wait,
 			   rds_ib_ring_empty(&ic->i_recv_ring) &&
 			   (atomic_read(&ic->i_signaled_sends) == 0) &&
-			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR) &&
-			   (atomic_read(&ic->i_fastunreg_wrs) == RDS_IB_DEFAULT_FR_INV_WR));
+			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
 		tasklet_kill(&ic->i_send_tasklet);
 		tasklet_kill(&ic->i_recv_tasklet);
 
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 688dcd68d4ea..32ae26ed58a0 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -239,8 +239,8 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
 	if (frmr->fr_state != FRMR_IS_INUSE)
 		goto out;
 
-	while (atomic_dec_return(&ibmr->ic->i_fastunreg_wrs) <= 0) {
-		atomic_inc(&ibmr->ic->i_fastunreg_wrs);
+	while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
+		atomic_inc(&ibmr->ic->i_fastreg_wrs);
 		cpu_relax();
 	}
 
@@ -257,7 +257,7 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
 	if (unlikely(ret)) {
 		frmr->fr_state = FRMR_IS_STALE;
 		frmr->fr_inv = false;
-		atomic_inc(&ibmr->ic->i_fastunreg_wrs);
+		atomic_inc(&ibmr->ic->i_fastreg_wrs);
 		pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
 		goto out;
 	}
@@ -285,10 +285,9 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 	if (frmr->fr_inv) {
 		frmr->fr_state = FRMR_IS_FREE;
 		frmr->fr_inv = false;
-		atomic_inc(&ic->i_fastreg_wrs);
-	} else {
-		atomic_inc(&ic->i_fastunreg_wrs);
 	}
+
+	atomic_inc(&ic->i_fastreg_wrs);
 }
 
 void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
-- 
cgit v1.2.3


From 8c6166cfc9cd48e93d9176561e50b63cef4330d5 Mon Sep 17 00:00:00 2001
From: Gerd Rausch <gerd.rausch@oracle.com>
Date: Thu, 27 Jun 2019 09:21:44 -0700
Subject: rds: Accept peer connection reject messages due to incompatible
 version

Prior to
commit d021fabf525ff ("rds: rdma: add consumer reject")

function "rds_rdma_cm_event_handler_cmn" would always honor a rejected
connection attempt by issuing a "rds_conn_drop".

The commit mentioned above added a "break", eliminating
the "fallthrough" case and made the "rds_conn_drop" rather conditional:

Now it only happens if a "consumer defined" reject (i.e. "rdma_reject")
carries an integer-value of "1" inside "private_data":

  if (!conn)
    break;
    err = (int *)rdma_consumer_reject_data(cm_id, event, &len);
    if (!err || (err && ((*err) == RDS_RDMA_REJ_INCOMPAT))) {
      pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n",
              &conn->c_laddr, &conn->c_faddr);
              conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION;
              rds_conn_drop(conn);
    }
    rdsdebug("Connection rejected: %s\n",
             rdma_reject_msg(cm_id, event->status));
    break;
    /* FALLTHROUGH */
A number of issues are worth mentioning here:
   #1) Previous versions of the RDS code simply rejected a connection
       by calling "rdma_reject(cm_id, NULL, 0);"
       So the value of the payload in "private_data" will not be "1",
       but "0".

   #2) Now the code has become dependent on host byte order and sizing.
       If one peer is big-endian, the other is little-endian,
       or there's a difference in sizeof(int) (e.g. ILP64 vs LP64),
       the *err check does not work as intended.

   #3) There is no check for "len" to see if the data behind *err is even valid.
       Luckily, it appears that the "rdma_reject(cm_id, NULL, 0)" will always
       carry 148 bytes of zeroized payload.
       But that should probably not be relied upon here.

   #4) With the added "break;",
       we might as well drop the misleading "/* FALLTHROUGH */" comment.

This commit does _not_ address issue #2, as the sender would have to
agree on a byte order as well.

Here is the sequence of messages in this observed error-scenario:
   Host-A is pre-QoS changes (excluding the commit mentioned above)
   Host-B is post-QoS changes (including the commit mentioned above)

   #1 Host-B
      issues a connection request via function "rds_conn_path_transition"
      connection state transitions to "RDS_CONN_CONNECTING"

   #2 Host-A
      rejects the incompatible connection request (from #1)
      It does so by calling "rdma_reject(cm_id, NULL, 0);"

   #3 Host-B
      receives an "RDMA_CM_EVENT_REJECTED" event (from #2)
      But since the code is changed in the way described above,
      it won't drop the connection here, simply because "*err == 0".

   #4 Host-A
      issues a connection request

   #5 Host-B
      receives an "RDMA_CM_EVENT_CONNECT_REQUEST" event
      and ends up calling "rds_ib_cm_handle_connect".
      But since the state is already in "RDS_CONN_CONNECTING"
      (as of #1) it will end up issuing a "rdma_reject" without
      dropping the connection:
         if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
             /* Wait and see - our connect may still be succeeding */
             rds_ib_stats_inc(s_ib_connect_raced);
         }
         goto out;

   #6 Host-A
      receives an "RDMA_CM_EVENT_REJECTED" event (from #5),
      drops the connection and tries again (goto #4) until it gives up.

Tested-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/rdma_transport.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 46bce8389066..9db455d02255 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -112,7 +112,9 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
 		if (!conn)
 			break;
 		err = (int *)rdma_consumer_reject_data(cm_id, event, &len);
-		if (!err || (err && ((*err) == RDS_RDMA_REJ_INCOMPAT))) {
+		if (!err ||
+		    (err && len >= sizeof(*err) &&
+		     ((*err) <= RDS_RDMA_REJ_INCOMPAT))) {
 			pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n",
 				&conn->c_laddr, &conn->c_faddr);
 			conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION;
@@ -122,7 +124,6 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
 		rdsdebug("Connection rejected: %s\n",
 			 rdma_reject_msg(cm_id, event->status));
 		break;
-		/* FALLTHROUGH */
 	case RDMA_CM_EVENT_ADDR_ERROR:
 	case RDMA_CM_EVENT_ROUTE_ERROR:
 	case RDMA_CM_EVENT_CONNECT_ERROR:
-- 
cgit v1.2.3


From fc640d4cbe268479ba448542e2b17045c908b2b3 Mon Sep 17 00:00:00 2001
From: Gerd Rausch <gerd.rausch@oracle.com>
Date: Fri, 28 Jun 2019 17:41:16 -0700
Subject: rds: Return proper "tos" value to user-space

The proper "tos" value needs to be returned
to user-space (sockopt RDS_INFO_CONNECTIONS).

Fixes: 3eb450367d08 ("rds: add type of service(tos) infrastructure")
Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/connection.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 7ea134f9a825..ed7f2133acc2 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -736,6 +736,7 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
 	cinfo->next_rx_seq = cp->cp_next_rx_seq;
 	cinfo->laddr = conn->c_laddr.s6_addr32[3];
 	cinfo->faddr = conn->c_faddr.s6_addr32[3];
+	cinfo->tos = conn->c_tos;
 	strncpy(cinfo->transport, conn->c_trans->t_name,
 		sizeof(cinfo->transport));
 	cinfo->flags = 0;
-- 
cgit v1.2.3


From dc205a8d34228809dedab94a85a866cbb255248f Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Date: Fri, 28 Jun 2019 15:38:58 -0700
Subject: rds: avoid version downgrade to legitimate newer peer connections

Connections with legitimate tos values can get into usual connection
race. It can result in consumer reject. We don't want tos value or
protocol version to be demoted for such connections otherwise
piers would end up different tos values which can results in
no connection. Example a peer initiated connection with say
tos 8 while usual connection racing can get downgraded to tos 0
which is not desirable.

Patch fixes above issue introduced by commit
commit d021fabf525f ("rds: rdma: add consumer reject")

Reported-by: Yanjun Zhu <yanjun.zhu@oracle.com>
Tested-by: Yanjun Zhu <yanjun.zhu@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/rdma_transport.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 9db455d02255..ff74c4bbb9fc 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -117,8 +117,10 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
 		     ((*err) <= RDS_RDMA_REJ_INCOMPAT))) {
 			pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n",
 				&conn->c_laddr, &conn->c_faddr);
-			conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION;
-			conn->c_tos = 0;
+
+			if (!conn->c_tos)
+				conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION;
+
 			rds_conn_drop(conn);
 		}
 		rdsdebug("Connection rejected: %s\n",
-- 
cgit v1.2.3


From 311633b604063a8a5d3fbc74d0565b42df721f68 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 9 Jul 2019 23:24:54 -0700
Subject: hsr: switch ->dellink() to ->ndo_uninit()

Switching from ->priv_destructor to dellink() has an unexpected
consequence: existing RCU readers, that is, hsr_port_get_hsr()
callers, may still be able to read the port list.

Instead of checking the return value of each hsr_port_get_hsr(),
we can just move it to ->ndo_uninit() which is called after
device unregister and synchronize_net(), and we still have RTNL
lock there.

Fixes: b9a1e627405d ("hsr: implement dellink to clean up resources")
Fixes: edf070a0fb45 ("hsr: fix a NULL pointer deref in hsr_dev_xmit()")
Reported-by: syzbot+097ef84cdc95843fbaa8@syzkaller.appspotmail.com
Cc: Arvid Brodin <arvid.brodin@alten.se>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/hsr/hsr_device.c  | 18 ++++++++----------
 net/hsr/hsr_device.h  |  1 -
 net/hsr/hsr_netlink.c |  7 -------
 3 files changed, 8 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index f0f9b493c47b..f509b495451a 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -227,13 +227,8 @@ static int hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct hsr_port *master;
 
 	master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
-	if (master) {
-		skb->dev = master->dev;
-		hsr_forward_skb(skb, master);
-	} else {
-		atomic_long_inc(&dev->tx_dropped);
-		dev_kfree_skb_any(skb);
-	}
+	skb->dev = master->dev;
+	hsr_forward_skb(skb, master);
 	return NETDEV_TX_OK;
 }
 
@@ -348,7 +343,11 @@ static void hsr_announce(struct timer_list *t)
 	rcu_read_unlock();
 }
 
-void hsr_dev_destroy(struct net_device *hsr_dev)
+/* This has to be called after all the readers are gone.
+ * Otherwise we would have to check the return value of
+ * hsr_port_get_hsr().
+ */
+static void hsr_dev_destroy(struct net_device *hsr_dev)
 {
 	struct hsr_priv *hsr;
 	struct hsr_port *port;
@@ -364,8 +363,6 @@ void hsr_dev_destroy(struct net_device *hsr_dev)
 	del_timer_sync(&hsr->prune_timer);
 	del_timer_sync(&hsr->announce_timer);
 
-	synchronize_rcu();
-
 	hsr_del_self_node(&hsr->self_node_db);
 	hsr_del_nodes(&hsr->node_db);
 }
@@ -376,6 +373,7 @@ static const struct net_device_ops hsr_device_ops = {
 	.ndo_stop = hsr_dev_close,
 	.ndo_start_xmit = hsr_dev_xmit,
 	.ndo_fix_features = hsr_fix_features,
+	.ndo_uninit = hsr_dev_destroy,
 };
 
 static struct device_type hsr_type = {
diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h
index d0fa6b0696d2..6d7759c4f5f9 100644
--- a/net/hsr/hsr_device.h
+++ b/net/hsr/hsr_device.h
@@ -14,7 +14,6 @@
 void hsr_dev_setup(struct net_device *dev);
 int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
 		     unsigned char multicast_spec, u8 protocol_version);
-void hsr_dev_destroy(struct net_device *hsr_dev);
 void hsr_check_carrier_and_operstate(struct hsr_priv *hsr);
 bool is_hsr_master(struct net_device *dev);
 int hsr_get_max_mtu(struct hsr_priv *hsr);
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index 160edd24de4e..8f8337f893ba 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -69,12 +69,6 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
 	return hsr_dev_finalize(dev, link, multicast_spec, hsr_version);
 }
 
-static void hsr_dellink(struct net_device *hsr_dev, struct list_head *head)
-{
-	hsr_dev_destroy(hsr_dev);
-	unregister_netdevice_queue(hsr_dev, head);
-}
-
 static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 	struct hsr_priv *hsr;
@@ -119,7 +113,6 @@ static struct rtnl_link_ops hsr_link_ops __read_mostly = {
 	.priv_size	= sizeof(struct hsr_priv),
 	.setup		= hsr_dev_setup,
 	.newlink	= hsr_newlink,
-	.dellink	= hsr_dellink,
 	.fill_info	= hsr_fill_info,
 };
 
-- 
cgit v1.2.3


From 416e8126a2672f6e91e9e81c6f5c07cf46808b13 Mon Sep 17 00:00:00 2001
From: yangxingwu <xingwu.yang@gmail.com>
Date: Wed, 10 Jul 2019 21:14:10 +0800
Subject: ipv6: Use ipv6_authlen for len

The length of AH header is computed manually as (hp->hdrlen+2)<<2.
However, in include/linux/ipv6.h, a macro named ipv6_authlen is
already defined for exactly the same job. This commit replaces
the manual computation code with the macro.

Signed-off-by: yangxingwu <xingwu.yang@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ah6.c                          | 4 ++--
 net/ipv6/exthdrs_core.c                 | 2 +-
 net/ipv6/ip6_tunnel.c                   | 2 +-
 net/ipv6/netfilter/ip6t_ah.c            | 2 +-
 net/ipv6/netfilter/ip6t_ipv6header.c    | 2 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +-
 net/ipv6/netfilter/nf_log_ipv6.c        | 2 +-
 7 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 25e1172fd1c3..95835e8d99aa 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -464,7 +464,7 @@ static void ah6_input_done(struct crypto_async_request *base, int err)
 	struct ah_data *ahp = x->data;
 	struct ip_auth_hdr *ah = ip_auth_hdr(skb);
 	int hdr_len = skb_network_header_len(skb);
-	int ah_hlen = (ah->hdrlen + 2) << 2;
+	int ah_hlen = ipv6_authlen(ah);
 
 	if (err)
 		goto out;
@@ -546,7 +546,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
 	ahash = ahp->ahash;
 
 	nexthdr = ah->nexthdr;
-	ah_hlen = (ah->hdrlen + 2) << 2;
+	ah_hlen = ipv6_authlen(ah);
 
 	if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
 	    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 11a43ee4dd45..b358f1a4dd08 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -266,7 +266,7 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
 		} else if (nexthdr == NEXTHDR_AUTH) {
 			if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0))
 				break;
-			hdrlen = (hp->hdrlen + 2) << 2;
+			hdrlen = ipv6_authlen(hp);
 		} else
 			hdrlen = ipv6_optlen(hp);
 
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index b80fde1bc005..3134fbb65d7f 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -416,7 +416,7 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
 				break;
 			optlen = 8;
 		} else if (nexthdr == NEXTHDR_AUTH) {
-			optlen = (hdr->hdrlen + 2) << 2;
+			optlen = ipv6_authlen(hdr);
 		} else {
 			optlen = ipv6_optlen(hdr);
 		}
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index 0228ff3636bb..4e15a14435e4 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -55,7 +55,7 @@ static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par)
 		return false;
 	}
 
-	hdrlen = (ah->hdrlen + 2) << 2;
+	hdrlen = ipv6_authlen(ah);
 
 	pr_debug("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen);
 	pr_debug("RES %04X ", ah->reserved);
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index fd439f88377f..0fc6326ef499 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -71,7 +71,7 @@ ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par)
 		if (nexthdr == NEXTHDR_FRAGMENT)
 			hdrlen = 8;
 		else if (nexthdr == NEXTHDR_AUTH)
-			hdrlen = (hp->hdrlen + 2) << 2;
+			hdrlen = ipv6_authlen(hp);
 		else
 			hdrlen = ipv6_optlen(hp);
 
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 398e1df41406..0f82c150543b 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -414,7 +414,7 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
 		if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
 			BUG();
 		if (nexthdr == NEXTHDR_AUTH)
-			hdrlen = (hdr.hdrlen+2)<<2;
+			hdrlen = ipv6_authlen(&hdr);
 		else
 			hdrlen = ipv6_optlen(&hdr);
 
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 549c51156d5d..f53bd8f01219 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -155,7 +155,7 @@ static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m,
 
 			}
 
-			hdrlen = (hp->hdrlen+2)<<2;
+			hdrlen = ipv6_authlen(hp);
 			break;
 		case IPPROTO_ESP:
 			if (logflags & NF_LOG_IPOPT) {
-- 
cgit v1.2.3


From 052e0690f1f62f76493ba996d73847c7ca9fd132 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 10 Jul 2019 06:40:09 -0700
Subject: ipv6: tcp: fix flowlabels reflection for RST packets

In 323a53c41292 ("ipv6: tcp: enable flowlabel reflection in some RST packets")
and 50a8accf1062 ("ipv6: tcp: send consistent flowlabel in TIME_WAIT state")
we took care of IPv6 flowlabel reflections for two cases.

This patch takes care of the remaining case, when the RST packet
is sent on behalf of a 'full' socket.

In Marek use case, this was a socket in TCP_CLOSE state.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Marek Majkowski <marek@cloudflare.com>
Tested-by: Marek Majkowski <marek@cloudflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d56a9019a0fe..5da069e91cac 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -984,8 +984,13 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 
 	if (sk) {
 		oif = sk->sk_bound_dev_if;
-		if (sk_fullsock(sk))
+		if (sk_fullsock(sk)) {
+			const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
+
 			trace_tcp_send_reset(sk, skb);
+			if (np->repflow)
+				label = ip6_flowlabel(ipv6h);
+		}
 		if (sk->sk_state == TCP_TIME_WAIT)
 			label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
 	} else {
-- 
cgit v1.2.3


From 8975a3abc3030bc8cdc3c94b988bcf819a14ed41 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 10 Jul 2019 06:40:10 -0700
Subject: ipv6: fix potential crash in ip6_datagram_dst_update()

Willem forgot to change one of the calls to fl6_sock_lookup(),
which can now return an error or NULL.

syzbot reported :

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 31763 Comm: syz-executor.0 Not tainted 5.2.0-rc6+ #63
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:ip6_datagram_dst_update+0x559/0xc30 net/ipv6/datagram.c:83
Code: 00 00 e8 ea 29 3f fb 4d 85 f6 0f 84 96 04 00 00 e8 dc 29 3f fb 49 8d 7e 20 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 16 06 00 00 4d 8b 6e 20 e8 b4 29 3f fb 4c 89 ee
RSP: 0018:ffff88809ba97ae0 EFLAGS: 00010207
RAX: dffffc0000000000 RBX: ffff8880a81254b0 RCX: ffffc90008118000
RDX: 0000000000000003 RSI: ffffffff86319a84 RDI: 000000000000001e
RBP: ffff88809ba97c10 R08: ffff888065e9e700 R09: ffffed1015d26c80
R10: ffffed1015d26c7f R11: ffff8880ae9363fb R12: ffff8880a8124f40
R13: 0000000000000001 R14: fffffffffffffffe R15: ffff88809ba97b40
FS:  00007f38e606a700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000202c0140 CR3: 00000000a026a000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 __ip6_datagram_connect+0x5e9/0x1390 net/ipv6/datagram.c:246
 ip6_datagram_connect+0x30/0x50 net/ipv6/datagram.c:269
 ip6_datagram_connect_v6_only+0x69/0x90 net/ipv6/datagram.c:281
 inet_dgram_connect+0x14a/0x2d0 net/ipv4/af_inet.c:571
 __sys_connect+0x264/0x330 net/socket.c:1824
 __do_sys_connect net/socket.c:1835 [inline]
 __se_sys_connect net/socket.c:1832 [inline]
 __x64_sys_connect+0x73/0xb0 net/socket.c:1832
 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4597c9
Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f38e6069c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002a
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004597c9
RDX: 000000000000001c RSI: 0000000020000040 RDI: 0000000000000003
RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007f38e606a6d4
R13: 00000000004bfd07 R14: 00000000004d1838 R15: 00000000ffffffff
Modules linked in:
RIP: 0010:ip6_datagram_dst_update+0x559/0xc30 net/ipv6/datagram.c:83
Code: 00 00 e8 ea 29 3f fb 4d 85 f6 0f 84 96 04 00 00 e8 dc 29 3f fb 49 8d 7e 20 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 16 06 00 00 4d 8b 6e 20 e8 b4 29 3f fb 4c 89 ee

Fixes: 59c820b2317f ("ipv6: elide flowlabel check if no exclusive leases exist")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/datagram.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 9d78c907b918..9ab897ded4df 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -74,7 +74,7 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
 
 	if (np->sndflow && (np->flow_label & IPV6_FLOWLABEL_MASK)) {
 		flowlabel = fl6_sock_lookup(sk, np->flow_label);
-		if (!flowlabel)
+		if (IS_ERR(flowlabel))
 			return -EINVAL;
 	}
 	ip6_datagram_flow_key_init(&fl6, sk);
-- 
cgit v1.2.3


From d44e3fa5d7e6e9573c69f6f9f4f7f3200b0c9eee Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 10 Jul 2019 06:40:11 -0700
Subject: ipv6: fix static key imbalance in fl_create()

fl_create() should call static_branch_deferred_inc() only in
case of success.

Also we should not call fl_free() in error path, as this could
cause a static key imbalance.

jump label: negative count!
WARNING: CPU: 0 PID: 15907 at kernel/jump_label.c:221 static_key_slow_try_dec kernel/jump_label.c:221 [inline]
WARNING: CPU: 0 PID: 15907 at kernel/jump_label.c:221 static_key_slow_try_dec+0x1ab/0x1d0 kernel/jump_label.c:206
Kernel panic - not syncing: panic_on_warn set ...
CPU: 0 PID: 15907 Comm: syz-executor.2 Not tainted 5.2.0-rc6+ #62
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x172/0x1f0 lib/dump_stack.c:113
 panic+0x2cb/0x744 kernel/panic.c:219
 __warn.cold+0x20/0x4d kernel/panic.c:576
 report_bug+0x263/0x2b0 lib/bug.c:186
 fixup_bug arch/x86/kernel/traps.c:179 [inline]
 fixup_bug arch/x86/kernel/traps.c:174 [inline]
 do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:272
 do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:291
 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:986
RIP: 0010:static_key_slow_try_dec kernel/jump_label.c:221 [inline]
RIP: 0010:static_key_slow_try_dec+0x1ab/0x1d0 kernel/jump_label.c:206
Code: c0 e8 e9 3e e5 ff 83 fb 01 0f 85 32 ff ff ff e8 5b 3d e5 ff 45 31 ff eb a0 e8 51 3d e5 ff 48 c7 c7 40 99 92 87 e8 13 75 b7 ff <0f> 0b eb 8b 4c 89 e7 e8 a9 c0 1e 00 e9 de fe ff ff e8 bf 6d b7 ff
RSP: 0018:ffff88805f9c7450 EFLAGS: 00010286
RAX: 0000000000000000 RBX: 00000000ffffffff RCX: 0000000000000000
RDX: 000000000000e3e1 RSI: ffffffff815adb06 RDI: ffffed100bf38e7c
RBP: ffff88805f9c74e0 R08: ffff88806acf0700 R09: ffffed1015d060a9
R10: ffffed1015d060a8 R11: ffff8880ae830547 R12: ffffffff89832ce0
R13: ffff88805f9c74b8 R14: 1ffff1100bf38e8b R15: 00000000ffffff01
 __static_key_slow_dec_deferred+0x65/0x110 kernel/jump_label.c:272
 fl_free+0xa9/0xe0 net/ipv6/ip6_flowlabel.c:121
 fl_create+0x6af/0x9f0 net/ipv6/ip6_flowlabel.c:457
 ipv6_flowlabel_opt+0x80e/0x2730 net/ipv6/ip6_flowlabel.c:624
 do_ipv6_setsockopt.isra.0+0x2119/0x4100 net/ipv6/ipv6_sockglue.c:825
 ipv6_setsockopt+0xf6/0x170 net/ipv6/ipv6_sockglue.c:944
 tcp_setsockopt net/ipv4/tcp.c:3131 [inline]
 tcp_setsockopt+0x8f/0xe0 net/ipv4/tcp.c:3125
 sock_common_setsockopt+0x94/0xd0 net/core/sock.c:3130
 __sys_setsockopt+0x253/0x4b0 net/socket.c:2080
 __do_sys_setsockopt net/socket.c:2096 [inline]
 __se_sys_setsockopt net/socket.c:2093 [inline]
 __x64_sys_setsockopt+0xbe/0x150 net/socket.c:2093
 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4597c9
Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f2670556c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00000000004597c9
RDX: 0000000000000020 RSI: 0000000000000029 RDI: 0000000000000003
RBP: 000000000075bfc8 R08: 000000000000fdf7 R09: 0000000000000000
R10: 0000000020000000 R11: 0000000000000246 R12: 00007f26705576d4
R13: 00000000004cec00 R14: 00000000004dd520 R15: 00000000ffffffff
Kernel Offset: disabled
Rebooting in 86400 seconds..

Fixes: 59c820b2317f ("ipv6: elide flowlabel check if no exclusive leases exist")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_flowlabel.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index ad284b1fd308..d64b83e85642 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -435,8 +435,6 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
 	}
 	fl->dst = freq->flr_dst;
 	atomic_set(&fl->users, 1);
-	if (fl_shared_exclusive(fl) || fl->opt)
-		static_branch_deferred_inc(&ipv6_flowlabel_exclusive);
 	switch (fl->share) {
 	case IPV6_FL_S_EXCL:
 	case IPV6_FL_S_ANY:
@@ -451,10 +449,15 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
 		err = -EINVAL;
 		goto done;
 	}
+	if (fl_shared_exclusive(fl) || fl->opt)
+		static_branch_deferred_inc(&ipv6_flowlabel_exclusive);
 	return fl;
 
 done:
-	fl_free(fl);
+	if (fl) {
+		kfree(fl->opt);
+		kfree(fl);
+	}
 	*err_p = err;
 	return NULL;
 }
-- 
cgit v1.2.3


From 6b660c4177aaebdc73df7a3378f0e8b110aa4b51 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Sat, 6 Jul 2019 01:08:09 +0900
Subject: net: openvswitch: do not update max_headroom if new headroom is equal
 to old headroom

When a vport is deleted, the maximum headroom size would be changed.
If the vport which has the largest headroom is deleted,
the new max_headroom would be set.
But, if the new headroom size is equal to the old headroom size,
updating routine is unnecessary.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Tested-by: Greg Rose <gvrose8192@gmail.com>
Reviewed-by: Greg Rose <gvrose8192@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/datapath.c | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 33b388103741..892287d06c17 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1958,10 +1958,9 @@ static struct vport *lookup_vport(struct net *net,
 
 }
 
-/* Called with ovs_mutex */
-static void update_headroom(struct datapath *dp)
+static unsigned int ovs_get_max_headroom(struct datapath *dp)
 {
-	unsigned dev_headroom, max_headroom = 0;
+	unsigned int dev_headroom, max_headroom = 0;
 	struct net_device *dev;
 	struct vport *vport;
 	int i;
@@ -1975,10 +1974,19 @@ static void update_headroom(struct datapath *dp)
 		}
 	}
 
-	dp->max_headroom = max_headroom;
+	return max_headroom;
+}
+
+/* Called with ovs_mutex */
+static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom)
+{
+	struct vport *vport;
+	int i;
+
+	dp->max_headroom = new_headroom;
 	for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
 		hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node)
-			netdev_set_rx_headroom(vport->dev, max_headroom);
+			netdev_set_rx_headroom(vport->dev, new_headroom);
 }
 
 static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
@@ -1989,6 +1997,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
 	struct sk_buff *reply;
 	struct vport *vport;
 	struct datapath *dp;
+	unsigned int new_headroom;
 	u32 port_no;
 	int err;
 
@@ -2050,8 +2059,10 @@ restart:
 				      info->snd_portid, info->snd_seq, 0,
 				      OVS_VPORT_CMD_NEW);
 
-	if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom)
-		update_headroom(dp);
+	new_headroom = netdev_get_fwd_headroom(vport->dev);
+
+	if (new_headroom > dp->max_headroom)
+		ovs_update_headroom(dp, new_headroom);
 	else
 		netdev_set_rx_headroom(vport->dev, dp->max_headroom);
 
@@ -2122,11 +2133,12 @@ exit_unlock_free:
 
 static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
 {
-	bool must_update_headroom = false;
+	bool update_headroom = false;
 	struct nlattr **a = info->attrs;
 	struct sk_buff *reply;
 	struct datapath *dp;
 	struct vport *vport;
+	unsigned int new_headroom;
 	int err;
 
 	reply = ovs_vport_cmd_alloc_info();
@@ -2152,12 +2164,17 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	/* the vport deletion may trigger dp headroom update */
 	dp = vport->dp;
 	if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom)
-		must_update_headroom = true;
+		update_headroom = true;
+
 	netdev_reset_rx_headroom(vport->dev);
 	ovs_dp_detach_port(vport);
 
-	if (must_update_headroom)
-		update_headroom(dp);
+	if (update_headroom) {
+		new_headroom = ovs_get_max_headroom(dp);
+
+		if (new_headroom < dp->max_headroom)
+			ovs_update_headroom(dp, new_headroom);
+	}
 	ovs_unlock();
 
 	ovs_notify(&dp_vport_genl_family, reply, info);
-- 
cgit v1.2.3


From c1a970d06f8cf390354a4a426976ed7f960b71f1 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Wed, 10 Jul 2019 20:12:29 +0300
Subject: net: sched: Fix NULL-pointer dereference in tc_indr_block_ing_cmd()

After recent refactoring of block offlads infrastructure, indr_dev->block
pointer is dereferenced before it is verified to be non-NULL. Example stack
trace where this behavior leads to NULL-pointer dereference error when
creating vxlan dev on system with mlx5 NIC with offloads enabled:

[ 1157.852938] ==================================================================
[ 1157.866877] BUG: KASAN: null-ptr-deref in tc_indr_block_ing_cmd.isra.41+0x9c/0x160
[ 1157.880877] Read of size 4 at addr 0000000000000090 by task ip/3829
[ 1157.901637] CPU: 22 PID: 3829 Comm: ip Not tainted 5.2.0-rc6+ #488
[ 1157.914438] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017
[ 1157.929031] Call Trace:
[ 1157.938318]  dump_stack+0x9a/0xeb
[ 1157.948362]  ? tc_indr_block_ing_cmd.isra.41+0x9c/0x160
[ 1157.960262]  ? tc_indr_block_ing_cmd.isra.41+0x9c/0x160
[ 1157.972082]  __kasan_report+0x176/0x192
[ 1157.982513]  ? tc_indr_block_ing_cmd.isra.41+0x9c/0x160
[ 1157.994348]  kasan_report+0xe/0x20
[ 1158.004324]  tc_indr_block_ing_cmd.isra.41+0x9c/0x160
[ 1158.015950]  ? tcf_block_setup+0x430/0x430
[ 1158.026558]  ? kasan_unpoison_shadow+0x30/0x40
[ 1158.037464]  __tc_indr_block_cb_register+0x5f5/0xf20
[ 1158.049288]  ? mlx5e_rep_indr_tc_block_unbind+0xa0/0xa0 [mlx5_core]
[ 1158.062344]  ? tc_indr_block_dev_put.part.47+0x5c0/0x5c0
[ 1158.074498]  ? rdma_roce_rescan_device+0x20/0x20 [ib_core]
[ 1158.086580]  ? br_device_event+0x98/0x480 [bridge]
[ 1158.097870]  ? strcmp+0x30/0x50
[ 1158.107578]  mlx5e_nic_rep_netdevice_event+0xdd/0x180 [mlx5_core]
[ 1158.120212]  notifier_call_chain+0x6d/0xa0
[ 1158.130753]  register_netdevice+0x6fc/0x7e0
[ 1158.141322]  ? netdev_change_features+0xa0/0xa0
[ 1158.152218]  ? vxlan_config_apply+0x210/0x310 [vxlan]
[ 1158.163593]  __vxlan_dev_create+0x2ad/0x520 [vxlan]
[ 1158.174770]  ? vxlan_changelink+0x490/0x490 [vxlan]
[ 1158.185870]  ? rcu_read_unlock+0x60/0x60 [vxlan]
[ 1158.196798]  vxlan_newlink+0x99/0xf0 [vxlan]
[ 1158.207303]  ? __vxlan_dev_create+0x520/0x520 [vxlan]
[ 1158.218601]  ? rtnl_create_link+0x3d0/0x450
[ 1158.228900]  __rtnl_newlink+0x8a7/0xb00
[ 1158.238701]  ? stack_access_ok+0x35/0x80
[ 1158.248450]  ? rtnl_link_unregister+0x1a0/0x1a0
[ 1158.258735]  ? find_held_lock+0x6d/0xd0
[ 1158.268379]  ? is_bpf_text_address+0x67/0xf0
[ 1158.278330]  ? lock_acquire+0xc1/0x1f0
[ 1158.287686]  ? is_bpf_text_address+0x5/0xf0
[ 1158.297449]  ? is_bpf_text_address+0x86/0xf0
[ 1158.307310]  ? kernel_text_address+0xec/0x100
[ 1158.317155]  ? arch_stack_walk+0x92/0xe0
[ 1158.326497]  ? __kernel_text_address+0xe/0x30
[ 1158.336213]  ? unwind_get_return_address+0x2f/0x50
[ 1158.346267]  ? create_prof_cpu_mask+0x20/0x20
[ 1158.355936]  ? arch_stack_walk+0x92/0xe0
[ 1158.365117]  ? stack_trace_save+0x8a/0xb0
[ 1158.374272]  ? stack_trace_consume_entry+0x80/0x80
[ 1158.384226]  ? match_held_lock+0x33/0x210
[ 1158.393216]  ? kasan_unpoison_shadow+0x30/0x40
[ 1158.402593]  rtnl_newlink+0x53/0x80
[ 1158.410925]  rtnetlink_rcv_msg+0x3a5/0x600
[ 1158.419777]  ? validate_linkmsg+0x400/0x400
[ 1158.428620]  ? find_held_lock+0x6d/0xd0
[ 1158.437117]  ? match_held_lock+0x1b/0x210
[ 1158.445760]  ? validate_linkmsg+0x400/0x400
[ 1158.454642]  netlink_rcv_skb+0xc7/0x1f0
[ 1158.463150]  ? netlink_ack+0x470/0x470
[ 1158.471538]  ? netlink_deliver_tap+0x1f3/0x5a0
[ 1158.480607]  netlink_unicast+0x2ae/0x350
[ 1158.489099]  ? netlink_attachskb+0x340/0x340
[ 1158.497935]  ? _copy_from_iter_full+0xde/0x3b0
[ 1158.506945]  ? __virt_addr_valid+0xb6/0xf0
[ 1158.515578]  ? __check_object_size+0x159/0x240
[ 1158.524515]  netlink_sendmsg+0x4d3/0x630
[ 1158.532879]  ? netlink_unicast+0x350/0x350
[ 1158.541400]  ? netlink_unicast+0x350/0x350
[ 1158.549805]  sock_sendmsg+0x94/0xa0
[ 1158.557561]  ___sys_sendmsg+0x49d/0x570
[ 1158.565625]  ? copy_msghdr_from_user+0x210/0x210
[ 1158.574457]  ? __fput+0x1e2/0x330
[ 1158.581948]  ? __kasan_slab_free+0x130/0x180
[ 1158.590407]  ? kmem_cache_free+0xb6/0x2d0
[ 1158.598574]  ? mark_lock+0xc7/0x790
[ 1158.606177]  ? task_work_run+0xcf/0x100
[ 1158.614165]  ? exit_to_usermode_loop+0x102/0x110
[ 1158.622954]  ? __lock_acquire+0x963/0x1ee0
[ 1158.631199]  ? lockdep_hardirqs_on+0x260/0x260
[ 1158.639777]  ? match_held_lock+0x1b/0x210
[ 1158.647918]  ? lockdep_hardirqs_on+0x260/0x260
[ 1158.656501]  ? match_held_lock+0x1b/0x210
[ 1158.664643]  ? __fget_light+0xa6/0xe0
[ 1158.672423]  ? __sys_sendmsg+0xd2/0x150
[ 1158.680334]  __sys_sendmsg+0xd2/0x150
[ 1158.688063]  ? __ia32_sys_shutdown+0x30/0x30
[ 1158.696435]  ? lock_downgrade+0x2e0/0x2e0
[ 1158.704541]  ? mark_held_locks+0x1a/0x90
[ 1158.712611]  ? mark_held_locks+0x1a/0x90
[ 1158.720619]  ? do_syscall_64+0x1e/0x2c0
[ 1158.728530]  do_syscall_64+0x78/0x2c0
[ 1158.736254]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 1158.745414] RIP: 0033:0x7f62d505cb87
[ 1158.753070] Code: 64 89 02 48 c7 c0 ff ff ff ff eb b9 0f 1f 80 00 00 00 00 8b 05 6a 2b 2c 00 48 63 d2 48 63 ff 85 c0 75 18 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 59 f3 c3 0f 1f 80 00 00[87/1817]
 48 89 f3 48
[ 1158.780924] RSP: 002b:00007fffd9832268 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
[ 1158.793204] RAX: ffffffffffffffda RBX: 000000005d26048f RCX: 00007f62d505cb87
[ 1158.805111] RDX: 0000000000000000 RSI: 00007fffd98322d0 RDI: 0000000000000003
[ 1158.817055] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000006
[ 1158.828987] R10: 00007f62d50ce260 R11: 0000000000000246 R12: 0000000000000001
[ 1158.840909] R13: 000000000067e540 R14: 0000000000000000 R15: 000000000067ed20
[ 1158.852873] ==================================================================

Introduce new function tcf_block_non_null_shared() that verifies block
pointer before dereferencing it to obtain index. Use the function in
tc_indr_block_ing_cmd() to prevent NULL pointer dereference.

Fixes: 955bcb6ea0df ("drivers: net: use flow block API")
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 638c1bc1ea1b..278014e26aec 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -684,7 +684,7 @@ static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
 		.command	= command,
 		.binder_type	= FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
 		.net		= dev_net(indr_dev->dev),
-		.block_shared	= tcf_block_shared(indr_dev->block),
+		.block_shared	= tcf_block_non_null_shared(indr_dev->block),
 	};
 	INIT_LIST_HEAD(&bo.cb_list);
 
-- 
cgit v1.2.3


From d12cffe9329fd278555d0f9bb89af1259a2fd933 Mon Sep 17 00:00:00 2001
From: Chris Packham <chris.packham@alliedtelesis.co.nz>
Date: Fri, 12 Jul 2019 10:41:15 +1200
Subject: tipc: ensure head->lock is initialised

tipc_named_node_up() creates a skb list. It passes the list to
tipc_node_xmit() which has some code paths that can call
skb_queue_purge() which relies on the list->lock being initialised.

The spin_lock is only needed if the messages end up on the receive path
but when the list is created in tipc_named_node_up() we don't
necessarily know if it is going to end up there.

Once all the skb list users are updated in tipc it will then be possible
to update them to use the unlocked variants of the skb list functions
and initialise the lock when we know the message will follow the receive
path.

Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_distr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 61219f0b9677..44abc8e9c990 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -190,7 +190,7 @@ void tipc_named_node_up(struct net *net, u32 dnode)
 	struct name_table *nt = tipc_name_table(net);
 	struct sk_buff_head head;
 
-	__skb_queue_head_init(&head);
+	skb_queue_head_init(&head);
 
 	read_lock_bh(&nt->cluster_scope_lock);
 	named_distribute(net, &head, dnode, &nt->cluster_scope);
-- 
cgit v1.2.3