diff options
Diffstat (limited to 'net/dccp')
-rw-r--r-- | net/dccp/ccid.h | 34 | ||||
-rw-r--r-- | net/dccp/ccids/ccid2.c | 23 | ||||
-rw-r--r-- | net/dccp/ccids/ccid2.h | 5 | ||||
-rw-r--r-- | net/dccp/ccids/ccid3.c | 12 | ||||
-rw-r--r-- | net/dccp/dccp.h | 5 | ||||
-rw-r--r-- | net/dccp/output.c | 209 | ||||
-rw-r--r-- | net/dccp/proto.c | 21 | ||||
-rw-r--r-- | net/dccp/timer.c | 27 |
8 files changed, 220 insertions, 116 deletions
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 117fb093dcaf..75c3582a7678 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -134,13 +134,41 @@ static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); +/* + * Congestion control of queued data packets via CCID decision. + * + * The TX CCID performs its congestion-control by indicating whether and when a + * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). + * The following modes are supported via the symbolic constants below: + * - timer-based pacing (CCID returns a delay value in milliseconds); + * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). + */ + +enum ccid_dequeueing_decision { + CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ + CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ + CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ + CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ + CCID_PACKET_ERR = 0xF0000, /* error condition */ +}; + +static inline int ccid_packet_dequeue_eval(const int return_code) +{ + if (return_code < 0) + return CCID_PACKET_ERR; + if (return_code == 0) + return CCID_PACKET_SEND_AT_ONCE; + if (return_code <= CCID_PACKET_DELAY_MAX) + return CCID_PACKET_DELAY; + return return_code; +} + static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { - int rc = 0; if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) - rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); - return rc; + return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); + return CCID_PACKET_SEND_AT_ONCE; } static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index d850e291f87c..6576eae9e779 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -78,12 +78,9 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - if (hc->tx_pipe < hc->tx_cwnd) - return 0; - - return 1; /* XXX CCID should dequeue when ready instead of polling */ + if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) + return CCID_PACKET_WILL_DEQUEUE_LATER; + return CCID_PACKET_SEND_AT_ONCE; } static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) @@ -115,6 +112,7 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -129,8 +127,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) if (hc->tx_rto > DCCP_RTO_MAX) hc->tx_rto = DCCP_RTO_MAX; - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); - /* adjust pipe, cwnd etc */ hc->tx_ssthresh = hc->tx_cwnd / 2; if (hc->tx_ssthresh < 2) @@ -146,6 +142,12 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) hc->tx_rpseq = 0; hc->tx_rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); + + /* if we were blocked before, we may now send cwnd=1 packet */ + if (sender_was_blocked) + tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); + /* restart backed-off timer */ + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); out: bh_unlock_sock(sk); sock_put(sk); @@ -434,6 +436,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); u64 ackno, seqno; struct ccid2_seq *seqp; unsigned char *vector; @@ -631,6 +634,10 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) sk_stop_timer(sk, &hc->tx_rtotimer); else sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); + + /* check if incoming Acks allow pending packets to be sent */ + if (sender_was_blocked && !ccid2_cwnd_network_limited(hc)) + tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 9731c2dc1487..25cb6b216eda 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -81,6 +81,11 @@ struct ccid2_hc_tx_sock { u64 tx_high_ack; }; +static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc) +{ + return hc->tx_pipe >= hc->tx_cwnd; +} + struct ccid2_hc_rx_sock { int rx_data; }; diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 3060a60ed5ab..3d604e1349c0 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -268,11 +268,11 @@ out: sock_put(sk); } -/* - * returns - * > 0: delay (in msecs) that should pass before actually sending - * = 0: can send immediately - * < 0: error condition; do not send packet +/** + * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets + * @skb: next packet candidate to send on @sk + * This function uses the convention of ccid_packet_dequeue_eval() and + * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. */ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { @@ -348,7 +348,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) /* set the nominal send time for the next following packet */ hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi); - return 0; + return CCID_PACKET_SEND_AT_ONCE; } static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 3eb264b60823..a8ed459508b2 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -243,8 +243,9 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, extern void dccp_send_sync(struct sock *sk, const u64 seq, const enum dccp_pkt_type pkt_type); -extern void dccp_write_xmit(struct sock *sk, int block); -extern void dccp_write_space(struct sock *sk); +extern void dccp_write_xmit(struct sock *sk); +extern void dccp_write_space(struct sock *sk); +extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); extern void dccp_init_xmit_timers(struct sock *sk); static inline void dccp_clear_xmit_timers(struct sock *sk) diff --git a/net/dccp/output.c b/net/dccp/output.c index a988fe9ffcba..45b91853f5ae 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -209,108 +209,150 @@ void dccp_write_space(struct sock *sk) } /** - * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet + * dccp_wait_for_ccid - Await CCID send permission * @sk: socket to wait for - * @skb: current skb to pass on for waiting - * @delay: sleep timeout in milliseconds (> 0) - * This function is called by default when the socket is closed, and - * when a non-zero linger time is set on the socket. For consistency + * @delay: timeout in jiffies + * This is used by CCIDs which need to delay the send time in process context. */ -static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay) +static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) { - struct dccp_sock *dp = dccp_sk(sk); DEFINE_WAIT(wait); - unsigned long jiffdelay; - int rc; + long remaining; + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + sk->sk_write_pending++; + release_sock(sk); + + remaining = schedule_timeout(delay); + + lock_sock(sk); + sk->sk_write_pending--; + finish_wait(sk_sleep(sk), &wait); + + if (signal_pending(current) || sk->sk_err) + return -1; + return remaining; +} + +/** + * dccp_xmit_packet - Send data packet under control of CCID + * Transmits next-queued payload and informs CCID to account for the packet. + */ +static void dccp_xmit_packet(struct sock *sk) +{ + int err, len; + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb = skb_dequeue(&sk->sk_write_queue); - do { - dccp_pr_debug("delayed send by %d msec\n", delay); - jiffdelay = msecs_to_jiffies(delay); + if (unlikely(skb == NULL)) + return; + len = skb->len; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + if (sk->sk_state == DCCP_PARTOPEN) { + const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; + /* + * See 8.1.5 - Handshake Completion. + * + * For robustness we resend Confirm options until the client has + * entered OPEN. During the initial feature negotiation, the MPS + * is smaller than usual, reduced by the Change/Confirm options. + */ + if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { + DCCP_WARN("Payload too large (%d) for featneg.\n", len); + dccp_send_ack(sk); + dccp_feat_list_purge(&dp->dccps_featneg); + } - sk->sk_write_pending++; - release_sock(sk); - schedule_timeout(jiffdelay); - lock_sock(sk); - sk->sk_write_pending--; + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + inet_csk(sk)->icsk_rto, + DCCP_RTO_MAX); + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; + } else if (dccp_ack_pending(sk)) { + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; + } else { + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; + } + + err = dccp_transmit_skb(sk, skb); + if (err) + dccp_pr_debug("transmit_skb() returned err=%d\n", err); + /* + * Register this one as sent even if an error occurred. To the remote + * end a local packet drop is indistinguishable from network loss, i.e. + * any local drop will eventually be reported via receiver feedback. + */ + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); +} - if (sk->sk_err) - goto do_error; - if (signal_pending(current)) - goto do_interrupted; +/** + * dccp_flush_write_queue - Drain queue at end of connection + * Since dccp_sendmsg queues packets without waiting for them to be sent, it may + * happen that the TX queue is not empty at the end of a connection. We give the + * HC-sender CCID a grace period of up to @time_budget jiffies. If this function + * returns with a non-empty write queue, it will be purged later. + */ +void dccp_flush_write_queue(struct sock *sk, long *time_budget) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb; + long delay, rc; + while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - } while ((delay = rc) > 0); -out: - finish_wait(sk_sleep(sk), &wait); - return rc; - -do_error: - rc = -EPIPE; - goto out; -do_interrupted: - rc = -EINTR; - goto out; + + switch (ccid_packet_dequeue_eval(rc)) { + case CCID_PACKET_WILL_DEQUEUE_LATER: + /* + * If the CCID determines when to send, the next sending + * time is unknown or the CCID may not even send again + * (e.g. remote host crashes or lost Ack packets). + */ + DCCP_WARN("CCID did not manage to send all packets\n"); + return; + case CCID_PACKET_DELAY: + delay = msecs_to_jiffies(rc); + if (delay > *time_budget) + return; + rc = dccp_wait_for_ccid(sk, delay); + if (rc < 0) + return; + *time_budget -= (delay - rc); + /* check again if we can send now */ + break; + case CCID_PACKET_SEND_AT_ONCE: + dccp_xmit_packet(sk); + break; + case CCID_PACKET_ERR: + skb_dequeue(&sk->sk_write_queue); + kfree_skb(skb); + dccp_pr_debug("packet discarded due to err=%ld\n", rc); + } + } } -void dccp_write_xmit(struct sock *sk, int block) +void dccp_write_xmit(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; while ((skb = skb_peek(&sk->sk_write_queue))) { - int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - if (err > 0) { - if (!block) { - sk_reset_timer(sk, &dp->dccps_xmit_timer, - msecs_to_jiffies(err)+jiffies); - break; - } else - err = dccp_wait_for_ccid(sk, skb, err); - if (err && err != -EINTR) - DCCP_BUG("err=%d after dccp_wait_for_ccid", err); - } + int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - skb_dequeue(&sk->sk_write_queue); - if (err == 0) { - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const int len = skb->len; - - if (sk->sk_state == DCCP_PARTOPEN) { - const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; - /* - * See 8.1.5 - Handshake Completion. - * - * For robustness we resend Confirm options until the client has - * entered OPEN. During the initial feature negotiation, the MPS - * is smaller than usual, reduced by the Change/Confirm options. - */ - if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { - DCCP_WARN("Payload too large (%d) for featneg.\n", len); - dccp_send_ack(sk); - dccp_feat_list_purge(&dp->dccps_featneg); - } - - inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - inet_csk(sk)->icsk_rto, - DCCP_RTO_MAX); - dcb->dccpd_type = DCCP_PKT_DATAACK; - } else if (dccp_ack_pending(sk)) - dcb->dccpd_type = DCCP_PKT_DATAACK; - else - dcb->dccpd_type = DCCP_PKT_DATA; - - err = dccp_transmit_skb(sk, skb); - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); - if (err) - DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", - err); - } else { - dccp_pr_debug("packet discarded due to err=%d\n", err); + switch (ccid_packet_dequeue_eval(rc)) { + case CCID_PACKET_WILL_DEQUEUE_LATER: + return; + case CCID_PACKET_DELAY: + sk_reset_timer(sk, &dp->dccps_xmit_timer, + jiffies + msecs_to_jiffies(rc)); + return; + case CCID_PACKET_SEND_AT_ONCE: + dccp_xmit_packet(sk); + break; + case CCID_PACKET_ERR: + skb_dequeue(&sk->sk_write_queue); kfree_skb(skb); + dccp_pr_debug("packet discarded due to err=%d\n", rc); } } } @@ -622,7 +664,6 @@ void dccp_send_close(struct sock *sk, const int active) DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; if (active) { - dccp_write_xmit(sk, 1); dccp_skb_entail(sk, skb); dccp_transmit_skb(sk, skb_clone(skb, prio)); /* diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 7e5fc04eb6d1..ef343d53fcea 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -726,7 +726,13 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto out_discard; skb_queue_tail(&sk->sk_write_queue, skb); - dccp_write_xmit(sk,0); + /* + * The xmit_timer is set if the TX CCID is rate-based and will expire + * when congestion control permits to release further packets into the + * network. Window-based CCIDs do not use this timer. + */ + if (!timer_pending(&dp->dccps_xmit_timer)) + dccp_write_xmit(sk); out_release: release_sock(sk); return rc ? : len; @@ -951,9 +957,22 @@ void dccp_close(struct sock *sk, long timeout) /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); } else if (sk->sk_state != DCCP_CLOSED) { + /* + * Normal connection termination. May need to wait if there are + * still packets in the TX queue that are delayed by the CCID. + */ + dccp_flush_write_queue(sk, &timeout); dccp_terminate_connection(sk); } + /* + * Flush write queue. This may be necessary in several cases: + * - we have been closed by the peer but still have application data; + * - abortive termination (unread data or zero linger time), + * - normal termination but queue could not be flushed within time limit + */ + __skb_queue_purge(&sk->sk_write_queue); + sk_stream_wait_close(sk, timeout); adjudge_to_death: diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 1a9aa05d4dc4..7587870b7040 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -237,32 +237,35 @@ out: sock_put(sk); } -/* Transmit-delay timer: used by the CCIDs to delay actual send time */ -static void dccp_write_xmit_timer(unsigned long data) +/** + * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface + * See the comments above %ccid_dequeueing_decision for supported modes. + */ +static void dccp_write_xmitlet(unsigned long data) { struct sock *sk = (struct sock *)data; - struct dccp_sock *dp = dccp_sk(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) - sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1); + sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); else - dccp_write_xmit(sk, 0); + dccp_write_xmit(sk); bh_unlock_sock(sk); - sock_put(sk); } -static void dccp_init_write_xmit_timer(struct sock *sk) +static void dccp_write_xmit_timer(unsigned long data) { - struct dccp_sock *dp = dccp_sk(sk); - - setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, - (unsigned long)sk); + dccp_write_xmitlet(data); + sock_put((struct sock *)data); } void dccp_init_xmit_timers(struct sock *sk) { - dccp_init_write_xmit_timer(sk); + struct dccp_sock *dp = dccp_sk(sk); + + tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); + setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, + (unsigned long)sk); inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, &dccp_keepalive_timer); } |