Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'dccp' of git://eden-feed.erg.abdn.ac.uk/dccp_exp

Conflicts:

net/dccp/input.c
net/dccp/options.c

+4099 -3012
+42 -12
Documentation/networking/dccp.txt
··· 45 45 46 46 Socket options 47 47 ============== 48 + DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes 49 + a policy ID as argument and can only be set before the connection (i.e. changes 50 + during an established connection are not supported). Currently, two policies are 51 + defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, 52 + and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an 53 + u32 priority value as ancillary data to sendmsg(), where higher numbers indicate 54 + a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to 55 + be formatted using a cmsg(3) message header filled in as follows: 56 + cmsg->cmsg_level = SOL_DCCP; 57 + cmsg->cmsg_type = DCCP_SCM_PRIORITY; 58 + cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ 59 + 60 + DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero 61 + value is always interpreted as unbounded queue length. If different from zero, 62 + the interpretation of this parameter depends on the current dequeuing policy 63 + (see above): the "simple" policy will enforce a fixed queue size by returning 64 + EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the 65 + lowest-priority packet first. The default value for this parameter is 66 + initialised from /proc/sys/net/dccp/default/tx_qlen. 48 67 49 68 DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of 50 69 service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, ··· 75 56 76 57 DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet 77 58 size (application payload size) in bytes, see RFC 4340, section 14. 59 + 60 + DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs 61 + supported by the endpoint (see include/linux/dccp.h for symbolic constants). 
62 + The caller needs to provide a sufficiently large (> 2) array of type uint8_t. 63 + 64 + DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same 65 + time, combining the operation of the next two socket options. This option is 66 + preferrable over the latter two, since often applications will use the same 67 + type of CCID for both directions; and mixed use of CCIDs is not currently well 68 + understood. This socket option takes as argument at least one uint8_t value, or 69 + an array of uint8_t values, which must match available CCIDS (see above). CCIDs 70 + must be registered on the socket before calling connect() or listen(). 71 + 72 + DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets 73 + the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. 74 + Please note that the getsockopt argument type here is `int', not uint8_t. 75 + 76 + DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. 78 77 79 78 DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold 80 79 timewait state when closing the connection (RFC 4340, 8.3). The usual case is ··· 152 115 importance for retransmitted acknowledgments and feature negotiation, 153 116 data packets are never retransmitted. Analogue of tcp_retries2. 154 117 155 - send_ndp = 1 156 - Whether or not to send NDP count options (sec. 7.7.2). 157 - 158 - send_ackvec = 1 159 - Whether or not to send Ack Vector options (sec. 11.5). 160 - 161 - ack_ratio = 2 162 - The default Ack Ratio (sec. 11.3) to use. 163 - 164 118 tx_ccid = 2 165 - Default CCID for the sender-receiver half-connection. 119 + Default CCID for the sender-receiver half-connection. Depending on the 120 + choice of CCID, the Send Ack Vector feature is enabled automatically. 166 121 167 122 rx_ccid = 2 168 - Default CCID for the receiver-sender half-connection. 123 + Default CCID for the receiver-sender half-connection; see tx_ccid. 
169 124 170 125 seq_window = 100 171 - The initial sequence window (sec. 7.5.2). 126 + The initial sequence window (sec. 7.5.2) of the sender. This influences 127 + the local ackno validity and the remote seqno validity windows (7.5.1). 172 128 173 129 tx_qlen = 5 174 130 The size of the transmit buffer in packets. A value of 0 corresponds
+51 -71
include/linux/dccp.h
··· 165 165 DCCPO_TIMESTAMP_ECHO = 42, 166 166 DCCPO_ELAPSED_TIME = 43, 167 167 DCCPO_MAX = 45, 168 - DCCPO_MIN_CCID_SPECIFIC = 128, 169 - DCCPO_MAX_CCID_SPECIFIC = 255, 168 + DCCPO_MIN_RX_CCID_SPECIFIC = 128, /* from sender to receiver */ 169 + DCCPO_MAX_RX_CCID_SPECIFIC = 191, 170 + DCCPO_MIN_TX_CCID_SPECIFIC = 192, /* from receiver to sender */ 171 + DCCPO_MAX_TX_CCID_SPECIFIC = 255, 170 172 }; 173 + /* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */ 174 + #define DCCP_SINGLE_OPT_MAXLEN 253 171 175 172 176 /* DCCP CCIDS */ 173 177 enum { ··· 180 176 }; 181 177 182 178 /* DCCP features (RFC 4340 section 6.4) */ 183 - enum { 179 + enum dccp_feature_numbers { 184 180 DCCPF_RESERVED = 0, 185 181 DCCPF_CCID = 1, 186 - DCCPF_SHORT_SEQNOS = 2, /* XXX: not yet implemented */ 182 + DCCPF_SHORT_SEQNOS = 2, 187 183 DCCPF_SEQUENCE_WINDOW = 3, 188 - DCCPF_ECN_INCAPABLE = 4, /* XXX: not yet implemented */ 184 + DCCPF_ECN_INCAPABLE = 4, 189 185 DCCPF_ACK_RATIO = 5, 190 186 DCCPF_SEND_ACK_VECTOR = 6, 191 187 DCCPF_SEND_NDP_COUNT = 7, 192 188 DCCPF_MIN_CSUM_COVER = 8, 193 - DCCPF_DATA_CHECKSUM = 9, /* XXX: not yet implemented */ 189 + DCCPF_DATA_CHECKSUM = 9, 194 190 /* 10-127 reserved */ 195 191 DCCPF_MIN_CCID_SPECIFIC = 128, 192 + DCCPF_SEND_LEV_RATE = 192, /* RFC 4342, sec. 
8.4 */ 196 193 DCCPF_MAX_CCID_SPECIFIC = 255, 197 194 }; 198 195 199 - /* this structure is argument to DCCP_SOCKOPT_CHANGE_X */ 200 - struct dccp_so_feat { 201 - __u8 dccpsf_feat; 202 - __u8 __user *dccpsf_val; 203 - __u8 dccpsf_len; 196 + /* DCCP socket control message types for cmsg */ 197 + enum dccp_cmsg_type { 198 + DCCP_SCM_PRIORITY = 1, 199 + DCCP_SCM_QPOLICY_MAX = 0xFFFF, 200 + /* ^-- Up to here reserved exclusively for qpolicy parameters */ 201 + DCCP_SCM_MAX 202 + }; 203 + 204 + /* DCCP priorities for outgoing/queued packets */ 205 + enum dccp_packet_dequeueing_policy { 206 + DCCPQ_POLICY_SIMPLE, 207 + DCCPQ_POLICY_PRIO, 208 + DCCPQ_POLICY_MAX 204 209 }; 205 210 206 211 /* DCCP socket options */ ··· 221 208 #define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 222 209 #define DCCP_SOCKOPT_SEND_CSCOV 10 223 210 #define DCCP_SOCKOPT_RECV_CSCOV 11 211 + #define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 212 + #define DCCP_SOCKOPT_CCID 13 213 + #define DCCP_SOCKOPT_TX_CCID 14 214 + #define DCCP_SOCKOPT_RX_CCID 15 215 + #define DCCP_SOCKOPT_QPOLICY_ID 16 216 + #define DCCP_SOCKOPT_QPOLICY_TXQLEN 17 224 217 #define DCCP_SOCKOPT_CCID_RX_INFO 128 225 218 #define DCCP_SOCKOPT_CCID_TX_INFO 192 226 219 ··· 374 355 return __dccp_hdr_len(dccp_hdr(skb)); 375 356 } 376 357 377 - 378 - /* initial values for each feature */ 379 - #define DCCPF_INITIAL_SEQUENCE_WINDOW 100 380 - #define DCCPF_INITIAL_ACK_RATIO 2 381 - #define DCCPF_INITIAL_CCID DCCPC_CCID2 382 - #define DCCPF_INITIAL_SEND_ACK_VECTOR 1 383 - /* FIXME: for now we're default to 1 but it should really be 0 */ 384 - #define DCCPF_INITIAL_SEND_NDP_COUNT 1 385 - 386 - /** 387 - * struct dccp_minisock - Minimal DCCP connection representation 388 - * 389 - * Will be used to pass the state from dccp_request_sock to dccp_sock. 
390 - * 391 - * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) 392 - * @dccpms_ccid - Congestion Control Id (CCID) (section 10) 393 - * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) 394 - * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) 395 - * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3) 396 - * @dccpms_pending - List of features being negotiated 397 - * @dccpms_conf - 398 - */ 399 - struct dccp_minisock { 400 - __u64 dccpms_sequence_window; 401 - __u8 dccpms_rx_ccid; 402 - __u8 dccpms_tx_ccid; 403 - __u8 dccpms_send_ack_vector; 404 - __u8 dccpms_send_ndp_count; 405 - __u8 dccpms_ack_ratio; 406 - struct list_head dccpms_pending; 407 - struct list_head dccpms_conf; 408 - }; 409 - 410 - struct dccp_opt_conf { 411 - __u8 *dccpoc_val; 412 - __u8 dccpoc_len; 413 - }; 414 - 415 - struct dccp_opt_pend { 416 - struct list_head dccpop_node; 417 - __u8 dccpop_type; 418 - __u8 dccpop_feat; 419 - __u8 *dccpop_val; 420 - __u8 dccpop_len; 421 - int dccpop_conf; 422 - struct dccp_opt_conf *dccpop_sc; 423 - }; 424 - 425 - extern void dccp_minisock_init(struct dccp_minisock *dmsk); 426 - 427 358 /** 428 359 * struct dccp_request_sock - represent DCCP-specific connection request 429 360 * @dreq_inet_rsk: structure inherited from 430 361 * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) 431 362 * @dreq_isr: initial sequence number received on the Request 432 363 * @dreq_service: service code present on the Request (there is just one) 364 + * @dreq_featneg: feature negotiation options for this connection 433 365 * The following two fields are analogous to the ones in dccp_sock: 434 366 * @dreq_timestamp_echo: last received timestamp to echo (13.1) 435 367 * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo ··· 390 420 __u64 dreq_iss; 391 421 __u64 dreq_isr; 392 422 __be32 dreq_service; 423 + struct list_head dreq_featneg; 393 424 __u32 dreq_timestamp_echo; 394 425 __u32 
dreq_timestamp_time; 395 426 }; ··· 462 491 * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo 463 492 * @dccps_l_ack_ratio - feature-local Ack Ratio 464 493 * @dccps_r_ack_ratio - feature-remote Ack Ratio 494 + * @dccps_l_seq_win - local Sequence Window (influences ack number validity) 495 + * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) 465 496 * @dccps_pcslen - sender partial checksum coverage (via sockopt) 466 497 * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) 498 + * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) 467 499 * @dccps_ndp_count - number of Non Data Packets since last data packet 468 500 * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) 469 501 * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) 470 - * @dccps_minisock - associated minisock (accessed via dccp_msk) 502 + * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) 471 503 * @dccps_hc_rx_ackvec - rx half connection ack vector 472 504 * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) 473 505 * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) 474 506 * @dccps_options_received - parsed set of retrieved options 507 + * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy 508 + * @dccps_tx_qlen - maximum length of the TX queue 475 509 * @dccps_role - role of this sock, one of %dccp_role 476 510 * @dccps_hc_rx_insert_options - receiver wants to add options when acking 477 511 * @dccps_hc_tx_insert_options - sender wants to add options when sending 478 512 * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) 479 - * @dccps_xmit_timer - timer for when CCID is not ready to send 513 + * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" 514 + * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets 
515 + * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) 480 516 * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) 481 517 */ 482 518 struct dccp_sock { ··· 507 529 __u32 dccps_timestamp_time; 508 530 __u16 dccps_l_ack_ratio; 509 531 __u16 dccps_r_ack_ratio; 510 - __u16 dccps_pcslen; 511 - __u16 dccps_pcrlen; 532 + __u64 dccps_l_seq_win:48; 533 + __u64 dccps_r_seq_win:48; 534 + __u8 dccps_pcslen:4; 535 + __u8 dccps_pcrlen:4; 536 + __u8 dccps_send_ndp_count:1; 512 537 __u64 dccps_ndp_count:48; 513 538 unsigned long dccps_rate_last; 514 - struct dccp_minisock dccps_minisock; 539 + struct list_head dccps_featneg; 515 540 struct dccp_ackvec *dccps_hc_rx_ackvec; 516 541 struct ccid *dccps_hc_rx_ccid; 517 542 struct ccid *dccps_hc_tx_ccid; 518 543 struct dccp_options_received dccps_options_received; 544 + __u8 dccps_qpolicy; 545 + __u32 dccps_tx_qlen; 519 546 enum dccp_role dccps_role:2; 520 547 __u8 dccps_hc_rx_insert_options:1; 521 548 __u8 dccps_hc_tx_insert_options:1; 522 549 __u8 dccps_server_timewait:1; 550 + __u8 dccps_sync_scheduled:1; 551 + struct tasklet_struct dccps_xmitlet; 523 552 struct timer_list dccps_xmit_timer; 524 553 }; 525 554 526 555 static inline struct dccp_sock *dccp_sk(const struct sock *sk) 527 556 { 528 557 return (struct dccp_sock *)sk; 529 - } 530 - 531 - static inline struct dccp_minisock *dccp_msk(const struct sock *sk) 532 - { 533 - return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock; 534 558 } 535 559 536 560 static inline const char *dccp_role(const struct sock *sk)
+15
include/net/tcp.h
··· 782 782 /* Use define here intentionally to get WARN_ON location shown at the caller */ 783 783 #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) 784 784 785 + /* 786 + * Convert RFC3390 larger initial windows into an equivalent number of packets. 787 + * 788 + * John Heffner states: 789 + * 790 + * The RFC specifies a window of no more than 4380 bytes 791 + * unless 2*MSS > 4380. Reading the pseudocode in the RFC 792 + * is a bit misleading because they use a clamp at 4380 bytes 793 + * rather than a multiplier in the relevant range. 794 + */ 795 + static inline u32 rfc3390_bytes_to_packets(const u32 bytes) 796 + { 797 + return bytes <= 1095 ? 4 : (bytes > 1460 ? 2 : 3); 798 + } 799 + 785 800 extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); 786 801 extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); 787 802
-3
net/dccp/Kconfig
··· 25 25 def_tristate y if (IP_DCCP = y && INET_DIAG = y) 26 26 def_tristate m 27 27 28 - config IP_DCCP_ACKVEC 29 - bool 30 - 31 28 source "net/dccp/ccids/Kconfig" 32 29 33 30 menu "DCCP Kernel Hacking"
+2 -3
net/dccp/Makefile
··· 1 1 obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o 2 2 3 - dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o 3 + dccp-y := ccid.o feat.o input.o minisocks.o options.o \ 4 + qpolicy.o output.o proto.o timer.o ackvec.o 4 5 5 6 dccp_ipv4-y := ipv4.o 6 7 7 8 # build dccp_ipv6 as module whenever either IPv6 or DCCP is a module 8 9 obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o 9 10 dccp_ipv6-y := ipv6.o 10 - 11 - dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o 12 11 13 12 obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o 14 13 obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
+310 -381
net/dccp/ackvec.c
··· 1 1 /* 2 2 * net/dccp/ackvec.c 3 3 * 4 - * An implementation of the DCCP protocol 4 + * An implementation of Ack Vectors for the DCCP protocol 5 + * Copyright (c) 2007 University of Aberdeen, Scotland, UK 5 6 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> 6 7 * 7 8 * This program is free software; you can redistribute it and/or modify it 8 9 * under the terms of the GNU General Public License as published by the 9 10 * Free Software Foundation; version 2 of the License; 10 11 */ 11 - 12 - #include "ackvec.h" 13 12 #include "dccp.h" 14 - 15 - #include <linux/dccp.h> 16 - #include <linux/init.h> 17 - #include <linux/errno.h> 18 13 #include <linux/kernel.h> 19 - #include <linux/skbuff.h> 20 14 #include <linux/slab.h> 21 - 22 - #include <net/sock.h> 23 15 24 16 static struct kmem_cache *dccp_ackvec_slab; 25 17 static struct kmem_cache *dccp_ackvec_record_slab; 26 18 27 - static struct dccp_ackvec_record *dccp_ackvec_record_new(void) 28 - { 29 - struct dccp_ackvec_record *avr = 30 - kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); 31 - 32 - if (avr != NULL) 33 - INIT_LIST_HEAD(&avr->avr_node); 34 - 35 - return avr; 36 - } 37 - 38 - static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr) 39 - { 40 - if (unlikely(avr == NULL)) 41 - return; 42 - /* Check if deleting a linked record */ 43 - WARN_ON(!list_empty(&avr->avr_node)); 44 - kmem_cache_free(dccp_ackvec_record_slab, avr); 45 - } 46 - 47 - static void dccp_ackvec_insert_avr(struct dccp_ackvec *av, 48 - struct dccp_ackvec_record *avr) 49 - { 50 - /* 51 - * AVRs are sorted by seqno. Since we are sending them in order, we 52 - * just add the AVR at the head of the list. 53 - * -sorbo. 
54 - */ 55 - if (!list_empty(&av->av_records)) { 56 - const struct dccp_ackvec_record *head = 57 - list_entry(av->av_records.next, 58 - struct dccp_ackvec_record, 59 - avr_node); 60 - BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno)); 61 - } 62 - 63 - list_add(&avr->avr_node, &av->av_records); 64 - } 65 - 66 - int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) 67 - { 68 - struct dccp_sock *dp = dccp_sk(sk); 69 - struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; 70 - /* Figure out how many options do we need to represent the ackvec */ 71 - const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN); 72 - u16 len = av->av_vec_len + 2 * nr_opts, i; 73 - u32 elapsed_time; 74 - const unsigned char *tail, *from; 75 - unsigned char *to; 76 - struct dccp_ackvec_record *avr; 77 - suseconds_t delta; 78 - 79 - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) 80 - return -1; 81 - 82 - delta = ktime_us_delta(ktime_get_real(), av->av_time); 83 - elapsed_time = delta / 10; 84 - 85 - if (elapsed_time != 0 && 86 - dccp_insert_option_elapsed_time(sk, skb, elapsed_time)) 87 - return -1; 88 - 89 - avr = dccp_ackvec_record_new(); 90 - if (avr == NULL) 91 - return -1; 92 - 93 - DCCP_SKB_CB(skb)->dccpd_opt_len += len; 94 - 95 - to = skb_push(skb, len); 96 - len = av->av_vec_len; 97 - from = av->av_buf + av->av_buf_head; 98 - tail = av->av_buf + DCCP_MAX_ACKVEC_LEN; 99 - 100 - for (i = 0; i < nr_opts; ++i) { 101 - int copylen = len; 102 - 103 - if (len > DCCP_MAX_ACKVEC_OPT_LEN) 104 - copylen = DCCP_MAX_ACKVEC_OPT_LEN; 105 - 106 - *to++ = DCCPO_ACK_VECTOR_0; 107 - *to++ = copylen + 2; 108 - 109 - /* Check if buf_head wraps */ 110 - if (from + copylen > tail) { 111 - const u16 tailsize = tail - from; 112 - 113 - memcpy(to, from, tailsize); 114 - to += tailsize; 115 - len -= tailsize; 116 - copylen -= tailsize; 117 - from = av->av_buf; 118 - } 119 - 120 - memcpy(to, from, copylen); 121 - from += copylen; 122 - to += copylen; 123 - len -= 
copylen; 124 - } 125 - 126 - /* 127 - * From RFC 4340, A.2: 128 - * 129 - * For each acknowledgement it sends, the HC-Receiver will add an 130 - * acknowledgement record. ack_seqno will equal the HC-Receiver 131 - * sequence number it used for the ack packet; ack_ptr will equal 132 - * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will 133 - * equal buf_nonce. 134 - */ 135 - avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; 136 - avr->avr_ack_ptr = av->av_buf_head; 137 - avr->avr_ack_ackno = av->av_buf_ackno; 138 - avr->avr_ack_nonce = av->av_buf_nonce; 139 - avr->avr_sent_len = av->av_vec_len; 140 - 141 - dccp_ackvec_insert_avr(av, avr); 142 - 143 - dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, " 144 - "ack_ackno=%llu\n", 145 - dccp_role(sk), avr->avr_sent_len, 146 - (unsigned long long)avr->avr_ack_seqno, 147 - (unsigned long long)avr->avr_ack_ackno); 148 - return 0; 149 - } 150 - 151 19 struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) 152 20 { 153 - struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority); 21 + struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); 154 22 155 23 if (av != NULL) { 156 - av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1; 157 - av->av_buf_ackno = UINT48_MAX + 1; 158 - av->av_buf_nonce = 0; 159 - av->av_time = ktime_set(0, 0); 160 - av->av_vec_len = 0; 24 + av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; 161 25 INIT_LIST_HEAD(&av->av_records); 162 26 } 163 - 164 27 return av; 28 + } 29 + 30 + static void dccp_ackvec_purge_records(struct dccp_ackvec *av) 31 + { 32 + struct dccp_ackvec_record *cur, *next; 33 + 34 + list_for_each_entry_safe(cur, next, &av->av_records, avr_node) 35 + kmem_cache_free(dccp_ackvec_record_slab, cur); 36 + INIT_LIST_HEAD(&av->av_records); 165 37 } 166 38 167 39 void dccp_ackvec_free(struct dccp_ackvec *av) 168 40 { 169 - if (unlikely(av == NULL)) 170 - return; 171 - 172 - if (!list_empty(&av->av_records)) { 173 - struct dccp_ackvec_record 
*avr, *next; 174 - 175 - list_for_each_entry_safe(avr, next, &av->av_records, avr_node) { 176 - list_del_init(&avr->avr_node); 177 - dccp_ackvec_record_delete(avr); 178 - } 41 + if (likely(av != NULL)) { 42 + dccp_ackvec_purge_records(av); 43 + kmem_cache_free(dccp_ackvec_slab, av); 179 44 } 180 - 181 - kmem_cache_free(dccp_ackvec_slab, av); 182 45 } 183 46 184 - static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, 185 - const u32 index) 186 - { 187 - return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK; 188 - } 189 - 190 - static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, 191 - const u32 index) 192 - { 193 - return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK; 194 - } 195 - 196 - /* 197 - * If several packets are missing, the HC-Receiver may prefer to enter multiple 198 - * bytes with run length 0, rather than a single byte with a larger run length; 199 - * this simplifies table updates if one of the missing packets arrives. 47 + /** 48 + * dccp_ackvec_update_records - Record information about sent Ack Vectors 49 + * @av: Ack Vector records to update 50 + * @seqno: Sequence number of the packet carrying the Ack Vector just sent 51 + * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector 200 52 */ 201 - static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, 202 - const unsigned int packets, 203 - const unsigned char state) 53 + int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) 204 54 { 205 - unsigned int gap; 206 - long new_head; 55 + struct dccp_ackvec_record *avr; 207 56 208 - if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN) 57 + avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); 58 + if (avr == NULL) 209 59 return -ENOBUFS; 210 60 211 - gap = packets - 1; 212 - new_head = av->av_buf_head - packets; 61 + avr->avr_ack_seqno = seqno; 62 + avr->avr_ack_ptr = av->av_buf_head; 63 + avr->avr_ack_ackno = av->av_buf_ackno; 64 + avr->avr_ack_nonce = nonce_sum; 65 + 
avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); 66 + /* 67 + * When the buffer overflows, we keep no more than one record. This is 68 + * the simplest way of disambiguating sender-Acks dating from before the 69 + * overflow from sender-Acks which refer to after the overflow; a simple 70 + * solution is preferable here since we are handling an exception. 71 + */ 72 + if (av->av_overflow) 73 + dccp_ackvec_purge_records(av); 74 + /* 75 + * Since GSS is incremented for each packet, the list is automatically 76 + * arranged in descending order of @ack_seqno. 77 + */ 78 + list_add(&avr->avr_node, &av->av_records); 213 79 214 - if (new_head < 0) { 215 - if (gap > 0) { 216 - memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED, 217 - gap + new_head + 1); 218 - gap = -new_head; 219 - } 220 - new_head += DCCP_MAX_ACKVEC_LEN; 221 - } 222 - 223 - av->av_buf_head = new_head; 224 - 225 - if (gap > 0) 226 - memset(av->av_buf + av->av_buf_head + 1, 227 - DCCP_ACKVEC_STATE_NOT_RECEIVED, gap); 228 - 229 - av->av_buf[av->av_buf_head] = state; 230 - av->av_vec_len += packets; 80 + dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", 81 + (unsigned long long)avr->avr_ack_seqno, 82 + (unsigned long long)avr->avr_ack_ackno, 83 + avr->avr_ack_runlen); 231 84 return 0; 85 + } 86 + 87 + static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, 88 + const u64 ackno) 89 + { 90 + struct dccp_ackvec_record *avr; 91 + /* 92 + * Exploit that records are inserted in descending order of sequence 93 + * number, start with the oldest record first. If @ackno is `before' 94 + * the earliest ack_ackno, the packet is too old to be considered. 
95 + */ 96 + list_for_each_entry_reverse(avr, av_list, avr_node) { 97 + if (avr->avr_ack_seqno == ackno) 98 + return avr; 99 + if (before48(ackno, avr->avr_ack_seqno)) 100 + break; 101 + } 102 + return NULL; 232 103 } 233 104 234 105 /* 235 - * Implements the RFC 4340, Appendix A 106 + * Buffer index and length computation using modulo-buffersize arithmetic. 107 + * Note that, as pointers move from right to left, head is `before' tail. 236 108 */ 237 - int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, 238 - const u64 ackno, const u8 state) 109 + static inline u16 __ackvec_idx_add(const u16 a, const u16 b) 239 110 { 240 - /* 241 - * Check at the right places if the buffer is full, if it is, tell the 242 - * caller to start dropping packets till the HC-Sender acks our ACK 243 - * vectors, when we will free up space in av_buf. 244 - * 245 - * We may well decide to do buffer compression, etc, but for now lets 246 - * just drop. 247 - * 248 - * From Appendix A.1.1 (`New Packets'): 249 - * 250 - * Of course, the circular buffer may overflow, either when the 251 - * HC-Sender is sending data at a very high rate, when the 252 - * HC-Receiver's acknowledgements are not reaching the HC-Sender, 253 - * or when the HC-Sender is forgetting to acknowledge those acks 254 - * (so the HC-Receiver is unable to clean up old state). In this 255 - * case, the HC-Receiver should either compress the buffer (by 256 - * increasing run lengths when possible), transfer its state to 257 - * a larger buffer, or, as a last resort, drop all received 258 - * packets, without processing them whatsoever, until its buffer 259 - * shrinks again. 
260 - */ 261 - 262 - /* See if this is the first ackno being inserted */ 263 - if (av->av_vec_len == 0) { 264 - av->av_buf[av->av_buf_head] = state; 265 - av->av_vec_len = 1; 266 - } else if (after48(ackno, av->av_buf_ackno)) { 267 - const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno); 268 - 269 - /* 270 - * Look if the state of this packet is the same as the 271 - * previous ackno and if so if we can bump the head len. 272 - */ 273 - if (delta == 1 && 274 - dccp_ackvec_state(av, av->av_buf_head) == state && 275 - dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK) 276 - av->av_buf[av->av_buf_head]++; 277 - else if (dccp_ackvec_set_buf_head_state(av, delta, state)) 278 - return -ENOBUFS; 279 - } else { 280 - /* 281 - * A.1.2. Old Packets 282 - * 283 - * When a packet with Sequence Number S <= buf_ackno 284 - * arrives, the HC-Receiver will scan the table for 285 - * the byte corresponding to S. (Indexing structures 286 - * could reduce the complexity of this scan.) 287 - */ 288 - u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno); 289 - u32 index = av->av_buf_head; 290 - 291 - while (1) { 292 - const u8 len = dccp_ackvec_len(av, index); 293 - const u8 av_state = dccp_ackvec_state(av, index); 294 - /* 295 - * valid packets not yet in av_buf have a reserved 296 - * entry, with a len equal to 0. 297 - */ 298 - if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED && 299 - len == 0 && delta == 0) { /* Found our 300 - reserved seat! 
*/ 301 - dccp_pr_debug("Found %llu reserved seat!\n", 302 - (unsigned long long)ackno); 303 - av->av_buf[index] = state; 304 - goto out; 305 - } 306 - /* len == 0 means one packet */ 307 - if (delta < len + 1) 308 - goto out_duplicate; 309 - 310 - delta -= len + 1; 311 - if (++index == DCCP_MAX_ACKVEC_LEN) 312 - index = 0; 313 - } 314 - } 315 - 316 - av->av_buf_ackno = ackno; 317 - av->av_time = ktime_get_real(); 318 - out: 319 - return 0; 320 - 321 - out_duplicate: 322 - /* Duplicate packet */ 323 - dccp_pr_debug("Received a dup or already considered lost " 324 - "packet: %llu\n", (unsigned long long)ackno); 325 - return -EILSEQ; 111 + return (a + b) % DCCPAV_MAX_ACKVEC_LEN; 326 112 } 327 113 328 - static void dccp_ackvec_throw_record(struct dccp_ackvec *av, 329 - struct dccp_ackvec_record *avr) 114 + static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) 330 115 { 331 - struct dccp_ackvec_record *next; 332 - 333 - /* sort out vector length */ 334 - if (av->av_buf_head <= avr->avr_ack_ptr) 335 - av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head; 336 - else 337 - av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 - 338 - av->av_buf_head + avr->avr_ack_ptr; 339 - 340 - /* free records */ 341 - list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { 342 - list_del_init(&avr->avr_node); 343 - dccp_ackvec_record_delete(avr); 344 - } 116 + return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); 345 117 } 346 118 347 - void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, 348 - const u64 ackno) 119 + u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) 349 120 { 350 - struct dccp_ackvec_record *avr; 351 - 352 - /* 353 - * If we traverse backwards, it should be faster when we have large 354 - * windows. We will be receiving ACKs for stuff we sent a while back 355 - * -sorbo. 
356 - */ 357 - list_for_each_entry_reverse(avr, &av->av_records, avr_node) { 358 - if (ackno == avr->avr_ack_seqno) { 359 - dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, " 360 - "ack_ackno=%llu, ACKED!\n", 361 - dccp_role(sk), 1, 362 - (unsigned long long)avr->avr_ack_seqno, 363 - (unsigned long long)avr->avr_ack_ackno); 364 - dccp_ackvec_throw_record(av, avr); 365 - break; 366 - } else if (avr->avr_ack_seqno > ackno) 367 - break; /* old news */ 368 - } 121 + if (unlikely(av->av_overflow)) 122 + return DCCPAV_MAX_ACKVEC_LEN; 123 + return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); 369 124 } 370 125 371 - static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, 372 - struct sock *sk, u64 *ackno, 373 - const unsigned char len, 374 - const unsigned char *vector) 126 + /** 127 + * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 128 + * @av: non-empty buffer to update 129 + * @distance: negative or zero distance of @seqno from buf_ackno downward 130 + * @seqno: the (old) sequence number whose record is to be updated 131 + * @state: state in which packet carrying @seqno was received 132 + */ 133 + static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, 134 + u64 seqno, enum dccp_ackvec_states state) 375 135 { 376 - unsigned char i; 377 - struct dccp_ackvec_record *avr; 136 + u16 ptr = av->av_buf_head; 378 137 379 - /* Check if we actually sent an ACK vector */ 380 - if (list_empty(&av->av_records)) 138 + BUG_ON(distance > 0); 139 + if (unlikely(dccp_ackvec_is_empty(av))) 381 140 return; 382 141 383 - i = len; 384 - /* 385 - * XXX 386 - * I think it might be more efficient to work backwards. See comment on 387 - * rcv_ackno. -sorbo. 
388 - */ 389 - avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node); 390 - while (i--) { 391 - const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; 392 - u64 ackno_end_rl; 142 + do { 143 + u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); 393 144 394 - dccp_set_seqno(&ackno_end_rl, *ackno - rl); 395 - 396 - /* 397 - * If our AVR sequence number is greater than the ack, go 398 - * forward in the AVR list until it is not so. 399 - */ 400 - list_for_each_entry_from(avr, &av->av_records, avr_node) { 401 - if (!after48(avr->avr_ack_seqno, *ackno)) 402 - goto found; 403 - } 404 - /* End of the av_records list, not found, exit */ 405 - break; 406 - found: 407 - if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) { 408 - const u8 state = *vector & DCCP_ACKVEC_STATE_MASK; 409 - if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) { 410 - dccp_pr_debug("%s ACK vector 0, len=%d, " 411 - "ack_seqno=%llu, ack_ackno=%llu, " 412 - "ACKED!\n", 413 - dccp_role(sk), len, 414 - (unsigned long long) 415 - avr->avr_ack_seqno, 416 - (unsigned long long) 417 - avr->avr_ack_ackno); 418 - dccp_ackvec_throw_record(av, avr); 419 - break; 420 - } 145 + if (distance + runlen >= 0) { 421 146 /* 422 - * If it wasn't received, continue scanning... we might 423 - * find another one. 147 + * Only update the state if packet has not been received 148 + * yet. This is OK as per the second table in RFC 4340, 149 + * 11.4.1; i.e. here we are using the following table: 150 + * RECEIVED 151 + * 0 1 3 152 + * S +---+---+---+ 153 + * T 0 | 0 | 0 | 0 | 154 + * O +---+---+---+ 155 + * R 1 | 1 | 1 | 1 | 156 + * E +---+---+---+ 157 + * D 3 | 0 | 1 | 3 | 158 + * +---+---+---+ 159 + * The "Not Received" state was set by reserve_seats(). 
424 160 	 */
161 + 		if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
162 + 			av->av_buf[ptr] = state;
163 + 		else
164 + 			dccp_pr_debug("Not changing %llu state to %u\n",
165 + 				      (unsigned long long)seqno, state);
166 + 		break;
425 167 	}
426 168
427 - 	dccp_set_seqno(ackno, ackno_end_rl - 1);
428 - 	++vector;
169 + 		distance += runlen + 1;
170 + 		ptr	  = __ackvec_idx_add(ptr, 1);
171 +
172 + 	} while (ptr != av->av_buf_tail);
173 + }
174 +
175 + /* Mark @num entries after buf_head as "Not yet received". */
176 + static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
177 + {
178 + 	u16 start = __ackvec_idx_add(av->av_buf_head, 1),
179 + 	    len = DCCPAV_MAX_ACKVEC_LEN - start;
180 +
181 + 	/* check for buffer wrap-around */
182 + 	if (num > len) {
183 + 		memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
184 + 		start = 0;
185 + 		num -= len;
186 + 	}
187 + 	if (num)
188 + 		memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
189 + }
190 +
191 + /**
192 + * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer
193 + * @av: container of buffer to update (can be empty or non-empty)
194 + * @num_packets: number of packets to register (must be >= 1)
195 + * @seqno: sequence number of the first packet in @num_packets
196 + * @state: state in which packet carrying @seqno was received
197 + */
198 + static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
199 + 				u64 seqno, enum dccp_ackvec_states state)
200 + {
201 + 	u32 num_cells = num_packets;
202 +
203 + 	if (num_packets > DCCPAV_BURST_THRESH) {
204 + 		u32 lost_packets = num_packets - 1;
205 +
206 + 		DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
207 + 		/*
208 + 		 * We received 1 packet and have a loss of size "num_packets-1"
209 + 		 * which we squeeze into num_cells-1 rather than reserving an
210 + 		 * entire byte for each lost packet.
211 + 		 * The reason is that the vector grows in O(burst_length); when
212 + 		 * it grows too large there will be no room left for the payload.
213 + * This is a trade-off: if a few packets out of the burst show 214 + * up later, their state will not be changed; it is simply too 215 + * costly to reshuffle/reallocate/copy the buffer each time. 216 + * Should such problems persist, we will need to switch to a 217 + * different underlying data structure. 218 + */ 219 + for (num_packets = num_cells = 1; lost_packets; ++num_cells) { 220 + u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN); 221 + 222 + av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); 223 + av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; 224 + 225 + lost_packets -= len; 226 + } 227 + } 228 + 229 + if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { 230 + DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n"); 231 + av->av_overflow = true; 232 + } 233 + 234 + av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets); 235 + if (av->av_overflow) 236 + av->av_buf_tail = av->av_buf_head; 237 + 238 + av->av_buf[av->av_buf_head] = state; 239 + av->av_buf_ackno = seqno; 240 + 241 + if (num_packets > 1) 242 + dccp_ackvec_reserve_seats(av, num_packets - 1); 243 + } 244 + 245 + /** 246 + * dccp_ackvec_input - Register incoming packet in the buffer 247 + */ 248 + void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) 249 + { 250 + u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; 251 + enum dccp_ackvec_states state = DCCPAV_RECEIVED; 252 + 253 + if (dccp_ackvec_is_empty(av)) { 254 + dccp_ackvec_add_new(av, 1, seqno, state); 255 + av->av_tail_ackno = seqno; 256 + 257 + } else { 258 + s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); 259 + u8 *current_head = av->av_buf + av->av_buf_head; 260 + 261 + if (num_packets == 1 && 262 + dccp_ackvec_state(current_head) == state && 263 + dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) { 264 + 265 + *current_head += 1; 266 + av->av_buf_ackno = seqno; 267 + 268 + } else if (num_packets > 0) { 269 + dccp_ackvec_add_new(av, num_packets, seqno, state); 270 
+ 		} else {
271 + 			dccp_ackvec_update_old(av, num_packets, seqno, state);
272 + 		}
429 273 	}
430 274 }
431 275
432 - int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
433 - 		      u64 *ackno, const u8 opt, const u8 *value, const u8 len)
434 - {
435 - 	if (len > DCCP_MAX_ACKVEC_OPT_LEN)
436 - 		return -1;
276 + /**
277 + * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection
278 + * This routine is called when the peer acknowledges the receipt of Ack Vectors
279 + * up to and including @ackno. While based on section A.3 of RFC 4340, here
280 + * are additional precautions to prevent corrupted buffer state. In particular,
281 + * we use tail_ackno to identify outdated records; it always marks the earliest
282 + * packet of group (2) in 11.4.2.
283 + */
284 + void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
285 + {
286 + 	struct dccp_ackvec_record *avr, *next;
287 + 	u8 runlen_now, eff_runlen;
288 + 	s64 delta;
437 289
438 - 	/* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */
439 - 	dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk,
440 - 					ackno, len, value);
290 + 	avr = dccp_ackvec_lookup(&av->av_records, ackno);
291 + 	if (avr == NULL)
292 + 		return;
293 + 	/*
294 + 	 * Deal with outdated acknowledgments: this arises when e.g. there are
295 + 	 * several old records and the acks from the peer come in slowly. In
296 + 	 * that case we may still have records that pre-date tail_ackno.
297 + 	 */
298 + 	delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
299 + 	if (delta < 0)
300 + 		goto free_records;
301 + 	/*
302 + 	 * Deal with overlapping Ack Vectors: don't subtract more than the
303 + 	 * number of packets between tail_ackno and ack_ackno.
304 + 	 */
305 + 	eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
306 +
307 + 	runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
308 + 	/*
309 + 	 * The run length of Ack Vector cells does not decrease over time.
If 310 + * the run length is the same as at the time the Ack Vector was sent, we 311 + * free the ack_ptr cell. That cell can however not be freed if the run 312 + * length has increased: in this case we need to move the tail pointer 313 + * backwards (towards higher indices), to its next-oldest neighbour. 314 + */ 315 + if (runlen_now > eff_runlen) { 316 + 317 + av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; 318 + av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); 319 + 320 + /* This move may not have cleared the overflow flag. */ 321 + if (av->av_overflow) 322 + av->av_overflow = (av->av_buf_head == av->av_buf_tail); 323 + } else { 324 + av->av_buf_tail = avr->avr_ack_ptr; 325 + /* 326 + * We have made sure that avr points to a valid cell within the 327 + * buffer. This cell is either older than head, or equals head 328 + * (empty buffer): in both cases we no longer have any overflow. 329 + */ 330 + av->av_overflow = 0; 331 + } 332 + 333 + /* 334 + * The peer has acknowledged up to and including ack_ackno. Hence the 335 + * first packet in group (2) of 11.4.2 is the successor of ack_ackno. 
336 + */ 337 + av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); 338 + 339 + free_records: 340 + list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { 341 + list_del(&avr->avr_node); 342 + kmem_cache_free(dccp_ackvec_record_slab, avr); 343 + } 344 + } 345 + 346 + /* 347 + * Routines to keep track of Ack Vectors received in an skb 348 + */ 349 + int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce) 350 + { 351 + struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); 352 + 353 + if (new == NULL) 354 + return -ENOBUFS; 355 + new->vec = vec; 356 + new->len = len; 357 + new->nonce = nonce; 358 + 359 + list_add_tail(&new->node, head); 441 360 return 0; 442 361 } 362 + EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add); 363 + 364 + void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks) 365 + { 366 + struct dccp_ackvec_parsed *cur, *next; 367 + 368 + list_for_each_entry_safe(cur, next, parsed_chunks, node) 369 + kfree(cur); 370 + INIT_LIST_HEAD(parsed_chunks); 371 + } 372 + EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); 443 373 444 374 int __init dccp_ackvec_init(void) 445 375 { ··· 379 449 if (dccp_ackvec_slab == NULL) 380 450 goto out_err; 381 451 382 - dccp_ackvec_record_slab = 383 - kmem_cache_create("dccp_ackvec_record", 384 - sizeof(struct dccp_ackvec_record), 385 - 0, SLAB_HWCACHE_ALIGN, NULL); 452 + dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record", 453 + sizeof(struct dccp_ackvec_record), 454 + 0, SLAB_HWCACHE_ALIGN, NULL); 386 455 if (dccp_ackvec_record_slab == NULL) 387 456 goto out_destroy_slab; 388 457
+91 -113
net/dccp/ackvec.h
··· 3 3 /* 4 4 * net/dccp/ackvec.h 5 5 * 6 - * An implementation of the DCCP protocol 6 + * An implementation of Ack Vectors for the DCCP protocol 7 + * Copyright (c) 2007 University of Aberdeen, Scotland, UK 7 8 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com> 8 - * 9 9 * This program is free software; you can redistribute it and/or modify it 10 10 * under the terms of the GNU General Public License version 2 as 11 11 * published by the Free Software Foundation. 12 12 */ 13 13 14 + #include <linux/dccp.h> 14 15 #include <linux/compiler.h> 15 - #include <linux/ktime.h> 16 16 #include <linux/list.h> 17 17 #include <linux/types.h> 18 18 19 - /* Read about the ECN nonce to see why it is 253 */ 20 - #define DCCP_MAX_ACKVEC_OPT_LEN 253 21 - /* We can spread an ack vector across multiple options */ 22 - #define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2) 19 + /* 20 + * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN, 21 + * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 22 + * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives 23 + * more headroom if Ack Ratio is higher or when the sender acknowledges slowly. 24 + * The maximum value is bounded by the u16 types for indices and functions. 
25 + */ 26 + #define DCCPAV_NUM_ACKVECS 2 27 + #define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS) 23 28 24 - #define DCCP_ACKVEC_STATE_RECEIVED 0 25 - #define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) 26 - #define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6) 29 + /* Estimated minimum average Ack Vector length - used for updating MPS */ 30 + #define DCCPAV_MIN_OPTLEN 16 27 31 28 - #define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */ 29 - #define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */ 32 + /* Threshold for coping with large bursts of losses */ 33 + #define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8) 30 34 31 - /** struct dccp_ackvec - ack vector 35 + enum dccp_ackvec_states { 36 + DCCPAV_RECEIVED = 0x00, 37 + DCCPAV_ECN_MARKED = 0x40, 38 + DCCPAV_RESERVED = 0x80, 39 + DCCPAV_NOT_RECEIVED = 0xC0 40 + }; 41 + #define DCCPAV_MAX_RUNLEN 0x3F 42 + 43 + static inline u8 dccp_ackvec_runlen(const u8 *cell) 44 + { 45 + return *cell & DCCPAV_MAX_RUNLEN; 46 + } 47 + 48 + static inline u8 dccp_ackvec_state(const u8 *cell) 49 + { 50 + return *cell & ~DCCPAV_MAX_RUNLEN; 51 + } 52 + 53 + /** struct dccp_ackvec - Ack Vector main data structure 32 54 * 33 - * This data structure is the one defined in RFC 4340, Appendix A. 55 + * This implements a fixed-size circular buffer within an array and is largely 56 + * based on Appendix A of RFC 4340. 34 57 * 35 - * @av_buf_head - circular buffer head 36 - * @av_buf_tail - circular buffer tail 37 - * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the 38 - * buffer (i.e. %av_buf_head) 39 - * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked 40 - * by the buffer with State 0 41 - * 42 - * Additionally, the HC-Receiver must keep some information about the 43 - * Ack Vectors it has recently sent. 
For each packet sent carrying an 44 - * Ack Vector, it remembers four variables: 45 - * 46 - * @av_records - list of dccp_ackvec_record 47 - * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. 48 - * 49 - * @av_time - the time in usecs 50 - * @av_buf - circular buffer of acknowledgeable packets 58 + * @av_buf: circular buffer storage area 59 + * @av_buf_head: head index; begin of live portion in @av_buf 60 + * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf 61 + * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf 62 + * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf 63 + * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to 64 + * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf 65 + * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound 66 + * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) 51 67 */ 52 68 struct dccp_ackvec { 53 - u64 av_buf_ackno; 54 - struct list_head av_records; 55 - ktime_t av_time; 69 + u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; 56 70 u16 av_buf_head; 57 - u16 av_vec_len; 58 - u8 av_buf_nonce; 59 - u8 av_ack_nonce; 60 - u8 av_buf[DCCP_MAX_ACKVEC_LEN]; 71 + u16 av_buf_tail; 72 + u64 av_buf_ackno:48; 73 + u64 av_tail_ackno:48; 74 + bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; 75 + u8 av_overflow:1; 76 + struct list_head av_records; 61 77 }; 62 78 63 - /** struct dccp_ackvec_record - ack vector record 79 + /** struct dccp_ackvec_record - Records information about sent Ack Vectors 64 80 * 65 - * ACK vector record as defined in Appendix A of spec. 81 + * These list entries define the additional information which the HC-Receiver 82 + * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A. 
66 83 * 67 - * The list is sorted by avr_ack_seqno 84 + * @avr_node: the list node in @av_records 85 + * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on 86 + * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to 87 + * @avr_ack_ptr: pointer into @av_buf where this record starts 88 + * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending 89 + * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent 68 90 * 69 - * @avr_node - node in av_records 70 - * @avr_ack_seqno - sequence number of the packet this record was sent on 71 - * @avr_ack_ackno - sequence number being acknowledged 72 - * @avr_ack_ptr - pointer into av_buf where this record starts 73 - * @avr_ack_nonce - av_ack_nonce at the time this record was sent 74 - * @avr_sent_len - lenght of the record in av_buf 91 + * The list as a whole is sorted in descending order by @avr_ack_seqno. 75 92 */ 76 93 struct dccp_ackvec_record { 77 94 struct list_head avr_node; 78 - u64 avr_ack_seqno; 79 - u64 avr_ack_ackno; 95 + u64 avr_ack_seqno:48; 96 + u64 avr_ack_ackno:48; 80 97 u16 avr_ack_ptr; 81 - u16 avr_sent_len; 82 - u8 avr_ack_nonce; 98 + u8 avr_ack_runlen; 99 + u8 avr_ack_nonce:1; 83 100 }; 84 101 85 - struct sock; 86 - struct sk_buff; 87 - 88 - #ifdef CONFIG_IP_DCCP_ACKVEC 89 - extern int dccp_ackvec_init(void); 102 + extern int dccp_ackvec_init(void); 90 103 extern void dccp_ackvec_exit(void); 91 104 92 105 extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); 93 106 extern void dccp_ackvec_free(struct dccp_ackvec *av); 94 107 95 - extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, 96 - const u64 ackno, const u8 state); 108 + extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); 109 + extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); 110 + extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); 111 + extern u16 
dccp_ackvec_buflen(const struct dccp_ackvec *av); 97 112 98 - extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, 99 - struct sock *sk, const u64 ackno); 100 - extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, 101 - u64 *ackno, const u8 opt, 102 - const u8 *value, const u8 len); 103 - 104 - extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb); 105 - 106 - static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) 113 + static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) 107 114 { 108 - return av->av_vec_len; 109 - } 110 - #else /* CONFIG_IP_DCCP_ACKVEC */ 111 - static inline int dccp_ackvec_init(void) 112 - { 113 - return 0; 115 + return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; 114 116 } 115 117 116 - static inline void dccp_ackvec_exit(void) 117 - { 118 - } 118 + /** 119 + * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb 120 + * @vec: start of vector (offset into skb) 121 + * @len: length of @vec 122 + * @nonce: whether @vec had an ECN nonce of 0 or 1 123 + * @node: FIFO - arranged in descending order of ack_ackno 124 + * This structure is used by CCIDs to access Ack Vectors in a received skb. 
125 + */ 126 + struct dccp_ackvec_parsed { 127 + u8 *vec, 128 + len, 129 + nonce:1; 130 + struct list_head node; 131 + }; 119 132 120 - static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) 121 - { 122 - return NULL; 123 - } 124 - 125 - static inline void dccp_ackvec_free(struct dccp_ackvec *av) 126 - { 127 - } 128 - 129 - static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, 130 - const u64 ackno, const u8 state) 131 - { 132 - return -1; 133 - } 134 - 135 - static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, 136 - struct sock *sk, const u64 ackno) 137 - { 138 - } 139 - 140 - static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, 141 - const u64 *ackno, const u8 opt, 142 - const u8 *value, const u8 len) 143 - { 144 - return -1; 145 - } 146 - 147 - static inline int dccp_insert_option_ackvec(const struct sock *sk, 148 - const struct sk_buff *skb) 149 - { 150 - return -1; 151 - } 152 - 153 - static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) 154 - { 155 - return 0; 156 - } 157 - #endif /* CONFIG_IP_DCCP_ACKVEC */ 133 + extern int dccp_ackvec_parsed_add(struct list_head *head, 134 + u8 *vec, u8 len, u8 nonce); 135 + extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks); 158 136 #endif /* _ACKVEC_H */
+77 -24
net/dccp/ccid.c
··· 13 13 14 14 #include "ccid.h" 15 15 16 + static u8 builtin_ccids[] = { 17 + DCCPC_CCID2, /* CCID2 is supported by default */ 18 + #if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE) 19 + DCCPC_CCID3, 20 + #endif 21 + }; 22 + 16 23 static struct ccid_operations *ccids[CCID_MAX]; 17 24 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) 18 25 static atomic_t ccids_lockct = ATOMIC_INIT(0); ··· 93 86 } 94 87 } 95 88 89 + /* check that up to @array_len members in @ccid_array are supported */ 90 + bool ccid_support_check(u8 const *ccid_array, u8 array_len) 91 + { 92 + u8 i, j, found; 93 + 94 + for (i = 0, found = 0; i < array_len; i++, found = 0) { 95 + for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++) 96 + found = (ccid_array[i] == builtin_ccids[j]); 97 + if (!found) 98 + return false; 99 + } 100 + return true; 101 + } 102 + 103 + /** 104 + * ccid_get_builtin_ccids - Provide copy of `builtin' CCID array 105 + * @ccid_array: pointer to copy into 106 + * @array_len: value to return length into 107 + * This function allocates memory - caller must see that it is freed after use. 
108 + */ 109 + int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) 110 + { 111 + *ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any()); 112 + if (*ccid_array == NULL) 113 + return -ENOBUFS; 114 + *array_len = ARRAY_SIZE(builtin_ccids); 115 + return 0; 116 + } 117 + 118 + int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, 119 + char __user *optval, int __user *optlen) 120 + { 121 + if (len < sizeof(builtin_ccids)) 122 + return -EINVAL; 123 + 124 + if (put_user(sizeof(builtin_ccids), optlen) || 125 + copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids))) 126 + return -EFAULT; 127 + return 0; 128 + } 129 + 96 130 int ccid_register(struct ccid_operations *ccid_ops) 97 131 { 98 132 int err = -ENOBUFS; ··· 196 148 197 149 EXPORT_SYMBOL_GPL(ccid_unregister); 198 150 151 + /** 152 + * ccid_request_module - Pre-load CCID module for later use 153 + * This should be called only from process context (e.g. during connection 154 + * setup) and is necessary for later calls to ccid_new (typically in software 155 + * interrupt), so that it has the modules available when they are needed. 
156 + */ 157 + static int ccid_request_module(u8 id) 158 + { 159 + if (!in_atomic()) { 160 + ccids_read_lock(); 161 + if (ccids[id] == NULL) { 162 + ccids_read_unlock(); 163 + return request_module("net-dccp-ccid-%d", id); 164 + } 165 + ccids_read_unlock(); 166 + } 167 + return 0; 168 + } 169 + 170 + int ccid_request_modules(u8 const *ccid_array, u8 array_len) 171 + { 172 + #ifdef CONFIG_KMOD 173 + while (array_len--) 174 + if (ccid_request_module(ccid_array[array_len])) 175 + return -1; 176 + #endif 177 + return 0; 178 + } 179 + 199 180 struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp) 200 181 { 201 182 struct ccid_operations *ccid_ops; 202 183 struct ccid *ccid = NULL; 203 184 204 185 ccids_read_lock(); 205 - #ifdef CONFIG_KMOD 206 - if (ccids[id] == NULL) { 207 - /* We only try to load if in process context */ 208 - ccids_read_unlock(); 209 - if (gfp & GFP_ATOMIC) 210 - goto out; 211 - request_module("net-dccp-ccid-%d", id); 212 - ccids_read_lock(); 213 - } 214 - #endif 215 186 ccid_ops = ccids[id]; 216 187 if (ccid_ops == NULL) 217 188 goto out_unlock; ··· 271 204 } 272 205 273 206 EXPORT_SYMBOL_GPL(ccid_new); 274 - 275 - struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp) 276 - { 277 - return ccid_new(id, sk, 1, gfp); 278 - } 279 - 280 - EXPORT_SYMBOL_GPL(ccid_hc_rx_new); 281 - 282 - struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp) 283 - { 284 - return ccid_new(id, sk, 0, gfp); 285 - } 286 - 287 - EXPORT_SYMBOL_GPL(ccid_hc_tx_new); 288 207 289 208 static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) 290 209 {
+80 -33
net/dccp/ccid.h
··· 60 60 void (*ccid_hc_tx_exit)(struct sock *sk); 61 61 void (*ccid_hc_rx_packet_recv)(struct sock *sk, 62 62 struct sk_buff *skb); 63 - int (*ccid_hc_rx_parse_options)(struct sock *sk, 64 - unsigned char option, 65 - unsigned char len, u16 idx, 66 - unsigned char* value); 63 + int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, 64 + u8 opt, u8 *val, u8 len); 67 65 int (*ccid_hc_rx_insert_options)(struct sock *sk, 68 66 struct sk_buff *skb); 69 67 void (*ccid_hc_tx_packet_recv)(struct sock *sk, 70 68 struct sk_buff *skb); 71 - int (*ccid_hc_tx_parse_options)(struct sock *sk, 72 - unsigned char option, 73 - unsigned char len, u16 idx, 74 - unsigned char* value); 69 + int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, 70 + u8 opt, u8 *val, u8 len); 75 71 int (*ccid_hc_tx_send_packet)(struct sock *sk, 76 72 struct sk_buff *skb); 77 73 void (*ccid_hc_tx_packet_sent)(struct sock *sk, 78 - int more, unsigned int len); 74 + unsigned int len); 79 75 void (*ccid_hc_rx_get_info)(struct sock *sk, 80 76 struct tcp_info *info); 81 77 void (*ccid_hc_tx_get_info)(struct sock *sk, ··· 99 103 return (void *)ccid->ccid_priv; 100 104 } 101 105 106 + extern bool ccid_support_check(u8 const *ccid_array, u8 array_len); 107 + extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); 108 + extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, 109 + char __user *, int __user *); 110 + 111 + extern int ccid_request_modules(u8 const *ccid_array, u8 array_len); 102 112 extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, 103 113 gfp_t gfp); 104 114 105 - extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, 106 - gfp_t gfp); 107 - extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk, 108 - gfp_t gfp); 115 + static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) 116 + { 117 + struct ccid *ccid = dp->dccps_hc_rx_ccid; 118 + 119 + if (ccid == NULL || ccid->ccid_ops == NULL) 120 + return -1; 121 + 
return ccid->ccid_ops->ccid_id; 122 + } 123 + 124 + static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) 125 + { 126 + struct ccid *ccid = dp->dccps_hc_tx_ccid; 127 + 128 + if (ccid == NULL || ccid->ccid_ops == NULL) 129 + return -1; 130 + return ccid->ccid_ops->ccid_id; 131 + } 109 132 110 133 extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); 111 134 extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); 112 135 136 + /* 137 + * Congestion control of queued data packets via CCID decision. 138 + * 139 + * The TX CCID performs its congestion-control by indicating whether and when a 140 + * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). 141 + * The following modes are supported via the symbolic constants below: 142 + * - timer-based pacing (CCID returns a delay value in milliseconds); 143 + * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). 144 + */ 145 + 146 + enum ccid_dequeueing_decision { 147 + CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ 148 + CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ 149 + CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ 150 + CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ 151 + CCID_PACKET_ERR = 0xF0000, /* error condition */ 152 + }; 153 + 154 + static inline int ccid_packet_dequeue_eval(const int return_code) 155 + { 156 + if (return_code < 0) 157 + return CCID_PACKET_ERR; 158 + if (return_code == 0) 159 + return CCID_PACKET_SEND_AT_ONCE; 160 + if (return_code <= CCID_PACKET_DELAY_MAX) 161 + return CCID_PACKET_DELAY; 162 + return return_code; 163 + } 164 + 113 165 static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, 114 166 struct sk_buff *skb) 115 167 { 116 - int rc = 0; 117 168 if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) 118 - rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); 119 - return rc; 169 + return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, 
skb); 170 + return CCID_PACKET_SEND_AT_ONCE; 120 171 } 121 172 122 173 static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, 123 - int more, unsigned int len) 174 + unsigned int len) 124 175 { 125 176 if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) 126 - ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len); 177 + ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); 127 178 } 128 179 129 180 static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, ··· 187 144 ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); 188 145 } 189 146 147 + /** 148 + * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver 149 + * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) 150 + * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) 151 + * @val: value of @opt 152 + * @len: length of @val in bytes 153 + */ 190 154 static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, 191 - unsigned char option, 192 - unsigned char len, u16 idx, 193 - unsigned char* value) 155 + u8 pkt, u8 opt, u8 *val, u8 len) 194 156 { 195 - int rc = 0; 196 - if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL) 197 - rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx, 198 - value); 199 - return rc; 157 + if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL) 158 + return 0; 159 + return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); 200 160 } 201 161 162 + /** 163 + * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender 164 + * Arguments are analogous to ccid_hc_tx_parse_options() 165 + */ 202 166 static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, 203 - unsigned char option, 204 - unsigned char len, u16 idx, 205 - unsigned char* value) 167 + u8 pkt, u8 opt, u8 *val, u8 len) 206 168 { 207 - int rc = 0; 208 - if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL) 209 - rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, 
idx, value); 210 - return rc; 169 + if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL) 170 + return 0; 171 + return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); 211 172 } 212 173 213 174 static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
+24 -6
net/dccp/ccids/Kconfig
··· 1 1 menu "DCCP CCIDs Configuration (EXPERIMENTAL)" 2 - depends on EXPERIMENTAL 3 2 4 3 config IP_DCCP_CCID2 5 - tristate "CCID2 (TCP-Like) (EXPERIMENTAL)" 4 + tristate "CCID2 (TCP-Like)" 6 5 def_tristate IP_DCCP 7 - select IP_DCCP_ACKVEC 8 6 ---help--- 9 7 CCID 2, TCP-like Congestion Control, denotes Additive Increase, 10 8 Multiplicative Decrease (AIMD) congestion control with behavior ··· 34 36 If in doubt, say N. 35 37 36 38 config IP_DCCP_CCID3 37 - tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)" 39 + tristate "CCID3 (TCP-Friendly)" 38 40 def_tristate IP_DCCP 39 41 select IP_DCCP_TFRC_LIB 40 42 ---help--- ··· 62 64 63 65 If in doubt, say M. 64 66 67 + if IP_DCCP_CCID3 65 68 config IP_DCCP_CCID3_DEBUG 66 69 bool "CCID3 debugging messages" 67 - depends on IP_DCCP_CCID3 68 70 ---help--- 69 71 Enable CCID3-specific debugging messages. 70 72 ··· 74 76 75 77 If in doubt, say N. 76 78 79 + choice 80 + prompt "Select method for measuring the packet size s" 81 + default IP_DCCP_CCID3_MEASURE_S_AS_MPS 82 + 83 + config IP_DCCP_CCID3_MEASURE_S_AS_MPS 84 + bool "Always use MPS in place of s" 85 + ---help--- 86 + This use is recommended as it is consistent with the initialisation 87 + of X and suggested when s varies (rfc3448bis, (1) in section 4.1). 88 + config IP_DCCP_CCID3_MEASURE_S_AS_AVG 89 + bool "Use moving average" 90 + ---help--- 91 + An alternative way of tracking s, also supported by rfc3448bis. 92 + This used to be the default for CCID-3 in previous kernels. 93 + config IP_DCCP_CCID3_MEASURE_S_AS_MAX 94 + bool "Track the maximum payload length" 95 + ---help--- 96 + An experimental method based on tracking the maximum packet size. 97 + endchoice 98 + 77 99 config IP_DCCP_CCID3_RTO 78 100 int "Use higher bound for nofeedback timer" 79 101 default 100 80 - depends on IP_DCCP_CCID3 && EXPERIMENTAL 81 102 ---help--- 82 103 Use higher lower bound for nofeedback timer expiration. 
83 104 ··· 123 106 The purpose of the nofeedback timer is to slow DCCP down when there 124 107 is serious network congestion: experimenting with larger values should 125 108 therefore not be performed on WANs. 109 + endif # IP_DCCP_CCID3 126 110 127 111 config IP_DCCP_TFRC_LIB 128 112 tristate
+250 -372
net/dccp/ccids/ccid2.c
··· 25 25 /* 26 26 * This implementation should follow RFC 4341 27 27 */ 28 - 28 + #include "../feat.h" 29 29 #include "../ccid.h" 30 30 #include "../dccp.h" 31 31 #include "ccid2.h" ··· 34 34 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG 35 35 static int ccid2_debug; 36 36 #define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) 37 - 38 - static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx) 39 - { 40 - int len = 0; 41 - int pipe = 0; 42 - struct ccid2_seq *seqp = hctx->ccid2hctx_seqh; 43 - 44 - /* there is data in the chain */ 45 - if (seqp != hctx->ccid2hctx_seqt) { 46 - seqp = seqp->ccid2s_prev; 47 - len++; 48 - if (!seqp->ccid2s_acked) 49 - pipe++; 50 - 51 - while (seqp != hctx->ccid2hctx_seqt) { 52 - struct ccid2_seq *prev = seqp->ccid2s_prev; 53 - 54 - len++; 55 - if (!prev->ccid2s_acked) 56 - pipe++; 57 - 58 - /* packets are sent sequentially */ 59 - BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq, 60 - prev->ccid2s_seq ) >= 0); 61 - BUG_ON(time_before(seqp->ccid2s_sent, 62 - prev->ccid2s_sent)); 63 - 64 - seqp = prev; 65 - } 66 - } 67 - 68 - BUG_ON(pipe != hctx->ccid2hctx_pipe); 69 - ccid2_pr_debug("len of chain=%d\n", len); 70 - 71 - do { 72 - seqp = seqp->ccid2s_prev; 73 - len++; 74 - } while (seqp != hctx->ccid2hctx_seqh); 75 - 76 - ccid2_pr_debug("total len=%d\n", len); 77 - BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN); 78 - } 79 37 #else 80 38 #define ccid2_pr_debug(format, a...) 
81 - #define ccid2_hc_tx_check_sanity(hctx) 82 39 #endif 83 40 84 41 static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) ··· 44 87 int i; 45 88 46 89 /* check if we have space to preserve the pointer to the buffer */ 47 - if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) / 48 - sizeof(struct ccid2_seq*))) 90 + if (hctx->seqbufc >= sizeof(hctx->seqbuf) / sizeof(struct ccid2_seq *)) 49 91 return -ENOMEM; 50 92 51 93 /* allocate buffer and initialize linked list */ ··· 60 104 seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; 61 105 62 106 /* This is the first allocation. Initiate the head and tail. */ 63 - if (hctx->ccid2hctx_seqbufc == 0) 64 - hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp; 107 + if (hctx->seqbufc == 0) 108 + hctx->seqh = hctx->seqt = seqp; 65 109 else { 66 110 /* link the existing list with the one we just created */ 67 - hctx->ccid2hctx_seqh->ccid2s_next = seqp; 68 - seqp->ccid2s_prev = hctx->ccid2hctx_seqh; 111 + hctx->seqh->ccid2s_next = seqp; 112 + seqp->ccid2s_prev = hctx->seqh; 69 113 70 - hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; 71 - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt; 114 + hctx->seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; 115 + seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->seqt; 72 116 } 73 117 74 118 /* store the original pointer to the buffer so we can free it */ 75 - hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp; 76 - hctx->ccid2hctx_seqbufc++; 119 + hctx->seqbuf[hctx->seqbufc] = seqp; 120 + hctx->seqbufc++; 77 121 78 122 return 0; 79 123 } 80 124 81 125 static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) 82 126 { 83 - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 84 - 85 - if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) 86 - return 0; 87 - 88 - return 1; /* XXX CCID should dequeue when ready instead of polling */ 127 + if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) 128 + return CCID_PACKET_WILL_DEQUEUE_LATER; 
129 + return CCID_PACKET_SEND_AT_ONCE; 89 130 } 90 131 91 132 static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) 92 133 { 93 134 struct dccp_sock *dp = dccp_sk(sk); 94 - u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2); 135 + u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->cwnd, 2); 95 136 96 137 /* 97 138 * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from ··· 100 147 DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); 101 148 val = max_ratio; 102 149 } 103 - if (val > 0xFFFF) /* RFC 4340, 11.3 */ 104 - val = 0xFFFF; 150 + if (val > DCCPF_ACK_RATIO_MAX) 151 + val = DCCPF_ACK_RATIO_MAX; 105 152 106 153 if (val == dp->dccps_l_ack_ratio) 107 154 return; ··· 110 157 dp->dccps_l_ack_ratio = val; 111 158 } 112 159 113 - static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val) 114 - { 115 - ccid2_pr_debug("change SRTT to %ld\n", val); 116 - hctx->ccid2hctx_srtt = val; 117 - } 118 - 119 - static void ccid2_start_rto_timer(struct sock *sk); 120 - 121 160 static void ccid2_hc_tx_rto_expire(unsigned long data) 122 161 { 123 162 struct sock *sk = (struct sock *)data; 124 163 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 125 - long s; 164 + const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); 126 165 127 166 bh_lock_sock(sk); 128 167 if (sock_owned_by_user(sk)) { 129 - sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, 130 - jiffies + HZ / 5); 168 + sk_reset_timer(sk, &hctx->rtotimer, jiffies + HZ / 5); 131 169 goto out; 132 170 } 133 171 134 172 ccid2_pr_debug("RTO_EXPIRE\n"); 135 173 136 - ccid2_hc_tx_check_sanity(hctx); 137 - 138 174 /* back-off timer */ 139 - hctx->ccid2hctx_rto <<= 1; 140 - 141 - s = hctx->ccid2hctx_rto / HZ; 142 - if (s > 60) 143 - hctx->ccid2hctx_rto = 60 * HZ; 144 - 145 - ccid2_start_rto_timer(sk); 175 + hctx->rto <<= 1; 176 + if (hctx->rto > DCCP_RTO_MAX) 177 + hctx->rto = DCCP_RTO_MAX; 146 178 147 179 /* adjust pipe, cwnd etc */ 148 - hctx->ccid2hctx_ssthresh = 
hctx->ccid2hctx_cwnd / 2; 149 - if (hctx->ccid2hctx_ssthresh < 2) 150 - hctx->ccid2hctx_ssthresh = 2; 151 - hctx->ccid2hctx_cwnd = 1; 152 - hctx->ccid2hctx_pipe = 0; 180 + hctx->ssthresh = hctx->cwnd / 2; 181 + if (hctx->ssthresh < 2) 182 + hctx->ssthresh = 2; 183 + hctx->cwnd = 1; 184 + hctx->pipe = 0; 153 185 154 186 /* clear state about stuff we sent */ 155 - hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh; 156 - hctx->ccid2hctx_packets_acked = 0; 187 + hctx->seqt = hctx->seqh; 188 + hctx->packets_acked = 0; 157 189 158 190 /* clear ack ratio state. */ 159 - hctx->ccid2hctx_rpseq = 0; 160 - hctx->ccid2hctx_rpdupack = -1; 191 + hctx->rpseq = 0; 192 + hctx->rpdupack = -1; 161 193 ccid2_change_l_ack_ratio(sk, 1); 162 - ccid2_hc_tx_check_sanity(hctx); 194 + 195 + /* if we were blocked before, we may now send cwnd=1 packet */ 196 + if (sender_was_blocked) 197 + tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); 198 + /* restart backed-off timer */ 199 + sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); 163 200 out: 164 201 bh_unlock_sock(sk); 165 202 sock_put(sk); 166 203 } 167 204 168 - static void ccid2_start_rto_timer(struct sock *sk) 169 - { 170 - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 171 - 172 - ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto); 173 - 174 - BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer)); 175 - sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, 176 - jiffies + hctx->ccid2hctx_rto); 177 - } 178 - 179 - static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) 205 + static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) 180 206 { 181 207 struct dccp_sock *dp = dccp_sk(sk); 182 208 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 183 209 struct ccid2_seq *next; 184 210 185 - hctx->ccid2hctx_pipe++; 211 + hctx->pipe++; 186 212 187 - hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss; 188 - hctx->ccid2hctx_seqh->ccid2s_acked = 0; 189 - hctx->ccid2hctx_seqh->ccid2s_sent = jiffies; 213 + 
hctx->seqh->ccid2s_seq = dp->dccps_gss; 214 + hctx->seqh->ccid2s_acked = 0; 215 + hctx->seqh->ccid2s_sent = jiffies; 190 216 191 - next = hctx->ccid2hctx_seqh->ccid2s_next; 217 + next = hctx->seqh->ccid2s_next; 192 218 /* check if we need to alloc more space */ 193 - if (next == hctx->ccid2hctx_seqt) { 219 + if (next == hctx->seqt) { 194 220 if (ccid2_hc_tx_alloc_seq(hctx)) { 195 221 DCCP_CRIT("packet history - out of memory!"); 196 222 /* FIXME: find a more graceful way to bail out */ 197 223 return; 198 224 } 199 - next = hctx->ccid2hctx_seqh->ccid2s_next; 200 - BUG_ON(next == hctx->ccid2hctx_seqt); 225 + next = hctx->seqh->ccid2s_next; 226 + BUG_ON(next == hctx->seqt); 201 227 } 202 - hctx->ccid2hctx_seqh = next; 228 + hctx->seqh = next; 203 229 204 - ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd, 205 - hctx->ccid2hctx_pipe); 230 + ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->cwnd, hctx->pipe); 206 231 207 232 /* 208 233 * FIXME: The code below is broken and the variables have been removed ··· 203 272 */ 204 273 #if 0 205 274 /* Ack Ratio. Need to maintain a concept of how many windows we sent */ 206 - hctx->ccid2hctx_arsent++; 275 + hctx->arsent++; 207 276 /* We had an ack loss in this window... */ 208 - if (hctx->ccid2hctx_ackloss) { 209 - if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) { 210 - hctx->ccid2hctx_arsent = 0; 211 - hctx->ccid2hctx_ackloss = 0; 277 + if (hctx->ackloss) { 278 + if (hctx->arsent >= hctx->cwnd) { 279 + hctx->arsent = 0; 280 + hctx->ackloss = 0; 212 281 } 213 282 } else { 214 283 /* No acks lost up to now... 
*/ ··· 218 287 int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - 219 288 dp->dccps_l_ack_ratio; 220 289 221 - denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom; 290 + denom = hctx->cwnd * hctx->cwnd / denom; 222 291 223 - if (hctx->ccid2hctx_arsent >= denom) { 292 + if (hctx->arsent >= denom) { 224 293 ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); 225 - hctx->ccid2hctx_arsent = 0; 294 + hctx->arsent = 0; 226 295 } 227 296 } else { 228 297 /* we can't increase ack ratio further [1] */ 229 - hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/ 298 + hctx->arsent = 0; /* or maybe set it to cwnd*/ 230 299 } 231 300 } 232 301 #endif 233 302 234 303 /* setup RTO timer */ 235 - if (!timer_pending(&hctx->ccid2hctx_rtotimer)) 236 - ccid2_start_rto_timer(sk); 304 + if (!timer_pending(&hctx->rtotimer)) 305 + sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); 237 306 238 307 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG 239 308 do { 240 - struct ccid2_seq *seqp = hctx->ccid2hctx_seqt; 309 + struct ccid2_seq *seqp = hctx->seqt; 241 310 242 - while (seqp != hctx->ccid2hctx_seqh) { 311 + while (seqp != hctx->seqh) { 243 312 ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", 244 313 (unsigned long long)seqp->ccid2s_seq, 245 314 seqp->ccid2s_acked, seqp->ccid2s_sent); ··· 247 316 } 248 317 } while (0); 249 318 ccid2_pr_debug("=========\n"); 250 - ccid2_hc_tx_check_sanity(hctx); 251 319 #endif 252 320 } 253 321 254 - /* XXX Lame code duplication! 255 - * returns -1 if none was found. 256 - * else returns the next offset to use in the function call. 322 + /** 323 + * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm 324 + * This code is almost identical with TCP's tcp_rtt_estimator(), since 325 + * - it has a higher sampling frequency (recommended by RFC 1323), 326 + * - the RTO does not collapse into RTT due to RTTVAR going towards zero, 327 + * - it is simple (cf. 
more complex proposals such as Eifel timer or research 328 + * which suggests that the gain should be set according to window size), 329 + * - in tests it was found to work well with CCID2 [gerrit]. 257 330 */ 258 - static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset, 259 - unsigned char **vec, unsigned char *veclen) 331 + static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) 260 332 { 261 - const struct dccp_hdr *dh = dccp_hdr(skb); 262 - unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); 263 - unsigned char *opt_ptr; 264 - const unsigned char *opt_end = (unsigned char *)dh + 265 - (dh->dccph_doff * 4); 266 - unsigned char opt, len; 267 - unsigned char *value; 333 + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 334 + long m = mrtt ? : 1; 268 335 269 - BUG_ON(offset < 0); 270 - options += offset; 271 - opt_ptr = options; 272 - if (opt_ptr >= opt_end) 273 - return -1; 336 + if (hctx->srtt == 0) { 337 + /* First measurement m */ 338 + hctx->srtt = m << 3; 339 + hctx->mdev = m << 1; 274 340 275 - while (opt_ptr != opt_end) { 276 - opt = *opt_ptr++; 277 - len = 0; 278 - value = NULL; 341 + hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev); 342 + hctx->rttvar = hctx->mdev_max; 343 + hctx->rtt_seq = dccp_sk(sk)->dccps_gss; 344 + } else { 345 + /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ 346 + m -= (hctx->srtt >> 3); 347 + hctx->srtt += m; 279 348 280 - /* Check if this isn't a single byte option */ 281 - if (opt > DCCPO_MAX_RESERVED) { 282 - if (opt_ptr == opt_end) 283 - goto out_invalid_option; 284 - 285 - len = *opt_ptr++; 286 - if (len < 3) 287 - goto out_invalid_option; 349 + /* Similarly, update scaled mdev with regard to |m| */ 350 + if (m < 0) { 351 + m = -m; 352 + m -= (hctx->mdev >> 2); 288 353 /* 289 - * Remove the type and len fields, leaving 290 - * just the value size 354 + * This neutralises RTO increase when RTT < SRTT - mdev 355 + * (see P. Sarolahti, A. 
Kuznetsov,"Congestion Control 356 + * in Linux TCP", USENIX 2002, pp. 49-62). 291 357 */ 292 - len -= 2; 293 - value = opt_ptr; 294 - opt_ptr += len; 295 - 296 - if (opt_ptr > opt_end) 297 - goto out_invalid_option; 298 - } 299 - 300 - switch (opt) { 301 - case DCCPO_ACK_VECTOR_0: 302 - case DCCPO_ACK_VECTOR_1: 303 - *vec = value; 304 - *veclen = len; 305 - return offset + (opt_ptr - options); 306 - } 307 - } 308 - 309 - return -1; 310 - 311 - out_invalid_option: 312 - DCCP_BUG("Invalid option - this should not happen (previous parsing)!"); 313 - return -1; 314 - } 315 - 316 - static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) 317 - { 318 - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 319 - 320 - sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer); 321 - ccid2_pr_debug("deleted RTO timer\n"); 322 - } 323 - 324 - static inline void ccid2_new_ack(struct sock *sk, 325 - struct ccid2_seq *seqp, 326 - unsigned int *maxincr) 327 - { 328 - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 329 - 330 - if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) { 331 - if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) { 332 - hctx->ccid2hctx_cwnd += 1; 333 - *maxincr -= 1; 334 - hctx->ccid2hctx_packets_acked = 0; 335 - } 336 - } else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) { 337 - hctx->ccid2hctx_cwnd += 1; 338 - hctx->ccid2hctx_packets_acked = 0; 339 - } 340 - 341 - /* update RTO */ 342 - if (hctx->ccid2hctx_srtt == -1 || 343 - time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) { 344 - unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; 345 - int s; 346 - 347 - /* first measurement */ 348 - if (hctx->ccid2hctx_srtt == -1) { 349 - ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", 350 - r, jiffies, 351 - (unsigned long long)seqp->ccid2s_seq); 352 - ccid2_change_srtt(hctx, r); 353 - hctx->ccid2hctx_rttvar = r >> 1; 358 + if (m > 0) 359 + m >>= 3; 354 360 } else { 355 - /* RTTVAR */ 356 - long tmp = hctx->ccid2hctx_srtt - r; 357 
- long srtt; 358 - 359 - if (tmp < 0) 360 - tmp *= -1; 361 - 362 - tmp >>= 2; 363 - hctx->ccid2hctx_rttvar *= 3; 364 - hctx->ccid2hctx_rttvar >>= 2; 365 - hctx->ccid2hctx_rttvar += tmp; 366 - 367 - /* SRTT */ 368 - srtt = hctx->ccid2hctx_srtt; 369 - srtt *= 7; 370 - srtt >>= 3; 371 - tmp = r >> 3; 372 - srtt += tmp; 373 - ccid2_change_srtt(hctx, srtt); 361 + m -= (hctx->mdev >> 2); 374 362 } 375 - s = hctx->ccid2hctx_rttvar << 2; 376 - /* clock granularity is 1 when based on jiffies */ 377 - if (!s) 378 - s = 1; 379 - hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s; 363 + hctx->mdev += m; 380 364 381 - /* must be at least a second */ 382 - s = hctx->ccid2hctx_rto / HZ; 383 - /* DCCP doesn't require this [but I like it cuz my code sux] */ 384 - #if 1 385 - if (s < 1) 386 - hctx->ccid2hctx_rto = HZ; 387 - #endif 388 - /* max 60 seconds */ 389 - if (s > 60) 390 - hctx->ccid2hctx_rto = HZ * 60; 365 + if (hctx->mdev > hctx->mdev_max) { 366 + hctx->mdev_max = hctx->mdev; 367 + if (hctx->mdev_max > hctx->rttvar) 368 + hctx->rttvar = hctx->mdev_max; 369 + } 391 370 392 - hctx->ccid2hctx_lastrtt = jiffies; 393 - 394 - ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", 395 - hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar, 396 - hctx->ccid2hctx_rto, HZ, r); 371 + /* 372 + * Decay RTTVAR at most once per flight, exploiting that 373 + * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) 374 + * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) 375 + * GAR is a useful bound for FlightSize = pipe, AWL is probably 376 + * too low as it over-estimates pipe. 
377 + */ 378 + if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) { 379 + if (hctx->mdev_max < hctx->rttvar) 380 + hctx->rttvar -= (hctx->rttvar - 381 + hctx->mdev_max) >> 2; 382 + hctx->rtt_seq = dccp_sk(sk)->dccps_gss; 383 + hctx->mdev_max = TCP_RTO_MIN; 384 + } 397 385 } 398 386 399 - /* we got a new ack, so re-start RTO timer */ 400 - ccid2_hc_tx_kill_rto_timer(sk); 401 - ccid2_start_rto_timer(sk); 387 + /* 388 + * Set RTO from SRTT and RTTVAR 389 + * Clock granularity is ignored since the minimum error for RTTVAR is 390 + * clamped to 50msec (corresponding to HZ=20). This leads to a minimum 391 + * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP 392 + * does not retransmit data, DCCP does not require TCP's recommended 393 + * minimum timeout of one second". 394 + */ 395 + hctx->rto = (hctx->srtt >> 3) + hctx->rttvar; 396 + 397 + if (hctx->rto > DCCP_RTO_MAX) 398 + hctx->rto = DCCP_RTO_MAX; 402 399 } 403 400 404 - static void ccid2_hc_tx_dec_pipe(struct sock *sk) 401 + static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, 402 + unsigned int *maxincr) 405 403 { 406 404 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 407 405 408 - if (hctx->ccid2hctx_pipe == 0) 409 - DCCP_BUG("pipe == 0"); 410 - else 411 - hctx->ccid2hctx_pipe--; 412 - 413 - if (hctx->ccid2hctx_pipe == 0) 414 - ccid2_hc_tx_kill_rto_timer(sk); 406 + if (hctx->cwnd < hctx->ssthresh) { 407 + if (*maxincr > 0 && ++hctx->packets_acked == 2) { 408 + hctx->cwnd += 1; 409 + *maxincr -= 1; 410 + hctx->packets_acked = 0; 411 + } 412 + } else if (++hctx->packets_acked >= hctx->cwnd) { 413 + hctx->cwnd += 1; 414 + hctx->packets_acked = 0; 415 + } 416 + /* 417 + * FIXME: RTT is sampled several times per acknowledgment (for each 418 + * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). 419 + * This causes the RTT to be over-estimated, since the older entries 420 + * in the Ack Vector have earlier sending times. 
421 + * The cleanest solution is to not use the ccid2s_sent field at all 422 + * and instead use DCCP timestamps - need to be resolved at some time. 423 + */ 424 + ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent); 415 425 } 416 426 417 427 static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) 418 428 { 419 429 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 420 430 421 - if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) { 431 + if (time_before(seqp->ccid2s_sent, hctx->last_cong)) { 422 432 ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); 423 433 return; 424 434 } 425 435 426 - hctx->ccid2hctx_last_cong = jiffies; 436 + hctx->last_cong = jiffies; 427 437 428 - hctx->ccid2hctx_cwnd = hctx->ccid2hctx_cwnd / 2 ? : 1U; 429 - hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U); 438 + hctx->cwnd = hctx->cwnd / 2 ? : 1U; 439 + hctx->ssthresh = max(hctx->cwnd, 2U); 430 440 431 441 /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ 432 - if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd) 433 - ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd); 442 + if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->cwnd) 443 + ccid2_change_l_ack_ratio(sk, hctx->cwnd); 444 + } 445 + 446 + static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, 447 + u8 option, u8 *optval, u8 optlen) 448 + { 449 + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 450 + 451 + switch (option) { 452 + case DCCPO_ACK_VECTOR_0: 453 + case DCCPO_ACK_VECTOR_1: 454 + return dccp_ackvec_parsed_add(&hctx->av_chunks, optval, optlen, 455 + option - DCCPO_ACK_VECTOR_0); 456 + } 457 + return 0; 434 458 } 435 459 436 460 static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) 437 461 { 438 462 struct dccp_sock *dp = dccp_sk(sk); 439 463 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 464 + const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); 465 + struct dccp_ackvec_parsed *avp; 440 466 u64 ackno, 
seqno; 441 467 struct ccid2_seq *seqp; 442 - unsigned char *vector; 443 - unsigned char veclen; 444 - int offset = 0; 445 468 int done = 0; 446 469 unsigned int maxincr = 0; 447 470 448 - ccid2_hc_tx_check_sanity(hctx); 449 471 /* check reverse path congestion */ 450 472 seqno = DCCP_SKB_CB(skb)->dccpd_seq; 451 473 ··· 407 523 * -sorbo. 408 524 */ 409 525 /* need to bootstrap */ 410 - if (hctx->ccid2hctx_rpdupack == -1) { 411 - hctx->ccid2hctx_rpdupack = 0; 412 - hctx->ccid2hctx_rpseq = seqno; 526 + if (hctx->rpdupack == -1) { 527 + hctx->rpdupack = 0; 528 + hctx->rpseq = seqno; 413 529 } else { 414 530 /* check if packet is consecutive */ 415 - if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1) 416 - hctx->ccid2hctx_rpseq = seqno; 531 + if (dccp_delta_seqno(hctx->rpseq, seqno) == 1) 532 + hctx->rpseq = seqno; 417 533 /* it's a later packet */ 418 - else if (after48(seqno, hctx->ccid2hctx_rpseq)) { 419 - hctx->ccid2hctx_rpdupack++; 534 + else if (after48(seqno, hctx->rpseq)) { 535 + hctx->rpdupack++; 420 536 421 537 /* check if we got enough dupacks */ 422 - if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) { 423 - hctx->ccid2hctx_rpdupack = -1; /* XXX lame */ 424 - hctx->ccid2hctx_rpseq = 0; 538 + if (hctx->rpdupack >= NUMDUPACK) { 539 + hctx->rpdupack = -1; /* XXX lame */ 540 + hctx->rpseq = 0; 425 541 426 542 ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); 427 543 } ··· 429 545 } 430 546 431 547 /* check forward path congestion */ 432 - /* still didn't send out new data packets */ 433 - if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt) 548 + if (dccp_packet_without_ack(skb)) 434 549 return; 435 550 436 - switch (DCCP_SKB_CB(skb)->dccpd_type) { 437 - case DCCP_PKT_ACK: 438 - case DCCP_PKT_DATAACK: 439 - break; 440 - default: 441 - return; 442 - } 551 + /* still didn't send out new data packets */ 552 + if (hctx->seqh == hctx->seqt) 553 + goto done; 443 554 444 555 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; 445 - if (after48(ackno, hctx->ccid2hctx_high_ack)) 
446 - hctx->ccid2hctx_high_ack = ackno; 556 + if (after48(ackno, hctx->high_ack)) 557 + hctx->high_ack = ackno; 447 558 448 - seqp = hctx->ccid2hctx_seqt; 559 + seqp = hctx->seqt; 449 560 while (before48(seqp->ccid2s_seq, ackno)) { 450 561 seqp = seqp->ccid2s_next; 451 - if (seqp == hctx->ccid2hctx_seqh) { 452 - seqp = hctx->ccid2hctx_seqh->ccid2s_prev; 562 + if (seqp == hctx->seqh) { 563 + seqp = hctx->seqh->ccid2s_prev; 453 564 break; 454 565 } 455 566 } ··· 454 575 * packets per acknowledgement. Rounding up avoids that cwnd is not 455 576 * advanced when Ack Ratio is 1 and gives a slight edge otherwise. 456 577 */ 457 - if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) 578 + if (hctx->cwnd < hctx->ssthresh) 458 579 maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); 459 580 460 581 /* go through all ack vectors */ 461 - while ((offset = ccid2_ackvector(sk, skb, offset, 462 - &vector, &veclen)) != -1) { 582 + list_for_each_entry(avp, &hctx->av_chunks, node) { 463 583 /* go through this ack vector */ 464 - while (veclen--) { 465 - const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; 466 - u64 ackno_end_rl = SUB48(ackno, rl); 584 + for (; avp->len--; avp->vec++) { 585 + u64 ackno_end_rl = SUB48(ackno, 586 + dccp_ackvec_runlen(avp->vec)); 467 587 468 - ccid2_pr_debug("ackvec start:%llu end:%llu\n", 588 + ccid2_pr_debug("ackvec %llu |%u,%u|\n", 469 589 (unsigned long long)ackno, 470 - (unsigned long long)ackno_end_rl); 590 + dccp_ackvec_state(avp->vec) >> 6, 591 + dccp_ackvec_runlen(avp->vec)); 471 592 /* if the seqno we are analyzing is larger than the 472 593 * current ackno, then move towards the tail of our 473 594 * seqnos. 
474 595 */ 475 596 while (after48(seqp->ccid2s_seq, ackno)) { 476 - if (seqp == hctx->ccid2hctx_seqt) { 597 + if (seqp == hctx->seqt) { 477 598 done = 1; 478 599 break; 479 600 } ··· 486 607 * run length 487 608 */ 488 609 while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { 489 - const u8 state = *vector & 490 - DCCP_ACKVEC_STATE_MASK; 610 + const u8 state = dccp_ackvec_state(avp->vec); 491 611 492 612 /* new packet received or marked */ 493 - if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED && 613 + if (state != DCCPAV_NOT_RECEIVED && 494 614 !seqp->ccid2s_acked) { 495 - if (state == 496 - DCCP_ACKVEC_STATE_ECN_MARKED) { 615 + if (state == DCCPAV_ECN_MARKED) 497 616 ccid2_congestion_event(sk, 498 617 seqp); 499 - } else 618 + else 500 619 ccid2_new_ack(sk, seqp, 501 620 &maxincr); 502 621 503 622 seqp->ccid2s_acked = 1; 504 623 ccid2_pr_debug("Got ack for %llu\n", 505 624 (unsigned long long)seqp->ccid2s_seq); 506 - ccid2_hc_tx_dec_pipe(sk); 625 + hctx->pipe--; 507 626 } 508 - if (seqp == hctx->ccid2hctx_seqt) { 627 + if (seqp == hctx->seqt) { 509 628 done = 1; 510 629 break; 511 630 } ··· 513 636 break; 514 637 515 638 ackno = SUB48(ackno_end_rl, 1); 516 - vector++; 517 639 } 518 640 if (done) 519 641 break; ··· 521 645 /* The state about what is acked should be correct now 522 646 * Check for NUMDUPACK 523 647 */ 524 - seqp = hctx->ccid2hctx_seqt; 525 - while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) { 648 + seqp = hctx->seqt; 649 + while (before48(seqp->ccid2s_seq, hctx->high_ack)) { 526 650 seqp = seqp->ccid2s_next; 527 - if (seqp == hctx->ccid2hctx_seqh) { 528 - seqp = hctx->ccid2hctx_seqh->ccid2s_prev; 651 + if (seqp == hctx->seqh) { 652 + seqp = hctx->seqh->ccid2s_prev; 529 653 break; 530 654 } 531 655 } ··· 536 660 if (done == NUMDUPACK) 537 661 break; 538 662 } 539 - if (seqp == hctx->ccid2hctx_seqt) 663 + if (seqp == hctx->seqt) 540 664 break; 541 665 seqp = seqp->ccid2s_prev; 542 666 } ··· 557 681 * one ack vector. 
558 682 */ 559 683 ccid2_congestion_event(sk, seqp); 560 - ccid2_hc_tx_dec_pipe(sk); 684 + hctx->pipe--; 561 685 } 562 - if (seqp == hctx->ccid2hctx_seqt) 686 + if (seqp == hctx->seqt) 563 687 break; 564 688 seqp = seqp->ccid2s_prev; 565 689 } 566 690 567 - hctx->ccid2hctx_seqt = last_acked; 691 + hctx->seqt = last_acked; 568 692 } 569 693 570 694 /* trim acked packets in tail */ 571 - while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) { 572 - if (!hctx->ccid2hctx_seqt->ccid2s_acked) 695 + while (hctx->seqt != hctx->seqh) { 696 + if (!hctx->seqt->ccid2s_acked) 573 697 break; 574 698 575 - hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next; 699 + hctx->seqt = hctx->seqt->ccid2s_next; 576 700 } 577 701 578 - ccid2_hc_tx_check_sanity(hctx); 702 + /* restart RTO timer if not all outstanding data has been acked */ 703 + if (hctx->pipe == 0) 704 + sk_stop_timer(sk, &hctx->rtotimer); 705 + else 706 + sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); 707 + done: 708 + /* check if incoming Acks allow pending packets to be sent */ 709 + if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx)) 710 + tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); 711 + dccp_ackvec_parsed_cleanup(&hctx->av_chunks); 579 712 } 580 713 581 714 static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) ··· 594 709 u32 max_ratio; 595 710 596 711 /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ 597 - hctx->ccid2hctx_ssthresh = ~0U; 712 + hctx->ssthresh = ~0U; 598 713 599 - /* 600 - * RFC 4341, 5: "The cwnd parameter is initialized to at most four 601 - * packets for new connections, following the rules from [RFC3390]". 602 - * We need to convert the bytes of RFC3390 into the packets of RFC 4341. 
603 - */ 604 - hctx->ccid2hctx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); 714 + /* Use larger initial windows (RFC 3390, rfc2581bis) */ 715 + hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); 605 716 606 717 /* Make sure that Ack Ratio is enabled and within bounds. */ 607 - max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2); 718 + max_ratio = DIV_ROUND_UP(hctx->cwnd, 2); 608 719 if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) 609 720 dp->dccps_l_ack_ratio = max_ratio; 610 721 ··· 608 727 if (ccid2_hc_tx_alloc_seq(hctx)) 609 728 return -ENOMEM; 610 729 611 - hctx->ccid2hctx_rto = 3 * HZ; 612 - ccid2_change_srtt(hctx, -1); 613 - hctx->ccid2hctx_rttvar = -1; 614 - hctx->ccid2hctx_rpdupack = -1; 615 - hctx->ccid2hctx_last_cong = jiffies; 616 - setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire, 617 - (unsigned long)sk); 618 - 619 - ccid2_hc_tx_check_sanity(hctx); 730 + hctx->rto = DCCP_TIMEOUT_INIT; 731 + hctx->rpdupack = -1; 732 + hctx->last_cong = jiffies; 733 + setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); 734 + INIT_LIST_HEAD(&hctx->av_chunks); 620 735 return 0; 621 736 } 622 737 ··· 621 744 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 622 745 int i; 623 746 624 - ccid2_hc_tx_kill_rto_timer(sk); 747 + sk_stop_timer(sk, &hctx->rtotimer); 625 748 626 - for (i = 0; i < hctx->ccid2hctx_seqbufc; i++) 627 - kfree(hctx->ccid2hctx_seqbuf[i]); 628 - hctx->ccid2hctx_seqbufc = 0; 749 + for (i = 0; i < hctx->seqbufc; i++) 750 + kfree(hctx->seqbuf[i]); 751 + hctx->seqbufc = 0; 629 752 } 630 753 631 754 static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) ··· 636 759 switch (DCCP_SKB_CB(skb)->dccpd_type) { 637 760 case DCCP_PKT_DATA: 638 761 case DCCP_PKT_DATAACK: 639 - hcrx->ccid2hcrx_data++; 640 - if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) { 762 + hcrx->data++; 763 + if (hcrx->data >= dp->dccps_r_ack_ratio) { 641 764 dccp_send_ack(sk); 642 - hcrx->ccid2hcrx_data = 0; 
765 + hcrx->data = 0; 643 766 } 644 767 break; 645 768 } 646 769 } 647 770 648 771 static struct ccid_operations ccid2 = { 649 - .ccid_id = DCCPC_CCID2, 650 - .ccid_name = "TCP-like", 651 - .ccid_owner = THIS_MODULE, 652 - .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), 653 - .ccid_hc_tx_init = ccid2_hc_tx_init, 654 - .ccid_hc_tx_exit = ccid2_hc_tx_exit, 655 - .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, 656 - .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, 657 - .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, 658 - .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), 659 - .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, 772 + .ccid_id = DCCPC_CCID2, 773 + .ccid_name = "TCP-like", 774 + .ccid_owner = THIS_MODULE, 775 + .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), 776 + .ccid_hc_tx_init = ccid2_hc_tx_init, 777 + .ccid_hc_tx_exit = ccid2_hc_tx_exit, 778 + .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, 779 + .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, 780 + .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, 781 + .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, 782 + .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), 783 + .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, 660 784 }; 661 785 662 786 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
+39 -24
net/dccp/ccids/ccid2.h
··· 42 42 43 43 /** struct ccid2_hc_tx_sock - CCID2 TX half connection 44 44 * 45 - * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 46 - * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465) 47 - * @ccid2hctx_lastrtt -time RTT was last measured 48 - * @ccid2hctx_rpseq - last consecutive seqno 49 - * @ccid2hctx_rpdupack - dupacks since rpseq 50 - */ 45 + * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 46 + * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465) 47 + * @srtt: smoothed RTT estimate, scaled by 2^3 48 + * @mdev: smoothed RTT variation, scaled by 2^2 49 + * @mdev_max: maximum of @mdev during one flight 50 + * @rttvar: moving average/maximum of @mdev_max 51 + * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) 52 + * @rtt_seq: to decay RTTVAR at most once per flight 53 + * @rpseq: last consecutive seqno 54 + * @rpdupack: dupacks since rpseq 55 + * @av_chunks: list of Ack Vectors received on current skb 56 + */ 51 57 struct ccid2_hc_tx_sock { 52 - u32 ccid2hctx_cwnd; 53 - u32 ccid2hctx_ssthresh; 54 - u32 ccid2hctx_pipe; 55 - u32 ccid2hctx_packets_acked; 56 - struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX]; 57 - int ccid2hctx_seqbufc; 58 - struct ccid2_seq *ccid2hctx_seqh; 59 - struct ccid2_seq *ccid2hctx_seqt; 60 - long ccid2hctx_rto; 61 - long ccid2hctx_srtt; 62 - long ccid2hctx_rttvar; 63 - unsigned long ccid2hctx_lastrtt; 64 - struct timer_list ccid2hctx_rtotimer; 65 - u64 ccid2hctx_rpseq; 66 - int ccid2hctx_rpdupack; 67 - unsigned long ccid2hctx_last_cong; 68 - u64 ccid2hctx_high_ack; 58 + u32 cwnd; 59 + u32 ssthresh; 60 + u32 pipe; 61 + u32 packets_acked; 62 + struct ccid2_seq *seqbuf[CCID2_SEQBUF_MAX]; 63 + int seqbufc; 64 + struct ccid2_seq *seqh; 65 + struct ccid2_seq *seqt; 66 + /* RTT measurement: variables/principles are the same as in TCP */ 67 + u32 srtt, 68 + mdev, 69 + mdev_max, 70 + rttvar, 71 + rto; 72 + u64 rtt_seq:48; 73 + struct timer_list rtotimer; 74 + u64 rpseq; 75 + int 
rpdupack; 76 + unsigned long last_cong; 77 + u64 high_ack; 78 + struct list_head av_chunks; 69 79 }; 70 80 81 + static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx) 82 + { 83 + return (hctx->pipe >= hctx->cwnd); 84 + } 85 + 71 86 struct ccid2_hc_rx_sock { 72 - int ccid2hcrx_data; 87 + int data; 73 88 }; 74 89 75 90 static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
+304 -464
net/dccp/ccids/ccid3.c
··· 49 49 /* 50 50 * Transmitter Half-Connection Routines 51 51 */ 52 - #ifdef CONFIG_IP_DCCP_CCID3_DEBUG 53 - static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) 54 - { 55 - static char *ccid3_state_names[] = { 56 - [TFRC_SSTATE_NO_SENT] = "NO_SENT", 57 - [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", 58 - [TFRC_SSTATE_FBACK] = "FBACK", 59 - [TFRC_SSTATE_TERM] = "TERM", 60 - }; 61 - 62 - return ccid3_state_names[state]; 63 - } 64 - #endif 65 - 66 - static void ccid3_hc_tx_set_state(struct sock *sk, 67 - enum ccid3_hc_tx_states state) 68 - { 69 - struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 70 - enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state; 71 - 72 - ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", 73 - dccp_role(sk), sk, ccid3_tx_state_name(oldstate), 74 - ccid3_tx_state_name(state)); 75 - WARN_ON(state == oldstate); 76 - hctx->ccid3hctx_state = state; 77 - } 52 + /* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */ 53 + static int do_osc_prev = true; 78 54 79 55 /* 80 56 * Compute the initial sending rate X_init in the manner of RFC 3390: 81 57 * 82 - * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT 58 + * X_init = min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT 83 59 * 84 - * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis 85 - * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. 86 60 * For consistency with other parts of the code, X_init is scaled by 2^6. 
87 61 */ 88 62 static inline u64 rfc3390_initial_rate(struct sock *sk) 89 63 { 90 - const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 91 - const __u32 w_init = clamp_t(__u32, 4380U, 92 - 2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s); 64 + const u32 mps = dccp_sk(sk)->dccps_mss_cache, 65 + w_init = clamp(4380U, 2 * mps, 4 * mps); 93 66 94 - return scaled_div(w_init << 6, hctx->ccid3hctx_rtt); 67 + return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt); 95 68 } 96 69 97 - /* 98 - * Recalculate t_ipi and delta (should be called whenever X changes) 70 + /** 71 + * ccid3_update_send_interval - Calculate new t_ipi = s / X 72 + * This respects the granularity of X (64 * bytes/second) and enforces the 73 + * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342. 99 74 */ 100 75 static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) 101 76 { 102 - /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ 103 - hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6, 104 - hctx->ccid3hctx_x); 105 - 106 - /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ 107 - hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, 108 - TFRC_OPSYS_HALF_TIME_GRAN); 109 - 110 - ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", 111 - hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta, 112 - hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6)); 113 - 77 + if (unlikely(hctx->x <= hctx->s)) 78 + hctx->x = hctx->s; 79 + hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x); 114 80 } 115 81 116 82 static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) 117 83 { 118 - u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count); 84 + u32 delta = ktime_us_delta(now, hctx->t_last_win_count); 119 85 120 - return delta / hctx->ccid3hctx_rtt; 86 + return delta / hctx->rtt; 121 87 } 122 88 123 89 /** ··· 99 133 static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) 100 134 { 101 135 
struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 102 - __u64 min_rate = 2 * hctx->ccid3hctx_x_recv; 103 - const __u64 old_x = hctx->ccid3hctx_x; 136 + u64 min_rate = 2 * hctx->x_recv; 137 + const u64 old_x = hctx->x; 104 138 ktime_t now = stamp ? *stamp : ktime_get_real(); 105 139 106 140 /* ··· 111 145 */ 112 146 if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) { 113 147 min_rate = rfc3390_initial_rate(sk); 114 - min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv); 148 + min_rate = max(min_rate, 2 * hctx->x_recv); 115 149 } 116 150 117 - if (hctx->ccid3hctx_p > 0) { 151 + if (hctx->p > 0) { 118 152 119 - hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6, 120 - min_rate); 121 - hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, 122 - (((__u64)hctx->ccid3hctx_s) << 6) / 123 - TFRC_T_MBI); 153 + hctx->x = min(((u64)hctx->x_calc) << 6, min_rate); 124 154 125 - } else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld) 126 - - (s64)hctx->ccid3hctx_rtt >= 0) { 155 + } else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) { 127 156 128 - hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate); 129 - hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, 130 - scaled_div(((__u64)hctx->ccid3hctx_s) << 6, 131 - hctx->ccid3hctx_rtt)); 132 - hctx->ccid3hctx_t_ld = now; 157 + hctx->x = min(2 * hctx->x, min_rate); 158 + hctx->x = max(hctx->x, 159 + scaled_div(((u64)hctx->s) << 6, hctx->rtt)); 160 + hctx->t_ld = now; 133 161 } 134 162 135 - if (hctx->ccid3hctx_x != old_x) { 163 + if (hctx->x != old_x) { 136 164 ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " 137 165 "X_recv=%u\n", (unsigned)(old_x >> 6), 138 - (unsigned)(hctx->ccid3hctx_x >> 6), 139 - hctx->ccid3hctx_x_calc, 140 - (unsigned)(hctx->ccid3hctx_x_recv >> 6)); 166 + (unsigned)(hctx->x >> 6), hctx->x_calc, 167 + (unsigned)(hctx->x_recv >> 6)); 141 168 142 169 ccid3_update_send_interval(hctx); 143 170 } 144 171 } 145 172 146 173 /* 147 - * Track the mean packet size `s' (cf. 
RFC 4342, 5.3 and RFC 3448, 4.1) 148 - * @len: DCCP packet payload size in bytes 174 + * ccid3_hc_tx_measure_packet_size - Measuring the packet size `s' (sec 4.1) 175 + * @new_len: DCCP payload size in bytes (not used by all methods) 149 176 */ 150 - static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) 177 + static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len) 151 178 { 152 - const u16 old_s = hctx->ccid3hctx_s; 153 - 154 - hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9); 155 - 156 - if (hctx->ccid3hctx_s != old_s) 157 - ccid3_update_send_interval(hctx); 179 + #if defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG) 180 + return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9); 181 + #elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX) 182 + return max(ccid3_hc_tx_sk(sk)->s, new_len); 183 + #else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS */ 184 + return dccp_sk(sk)->dccps_mss_cache; 185 + #endif 158 186 } 159 187 160 188 /* ··· 158 198 static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx, 159 199 ktime_t now) 160 200 { 161 - u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count), 162 - quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt; 201 + u32 delta = ktime_us_delta(now, hctx->t_last_win_count), 202 + quarter_rtts = (4 * delta) / hctx->rtt; 163 203 164 204 if (quarter_rtts > 0) { 165 - hctx->ccid3hctx_t_last_win_count = now; 166 - hctx->ccid3hctx_last_win_count += min(quarter_rtts, 5U); 167 - hctx->ccid3hctx_last_win_count &= 0xF; /* mod 16 */ 205 + hctx->t_last_win_count = now; 206 + hctx->last_win_count += min(quarter_rtts, 5U); 207 + hctx->last_win_count &= 0xF; /* mod 16 */ 168 208 } 169 209 } 170 210 ··· 181 221 goto restart_timer; 182 222 } 183 223 184 - ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, 185 - ccid3_tx_state_name(hctx->ccid3hctx_state)); 224 + ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk, 225 + hctx->feedback ? 
"" : "out"); 186 226 187 - if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK) 188 - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); 189 - else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) 227 + /* Ignore and do not restart after leaving the established state */ 228 + if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) 190 229 goto out; 230 + 231 + /* Reset feedback state to "no feedback received" */ 232 + hctx->feedback = false; 191 233 192 234 /* 193 235 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 236 + * RTO is 0 if and only if no feedback has been received yet. 194 237 */ 195 - if (hctx->ccid3hctx_t_rto == 0 || /* no feedback received yet */ 196 - hctx->ccid3hctx_p == 0) { 238 + if (hctx->t_rto == 0 || hctx->p == 0) { 197 239 198 240 /* halve send rate directly */ 199 - hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2, 200 - (((__u64)hctx->ccid3hctx_s) << 6) / 201 - TFRC_T_MBI); 241 + hctx->x /= 2; 202 242 ccid3_update_send_interval(hctx); 243 + 203 244 } else { 204 245 /* 205 246 * Modify the cached value of X_recv ··· 212 251 * 213 252 * Note that X_recv is scaled by 2^6 while X_calc is not 214 253 */ 215 - BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); 254 + BUG_ON(hctx->p && !hctx->x_calc); 216 255 217 - if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5)) 218 - hctx->ccid3hctx_x_recv = 219 - max(hctx->ccid3hctx_x_recv / 2, 220 - (((__u64)hctx->ccid3hctx_s) << 6) / 221 - (2 * TFRC_T_MBI)); 256 + if (hctx->x_calc > (hctx->x_recv >> 5)) 257 + hctx->x_recv /= 2; 222 258 else { 223 - hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc; 224 - hctx->ccid3hctx_x_recv <<= 4; 259 + hctx->x_recv = hctx->x_calc; 260 + hctx->x_recv <<= 4; 225 261 } 226 262 ccid3_hc_tx_update_x(sk, NULL); 227 263 } 228 264 ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", 229 - (unsigned long long)hctx->ccid3hctx_x); 265 + (unsigned long long)hctx->x); 230 266 231 267 /* 232 268 * Set new timeout for the nofeedback timer. 
233 269 * See comments in packet_recv() regarding the value of t_RTO. 234 270 */ 235 - if (unlikely(hctx->ccid3hctx_t_rto == 0)) /* no feedback yet */ 271 + if (unlikely(hctx->t_rto == 0)) /* no feedback received yet */ 236 272 t_nfb = TFRC_INITIAL_TIMEOUT; 237 273 else 238 - t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); 274 + t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); 239 275 240 276 restart_timer: 241 - sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, 277 + sk_reset_timer(sk, &hctx->no_feedback_timer, 242 278 jiffies + usecs_to_jiffies(t_nfb)); 243 279 out: 244 280 bh_unlock_sock(sk); 245 281 sock_put(sk); 246 282 } 247 283 248 - /* 249 - * returns 250 - * > 0: delay (in msecs) that should pass before actually sending 251 - * = 0: can send immediately 252 - * < 0: error condition; do not send packet 284 + /** 285 + * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets 286 + * @skb: next packet candidate to send on @sk 287 + * This function uses the convention of ccid_packet_dequeue_eval() and 288 + * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. 
253 289 */ 254 290 static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) 255 291 { ··· 263 305 if (unlikely(skb->len == 0)) 264 306 return -EBADMSG; 265 307 266 - switch (hctx->ccid3hctx_state) { 267 - case TFRC_SSTATE_NO_SENT: 268 - sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, 269 - (jiffies + 308 + if (hctx->s == 0) { 309 + sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies + 270 310 usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); 271 - hctx->ccid3hctx_last_win_count = 0; 272 - hctx->ccid3hctx_t_last_win_count = now; 311 + hctx->last_win_count = 0; 312 + hctx->t_last_win_count = now; 273 313 274 314 /* Set t_0 for initial packet */ 275 - hctx->ccid3hctx_t_nom = now; 276 - 277 - hctx->ccid3hctx_s = skb->len; 315 + hctx->t_nom = now; 278 316 279 317 /* 280 318 * Use initial RTT sample when available: recommended by erratum ··· 279 325 */ 280 326 if (dp->dccps_syn_rtt) { 281 327 ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); 282 - hctx->ccid3hctx_rtt = dp->dccps_syn_rtt; 283 - hctx->ccid3hctx_x = rfc3390_initial_rate(sk); 284 - hctx->ccid3hctx_t_ld = now; 328 + hctx->rtt = dp->dccps_syn_rtt; 329 + hctx->x = rfc3390_initial_rate(sk); 330 + hctx->t_ld = now; 285 331 } else { 286 332 /* 287 333 * Sender does not have RTT sample: ··· 289 335 * is needed in several parts (e.g. window counter); 290 336 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. 
291 337 */ 292 - hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT; 293 - hctx->ccid3hctx_x = hctx->ccid3hctx_s; 294 - hctx->ccid3hctx_x <<= 6; 338 + hctx->rtt = DCCP_FALLBACK_RTT; 339 + hctx->x = dp->dccps_mss_cache; 340 + hctx->x <<= 6; 295 341 } 342 + 343 + /* Compute t_ipi = s / X */ 344 + hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len); 296 345 ccid3_update_send_interval(hctx); 297 346 298 - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); 299 - break; 300 - case TFRC_SSTATE_NO_FBACK: 301 - case TFRC_SSTATE_FBACK: 302 - delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now); 347 + /* Seed value for Oscillation Prevention (sec. 4.5) */ 348 + hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt); 349 + 350 + } else { 351 + delay = ktime_us_delta(hctx->t_nom, now); 303 352 ccid3_pr_debug("delay=%ld\n", (long)delay); 304 353 /* 305 354 * Scheduling of packet transmissions [RFC 3448, 4.6] ··· 312 355 * else 313 356 * // send the packet in (t_nom - t_now) milliseconds. 314 357 */ 315 - if (delay - (s64)hctx->ccid3hctx_delta >= 1000) 316 - return (u32)delay / 1000L; 358 + if (delay >= TFRC_T_DELTA) 359 + return (u32)delay / USEC_PER_MSEC; 317 360 318 361 ccid3_hc_tx_update_win_count(hctx, now); 319 - break; 320 - case TFRC_SSTATE_TERM: 321 - DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); 322 - return -EINVAL; 323 362 } 324 363 325 364 /* prepare to send now (add options etc.) 
*/ 326 365 dp->dccps_hc_tx_insert_options = 1; 327 - DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; 366 + DCCP_SKB_CB(skb)->dccpd_ccval = hctx->last_win_count; 328 367 329 368 /* set the nominal send time for the next following packet */ 330 - hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom, 331 - hctx->ccid3hctx_t_ipi); 332 - return 0; 369 + hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi); 370 + return CCID_PACKET_SEND_AT_ONCE; 333 371 } 334 372 335 - static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, 336 - unsigned int len) 373 + static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) 337 374 { 338 375 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 339 376 340 - ccid3_hc_tx_update_s(hctx, len); 377 + /* Changes to s will become effective the next time X is computed */ 378 + hctx->s = ccid3_hc_tx_measure_packet_size(sk, len); 341 379 342 - if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss)) 380 + if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss)) 343 381 DCCP_CRIT("packet history - out of memory!"); 344 382 } 345 383 346 384 static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) 347 385 { 348 386 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 349 - struct ccid3_options_received *opt_recv; 387 + struct tfrc_tx_hist_entry *acked; 350 388 ktime_t now; 351 389 unsigned long t_nfb; 352 - u32 pinv, r_sample; 390 + u32 r_sample; 353 391 354 392 /* we are only interested in ACKs */ 355 393 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || 356 394 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) 357 395 return; 358 - /* ... 
and only in the established state */ 359 - if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK && 360 - hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) 361 - return; 362 - 363 - opt_recv = &hctx->ccid3hctx_options_received; 364 - now = ktime_get_real(); 365 - 366 - /* Estimate RTT from history if ACK number is valid */ 367 - r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist, 368 - DCCP_SKB_CB(skb)->dccpd_ack_seq, now); 369 - if (r_sample == 0) { 370 - DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk, 371 - dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type), 372 - (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq); 373 - return; 374 - } 375 - 376 - /* Update receive rate in units of 64 * bytes/second */ 377 - hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; 378 - hctx->ccid3hctx_x_recv <<= 6; 379 - 380 - /* Update loss event rate (which is scaled by 1e6) */ 381 - pinv = opt_recv->ccid3or_loss_event_rate; 382 - if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ 383 - hctx->ccid3hctx_p = 0; 384 - else /* can not exceed 100% */ 385 - hctx->ccid3hctx_p = scaled_div(1, pinv); 386 396 /* 387 - * Validate new RTT sample and update moving average 397 + * Locate the acknowledged packet in the TX history. 398 + * 399 + * Returning "entry not found" here can for instance happen when 400 + * - the host has not sent out anything (e.g. a passive server), 401 + * - the Ack is outdated (packet with higher Ack number was received), 402 + * - it is a bogus Ack (for a packet not sent on this connection). 
388 403 */ 389 - r_sample = dccp_sample_rtt(sk, r_sample); 390 - hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9); 404 + acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb)); 405 + if (acked == NULL) 406 + return; 407 + /* For the sake of RTT sampling, ignore/remove all older entries */ 408 + tfrc_tx_hist_purge(&acked->next); 409 + 410 + /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ 411 + now = ktime_get_real(); 412 + r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); 413 + hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9); 414 + 391 415 /* 392 416 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 393 417 */ 394 - if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { 395 - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); 418 + if (!hctx->feedback) { 419 + hctx->feedback = true; 396 420 397 - if (hctx->ccid3hctx_t_rto == 0) { 421 + if (hctx->t_rto == 0) { 398 422 /* 399 423 * Initial feedback packet: Larger Initial Windows (4.2) 400 424 */ 401 - hctx->ccid3hctx_x = rfc3390_initial_rate(sk); 402 - hctx->ccid3hctx_t_ld = now; 425 + hctx->x = rfc3390_initial_rate(sk); 426 + hctx->t_ld = now; 403 427 404 428 ccid3_update_send_interval(hctx); 405 429 406 430 goto done_computing_x; 407 - } else if (hctx->ccid3hctx_p == 0) { 431 + } else if (hctx->p == 0) { 408 432 /* 409 433 * First feedback after nofeedback timer expiry (4.3) 410 434 */ ··· 394 456 } 395 457 396 458 /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ 397 - if (hctx->ccid3hctx_p > 0) 398 - hctx->ccid3hctx_x_calc = 399 - tfrc_calc_x(hctx->ccid3hctx_s, 400 - hctx->ccid3hctx_rtt, 401 - hctx->ccid3hctx_p); 459 + if (hctx->p > 0) 460 + hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p); 402 461 ccid3_hc_tx_update_x(sk, &now); 403 462 404 463 done_computing_x: 405 464 ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " 406 465 "p=%u, X_calc=%u, X_recv=%u, X=%u\n", 407 - dccp_role(sk), 408 - sk, hctx->ccid3hctx_rtt, 
r_sample, 409 - hctx->ccid3hctx_s, hctx->ccid3hctx_p, 410 - hctx->ccid3hctx_x_calc, 411 - (unsigned)(hctx->ccid3hctx_x_recv >> 6), 412 - (unsigned)(hctx->ccid3hctx_x >> 6)); 466 + dccp_role(sk), sk, hctx->rtt, r_sample, 467 + hctx->s, hctx->p, hctx->x_calc, 468 + (unsigned)(hctx->x_recv >> 6), 469 + (unsigned)(hctx->x >> 6)); 470 + /* 471 + * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to 472 + * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This 473 + * can be useful if few connections share a link, avoiding that buffer 474 + * fill levels (RTT) oscillate as a result of frequent adjustments to X. 475 + * A useful presentation with background information is in 476 + * Joerg Widmer, "Equation-Based Congestion Control", 477 + * MSc Thesis, University of Mannheim, Germany, 2000 478 + * (sec. 3.6.4), who calls this ISM ("Inter-packet Space Modulation"). 479 + */ 480 + if (do_osc_prev) { 481 + r_sample = tfrc_scaled_sqrt(r_sample); 482 + /* 483 + * The modulation can work in both ways: increase/decrease t_ipi 484 + * according to long-term increases/decreases of the RTT. The 485 + * former is a useful measure, since it works against queue 486 + * build-up. The latter temporarily increases the sending rate, 487 + * so that buffers fill up more quickly. This in turn causes 488 + * the RTT to increase, so that either later reduction becomes 489 + * necessary or the RTT stays at a very high level. Decreasing 490 + * t_ipi is therefore not supported. 491 + * Furthermore, during the initial slow-start phase the RTT 492 + * naturally increases, where using the algorithm would cause 493 + * delays. Hence it is disabled during the initial slow-start. 
494 + */ 495 + if (r_sample > hctx->r_sqmean && hctx->p > 0) 496 + hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample, 497 + hctx->r_sqmean); 498 + hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI); 499 + /* update R_sqmean _after_ computing the modulation factor */ 500 + hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9); 501 + } 413 502 414 503 /* unschedule no feedback timer */ 415 - sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); 504 + sk_stop_timer(sk, &hctx->no_feedback_timer); 416 505 417 506 /* 418 507 * As we have calculated new ipi, delta, t_nom it is possible ··· 453 488 * This can help avoid triggering the nofeedback timer too 454 489 * often ('spinning') on LANs with small RTTs. 455 490 */ 456 - hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, 457 - (CONFIG_IP_DCCP_CCID3_RTO * 458 - (USEC_PER_SEC / 1000))); 491 + hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO * 492 + (USEC_PER_SEC / 1000))); 459 493 /* 460 494 * Schedule no feedback timer to expire in 461 495 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) 462 496 */ 463 - t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); 497 + t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); 464 498 465 499 ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " 466 500 "expire in %lu jiffies (%luus)\n", 467 - dccp_role(sk), 468 - sk, usecs_to_jiffies(t_nfb), t_nfb); 501 + dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); 469 502 470 - sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, 503 + sk_reset_timer(sk, &hctx->no_feedback_timer, 471 504 jiffies + usecs_to_jiffies(t_nfb)); 472 505 } 473 506 474 - static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, 475 - unsigned char len, u16 idx, 476 - unsigned char *value) 507 + static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, 508 + u8 option, u8 *optval, u8 optlen) 477 509 { 478 - int rc = 0; 479 - const struct dccp_sock *dp = dccp_sk(sk); 480 510 struct ccid3_hc_tx_sock 
*hctx = ccid3_hc_tx_sk(sk); 481 - struct ccid3_options_received *opt_recv; 482 511 __be32 opt_val; 483 512 484 - opt_recv = &hctx->ccid3hctx_options_received; 485 - 486 - if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { 487 - opt_recv->ccid3or_seqno = dp->dccps_gsr; 488 - opt_recv->ccid3or_loss_event_rate = ~0; 489 - opt_recv->ccid3or_loss_intervals_idx = 0; 490 - opt_recv->ccid3or_loss_intervals_len = 0; 491 - opt_recv->ccid3or_receive_rate = 0; 492 - } 493 - 494 513 switch (option) { 495 - case TFRC_OPT_LOSS_EVENT_RATE: 496 - if (unlikely(len != 4)) { 497 - DCCP_WARN("%s(%p), invalid len %d " 498 - "for TFRC_OPT_LOSS_EVENT_RATE\n", 499 - dccp_role(sk), sk, len); 500 - rc = -EINVAL; 501 - } else { 502 - opt_val = get_unaligned((__be32 *)value); 503 - opt_recv->ccid3or_loss_event_rate = ntohl(opt_val); 504 - ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", 505 - dccp_role(sk), sk, 506 - opt_recv->ccid3or_loss_event_rate); 507 - } 508 - break; 509 - case TFRC_OPT_LOSS_INTERVALS: 510 - opt_recv->ccid3or_loss_intervals_idx = idx; 511 - opt_recv->ccid3or_loss_intervals_len = len; 512 - ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", 513 - dccp_role(sk), sk, 514 - opt_recv->ccid3or_loss_intervals_idx, 515 - opt_recv->ccid3or_loss_intervals_len); 516 - break; 517 514 case TFRC_OPT_RECEIVE_RATE: 518 - if (unlikely(len != 4)) { 519 - DCCP_WARN("%s(%p), invalid len %d " 520 - "for TFRC_OPT_RECEIVE_RATE\n", 521 - dccp_role(sk), sk, len); 522 - rc = -EINVAL; 523 - } else { 524 - opt_val = get_unaligned((__be32 *)value); 525 - opt_recv->ccid3or_receive_rate = ntohl(opt_val); 526 - ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", 527 - dccp_role(sk), sk, 528 - opt_recv->ccid3or_receive_rate); 515 + case TFRC_OPT_LOSS_EVENT_RATE: 516 + /* Must be ignored on Data packets, cf. 
RFC 4342 8.3 and 8.5 */ 517 + if (packet_type == DCCP_PKT_DATA) 518 + break; 519 + if (unlikely(optlen != 4)) { 520 + DCCP_WARN("%s(%p), invalid len %d for %u\n", 521 + dccp_role(sk), sk, optlen, option); 522 + return -EINVAL; 529 523 } 530 - break; 531 - } 524 + opt_val = ntohl(get_unaligned((__be32 *)optval)); 532 525 533 - return rc; 526 + if (option == TFRC_OPT_RECEIVE_RATE) { 527 + /* Receive Rate is kept in units of 64 bytes/second */ 528 + hctx->x_recv = opt_val; 529 + hctx->x_recv <<= 6; 530 + 531 + ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", 532 + dccp_role(sk), sk, opt_val); 533 + } else { 534 + /* Update the fixpoint Loss Event Rate fraction */ 535 + hctx->p = tfrc_invert_loss_event_rate(opt_val); 536 + 537 + ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", 538 + dccp_role(sk), sk, opt_val); 539 + } 540 + } 541 + return 0; 534 542 } 535 543 536 544 static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) 537 545 { 538 546 struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); 539 547 540 - hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; 541 - hctx->ccid3hctx_hist = NULL; 542 - setup_timer(&hctx->ccid3hctx_no_feedback_timer, 543 - ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); 544 - 548 + hctx->hist = NULL; 549 + setup_timer(&hctx->no_feedback_timer, 550 + ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); 545 551 return 0; 546 552 } 547 553 ··· 520 584 { 521 585 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 522 586 523 - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); 524 - sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); 525 - 526 - tfrc_tx_hist_purge(&hctx->ccid3hctx_hist); 587 + sk_stop_timer(sk, &hctx->no_feedback_timer); 588 + tfrc_tx_hist_purge(&hctx->hist); 527 589 } 528 590 529 591 static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) 530 592 { 531 - struct ccid3_hc_tx_sock *hctx; 532 - 533 - /* Listen socks doesn't have a private CCID block */ 534 - if (sk->sk_state == DCCP_LISTEN) 535 - return; 536 - 537 - hctx = 
ccid3_hc_tx_sk(sk); 538 - info->tcpi_rto = hctx->ccid3hctx_t_rto; 539 - info->tcpi_rtt = hctx->ccid3hctx_rtt; 593 + info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto; 594 + info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt; 540 595 } 541 596 542 597 static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, 543 598 u32 __user *optval, int __user *optlen) 544 599 { 545 - const struct ccid3_hc_tx_sock *hctx; 600 + const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 601 + struct tfrc_tx_info tfrc; 546 602 const void *val; 547 603 548 - /* Listen socks doesn't have a private CCID block */ 549 - if (sk->sk_state == DCCP_LISTEN) 550 - return -EINVAL; 551 - 552 - hctx = ccid3_hc_tx_sk(sk); 553 604 switch (optname) { 554 605 case DCCP_SOCKOPT_CCID_TX_INFO: 555 - if (len < sizeof(hctx->ccid3hctx_tfrc)) 606 + if (len < sizeof(tfrc)) 556 607 return -EINVAL; 557 - len = sizeof(hctx->ccid3hctx_tfrc); 558 - val = &hctx->ccid3hctx_tfrc; 608 + tfrc.tfrctx_x = hctx->x; 609 + tfrc.tfrctx_x_recv = hctx->x_recv; 610 + tfrc.tfrctx_x_calc = hctx->x_calc; 611 + tfrc.tfrctx_rtt = hctx->rtt; 612 + tfrc.tfrctx_p = hctx->p; 613 + tfrc.tfrctx_rto = hctx->t_rto; 614 + tfrc.tfrctx_ipi = hctx->t_ipi; 615 + len = sizeof(tfrc); 616 + val = &tfrc; 559 617 break; 560 618 default: 561 619 return -ENOPROTOOPT; ··· 564 634 /* 565 635 * Receiver Half-Connection Routines 566 636 */ 567 - 568 - /* CCID3 feedback types */ 569 - enum ccid3_fback_type { 570 - CCID3_FBACK_NONE = 0, 571 - CCID3_FBACK_INITIAL, 572 - CCID3_FBACK_PERIODIC, 573 - CCID3_FBACK_PARAM_CHANGE 574 - }; 575 - 576 - #ifdef CONFIG_IP_DCCP_CCID3_DEBUG 577 - static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) 578 - { 579 - static char *ccid3_rx_state_names[] = { 580 - [TFRC_RSTATE_NO_DATA] = "NO_DATA", 581 - [TFRC_RSTATE_DATA] = "DATA", 582 - [TFRC_RSTATE_TERM] = "TERM", 583 - }; 584 - 585 - return ccid3_rx_state_names[state]; 586 - } 587 - #endif 588 - 589 - static void ccid3_hc_rx_set_state(struct sock *sk, 590 
- enum ccid3_hc_rx_states state) 591 - { 592 - struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 593 - enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state; 594 - 595 - ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", 596 - dccp_role(sk), sk, ccid3_rx_state_name(oldstate), 597 - ccid3_rx_state_name(state)); 598 - WARN_ON(state == oldstate); 599 - hcrx->ccid3hcrx_state = state; 600 - } 601 - 602 637 static void ccid3_hc_rx_send_feedback(struct sock *sk, 603 638 const struct sk_buff *skb, 604 639 enum ccid3_fback_type fbtype) 605 640 { 606 641 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 607 - struct dccp_sock *dp = dccp_sk(sk); 608 - ktime_t now; 609 - s64 delta = 0; 610 - 611 - if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM)) 612 - return; 613 - 614 - now = ktime_get_real(); 615 642 616 643 switch (fbtype) { 617 644 case CCID3_FBACK_INITIAL: 618 - hcrx->ccid3hcrx_x_recv = 0; 619 - hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */ 645 + hcrx->x_recv = 0; 646 + hcrx->p_inverse = ~0U; /* see RFC 4342, 8.5 */ 620 647 break; 621 648 case CCID3_FBACK_PARAM_CHANGE: 649 + if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) { 650 + /* 651 + * rfc3448bis-06, 6.3.1: First packet(s) lost or marked 652 + * FIXME: in rfc3448bis the receiver returns X_recv=0 653 + * here as it normally would in the first feedback packet. 654 + * However this is not possible yet, since the code still 655 + * uses RFC 3448, i.e. 656 + * If (p > 0) 657 + * Calculate X_calc using the TCP throughput equation. 658 + * X = max(min(X_calc, 2*X_recv), s/t_mbi); 659 + * would bring X down to s/t_mbi. That is why we return 660 + * X_recv according to rfc3448bis-06 for the moment. 
661 + */ 662 + u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), 663 + rtt = tfrc_rx_hist_rtt(&hcrx->hist); 664 + 665 + hcrx->x_recv = scaled_div32(s, 2 * rtt); 666 + break; 667 + } 622 668 /* 623 669 * When parameters change (new loss or p > p_prev), we do not 624 670 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so 625 - * need to reuse the previous value of X_recv. However, when 626 - * X_recv was 0 (due to early loss), this would kill X down to 627 - * s/t_mbi (i.e. one packet in 64 seconds). 628 - * To avoid such drastic reduction, we approximate X_recv as 629 - * the number of bytes since last feedback. 630 - * This is a safe fallback, since X is bounded above by X_calc. 671 + * always check whether at least RTT time units were covered. 631 672 */ 632 - if (hcrx->ccid3hcrx_x_recv > 0) 633 - break; 634 - /* fall through */ 673 + hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); 674 + break; 635 675 case CCID3_FBACK_PERIODIC: 636 - delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback); 637 - if (delta <= 0) 638 - DCCP_BUG("delta (%ld) <= 0", (long)delta); 639 - else 640 - hcrx->ccid3hcrx_x_recv = 641 - scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); 676 + /* 677 + * Step (2) of rfc3448bis-06, 6.2: 678 + * - if no data packets have been received, just restart timer 679 + * - if data packets have been received, re-compute X_recv 680 + */ 681 + if (hcrx->hist.bytes_recvd == 0) 682 + goto prepare_for_next_time; 683 + hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); 642 684 break; 643 685 default: 644 686 return; 645 687 } 646 688 647 - ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, 648 - hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv); 689 + ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse); 649 690 650 - hcrx->ccid3hcrx_tstamp_last_feedback = now; 651 - hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval; 652 - hcrx->ccid3hcrx_bytes_recv = 0; 653 - 654 - 
dp->dccps_hc_rx_insert_options = 1; 691 + dccp_sk(sk)->dccps_hc_rx_insert_options = 1; 655 692 dccp_send_ack(sk); 693 + 694 + prepare_for_next_time: 695 + tfrc_rx_hist_restart_byte_counter(&hcrx->hist); 696 + hcrx->last_counter = dccp_hdr(skb)->dccph_ccval; 697 + hcrx->feedback = fbtype; 656 698 } 657 699 658 700 static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) 659 701 { 660 - const struct ccid3_hc_rx_sock *hcrx; 702 + const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 661 703 __be32 x_recv, pinv; 662 704 663 705 if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) 664 706 return 0; 665 707 666 - hcrx = ccid3_hc_rx_sk(sk); 667 - 668 708 if (dccp_packet_without_ack(skb)) 669 709 return 0; 670 710 671 - x_recv = htonl(hcrx->ccid3hcrx_x_recv); 672 - pinv = htonl(hcrx->ccid3hcrx_pinv); 711 + x_recv = htonl(hcrx->x_recv); 712 + pinv = htonl(hcrx->p_inverse); 673 713 674 714 if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, 675 715 &pinv, sizeof(pinv)) || ··· 662 762 static u32 ccid3_first_li(struct sock *sk) 663 763 { 664 764 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 665 - u32 x_recv, p, delta; 765 + u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), 766 + rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p; 666 767 u64 fval; 667 768 668 - if (hcrx->ccid3hcrx_rtt == 0) { 669 - DCCP_WARN("No RTT estimate available, using fallback RTT\n"); 670 - hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT; 671 - } 769 + /* 770 + * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p 771 + * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p 772 + * is about 20.64%. This yields an interval length of 4.84 (rounded up). 
773 + */ 774 + if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) 775 + return 5; 672 776 673 - delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback)); 674 - x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); 675 - if (x_recv == 0) { /* would also trigger divide-by-zero */ 676 - DCCP_WARN("X_recv==0\n"); 677 - if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) { 678 - DCCP_BUG("stored value of X_recv is zero"); 679 - return ~0U; 680 - } 681 - } 777 + x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); 778 + if (x_recv == 0) 779 + goto failed; 682 780 683 - fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt); 684 - fval = scaled_div32(fval, x_recv); 781 + fval = scaled_div32(scaled_div(s, rtt), x_recv); 685 782 p = tfrc_calc_x_reverse_lookup(fval); 686 783 687 784 ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " 688 785 "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); 689 786 690 - return p == 0 ? ~0U : scaled_div(1, p); 787 + if (p > 0) 788 + return scaled_div(1, p); 789 + failed: 790 + return UINT_MAX; 691 791 } 692 792 693 793 static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) 694 794 { 695 795 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 696 - enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; 697 796 const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; 698 797 const bool is_data_packet = dccp_data_packet(skb); 699 - 700 - if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) { 701 - if (is_data_packet) { 702 - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; 703 - do_feedback = CCID3_FBACK_INITIAL; 704 - ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); 705 - hcrx->ccid3hcrx_s = payload; 706 - /* 707 - * Not necessary to update ccid3hcrx_bytes_recv here, 708 - * since X_recv = 0 for the first feedback packet (cf. 
709 - * RFC 3448, 6.3) -- gerrit 710 - */ 711 - } 712 - goto update_records; 713 - } 714 - 715 - if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb)) 716 - return; /* done receiving */ 717 - 718 - if (is_data_packet) { 719 - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; 720 - /* 721 - * Update moving-average of s and the sum of received payload bytes 722 - */ 723 - hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9); 724 - hcrx->ccid3hcrx_bytes_recv += payload; 725 - } 726 798 727 799 /* 728 800 * Perform loss detection and handle pending losses 729 801 */ 730 - if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist, 731 - skb, ndp, ccid3_first_li, sk)) { 732 - do_feedback = CCID3_FBACK_PARAM_CHANGE; 733 - goto done_receiving; 734 - } 735 - 736 - if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist)) 737 - return; /* done receiving */ 738 - 802 + if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist, 803 + skb, ndp, ccid3_first_li, sk)) 804 + ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE); 739 805 /* 740 - * Handle data packets: RTT sampling and monitoring p 806 + * Feedback for first non-empty data packet (RFC 3448, 6.3) 741 807 */ 742 - if (unlikely(!is_data_packet)) 743 - goto update_records; 744 - 745 - if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) { 746 - const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb); 747 - /* 748 - * Empty loss history: no loss so far, hence p stays 0. 749 - * Sample RTT values, since an RTT estimate is required for the 750 - * computation of p when the first loss occurs; RFC 3448, 6.3.1. 751 - */ 752 - if (sample != 0) 753 - hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9); 754 - 755 - } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) { 756 - /* 757 - * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean 758 - * has decreased (resp. p has increased), send feedback now. 
759 - */ 760 - do_feedback = CCID3_FBACK_PARAM_CHANGE; 761 - } 762 - 808 + else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet)) 809 + ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL); 763 810 /* 764 811 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 765 812 */ 766 - if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3) 767 - do_feedback = CCID3_FBACK_PERIODIC; 768 - 769 - update_records: 770 - tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp); 771 - 772 - done_receiving: 773 - if (do_feedback) 774 - ccid3_hc_rx_send_feedback(sk, skb, do_feedback); 813 + else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet && 814 + SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3) 815 + ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC); 775 816 } 776 817 777 818 static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) 778 819 { 779 820 struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); 780 821 781 - hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; 782 - tfrc_lh_init(&hcrx->ccid3hcrx_li_hist); 783 - return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist); 822 + tfrc_lh_init(&hcrx->li_hist); 823 + return tfrc_rx_hist_init(&hcrx->hist, sk); 784 824 } 785 825 786 826 static void ccid3_hc_rx_exit(struct sock *sk) 787 827 { 788 828 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 789 829 790 - ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); 791 - 792 - tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist); 793 - tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist); 830 + tfrc_rx_hist_purge(&hcrx->hist); 831 + tfrc_lh_cleanup(&hcrx->li_hist); 794 832 } 795 833 796 834 static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) 797 835 { 798 - const struct ccid3_hc_rx_sock *hcrx; 799 - 800 - /* Listen socks doesn't have a private CCID block */ 801 - if (sk->sk_state == DCCP_LISTEN) 802 - return; 803 - 804 - hcrx = ccid3_hc_rx_sk(sk); 805 - info->tcpi_ca_state = hcrx->ccid3hcrx_state; 806 836 
info->tcpi_options |= TCPI_OPT_TIMESTAMPS; 807 - info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; 837 + info->tcpi_rcv_rtt = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist); 808 838 } 809 839 810 840 static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, 811 841 u32 __user *optval, int __user *optlen) 812 842 { 813 - const struct ccid3_hc_rx_sock *hcrx; 843 + const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 814 844 struct tfrc_rx_info rx_info; 815 845 const void *val; 816 846 817 - /* Listen socks doesn't have a private CCID block */ 818 - if (sk->sk_state == DCCP_LISTEN) 819 - return -EINVAL; 820 - 821 - hcrx = ccid3_hc_rx_sk(sk); 822 847 switch (optname) { 823 848 case DCCP_SOCKOPT_CCID_RX_INFO: 824 849 if (len < sizeof(rx_info)) 825 850 return -EINVAL; 826 - rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv; 827 - rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt; 828 - rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? ~0U : 829 - scaled_div(1, hcrx->ccid3hcrx_pinv); 851 + rx_info.tfrcrx_x_recv = hcrx->x_recv; 852 + rx_info.tfrcrx_rtt = tfrc_rx_hist_rtt(&hcrx->hist); 853 + rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hcrx->p_inverse); 830 854 len = sizeof(rx_info); 831 855 val = &rx_info; 832 856 break; ··· 786 962 .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, 787 963 }; 788 964 965 + module_param(do_osc_prev, bool, 0644); 966 + MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)"); 967 + 789 968 #ifdef CONFIG_IP_DCCP_CCID3_DEBUG 790 969 module_param(ccid3_debug, bool, 0644); 791 970 MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); ··· 796 969 797 970 static __init int ccid3_module_init(void) 798 971 { 972 + struct timespec tp; 973 + 974 + /* 975 + * Without a fine-grained clock resolution, RTTs/X_recv are not sampled 976 + * correctly and feedback is sent either too early or too late. 
977 + */ 978 + hrtimer_get_res(CLOCK_MONOTONIC, &tp); 979 + if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) { 980 + printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec" 981 + " resolution - check your clocksource.\n", __func__, 982 + tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION); 983 + return -ESOCKTNOSUPPORT; 984 + } 799 985 return ccid_register(&ccid3); 800 986 } 801 987 module_init(ccid3_module_init);
+68 -85
net/dccp/ccids/ccid3.h
··· 47 47 /* Two seconds as per RFC 3448 4.2 */ 48 48 #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) 49 49 50 - /* In usecs - half the scheduling granularity as per RFC3448 4.6 */ 51 - #define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) 50 + /* Maximum backoff interval t_mbi (RFC 3448, 4.3) */ 51 + #define TFRC_T_MBI (64 * USEC_PER_SEC) 52 52 53 - /* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ 54 - #define TFRC_T_MBI 64 53 + /* 54 + * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are 55 + * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. 56 + * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse 57 + * resolution of HZ < 500 means that the error is below one timer tick (t_gran) 58 + * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). 59 + */ 60 + #if (HZ >= 500) 61 + # define TFRC_T_DELTA USEC_PER_MSEC 62 + #else 63 + # define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) 64 + #warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC. 
65 + #endif 55 66 56 67 enum ccid3_options { 57 68 TFRC_OPT_LOSS_EVENT_RATE = 192, ··· 70 59 TFRC_OPT_RECEIVE_RATE = 194, 71 60 }; 72 61 73 - struct ccid3_options_received { 74 - u64 ccid3or_seqno:48, 75 - ccid3or_loss_intervals_idx:16; 76 - u16 ccid3or_loss_intervals_len; 77 - u32 ccid3or_loss_event_rate; 78 - u32 ccid3or_receive_rate; 79 - }; 80 - 81 - /* TFRC sender states */ 82 - enum ccid3_hc_tx_states { 83 - TFRC_SSTATE_NO_SENT = 1, 84 - TFRC_SSTATE_NO_FBACK, 85 - TFRC_SSTATE_FBACK, 86 - TFRC_SSTATE_TERM, 87 - }; 88 - 89 62 /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket 90 63 * 91 - * @ccid3hctx_x - Current sending rate in 64 * bytes per second 92 - * @ccid3hctx_x_recv - Receive rate in 64 * bytes per second 93 - * @ccid3hctx_x_calc - Calculated rate in bytes per second 94 - * @ccid3hctx_rtt - Estimate of current round trip time in usecs 95 - * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000 96 - * @ccid3hctx_s - Packet size in bytes 97 - * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs 98 - * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs 99 - * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states 100 - * @ccid3hctx_last_win_count - Last window counter sent 101 - * @ccid3hctx_t_last_win_count - Timestamp of earliest packet 102 - * with last_win_count value sent 103 - * @ccid3hctx_no_feedback_timer - Handle to no feedback timer 104 - * @ccid3hctx_t_ld - Time last doubled during slow start 105 - * @ccid3hctx_t_nom - Nominal send time of next packet 106 - * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs 107 - * @ccid3hctx_hist - Packet history 108 - * @ccid3hctx_options_received - Parsed set of retrieved options 64 + * @x - Current sending rate in 64 * bytes per second 65 + * @x_recv - Receive rate in 64 * bytes per second 66 + * @x_calc - Calculated rate in bytes per second 67 + * @rtt - Estimate of current round trip time in usecs 68 + * @r_sqmean - Estimate of long-term RTT 
(RFC 3448, 4.5) 69 + * @p - Current loss event rate (0-1) scaled by 1000000 70 + * @s - Packet size in bytes 71 + * @t_rto - Nofeedback Timer setting in usecs 72 + * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs 73 + * @feedback - Whether feedback has been received or not 74 + * @last_win_count - Last window counter sent 75 + * @t_last_win_count - Timestamp of earliest packet with 76 + * last_win_count value sent 77 + * @no_feedback_timer - Handle to no feedback timer 78 + * @t_ld - Time last doubled during slow start 79 + * @t_nom - Nominal send time of next packet 80 + * @hist - Packet history 109 81 */ 110 82 struct ccid3_hc_tx_sock { 111 - struct tfrc_tx_info ccid3hctx_tfrc; 112 - #define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x 113 - #define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv 114 - #define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc 115 - #define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt 116 - #define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p 117 - #define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto 118 - #define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi 119 - u16 ccid3hctx_s; 120 - enum ccid3_hc_tx_states ccid3hctx_state:8; 121 - u8 ccid3hctx_last_win_count; 122 - ktime_t ccid3hctx_t_last_win_count; 123 - struct timer_list ccid3hctx_no_feedback_timer; 124 - ktime_t ccid3hctx_t_ld; 125 - ktime_t ccid3hctx_t_nom; 126 - u32 ccid3hctx_delta; 127 - struct tfrc_tx_hist_entry *ccid3hctx_hist; 128 - struct ccid3_options_received ccid3hctx_options_received; 83 + u64 x; 84 + u64 x_recv; 85 + u32 x_calc; 86 + u32 rtt; 87 + u16 r_sqmean; 88 + u32 p; 89 + u32 t_rto; 90 + u32 t_ipi; 91 + u16 s; 92 + bool feedback:1; 93 + u8 last_win_count; 94 + ktime_t t_last_win_count; 95 + struct timer_list no_feedback_timer; 96 + ktime_t t_ld; 97 + ktime_t t_nom; 98 + struct tfrc_tx_hist_entry *hist; 129 99 }; 130 100 131 101 static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) ··· 116 124 return hctx; 117 125 } 118 126 119 - /* TFRC receiver states 
*/ 120 - enum ccid3_hc_rx_states { 121 - TFRC_RSTATE_NO_DATA = 1, 122 - TFRC_RSTATE_DATA, 123 - TFRC_RSTATE_TERM = 127, 127 + 128 + enum ccid3_fback_type { 129 + CCID3_FBACK_NONE = 0, 130 + CCID3_FBACK_INITIAL, 131 + CCID3_FBACK_PERIODIC, 132 + CCID3_FBACK_PARAM_CHANGE 124 133 }; 125 134 126 135 /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket 127 136 * 128 - * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3) 129 - * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard) 130 - * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4) 131 - * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1) 132 - * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states 133 - * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes 134 - * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) 135 - * @ccid3hcrx_rtt - Receiver estimate of RTT 136 - * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent 137 - * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent 138 - * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling) 139 - * @ccid3hcrx_li_hist - Loss Interval database 140 - * @ccid3hcrx_s - Received packet size in bytes 141 - * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) 137 + * @last_counter - Tracks window counter (RFC 4342, 8.1) 138 + * @feedback - The type of the feedback last sent 139 + * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) 140 + * @tstamp_last_feedback - Time at which last feedback was sent 141 + * @hist - Packet history (loss detection + RTT sampling) 142 + * @li_hist - Loss Interval database 143 + * @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 
8.5) 142 144 */ 143 145 struct ccid3_hc_rx_sock { 144 - u8 ccid3hcrx_last_counter:4; 145 - enum ccid3_hc_rx_states ccid3hcrx_state:8; 146 - u32 ccid3hcrx_bytes_recv; 147 - u32 ccid3hcrx_x_recv; 148 - u32 ccid3hcrx_rtt; 149 - ktime_t ccid3hcrx_tstamp_last_feedback; 150 - struct tfrc_rx_hist ccid3hcrx_hist; 151 - struct tfrc_loss_hist ccid3hcrx_li_hist; 152 - u16 ccid3hcrx_s; 153 - #define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean 146 + u8 last_counter:4; 147 + enum ccid3_fback_type feedback:4; 148 + u32 x_recv; 149 + ktime_t tstamp_last_feedback; 150 + struct tfrc_rx_hist hist; 151 + struct tfrc_loss_hist li_hist; 152 + #define p_inverse li_hist.i_mean 154 153 }; 155 154 156 155 static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
+16 -14
net/dccp/ccids/lib/loss_interval.c
··· 86 86 87 87 /** 88 88 * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 89 - * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev 89 + * This updates I_mean as the sequence numbers increase. As a consequence, the 90 + * open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1) 91 + * decreases, and thus there is no need to send renewed feedback. 90 92 */ 91 - u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) 93 + void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) 92 94 { 93 95 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); 94 - u32 old_i_mean = lh->i_mean; 95 96 s64 len; 96 97 97 98 if (cur == NULL) /* not initialised */ 98 - return 0; 99 + return; 100 + 101 + /* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */ 102 + if (!dccp_data_packet(skb)) 103 + return; 99 104 100 105 len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; 101 106 102 107 if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ 103 - return 0; 108 + return; 104 109 105 110 if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) 106 111 /* ··· 119 114 cur->li_is_closed = 1; 120 115 121 116 if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ 122 - return 0; 117 + return; 123 118 124 119 cur->li_length = len; 125 120 tfrc_lh_calc_i_mean(lh); 126 - 127 - return (lh->i_mean < old_i_mean); 128 121 } 129 - EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean); 130 122 131 123 /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ 132 124 static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, ··· 140 138 * @sk: Used by @calc_first_li in caller-specific way (subtyping) 141 139 * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. 
142 140 */ 143 - int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, 144 - u32 (*calc_first_li)(struct sock *), struct sock *sk) 141 + bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, 142 + u32 (*calc_first_li)(struct sock *), struct sock *sk) 145 143 { 146 144 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; 147 145 148 146 if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) 149 - return 0; 147 + return false; 150 148 151 149 new = tfrc_lh_demand_next(lh); 152 150 if (unlikely(new == NULL)) { 153 151 DCCP_CRIT("Cannot allocate/add loss record."); 154 - return 0; 152 + return false; 155 153 } 156 154 157 155 new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; ··· 169 167 170 168 tfrc_lh_calc_i_mean(lh); 171 169 } 172 - return 1; 170 + return true; 173 171 } 174 172 EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); 175 173
+2 -2
net/dccp/ccids/lib/loss_interval.h
··· 67 67 68 68 struct tfrc_rx_hist; 69 69 70 - extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, 70 + extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, 71 71 u32 (*first_li)(struct sock *), struct sock *); 72 - extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); 72 + extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); 73 73 extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); 74 74 75 75 #endif /* _DCCP_LI_HIST_ */
+146 -138
net/dccp/ccids/lib/packet_history.c
··· 40 40 #include "packet_history.h" 41 41 #include "../../dccp.h" 42 42 43 - /** 44 - * tfrc_tx_hist_entry - Simple singly-linked TX history list 45 - * @next: next oldest entry (LIFO order) 46 - * @seqno: sequence number of this entry 47 - * @stamp: send time of packet with sequence number @seqno 48 - */ 49 - struct tfrc_tx_hist_entry { 50 - struct tfrc_tx_hist_entry *next; 51 - u64 seqno; 52 - ktime_t stamp; 53 - }; 54 - 55 43 /* 56 44 * Transmitter History Routines 57 45 */ ··· 59 71 kmem_cache_destroy(tfrc_tx_hist_slab); 60 72 tfrc_tx_hist_slab = NULL; 61 73 } 62 - } 63 - 64 - static struct tfrc_tx_hist_entry * 65 - tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) 66 - { 67 - while (head != NULL && head->seqno != seqno) 68 - head = head->next; 69 - 70 - return head; 71 74 } 72 75 73 76 int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) ··· 89 110 *headp = NULL; 90 111 } 91 112 EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); 92 - 93 - u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, 94 - const ktime_t now) 95 - { 96 - u32 rtt = 0; 97 - struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno); 98 - 99 - if (packet != NULL) { 100 - rtt = ktime_us_delta(now, packet->stamp); 101 - /* 102 - * Garbage-collect older (irrelevant) entries: 103 - */ 104 - tfrc_tx_hist_purge(&packet->next); 105 - } 106 - 107 - return rtt; 108 - } 109 - EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt); 110 - 111 113 112 114 /* 113 115 * Receiver History Routines ··· 151 191 } 152 192 EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); 153 193 194 + 195 + static void __tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) 196 + { 197 + struct tfrc_rx_hist_entry *tmp = h->ring[a]; 198 + 199 + h->ring[a] = h->ring[b]; 200 + h->ring[b] = tmp; 201 + } 202 + 154 203 static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) 155 204 { 156 - const u8 idx_a = tfrc_rx_hist_index(h, a), 157 - idx_b = tfrc_rx_hist_index(h, b); 
158 - struct tfrc_rx_hist_entry *tmp = h->ring[idx_a]; 205 + __tfrc_rx_hist_swap(h, tfrc_rx_hist_index(h, a), 206 + tfrc_rx_hist_index(h, b)); 207 + } 159 208 160 - h->ring[idx_a] = h->ring[idx_b]; 161 - h->ring[idx_b] = tmp; 209 + /** 210 + * tfrc_rx_hist_resume_rtt_sampling - Prepare RX history for RTT sampling 211 + * This is called after loss detection has finished, when the history entry 212 + * with the index of `loss_count' holds the highest-received sequence number. 213 + * RTT sampling requires this information at ring[0] (tfrc_rx_hist_sample_rtt). 214 + */ 215 + static inline void tfrc_rx_hist_resume_rtt_sampling(struct tfrc_rx_hist *h) 216 + { 217 + __tfrc_rx_hist_swap(h, 0, tfrc_rx_hist_index(h, h->loss_count)); 218 + h->loss_count = h->loss_start = 0; 162 219 } 163 220 164 221 /* ··· 192 215 u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, 193 216 s1 = DCCP_SKB_CB(skb)->dccpd_seq; 194 217 195 - if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ 218 + if (!dccp_loss_free(s0, s1, n1)) /* gap between S0 and S1 */ 196 219 h->loss_count = 1; 197 - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); 198 - } 199 220 } 200 221 201 222 static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) ··· 215 240 216 241 if (dccp_loss_free(s2, s1, n1)) { 217 242 /* hole is filled: S0, S2, and S1 are consecutive */ 218 - h->loss_count = 0; 219 - h->loss_start = tfrc_rx_hist_index(h, 1); 243 + tfrc_rx_hist_resume_rtt_sampling(h); 220 244 } else 221 245 /* gap between S2 and S1: just update loss_prev */ 222 246 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); ··· 268 294 269 295 if (dccp_loss_free(s1, s2, n2)) { 270 296 /* entire hole filled by S0, S3, S1, S2 */ 271 - h->loss_start = tfrc_rx_hist_index(h, 2); 272 - h->loss_count = 0; 297 + tfrc_rx_hist_resume_rtt_sampling(h); 273 298 } else { 274 299 /* gap remains between S1 and S2 */ 275 300 h->loss_start = tfrc_rx_hist_index(h, 1); ··· 312 339 313 340 if 
(dccp_loss_free(s2, s3, n3)) { 314 341 /* no gap between S2 and S3: entire hole is filled */ 315 - h->loss_start = tfrc_rx_hist_index(h, 3); 316 - h->loss_count = 0; 342 + tfrc_rx_hist_resume_rtt_sampling(h); 317 343 } else { 318 344 /* gap between S2 and S3 */ 319 345 h->loss_start = tfrc_rx_hist_index(h, 2); ··· 326 354 } 327 355 328 356 /** 329 - * tfrc_rx_handle_loss - Loss detection and further processing 330 - * @h: The non-empty RX history object 331 - * @lh: Loss Intervals database to update 332 - * @skb: Currently received packet 333 - * @ndp: The NDP count belonging to @skb 334 - * @calc_first_li: Caller-dependent computation of first loss interval in @lh 335 - * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) 357 + * tfrc_rx_congestion_event - Loss detection and further processing 358 + * @h: The non-empty RX history object 359 + * @lh: Loss Intervals database to update 360 + * @skb: Currently received packet 361 + * @ndp: The NDP count belonging to @skb 362 + * @first_li: Caller-dependent computation of first loss interval in @lh 363 + * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) 336 364 * Chooses action according to pending loss, updates LI database when a new 337 365 * loss was detected, and does required post-processing. Returns 1 when caller 338 366 * should send feedback, 0 otherwise. ··· 340 368 * records accordingly, the caller should not perform any more RX history 341 369 * operations when loss_count is greater than 0 after calling this function. 
342 370 */ 343 - int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, 344 - struct tfrc_loss_hist *lh, 345 - struct sk_buff *skb, const u64 ndp, 346 - u32 (*calc_first_li)(struct sock *), struct sock *sk) 371 + bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, 372 + struct tfrc_loss_hist *lh, 373 + struct sk_buff *skb, const u64 ndp, 374 + u32 (*first_li)(struct sock *), struct sock *sk) 347 375 { 348 - int is_new_loss = 0; 376 + bool new_event = false; 377 + 378 + if (tfrc_rx_hist_duplicate(h, skb)) 379 + return 0; 349 380 350 381 if (h->loss_count == 0) { 351 382 __do_track_loss(h, skb, ndp); 383 + tfrc_rx_hist_sample_rtt(h, skb); 384 + tfrc_rx_hist_add_packet(h, skb, ndp); 352 385 } else if (h->loss_count == 1) { 353 386 __one_after_loss(h, skb, ndp); 354 387 } else if (h->loss_count != 2) { ··· 362 385 /* 363 386 * Update Loss Interval database and recycle RX records 364 387 */ 365 - is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); 388 + new_event = tfrc_lh_interval_add(lh, h, first_li, sk); 366 389 __three_after_loss(h); 367 390 } 368 - return is_new_loss; 369 - } 370 - EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); 371 391 372 - int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) 392 + /* 393 + * Update moving-average of `s' and the sum of received payload bytes. 
394 + */ 395 + if (dccp_data_packet(skb)) { 396 + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; 397 + 398 + h->packet_size = tfrc_ewma(h->packet_size, payload, 9); 399 + h->bytes_recvd += payload; 400 + } 401 + 402 + /* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */ 403 + if (!new_event) 404 + tfrc_lh_update_i_mean(lh, skb); 405 + 406 + return new_event; 407 + } 408 + EXPORT_SYMBOL_GPL(tfrc_rx_congestion_event); 409 + 410 + /* Compute the sending rate X_recv measured between feedback intervals */ 411 + u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv) 373 412 { 374 - int i; 413 + u64 bytes = h->bytes_recvd, last_rtt = h->rtt_estimate; 414 + s64 delta = ktime_to_us(net_timedelta(h->bytes_start)); 375 415 376 - for (i = 0; i <= TFRC_NDUPACK; i++) { 377 - h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); 378 - if (h->ring[i] == NULL) 379 - goto out_free; 416 + WARN_ON(delta <= 0); 417 + /* 418 + * Ensure that the sampling interval for X_recv is at least one RTT, 419 + * by extending the sampling interval backwards in time, over the last 420 + * R_(m-1) seconds, as per rfc3448bis-06, 6.2. 421 + * To reduce noise (e.g. when the RTT changes often), this is only 422 + * done when delta is smaller than RTT/2. 423 + */ 424 + if (last_x_recv > 0 && delta < last_rtt/2) { 425 + tfrc_pr_debug("delta < RTT ==> %ld us < %u us\n", 426 + (long)delta, (unsigned)last_rtt); 427 + 428 + delta = (bytes ? 
delta : 0) + last_rtt; 429 + bytes += div_u64((u64)last_x_recv * last_rtt, USEC_PER_SEC); 380 430 } 381 431 382 - h->loss_count = h->loss_start = 0; 383 - return 0; 384 - 385 - out_free: 386 - while (i-- != 0) { 387 - kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); 388 - h->ring[i] = NULL; 432 + if (unlikely(bytes == 0)) { 433 + DCCP_WARN("X_recv == 0, using old value of %u\n", last_x_recv); 434 + return last_x_recv; 389 435 } 390 - return -ENOBUFS; 436 + return scaled_div32(bytes, delta); 391 437 } 392 - EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc); 438 + EXPORT_SYMBOL_GPL(tfrc_rx_hist_x_recv); 393 439 394 440 void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) 395 441 { ··· 426 426 } 427 427 EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); 428 428 429 - /** 430 - * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against 431 - */ 432 - static inline struct tfrc_rx_hist_entry * 433 - tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) 429 + static int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) 434 430 { 435 - return h->ring[0]; 431 + int i; 432 + 433 + memset(h, 0, sizeof(*h)); 434 + 435 + for (i = 0; i <= TFRC_NDUPACK; i++) { 436 + h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); 437 + if (h->ring[i] == NULL) { 438 + tfrc_rx_hist_purge(h); 439 + return -ENOBUFS; 440 + } 441 + } 442 + return 0; 436 443 } 437 444 438 - /** 439 - * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry 440 - */ 441 - static inline struct tfrc_rx_hist_entry * 442 - tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) 445 + int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk) 443 446 { 444 - return h->ring[h->rtt_sample_prev]; 447 + if (tfrc_rx_hist_alloc(h)) 448 + return -ENOBUFS; 449 + /* 450 + * Initialise first entry with GSR to start loss detection as early as 451 + * possible. Code using this must not use any other fields. The entry 452 + * will be overwritten once the CCID updates its received packets. 
453 + */ 454 + tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno = dccp_sk(sk)->dccps_gsr; 455 + return 0; 445 456 } 457 + EXPORT_SYMBOL_GPL(tfrc_rx_hist_init); 446 458 447 459 /** 448 460 * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal 449 - * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able 450 - * to compute a sample with given data - calling function should check this. 461 + * Based on ideas presented in RFC 4342, 8.1. This function expects that no loss 462 + * is pending and uses the following history entries (via rtt_sample_prev): 463 + * - h->ring[0] contains the most recent history entry prior to @skb; 464 + * - h->ring[1] is an unused `dummy' entry when the current difference is 0; 451 465 */ 452 - u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) 466 + void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) 453 467 { 454 - u32 sample = 0, 455 - delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, 456 - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); 468 + struct tfrc_rx_hist_entry *last = h->ring[0]; 469 + u32 sample, delta_v; 457 470 458 - if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ 459 - if (h->rtt_sample_prev == 2) { /* previous candidate stored */ 460 - sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, 461 - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); 462 - if (sample) 463 - sample = 4 / sample * 464 - ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, 465 - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); 466 - else /* 467 - * FIXME: This condition is in principle not 468 - * possible but occurs when CCID is used for 469 - * two-way data traffic. I have tried to trace 470 - * it, but the cause does not seem to be here. 
471 - */ 472 - DCCP_BUG("please report to dccp@vger.kernel.org" 473 - " => prev = %u, last = %u", 474 - tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, 475 - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); 476 - } else if (delta_v < 1) { 477 - h->rtt_sample_prev = 1; 478 - goto keep_ref_for_next_time; 479 - } 471 + /* 472 + * When not to sample: 473 + * - on non-data packets 474 + * (RFC 4342, 8.1: CCVal only fully defined for data packets); 475 + * - when no data packets have been received yet 476 + * (FIXME: using sampled packet size as indicator here); 477 + * - as long as there are gaps in the sequence space (pending loss). 478 + */ 479 + if (!dccp_data_packet(skb) || h->packet_size == 0 || 480 + tfrc_rx_hist_loss_pending(h)) 481 + return; 480 482 481 - } else if (delta_v == 4) /* optimal match */ 482 - sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); 483 - else { /* suboptimal match */ 484 - h->rtt_sample_prev = 2; 485 - goto keep_ref_for_next_time; 483 + h->rtt_sample_prev = 0; /* reset previous candidate */ 484 + 485 + delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, last->tfrchrx_ccval); 486 + if (delta_v == 0) { /* less than RTT/4 difference */ 487 + h->rtt_sample_prev = 1; 488 + return; 486 489 } 490 + sample = dccp_sane_rtt(ktime_to_us(net_timedelta(last->tfrchrx_tstamp))); 487 491 488 - if (unlikely(sample > DCCP_SANE_RTT_MAX)) { 489 - DCCP_WARN("RTT sample %u too large, using max\n", sample); 490 - sample = DCCP_SANE_RTT_MAX; 491 - } 492 + if (delta_v <= 4) /* between RTT/4 and RTT */ 493 + sample *= 4 / delta_v; 494 + else if (!(sample < h->rtt_estimate && sample > h->rtt_estimate/2)) 495 + /* 496 + * Optimisation: CCVal difference is greater than 1 RTT, yet the 497 + * sample is less than the local RTT estimate; which means that 498 + * the RTT estimate is too high. 499 + * To avoid noise, it is not done if the sample is below RTT/2. 
500 + */ 501 + return; 492 502 493 - h->rtt_sample_prev = 0; /* use current entry as next reference */ 494 - keep_ref_for_next_time: 495 - 496 - return sample; 503 + /* Use a lower weight than usual to increase responsiveness */ 504 + h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 5); 497 505 } 498 506 EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt);
+67 -11
net/dccp/ccids/lib/packet_history.h
··· 40 40 #include <linux/slab.h> 41 41 #include "tfrc.h" 42 42 43 - struct tfrc_tx_hist_entry; 43 + /** 44 + * tfrc_tx_hist_entry - Simple singly-linked TX history list 45 + * @next: next oldest entry (LIFO order) 46 + * @seqno: sequence number of this entry 47 + * @stamp: send time of packet with sequence number @seqno 48 + */ 49 + struct tfrc_tx_hist_entry { 50 + struct tfrc_tx_hist_entry *next; 51 + u64 seqno; 52 + ktime_t stamp; 53 + }; 54 + 55 + static inline struct tfrc_tx_hist_entry * 56 + tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) 57 + { 58 + while (head != NULL && head->seqno != seqno) 59 + head = head->next; 60 + return head; 61 + } 44 62 45 63 extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); 46 64 extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); 47 - extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, 48 - const u64 seqno, const ktime_t now); 49 65 50 66 /* Subtraction a-b modulo-16, respects circular wrap-around */ 51 67 #define SUB16(a, b) (((a) + 16 - (b)) & 0xF) ··· 91 75 * @loss_count: Number of entries in circular history 92 76 * @loss_start: Movable index (for loss detection) 93 77 * @rtt_sample_prev: Used during RTT sampling, points to candidate entry 78 + * @rtt_estimate: Receiver RTT estimate 79 + * @packet_size: Packet size in bytes (as per RFC 3448, 3.1) 80 + * @bytes_recvd: Number of bytes received since @bytes_start 81 + * @bytes_start: Start time for counting @bytes_recvd 94 82 */ 95 83 struct tfrc_rx_hist { 96 84 struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; 97 85 u8 loss_count:2, 98 86 loss_start:2; 87 + /* Receiver RTT sampling */ 99 88 #define rtt_sample_prev loss_start 89 + u32 rtt_estimate; 90 + /* Receiver sampling of application payload lengths */ 91 + u32 packet_size, 92 + bytes_recvd; 93 + ktime_t bytes_start; 100 94 }; 101 95 102 96 /** ··· 150 124 return h->loss_count > 0; 151 125 } 152 126 127 + /* 128 + * Accessor functions to retrieve 
parameters sampled by the RX history 129 + */ 130 + static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h) 131 + { 132 + if (h->packet_size == 0) { 133 + DCCP_WARN("No sample for s, using fallback\n"); 134 + return TCP_MIN_RCVMSS; 135 + } 136 + return h->packet_size; 137 + 138 + } 139 + static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h) 140 + { 141 + if (h->rtt_estimate == 0) { 142 + DCCP_WARN("No RTT estimate available, using fallback RTT\n"); 143 + return DCCP_FALLBACK_RTT; 144 + } 145 + return h->rtt_estimate; 146 + } 147 + 148 + static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h) 149 + { 150 + h->bytes_recvd = 0; 151 + h->bytes_start = ktime_get_real(); 152 + } 153 + 154 + extern u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv); 155 + 156 + 153 157 extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, 154 158 const struct sk_buff *skb, const u64 ndp); 155 159 156 160 extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); 157 161 158 162 struct tfrc_loss_hist; 159 - extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, 160 - struct tfrc_loss_hist *lh, 161 - struct sk_buff *skb, const u64 ndp, 162 - u32 (*first_li)(struct sock *sk), 163 - struct sock *sk); 164 - extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, 165 - const struct sk_buff *skb); 166 - extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); 163 + extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, 164 + struct tfrc_loss_hist *lh, 165 + struct sk_buff *skb, const u64 ndp, 166 + u32 (*first_li)(struct sock *sk), 167 + struct sock *sk); 168 + extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, 169 + const struct sk_buff *skb); 170 + extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk); 167 171 extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); 168 172 169 173 #endif /* _DCCP_PKT_HIST_ */
+16
net/dccp/ccids/lib/tfrc.h
··· 48 48 } 49 49 50 50 /** 51 + * tfrc_scaled_sqrt - Compute scaled integer sqrt(x) for 0 < x < 2^22-1 52 + * Uses scaling to improve accuracy of the integer approximation of sqrt(). The 53 + * scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for 54 + * clamped RTT samples (dccp_sample_rtt). 55 + * Should best be used for expressions of type sqrt(x)/sqrt(y), since then the 56 + * scaling factor is neutralised. For this purpose, it avoids returning zero. 57 + */ 58 + static inline u16 tfrc_scaled_sqrt(const u32 sample) 59 + { 60 + const unsigned long non_zero_sample = sample ? : 1; 61 + 62 + return int_sqrt(non_zero_sample << 10); 63 + } 64 + 65 + /** 51 66 * tfrc_ewma - Exponentially weighted moving average 52 67 * @weight: Weight to be used as damping factor, in units of 1/10 53 68 */ ··· 73 58 74 59 extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); 75 60 extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); 61 + extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); 76 62 77 63 extern int tfrc_tx_packet_history_init(void); 78 64 extern void tfrc_tx_packet_history_exit(void);
+25 -4
net/dccp/ccids/lib/tfrc_equation.c
··· 632 632 633 633 if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ 634 634 if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ 635 - DCCP_WARN("Value of p (%d) below resolution. " 636 - "Substituting %d\n", p, TFRC_SMALLEST_P); 635 + /* 636 + * In the congestion-avoidance phase p decays towards 0 637 + * when there are no further losses, so this case is 638 + * natural. Truncating to p_min = 0.01% means that the 639 + * maximum achievable throughput is limited to about 640 + * X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g. 641 + * with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps. 642 + */ 643 + tfrc_pr_debug("Value of p (%d) below resolution. " 644 + "Substituting %d\n", p, TFRC_SMALLEST_P); 637 645 index = 0; 638 646 } else /* 0.0001 <= p <= 0.05 */ 639 647 index = p/TFRC_SMALLEST_P - 1; ··· 666 658 result = scaled_div(s, R); 667 659 return scaled_div32(result, f); 668 660 } 669 - 670 661 EXPORT_SYMBOL_GPL(tfrc_calc_x); 671 662 672 663 /** ··· 700 693 index = tfrc_binsearch(fvalue, 0); 701 694 return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; 702 695 } 703 - 704 696 EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); 697 + 698 + /** 699 + * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% 700 + * When @loss_event_rate is large, there is a chance that p is truncated to 0. 701 + * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. 702 + */ 703 + u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) 704 + { 705 + if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ 706 + return 0; 707 + if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ 708 + return 1000000; 709 + return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); 710 + } 711 + EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate);
+77 -27
net/dccp/dccp.h
··· 42 42 extern int dccp_debug; 43 43 #define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) 44 44 #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) 45 + #define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) 45 46 #else 46 47 #define dccp_pr_debug(format, a...) 47 48 #define dccp_pr_debug_cat(format, a...) 49 + #define dccp_debug(format, a...) 48 50 #endif 49 51 50 52 extern struct inet_hashinfo dccp_hashinfo; ··· 63 61 * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields 64 62 * Hence a safe upper bound for the maximum option length is 1020-28 = 992 65 63 */ 66 - #define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int)) 64 + #define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) 67 65 #define DCCP_MAX_PACKET_HDR 28 68 66 #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) 69 67 #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) 68 + 69 + /* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ 70 + #define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) 70 71 71 72 #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT 72 73 * state, about 60 seconds */ ··· 86 81 */ 87 82 #define DCCP_RTO_MAX ((unsigned)(64 * HZ)) 88 83 84 + /* DCCP base time resolution - 10 microseconds (RFC 4340, 13.1 ... 
13.3) */ 85 + #define DCCP_TIME_RESOLUTION 10 86 + 89 87 /* 90 88 * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 91 89 */ 92 - #define DCCP_SANE_RTT_MIN 100 90 + #define DCCP_SANE_RTT_MIN (10 * DCCP_TIME_RESOLUTION) 93 91 #define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) 94 92 #define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) 95 93 ··· 103 95 extern int sysctl_dccp_request_retries; 104 96 extern int sysctl_dccp_retries1; 105 97 extern int sysctl_dccp_retries2; 106 - extern int sysctl_dccp_feat_sequence_window; 107 - extern int sysctl_dccp_feat_rx_ccid; 108 - extern int sysctl_dccp_feat_tx_ccid; 109 - extern int sysctl_dccp_feat_ack_ratio; 110 - extern int sysctl_dccp_feat_send_ack_vector; 111 - extern int sysctl_dccp_feat_send_ndp_count; 112 98 extern int sysctl_dccp_tx_qlen; 113 99 extern int sysctl_dccp_sync_ratelimit; 114 100 ··· 237 235 extern void dccp_send_sync(struct sock *sk, const u64 seq, 238 236 const enum dccp_pkt_type pkt_type); 239 237 240 - extern void dccp_write_xmit(struct sock *sk, int block); 238 + /* 239 + * TX Packet Dequeueing Interface 240 + */ 241 + extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); 242 + extern bool dccp_qpolicy_full(struct sock *sk); 243 + extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); 244 + extern struct sk_buff *dccp_qpolicy_top(struct sock *sk); 245 + extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk); 246 + extern bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); 247 + 248 + /* 249 + * TX Packet Output and TX Timers 250 + */ 251 + extern void dccp_write_xmit(struct sock *sk); 241 252 extern void dccp_write_space(struct sock *sk); 253 + extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); 242 254 243 255 extern void dccp_init_xmit_timers(struct sock *sk); 244 256 static inline void dccp_clear_xmit_timers(struct sock *sk) ··· 268 252 extern void dccp_set_state(struct sock *sk, const int state); 269 253 extern void 
dccp_done(struct sock *sk); 270 254 271 - extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb); 255 + extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, 256 + struct sk_buff const *skb); 272 257 273 258 extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); 274 259 ··· 334 317 extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); 335 318 extern void dccp_send_close(struct sock *sk, const int active); 336 319 extern int dccp_invalid_packet(struct sk_buff *skb); 337 - extern u32 dccp_sample_rtt(struct sock *sk, long delta); 320 + 321 + static inline u32 dccp_sane_rtt(long usec_sample) 322 + { 323 + if (unlikely(usec_sample <= 0 || usec_sample > DCCP_SANE_RTT_MAX)) 324 + DCCP_WARN("RTT sample %ld out of bounds!\n", usec_sample); 325 + return clamp_val(usec_sample, DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX); 326 + } 327 + extern u32 dccp_sample_rtt(struct sock *sk, long delta); 338 328 339 329 static inline int dccp_bad_service_code(const struct sock *sk, 340 330 const __be32 service) ··· 435 411 static inline void dccp_update_gsr(struct sock *sk, u64 seq) 436 412 { 437 413 struct dccp_sock *dp = dccp_sk(sk); 438 - const struct dccp_minisock *dmsk = dccp_msk(sk); 439 414 440 415 dp->dccps_gsr = seq; 441 - dccp_set_seqno(&dp->dccps_swl, 442 - dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4)); 443 - dccp_set_seqno(&dp->dccps_swh, 444 - dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4); 416 + /* Sequence validity window depends on remote Sequence Window (7.5.1) */ 417 + dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); 418 + /* 419 + * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, 420 + * 7.5.1 we perform this check beyond the initial handshake: W/W' are 421 + * always > 32, so for the first W/W' packets in the lifetime of a 422 + * connection we always have to adjust SWL. 
423 + * A second reason why we are doing this is that the window depends on 424 + * the feature-remote value of Sequence Window: nothing stops the peer 425 + * from updating this value while we are busy adjusting SWL for the 426 + * first W packets (we would have to count from scratch again then). 427 + * Therefore it is safer to always make sure that the Sequence Window 428 + * is not artificially extended by a peer who grows SWL downwards by 429 + * continually updating the feature-remote Sequence-Window. 430 + * If sequence numbers wrap it is bad luck. But that will take a while 431 + * (48 bit), and this measure prevents Sequence-number attacks. 432 + */ 433 + if (before48(dp->dccps_swl, dp->dccps_isr)) 434 + dp->dccps_swl = dp->dccps_isr; 435 + dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); 445 436 } 446 437 447 438 static inline void dccp_update_gss(struct sock *sk, u64 seq) 448 439 { 449 440 struct dccp_sock *dp = dccp_sk(sk); 450 441 451 - dp->dccps_awh = dp->dccps_gss = seq; 452 - dccp_set_seqno(&dp->dccps_awl, 453 - (dp->dccps_gss - 454 - dccp_msk(sk)->dccpms_sequence_window + 1)); 442 + dp->dccps_gss = seq; 443 + /* Ack validity window depends on local Sequence Window value (7.5.1) */ 444 + dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); 445 + /* Adjust AWL so that it is not below ISS - see comment above for SWL */ 446 + if (before48(dp->dccps_awl, dp->dccps_iss)) 447 + dp->dccps_awl = dp->dccps_iss; 448 + dp->dccps_awh = dp->dccps_gss; 449 + } 450 + 451 + static inline int dccp_ackvec_pending(const struct sock *sk) 452 + { 453 + return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && 454 + !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); 455 455 } 456 456 457 457 static inline int dccp_ack_pending(const struct sock *sk) 458 458 { 459 - const struct dccp_sock *dp = dccp_sk(sk); 460 - return dp->dccps_timestamp_echo != 0 || 461 - #ifdef CONFIG_IP_DCCP_ACKVEC 462 - (dccp_msk(sk)->dccpms_send_ack_vector && 463 - 
dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || 464 - #endif 465 - inet_csk_ack_scheduled(sk); 459 + return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); 466 460 } 461 + 462 + extern int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); 463 + extern int dccp_feat_finalise_settings(struct dccp_sock *dp); 464 + extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); 465 + extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, 466 + struct sk_buff *skb); 467 + extern int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); 468 + extern void dccp_feat_list_purge(struct list_head *fn_list); 467 469 468 470 extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); 469 471 extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*);
+1 -1
net/dccp/diag.c
··· 29 29 info->tcpi_backoff = icsk->icsk_backoff; 30 30 info->tcpi_pmtu = icsk->icsk_pmtu_cookie; 31 31 32 - if (dccp_msk(sk)->dccpms_send_ack_vector) 32 + if (dp->dccps_hc_rx_ackvec != NULL) 33 33 info->tcpi_options |= TCPI_OPT_SACK; 34 34 35 35 ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
+1431 -548
net/dccp/feat.c
··· 1 1 /* 2 2 * net/dccp/feat.c 3 3 * 4 - * An implementation of the DCCP protocol 5 - * Andrea Bittau <a.bittau@cs.ucl.ac.uk> 4 + * Feature negotiation for the DCCP protocol (RFC 4340, section 6) 5 + * 6 + * Copyright (c) 2008 The University of Aberdeen, Scotland, UK 7 + * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk> 8 + * Rewrote from scratch, some bits from earlier code by 9 + * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> 10 + * 6 11 * 7 12 * ASSUMPTIONS 8 13 * ----------- 14 + * o Feature negotiation is coordinated with connection setup (as in TCP), wild 15 + * changes of parameters of an established connection are not supported. 16 + * o Changing NN values (Ack Ratio only) is supported in state OPEN/PARTOPEN. 9 17 * o All currently known SP features have 1-byte quantities. If in the future 10 18 * extensions of RFCs 4340..42 define features with item lengths larger than 11 19 * one byte, a feature-specific extension of the code will be required. ··· 23 15 * as published by the Free Software Foundation; either version 24 16 * 2 of the License, or (at your option) any later version. 
25 17 */ 26 - 27 18 #include <linux/module.h> 28 - 29 19 #include "ccid.h" 30 20 #include "feat.h" 31 21 32 - #define DCCP_FEAT_SP_NOAGREE (-123) 22 + /* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ 23 + unsigned long sysctl_dccp_sequence_window __read_mostly = 100; 24 + int sysctl_dccp_rx_ccid __read_mostly = 2, 25 + sysctl_dccp_tx_ccid __read_mostly = 2; 33 26 34 - int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, 35 - u8 *val, u8 len, gfp_t gfp) 36 - { 37 - struct dccp_opt_pend *opt; 38 - 39 - dccp_feat_debug(type, feature, *val); 40 - 41 - if (len > 3) { 42 - DCCP_WARN("invalid length %d\n", len); 43 - return -EINVAL; 44 - } 45 - /* XXX add further sanity checks */ 46 - 47 - /* check if that feature is already being negotiated */ 48 - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { 49 - /* ok we found a negotiation for this option already */ 50 - if (opt->dccpop_feat == feature && opt->dccpop_type == type) { 51 - dccp_pr_debug("Replacing old\n"); 52 - /* replace */ 53 - BUG_ON(opt->dccpop_val == NULL); 54 - kfree(opt->dccpop_val); 55 - opt->dccpop_val = val; 56 - opt->dccpop_len = len; 57 - opt->dccpop_conf = 0; 58 - return 0; 59 - } 60 - } 61 - 62 - /* negotiation for a new feature */ 63 - opt = kmalloc(sizeof(*opt), gfp); 64 - if (opt == NULL) 65 - return -ENOMEM; 66 - 67 - opt->dccpop_type = type; 68 - opt->dccpop_feat = feature; 69 - opt->dccpop_len = len; 70 - opt->dccpop_val = val; 71 - opt->dccpop_conf = 0; 72 - opt->dccpop_sc = NULL; 73 - 74 - BUG_ON(opt->dccpop_val == NULL); 75 - 76 - list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending); 77 - return 0; 78 - } 79 - 80 - EXPORT_SYMBOL_GPL(dccp_feat_change); 81 - 82 - static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr) 27 + /* 28 + * Feature activation handlers. 29 + * 30 + * These all use an u64 argument, to provide enough room for NN/SP features. 
At 31 + * this stage the negotiated values have been checked to be within their range. 32 + */ 33 + static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) 83 34 { 84 35 struct dccp_sock *dp = dccp_sk(sk); 85 - struct dccp_minisock *dmsk = dccp_msk(sk); 86 - /* figure out if we are changing our CCID or the peer's */ 87 - const int rx = type == DCCPO_CHANGE_R; 88 - const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid; 89 - struct ccid *new_ccid; 36 + struct ccid *new_ccid = ccid_new(ccid, sk, rx, gfp_any()); 90 37 91 - /* Check if nothing is being changed. */ 92 - if (ccid_nr == new_ccid_nr) 93 - return 0; 94 - 95 - new_ccid = ccid_new(new_ccid_nr, sk, rx, GFP_ATOMIC); 96 38 if (new_ccid == NULL) 97 39 return -ENOMEM; 98 40 99 41 if (rx) { 100 42 ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); 101 43 dp->dccps_hc_rx_ccid = new_ccid; 102 - dmsk->dccpms_rx_ccid = new_ccid_nr; 103 44 } else { 104 45 ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); 105 46 dp->dccps_hc_tx_ccid = new_ccid; 106 - dmsk->dccpms_tx_ccid = new_ccid_nr; 107 - } 108 - 109 - return 0; 110 - } 111 - 112 - static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val) 113 - { 114 - dccp_feat_debug(type, feat, val); 115 - 116 - switch (feat) { 117 - case DCCPF_CCID: 118 - return dccp_feat_update_ccid(sk, type, val); 119 - default: 120 - dccp_pr_debug("UNIMPLEMENTED: %s(%d, ...)\n", 121 - dccp_feat_typename(type), feat); 122 - break; 123 47 } 124 48 return 0; 125 49 } 126 50 127 - static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt, 128 - u8 *rpref, u8 rlen) 51 + static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) 129 52 { 130 53 struct dccp_sock *dp = dccp_sk(sk); 131 - u8 *spref, slen, *res = NULL; 132 - int i, j, rc, agree = 1; 133 54 134 - BUG_ON(rpref == NULL); 135 - 136 - /* check if we are the black sheep */ 137 - if (dp->dccps_role == DCCP_ROLE_CLIENT) { 138 - spref = rpref; 139 - slen = rlen; 140 - rpref = opt->dccpop_val; 141 - 
rlen = opt->dccpop_len; 55 + if (rx) { 56 + dp->dccps_r_seq_win = seq_win; 57 + /* propagate changes to update SWL/SWH */ 58 + dccp_update_gsr(sk, dp->dccps_gsr); 142 59 } else { 143 - spref = opt->dccpop_val; 144 - slen = opt->dccpop_len; 60 + dp->dccps_l_seq_win = seq_win; 61 + /* propagate changes to update AWL */ 62 + dccp_update_gss(sk, dp->dccps_gss); 145 63 } 146 - /* 147 - * Now we have server preference list in spref and client preference in 148 - * rpref 149 - */ 150 - BUG_ON(spref == NULL); 151 - BUG_ON(rpref == NULL); 152 - 153 - /* FIXME sanity check vals */ 154 - 155 - /* Are values in any order? XXX Lame "algorithm" here */ 156 - for (i = 0; i < slen; i++) { 157 - for (j = 0; j < rlen; j++) { 158 - if (spref[i] == rpref[j]) { 159 - res = &spref[i]; 160 - break; 161 - } 162 - } 163 - if (res) 164 - break; 165 - } 166 - 167 - /* we didn't agree on anything */ 168 - if (res == NULL) { 169 - /* confirm previous value */ 170 - switch (opt->dccpop_feat) { 171 - case DCCPF_CCID: 172 - /* XXX did i get this right? =P */ 173 - if (opt->dccpop_type == DCCPO_CHANGE_L) 174 - res = &dccp_msk(sk)->dccpms_tx_ccid; 175 - else 176 - res = &dccp_msk(sk)->dccpms_rx_ccid; 177 - break; 178 - 179 - default: 180 - DCCP_BUG("Fell through, feat=%d", opt->dccpop_feat); 181 - /* XXX implement res */ 182 - return -EFAULT; 183 - } 184 - 185 - dccp_pr_debug("Don't agree... reconfirming %d\n", *res); 186 - agree = 0; /* this is used for mandatory options... 
*/ 187 - } 188 - 189 - /* need to put result and our preference list */ 190 - rlen = 1 + opt->dccpop_len; 191 - rpref = kmalloc(rlen, GFP_ATOMIC); 192 - if (rpref == NULL) 193 - return -ENOMEM; 194 - 195 - *rpref = *res; 196 - memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len); 197 - 198 - /* put it in the "confirm queue" */ 199 - if (opt->dccpop_sc == NULL) { 200 - opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC); 201 - if (opt->dccpop_sc == NULL) { 202 - kfree(rpref); 203 - return -ENOMEM; 204 - } 205 - } else { 206 - /* recycle the confirm slot */ 207 - BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); 208 - kfree(opt->dccpop_sc->dccpoc_val); 209 - dccp_pr_debug("recycling confirm slot\n"); 210 - } 211 - memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc)); 212 - 213 - opt->dccpop_sc->dccpoc_val = rpref; 214 - opt->dccpop_sc->dccpoc_len = rlen; 215 - 216 - /* update the option on our side [we are about to send the confirm] */ 217 - rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res); 218 - if (rc) { 219 - kfree(opt->dccpop_sc->dccpoc_val); 220 - kfree(opt->dccpop_sc); 221 - opt->dccpop_sc = NULL; 222 - return rc; 223 - } 224 - 225 - dccp_pr_debug("Will confirm %d\n", *rpref); 226 - 227 - /* say we want to change to X but we just got a confirm X, suppress our 228 - * change 229 - */ 230 - if (!opt->dccpop_conf) { 231 - if (*opt->dccpop_val == *res) 232 - opt->dccpop_conf = 1; 233 - dccp_pr_debug("won't ask for change of same feature\n"); 234 - } 235 - 236 - return agree ? 
0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */ 64 + return 0; 237 65 } 238 66 239 - static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) 67 + static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) 240 68 { 241 - struct dccp_minisock *dmsk = dccp_msk(sk); 242 - struct dccp_opt_pend *opt; 243 - int rc = 1; 244 - u8 t; 245 - 69 + #ifndef __CCID2_COPES_GRACEFULLY_WITH_DYNAMIC_ACK_RATIO_UPDATES__ 246 70 /* 247 - * We received a CHANGE. We gotta match it against our own preference 248 - * list. If we got a CHANGE_R it means it's a change for us, so we need 249 - * to compare our CHANGE_L list. 71 + * FIXME: This is required until several problems in the CCID-2 code are 72 + * resolved. The CCID-2 code currently does not cope well; using dynamic 73 + * Ack Ratios greater than 1 caused instabilities. These were manifest 74 + * in hangups and long RTO timeouts (1...3 seconds). Until this has been 75 + * stabilised, it is safer not to activate dynamic Ack Ratio changes. 250 76 */ 251 - if (type == DCCPO_CHANGE_L) 252 - t = DCCPO_CHANGE_R; 77 + dccp_pr_debug("Not changing %s Ack Ratio from 1 to %u\n", 78 + rx ? "RX" : "TX", (u16)ratio); 79 + ratio = 1; 80 + #endif 81 + if (rx) 82 + dccp_sk(sk)->dccps_r_ack_ratio = ratio; 253 83 else 254 - t = DCCPO_CHANGE_L; 255 - 256 - /* find our preference list for this feature */ 257 - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { 258 - if (opt->dccpop_type != t || opt->dccpop_feat != feature) 259 - continue; 260 - 261 - /* find the winner from the two preference lists */ 262 - rc = dccp_feat_reconcile(sk, opt, val, len); 263 - break; 264 - } 265 - 266 - /* We didn't deal with the change. This can happen if we have no 267 - * preference list for the feature. In fact, it just shouldn't 268 - * happen---if we understand a feature, we should have a preference list 269 - * with at least the default value. 
270 - */ 271 - BUG_ON(rc == 1); 272 - 273 - return rc; 274 - } 275 - 276 - static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) 277 - { 278 - struct dccp_opt_pend *opt; 279 - struct dccp_minisock *dmsk = dccp_msk(sk); 280 - u8 *copy; 281 - int rc; 282 - 283 - /* NN features must be Change L (sec. 6.3.2) */ 284 - if (type != DCCPO_CHANGE_L) { 285 - dccp_pr_debug("received %s for NN feature %d\n", 286 - dccp_feat_typename(type), feature); 287 - return -EFAULT; 288 - } 289 - 290 - /* XXX sanity check opt val */ 291 - 292 - /* copy option so we can confirm it */ 293 - opt = kzalloc(sizeof(*opt), GFP_ATOMIC); 294 - if (opt == NULL) 295 - return -ENOMEM; 296 - 297 - copy = kmemdup(val, len, GFP_ATOMIC); 298 - if (copy == NULL) { 299 - kfree(opt); 300 - return -ENOMEM; 301 - } 302 - 303 - opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */ 304 - opt->dccpop_feat = feature; 305 - opt->dccpop_val = copy; 306 - opt->dccpop_len = len; 307 - 308 - /* change feature */ 309 - rc = dccp_feat_update(sk, type, feature, *val); 310 - if (rc) { 311 - kfree(opt->dccpop_val); 312 - kfree(opt); 313 - return rc; 314 - } 315 - 316 - dccp_feat_debug(type, feature, *copy); 317 - 318 - list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); 319 - 84 + dccp_sk(sk)->dccps_l_ack_ratio = ratio; 320 85 return 0; 321 86 } 322 87 323 - static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk, 324 - u8 type, u8 feature) 88 + static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) 325 89 { 326 - /* XXX check if other confirms for that are queued and recycle slot */ 327 - struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC); 90 + struct dccp_sock *dp = dccp_sk(sk); 328 91 329 - if (opt == NULL) { 330 - /* XXX what do we do? Ignoring should be fine. 
It's a change 331 - * after all =P 332 - */ 333 - return; 334 - } 335 - 336 - switch (type) { 337 - case DCCPO_CHANGE_L: 338 - opt->dccpop_type = DCCPO_CONFIRM_R; 339 - break; 340 - case DCCPO_CHANGE_R: 341 - opt->dccpop_type = DCCPO_CONFIRM_L; 342 - break; 343 - default: 344 - DCCP_WARN("invalid type %d\n", type); 345 - kfree(opt); 346 - return; 347 - } 348 - opt->dccpop_feat = feature; 349 - opt->dccpop_val = NULL; 350 - opt->dccpop_len = 0; 351 - 352 - /* change feature */ 353 - dccp_pr_debug("Empty %s(%d)\n", dccp_feat_typename(type), feature); 354 - 355 - list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); 356 - } 357 - 358 - static void dccp_feat_flush_confirm(struct sock *sk) 359 - { 360 - struct dccp_minisock *dmsk = dccp_msk(sk); 361 - /* Check if there is anything to confirm in the first place */ 362 - int yes = !list_empty(&dmsk->dccpms_conf); 363 - 364 - if (!yes) { 365 - struct dccp_opt_pend *opt; 366 - 367 - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { 368 - if (opt->dccpop_conf) { 369 - yes = 1; 370 - break; 371 - } 92 + if (rx) { 93 + if (enable && dp->dccps_hc_rx_ackvec == NULL) { 94 + dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); 95 + if (dp->dccps_hc_rx_ackvec == NULL) 96 + return -ENOMEM; 97 + } else if (!enable) { 98 + dccp_ackvec_free(dp->dccps_hc_rx_ackvec); 99 + dp->dccps_hc_rx_ackvec = NULL; 372 100 } 373 101 } 374 - 375 - if (!yes) 376 - return; 377 - 378 - /* OK there is something to confirm... */ 379 - /* XXX check if packet is in flight? Send delayed ack?? 
*/ 380 - if (sk->sk_state == DCCP_OPEN) 381 - dccp_send_ack(sk); 382 - } 383 - 384 - int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) 385 - { 386 - int rc; 387 - 388 - dccp_feat_debug(type, feature, *val); 389 - 390 - /* figure out if it's SP or NN feature */ 391 - switch (feature) { 392 - /* deal with SP features */ 393 - case DCCPF_CCID: 394 - rc = dccp_feat_sp(sk, type, feature, val, len); 395 - break; 396 - 397 - /* deal with NN features */ 398 - case DCCPF_ACK_RATIO: 399 - rc = dccp_feat_nn(sk, type, feature, val, len); 400 - break; 401 - 402 - /* XXX implement other features */ 403 - default: 404 - dccp_pr_debug("UNIMPLEMENTED: not handling %s(%d, ...)\n", 405 - dccp_feat_typename(type), feature); 406 - rc = -EFAULT; 407 - break; 408 - } 409 - 410 - /* check if there were problems changing features */ 411 - if (rc) { 412 - /* If we don't agree on SP, we sent a confirm for old value. 413 - * However we propagate rc to caller in case option was 414 - * mandatory 415 - */ 416 - if (rc != DCCP_FEAT_SP_NOAGREE) 417 - dccp_feat_empty_confirm(dccp_msk(sk), type, feature); 418 - } 419 - 420 - /* generate the confirm [if required] */ 421 - dccp_feat_flush_confirm(sk); 422 - 423 - return rc; 424 - } 425 - 426 - EXPORT_SYMBOL_GPL(dccp_feat_change_recv); 427 - 428 - int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, 429 - u8 *val, u8 len) 430 - { 431 - u8 t; 432 - struct dccp_opt_pend *opt; 433 - struct dccp_minisock *dmsk = dccp_msk(sk); 434 - int found = 0; 435 - int all_confirmed = 1; 436 - 437 - dccp_feat_debug(type, feature, *val); 438 - 439 - /* locate our change request */ 440 - switch (type) { 441 - case DCCPO_CONFIRM_L: t = DCCPO_CHANGE_R; break; 442 - case DCCPO_CONFIRM_R: t = DCCPO_CHANGE_L; break; 443 - default: DCCP_WARN("invalid type %d\n", type); 444 - return 1; 445 - 446 - } 447 - /* XXX sanity check feature value */ 448 - 449 - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { 450 - if 
(!opt->dccpop_conf && opt->dccpop_type == t && 451 - opt->dccpop_feat == feature) { 452 - found = 1; 453 - dccp_pr_debug("feature %d found\n", opt->dccpop_feat); 454 - 455 - /* XXX do sanity check */ 456 - 457 - opt->dccpop_conf = 1; 458 - 459 - /* We got a confirmation---change the option */ 460 - dccp_feat_update(sk, opt->dccpop_type, 461 - opt->dccpop_feat, *val); 462 - 463 - /* XXX check the return value of dccp_feat_update */ 464 - break; 465 - } 466 - 467 - if (!opt->dccpop_conf) 468 - all_confirmed = 0; 469 - } 470 - 471 - /* fix re-transmit timer */ 472 - /* XXX gotta make sure that no option negotiation occurs during 473 - * connection shutdown. Consider that the CLOSEREQ is sent and timer is 474 - * on. if all options are confirmed it might kill timer which should 475 - * remain alive until close is received. 476 - */ 477 - if (all_confirmed) { 478 - dccp_pr_debug("clear feat negotiation timer %p\n", sk); 479 - inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 480 - } 481 - 482 - if (!found) 483 - dccp_pr_debug("%s(%d, ...) 
never requested\n", 484 - dccp_feat_typename(type), feature); 485 102 return 0; 486 103 } 487 104 488 - EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv); 489 - 490 - void dccp_feat_clean(struct dccp_minisock *dmsk) 105 + static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) 491 106 { 492 - struct dccp_opt_pend *opt, *next; 493 - 494 - list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending, 495 - dccpop_node) { 496 - BUG_ON(opt->dccpop_val == NULL); 497 - kfree(opt->dccpop_val); 498 - 499 - if (opt->dccpop_sc != NULL) { 500 - BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); 501 - kfree(opt->dccpop_sc->dccpoc_val); 502 - kfree(opt->dccpop_sc); 503 - } 504 - 505 - kfree(opt); 506 - } 507 - INIT_LIST_HEAD(&dmsk->dccpms_pending); 508 - 509 - list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { 510 - BUG_ON(opt == NULL); 511 - if (opt->dccpop_val != NULL) 512 - kfree(opt->dccpop_val); 513 - kfree(opt); 514 - } 515 - INIT_LIST_HEAD(&dmsk->dccpms_conf); 107 + if (!rx) 108 + dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); 109 + return 0; 516 110 } 517 111 518 - EXPORT_SYMBOL_GPL(dccp_feat_clean); 519 - 520 - /* this is to be called only when a listening sock creates its child. It is 521 - * assumed by the function---the confirm is not duplicated, but rather it is 522 - * "passed on". 112 + /* 113 + * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that 114 + * `rx' holds when the sending peer informs about his partial coverage via a 115 + * ChangeR() option. In the other case, we are the sender and the receiver 116 + * announces its coverage via ChangeL() options. The policy here is to honour 117 + * such communication by enabling the corresponding partial coverage - but only 118 + * if it has not been set manually before; the warning here means that all 119 + * packets will be dropped. 
523 120 */ 524 - int dccp_feat_clone(struct sock *oldsk, struct sock *newsk) 121 + static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) 525 122 { 526 - struct dccp_minisock *olddmsk = dccp_msk(oldsk); 527 - struct dccp_minisock *newdmsk = dccp_msk(newsk); 528 - struct dccp_opt_pend *opt; 529 - int rc = 0; 123 + struct dccp_sock *dp = dccp_sk(sk); 530 124 531 - INIT_LIST_HEAD(&newdmsk->dccpms_pending); 532 - INIT_LIST_HEAD(&newdmsk->dccpms_conf); 533 - 534 - list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) { 535 - struct dccp_opt_pend *newopt; 536 - /* copy the value of the option */ 537 - u8 *val = kmemdup(opt->dccpop_val, opt->dccpop_len, GFP_ATOMIC); 538 - 539 - if (val == NULL) 540 - goto out_clean; 541 - 542 - newopt = kmemdup(opt, sizeof(*newopt), GFP_ATOMIC); 543 - if (newopt == NULL) { 544 - kfree(val); 545 - goto out_clean; 546 - } 547 - 548 - /* insert the option */ 549 - newopt->dccpop_val = val; 550 - list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending); 551 - 552 - /* XXX what happens with backlogs and multiple connections at 553 - * once... 
554 - */ 555 - /* the master socket no longer needs to worry about confirms */ 556 - opt->dccpop_sc = NULL; /* it's not a memleak---new socket has it */ 557 - 558 - /* reset state for a new socket */ 559 - opt->dccpop_conf = 0; 125 + if (rx) 126 + dp->dccps_pcrlen = cscov; 127 + else { 128 + if (dp->dccps_pcslen == 0) 129 + dp->dccps_pcslen = cscov; 130 + else if (cscov > dp->dccps_pcslen) 131 + DCCP_WARN("CsCov %u too small, peer requires >= %u\n", 132 + dp->dccps_pcslen, (u8)cscov); 560 133 } 561 - 562 - /* XXX not doing anything about the conf queue */ 563 - 564 - out: 565 - return rc; 566 - 567 - out_clean: 568 - dccp_feat_clean(newdmsk); 569 - rc = -ENOMEM; 570 - goto out; 134 + return 0; 571 135 } 572 136 573 - EXPORT_SYMBOL_GPL(dccp_feat_clone); 137 + static const struct { 138 + u8 feat_num; /* DCCPF_xxx */ 139 + enum dccp_feat_type rxtx; /* RX or TX */ 140 + enum dccp_feat_type reconciliation; /* SP or NN */ 141 + u8 default_value; /* as in 6.4 */ 142 + int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); 143 + /* 144 + * Lookup table for location and type of features (from RFC 4340/4342) 145 + * +--------------------------+----+-----+----+----+---------+-----------+ 146 + * | Feature | Location | Reconc. 
| Initial | Section | 147 + * | | RX | TX | SP | NN | Value | Reference | 148 + * +--------------------------+----+-----+----+----+---------+-----------+ 149 + * | DCCPF_CCID | | X | X | | 2 | 10 | 150 + * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | 151 + * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | 152 + * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | 153 + * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | 154 + * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | 155 + * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | 156 + * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | 157 + * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | 158 + * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | 159 + * +--------------------------+----+-----+----+----+---------+-----------+ 160 + */ 161 + } dccp_feat_table[] = { 162 + { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, 163 + { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, 164 + { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, 165 + { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, 166 + { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, 167 + { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec }, 168 + { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, 169 + { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, 170 + { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, 171 + { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, 172 + }; 173 + #define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) 574 174 575 - static int __dccp_feat_init(struct dccp_minisock *dmsk, u8 type, u8 feat, 576 - u8 *val, u8 len) 175 + /** 176 + * dccp_feat_index - Hash function to map feature number into array position 177 + * Returns consecutive array index or -1 if the feature is not understood. 
178 + */ 179 + static int dccp_feat_index(u8 feat_num) 577 180 { 578 - int rc = -ENOMEM; 579 - u8 *copy = kmemdup(val, len, GFP_KERNEL); 181 + /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ 182 + if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) 183 + return feat_num - 1; 580 184 581 - if (copy != NULL) { 582 - rc = dccp_feat_change(dmsk, type, feat, copy, len, GFP_KERNEL); 583 - if (rc) 584 - kfree(copy); 185 + /* 186 + * Other features: add cases for new feature types here after adding 187 + * them to the above table. 188 + */ 189 + switch (feat_num) { 190 + case DCCPF_SEND_LEV_RATE: 191 + return DCCP_FEAT_SUPPORTED_MAX - 1; 585 192 } 586 - return rc; 193 + return -1; 587 194 } 588 195 589 - int dccp_feat_init(struct dccp_minisock *dmsk) 196 + static u8 dccp_feat_type(u8 feat_num) 590 197 { 591 - int rc; 198 + int idx = dccp_feat_index(feat_num); 592 199 593 - INIT_LIST_HEAD(&dmsk->dccpms_pending); 594 - INIT_LIST_HEAD(&dmsk->dccpms_conf); 595 - 596 - /* CCID L */ 597 - rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_CCID, 598 - &dmsk->dccpms_tx_ccid, 1); 599 - if (rc) 600 - goto out; 601 - 602 - /* CCID R */ 603 - rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_R, DCCPF_CCID, 604 - &dmsk->dccpms_rx_ccid, 1); 605 - if (rc) 606 - goto out; 607 - 608 - /* Ack ratio */ 609 - rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO, 610 - &dmsk->dccpms_ack_ratio, 1); 611 - out: 612 - return rc; 200 + if (idx < 0) 201 + return FEAT_UNKNOWN; 202 + return dccp_feat_table[idx].reconciliation; 613 203 } 614 204 615 - EXPORT_SYMBOL_GPL(dccp_feat_init); 616 - 617 - #ifdef CONFIG_IP_DCCP_DEBUG 618 - const char *dccp_feat_typename(const u8 type) 205 + static int dccp_feat_default_value(u8 feat_num) 619 206 { 620 - switch(type) { 621 - case DCCPO_CHANGE_L: return("ChangeL"); 622 - case DCCPO_CONFIRM_L: return("ConfirmL"); 623 - case DCCPO_CHANGE_R: return("ChangeR"); 624 - case DCCPO_CONFIRM_R: return("ConfirmR"); 625 - /* the following 
case must not appear in feature negotation */ 626 - default: dccp_pr_debug("unknown type %d [BUG!]\n", type); 627 - } 628 - return NULL; 207 + int idx = dccp_feat_index(feat_num); 208 + 209 + return idx < 0 ? : dccp_feat_table[idx].default_value; 629 210 } 630 211 631 - EXPORT_SYMBOL_GPL(dccp_feat_typename); 632 - 633 - const char *dccp_feat_name(const u8 feat) 212 + /* 213 + * Debugging and verbose-printing section 214 + */ 215 + static const char *dccp_feat_fname(const u8 feat) 634 216 { 635 217 static const char *feature_names[] = { 636 218 [DCCPF_RESERVED] = "Reserved", ··· 237 639 if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) 238 640 return feature_names[DCCPF_RESERVED]; 239 641 642 + if (feat == DCCPF_SEND_LEV_RATE) 643 + return "Send Loss Event Rate"; 240 644 if (feat >= DCCPF_MIN_CCID_SPECIFIC) 241 645 return "CCID-specific"; 242 646 243 647 return feature_names[feat]; 244 648 } 245 649 246 - EXPORT_SYMBOL_GPL(dccp_feat_name); 247 - #endif /* CONFIG_IP_DCCP_DEBUG */ 650 + static const char *dccp_feat_sname[] = { "DEFAULT", "INITIALISING", "CHANGING", 651 + "UNSTABLE", "STABLE" }; 652 + 653 + #ifdef CONFIG_IP_DCCP_DEBUG 654 + static const char *dccp_feat_oname(const u8 opt) 655 + { 656 + switch (opt) { 657 + case DCCPO_CHANGE_L: return "Change_L"; 658 + case DCCPO_CONFIRM_L: return "Confirm_L"; 659 + case DCCPO_CHANGE_R: return "Change_R"; 660 + case DCCPO_CONFIRM_R: return "Confirm_R"; 661 + } 662 + return NULL; 663 + } 664 + 665 + static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) 666 + { 667 + u8 i, type = dccp_feat_type(feat_num); 668 + 669 + if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) 670 + dccp_pr_debug_cat("(NULL)"); 671 + else if (type == FEAT_SP) 672 + for (i = 0; i < val->sp.len; i++) 673 + dccp_pr_debug_cat("%s%u", i ? 
" " : "", val->sp.vec[i]); 674 + else if (type == FEAT_NN) 675 + dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); 676 + else 677 + dccp_pr_debug_cat("unknown type %u", type); 678 + } 679 + 680 + static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) 681 + { 682 + u8 type = dccp_feat_type(feat_num); 683 + dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; 684 + 685 + if (type == FEAT_NN) 686 + fval.nn = dccp_decode_value_var(list, len); 687 + dccp_feat_printval(feat_num, &fval); 688 + } 689 + 690 + static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) 691 + { 692 + dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", 693 + dccp_feat_fname(entry->feat_num)); 694 + dccp_feat_printval(entry->feat_num, &entry->val); 695 + dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], 696 + entry->needs_confirm ? "(Confirm pending)" : ""); 697 + } 698 + 699 + #define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ 700 + dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ 701 + dccp_feat_printvals(feat, val, len); \ 702 + dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) 703 + 704 + #define dccp_feat_print_fnlist(fn_list) { \ 705 + const struct dccp_feat_entry *___entry; \ 706 + \ 707 + dccp_pr_debug("List Dump:\n"); \ 708 + list_for_each_entry(___entry, fn_list, node) \ 709 + dccp_feat_print_entry(___entry); \ 710 + } 711 + #else /* ! 
CONFIG_IP_DCCP_DEBUG */ 712 + #define dccp_feat_print_opt(opt, feat, val, len, mandatory) 713 + #define dccp_feat_print_fnlist(fn_list) 714 + #endif 715 + 716 + static int __dccp_feat_activate(struct sock *sk, const int idx, 717 + const bool is_local, dccp_feat_val const *fval) 718 + { 719 + bool rx; 720 + u64 val; 721 + 722 + if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) 723 + return -1; 724 + if (dccp_feat_table[idx].activation_hdlr == NULL) 725 + return 0; 726 + 727 + if (fval == NULL) { 728 + val = dccp_feat_table[idx].default_value; 729 + } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { 730 + if (fval->sp.vec == NULL) { 731 + /* 732 + * This can happen when an empty Confirm is sent 733 + * for an SP (i.e. known) feature. In this case 734 + * we would be using the default anyway. 735 + */ 736 + DCCP_CRIT("Feature #%d undefined: using default", idx); 737 + val = dccp_feat_table[idx].default_value; 738 + } else { 739 + val = fval->sp.vec[0]; 740 + } 741 + } else { 742 + val = fval->nn; 743 + } 744 + 745 + /* Location is RX if this is a local-RX or remote-TX feature */ 746 + rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); 747 + 748 + dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", 749 + dccp_feat_fname(dccp_feat_table[idx].feat_num), 750 + fval ? "" : "default ", (unsigned long long)val); 751 + 752 + return dccp_feat_table[idx].activation_hdlr(sk, val, rx); 753 + } 754 + 755 + /** 756 + * dccp_feat_activate - Activate feature value on socket 757 + * @sk: fully connected DCCP socket (after handshake is complete) 758 + * @feat_num: feature to activate, one of %dccp_feature_numbers 759 + * @local: whether local (1) or remote (0) @feat_num is meant 760 + * @fval: the value (SP or NN) to activate, or NULL to use the default value 761 + * For general use this function is preferable over __dccp_feat_activate(). 
762 + */ 763 + static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local, 764 + dccp_feat_val const *fval) 765 + { 766 + return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval); 767 + } 768 + 769 + /* Test for "Req'd" feature (RFC 4340, 6.4) */ 770 + static inline int dccp_feat_must_be_understood(u8 feat_num) 771 + { 772 + return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || 773 + feat_num == DCCPF_SEQUENCE_WINDOW; 774 + } 775 + 776 + /* copy constructor, fval must not already contain allocated memory */ 777 + static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) 778 + { 779 + fval->sp.len = len; 780 + if (fval->sp.len > 0) { 781 + fval->sp.vec = kmemdup(val, len, gfp_any()); 782 + if (fval->sp.vec == NULL) { 783 + fval->sp.len = 0; 784 + return -ENOBUFS; 785 + } 786 + } 787 + return 0; 788 + } 789 + 790 + static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) 791 + { 792 + if (unlikely(val == NULL)) 793 + return; 794 + if (dccp_feat_type(feat_num) == FEAT_SP) 795 + kfree(val->sp.vec); 796 + memset(val, 0, sizeof(*val)); 797 + } 798 + 799 + static struct dccp_feat_entry * 800 + dccp_feat_clone_entry(struct dccp_feat_entry const *original) 801 + { 802 + struct dccp_feat_entry *new; 803 + u8 type = dccp_feat_type(original->feat_num); 804 + 805 + if (type == FEAT_UNKNOWN) 806 + return NULL; 807 + 808 + new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); 809 + if (new == NULL) 810 + return NULL; 811 + 812 + if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, 813 + original->val.sp.vec, 814 + original->val.sp.len)) { 815 + kfree(new); 816 + return NULL; 817 + } 818 + return new; 819 + } 820 + 821 + static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) 822 + { 823 + if (entry != NULL) { 824 + dccp_feat_val_destructor(entry->feat_num, &entry->val); 825 + kfree(entry); 826 + } 827 + } 828 + 829 + /* 830 + * List management functions 831 + * 832 + * Feature 
negotiation lists rely on and maintain the following invariants: 833 + * - each feat_num in the list is known, i.e. we know its type and default value 834 + * - each feat_num/is_local combination is unique (old entries are overwritten) 835 + * - SP values are always freshly allocated 836 + * - list is sorted in increasing order of feature number (faster lookup) 837 + */ 838 + static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, 839 + u8 feat_num, bool is_local) 840 + { 841 + struct dccp_feat_entry *entry; 842 + 843 + list_for_each_entry(entry, fn_list, node) 844 + if (entry->feat_num == feat_num && entry->is_local == is_local) 845 + return entry; 846 + else if (entry->feat_num > feat_num) 847 + break; 848 + return NULL; 849 + } 850 + 851 + /** 852 + * dccp_feat_entry_new - Central list update routine (called by all others) 853 + * @head: list to add to 854 + * @feat: feature number 855 + * @local: whether the local (1) or remote feature with number @feat is meant 856 + * This is the only constructor and serves to ensure the above invariants. 
857 + */ 858 + static struct dccp_feat_entry * 859 + dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) 860 + { 861 + struct dccp_feat_entry *entry; 862 + 863 + list_for_each_entry(entry, head, node) 864 + if (entry->feat_num == feat && entry->is_local == local) { 865 + dccp_feat_val_destructor(entry->feat_num, &entry->val); 866 + return entry; 867 + } else if (entry->feat_num > feat) { 868 + head = &entry->node; 869 + break; 870 + } 871 + 872 + entry = kmalloc(sizeof(*entry), gfp_any()); 873 + if (entry != NULL) { 874 + entry->feat_num = feat; 875 + entry->is_local = local; 876 + list_add_tail(&entry->node, head); 877 + } 878 + return entry; 879 + } 880 + 881 + /** 882 + * dccp_feat_push_change - Add/overwrite a Change option in the list 883 + * @fn_list: feature-negotiation list to update 884 + * @feat: one of %dccp_feature_numbers 885 + * @local: whether local (1) or remote (0) @feat_num is meant 886 + * @needs_mandatory: whether to use Mandatory feature negotiation options 887 + * @fval: pointer to NN/SP value to be inserted (will be copied) 888 + */ 889 + static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, 890 + u8 mandatory, dccp_feat_val *fval) 891 + { 892 + struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); 893 + 894 + if (new == NULL) 895 + return -ENOMEM; 896 + 897 + new->feat_num = feat; 898 + new->is_local = local; 899 + new->state = FEAT_INITIALISING; 900 + new->needs_confirm = 0; 901 + new->empty_confirm = 0; 902 + new->val = *fval; 903 + new->needs_mandatory = mandatory; 904 + 905 + return 0; 906 + } 907 + 908 + /** 909 + * dccp_feat_push_confirm - Add a Confirm entry to the FN list 910 + * @fn_list: feature-negotiation list to add to 911 + * @feat: one of %dccp_feature_numbers 912 + * @local: whether local (1) or remote (0) @feat_num is being confirmed 913 + * @fval: pointer to NN/SP value to be inserted or NULL 914 + * Returns 0 on success, a Reset code for further processing otherwise. 
915 + */ 916 + static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, 917 + dccp_feat_val *fval) 918 + { 919 + struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); 920 + 921 + if (new == NULL) 922 + return DCCP_RESET_CODE_TOO_BUSY; 923 + 924 + new->feat_num = feat; 925 + new->is_local = local; 926 + new->state = FEAT_STABLE; /* transition in 6.6.2 */ 927 + new->needs_confirm = 1; 928 + new->empty_confirm = (fval == NULL); 929 + new->val.nn = 0; /* zeroes the whole structure */ 930 + if (!new->empty_confirm) 931 + new->val = *fval; 932 + new->needs_mandatory = 0; 933 + 934 + return 0; 935 + } 936 + 937 + static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) 938 + { 939 + return dccp_feat_push_confirm(fn_list, feat, local, NULL); 940 + } 941 + 942 + static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) 943 + { 944 + list_del(&entry->node); 945 + dccp_feat_entry_destructor(entry); 946 + } 947 + 948 + void dccp_feat_list_purge(struct list_head *fn_list) 949 + { 950 + struct dccp_feat_entry *entry, *next; 951 + 952 + list_for_each_entry_safe(entry, next, fn_list, node) 953 + dccp_feat_entry_destructor(entry); 954 + INIT_LIST_HEAD(fn_list); 955 + } 956 + EXPORT_SYMBOL_GPL(dccp_feat_list_purge); 957 + 958 + /* generate @to as full clone of @from - @to must not contain any nodes */ 959 + int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) 960 + { 961 + struct dccp_feat_entry *entry, *new; 962 + 963 + INIT_LIST_HEAD(to); 964 + list_for_each_entry(entry, from, node) { 965 + new = dccp_feat_clone_entry(entry); 966 + if (new == NULL) 967 + goto cloning_failed; 968 + list_add_tail(&new->node, to); 969 + } 970 + return 0; 971 + 972 + cloning_failed: 973 + dccp_feat_list_purge(to); 974 + return -ENOMEM; 975 + } 976 + 977 + /** 978 + * dccp_feat_valid_nn_length - Enforce length constraints on NN options 979 + * Length is between 0 and %DCCP_OPTVAL_MAXLEN. 
Used for outgoing packets only, 980 + * incoming options are accepted as long as their values are valid. 981 + */ 982 + static u8 dccp_feat_valid_nn_length(u8 feat_num) 983 + { 984 + if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ 985 + return 2; 986 + if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ 987 + return 6; 988 + return 0; 989 + } 990 + 991 + static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) 992 + { 993 + switch (feat_num) { 994 + case DCCPF_ACK_RATIO: 995 + return val <= DCCPF_ACK_RATIO_MAX; 996 + case DCCPF_SEQUENCE_WINDOW: 997 + return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; 998 + } 999 + return 0; /* feature unknown - so we can't tell */ 1000 + } 1001 + 1002 + /* check that SP values are within the ranges defined in RFC 4340 */ 1003 + static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) 1004 + { 1005 + switch (feat_num) { 1006 + case DCCPF_CCID: 1007 + return val == DCCPC_CCID2 || val == DCCPC_CCID3; 1008 + /* Type-check Boolean feature values: */ 1009 + case DCCPF_SHORT_SEQNOS: 1010 + case DCCPF_ECN_INCAPABLE: 1011 + case DCCPF_SEND_ACK_VECTOR: 1012 + case DCCPF_SEND_NDP_COUNT: 1013 + case DCCPF_DATA_CHECKSUM: 1014 + case DCCPF_SEND_LEV_RATE: 1015 + return val < 2; 1016 + case DCCPF_MIN_CSUM_COVER: 1017 + return val < 16; 1018 + } 1019 + return 0; /* feature unknown */ 1020 + } 1021 + 1022 + static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) 1023 + { 1024 + if (sp_list == NULL || sp_len < 1) 1025 + return 0; 1026 + while (sp_len--) 1027 + if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) 1028 + return 0; 1029 + return 1; 1030 + } 1031 + 1032 + /** 1033 + * dccp_feat_insert_opts - Generate FN options from current list state 1034 + * @skb: next sk_buff to be sent to the peer 1035 + * @dp: for client during handshake and general negotiation 1036 + * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) 1037 + */ 1038 + int dccp_feat_insert_opts(struct 
dccp_sock *dp, struct dccp_request_sock *dreq, 1039 + struct sk_buff *skb) 1040 + { 1041 + struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; 1042 + struct dccp_feat_entry *pos, *next; 1043 + u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; 1044 + bool rpt; 1045 + 1046 + /* put entries into @skb in the order they appear in the list */ 1047 + list_for_each_entry_safe_reverse(pos, next, fn, node) { 1048 + opt = dccp_feat_genopt(pos); 1049 + type = dccp_feat_type(pos->feat_num); 1050 + rpt = false; 1051 + 1052 + if (pos->empty_confirm) { 1053 + len = 0; 1054 + ptr = NULL; 1055 + } else { 1056 + if (type == FEAT_SP) { 1057 + len = pos->val.sp.len; 1058 + ptr = pos->val.sp.vec; 1059 + rpt = pos->needs_confirm; 1060 + } else if (type == FEAT_NN) { 1061 + len = dccp_feat_valid_nn_length(pos->feat_num); 1062 + ptr = nn_in_nbo; 1063 + dccp_encode_value_var(pos->val.nn, ptr, len); 1064 + } else { 1065 + DCCP_BUG("unknown feature %u", pos->feat_num); 1066 + return -1; 1067 + } 1068 + } 1069 + dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); 1070 + 1071 + if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) 1072 + return -1; 1073 + if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) 1074 + return -1; 1075 + /* 1076 + * Enter CHANGING after transmitting the Change option (6.6.2). 1077 + */ 1078 + if (pos->state == FEAT_INITIALISING) 1079 + pos->state = FEAT_CHANGING; 1080 + } 1081 + return 0; 1082 + } 1083 + 1084 + /** 1085 + * __feat_register_nn - Register new NN value on socket 1086 + * @fn: feature-negotiation list to register with 1087 + * @feat: an NN feature from %dccp_feature_numbers 1088 + * @mandatory: use Mandatory option if 1 1089 + * @nn_val: value to register (restricted to 4 bytes) 1090 + * Note that NN features are local by definition (RFC 4340, 6.3.2). 
1091 + */ 1092 + static int __feat_register_nn(struct list_head *fn, u8 feat, 1093 + u8 mandatory, u64 nn_val) 1094 + { 1095 + dccp_feat_val fval = { .nn = nn_val }; 1096 + 1097 + if (dccp_feat_type(feat) != FEAT_NN || 1098 + !dccp_feat_is_valid_nn_val(feat, nn_val)) 1099 + return -EINVAL; 1100 + 1101 + /* Don't bother with default values, they will be activated anyway. */ 1102 + if (nn_val - (u64)dccp_feat_default_value(feat) == 0) 1103 + return 0; 1104 + 1105 + return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); 1106 + } 1107 + 1108 + /** 1109 + * __feat_register_sp - Register new SP value/list on socket 1110 + * @fn: feature-negotiation list to register with 1111 + * @feat: an SP feature from %dccp_feature_numbers 1112 + * @is_local: whether the local (1) or the remote (0) @feat is meant 1113 + * @mandatory: use Mandatory option if 1 1114 + * @sp_val: SP value followed by optional preference list 1115 + * @sp_len: length of @sp_val in bytes 1116 + */ 1117 + static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, 1118 + u8 mandatory, u8 const *sp_val, u8 sp_len) 1119 + { 1120 + dccp_feat_val fval; 1121 + 1122 + if (dccp_feat_type(feat) != FEAT_SP || 1123 + !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) 1124 + return -EINVAL; 1125 + 1126 + /* Avoid negotiating alien CCIDs by only advertising supported ones */ 1127 + if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) 1128 + return -EOPNOTSUPP; 1129 + 1130 + if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) 1131 + return -ENOMEM; 1132 + 1133 + return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval); 1134 + } 1135 + 1136 + /** 1137 + * dccp_feat_register_sp - Register requests to change SP feature values 1138 + * @sk: client or listening socket 1139 + * @feat: one of %dccp_feature_numbers 1140 + * @is_local: whether the local (1) or remote (0) @feat is meant 1141 + * @list: array of preferred values, in descending order of preference 1142 + * @len: length of @list in 
bytes 1143 + */ 1144 + int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, 1145 + u8 const *list, u8 len) 1146 + { /* any changes must be registered before establishing the connection */ 1147 + if (sk->sk_state != DCCP_CLOSED) 1148 + return -EISCONN; 1149 + if (dccp_feat_type(feat) != FEAT_SP) 1150 + return -EINVAL; 1151 + return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, 1152 + 0, list, len); 1153 + } 1154 + 1155 + /* Analogous to dccp_feat_register_sp(), but for non-negotiable values */ 1156 + int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val) 1157 + { 1158 + /* any changes must be registered before establishing the connection */ 1159 + if (sk->sk_state != DCCP_CLOSED) 1160 + return -EISCONN; 1161 + if (dccp_feat_type(feat) != FEAT_NN) 1162 + return -EINVAL; 1163 + return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val); 1164 + } 1165 + 1166 + /** 1167 + * dccp_feat_signal_nn_change - Update NN values for an established connection 1168 + * @sk: DCCP socket of an established connection 1169 + * @feat: NN feature number from %dccp_feature_numbers 1170 + * @nn_val: the new value to use 1171 + * This function is used to communicate NN updates out-of-band. The difference 1172 + * to feature negotiation during connection setup is that values are activated 1173 + * immediately after validation, i.e. we don't wait for the Confirm: either the 1174 + * value is accepted by the peer (and then the waiting is futile), or it is not 1175 + * (Reset or empty Confirm). We don't accept empty Confirms - transmitted values 1176 + * are validated, and the peer "MUST accept any valid value" (RFC 4340, 6.3.2). 
1177 + */ 1178 + int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val) 1179 + { 1180 + struct list_head *fn = &dccp_sk(sk)->dccps_featneg; 1181 + dccp_feat_val fval = { .nn = nn_val }; 1182 + struct dccp_feat_entry *entry; 1183 + 1184 + if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN) 1185 + return 0; 1186 + 1187 + if (dccp_feat_type(feat) != FEAT_NN || 1188 + !dccp_feat_is_valid_nn_val(feat, nn_val)) 1189 + return -EINVAL; 1190 + 1191 + entry = dccp_feat_list_lookup(fn, feat, 1); 1192 + if (entry != NULL) { 1193 + dccp_pr_debug("Ignoring %llu, entry %llu exists in state %s\n", 1194 + (unsigned long long)nn_val, 1195 + (unsigned long long)entry->val.nn, 1196 + dccp_feat_sname[entry->state]); 1197 + return 0; 1198 + } 1199 + 1200 + if (dccp_feat_activate(sk, feat, 1, &fval)) 1201 + return -EADV; 1202 + 1203 + inet_csk_schedule_ack(sk); 1204 + return dccp_feat_push_change(fn, feat, 1, 0, &fval); 1205 + } 1206 + EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change); 1207 + 1208 + /* 1209 + * Tracking features whose value depend on the choice of CCID 1210 + * 1211 + * This is designed with an extension in mind so that a list walk could be done 1212 + * before activating any features. However, the existing framework was found to 1213 + * work satisfactorily up until now, the automatic verification is left open. 1214 + * When adding new CCIDs, add a corresponding dependency table here. 1215 + */ 1216 + static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) 1217 + { 1218 + static const struct ccid_dependency ccid2_dependencies[2][2] = { 1219 + /* 1220 + * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX 1221 + * feature and Send Ack Vector is an RX feature, `is_local' 1222 + * needs to be reversed. 
1223 + */ 1224 + { /* Dependencies of the receiver-side (remote) CCID2 */ 1225 + { 1226 + .dependent_feat = DCCPF_SEND_ACK_VECTOR, 1227 + .is_local = true, 1228 + .is_mandatory = true, 1229 + .val = 1 1230 + }, 1231 + { 0, 0, 0, 0 } 1232 + }, 1233 + { /* Dependencies of the sender-side (local) CCID2 */ 1234 + { 1235 + .dependent_feat = DCCPF_SEND_ACK_VECTOR, 1236 + .is_local = false, 1237 + .is_mandatory = true, 1238 + .val = 1 1239 + }, 1240 + { 0, 0, 0, 0 } 1241 + } 1242 + }; 1243 + static const struct ccid_dependency ccid3_dependencies[2][5] = { 1244 + { /* 1245 + * Dependencies of the receiver-side CCID3 1246 + */ 1247 + { /* locally disable Ack Vectors */ 1248 + .dependent_feat = DCCPF_SEND_ACK_VECTOR, 1249 + .is_local = true, 1250 + .is_mandatory = false, 1251 + .val = 0 1252 + }, 1253 + { /* see below why Send Loss Event Rate is on */ 1254 + .dependent_feat = DCCPF_SEND_LEV_RATE, 1255 + .is_local = true, 1256 + .is_mandatory = true, 1257 + .val = 1 1258 + }, 1259 + { /* NDP Count is needed as per RFC 4342, 6.1.1 */ 1260 + .dependent_feat = DCCPF_SEND_NDP_COUNT, 1261 + .is_local = false, 1262 + .is_mandatory = true, 1263 + .val = 1 1264 + }, 1265 + { 0, 0, 0, 0 }, 1266 + }, 1267 + { /* 1268 + * CCID3 at the TX side: we request that the HC-receiver 1269 + * will not send Ack Vectors (they will be ignored, so 1270 + * Mandatory is not set); we enable Send Loss Event Rate 1271 + * (Mandatory since the implementation does not support 1272 + * the Loss Intervals option of RFC 4342, 8.6). 1273 + * The last two options are for peer's information only. 
1274 + */ 1275 + { 1276 + .dependent_feat = DCCPF_SEND_ACK_VECTOR, 1277 + .is_local = false, 1278 + .is_mandatory = false, 1279 + .val = 0 1280 + }, 1281 + { 1282 + .dependent_feat = DCCPF_SEND_LEV_RATE, 1283 + .is_local = false, 1284 + .is_mandatory = true, 1285 + .val = 1 1286 + }, 1287 + { /* this CCID does not support Ack Ratio */ 1288 + .dependent_feat = DCCPF_ACK_RATIO, 1289 + .is_local = true, 1290 + .is_mandatory = false, 1291 + .val = 0 1292 + }, 1293 + { /* tell receiver we are sending NDP counts */ 1294 + .dependent_feat = DCCPF_SEND_NDP_COUNT, 1295 + .is_local = true, 1296 + .is_mandatory = false, 1297 + .val = 1 1298 + }, 1299 + { 0, 0, 0, 0 } 1300 + } 1301 + }; 1302 + switch (ccid) { 1303 + case DCCPC_CCID2: 1304 + return ccid2_dependencies[is_local]; 1305 + case DCCPC_CCID3: 1306 + return ccid3_dependencies[is_local]; 1307 + default: 1308 + return NULL; 1309 + } 1310 + } 1311 + 1312 + /** 1313 + * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID 1314 + * @fn: feature-negotiation list to update 1315 + * @id: CCID number to track 1316 + * @is_local: whether TX CCID (1) or RX CCID (0) is meant 1317 + * This function needs to be called after registering all other features. 
1318 + */ 1319 + static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) 1320 + { 1321 + const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); 1322 + int i, rc = (table == NULL); 1323 + 1324 + for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) 1325 + if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) 1326 + rc = __feat_register_sp(fn, table[i].dependent_feat, 1327 + table[i].is_local, 1328 + table[i].is_mandatory, 1329 + &table[i].val, 1); 1330 + else 1331 + rc = __feat_register_nn(fn, table[i].dependent_feat, 1332 + table[i].is_mandatory, 1333 + table[i].val); 1334 + return rc; 1335 + } 1336 + 1337 + /** 1338 + * dccp_feat_finalise_settings - Finalise settings before starting negotiation 1339 + * @dp: client or listening socket (settings will be inherited) 1340 + * This is called after all registrations (socket initialisation, sysctls, and 1341 + * sockopt calls), and before sending the first packet containing Change options 1342 + * (ie. client-Request or server-Response), to ensure internal consistency. 1343 + */ 1344 + int dccp_feat_finalise_settings(struct dccp_sock *dp) 1345 + { 1346 + struct list_head *fn = &dp->dccps_featneg; 1347 + struct dccp_feat_entry *entry; 1348 + int i = 2, ccids[2] = { -1, -1 }; 1349 + 1350 + /* 1351 + * Propagating CCIDs: 1352 + * 1) not useful to propagate CCID settings if this host advertises more 1353 + * than one CCID: the choice of CCID may still change - if this is 1354 + * the client, or if this is the server and the client sends 1355 + * singleton CCID values. 1356 + * 2) since is that propagate_ccid changes the list, we defer changing 1357 + * the sorted list until after the traversal. 
1358 + */ 1359 + list_for_each_entry(entry, fn, node) 1360 + if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) 1361 + ccids[entry->is_local] = entry->val.sp.vec[0]; 1362 + while (i--) 1363 + if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) 1364 + return -1; 1365 + dccp_feat_print_fnlist(fn); 1366 + return 0; 1367 + } 1368 + 1369 + /** 1370 + * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features 1371 + * It is the server which resolves the dependencies once the CCID has been 1372 + * fully negotiated. If no CCID has been negotiated, it uses the default CCID. 1373 + */ 1374 + int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) 1375 + { 1376 + struct list_head *fn = &dreq->dreq_featneg; 1377 + struct dccp_feat_entry *entry; 1378 + u8 is_local, ccid; 1379 + 1380 + for (is_local = 0; is_local <= 1; is_local++) { 1381 + entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); 1382 + 1383 + if (entry != NULL && !entry->empty_confirm) 1384 + ccid = entry->val.sp.vec[0]; 1385 + else 1386 + ccid = dccp_feat_default_value(DCCPF_CCID); 1387 + 1388 + if (dccp_feat_propagate_ccid(fn, ccid, is_local)) 1389 + return -1; 1390 + } 1391 + return 0; 1392 + } 1393 + 1394 + /* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ 1395 + static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) 1396 + { 1397 + u8 c, s; 1398 + 1399 + for (s = 0; s < slen; s++) 1400 + for (c = 0; c < clen; c++) 1401 + if (servlist[s] == clilist[c]) 1402 + return servlist[s]; 1403 + return -1; 1404 + } 1405 + 1406 + /** 1407 + * dccp_feat_prefer - Move preferred entry to the start of array 1408 + * Reorder the @array_len elements in @array so that @preferred_value comes 1409 + * first. Returns >0 to indicate that @preferred_value does occur in @array. 
1410 + */ 1411 + static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) 1412 + { 1413 + u8 i, does_occur = 0; 1414 + 1415 + if (array != NULL) { 1416 + for (i = 0; i < array_len; i++) 1417 + if (array[i] == preferred_value) { 1418 + array[i] = array[0]; 1419 + does_occur++; 1420 + } 1421 + if (does_occur) 1422 + array[0] = preferred_value; 1423 + } 1424 + return does_occur; 1425 + } 1426 + 1427 + /** 1428 + * dccp_feat_reconcile - Reconcile SP preference lists 1429 + * @fval: SP list to reconcile into 1430 + * @arr: received SP preference list 1431 + * @len: length of @arr in bytes 1432 + * @is_server: whether this side is the server (and @fv is the server's list) 1433 + * @reorder: whether to reorder the list in @fv after reconciling with @arr 1434 + * When successful, > 0 is returned and the reconciled list is in @fval. 1435 + * A value of 0 means that negotiation failed (no shared entry). 1436 + */ 1437 + static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, 1438 + bool is_server, bool reorder) 1439 + { 1440 + int rc; 1441 + 1442 + if (!fv->sp.vec || !arr) { 1443 + DCCP_CRIT("NULL feature value or array"); 1444 + return 0; 1445 + } 1446 + 1447 + if (is_server) 1448 + rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); 1449 + else 1450 + rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); 1451 + 1452 + if (!reorder) 1453 + return rc; 1454 + if (rc < 0) 1455 + return 0; 1456 + 1457 + /* 1458 + * Reorder list: used for activating features and in dccp_insert_fn_opt. 
1459 + */ 1460 + return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); 1461 + } 1462 + 1463 + /** 1464 + * dccp_feat_change_recv - Process incoming ChangeL/R options 1465 + * @fn: feature-negotiation list to update 1466 + * @is_mandatory: whether the Change was preceded by a Mandatory option 1467 + * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R 1468 + * @feat: one of %dccp_feature_numbers 1469 + * @val: NN value or SP value/preference list 1470 + * @len: length of @val in bytes 1471 + * @server: whether this node is the server (1) or the client (0) 1472 + */ 1473 + static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, 1474 + u8 feat, u8 *val, u8 len, const bool server) 1475 + { 1476 + u8 defval, type = dccp_feat_type(feat); 1477 + const bool local = (opt == DCCPO_CHANGE_R); 1478 + struct dccp_feat_entry *entry; 1479 + dccp_feat_val fval; 1480 + 1481 + if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ 1482 + goto unknown_feature_or_value; 1483 + 1484 + dccp_feat_print_opt(opt, feat, val, len, is_mandatory); 1485 + 1486 + /* 1487 + * Negotiation of NN features: Change R is invalid, so there is no 1488 + * simultaneous negotiation; hence we do not look up in the list. 1489 + */ 1490 + if (type == FEAT_NN) { 1491 + if (local || len > sizeof(fval.nn)) 1492 + goto unknown_feature_or_value; 1493 + 1494 + /* 6.3.2: "The feature remote MUST accept any valid value..." */ 1495 + fval.nn = dccp_decode_value_var(val, len); 1496 + if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) 1497 + goto unknown_feature_or_value; 1498 + 1499 + return dccp_feat_push_confirm(fn, feat, local, &fval); 1500 + } 1501 + 1502 + /* 1503 + * Unidirectional/simultaneous negotiation of SP features (6.3.1) 1504 + */ 1505 + entry = dccp_feat_list_lookup(fn, feat, local); 1506 + if (entry == NULL) { 1507 + /* 1508 + * No particular preferences have been registered. 
We deal with 1509 + * this situation by assuming that all valid values are equally 1510 + * acceptable, and apply the following checks: 1511 + * - if the peer's list is a singleton, we accept a valid value; 1512 + * - if we are the server, we first try to see if the peer (the 1513 + * client) advertises the default value. If yes, we use it, 1514 + * otherwise we accept the preferred value; 1515 + * - else if we are the client, we use the first list element. 1516 + */ 1517 + if (dccp_feat_clone_sp_val(&fval, val, 1)) 1518 + return DCCP_RESET_CODE_TOO_BUSY; 1519 + 1520 + if (len > 1 && server) { 1521 + defval = dccp_feat_default_value(feat); 1522 + if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) 1523 + fval.sp.vec[0] = defval; 1524 + } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { 1525 + kfree(fval.sp.vec); 1526 + goto unknown_feature_or_value; 1527 + } 1528 + 1529 + /* Treat unsupported CCIDs like invalid values */ 1530 + if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { 1531 + kfree(fval.sp.vec); 1532 + goto not_valid_or_not_known; 1533 + } 1534 + 1535 + return dccp_feat_push_confirm(fn, feat, local, &fval); 1536 + 1537 + } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ 1538 + return 0; 1539 + } 1540 + 1541 + if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { 1542 + entry->empty_confirm = 0; 1543 + } else if (is_mandatory) { 1544 + return DCCP_RESET_CODE_MANDATORY_ERROR; 1545 + } else if (entry->state == FEAT_INITIALISING) { 1546 + /* 1547 + * Failed simultaneous negotiation (server only): try to `save' 1548 + * the connection by checking whether entry contains the default 1549 + * value for @feat. If yes, send an empty Confirm to signal that 1550 + * the received Change was not understood - which implies using 1551 + * the default value. 1552 + * If this also fails, we use Reset as the last resort. 
1553 + */ 1554 + WARN_ON(!server); 1555 + defval = dccp_feat_default_value(feat); 1556 + if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) 1557 + return DCCP_RESET_CODE_OPTION_ERROR; 1558 + entry->empty_confirm = 1; 1559 + } 1560 + entry->needs_confirm = 1; 1561 + entry->needs_mandatory = 0; 1562 + entry->state = FEAT_STABLE; 1563 + return 0; 1564 + 1565 + unknown_feature_or_value: 1566 + if (!is_mandatory) 1567 + return dccp_push_empty_confirm(fn, feat, local); 1568 + 1569 + not_valid_or_not_known: 1570 + return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR 1571 + : DCCP_RESET_CODE_OPTION_ERROR; 1572 + } 1573 + 1574 + /** 1575 + * dccp_feat_confirm_recv - Process received Confirm options 1576 + * @fn: feature-negotiation list to update 1577 + * @is_mandatory: whether @opt was preceded by a Mandatory option 1578 + * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R 1579 + * @feat: one of %dccp_feature_numbers 1580 + * @val: NN value or SP value/preference list 1581 + * @len: length of @val in bytes 1582 + * @server: whether this node is server (1) or client (0) 1583 + */ 1584 + static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, 1585 + u8 feat, u8 *val, u8 len, const bool server) 1586 + { 1587 + u8 *plist, plen, type = dccp_feat_type(feat); 1588 + const bool local = (opt == DCCPO_CONFIRM_R); 1589 + struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); 1590 + 1591 + dccp_feat_print_opt(opt, feat, val, len, is_mandatory); 1592 + 1593 + if (entry == NULL) { /* nothing queued: ignore or handle error */ 1594 + if (is_mandatory && type == FEAT_UNKNOWN) 1595 + return DCCP_RESET_CODE_MANDATORY_ERROR; 1596 + 1597 + if (!local && type == FEAT_NN) /* 6.3.2 */ 1598 + goto confirmation_failed; 1599 + return 0; 1600 + } 1601 + 1602 + if (entry->state != FEAT_CHANGING) /* 6.6.2 */ 1603 + return 0; 1604 + 1605 + if (len == 0) { 1606 + if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ 1607 + goto confirmation_failed; 1608 + 
 /* 1609 + * Empty Confirm during connection setup: this means reverting 1610 + * to the `old' value, which in this case is the default. Since 1611 + * we handle default values automatically when no other values 1612 + * have been set, we revert to the old value by removing this 1613 + * entry from the list. 1614 + */ 1615 + dccp_feat_list_pop(entry); 1616 + return 0; 1617 + } 1618 + 1619 + if (type == FEAT_NN) { 1620 + if (len > sizeof(entry->val.nn)) 1621 + goto confirmation_failed; 1622 + 1623 + if (entry->val.nn == dccp_decode_value_var(val, len)) 1624 + goto confirmation_succeeded; 1625 + 1626 + DCCP_WARN("Bogus Confirm for non-existing value\n"); 1627 + goto confirmation_failed; 1628 + } 1629 + 1630 + /* 1631 + * Parsing SP Confirms: the first element of @val is the preferred 1632 + * SP value which the peer confirms, the remainder depends on @len. 1633 + * Note that only the confirmed value needs to be a valid SP value. 1634 + */ 1635 + if (!dccp_feat_is_valid_sp_val(feat, *val)) 1636 + goto confirmation_failed; 1637 + 1638 + if (len == 1) { /* peer didn't supply a preference list */ 1639 + plist = val; 1640 + plen = len; 1641 + } else { /* preferred value + preference list */ 1642 + plist = val + 1; 1643 + plen = len - 1; 1644 + } 1645 + 1646 + /* Check whether the peer got the reconciliation right (6.6.8) */ 1647 + if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { 1648 + DCCP_WARN("Confirm selected the wrong value %u\n", *val); 1649 + return DCCP_RESET_CODE_OPTION_ERROR; 1650 + } 1651 + entry->val.sp.vec[0] = *val; 1652 + 1653 + confirmation_succeeded: 1654 + entry->state = FEAT_STABLE; 1655 + return 0; 1656 + 1657 + confirmation_failed: 1658 + DCCP_WARN("Confirmation failed\n"); 1659 + return is_mandatory ? 
DCCP_RESET_CODE_MANDATORY_ERROR 1660 + : DCCP_RESET_CODE_OPTION_ERROR; 1661 + } 1662 + 1663 + /** 1664 + * dccp_feat_handle_nn_established - Fast-path reception of NN options 1665 + * @sk: socket of an established DCCP connection 1666 + * @mandatory: whether @opt was preceded by a Mandatory option 1667 + * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only) 1668 + * @feat: NN number, one of %dccp_feature_numbers 1669 + * @val: NN value 1670 + * @len: length of @val in bytes 1671 + * This function combines the functionality of change_recv/confirm_recv, with 1672 + * the following differences (reset codes are the same): 1673 + * - cleanup after receiving the Confirm; 1674 + * - values are directly activated after successful parsing; 1675 + * - deliberately restricted to NN features. 1676 + * The restriction to NN features is essential since SP features can have non- 1677 + * predictable outcomes (depending on the remote configuration), and are inter- 1678 + * dependent (CCIDs for instance cause further dependencies). 1679 + */ 1680 + static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt, 1681 + u8 feat, u8 *val, u8 len) 1682 + { 1683 + struct list_head *fn = &dccp_sk(sk)->dccps_featneg; 1684 + const bool local = (opt == DCCPO_CONFIRM_R); 1685 + struct dccp_feat_entry *entry; 1686 + u8 type = dccp_feat_type(feat); 1687 + dccp_feat_val fval; 1688 + 1689 + dccp_feat_print_opt(opt, feat, val, len, mandatory); 1690 + 1691 + /* Ignore non-mandatory unknown and non-NN features */ 1692 + if (type == FEAT_UNKNOWN) { 1693 + if (local && !mandatory) 1694 + return 0; 1695 + goto fast_path_unknown; 1696 + } else if (type != FEAT_NN) { 1697 + return 0; 1698 + } 1699 + 1700 + /* 1701 + * We don't accept empty Confirms, since in fast-path feature 1702 + * negotiation the values are enabled immediately after sending 1703 + * the Change option. 1704 + * Empty Changes on the other hand are invalid (RFC 4340, 6.1). 
1705 + */ 1706 + if (len == 0 || len > sizeof(fval.nn)) 1707 + goto fast_path_unknown; 1708 + 1709 + if (opt == DCCPO_CHANGE_L) { 1710 + fval.nn = dccp_decode_value_var(val, len); 1711 + if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) 1712 + goto fast_path_unknown; 1713 + 1714 + if (dccp_feat_push_confirm(fn, feat, local, &fval) || 1715 + dccp_feat_activate(sk, feat, local, &fval)) 1716 + return DCCP_RESET_CODE_TOO_BUSY; 1717 + 1718 + /* set the `Ack Pending' flag to piggyback a Confirm */ 1719 + inet_csk_schedule_ack(sk); 1720 + 1721 + } else if (opt == DCCPO_CONFIRM_R) { 1722 + entry = dccp_feat_list_lookup(fn, feat, local); 1723 + if (entry == NULL || entry->state != FEAT_CHANGING) 1724 + return 0; 1725 + 1726 + fval.nn = dccp_decode_value_var(val, len); 1727 + if (fval.nn != entry->val.nn) { 1728 + DCCP_WARN("Bogus Confirm for non-existing value\n"); 1729 + goto fast_path_failed; 1730 + } 1731 + 1732 + /* It has been confirmed - so remove the entry */ 1733 + dccp_feat_list_pop(entry); 1734 + 1735 + } else { 1736 + DCCP_WARN("Received illegal option %u\n", opt); 1737 + goto fast_path_failed; 1738 + } 1739 + return 0; 1740 + 1741 + fast_path_unknown: 1742 + if (!mandatory) 1743 + return dccp_push_empty_confirm(fn, feat, local); 1744 + 1745 + fast_path_failed: 1746 + return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR 1747 + : DCCP_RESET_CODE_OPTION_ERROR; 1748 + } 1749 + 1750 + /** 1751 + * dccp_feat_parse_options - Process Feature-Negotiation Options 1752 + * @sk: for general use and used by the client during connection setup 1753 + * @dreq: used by the server during connection setup 1754 + * @mandatory: whether @opt was preceded by a Mandatory option 1755 + * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R 1756 + * @feat: one of %dccp_feature_numbers 1757 + * @val: value contents of @opt 1758 + * @len: length of @val in bytes 1759 + * Returns 0 on success, a Reset code for ending the connection otherwise. 
1760 + */ 1761 + int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, 1762 + u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) 1763 + { 1764 + struct dccp_sock *dp = dccp_sk(sk); 1765 + struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; 1766 + bool server = false; 1767 + 1768 + switch (sk->sk_state) { 1769 + /* 1770 + * Negotiation during connection setup 1771 + */ 1772 + case DCCP_LISTEN: 1773 + server = true; /* fall through */ 1774 + case DCCP_REQUESTING: 1775 + switch (opt) { 1776 + case DCCPO_CHANGE_L: 1777 + case DCCPO_CHANGE_R: 1778 + return dccp_feat_change_recv(fn, mandatory, opt, feat, 1779 + val, len, server); 1780 + case DCCPO_CONFIRM_R: 1781 + case DCCPO_CONFIRM_L: 1782 + return dccp_feat_confirm_recv(fn, mandatory, opt, feat, 1783 + val, len, server); 1784 + } 1785 + break; 1786 + /* 1787 + * Support for exchanging NN options on an established connection 1788 + * This is currently restricted to Ack Ratio (RFC 4341, 6.1.2) 1789 + */ 1790 + case DCCP_OPEN: 1791 + case DCCP_PARTOPEN: 1792 + return dccp_feat_handle_nn_established(sk, mandatory, opt, feat, 1793 + val, len); 1794 + } 1795 + return 0; /* ignore FN options in all other states */ 1796 + } 1797 + 1798 + /** 1799 + * dccp_feat_init - Seed feature negotiation with host-specific defaults 1800 + * This initialises global defaults, depending on the value of the sysctls. 1801 + * These can later be overridden by registering changes via setsockopt calls. 1802 + * The last link in the chain is finalise_settings, to make sure that between 1803 + * here and the start of actual feature negotiation no inconsistencies enter. 1804 + * 1805 + * All features not appearing below use either defaults or are otherwise 1806 + * later adjusted through dccp_feat_finalise_settings(). 
1807 + */ 1808 + int dccp_feat_init(struct sock *sk) 1809 + { 1810 + struct list_head *fn = &dccp_sk(sk)->dccps_featneg; 1811 + u8 on = 1, off = 0; 1812 + int rc; 1813 + struct { 1814 + u8 *val; 1815 + u8 len; 1816 + } tx, rx; 1817 + 1818 + /* Non-negotiable (NN) features */ 1819 + rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, 1820 + sysctl_dccp_sequence_window); 1821 + if (rc) 1822 + return rc; 1823 + 1824 + /* Server-priority (SP) features */ 1825 + 1826 + /* Advertise that short seqnos are not supported (7.6.1) */ 1827 + rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); 1828 + if (rc) 1829 + return rc; 1830 + 1831 + /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ 1832 + rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); 1833 + if (rc) 1834 + return rc; 1835 + 1836 + /* 1837 + * We advertise the available list of CCIDs and reorder according to 1838 + * preferences, to avoid failure resulting from negotiating different 1839 + * singleton values (which always leads to failure). 1840 + * These settings can still (later) be overridden via sockopts. 
1841 + */ 1842 + if (ccid_get_builtin_ccids(&tx.val, &tx.len) || 1843 + ccid_get_builtin_ccids(&rx.val, &rx.len)) 1844 + return -ENOBUFS; 1845 + 1846 + /* Pre-load all CCID modules that are going to be advertised */ 1847 + rc = -EUNATCH; 1848 + if (ccid_request_modules(tx.val, tx.len)) 1849 + goto free_ccid_lists; 1850 + 1851 + if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || 1852 + !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) 1853 + goto free_ccid_lists; 1854 + 1855 + rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); 1856 + if (rc) 1857 + goto free_ccid_lists; 1858 + 1859 + rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); 1860 + 1861 + free_ccid_lists: 1862 + kfree(tx.val); 1863 + kfree(rx.val); 1864 + return rc; 1865 + } 1866 + 1867 + int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) 1868 + { 1869 + struct dccp_sock *dp = dccp_sk(sk); 1870 + struct dccp_feat_entry *cur, *next; 1871 + int idx; 1872 + dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { 1873 + [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } 1874 + }; 1875 + 1876 + list_for_each_entry(cur, fn_list, node) { 1877 + /* 1878 + * An empty Confirm means that either an unknown feature type 1879 + * or an invalid value was present. In the first case there is 1880 + * nothing to activate, in the other the default value is used. 1881 + */ 1882 + if (cur->empty_confirm) 1883 + continue; 1884 + 1885 + idx = dccp_feat_index(cur->feat_num); 1886 + if (idx < 0) { 1887 + DCCP_BUG("Unknown feature %u", cur->feat_num); 1888 + goto activation_failed; 1889 + } 1890 + if (cur->state != FEAT_STABLE) { 1891 + DCCP_CRIT("Negotiation of %s %s failed in state %s", 1892 + cur->is_local ? 
"local" : "remote", 1893 + dccp_feat_fname(cur->feat_num), 1894 + dccp_feat_sname[cur->state]); 1895 + goto activation_failed; 1896 + } 1897 + fvals[idx][cur->is_local] = &cur->val; 1898 + } 1899 + 1900 + /* 1901 + * Activate in decreasing order of index, so that the CCIDs are always 1902 + * activated as the last feature. This avoids the case where a CCID 1903 + * relies on the initialisation of one or more features that it depends 1904 + * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). 1905 + */ 1906 + for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) 1907 + if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || 1908 + __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { 1909 + DCCP_CRIT("Could not activate %d", idx); 1910 + goto activation_failed; 1911 + } 1912 + 1913 + /* Clean up Change options which have been confirmed already */ 1914 + list_for_each_entry_safe(cur, next, fn_list, node) 1915 + if (!cur->needs_confirm) 1916 + dccp_feat_list_pop(cur); 1917 + 1918 + dccp_pr_debug("Activation OK\n"); 1919 + return 0; 1920 + 1921 + activation_failed: 1922 + /* 1923 + * We clean up everything that may have been allocated, since 1924 + * it is difficult to track at which stage negotiation failed. 1925 + * This is ok, since all allocation functions below are robust 1926 + * against NULL arguments. 1927 + */ 1928 + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); 1929 + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); 1930 + dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; 1931 + dccp_ackvec_free(dp->dccps_hc_rx_ackvec); 1932 + dp->dccps_hc_rx_ackvec = NULL; 1933 + return -1; 1934 + }
+119 -23
net/dccp/feat.h
··· 3 3 /* 4 4 * net/dccp/feat.h 5 5 * 6 - * An implementation of the DCCP protocol 6 + * Feature negotiation for the DCCP protocol (RFC 4340, section 6) 7 + * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk> 7 8 * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> 8 9 * 9 - * This program is free software; you can redistribute it and/or modify it 10 - * under the terms of the GNU General Public License version 2 as 11 - * published by the Free Software Foundation. 10 + * This program is free software; you can redistribute it and/or modify it 11 + * under the terms of the GNU General Public License version 2 as 12 + * published by the Free Software Foundation. 12 13 */ 13 - 14 14 #include <linux/types.h> 15 15 #include "dccp.h" 16 16 17 - #ifdef CONFIG_IP_DCCP_DEBUG 18 - extern const char *dccp_feat_typename(const u8 type); 19 - extern const char *dccp_feat_name(const u8 feat); 17 + /* 18 + * Known limit values 19 + */ 20 + /* Ack Ratio takes 2-byte integer values (11.3) */ 21 + #define DCCPF_ACK_RATIO_MAX 0xFFFF 22 + /* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ 23 + #define DCCPF_SEQ_WMIN 32 24 + #define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull 25 + /* Maximum number of SP values that fit in a single (Confirm) option */ 26 + #define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) 20 27 21 - static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) 28 + enum dccp_feat_type { 29 + FEAT_AT_RX = 1, /* located at RX side of half-connection */ 30 + FEAT_AT_TX = 2, /* located at TX side of half-connection */ 31 + FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */ 32 + FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */ 33 + FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */ 34 + }; 35 + 36 + enum dccp_feat_state { 37 + FEAT_DEFAULT = 0, /* using default values from 6.4 */ 38 + FEAT_INITIALISING, /* feature is being initialised */ 39 + FEAT_CHANGING, /* Change sent but not confirmed yet */ 40 + FEAT_UNSTABLE, /* local 
modification in state CHANGING */ 41 + FEAT_STABLE /* both ends (think they) agree */ 42 + }; 43 + 44 + /** 45 + * dccp_feat_val - Container for SP or NN feature values 46 + * @nn: single NN value 47 + * @sp.vec: single SP value plus optional preference list 48 + * @sp.len: length of @sp.vec in bytes 49 + */ 50 + typedef union { 51 + u64 nn; 52 + struct { 53 + u8 *vec; 54 + u8 len; 55 + } sp; 56 + } dccp_feat_val; 57 + 58 + /** 59 + * struct feat_entry - Data structure to perform feature negotiation 60 + * @feat_num: one of %dccp_feature_numbers 61 + * @val: feature's current value (SP features may have preference list) 62 + * @state: feature's current state 63 + * @needs_mandatory: whether Mandatory options should be sent 64 + * @needs_confirm: whether to send a Confirm instead of a Change 65 + * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm) 66 + * @is_local: feature location (1) or feature-remote (0) 67 + * @node: list pointers, entries arranged in FIFO order 68 + */ 69 + struct dccp_feat_entry { 70 + u8 feat_num; 71 + dccp_feat_val val; 72 + enum dccp_feat_state state:8; 73 + bool needs_mandatory:1, 74 + needs_confirm:1, 75 + empty_confirm:1, 76 + is_local:1; 77 + 78 + struct list_head node; 79 + }; 80 + 81 + static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) 22 82 { 23 - dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type), 24 - dccp_feat_name(feat), feat, val); 83 + if (entry->needs_confirm) 84 + return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; 85 + return entry->is_local ? 
DCCPO_CHANGE_L : DCCPO_CHANGE_R; 25 86 } 26 - #else 27 - #define dccp_feat_debug(type, feat, val) 28 - #endif /* CONFIG_IP_DCCP_DEBUG */ 29 87 30 - extern int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, 31 - u8 *val, u8 len, gfp_t gfp); 32 - extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, 33 - u8 *val, u8 len); 34 - extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, 35 - u8 *val, u8 len); 36 - extern void dccp_feat_clean(struct dccp_minisock *dmsk); 37 - extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); 38 - extern int dccp_feat_init(struct dccp_minisock *dmsk); 88 + /** 89 + * struct ccid_dependency - Track changes resulting from choosing a CCID 90 + * @dependent_feat: one of %dccp_feature_numbers 91 + * @is_local: local (1) or remote (0) @dependent_feat 92 + * @is_mandatory: whether presence of @dependent_feat is mission-critical or not 93 + * @val: corresponding default value for @dependent_feat (u8 is sufficient here) 94 + */ 95 + struct ccid_dependency { 96 + u8 dependent_feat; 97 + bool is_local:1, 98 + is_mandatory:1; 99 + u8 val; 100 + }; 39 101 102 + /* 103 + * Sysctls to seed defaults for feature negotiation 104 + */ 105 + extern unsigned long sysctl_dccp_sequence_window; 106 + extern int sysctl_dccp_rx_ccid; 107 + extern int sysctl_dccp_tx_ccid; 108 + 109 + extern int dccp_feat_init(struct sock *sk); 110 + extern void dccp_feat_initialise_sysctls(void); 111 + extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, 112 + u8 const *list, u8 len); 113 + extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); 114 + extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, 115 + u8 mand, u8 opt, u8 feat, u8 *val, u8 len); 116 + extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); 117 + 118 + /* 119 + * Encoding variable-length options and their maximum length. 
120 + * 121 + * This affects NN options (SP options are all u8) and other variable-length 122 + * options (see table 3 in RFC 4340). The limit is currently given the Sequence 123 + * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other 124 + * options consume less than 6 bytes (timestamps are 4 bytes). 125 + * When updating this constant (e.g. due to new internet drafts / RFCs), make 126 + * sure that you also update all code which refers to it. 127 + */ 128 + #define DCCP_OPTVAL_MAXLEN 6 129 + 130 + extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); 131 + extern u64 dccp_decode_value_var(const u8 *bf, const u8 len); 132 + 133 + extern int dccp_insert_option_mandatory(struct sk_buff *skb); 134 + extern int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, 135 + u8 *val, u8 len, bool repeat_first); 40 136 #endif /* _DCCP_FEAT_H */
+73 -91
net/dccp/input.c
··· 159 159 dccp_time_wait(sk, DCCP_TIME_WAIT, 0); 160 160 } 161 161 162 - static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) 162 + static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) 163 163 { 164 - struct dccp_sock *dp = dccp_sk(sk); 164 + struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; 165 165 166 - if (dccp_msk(sk)->dccpms_send_ack_vector) 167 - dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, 168 - DCCP_SKB_CB(skb)->dccpd_ack_seq); 166 + if (av == NULL) 167 + return; 168 + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) 169 + dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); 170 + dccp_ackvec_input(av, skb); 169 171 } 170 172 171 173 static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) ··· 366 364 int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, 367 365 const struct dccp_hdr *dh, const unsigned len) 368 366 { 369 - struct dccp_sock *dp = dccp_sk(sk); 370 - 371 367 if (dccp_check_seqno(sk, skb)) 372 368 goto discard; 373 369 374 370 if (dccp_parse_options(sk, NULL, skb)) 375 371 return 1; 376 372 377 - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) 378 - dccp_event_ack_recv(sk, skb); 379 - 380 - if (dccp_msk(sk)->dccpms_send_ack_vector && 381 - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, 382 - DCCP_SKB_CB(skb)->dccpd_seq, 383 - DCCP_ACKVEC_STATE_RECEIVED)) 384 - goto discard; 373 + dccp_handle_ackvec_processing(sk, skb); 385 374 dccp_deliver_input_to_ccids(sk, skb); 386 375 387 376 return __dccp_rcv_established(sk, skb, dh, len); ··· 414 421 goto out_invalid_packet; 415 422 } 416 423 424 + /* 425 + * If option processing (Step 8) failed, return 1 here so that 426 + * dccp_v4_do_rcv() sends a Reset. The Reset code depends on 427 + * the option type and is set in dccp_parse_options(). 
428 + */ 417 429 if (dccp_parse_options(sk, NULL, skb)) 418 - goto out_invalid_packet; 430 + return 1; 419 431 420 432 /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ 421 433 if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) 422 434 dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - 423 435 dp->dccps_options_received.dccpor_timestamp_echo)); 424 - 425 - if (dccp_msk(sk)->dccpms_send_ack_vector && 426 - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, 427 - DCCP_SKB_CB(skb)->dccpd_seq, 428 - DCCP_ACKVEC_STATE_RECEIVED)) 429 - goto out_invalid_packet; /* FIXME: change error code */ 430 436 431 437 /* Stop the REQUEST timer */ 432 438 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); ··· 433 441 kfree_skb(sk->sk_send_head); 434 442 sk->sk_send_head = NULL; 435 443 436 - dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; 437 - dccp_update_gsr(sk, dp->dccps_isr); 438 444 /* 439 - * SWL and AWL are initially adjusted so that they are not less than 440 - * the initial Sequence Numbers received and sent, respectively: 441 - * SWL := max(GSR + 1 - floor(W/4), ISR), 442 - * AWL := max(GSS - W' + 1, ISS). 443 - * These adjustments MUST be applied only at the beginning of the 444 - * connection. 445 - * 446 - * AWL was adjusted in dccp_v4_connect -acme 445 + * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect 446 + * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH 447 + * is done as part of activating the feature values below, since 448 + * these settings depend on the local/remote Sequence Window 449 + * features, which were undefined or not confirmed until now. 
 447 450 */ 448 - dccp_set_seqno(&dp->dccps_swl, 449 - max48(dp->dccps_swl, dp->dccps_isr)); 451 + dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; 450 452 451 453 dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); 452 454 ··· 460 474 * three-way handshake * / 461 475 */ 462 476 dccp_set_state(sk, DCCP_PARTOPEN); 477 + 478 + /* 479 + * If feature negotiation was successful, activate features now; 480 + * an activation failure means that this host could not activate 481 + * one or more features (e.g. insufficient memory), which would 482 + * leave at least one feature in an undefined state. 483 + */ 484 + if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) 485 + goto unable_to_proceed; 463 486 464 487 /* Make sure socket is routed, for correct metrics. */ 465 488 icsk->icsk_af_ops->rebuild_header(sk); ··· 503 508 out_invalid_packet: 504 509 /* dccp_v4_do_rcv will send a reset */ 505 510 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; 511 + return 1; 512 + 513 + unable_to_proceed: 514 + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; 515 + /* 516 + * We mark this socket as no longer usable, so that the loop in 517 + * dccp_sendmsg() terminates and the application gets notified. 
518 + */ 519 + dccp_set_state(sk, DCCP_CLOSED); 520 + sk->sk_err = ECOMM; 506 521 return 1; 507 522 } 508 523 ··· 595 590 if (inet_csk(sk)->icsk_af_ops->conn_request(sk, 596 591 skb) < 0) 597 592 return 1; 598 - 599 - /* FIXME: do congestion control initialization */ 600 593 goto discard; 601 594 } 602 595 if (dh->dccph_type == DCCP_PKT_RESET) ··· 603 600 /* Caller (dccp_v4_do_rcv) will send Reset */ 604 601 dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; 605 602 return 1; 603 + } else if (sk->sk_state == DCCP_CLOSED) { 604 + dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; 605 + return 1; 606 606 } 607 607 608 - if (sk->sk_state != DCCP_REQUESTING) { 609 - if (dccp_check_seqno(sk, skb)) 610 - goto discard; 608 + /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ 609 + if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) 610 + goto discard; 611 611 612 - /* 613 - * Step 8: Process options and mark acknowledgeable 614 - */ 615 - if (dccp_parse_options(sk, NULL, skb)) 616 - return 1; 617 - 618 - if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) 619 - dccp_event_ack_recv(sk, skb); 620 - 621 - if (dccp_msk(sk)->dccpms_send_ack_vector && 622 - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, 623 - DCCP_SKB_CB(skb)->dccpd_seq, 624 - DCCP_ACKVEC_STATE_RECEIVED)) 625 - goto discard; 626 - 627 - dccp_deliver_input_to_ccids(sk, skb); 612 + /* 613 + * Step 7: Check for unexpected packet types 614 + * If (S.is_server and P.type == Response) 615 + * or (S.is_client and P.type == Request) 616 + * or (S.state == RESPOND and P.type == Data), 617 + * Send Sync packet acknowledging P.seqno 618 + * Drop packet and return 619 + */ 620 + if ((dp->dccps_role != DCCP_ROLE_CLIENT && 621 + dh->dccph_type == DCCP_PKT_RESPONSE) || 622 + (dp->dccps_role == DCCP_ROLE_CLIENT && 623 + dh->dccph_type == DCCP_PKT_REQUEST) || 624 + (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { 625 + dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); 
626 + goto discard; 628 627 } 628 + 629 + /* Step 8: Process options */ 630 + if (dccp_parse_options(sk, NULL, skb)) 631 + return 1; 629 632 630 633 /* 631 634 * Step 9: Process Reset ··· 640 631 * S.state := TIMEWAIT 641 632 * Set TIMEWAIT timer 642 633 * Drop packet and return 643 - */ 634 + */ 644 635 if (dh->dccph_type == DCCP_PKT_RESET) { 645 636 dccp_rcv_reset(sk, skb); 646 637 return 0; 647 - /* 648 - * Step 7: Check for unexpected packet types 649 - * If (S.is_server and P.type == Response) 650 - * or (S.is_client and P.type == Request) 651 - * or (S.state == RESPOND and P.type == Data), 652 - * Send Sync packet acknowledging P.seqno 653 - * Drop packet and return 654 - */ 655 - } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && 656 - dh->dccph_type == DCCP_PKT_RESPONSE) || 657 - (dp->dccps_role == DCCP_ROLE_CLIENT && 658 - dh->dccph_type == DCCP_PKT_REQUEST) || 659 - (sk->sk_state == DCCP_RESPOND && 660 - dh->dccph_type == DCCP_PKT_DATA)) { 661 - dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); 662 - goto discard; 663 - } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { 638 + } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ 664 639 if (dccp_rcv_closereq(sk, skb)) 665 640 return 0; 666 641 goto discard; 667 - } else if (dh->dccph_type == DCCP_PKT_CLOSE) { 642 + } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ 668 643 if (dccp_rcv_close(sk, skb)) 669 644 return 0; 670 645 goto discard; 671 646 } 672 647 673 648 switch (sk->sk_state) { 674 - case DCCP_CLOSED: 675 - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; 676 - return 1; 677 - 678 649 case DCCP_REQUESTING: 679 - /* FIXME: do congestion control initialization */ 680 - 681 650 queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); 682 651 if (queued >= 0) 683 652 return queued; ··· 663 676 __kfree_skb(skb); 664 677 return 0; 665 678 666 - case DCCP_RESPOND: 667 679 case DCCP_PARTOPEN: 680 + /* Step 8: if using Ack Vectors, mark packet acknowledgeable */ 681 + 
dccp_handle_ackvec_processing(sk, skb); 682 + dccp_deliver_input_to_ccids(sk, skb); 683 + /* fall through */ 684 + case DCCP_RESPOND: 668 685 queued = dccp_rcv_respond_partopen_state_process(sk, skb, 669 686 dh, len); 670 687 break; ··· 707 716 /* dccpor_elapsed_time is either zeroed out or set and > 0 */ 708 717 delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; 709 718 710 - if (unlikely(delta <= 0)) { 711 - DCCP_WARN("unusable RTT sample %ld, using min\n", delta); 712 - return DCCP_SANE_RTT_MIN; 713 - } 714 - if (unlikely(delta > DCCP_SANE_RTT_MAX)) { 715 - DCCP_WARN("RTT sample %ld too large, using max\n", delta); 716 - return DCCP_SANE_RTT_MAX; 717 - } 718 - 719 - return delta; 719 + return dccp_sane_rtt(delta); 720 720 } 721 721 722 722 EXPORT_SYMBOL_GPL(dccp_sample_rtt);
+3 -1
net/dccp/ipv4.c
··· 545 545 546 546 static void dccp_v4_reqsk_destructor(struct request_sock *req) 547 547 { 548 + dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); 548 549 kfree(inet_rsk(req)->opt); 549 550 } 550 551 ··· 596 595 if (req == NULL) 597 596 goto drop; 598 597 599 - dccp_reqsk_init(req, skb); 598 + if (dccp_reqsk_init(req, dccp_sk(sk), skb)) 599 + goto drop_and_free; 600 600 601 601 dreq = dccp_rsk(req); 602 602 if (dccp_parse_options(sk, dreq, skb))
+3 -1
net/dccp/ipv6.c
··· 302 302 303 303 static void dccp_v6_reqsk_destructor(struct request_sock *req) 304 304 { 305 + dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); 305 306 if (inet6_rsk(req)->pktopts != NULL) 306 307 kfree_skb(inet6_rsk(req)->pktopts); 307 308 } ··· 425 424 if (req == NULL) 426 425 goto drop; 427 426 428 - dccp_reqsk_init(req, skb); 427 + if (dccp_reqsk_init(req, dccp_sk(sk), skb)) 428 + goto drop_and_free; 429 429 430 430 dreq = dccp_rsk(req); 431 431 if (dccp_parse_options(sk, dreq, skb))
+25 -66
net/dccp/minisocks.c
··· 42 42 43 43 EXPORT_SYMBOL_GPL(dccp_death_row); 44 44 45 - void dccp_minisock_init(struct dccp_minisock *dmsk) 46 - { 47 - dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; 48 - dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; 49 - dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; 50 - dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio; 51 - dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; 52 - dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; 53 - } 54 - 55 45 void dccp_time_wait(struct sock *sk, int state, int timeo) 56 46 { 57 47 struct inet_timewait_sock *tw = NULL; ··· 102 112 struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); 103 113 104 114 if (newsk != NULL) { 105 - const struct dccp_request_sock *dreq = dccp_rsk(req); 115 + struct dccp_request_sock *dreq = dccp_rsk(req); 106 116 struct inet_connection_sock *newicsk = inet_csk(newsk); 107 117 struct dccp_sock *newdp = dccp_sk(newsk); 108 - struct dccp_minisock *newdmsk = dccp_msk(newsk); 109 118 110 119 newdp->dccps_role = DCCP_ROLE_SERVER; 111 120 newdp->dccps_hc_rx_ackvec = NULL; ··· 114 125 newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; 115 126 newicsk->icsk_rto = DCCP_TIMEOUT_INIT; 116 127 117 - if (dccp_feat_clone(sk, newsk)) 118 - goto out_free; 128 + INIT_LIST_HEAD(&newdp->dccps_featneg); 129 + /* 130 + * Step 3: Process LISTEN state 131 + * 132 + * Choose S.ISS (initial seqno) or set from Init Cookies 133 + * Initialize S.GAR := S.ISS 134 + * Set S.ISR, S.GSR from packet (or Init Cookies) 135 + * 136 + * Setting AWL/AWH and SWL/SWH happens as part of the feature 137 + * activation below, as these windows all depend on the local 138 + * and remote Sequence Window feature values (7.5.2). 
139 + */ 140 + newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss; 141 + newdp->dccps_gar = newdp->dccps_iss; 142 + newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr; 119 143 120 - if (newdmsk->dccpms_send_ack_vector) { 121 - newdp->dccps_hc_rx_ackvec = 122 - dccp_ackvec_alloc(GFP_ATOMIC); 123 - if (unlikely(newdp->dccps_hc_rx_ackvec == NULL)) 124 - goto out_free; 125 - } 126 - 127 - newdp->dccps_hc_rx_ccid = 128 - ccid_hc_rx_new(newdmsk->dccpms_rx_ccid, 129 - newsk, GFP_ATOMIC); 130 - newdp->dccps_hc_tx_ccid = 131 - ccid_hc_tx_new(newdmsk->dccpms_tx_ccid, 132 - newsk, GFP_ATOMIC); 133 - if (unlikely(newdp->dccps_hc_rx_ccid == NULL || 134 - newdp->dccps_hc_tx_ccid == NULL)) { 135 - dccp_ackvec_free(newdp->dccps_hc_rx_ackvec); 136 - ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk); 137 - ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk); 138 - out_free: 144 + /* 145 + * Activate features: initialise CCIDs, sequence windows etc. 146 + */ 147 + if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { 139 148 /* It is still raw copy of parent, so invalidate 140 149 * destructor and make plain sk_free() */ 141 150 newsk->sk_destruct = NULL; 142 151 sk_free(newsk); 143 152 return NULL; 144 153 } 145 - 146 - /* 147 - * Step 3: Process LISTEN state 148 - * 149 - * Choose S.ISS (initial seqno) or set from Init Cookies 150 - * Initialize S.GAR := S.ISS 151 - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies 152 - */ 153 - 154 - /* See dccp_v4_conn_request */ 155 - newdmsk->dccpms_sequence_window = req->rcv_wnd; 156 - 157 - newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; 158 - dccp_update_gss(newsk, dreq->dreq_iss); 159 - 160 - newdp->dccps_isr = dreq->dreq_isr; 161 - dccp_update_gsr(newsk, dreq->dreq_isr); 162 - 163 - /* 164 - * SWL and AWL are initially adjusted so that they are not less than 165 - * the initial Sequence Numbers received and sent, respectively: 166 - * SWL := max(GSR + 1 - floor(W/4), ISR), 167 - * AWL := max(GSS - W' + 1, ISS). 
168 - * These adjustments MUST be applied only at the beginning of the 169 - * connection. 170 - */ 171 - dccp_set_seqno(&newdp->dccps_swl, 172 - max48(newdp->dccps_swl, newdp->dccps_isr)); 173 - dccp_set_seqno(&newdp->dccps_awl, 174 - max48(newdp->dccps_awl, newdp->dccps_iss)); 175 - 176 154 dccp_init_xmit_timers(newsk); 177 155 178 156 DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); ··· 260 304 261 305 EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); 262 306 263 - void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) 307 + int dccp_reqsk_init(struct request_sock *req, 308 + struct dccp_sock const *dp, struct sk_buff const *skb) 264 309 { 265 310 struct dccp_request_sock *dreq = dccp_rsk(req); 266 311 267 312 inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; 268 313 inet_rsk(req)->acked = 0; 269 - req->rcv_wnd = sysctl_dccp_feat_sequence_window; 270 314 dreq->dreq_timestamp_echo = 0; 315 + 316 + /* inherit feature negotiation options from listening socket */ 317 + return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); 271 318 } 272 319 273 320 EXPORT_SYMBOL_GPL(dccp_reqsk_init);
+176 -161
net/dccp/options.c
··· 23 23 #include "dccp.h" 24 24 #include "feat.h" 25 25 26 - int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; 27 - int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; 28 - int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; 29 - int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO; 30 - int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; 31 - int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; 32 - 33 - static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) 26 + u64 dccp_decode_value_var(const u8 *bf, const u8 len) 34 27 { 35 - u32 value = 0; 28 + u64 value = 0; 36 29 30 + if (len >= DCCP_OPTVAL_MAXLEN) 31 + value += ((u64)*bf++) << 40; 32 + if (len > 4) 33 + value += ((u64)*bf++) << 32; 37 34 if (len > 3) 38 - value += *bf++ << 24; 35 + value += ((u64)*bf++) << 24; 39 36 if (len > 2) 40 - value += *bf++ << 16; 37 + value += ((u64)*bf++) << 16; 41 38 if (len > 1) 42 - value += *bf++ << 8; 39 + value += ((u64)*bf++) << 8; 43 40 if (len > 0) 44 41 value += *bf; 45 42 ··· 54 57 struct dccp_sock *dp = dccp_sk(sk); 55 58 const struct dccp_hdr *dh = dccp_hdr(skb); 56 59 const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; 57 - u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; 58 60 unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); 59 61 unsigned char *opt_ptr = options; 60 62 const unsigned char *opt_end = (unsigned char *)dh + ··· 95 99 } 96 100 97 101 /* 98 - * CCID-Specific Options (from RFC 4340, sec. 10.3): 99 - * 100 - * Option numbers 128 through 191 are for options sent from the 101 - * HC-Sender to the HC-Receiver; option numbers 192 through 255 102 - * are for options sent from the HC-Receiver to the HC-Sender. 103 - * 104 102 * CCID-specific options are ignored during connection setup, as 105 103 * negotiation may still be in progress (see RFC 4340, 10.3). 106 104 * The same applies to Ack Vectors, as these depend on the CCID. 
107 - * 108 105 */ 109 - if (dreq != NULL && (opt >= 128 || 106 + if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || 110 107 opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) 111 108 goto ignore_option; 112 109 ··· 120 131 dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), 121 132 (unsigned long long)opt_recv->dccpor_ndp); 122 133 break; 123 - case DCCPO_CHANGE_L: 124 - /* fall through */ 125 - case DCCPO_CHANGE_R: 126 - if (pkt_type == DCCP_PKT_DATA) 134 + case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: 135 + if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ 127 136 break; 128 - if (len < 2) 129 - goto out_invalid_option; 130 - rc = dccp_feat_change_recv(sk, opt, *value, value + 1, 131 - len - 1); 132 - /* 133 - * When there is a change error, change_recv is 134 - * responsible for dealing with it. i.e. reply with an 135 - * empty confirm. 136 - * If the change was mandatory, then we need to die. 137 - */ 138 - if (rc && mandatory) 139 - goto out_invalid_option; 140 - break; 141 - case DCCPO_CONFIRM_L: 142 - /* fall through */ 143 - case DCCPO_CONFIRM_R: 144 - if (pkt_type == DCCP_PKT_DATA) 145 - break; 146 - if (len < 2) /* FIXME this disallows empty confirm */ 147 - goto out_invalid_option; 148 - if (dccp_feat_confirm_recv(sk, opt, *value, 149 - value + 1, len - 1)) 150 - goto out_invalid_option; 151 - break; 152 - case DCCPO_ACK_VECTOR_0: 153 - case DCCPO_ACK_VECTOR_1: 154 - if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ 155 - break; 156 - 157 - if (dccp_msk(sk)->dccpms_send_ack_vector && 158 - dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) 159 - goto out_invalid_option; 137 + rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, 138 + *value, value + 1, len - 1); 139 + if (rc) 140 + goto out_featneg_failed; 160 141 break; 161 142 case DCCPO_TIMESTAMP: 162 143 if (len != 4) ··· 154 195 dccp_role(sk), ntohl(opt_val), 155 196 (unsigned long long) 156 197 DCCP_SKB_CB(skb)->dccpd_ack_seq); 198 + /* schedule an Ack in case this sender is 
quiescent */ 199 + inet_csk_schedule_ack(sk); 157 200 break; 158 201 case DCCPO_TIMESTAMP_ECHO: 159 202 if (len != 4 && len != 6 && len != 8) ··· 212 251 dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", 213 252 dccp_role(sk), elapsed_time); 214 253 break; 215 - case 128 ... 191: { 216 - const u16 idx = value - options; 217 - 254 + case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC: 218 255 if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, 219 - opt, len, idx, 220 - value) != 0) 256 + pkt_type, opt, value, len)) 221 257 goto out_invalid_option; 222 - } 223 258 break; 224 - case 192 ... 255: { 225 - const u16 idx = value - options; 226 - 259 + case DCCPO_ACK_VECTOR_0: 260 + case DCCPO_ACK_VECTOR_1: 261 + if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ 262 + break; 263 + /* 264 + * Ack vectors are processed by the TX CCID if it is 265 + * interested. The RX CCID need not parse Ack Vectors, 266 + * since it is only interested in clearing old state. 267 + * Fall through. 268 + */ 269 + case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: 227 270 if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, 228 - opt, len, idx, 229 - value) != 0) 271 + pkt_type, opt, value, len)) 230 272 goto out_invalid_option; 231 - } 232 273 break; 233 274 default: 234 275 DCCP_CRIT("DCCP(%p): option %d(len=%d) not " ··· 252 289 253 290 out_invalid_option: 254 291 DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); 255 - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR; 256 - DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len); 292 + rc = DCCP_RESET_CODE_OPTION_ERROR; 293 + out_featneg_failed: 294 + DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); 295 + DCCP_SKB_CB(skb)->dccpd_reset_code = rc; 257 296 DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; 258 297 DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; 259 298 DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? 
value[1] : 0; ··· 264 299 265 300 EXPORT_SYMBOL_GPL(dccp_parse_options); 266 301 267 - static void dccp_encode_value_var(const u32 value, unsigned char *to, 268 - const unsigned int len) 302 + void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) 269 303 { 304 + if (len >= DCCP_OPTVAL_MAXLEN) 305 + *to++ = (value & 0xFF0000000000ull) >> 40; 306 + if (len > 4) 307 + *to++ = (value & 0xFF00000000ull) >> 32; 270 308 if (len > 3) 271 309 *to++ = (value & 0xFF000000) >> 24; 272 310 if (len > 2) ··· 429 461 return 0; 430 462 } 431 463 432 - static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat, 433 - u8 *val, u8 len) 464 + static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) 434 465 { 435 - u8 *to; 466 + struct dccp_sock *dp = dccp_sk(sk); 467 + struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; 468 + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); 469 + const u16 buflen = dccp_ackvec_buflen(av); 470 + /* Figure out how many options do we need to represent the ackvec */ 471 + const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); 472 + u16 len = buflen + 2 * nr_opts; 473 + u8 i, nonce = 0; 474 + const unsigned char *tail, *from; 475 + unsigned char *to; 436 476 437 - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) { 438 - DCCP_WARN("packet too small for feature %d option!\n", feat); 477 + if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { 478 + DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, 479 + dccp_packet_name(dcb->dccpd_type)); 439 480 return -1; 440 481 } 482 + /* 483 + * Since Ack Vectors are variable-length, we can not always predict 484 + * their size. To catch exception cases where the space is running out 485 + * on the skb, a separate Sync is scheduled to carry the Ack Vector. 
486 + */ 487 + if (len > DCCPAV_MIN_OPTLEN && 488 + len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { 489 + DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " 490 + "MPS=%u ==> reduce payload size?\n", len, skb->len, 491 + dcb->dccpd_opt_len, dp->dccps_mss_cache); 492 + dp->dccps_sync_scheduled = 1; 493 + return 0; 494 + } 495 + dcb->dccpd_opt_len += len; 441 496 442 - DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3; 497 + to = skb_push(skb, len); 498 + len = buflen; 499 + from = av->av_buf + av->av_buf_head; 500 + tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; 443 501 444 - to = skb_push(skb, len + 3); 445 - *to++ = type; 446 - *to++ = len + 3; 447 - *to++ = feat; 502 + for (i = 0; i < nr_opts; ++i) { 503 + int copylen = len; 448 504 449 - if (len) 450 - memcpy(to, val, len); 505 + if (len > DCCP_SINGLE_OPT_MAXLEN) 506 + copylen = DCCP_SINGLE_OPT_MAXLEN; 451 507 452 - dccp_pr_debug("%s(%s (%d), ...), length %d\n", 453 - dccp_feat_typename(type), 454 - dccp_feat_name(feat), feat, len); 508 + /* 509 + * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via 510 + * its type; ack_nonce is the sum of all individual buf_nonce's. 511 + */ 512 + nonce ^= av->av_buf_nonce[i]; 513 + 514 + *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; 515 + *to++ = copylen + 2; 516 + 517 + /* Check if buf_head wraps */ 518 + if (from + copylen > tail) { 519 + const u16 tailsize = tail - from; 520 + 521 + memcpy(to, from, tailsize); 522 + to += tailsize; 523 + len -= tailsize; 524 + copylen -= tailsize; 525 + from = av->av_buf; 526 + } 527 + 528 + memcpy(to, from, copylen); 529 + from += copylen; 530 + to += copylen; 531 + len -= copylen; 532 + } 533 + /* 534 + * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. 
535 + */ 536 + if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) 537 + return -ENOBUFS; 455 538 return 0; 456 539 } 457 540 458 - static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb) 541 + /** 542 + * dccp_insert_option_mandatory - Mandatory option (5.8.2) 543 + * Note that since we are using skb_push, this function needs to be called 544 + * _after_ inserting the option it is supposed to influence (stack order). 545 + */ 546 + int dccp_insert_option_mandatory(struct sk_buff *skb) 459 547 { 460 - struct dccp_sock *dp = dccp_sk(sk); 461 - struct dccp_minisock *dmsk = dccp_msk(sk); 462 - struct dccp_opt_pend *opt, *next; 463 - int change = 0; 548 + if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) 549 + return -1; 464 550 465 - /* confirm any options [NN opts] */ 466 - list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { 467 - dccp_insert_feat_opt(skb, opt->dccpop_type, 468 - opt->dccpop_feat, opt->dccpop_val, 469 - opt->dccpop_len); 470 - /* fear empty confirms */ 471 - if (opt->dccpop_val) 472 - kfree(opt->dccpop_val); 473 - kfree(opt); 474 - } 475 - INIT_LIST_HEAD(&dmsk->dccpms_conf); 551 + DCCP_SKB_CB(skb)->dccpd_opt_len++; 552 + *skb_push(skb, 1) = DCCPO_MANDATORY; 553 + return 0; 554 + } 476 555 477 - /* see which features we need to send */ 478 - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { 479 - /* see if we need to send any confirm */ 480 - if (opt->dccpop_sc) { 481 - dccp_insert_feat_opt(skb, opt->dccpop_type + 1, 482 - opt->dccpop_feat, 483 - opt->dccpop_sc->dccpoc_val, 484 - opt->dccpop_sc->dccpoc_len); 556 + /** 557 + * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb 558 + * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R 559 + * @feat: one out of %dccp_feature_numbers 560 + * @val: NN value or SP array (preferred element first) to copy 561 + * @len: true length of @val in bytes (excluding first element repetition) 562 + * 
@repeat_first: whether to copy the first element of @val twice 563 + * The last argument is used to construct Confirm options, where the preferred 564 + * value and the preference list appear separately (RFC 4340, 6.3.1). Preference 565 + * lists are kept such that the preferred entry is always first, so we only need 566 + * to copy twice, and avoid the overhead of cloning into a bigger array. 567 + */ 568 + int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, 569 + u8 *val, u8 len, bool repeat_first) 570 + { 571 + u8 tot_len, *to; 485 572 486 - BUG_ON(!opt->dccpop_sc->dccpoc_val); 487 - kfree(opt->dccpop_sc->dccpoc_val); 488 - kfree(opt->dccpop_sc); 489 - opt->dccpop_sc = NULL; 490 - } 491 - 492 - /* any option not confirmed, re-send it */ 493 - if (!opt->dccpop_conf) { 494 - dccp_insert_feat_opt(skb, opt->dccpop_type, 495 - opt->dccpop_feat, opt->dccpop_val, 496 - opt->dccpop_len); 497 - change++; 498 - } 573 + /* take the `Feature' field and possible repetition into account */ 574 + if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { 575 + DCCP_WARN("length %u for feature %u too large\n", len, feat); 576 + return -1; 499 577 } 500 578 501 - /* Retransmit timer. 502 - * If this is the master listening sock, we don't set a timer on it. It 503 - * should be fine because if the dude doesn't receive our RESPONSE 504 - * [which will contain the CHANGE] he will send another REQUEST which 505 - * will "retrnasmit" the change. 506 - */ 507 - if (change && dp->dccps_role != DCCP_ROLE_LISTEN) { 508 - dccp_pr_debug("reset feat negotiation timer %p\n", sk); 579 + if (unlikely(val == NULL || len == 0)) 580 + len = repeat_first = 0; 581 + tot_len = 3 + repeat_first + len; 509 582 510 - /* XXX don't reset the timer on re-transmissions. I.e. reset it 511 - * only when sending new stuff i guess. Currently the timer 512 - * never backs off because on re-transmission it just resets it! 
513 - */ 514 - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 515 - inet_csk(sk)->icsk_rto, DCCP_RTO_MAX); 583 + if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { 584 + DCCP_WARN("packet too small for feature %d option!\n", feat); 585 + return -1; 516 586 } 587 + DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; 517 588 589 + to = skb_push(skb, tot_len); 590 + *to++ = type; 591 + *to++ = tot_len; 592 + *to++ = feat; 593 + 594 + if (repeat_first) 595 + *to++ = *val; 596 + if (len) 597 + memcpy(to, val, len); 518 598 return 0; 519 599 } 520 600 ··· 581 565 int dccp_insert_options(struct sock *sk, struct sk_buff *skb) 582 566 { 583 567 struct dccp_sock *dp = dccp_sk(sk); 584 - struct dccp_minisock *dmsk = dccp_msk(sk); 585 568 586 569 DCCP_SKB_CB(skb)->dccpd_opt_len = 0; 587 570 588 - if (dmsk->dccpms_send_ndp_count && 589 - dccp_insert_option_ndp(sk, skb)) 571 + if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) 590 572 return -1; 591 573 592 - if (!dccp_packet_without_ack(skb)) { 593 - if (dmsk->dccpms_send_ack_vector && 594 - dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && 595 - dccp_insert_option_ackvec(sk, skb)) 574 + if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { 575 + 576 + /* Feature Negotiation */ 577 + if (dccp_feat_insert_opts(dp, NULL, skb)) 596 578 return -1; 579 + 580 + if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { 581 + /* 582 + * Obtain RTT sample from Request/Response exchange. 583 + * This is currently used in CCID 3 initialisation. 
584 + */ 585 + if (dccp_insert_option_timestamp(sk, skb)) 586 + return -1; 587 + 588 + } else if (dccp_ackvec_pending(sk) && 589 + dccp_insert_option_ackvec(sk, skb)) { 590 + return -1; 591 + } 597 592 } 598 593 599 594 if (dp->dccps_hc_rx_insert_options) { ··· 612 585 return -1; 613 586 dp->dccps_hc_rx_insert_options = 0; 614 587 } 615 - 616 - /* Feature negotiation */ 617 - /* Data packets can't do feat negotiation */ 618 - if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA && 619 - DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK && 620 - dccp_insert_options_feat(sk, skb)) 621 - return -1; 622 - 623 - /* 624 - * Obtain RTT sample from Request/Response exchange. 625 - * This is currently used in CCID 3 initialisation. 626 - */ 627 - if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST && 628 - dccp_insert_option_timestamp(sk, skb)) 629 - return -1; 630 588 631 589 if (dp->dccps_timestamp_echo != 0 && 632 590 dccp_insert_option_timestamp_echo(dp, NULL, skb)) ··· 624 612 int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) 625 613 { 626 614 DCCP_SKB_CB(skb)->dccpd_opt_len = 0; 615 + 616 + if (dccp_feat_insert_opts(NULL, dreq, skb)) 617 + return -1; 627 618 628 619 if (dreq->dreq_timestamp_echo != 0 && 629 620 dccp_insert_option_timestamp_echo(NULL, dreq, skb))
+179 -104
net/dccp/output.c
··· 26 26 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); 27 27 } 28 28 29 - static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb) 29 + /* enqueue @skb on sk_send_head for retransmission, return clone to send now */ 30 + static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb) 30 31 { 31 32 skb_set_owner_w(skb, sk); 32 33 WARN_ON(sk->sk_send_head); 33 34 sk->sk_send_head = skb; 35 + return skb_clone(sk->sk_send_head, gfp_any()); 34 36 } 35 37 36 38 /* ··· 163 161 struct inet_connection_sock *icsk = inet_csk(sk); 164 162 struct dccp_sock *dp = dccp_sk(sk); 165 163 u32 ccmps = dccp_determine_ccmps(dp); 166 - int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; 164 + u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; 167 165 168 166 /* Account for header lengths and IPv4/v6 option overhead */ 169 167 cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + 170 168 sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); 171 169 172 170 /* 173 - * FIXME: this should come from the CCID infrastructure, where, say, 174 - * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets 175 - * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED 176 - * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to 177 - * make it a multiple of 4 171 + * Leave enough headroom for common DCCP header options. 172 + * This only considers options which may appear on DCCP-Data packets, as 173 + * per table 3 in RFC 4340, 5.8. When running out of space for other 174 + * options (eg. Ack Vector which can take up to 255 bytes), it is better 175 + * to schedule a separate Ack. 
Thus we leave headroom for the following: 176 + * - 1 byte for Slow Receiver (11.6) 177 + * - 6 bytes for Timestamp (13.1) 178 + * - 10 bytes for Timestamp Echo (13.3) 179 + * - 8 bytes for NDP count (7.7, when activated) 180 + * - 6 bytes for Data Checksum (9.3) 181 + * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) 178 182 */ 179 - 180 - cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; 183 + cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + 184 + (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4); 181 185 182 186 /* And store cached results */ 183 187 icsk->icsk_pmtu_cookie = pmtu; ··· 208 200 } 209 201 210 202 /** 211 - * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet 203 + * dccp_wait_for_ccid - Await CCID send permission 212 204 * @sk: socket to wait for 213 - * @skb: current skb to pass on for waiting 214 - * @delay: sleep timeout in milliseconds (> 0) 215 - * This function is called by default when the socket is closed, and 216 - * when a non-zero linger time is set on the socket. For consistency 205 + * @delay: timeout in jiffies 206 + * This is used by CCIDs which need to delay the send time in process context. 
217 207 */ 218 - static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay) 208 + static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) 219 209 { 220 - struct dccp_sock *dp = dccp_sk(sk); 221 210 DEFINE_WAIT(wait); 222 - unsigned long jiffdelay; 223 - int rc; 211 + long remaining; 224 212 225 - do { 226 - dccp_pr_debug("delayed send by %d msec\n", delay); 227 - jiffdelay = msecs_to_jiffies(delay); 213 + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 214 + sk->sk_write_pending++; 215 + release_sock(sk); 228 216 229 - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 217 + remaining = schedule_timeout(delay); 230 218 231 - sk->sk_write_pending++; 232 - release_sock(sk); 233 - schedule_timeout(jiffdelay); 234 - lock_sock(sk); 235 - sk->sk_write_pending--; 236 - 237 - if (sk->sk_err) 238 - goto do_error; 239 - if (signal_pending(current)) 240 - goto do_interrupted; 241 - 242 - rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); 243 - } while ((delay = rc) > 0); 244 - out: 219 + lock_sock(sk); 220 + sk->sk_write_pending--; 245 221 finish_wait(sk->sk_sleep, &wait); 246 - return rc; 247 222 248 - do_error: 249 - rc = -EPIPE; 250 - goto out; 251 - do_interrupted: 252 - rc = -EINTR; 253 - goto out; 223 + if (signal_pending(current) || sk->sk_err) 224 + return -1; 225 + return remaining; 254 226 } 255 227 256 - void dccp_write_xmit(struct sock *sk, int block) 228 + /** 229 + * dccp_xmit_packet - Send data packet under control of CCID 230 + * Transmits next-queued payload and informs CCID to account for the packet. 
231 + */ 232 + static void dccp_xmit_packet(struct sock *sk) 233 + { 234 + int err, len; 235 + struct dccp_sock *dp = dccp_sk(sk); 236 + struct sk_buff *skb = dccp_qpolicy_pop(sk); 237 + 238 + if (unlikely(skb == NULL)) 239 + return; 240 + len = skb->len; 241 + 242 + if (sk->sk_state == DCCP_PARTOPEN) { 243 + const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; 244 + /* 245 + * See 8.1.5 - Handshake Completion. 246 + * 247 + * For robustness we resend Confirm options until the client has 248 + * entered OPEN. During the initial feature negotiation, the MPS 249 + * is smaller than usual, reduced by the Change/Confirm options. 250 + */ 251 + if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { 252 + DCCP_WARN("Payload too large (%d) for featneg.\n", len); 253 + dccp_send_ack(sk); 254 + dccp_feat_list_purge(&dp->dccps_featneg); 255 + } 256 + 257 + inet_csk_schedule_ack(sk); 258 + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 259 + inet_csk(sk)->icsk_rto, 260 + DCCP_RTO_MAX); 261 + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; 262 + } else if (dccp_ack_pending(sk)) { 263 + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; 264 + } else { 265 + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; 266 + } 267 + 268 + err = dccp_transmit_skb(sk, skb); 269 + if (err) 270 + dccp_pr_debug("transmit_skb() returned err=%d\n", err); 271 + /* 272 + * Register this one as sent even if an error occurred. To the remote 273 + * end a local packet drop is indistinguishable from network loss, i.e. 274 + * any local drop will eventually be reported via receiver feedback. 275 + */ 276 + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); 277 + 278 + /* 279 + * If the CCID needs to transfer additional header options out-of-band 280 + * (e.g. Ack Vectors or feature-negotiation options), it activates this 281 + * flag to schedule a Sync. The Sync will automatically incorporate all 282 + * currently pending header options, thus clearing the backlog. 
283 + */ 284 + if (dp->dccps_sync_scheduled) 285 + dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); 286 + } 287 + 288 + /** 289 + * dccp_flush_write_queue - Drain queue at end of connection 290 + * Since dccp_sendmsg queues packets without waiting for them to be sent, it may 291 + * happen that the TX queue is not empty at the end of a connection. We give the 292 + * HC-sender CCID a grace period of up to @time_budget jiffies. If this function 293 + * returns with a non-empty write queue, it will be purged later. 294 + */ 295 + void dccp_flush_write_queue(struct sock *sk, long *time_budget) 296 + { 297 + struct dccp_sock *dp = dccp_sk(sk); 298 + struct sk_buff *skb; 299 + long delay, rc; 300 + 301 + while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { 302 + rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); 303 + 304 + switch (ccid_packet_dequeue_eval(rc)) { 305 + case CCID_PACKET_WILL_DEQUEUE_LATER: 306 + /* 307 + * If the CCID determines when to send, the next sending 308 + * time is unknown or the CCID may not even send again 309 + * (e.g. remote host crashes or lost Ack packets). 
310 + */ 311 + DCCP_WARN("CCID did not manage to send all packets\n"); 312 + return; 313 + case CCID_PACKET_DELAY: 314 + delay = msecs_to_jiffies(rc); 315 + if (delay > *time_budget) 316 + return; 317 + rc = dccp_wait_for_ccid(sk, delay); 318 + if (rc < 0) 319 + return; 320 + *time_budget -= (delay - rc); 321 + /* check again if we can send now */ 322 + break; 323 + case CCID_PACKET_SEND_AT_ONCE: 324 + dccp_xmit_packet(sk); 325 + break; 326 + case CCID_PACKET_ERR: 327 + skb_dequeue(&sk->sk_write_queue); 328 + kfree_skb(skb); 329 + dccp_pr_debug("packet discarded due to err=%ld\n", rc); 330 + } 331 + } 332 + } 333 + 334 + void dccp_write_xmit(struct sock *sk) 257 335 { 258 336 struct dccp_sock *dp = dccp_sk(sk); 259 337 struct sk_buff *skb; 260 338 261 - while ((skb = skb_peek(&sk->sk_write_queue))) { 262 - int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); 339 + while ((skb = dccp_qpolicy_top(sk))) { 340 + int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); 263 341 264 - if (err > 0) { 265 - if (!block) { 266 - sk_reset_timer(sk, &dp->dccps_xmit_timer, 267 - msecs_to_jiffies(err)+jiffies); 268 - break; 269 - } else 270 - err = dccp_wait_for_ccid(sk, skb, err); 271 - if (err && err != -EINTR) 272 - DCCP_BUG("err=%d after dccp_wait_for_ccid", err); 273 - } 274 - 275 - skb_dequeue(&sk->sk_write_queue); 276 - if (err == 0) { 277 - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); 278 - const int len = skb->len; 279 - 280 - if (sk->sk_state == DCCP_PARTOPEN) { 281 - /* See 8.1.5. 
Handshake Completion */ 282 - inet_csk_schedule_ack(sk); 283 - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 284 - inet_csk(sk)->icsk_rto, 285 - DCCP_RTO_MAX); 286 - dcb->dccpd_type = DCCP_PKT_DATAACK; 287 - } else if (dccp_ack_pending(sk)) 288 - dcb->dccpd_type = DCCP_PKT_DATAACK; 289 - else 290 - dcb->dccpd_type = DCCP_PKT_DATA; 291 - 292 - err = dccp_transmit_skb(sk, skb); 293 - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); 294 - if (err) 295 - DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", 296 - err); 297 - } else { 298 - dccp_pr_debug("packet discarded due to err=%d\n", err); 299 - kfree_skb(skb); 342 + switch (ccid_packet_dequeue_eval(rc)) { 343 + case CCID_PACKET_WILL_DEQUEUE_LATER: 344 + return; 345 + case CCID_PACKET_DELAY: 346 + sk_reset_timer(sk, &dp->dccps_xmit_timer, 347 + jiffies + msecs_to_jiffies(rc)); 348 + return; 349 + case CCID_PACKET_SEND_AT_ONCE: 350 + dccp_xmit_packet(sk); 351 + break; 352 + case CCID_PACKET_ERR: 353 + dccp_qpolicy_drop(sk, skb); 354 + dccp_pr_debug("packet discarded due to err=%d\n", rc); 300 355 } 301 356 } 302 357 } ··· 410 339 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; 411 340 DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; 412 341 413 - if (dccp_insert_options_rsk(dreq, skb)) { 414 - kfree_skb(skb); 415 - return NULL; 416 - } 342 + /* Resolve feature dependencies resulting from choice of CCID */ 343 + if (dccp_feat_server_ccid_dependencies(dreq)) 344 + goto response_failed; 345 + 346 + if (dccp_insert_options_rsk(dreq, skb)) 347 + goto response_failed; 417 348 418 349 /* Build and checksum header */ 419 350 dh = dccp_zeroed_hdr(skb, dccp_header_size); ··· 436 363 inet_rsk(req)->acked = 1; 437 364 DCCP_INC_STATS(DCCP_MIB_OUTSEGS); 438 365 return skb; 366 + response_failed: 367 + kfree_skb(skb); 368 + return NULL; 439 369 } 440 370 441 371 EXPORT_SYMBOL_GPL(dccp_make_response); ··· 523 447 /* 524 448 * Do all connect socket setups that can be done AF independent. 
525 449 */ 526 - static inline void dccp_connect_init(struct sock *sk) 450 + int dccp_connect(struct sock *sk) 527 451 { 452 + struct sk_buff *skb; 528 453 struct dccp_sock *dp = dccp_sk(sk); 529 454 struct dst_entry *dst = __sk_dst_get(sk); 530 455 struct inet_connection_sock *icsk = inet_csk(sk); ··· 535 458 536 459 dccp_sync_mss(sk, dst_mtu(dst)); 537 460 461 + /* do not connect if feature negotiation setup fails */ 462 + if (dccp_feat_finalise_settings(dccp_sk(sk))) 463 + return -EPROTO; 464 + 538 465 /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ 539 466 dp->dccps_gar = dp->dccps_iss; 540 - 541 - icsk->icsk_retransmits = 0; 542 - } 543 - 544 - int dccp_connect(struct sock *sk) 545 - { 546 - struct sk_buff *skb; 547 - struct inet_connection_sock *icsk = inet_csk(sk); 548 - 549 - dccp_connect_init(sk); 550 467 551 468 skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); 552 469 if (unlikely(skb == NULL)) ··· 551 480 552 481 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; 553 482 554 - dccp_skb_entail(sk, skb); 555 - dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); 483 + dccp_transmit_skb(sk, dccp_skb_entail(sk, skb)); 556 484 DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); 557 485 558 486 /* Timer for repeating the REQUEST until an answer. */ 487 + icsk->icsk_retransmits = 0; 559 488 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 560 489 icsk->icsk_rto, DCCP_RTO_MAX); 561 490 return 0; ··· 642 571 DCCP_SKB_CB(skb)->dccpd_type = pkt_type; 643 572 DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; 644 573 574 + /* 575 + * Clear the flag in case the Sync was scheduled for out-of-band data, 576 + * such as carrying a long Ack Vector. 
577 + */ 578 + dccp_sk(sk)->dccps_sync_scheduled = 0; 579 + 645 580 dccp_transmit_skb(sk, skb); 646 581 } 647 582 ··· 676 599 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; 677 600 678 601 if (active) { 679 - dccp_write_xmit(sk, 1); 680 - dccp_skb_entail(sk, skb); 681 - dccp_transmit_skb(sk, skb_clone(skb, prio)); 602 + skb = dccp_skb_entail(sk, skb); 682 603 /* 683 604 * Retransmission timer for active-close: RFC 4340, 8.3 requires 684 605 * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ ··· 689 614 */ 690 615 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 691 616 DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); 692 - } else 693 - dccp_transmit_skb(sk, skb); 617 + } 618 + dccp_transmit_skb(sk, skb); 694 619 }
+27 -48
net/dccp/probe.c
··· 46 46 struct kfifo *fifo; 47 47 spinlock_t lock; 48 48 wait_queue_head_t wait; 49 - struct timespec tstart; 49 + ktime_t start; 50 50 } dccpw; 51 51 52 - static void printl(const char *fmt, ...) 52 + static void jdccp_write_xmit(struct sock *sk) 53 53 { 54 - va_list args; 55 - int len; 56 - struct timespec now; 57 - char tbuf[256]; 58 - 59 - va_start(args, fmt); 60 - getnstimeofday(&now); 61 - 62 - now = timespec_sub(now, dccpw.tstart); 63 - 64 - len = sprintf(tbuf, "%lu.%06lu ", 65 - (unsigned long) now.tv_sec, 66 - (unsigned long) now.tv_nsec / NSEC_PER_USEC); 67 - len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); 68 - va_end(args); 69 - 70 - kfifo_put(dccpw.fifo, tbuf, len); 71 - wake_up(&dccpw.wait); 72 - } 73 - 74 - static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk, 75 - struct msghdr *msg, size_t size) 76 - { 77 - const struct dccp_minisock *dmsk = dccp_msk(sk); 78 54 const struct inet_sock *inet = inet_sk(sk); 79 - const struct ccid3_hc_tx_sock *hctx; 55 + struct ccid3_hc_tx_sock *hctx = NULL; 56 + struct timespec tv; 57 + char buf[256]; 58 + int len, ccid = ccid_get_current_tx_ccid(dccp_sk(sk)); 80 59 81 - if (dmsk->dccpms_tx_ccid == DCCPC_CCID3) 60 + if (ccid == DCCPC_CCID3) 82 61 hctx = ccid3_hc_tx_sk(sk); 83 - else 84 - hctx = NULL; 85 62 86 - if (port == 0 || ntohs(inet->dport) == port || 87 - ntohs(inet->sport) == port) { 63 + if (!port || ntohs(inet->dport) == port || ntohs(inet->sport) == port) { 64 + 65 + tv = ktime_to_timespec(ktime_sub(ktime_get(), dccpw.start)); 66 + len = sprintf(buf, "%lu.%09lu %d.%d.%d.%d:%u %d.%d.%d.%d:%u %d", 67 + (unsigned long)tv.tv_sec, 68 + (unsigned long)tv.tv_nsec, 69 + NIPQUAD(inet->saddr), ntohs(inet->sport), 70 + NIPQUAD(inet->daddr), ntohs(inet->dport), ccid); 71 + 88 72 if (hctx) 89 - printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %u " 90 - "%llu %llu %d\n", 91 - NIPQUAD(inet->saddr), ntohs(inet->sport), 92 - NIPQUAD(inet->daddr), ntohs(inet->dport), size, 93 - hctx->ccid3hctx_s, 
hctx->ccid3hctx_rtt, 94 - hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc, 95 - hctx->ccid3hctx_x_recv >> 6, 96 - hctx->ccid3hctx_x >> 6, hctx->ccid3hctx_t_ipi); 97 - else 98 - printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n", 99 - NIPQUAD(inet->saddr), ntohs(inet->sport), 100 - NIPQUAD(inet->daddr), ntohs(inet->dport), size); 73 + len += sprintf(buf + len, " %d %d %d %u %u %u %d", 74 + hctx->s, hctx->rtt, hctx->p, hctx->x_calc, 75 + (unsigned)(hctx->x_recv >> 6), 76 + (unsigned)(hctx->x >> 6), hctx->t_ipi); 77 + 78 + len += sprintf(buf + len, "\n"); 79 + kfifo_put(dccpw.fifo, buf, len); 80 + wake_up(&dccpw.wait); 101 81 } 102 82 103 83 jprobe_return(); 104 - return 0; 105 84 } 106 85 107 86 static struct jprobe dccp_send_probe = { 108 87 .kp = { 109 - .symbol_name = "dccp_sendmsg", 88 + .symbol_name = "dccp_write_xmit", 110 89 }, 111 - .entry = jdccp_sendmsg, 90 + .entry = jdccp_write_xmit, 112 91 }; 113 92 114 93 static int dccpprobe_open(struct inode *inode, struct file *file) 115 94 { 116 95 kfifo_reset(dccpw.fifo); 117 - getnstimeofday(&dccpw.tstart); 96 + dccpw.start = ktime_get(); 118 97 return 0; 119 98 } 120 99
+175 -106
net/dccp/proto.c
··· 67 67 case DCCP_OPEN: 68 68 if (oldstate != DCCP_OPEN) 69 69 DCCP_INC_STATS(DCCP_MIB_CURRESTAB); 70 + /* Client retransmits all Confirm options until entering OPEN */ 71 + if (oldstate == DCCP_PARTOPEN) 72 + dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); 70 73 break; 71 74 72 75 case DCCP_CLOSED: ··· 178 175 int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) 179 176 { 180 177 struct dccp_sock *dp = dccp_sk(sk); 181 - struct dccp_minisock *dmsk = dccp_msk(sk); 182 178 struct inet_connection_sock *icsk = inet_csk(sk); 183 - 184 - dccp_minisock_init(&dp->dccps_minisock); 185 179 186 180 icsk->icsk_rto = DCCP_TIMEOUT_INIT; 187 181 icsk->icsk_syn_retries = sysctl_dccp_request_retries; 188 182 sk->sk_state = DCCP_CLOSED; 189 183 sk->sk_write_space = dccp_write_space; 190 184 icsk->icsk_sync_mss = dccp_sync_mss; 191 - dp->dccps_mss_cache = 536; 185 + dp->dccps_mss_cache = TCP_MIN_RCVMSS; 192 186 dp->dccps_rate_last = jiffies; 193 187 dp->dccps_role = DCCP_ROLE_UNDEFINED; 194 188 dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; 195 - dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1; 189 + dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; 196 190 197 191 dccp_init_xmit_timers(sk); 198 192 199 - /* 200 - * FIXME: We're hardcoding the CCID, and doing this at this point makes 201 - * the listening (master) sock get CCID control blocks, which is not 202 - * necessary, but for now, to not mess with the test userspace apps, 203 - * lets leave it here, later the real solution is to do this in a 204 - * setsockopt(CCIDs-I-want/accept). 
-acme 205 - */ 206 - if (likely(ctl_sock_initialized)) { 207 - int rc = dccp_feat_init(dmsk); 208 - 209 - if (rc) 210 - return rc; 211 - 212 - if (dmsk->dccpms_send_ack_vector) { 213 - dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL); 214 - if (dp->dccps_hc_rx_ackvec == NULL) 215 - return -ENOMEM; 216 - } 217 - dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid, 218 - sk, GFP_KERNEL); 219 - dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid, 220 - sk, GFP_KERNEL); 221 - if (unlikely(dp->dccps_hc_rx_ccid == NULL || 222 - dp->dccps_hc_tx_ccid == NULL)) { 223 - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); 224 - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); 225 - if (dmsk->dccpms_send_ack_vector) { 226 - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); 227 - dp->dccps_hc_rx_ackvec = NULL; 228 - } 229 - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; 230 - return -ENOMEM; 231 - } 232 - } else { 233 - /* control socket doesn't need feat nego */ 234 - INIT_LIST_HEAD(&dmsk->dccpms_pending); 235 - INIT_LIST_HEAD(&dmsk->dccpms_conf); 236 - } 237 - 193 + INIT_LIST_HEAD(&dp->dccps_featneg); 194 + /* control socket doesn't need feat nego */ 195 + if (likely(ctl_sock_initialized)) 196 + return dccp_feat_init(sk); 238 197 return 0; 239 198 } 240 199 ··· 205 240 void dccp_destroy_sock(struct sock *sk) 206 241 { 207 242 struct dccp_sock *dp = dccp_sk(sk); 208 - struct dccp_minisock *dmsk = dccp_msk(sk); 209 243 210 244 /* 211 245 * DCCP doesn't use sk_write_queue, just sk_send_head ··· 222 258 kfree(dp->dccps_service_list); 223 259 dp->dccps_service_list = NULL; 224 260 225 - if (dmsk->dccpms_send_ack_vector) { 261 + if (dp->dccps_hc_rx_ackvec != NULL) { 226 262 dccp_ackvec_free(dp->dccps_hc_rx_ackvec); 227 263 dp->dccps_hc_rx_ackvec = NULL; 228 264 } ··· 231 267 dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; 232 268 233 269 /* clean up feature negotiation state */ 234 - dccp_feat_clean(dmsk); 270 + dccp_feat_list_purge(&dp->dccps_featneg); 235 271 } 236 272 
237 273 EXPORT_SYMBOL_GPL(dccp_destroy_sock); ··· 241 277 struct dccp_sock *dp = dccp_sk(sk); 242 278 243 279 dp->dccps_role = DCCP_ROLE_LISTEN; 280 + /* do not start to listen if feature negotiation setup fails */ 281 + if (dccp_feat_finalise_settings(dp)) 282 + return -EPROTO; 244 283 return inet_csk_listen_start(sk, backlog); 245 284 } 246 285 ··· 433 466 return 0; 434 467 } 435 468 436 - /* byte 1 is feature. the rest is the preference list */ 437 - static int dccp_setsockopt_change(struct sock *sk, int type, 438 - struct dccp_so_feat __user *optval) 469 + static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) 439 470 { 440 - struct dccp_so_feat opt; 441 - u8 *val; 442 - int rc; 471 + u8 *list, len; 472 + int i, rc; 443 473 444 - if (copy_from_user(&opt, optval, sizeof(opt))) 445 - return -EFAULT; 474 + if (cscov < 0 || cscov > 15) 475 + return -EINVAL; 446 476 /* 447 - * rfc4340: 6.1. Change Options 477 + * Populate a list of permissible values, in the range cscov...15. This 478 + * is necessary since feature negotiation of single values only works if 479 + * both sides incidentally choose the same value. Since the list starts 480 + * lowest-value first, negotiation will pick the smallest shared value. 
448 481 */ 449 - if (opt.dccpsf_len < 1) 482 + if (cscov == 0) 483 + return 0; 484 + len = 16 - cscov; 485 + 486 + list = kmalloc(len, GFP_KERNEL); 487 + if (list == NULL) 488 + return -ENOBUFS; 489 + 490 + for (i = 0; i < len; i++) 491 + list[i] = cscov++; 492 + 493 + rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); 494 + 495 + if (rc == 0) { 496 + if (rx) 497 + dccp_sk(sk)->dccps_pcrlen = cscov; 498 + else 499 + dccp_sk(sk)->dccps_pcslen = cscov; 500 + } 501 + kfree(list); 502 + return rc; 503 + } 504 + 505 + static int dccp_setsockopt_ccid(struct sock *sk, int type, 506 + char __user *optval, int optlen) 507 + { 508 + u8 *val; 509 + int rc = 0; 510 + 511 + if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) 450 512 return -EINVAL; 451 513 452 - val = kmalloc(opt.dccpsf_len, GFP_KERNEL); 453 - if (!val) 514 + val = kmalloc(optlen, GFP_KERNEL); 515 + if (val == NULL) 454 516 return -ENOMEM; 455 517 456 - if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) { 457 - rc = -EFAULT; 458 - goto out_free_val; 518 + if (copy_from_user(val, optval, optlen)) { 519 + kfree(val); 520 + return -EFAULT; 459 521 } 460 522 461 - rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat, 462 - val, opt.dccpsf_len, GFP_KERNEL); 463 - if (rc) 464 - goto out_free_val; 523 + lock_sock(sk); 524 + if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) 525 + rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); 465 526 466 - out: 467 - return rc; 527 + if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) 528 + rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); 529 + release_sock(sk); 468 530 469 - out_free_val: 470 531 kfree(val); 471 - goto out; 532 + return rc; 472 533 } 473 534 474 535 static int do_dccp_setsockopt(struct sock *sk, int level, int optname, ··· 505 510 struct dccp_sock *dp = dccp_sk(sk); 506 511 int val, err = 0; 507 512 508 - if (optlen < sizeof(int)) 513 + switch (optname) { 514 + case 
DCCP_SOCKOPT_PACKET_SIZE: 515 + DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); 516 + return 0; 517 + case DCCP_SOCKOPT_CHANGE_L: 518 + case DCCP_SOCKOPT_CHANGE_R: 519 + DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); 520 + return 0; 521 + case DCCP_SOCKOPT_CCID: 522 + case DCCP_SOCKOPT_RX_CCID: 523 + case DCCP_SOCKOPT_TX_CCID: 524 + return dccp_setsockopt_ccid(sk, optname, optval, optlen); 525 + } 526 + 527 + if (optlen < (int)sizeof(int)) 509 528 return -EINVAL; 510 529 511 530 if (get_user(val, (int __user *)optval)) ··· 530 521 531 522 lock_sock(sk); 532 523 switch (optname) { 533 - case DCCP_SOCKOPT_PACKET_SIZE: 534 - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); 535 - err = 0; 536 - break; 537 - case DCCP_SOCKOPT_CHANGE_L: 538 - if (optlen != sizeof(struct dccp_so_feat)) 539 - err = -EINVAL; 540 - else 541 - err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L, 542 - (struct dccp_so_feat __user *) 543 - optval); 544 - break; 545 - case DCCP_SOCKOPT_CHANGE_R: 546 - if (optlen != sizeof(struct dccp_so_feat)) 547 - err = -EINVAL; 548 - else 549 - err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R, 550 - (struct dccp_so_feat __user *) 551 - optval); 552 - break; 553 524 case DCCP_SOCKOPT_SERVER_TIMEWAIT: 554 525 if (dp->dccps_role != DCCP_ROLE_SERVER) 555 526 err = -EOPNOTSUPP; 556 527 else 557 528 dp->dccps_server_timewait = (val != 0); 558 529 break; 559 - case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 
9.2 */ 560 - if (val < 0 || val > 15) 530 + case DCCP_SOCKOPT_SEND_CSCOV: 531 + err = dccp_setsockopt_cscov(sk, val, false); 532 + break; 533 + case DCCP_SOCKOPT_RECV_CSCOV: 534 + err = dccp_setsockopt_cscov(sk, val, true); 535 + break; 536 + case DCCP_SOCKOPT_QPOLICY_ID: 537 + if (sk->sk_state != DCCP_CLOSED) 538 + err = -EISCONN; 539 + else if (val < 0 || val >= DCCPQ_POLICY_MAX) 561 540 err = -EINVAL; 562 541 else 563 - dp->dccps_pcslen = val; 542 + dp->dccps_qpolicy = val; 564 543 break; 565 - case DCCP_SOCKOPT_RECV_CSCOV: /* receiver side, RFC 4340 sec. 9.2.1 */ 566 - if (val < 0 || val > 15) 544 + case DCCP_SOCKOPT_QPOLICY_TXQLEN: 545 + if (val < 0) 567 546 err = -EINVAL; 568 - else { 569 - dp->dccps_pcrlen = val; 570 - /* FIXME: add feature negotiation, 571 - * ChangeL(MinimumChecksumCoverage, val) */ 572 - } 547 + else 548 + dp->dccps_tx_qlen = val; 573 549 break; 574 550 default: 575 551 err = -ENOPROTOOPT; 576 552 break; 577 553 } 578 - 579 554 release_sock(sk); 555 + 580 556 return err; 581 557 } 582 558 ··· 642 648 case DCCP_SOCKOPT_GET_CUR_MPS: 643 649 val = dp->dccps_mss_cache; 644 650 break; 651 + case DCCP_SOCKOPT_AVAILABLE_CCIDS: 652 + return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); 653 + case DCCP_SOCKOPT_TX_CCID: 654 + val = ccid_get_current_tx_ccid(dp); 655 + if (val < 0) 656 + return -ENOPROTOOPT; 657 + break; 658 + case DCCP_SOCKOPT_RX_CCID: 659 + val = ccid_get_current_rx_ccid(dp); 660 + if (val < 0) 661 + return -ENOPROTOOPT; 662 + break; 645 663 case DCCP_SOCKOPT_SERVER_TIMEWAIT: 646 664 val = dp->dccps_server_timewait; 647 665 break; ··· 662 656 break; 663 657 case DCCP_SOCKOPT_RECV_CSCOV: 664 658 val = dp->dccps_pcrlen; 659 + break; 660 + case DCCP_SOCKOPT_QPOLICY_ID: 661 + val = dp->dccps_qpolicy; 662 + break; 663 + case DCCP_SOCKOPT_QPOLICY_TXQLEN: 664 + val = dp->dccps_tx_qlen; 665 665 break; 666 666 case 128 ... 
191: 667 667 return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, ··· 711 699 EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); 712 700 #endif 713 701 702 + static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) 703 + { 704 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); 705 + 706 + /* 707 + * Assign an (opaque) qpolicy priority value to skb->priority. 708 + * 709 + * We are overloading this skb field for use with the qpolicy subystem. 710 + * The skb->priority is normally used for the SO_PRIORITY option, which 711 + * is initialised from sk_priority. Since the assignment of sk_priority 712 + * to skb->priority happens later (on layer 3), we overload this field 713 + * for use with queueing priorities as long as the skb is on layer 4. 714 + * The default priority value (if nothing is set) is 0. 715 + */ 716 + skb->priority = 0; 717 + 718 + for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { 719 + 720 + if (!CMSG_OK(msg, cmsg)) 721 + return -EINVAL; 722 + 723 + if (cmsg->cmsg_level != SOL_DCCP) 724 + continue; 725 + 726 + if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && 727 + !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) 728 + return -EINVAL; 729 + 730 + switch (cmsg->cmsg_type) { 731 + case DCCP_SCM_PRIORITY: 732 + if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) 733 + return -EINVAL; 734 + skb->priority = *(__u32 *)CMSG_DATA(cmsg); 735 + break; 736 + default: 737 + return -EINVAL; 738 + } 739 + } 740 + return 0; 741 + } 742 + 714 743 int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 715 744 size_t len) 716 745 { ··· 767 714 768 715 lock_sock(sk); 769 716 770 - if (sysctl_dccp_tx_qlen && 771 - (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) { 717 + if (dccp_qpolicy_full(sk)) { 772 718 rc = -EAGAIN; 773 719 goto out_release; 774 720 } ··· 795 743 if (rc != 0) 796 744 goto out_discard; 797 745 798 - skb_queue_tail(&sk->sk_write_queue, skb); 799 - dccp_write_xmit(sk,0); 746 + rc = dccp_msghdr_parse(msg, skb); 747 + if (rc != 
0) 748 + goto out_discard; 749 + 750 + dccp_qpolicy_push(sk, skb); 751 + dccp_write_xmit(sk); 800 752 out_release: 801 753 release_sock(sk); 802 754 return rc ? : len; ··· 1023 967 /* Check zero linger _after_ checking for unread data. */ 1024 968 sk->sk_prot->disconnect(sk, 0); 1025 969 } else if (sk->sk_state != DCCP_CLOSED) { 970 + /* 971 + * Normal connection termination. May need to wait if there are 972 + * still packets in the TX queue that are delayed by the CCID. 973 + */ 974 + dccp_flush_write_queue(sk, &timeout); 1026 975 dccp_terminate_connection(sk); 1027 976 } 977 + 978 + /* 979 + * Flush write queue. This may be necessary in several cases: 980 + * - we have been closed by the peer but still have application data; 981 + * - abortive termination (unread data or zero linger time), 982 + * - normal termination but queue could not be flushed within time limit 983 + */ 984 + __skb_queue_purge(&sk->sk_write_queue); 1028 985 1029 986 sk_stream_wait_close(sk, timeout); 1030 987
+137
net/dccp/qpolicy.c
··· 1 + /* 2 + * net/dccp/qpolicy.c 3 + * 4 + * Policy-based packet dequeueing interface for DCCP. 5 + * 6 + * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net> 7 + * 8 + * This program is free software; you can redistribute it and/or 9 + * modify it under the terms of the GNU General Public License v2 10 + * as published by the Free Software Foundation. 11 + */ 12 + #include "dccp.h" 13 + 14 + /* 15 + * Simple Dequeueing Policy: 16 + * If tx_qlen is different from 0, enqueue up to tx_qlen elements. 17 + */ 18 + static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) 19 + { 20 + skb_queue_tail(&sk->sk_write_queue, skb); 21 + } 22 + 23 + static bool qpolicy_simple_full(struct sock *sk) 24 + { 25 + return dccp_sk(sk)->dccps_tx_qlen && 26 + sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; 27 + } 28 + 29 + static struct sk_buff *qpolicy_simple_top(struct sock *sk) 30 + { 31 + return skb_peek(&sk->sk_write_queue); 32 + } 33 + 34 + /* 35 + * Priority-based Dequeueing Policy: 36 + * If tx_qlen is different from 0 and the queue has reached its upper bound 37 + * of tx_qlen elements, replace older packets lowest-priority-first. 
38 + */ 39 + static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) 40 + { 41 + struct sk_buff *skb, *best = NULL; 42 + 43 + skb_queue_walk(&sk->sk_write_queue, skb) 44 + if (best == NULL || skb->priority > best->priority) 45 + best = skb; 46 + return best; 47 + } 48 + 49 + static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) 50 + { 51 + struct sk_buff *skb, *worst = NULL; 52 + 53 + skb_queue_walk(&sk->sk_write_queue, skb) 54 + if (worst == NULL || skb->priority < worst->priority) 55 + worst = skb; 56 + return worst; 57 + } 58 + 59 + static bool qpolicy_prio_full(struct sock *sk) 60 + { 61 + if (qpolicy_simple_full(sk)) 62 + dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); 63 + return false; 64 + } 65 + 66 + /** 67 + * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface 68 + * @push: add a new @skb to the write queue 69 + * @full: indicates that no more packets will be admitted 70 + * @top: peeks at whatever the queueing policy defines as its `top' 71 + */ 72 + static struct dccp_qpolicy_operations { 73 + void (*push) (struct sock *sk, struct sk_buff *skb); 74 + bool (*full) (struct sock *sk); 75 + struct sk_buff* (*top) (struct sock *sk); 76 + __be32 params; 77 + 78 + } qpol_table[DCCPQ_POLICY_MAX] = { 79 + [DCCPQ_POLICY_SIMPLE] = { 80 + .push = qpolicy_simple_push, 81 + .full = qpolicy_simple_full, 82 + .top = qpolicy_simple_top, 83 + .params = 0, 84 + }, 85 + [DCCPQ_POLICY_PRIO] = { 86 + .push = qpolicy_simple_push, 87 + .full = qpolicy_prio_full, 88 + .top = qpolicy_prio_best_skb, 89 + .params = DCCP_SCM_PRIORITY, 90 + }, 91 + }; 92 + 93 + /* 94 + * Externally visible interface 95 + */ 96 + void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) 97 + { 98 + qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); 99 + } 100 + 101 + bool dccp_qpolicy_full(struct sock *sk) 102 + { 103 + return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); 104 + } 105 + 106 + void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) 107 + { 
108 + if (skb != NULL) { 109 + skb_unlink(skb, &sk->sk_write_queue); 110 + kfree_skb(skb); 111 + } 112 + } 113 + 114 + struct sk_buff *dccp_qpolicy_top(struct sock *sk) 115 + { 116 + return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); 117 + } 118 + 119 + struct sk_buff *dccp_qpolicy_pop(struct sock *sk) 120 + { 121 + struct sk_buff *skb = dccp_qpolicy_top(sk); 122 + 123 + /* Clear any skb fields that we used internally */ 124 + skb->priority = 0; 125 + 126 + if (skb) 127 + skb_unlink(skb, &sk->sk_write_queue); 128 + return skb; 129 + } 130 + 131 + bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) 132 + { 133 + /* check if exactly one bit is set */ 134 + if (!param || (param & (param - 1))) 135 + return false; 136 + return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; 137 + }
+30 -34
net/dccp/sysctl.c
··· 18 18 #error This file should not be compiled without CONFIG_SYSCTL defined 19 19 #endif 20 20 21 + /* Boundary values */ 22 + static int zero = 0, 23 + u8_max = 0xFF; 24 + static unsigned long seqw_min = 32; 25 + 21 26 static struct ctl_table dccp_default_table[] = { 22 27 { 23 28 .procname = "seq_window", 24 - .data = &sysctl_dccp_feat_sequence_window, 25 - .maxlen = sizeof(sysctl_dccp_feat_sequence_window), 29 + .data = &sysctl_dccp_sequence_window, 30 + .maxlen = sizeof(sysctl_dccp_sequence_window), 26 31 .mode = 0644, 27 - .proc_handler = proc_dointvec, 32 + .proc_handler = proc_doulongvec_minmax, 33 + .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ 28 34 }, 29 35 { 30 36 .procname = "rx_ccid", 31 - .data = &sysctl_dccp_feat_rx_ccid, 32 - .maxlen = sizeof(sysctl_dccp_feat_rx_ccid), 37 + .data = &sysctl_dccp_rx_ccid, 38 + .maxlen = sizeof(sysctl_dccp_rx_ccid), 33 39 .mode = 0644, 34 - .proc_handler = proc_dointvec, 40 + .proc_handler = proc_dointvec_minmax, 41 + .extra1 = &zero, 42 + .extra2 = &u8_max, /* RFC 4340, 10. 
*/ 35 43 }, 36 44 { 37 45 .procname = "tx_ccid", 38 - .data = &sysctl_dccp_feat_tx_ccid, 39 - .maxlen = sizeof(sysctl_dccp_feat_tx_ccid), 46 + .data = &sysctl_dccp_tx_ccid, 47 + .maxlen = sizeof(sysctl_dccp_tx_ccid), 40 48 .mode = 0644, 41 - .proc_handler = proc_dointvec, 42 - }, 43 - { 44 - .procname = "ack_ratio", 45 - .data = &sysctl_dccp_feat_ack_ratio, 46 - .maxlen = sizeof(sysctl_dccp_feat_ack_ratio), 47 - .mode = 0644, 48 - .proc_handler = proc_dointvec, 49 - }, 50 - { 51 - .procname = "send_ackvec", 52 - .data = &sysctl_dccp_feat_send_ack_vector, 53 - .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector), 54 - .mode = 0644, 55 - .proc_handler = proc_dointvec, 56 - }, 57 - { 58 - .procname = "send_ndp", 59 - .data = &sysctl_dccp_feat_send_ndp_count, 60 - .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count), 61 - .mode = 0644, 62 - .proc_handler = proc_dointvec, 49 + .proc_handler = proc_dointvec_minmax, 50 + .extra1 = &zero, 51 + .extra2 = &u8_max, /* RFC 4340, 10. */ 63 52 }, 64 53 { 65 54 .procname = "request_retries", 66 55 .data = &sysctl_dccp_request_retries, 67 56 .maxlen = sizeof(sysctl_dccp_request_retries), 68 57 .mode = 0644, 69 - .proc_handler = proc_dointvec, 58 + .proc_handler = proc_dointvec_minmax, 59 + .extra1 = &zero, 60 + .extra2 = &u8_max, 70 61 }, 71 62 { 72 63 .procname = "retries1", 73 64 .data = &sysctl_dccp_retries1, 74 65 .maxlen = sizeof(sysctl_dccp_retries1), 75 66 .mode = 0644, 76 - .proc_handler = proc_dointvec, 67 + .proc_handler = proc_dointvec_minmax, 68 + .extra1 = &zero, 69 + .extra2 = &u8_max, 77 70 }, 78 71 { 79 72 .procname = "retries2", 80 73 .data = &sysctl_dccp_retries2, 81 74 .maxlen = sizeof(sysctl_dccp_retries2), 82 75 .mode = 0644, 83 - .proc_handler = proc_dointvec, 76 + .proc_handler = proc_dointvec_minmax, 77 + .extra1 = &zero, 78 + .extra2 = &u8_max, 84 79 }, 85 80 { 86 81 .procname = "tx_qlen", 87 82 .data = &sysctl_dccp_tx_qlen, 88 83 .maxlen = sizeof(sysctl_dccp_tx_qlen), 89 84 .mode = 0644, 90 - .proc_handler = 
proc_dointvec, 85 + .proc_handler = proc_dointvec_minmax, 86 + .extra1 = &zero, 91 87 }, 92 88 { 93 89 .procname = "sync_ratelimit",
+16 -26
net/dccp/timer.c
··· 87 87 { 88 88 struct inet_connection_sock *icsk = inet_csk(sk); 89 89 90 - /* retransmit timer is used for feature negotiation throughout 91 - * connection. In this case, no packet is re-transmitted, but rather an 92 - * ack is generated and pending changes are placed into its options. 93 - */ 94 - if (sk->sk_send_head == NULL) { 95 - dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk); 96 - if (sk->sk_state == DCCP_OPEN) 97 - dccp_send_ack(sk); 98 - goto backoff; 99 - } 100 - 101 90 /* 102 91 * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was 103 92 * sent, no need to retransmit, this sock is dead. ··· 115 126 return; 116 127 } 117 128 118 - backoff: 119 129 icsk->icsk_backoff++; 120 130 121 131 icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); ··· 237 249 sock_put(sk); 238 250 } 239 251 240 - /* Transmit-delay timer: used by the CCIDs to delay actual send time */ 241 - static void dccp_write_xmit_timer(unsigned long data) 252 + /** 253 + * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface 254 + * See the comments above %ccid_dequeueing_decision for supported modes. 
255 + */ 256 + static void dccp_write_xmitlet(unsigned long data) 242 257 { 243 258 struct sock *sk = (struct sock *)data; 244 - struct dccp_sock *dp = dccp_sk(sk); 245 259 246 260 bh_lock_sock(sk); 247 261 if (sock_owned_by_user(sk)) 248 - sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1); 262 + sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); 249 263 else 250 - dccp_write_xmit(sk, 0); 264 + dccp_write_xmit(sk); 251 265 bh_unlock_sock(sk); 252 - sock_put(sk); 253 266 } 254 267 255 - static void dccp_init_write_xmit_timer(struct sock *sk) 268 + static void dccp_write_xmit_timer(unsigned long data) 256 269 { 257 - struct dccp_sock *dp = dccp_sk(sk); 258 - 259 - setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 260 - (unsigned long)sk); 270 + dccp_write_xmitlet(data); 271 + sock_put((struct sock *)data); 261 272 } 262 273 263 274 void dccp_init_xmit_timers(struct sock *sk) 264 275 { 265 - dccp_init_write_xmit_timer(sk); 276 + struct dccp_sock *dp = dccp_sk(sk); 277 + 278 + tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); 279 + setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 280 + (unsigned long)sk); 266 281 inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, 267 282 &dccp_keepalive_timer); 268 283 } ··· 281 290 { 282 291 s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); 283 292 284 - do_div(delta, 10); 285 - return delta; 293 + return div_u64(delta, DCCP_TIME_RESOLUTION); 286 294 } 287 295 EXPORT_SYMBOL_GPL(dccp_timestamp); 288 296
+2 -15
net/ipv4/tcp_input.c
··· 811 811 } 812 812 } 813 813 814 - /* Numbers are taken from RFC3390. 815 - * 816 - * John Heffner states: 817 - * 818 - * The RFC specifies a window of no more than 4380 bytes 819 - * unless 2*MSS > 4380. Reading the pseudocode in the RFC 820 - * is a bit misleading because they use a clamp at 4380 bytes 821 - * rather than use a multiplier in the relevant range. 822 - */ 823 814 __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) 824 815 { 825 816 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 826 817 827 - if (!cwnd) { 828 - if (tp->mss_cache > 1460) 829 - cwnd = 2; 830 - else 831 - cwnd = (tp->mss_cache > 1095) ? 3 : 4; 832 - } 818 + if (!cwnd) 819 + cwnd = rfc3390_bytes_to_packets(tp->mss_cache); 833 820 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 834 821 } 835 822