Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

This reverts "Merge branch 'dccp' of git://eden-feed.erg.abdn.ac.uk/dccp_exp" as it accidentally contained the wrong set of patches. These will be submitted separately. Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>

+3035 -4122
+12 -42
Documentation/networking/dccp.txt
··· 45 45 46 46 Socket options 47 47 ============== 48 - DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes 49 - a policy ID as argument and can only be set before the connection (i.e. changes 50 - during an established connection are not supported). Currently, two policies are 51 - defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, 52 - and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an 53 - u32 priority value as ancillary data to sendmsg(), where higher numbers indicate 54 - a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to 55 - be formatted using a cmsg(3) message header filled in as follows: 56 - cmsg->cmsg_level = SOL_DCCP; 57 - cmsg->cmsg_type = DCCP_SCM_PRIORITY; 58 - cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ 59 - 60 - DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero 61 - value is always interpreted as unbounded queue length. If different from zero, 62 - the interpretation of this parameter depends on the current dequeuing policy 63 - (see above): the "simple" policy will enforce a fixed queue size by returning 64 - EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the 65 - lowest-priority packet first. The default value for this parameter is 66 - initialised from /proc/sys/net/dccp/default/tx_qlen. 67 48 68 49 DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of 69 50 service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, ··· 56 75 57 76 DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet 58 77 size (application payload size) in bytes, see RFC 4340, section 14. 59 - 60 - DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs 61 - supported by the endpoint (see include/linux/dccp.h for symbolic constants). 
62 - The caller needs to provide a sufficiently large (> 2) array of type uint8_t. 63 - 64 - DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same 65 - time, combining the operation of the next two socket options. This option is 66 - preferrable over the latter two, since often applications will use the same 67 - type of CCID for both directions; and mixed use of CCIDs is not currently well 68 - understood. This socket option takes as argument at least one uint8_t value, or 69 - an array of uint8_t values, which must match available CCIDS (see above). CCIDs 70 - must be registered on the socket before calling connect() or listen(). 71 - 72 - DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets 73 - the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. 74 - Please note that the getsockopt argument type here is `int', not uint8_t. 75 - 76 - DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. 77 78 78 79 DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold 79 80 timewait state when closing the connection (RFC 4340, 8.3). The usual case is ··· 115 152 importance for retransmitted acknowledgments and feature negotiation, 116 153 data packets are never retransmitted. Analogue of tcp_retries2. 117 154 155 + send_ndp = 1 156 + Whether or not to send NDP count options (sec. 7.7.2). 157 + 158 + send_ackvec = 1 159 + Whether or not to send Ack Vector options (sec. 11.5). 160 + 161 + ack_ratio = 2 162 + The default Ack Ratio (sec. 11.3) to use. 163 + 118 164 tx_ccid = 2 119 - Default CCID for the sender-receiver half-connection. Depending on the 120 - choice of CCID, the Send Ack Vector feature is enabled automatically. 165 + Default CCID for the sender-receiver half-connection. 121 166 122 167 rx_ccid = 2 123 - Default CCID for the receiver-sender half-connection; see tx_ccid. 168 + Default CCID for the receiver-sender half-connection. 
124 169 125 170 seq_window = 100 126 - The initial sequence window (sec. 7.5.2) of the sender. This influences 127 - the local ackno validity and the remote seqno validity windows (7.5.1). 171 + The initial sequence window (sec. 7.5.2). 128 172 129 173 tx_qlen = 5 130 174 The size of the transmit buffer in packets. A value of 0 corresponds
+71 -51
include/linux/dccp.h
··· 165 165 DCCPO_TIMESTAMP_ECHO = 42, 166 166 DCCPO_ELAPSED_TIME = 43, 167 167 DCCPO_MAX = 45, 168 - DCCPO_MIN_RX_CCID_SPECIFIC = 128, /* from sender to receiver */ 169 - DCCPO_MAX_RX_CCID_SPECIFIC = 191, 170 - DCCPO_MIN_TX_CCID_SPECIFIC = 192, /* from receiver to sender */ 171 - DCCPO_MAX_TX_CCID_SPECIFIC = 255, 168 + DCCPO_MIN_CCID_SPECIFIC = 128, 169 + DCCPO_MAX_CCID_SPECIFIC = 255, 172 170 }; 173 - /* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */ 174 - #define DCCP_SINGLE_OPT_MAXLEN 253 175 171 176 172 /* DCCP CCIDS */ 177 173 enum { ··· 176 180 }; 177 181 178 182 /* DCCP features (RFC 4340 section 6.4) */ 179 - enum dccp_feature_numbers { 183 + enum { 180 184 DCCPF_RESERVED = 0, 181 185 DCCPF_CCID = 1, 182 - DCCPF_SHORT_SEQNOS = 2, 186 + DCCPF_SHORT_SEQNOS = 2, /* XXX: not yet implemented */ 183 187 DCCPF_SEQUENCE_WINDOW = 3, 184 - DCCPF_ECN_INCAPABLE = 4, 188 + DCCPF_ECN_INCAPABLE = 4, /* XXX: not yet implemented */ 185 189 DCCPF_ACK_RATIO = 5, 186 190 DCCPF_SEND_ACK_VECTOR = 6, 187 191 DCCPF_SEND_NDP_COUNT = 7, 188 192 DCCPF_MIN_CSUM_COVER = 8, 189 - DCCPF_DATA_CHECKSUM = 9, 193 + DCCPF_DATA_CHECKSUM = 9, /* XXX: not yet implemented */ 190 194 /* 10-127 reserved */ 191 195 DCCPF_MIN_CCID_SPECIFIC = 128, 192 - DCCPF_SEND_LEV_RATE = 192, /* RFC 4342, sec. 
8.4 */ 193 196 DCCPF_MAX_CCID_SPECIFIC = 255, 194 197 }; 195 198 196 - /* DCCP socket control message types for cmsg */ 197 - enum dccp_cmsg_type { 198 - DCCP_SCM_PRIORITY = 1, 199 - DCCP_SCM_QPOLICY_MAX = 0xFFFF, 200 - /* ^-- Up to here reserved exclusively for qpolicy parameters */ 201 - DCCP_SCM_MAX 202 - }; 203 - 204 - /* DCCP priorities for outgoing/queued packets */ 205 - enum dccp_packet_dequeueing_policy { 206 - DCCPQ_POLICY_SIMPLE, 207 - DCCPQ_POLICY_PRIO, 208 - DCCPQ_POLICY_MAX 199 + /* this structure is argument to DCCP_SOCKOPT_CHANGE_X */ 200 + struct dccp_so_feat { 201 + __u8 dccpsf_feat; 202 + __u8 __user *dccpsf_val; 203 + __u8 dccpsf_len; 209 204 }; 210 205 211 206 /* DCCP socket options */ ··· 208 221 #define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 209 222 #define DCCP_SOCKOPT_SEND_CSCOV 10 210 223 #define DCCP_SOCKOPT_RECV_CSCOV 11 211 - #define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 212 - #define DCCP_SOCKOPT_CCID 13 213 - #define DCCP_SOCKOPT_TX_CCID 14 214 - #define DCCP_SOCKOPT_RX_CCID 15 215 - #define DCCP_SOCKOPT_QPOLICY_ID 16 216 - #define DCCP_SOCKOPT_QPOLICY_TXQLEN 17 217 224 #define DCCP_SOCKOPT_CCID_RX_INFO 128 218 225 #define DCCP_SOCKOPT_CCID_TX_INFO 192 219 226 ··· 355 374 return __dccp_hdr_len(dccp_hdr(skb)); 356 375 } 357 376 377 + 378 + /* initial values for each feature */ 379 + #define DCCPF_INITIAL_SEQUENCE_WINDOW 100 380 + #define DCCPF_INITIAL_ACK_RATIO 2 381 + #define DCCPF_INITIAL_CCID DCCPC_CCID2 382 + #define DCCPF_INITIAL_SEND_ACK_VECTOR 1 383 + /* FIXME: for now we're default to 1 but it should really be 0 */ 384 + #define DCCPF_INITIAL_SEND_NDP_COUNT 1 385 + 386 + /** 387 + * struct dccp_minisock - Minimal DCCP connection representation 388 + * 389 + * Will be used to pass the state from dccp_request_sock to dccp_sock. 
390 + * 391 + * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) 392 + * @dccpms_ccid - Congestion Control Id (CCID) (section 10) 393 + * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) 394 + * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) 395 + * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3) 396 + * @dccpms_pending - List of features being negotiated 397 + * @dccpms_conf - 398 + */ 399 + struct dccp_minisock { 400 + __u64 dccpms_sequence_window; 401 + __u8 dccpms_rx_ccid; 402 + __u8 dccpms_tx_ccid; 403 + __u8 dccpms_send_ack_vector; 404 + __u8 dccpms_send_ndp_count; 405 + __u8 dccpms_ack_ratio; 406 + struct list_head dccpms_pending; 407 + struct list_head dccpms_conf; 408 + }; 409 + 410 + struct dccp_opt_conf { 411 + __u8 *dccpoc_val; 412 + __u8 dccpoc_len; 413 + }; 414 + 415 + struct dccp_opt_pend { 416 + struct list_head dccpop_node; 417 + __u8 dccpop_type; 418 + __u8 dccpop_feat; 419 + __u8 *dccpop_val; 420 + __u8 dccpop_len; 421 + int dccpop_conf; 422 + struct dccp_opt_conf *dccpop_sc; 423 + }; 424 + 425 + extern void dccp_minisock_init(struct dccp_minisock *dmsk); 426 + 358 427 /** 359 428 * struct dccp_request_sock - represent DCCP-specific connection request 360 429 * @dreq_inet_rsk: structure inherited from 361 430 * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) 362 431 * @dreq_isr: initial sequence number received on the Request 363 432 * @dreq_service: service code present on the Request (there is just one) 364 - * @dreq_featneg: feature negotiation options for this connection 365 433 * The following two fields are analogous to the ones in dccp_sock: 366 434 * @dreq_timestamp_echo: last received timestamp to echo (13.1) 367 435 * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo ··· 420 390 __u64 dreq_iss; 421 391 __u64 dreq_isr; 422 392 __be32 dreq_service; 423 - struct list_head dreq_featneg; 424 393 __u32 dreq_timestamp_echo; 425 394 __u32 
dreq_timestamp_time; 426 395 }; ··· 491 462 * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo 492 463 * @dccps_l_ack_ratio - feature-local Ack Ratio 493 464 * @dccps_r_ack_ratio - feature-remote Ack Ratio 494 - * @dccps_l_seq_win - local Sequence Window (influences ack number validity) 495 - * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) 496 465 * @dccps_pcslen - sender partial checksum coverage (via sockopt) 497 466 * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) 498 - * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) 499 467 * @dccps_ndp_count - number of Non Data Packets since last data packet 500 468 * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) 501 469 * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) 502 - * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) 470 + * @dccps_minisock - associated minisock (accessed via dccp_msk) 503 471 * @dccps_hc_rx_ackvec - rx half connection ack vector 504 472 * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) 505 473 * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) 506 474 * @dccps_options_received - parsed set of retrieved options 507 - * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy 508 - * @dccps_tx_qlen - maximum length of the TX queue 509 475 * @dccps_role - role of this sock, one of %dccp_role 510 476 * @dccps_hc_rx_insert_options - receiver wants to add options when acking 511 477 * @dccps_hc_tx_insert_options - sender wants to add options when sending 512 478 * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) 513 - * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" 514 - * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets 515 - * @dccps_xmit_timer - used by the TX CCID to delay sending 
(rate-based pacing) 479 + * @dccps_xmit_timer - timer for when CCID is not ready to send 516 480 * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) 517 481 */ 518 482 struct dccp_sock { ··· 529 507 __u32 dccps_timestamp_time; 530 508 __u16 dccps_l_ack_ratio; 531 509 __u16 dccps_r_ack_ratio; 532 - __u64 dccps_l_seq_win:48; 533 - __u64 dccps_r_seq_win:48; 534 - __u8 dccps_pcslen:4; 535 - __u8 dccps_pcrlen:4; 536 - __u8 dccps_send_ndp_count:1; 510 + __u16 dccps_pcslen; 511 + __u16 dccps_pcrlen; 537 512 __u64 dccps_ndp_count:48; 538 513 unsigned long dccps_rate_last; 539 - struct list_head dccps_featneg; 514 + struct dccp_minisock dccps_minisock; 540 515 struct dccp_ackvec *dccps_hc_rx_ackvec; 541 516 struct ccid *dccps_hc_rx_ccid; 542 517 struct ccid *dccps_hc_tx_ccid; 543 518 struct dccp_options_received dccps_options_received; 544 - __u8 dccps_qpolicy; 545 - __u32 dccps_tx_qlen; 546 519 enum dccp_role dccps_role:2; 547 520 __u8 dccps_hc_rx_insert_options:1; 548 521 __u8 dccps_hc_tx_insert_options:1; 549 522 __u8 dccps_server_timewait:1; 550 - __u8 dccps_sync_scheduled:1; 551 - struct tasklet_struct dccps_xmitlet; 552 523 struct timer_list dccps_xmit_timer; 553 524 }; 554 525 555 526 static inline struct dccp_sock *dccp_sk(const struct sock *sk) 556 527 { 557 528 return (struct dccp_sock *)sk; 529 + } 530 + 531 + static inline struct dccp_minisock *dccp_msk(const struct sock *sk) 532 + { 533 + return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock; 558 534 } 559 535 560 536 static inline const char *dccp_role(const struct sock *sk)
-15
include/net/tcp.h
··· 782 782 /* Use define here intentionally to get WARN_ON location shown at the caller */ 783 783 #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) 784 784 785 - /* 786 - * Convert RFC3390 larger initial windows into an equivalent number of packets. 787 - * 788 - * John Heffner states: 789 - * 790 - * The RFC specifies a window of no more than 4380 bytes 791 - * unless 2*MSS > 4380. Reading the pseudocode in the RFC 792 - * is a bit misleading because they use a clamp at 4380 bytes 793 - * rather than a multiplier in the relevant range. 794 - */ 795 - static inline u32 rfc3390_bytes_to_packets(const u32 bytes) 796 - { 797 - return bytes <= 1095 ? 4 : (bytes > 1460 ? 2 : 3); 798 - } 799 - 800 785 extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); 801 786 extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); 802 787
+3
net/dccp/Kconfig
··· 25 25 def_tristate y if (IP_DCCP = y && INET_DIAG = y) 26 26 def_tristate m 27 27 28 + config IP_DCCP_ACKVEC 29 + bool 30 + 28 31 source "net/dccp/ccids/Kconfig" 29 32 30 33 menu "DCCP Kernel Hacking"
+3 -2
net/dccp/Makefile
··· 1 1 obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o 2 2 3 - dccp-y := ccid.o feat.o input.o minisocks.o options.o \ 4 - qpolicy.o output.o proto.o timer.o ackvec.o 3 + dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o 5 4 6 5 dccp_ipv4-y := ipv4.o 7 6 8 7 # build dccp_ipv6 as module whenever either IPv6 or DCCP is a module 9 8 obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o 10 9 dccp_ipv6-y := ipv6.o 10 + 11 + dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o 11 12 12 13 obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o 13 14 obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
+390 -319
net/dccp/ackvec.c
··· 1 1 /* 2 2 * net/dccp/ackvec.c 3 3 * 4 - * An implementation of Ack Vectors for the DCCP protocol 5 - * Copyright (c) 2007 University of Aberdeen, Scotland, UK 4 + * An implementation of the DCCP protocol 6 5 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> 7 6 * 8 7 * This program is free software; you can redistribute it and/or modify it 9 8 * under the terms of the GNU General Public License as published by the 10 9 * Free Software Foundation; version 2 of the License; 11 10 */ 11 + 12 + #include "ackvec.h" 12 13 #include "dccp.h" 14 + 15 + #include <linux/dccp.h> 16 + #include <linux/init.h> 17 + #include <linux/errno.h> 13 18 #include <linux/kernel.h> 19 + #include <linux/skbuff.h> 14 20 #include <linux/slab.h> 21 + 22 + #include <net/sock.h> 15 23 16 24 static struct kmem_cache *dccp_ackvec_slab; 17 25 static struct kmem_cache *dccp_ackvec_record_slab; 18 26 19 - struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) 27 + static struct dccp_ackvec_record *dccp_ackvec_record_new(void) 20 28 { 21 - struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); 29 + struct dccp_ackvec_record *avr = 30 + kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); 22 31 23 - if (av != NULL) { 24 - av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; 25 - INIT_LIST_HEAD(&av->av_records); 26 - } 27 - return av; 32 + if (avr != NULL) 33 + INIT_LIST_HEAD(&avr->avr_node); 34 + 35 + return avr; 28 36 } 29 37 30 - static void dccp_ackvec_purge_records(struct dccp_ackvec *av) 38 + static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr) 31 39 { 32 - struct dccp_ackvec_record *cur, *next; 40 + if (unlikely(avr == NULL)) 41 + return; 42 + /* Check if deleting a linked record */ 43 + WARN_ON(!list_empty(&avr->avr_node)); 44 + kmem_cache_free(dccp_ackvec_record_slab, avr); 45 + } 33 46 34 - list_for_each_entry_safe(cur, next, &av->av_records, avr_node) 35 - kmem_cache_free(dccp_ackvec_record_slab, cur); 36 - 
INIT_LIST_HEAD(&av->av_records); 47 + static void dccp_ackvec_insert_avr(struct dccp_ackvec *av, 48 + struct dccp_ackvec_record *avr) 49 + { 50 + /* 51 + * AVRs are sorted by seqno. Since we are sending them in order, we 52 + * just add the AVR at the head of the list. 53 + * -sorbo. 54 + */ 55 + if (!list_empty(&av->av_records)) { 56 + const struct dccp_ackvec_record *head = 57 + list_entry(av->av_records.next, 58 + struct dccp_ackvec_record, 59 + avr_node); 60 + BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno)); 61 + } 62 + 63 + list_add(&avr->avr_node, &av->av_records); 64 + } 65 + 66 + int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) 67 + { 68 + struct dccp_sock *dp = dccp_sk(sk); 69 + struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; 70 + /* Figure out how many options do we need to represent the ackvec */ 71 + const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN); 72 + u16 len = av->av_vec_len + 2 * nr_opts, i; 73 + u32 elapsed_time; 74 + const unsigned char *tail, *from; 75 + unsigned char *to; 76 + struct dccp_ackvec_record *avr; 77 + suseconds_t delta; 78 + 79 + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) 80 + return -1; 81 + 82 + delta = ktime_us_delta(ktime_get_real(), av->av_time); 83 + elapsed_time = delta / 10; 84 + 85 + if (elapsed_time != 0 && 86 + dccp_insert_option_elapsed_time(sk, skb, elapsed_time)) 87 + return -1; 88 + 89 + avr = dccp_ackvec_record_new(); 90 + if (avr == NULL) 91 + return -1; 92 + 93 + DCCP_SKB_CB(skb)->dccpd_opt_len += len; 94 + 95 + to = skb_push(skb, len); 96 + len = av->av_vec_len; 97 + from = av->av_buf + av->av_buf_head; 98 + tail = av->av_buf + DCCP_MAX_ACKVEC_LEN; 99 + 100 + for (i = 0; i < nr_opts; ++i) { 101 + int copylen = len; 102 + 103 + if (len > DCCP_MAX_ACKVEC_OPT_LEN) 104 + copylen = DCCP_MAX_ACKVEC_OPT_LEN; 105 + 106 + *to++ = DCCPO_ACK_VECTOR_0; 107 + *to++ = copylen + 2; 108 + 109 + /* Check if buf_head wraps */ 110 + if (from + copylen > tail) 
{ 111 + const u16 tailsize = tail - from; 112 + 113 + memcpy(to, from, tailsize); 114 + to += tailsize; 115 + len -= tailsize; 116 + copylen -= tailsize; 117 + from = av->av_buf; 118 + } 119 + 120 + memcpy(to, from, copylen); 121 + from += copylen; 122 + to += copylen; 123 + len -= copylen; 124 + } 125 + 126 + /* 127 + * From RFC 4340, A.2: 128 + * 129 + * For each acknowledgement it sends, the HC-Receiver will add an 130 + * acknowledgement record. ack_seqno will equal the HC-Receiver 131 + * sequence number it used for the ack packet; ack_ptr will equal 132 + * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will 133 + * equal buf_nonce. 134 + */ 135 + avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; 136 + avr->avr_ack_ptr = av->av_buf_head; 137 + avr->avr_ack_ackno = av->av_buf_ackno; 138 + avr->avr_ack_nonce = av->av_buf_nonce; 139 + avr->avr_sent_len = av->av_vec_len; 140 + 141 + dccp_ackvec_insert_avr(av, avr); 142 + 143 + dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, " 144 + "ack_ackno=%llu\n", 145 + dccp_role(sk), avr->avr_sent_len, 146 + (unsigned long long)avr->avr_ack_seqno, 147 + (unsigned long long)avr->avr_ack_ackno); 148 + return 0; 149 + } 150 + 151 + struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) 152 + { 153 + struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority); 154 + 155 + if (av != NULL) { 156 + av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1; 157 + av->av_buf_ackno = UINT48_MAX + 1; 158 + av->av_buf_nonce = 0; 159 + av->av_time = ktime_set(0, 0); 160 + av->av_vec_len = 0; 161 + INIT_LIST_HEAD(&av->av_records); 162 + } 163 + 164 + return av; 37 165 } 38 166 39 167 void dccp_ackvec_free(struct dccp_ackvec *av) 40 168 { 41 - if (likely(av != NULL)) { 42 - dccp_ackvec_purge_records(av); 43 - kmem_cache_free(dccp_ackvec_slab, av); 169 + if (unlikely(av == NULL)) 170 + return; 171 + 172 + if (!list_empty(&av->av_records)) { 173 + struct dccp_ackvec_record *avr, *next; 174 + 175 + 
list_for_each_entry_safe(avr, next, &av->av_records, avr_node) { 176 + list_del_init(&avr->avr_node); 177 + dccp_ackvec_record_delete(avr); 178 + } 44 179 } 180 + 181 + kmem_cache_free(dccp_ackvec_slab, av); 45 182 } 46 183 47 - /** 48 - * dccp_ackvec_update_records - Record information about sent Ack Vectors 49 - * @av: Ack Vector records to update 50 - * @seqno: Sequence number of the packet carrying the Ack Vector just sent 51 - * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector 52 - */ 53 - int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) 184 + static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, 185 + const u32 index) 54 186 { 55 - struct dccp_ackvec_record *avr; 56 - 57 - avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); 58 - if (avr == NULL) 59 - return -ENOBUFS; 60 - 61 - avr->avr_ack_seqno = seqno; 62 - avr->avr_ack_ptr = av->av_buf_head; 63 - avr->avr_ack_ackno = av->av_buf_ackno; 64 - avr->avr_ack_nonce = nonce_sum; 65 - avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); 66 - /* 67 - * When the buffer overflows, we keep no more than one record. This is 68 - * the simplest way of disambiguating sender-Acks dating from before the 69 - * overflow from sender-Acks which refer to after the overflow; a simple 70 - * solution is preferable here since we are handling an exception. 71 - */ 72 - if (av->av_overflow) 73 - dccp_ackvec_purge_records(av); 74 - /* 75 - * Since GSS is incremented for each packet, the list is automatically 76 - * arranged in descending order of @ack_seqno. 
77 - */ 78 - list_add(&avr->avr_node, &av->av_records); 79 - 80 - dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", 81 - (unsigned long long)avr->avr_ack_seqno, 82 - (unsigned long long)avr->avr_ack_ackno, 83 - avr->avr_ack_runlen); 84 - return 0; 187 + return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK; 85 188 } 86 189 87 - static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, 88 - const u64 ackno) 190 + static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, 191 + const u32 index) 89 192 { 90 - struct dccp_ackvec_record *avr; 91 - /* 92 - * Exploit that records are inserted in descending order of sequence 93 - * number, start with the oldest record first. If @ackno is `before' 94 - * the earliest ack_ackno, the packet is too old to be considered. 95 - */ 96 - list_for_each_entry_reverse(avr, av_list, avr_node) { 97 - if (avr->avr_ack_seqno == ackno) 98 - return avr; 99 - if (before48(ackno, avr->avr_ack_seqno)) 100 - break; 101 - } 102 - return NULL; 193 + return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK; 103 194 } 104 195 105 196 /* 106 - * Buffer index and length computation using modulo-buffersize arithmetic. 107 - * Note that, as pointers move from right to left, head is `before' tail. 197 + * If several packets are missing, the HC-Receiver may prefer to enter multiple 198 + * bytes with run length 0, rather than a single byte with a larger run length; 199 + * this simplifies table updates if one of the missing packets arrives. 
108 200 */ 109 - static inline u16 __ackvec_idx_add(const u16 a, const u16 b) 201 + static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, 202 + const unsigned int packets, 203 + const unsigned char state) 110 204 { 111 - return (a + b) % DCCPAV_MAX_ACKVEC_LEN; 112 - } 205 + unsigned int gap; 206 + long new_head; 113 207 114 - static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) 115 - { 116 - return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); 117 - } 208 + if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN) 209 + return -ENOBUFS; 118 210 119 - u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) 120 - { 121 - if (unlikely(av->av_overflow)) 122 - return DCCPAV_MAX_ACKVEC_LEN; 123 - return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); 124 - } 211 + gap = packets - 1; 212 + new_head = av->av_buf_head - packets; 125 213 126 - /** 127 - * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 128 - * @av: non-empty buffer to update 129 - * @distance: negative or zero distance of @seqno from buf_ackno downward 130 - * @seqno: the (old) sequence number whose record is to be updated 131 - * @state: state in which packet carrying @seqno was received 132 - */ 133 - static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, 134 - u64 seqno, enum dccp_ackvec_states state) 135 - { 136 - u16 ptr = av->av_buf_head; 137 - 138 - BUG_ON(distance > 0); 139 - if (unlikely(dccp_ackvec_is_empty(av))) 140 - return; 141 - 142 - do { 143 - u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); 144 - 145 - if (distance + runlen >= 0) { 146 - /* 147 - * Only update the state if packet has not been received 148 - * yet. This is OK as per the second table in RFC 4340, 149 - * 11.4.1; i.e. 
here we are using the following table: 150 - * RECEIVED 151 - * 0 1 3 152 - * S +---+---+---+ 153 - * T 0 | 0 | 0 | 0 | 154 - * O +---+---+---+ 155 - * R 1 | 1 | 1 | 1 | 156 - * E +---+---+---+ 157 - * D 3 | 0 | 1 | 3 | 158 - * +---+---+---+ 159 - * The "Not Received" state was set by reserve_seats(). 160 - */ 161 - if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED) 162 - av->av_buf[ptr] = state; 163 - else 164 - dccp_pr_debug("Not changing %llu state to %u\n", 165 - (unsigned long long)seqno, state); 166 - break; 214 + if (new_head < 0) { 215 + if (gap > 0) { 216 + memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED, 217 + gap + new_head + 1); 218 + gap = -new_head; 167 219 } 168 - 169 - distance += runlen + 1; 170 - ptr = __ackvec_idx_add(ptr, 1); 171 - 172 - } while (ptr != av->av_buf_tail); 173 - } 174 - 175 - /* Mark @num entries after buf_head as "Not yet received". */ 176 - static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) 177 - { 178 - u16 start = __ackvec_idx_add(av->av_buf_head, 1), 179 - len = DCCPAV_MAX_ACKVEC_LEN - start; 180 - 181 - /* check for buffer wrap-around */ 182 - if (num > len) { 183 - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len); 184 - start = 0; 185 - num -= len; 186 - } 187 - if (num) 188 - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num); 189 - } 190 - 191 - /** 192 - * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer 193 - * @av: container of buffer to update (can be empty or non-empty) 194 - * @num_packets: number of packets to register (must be >= 1) 195 - * @seqno: sequence number of the first packet in @num_packets 196 - * @state: state in which packet carrying @seqno was received 197 - */ 198 - static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, 199 - u64 seqno, enum dccp_ackvec_states state) 200 - { 201 - u32 num_cells = num_packets; 202 - 203 - if (num_packets > DCCPAV_BURST_THRESH) { 204 - u32 lost_packets = num_packets - 1; 205 - 206 - DCCP_WARN("Warning: 
large burst loss (%u)\n", lost_packets); 207 - /* 208 - * We received 1 packet and have a loss of size "num_packets-1" 209 - * which we squeeze into num_cells-1 rather than reserving an 210 - * entire byte for each lost packet. 211 - * The reason is that the vector grows in O(burst_length); when 212 - * it grows too large there will no room left for the payload. 213 - * This is a trade-off: if a few packets out of the burst show 214 - * up later, their state will not be changed; it is simply too 215 - * costly to reshuffle/reallocate/copy the buffer each time. 216 - * Should such problems persist, we will need to switch to a 217 - * different underlying data structure. 218 - */ 219 - for (num_packets = num_cells = 1; lost_packets; ++num_cells) { 220 - u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN); 221 - 222 - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); 223 - av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; 224 - 225 - lost_packets -= len; 226 - } 220 + new_head += DCCP_MAX_ACKVEC_LEN; 227 221 } 228 222 229 - if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { 230 - DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n"); 231 - av->av_overflow = true; 232 - } 223 + av->av_buf_head = new_head; 233 224 234 - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets); 235 - if (av->av_overflow) 236 - av->av_buf_tail = av->av_buf_head; 225 + if (gap > 0) 226 + memset(av->av_buf + av->av_buf_head + 1, 227 + DCCP_ACKVEC_STATE_NOT_RECEIVED, gap); 237 228 238 229 av->av_buf[av->av_buf_head] = state; 239 - av->av_buf_ackno = seqno; 240 - 241 - if (num_packets > 1) 242 - dccp_ackvec_reserve_seats(av, num_packets - 1); 243 - } 244 - 245 - /** 246 - * dccp_ackvec_input - Register incoming packet in the buffer 247 - */ 248 - void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) 249 - { 250 - u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; 251 - enum dccp_ackvec_states state = DCCPAV_RECEIVED; 252 - 253 - if 
(dccp_ackvec_is_empty(av)) { 254 - dccp_ackvec_add_new(av, 1, seqno, state); 255 - av->av_tail_ackno = seqno; 256 - 257 - } else { 258 - s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); 259 - u8 *current_head = av->av_buf + av->av_buf_head; 260 - 261 - if (num_packets == 1 && 262 - dccp_ackvec_state(current_head) == state && 263 - dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) { 264 - 265 - *current_head += 1; 266 - av->av_buf_ackno = seqno; 267 - 268 - } else if (num_packets > 0) { 269 - dccp_ackvec_add_new(av, num_packets, seqno, state); 270 - } else { 271 - dccp_ackvec_update_old(av, num_packets, seqno, state); 272 - } 273 - } 274 - } 275 - 276 - /** 277 - * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection 278 - * This routine is called when the peer acknowledges the receipt of Ack Vectors 279 - * up to and including @ackno. While based on on section A.3 of RFC 4340, here 280 - * are additional precautions to prevent corrupted buffer state. In particular, 281 - * we use tail_ackno to identify outdated records; it always marks the earliest 282 - * packet of group (2) in 11.4.2. 283 - */ 284 - void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno) 285 - { 286 - struct dccp_ackvec_record *avr, *next; 287 - u8 runlen_now, eff_runlen; 288 - s64 delta; 289 - 290 - avr = dccp_ackvec_lookup(&av->av_records, ackno); 291 - if (avr == NULL) 292 - return; 293 - /* 294 - * Deal with outdated acknowledgments: this arises when e.g. there are 295 - * several old records and the acks from the peer come in slowly. In 296 - * that case we may still have records that pre-date tail_ackno. 297 - */ 298 - delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno); 299 - if (delta < 0) 300 - goto free_records; 301 - /* 302 - * Deal with overlapping Ack Vectors: don't subtract more than the 303 - * number of packets between tail_ackno and ack_ackno. 304 - */ 305 - eff_runlen = delta < avr->avr_ack_runlen ? 
delta : avr->avr_ack_runlen; 306 - 307 - runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); 308 - /* 309 - * The run length of Ack Vector cells does not decrease over time. If 310 - * the run length is the same as at the time the Ack Vector was sent, we 311 - * free the ack_ptr cell. That cell can however not be freed if the run 312 - * length has increased: in this case we need to move the tail pointer 313 - * backwards (towards higher indices), to its next-oldest neighbour. 314 - */ 315 - if (runlen_now > eff_runlen) { 316 - 317 - av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; 318 - av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); 319 - 320 - /* This move may not have cleared the overflow flag. */ 321 - if (av->av_overflow) 322 - av->av_overflow = (av->av_buf_head == av->av_buf_tail); 323 - } else { 324 - av->av_buf_tail = avr->avr_ack_ptr; 325 - /* 326 - * We have made sure that avr points to a valid cell within the 327 - * buffer. This cell is either older than head, or equals head 328 - * (empty buffer): in both cases we no longer have any overflow. 329 - */ 330 - av->av_overflow = 0; 331 - } 332 - 333 - /* 334 - * The peer has acknowledged up to and including ack_ackno. Hence the 335 - * first packet in group (2) of 11.4.2 is the successor of ack_ackno. 
336 - */ 337 - av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); 338 - 339 - free_records: 340 - list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { 341 - list_del(&avr->avr_node); 342 - kmem_cache_free(dccp_ackvec_record_slab, avr); 343 - } 230 + av->av_vec_len += packets; 231 + return 0; 344 232 } 345 233 346 234 /* 347 - * Routines to keep track of Ack Vectors received in an skb 235 + * Implements the RFC 4340, Appendix A 348 236 */ 349 - int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce) 237 + int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, 238 + const u64 ackno, const u8 state) 350 239 { 351 - struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); 240 + /* 241 + * Check at the right places if the buffer is full, if it is, tell the 242 + * caller to start dropping packets till the HC-Sender acks our ACK 243 + * vectors, when we will free up space in av_buf. 244 + * 245 + * We may well decide to do buffer compression, etc, but for now lets 246 + * just drop. 247 + * 248 + * From Appendix A.1.1 (`New Packets'): 249 + * 250 + * Of course, the circular buffer may overflow, either when the 251 + * HC-Sender is sending data at a very high rate, when the 252 + * HC-Receiver's acknowledgements are not reaching the HC-Sender, 253 + * or when the HC-Sender is forgetting to acknowledge those acks 254 + * (so the HC-Receiver is unable to clean up old state). In this 255 + * case, the HC-Receiver should either compress the buffer (by 256 + * increasing run lengths when possible), transfer its state to 257 + * a larger buffer, or, as a last resort, drop all received 258 + * packets, without processing them whatsoever, until its buffer 259 + * shrinks again. 
260 + */ 352 261 353 - if (new == NULL) 354 - return -ENOBUFS; 355 - new->vec = vec; 356 - new->len = len; 357 - new->nonce = nonce; 262 + /* See if this is the first ackno being inserted */ 263 + if (av->av_vec_len == 0) { 264 + av->av_buf[av->av_buf_head] = state; 265 + av->av_vec_len = 1; 266 + } else if (after48(ackno, av->av_buf_ackno)) { 267 + const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno); 358 268 359 - list_add_tail(&new->node, head); 269 + /* 270 + * Look if the state of this packet is the same as the 271 + * previous ackno and if so if we can bump the head len. 272 + */ 273 + if (delta == 1 && 274 + dccp_ackvec_state(av, av->av_buf_head) == state && 275 + dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK) 276 + av->av_buf[av->av_buf_head]++; 277 + else if (dccp_ackvec_set_buf_head_state(av, delta, state)) 278 + return -ENOBUFS; 279 + } else { 280 + /* 281 + * A.1.2. Old Packets 282 + * 283 + * When a packet with Sequence Number S <= buf_ackno 284 + * arrives, the HC-Receiver will scan the table for 285 + * the byte corresponding to S. (Indexing structures 286 + * could reduce the complexity of this scan.) 287 + */ 288 + u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno); 289 + u32 index = av->av_buf_head; 290 + 291 + while (1) { 292 + const u8 len = dccp_ackvec_len(av, index); 293 + const u8 av_state = dccp_ackvec_state(av, index); 294 + /* 295 + * valid packets not yet in av_buf have a reserved 296 + * entry, with a len equal to 0. 297 + */ 298 + if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED && 299 + len == 0 && delta == 0) { /* Found our 300 + reserved seat! 
*/ 301 + dccp_pr_debug("Found %llu reserved seat!\n", 302 + (unsigned long long)ackno); 303 + av->av_buf[index] = state; 304 + goto out; 305 + } 306 + /* len == 0 means one packet */ 307 + if (delta < len + 1) 308 + goto out_duplicate; 309 + 310 + delta -= len + 1; 311 + if (++index == DCCP_MAX_ACKVEC_LEN) 312 + index = 0; 313 + } 314 + } 315 + 316 + av->av_buf_ackno = ackno; 317 + av->av_time = ktime_get_real(); 318 + out: 319 + return 0; 320 + 321 + out_duplicate: 322 + /* Duplicate packet */ 323 + dccp_pr_debug("Received a dup or already considered lost " 324 + "packet: %llu\n", (unsigned long long)ackno); 325 + return -EILSEQ; 326 + } 327 + 328 + static void dccp_ackvec_throw_record(struct dccp_ackvec *av, 329 + struct dccp_ackvec_record *avr) 330 + { 331 + struct dccp_ackvec_record *next; 332 + 333 + /* sort out vector length */ 334 + if (av->av_buf_head <= avr->avr_ack_ptr) 335 + av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head; 336 + else 337 + av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 - 338 + av->av_buf_head + avr->avr_ack_ptr; 339 + 340 + /* free records */ 341 + list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { 342 + list_del_init(&avr->avr_node); 343 + dccp_ackvec_record_delete(avr); 344 + } 345 + } 346 + 347 + void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, 348 + const u64 ackno) 349 + { 350 + struct dccp_ackvec_record *avr; 351 + 352 + /* 353 + * If we traverse backwards, it should be faster when we have large 354 + * windows. We will be receiving ACKs for stuff we sent a while back 355 + * -sorbo. 
356 + */ 357 + list_for_each_entry_reverse(avr, &av->av_records, avr_node) { 358 + if (ackno == avr->avr_ack_seqno) { 359 + dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, " 360 + "ack_ackno=%llu, ACKED!\n", 361 + dccp_role(sk), 1, 362 + (unsigned long long)avr->avr_ack_seqno, 363 + (unsigned long long)avr->avr_ack_ackno); 364 + dccp_ackvec_throw_record(av, avr); 365 + break; 366 + } else if (avr->avr_ack_seqno > ackno) 367 + break; /* old news */ 368 + } 369 + } 370 + 371 + static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, 372 + struct sock *sk, u64 *ackno, 373 + const unsigned char len, 374 + const unsigned char *vector) 375 + { 376 + unsigned char i; 377 + struct dccp_ackvec_record *avr; 378 + 379 + /* Check if we actually sent an ACK vector */ 380 + if (list_empty(&av->av_records)) 381 + return; 382 + 383 + i = len; 384 + /* 385 + * XXX 386 + * I think it might be more efficient to work backwards. See comment on 387 + * rcv_ackno. -sorbo. 388 + */ 389 + avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node); 390 + while (i--) { 391 + const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; 392 + u64 ackno_end_rl; 393 + 394 + dccp_set_seqno(&ackno_end_rl, *ackno - rl); 395 + 396 + /* 397 + * If our AVR sequence number is greater than the ack, go 398 + * forward in the AVR list until it is not so. 
399 + */ 400 + list_for_each_entry_from(avr, &av->av_records, avr_node) { 401 + if (!after48(avr->avr_ack_seqno, *ackno)) 402 + goto found; 403 + } 404 + /* End of the av_records list, not found, exit */ 405 + break; 406 + found: 407 + if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) { 408 + const u8 state = *vector & DCCP_ACKVEC_STATE_MASK; 409 + if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) { 410 + dccp_pr_debug("%s ACK vector 0, len=%d, " 411 + "ack_seqno=%llu, ack_ackno=%llu, " 412 + "ACKED!\n", 413 + dccp_role(sk), len, 414 + (unsigned long long) 415 + avr->avr_ack_seqno, 416 + (unsigned long long) 417 + avr->avr_ack_ackno); 418 + dccp_ackvec_throw_record(av, avr); 419 + break; 420 + } 421 + /* 422 + * If it wasn't received, continue scanning... we might 423 + * find another one. 424 + */ 425 + } 426 + 427 + dccp_set_seqno(ackno, ackno_end_rl - 1); 428 + ++vector; 429 + } 430 + } 431 + 432 + int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, 433 + u64 *ackno, const u8 opt, const u8 *value, const u8 len) 434 + { 435 + if (len > DCCP_MAX_ACKVEC_OPT_LEN) 436 + return -1; 437 + 438 + /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */ 439 + dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk, 440 + ackno, len, value); 360 441 return 0; 361 442 } 362 - EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add); 363 - 364 - void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks) 365 - { 366 - struct dccp_ackvec_parsed *cur, *next; 367 - 368 - list_for_each_entry_safe(cur, next, parsed_chunks, node) 369 - kfree(cur); 370 - INIT_LIST_HEAD(parsed_chunks); 371 - } 372 - EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); 373 443 374 444 int __init dccp_ackvec_init(void) 375 445 { ··· 449 379 if (dccp_ackvec_slab == NULL) 450 380 goto out_err; 451 381 452 - dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record", 453 - sizeof(struct dccp_ackvec_record), 454 - 0, SLAB_HWCACHE_ALIGN, NULL); 382 + 
dccp_ackvec_record_slab = 383 + kmem_cache_create("dccp_ackvec_record", 384 + sizeof(struct dccp_ackvec_record), 385 + 0, SLAB_HWCACHE_ALIGN, NULL); 455 386 if (dccp_ackvec_record_slab == NULL) 456 387 goto out_destroy_slab; 457 388
+113 -91
net/dccp/ackvec.h
··· 3 3 /* 4 4 * net/dccp/ackvec.h 5 5 * 6 - * An implementation of Ack Vectors for the DCCP protocol 7 - * Copyright (c) 2007 University of Aberdeen, Scotland, UK 6 + * An implementation of the DCCP protocol 8 7 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com> 8 + * 9 9 * This program is free software; you can redistribute it and/or modify it 10 10 * under the terms of the GNU General Public License version 2 as 11 11 * published by the Free Software Foundation. 12 12 */ 13 13 14 - #include <linux/dccp.h> 15 14 #include <linux/compiler.h> 15 + #include <linux/ktime.h> 16 16 #include <linux/list.h> 17 17 #include <linux/types.h> 18 18 19 - /* 20 - * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN, 21 - * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 22 - * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives 23 - * more headroom if Ack Ratio is higher or when the sender acknowledges slowly. 24 - * The maximum value is bounded by the u16 types for indices and functions. 
25 - */ 26 - #define DCCPAV_NUM_ACKVECS 2 27 - #define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS) 19 + /* Read about the ECN nonce to see why it is 253 */ 20 + #define DCCP_MAX_ACKVEC_OPT_LEN 253 21 + /* We can spread an ack vector across multiple options */ 22 + #define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2) 28 23 29 - /* Estimated minimum average Ack Vector length - used for updating MPS */ 30 - #define DCCPAV_MIN_OPTLEN 16 24 + #define DCCP_ACKVEC_STATE_RECEIVED 0 25 + #define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) 26 + #define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6) 31 27 32 - /* Threshold for coping with large bursts of losses */ 33 - #define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8) 28 + #define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */ 29 + #define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */ 34 30 35 - enum dccp_ackvec_states { 36 - DCCPAV_RECEIVED = 0x00, 37 - DCCPAV_ECN_MARKED = 0x40, 38 - DCCPAV_RESERVED = 0x80, 39 - DCCPAV_NOT_RECEIVED = 0xC0 40 - }; 41 - #define DCCPAV_MAX_RUNLEN 0x3F 42 - 43 - static inline u8 dccp_ackvec_runlen(const u8 *cell) 44 - { 45 - return *cell & DCCPAV_MAX_RUNLEN; 46 - } 47 - 48 - static inline u8 dccp_ackvec_state(const u8 *cell) 49 - { 50 - return *cell & ~DCCPAV_MAX_RUNLEN; 51 - } 52 - 53 - /** struct dccp_ackvec - Ack Vector main data structure 31 + /** struct dccp_ackvec - ack vector 54 32 * 55 - * This implements a fixed-size circular buffer within an array and is largely 56 - * based on Appendix A of RFC 4340. 33 + * This data structure is the one defined in RFC 4340, Appendix A. 
57 34 * 58 - * @av_buf: circular buffer storage area 59 - * @av_buf_head: head index; begin of live portion in @av_buf 60 - * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf 61 - * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf 62 - * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf 63 - * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to 64 - * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf 65 - * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound 66 - * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) 35 + * @av_buf_head - circular buffer head 36 + * @av_buf_tail - circular buffer tail 37 + * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the 38 + * buffer (i.e. %av_buf_head) 39 + * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked 40 + * by the buffer with State 0 41 + * 42 + * Additionally, the HC-Receiver must keep some information about the 43 + * Ack Vectors it has recently sent. For each packet sent carrying an 44 + * Ack Vector, it remembers four variables: 45 + * 46 + * @av_records - list of dccp_ackvec_record 47 + * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. 
48 + * 49 + * @av_time - the time in usecs 50 + * @av_buf - circular buffer of acknowledgeable packets 67 51 */ 68 52 struct dccp_ackvec { 69 - u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; 70 - u16 av_buf_head; 71 - u16 av_buf_tail; 72 - u64 av_buf_ackno:48; 73 - u64 av_tail_ackno:48; 74 - bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; 75 - u8 av_overflow:1; 53 + u64 av_buf_ackno; 76 54 struct list_head av_records; 55 + ktime_t av_time; 56 + u16 av_buf_head; 57 + u16 av_vec_len; 58 + u8 av_buf_nonce; 59 + u8 av_ack_nonce; 60 + u8 av_buf[DCCP_MAX_ACKVEC_LEN]; 77 61 }; 78 62 79 - /** struct dccp_ackvec_record - Records information about sent Ack Vectors 63 + /** struct dccp_ackvec_record - ack vector record 80 64 * 81 - * These list entries define the additional information which the HC-Receiver 82 - * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A. 65 + * ACK vector record as defined in Appendix A of spec. 83 66 * 84 - * @avr_node: the list node in @av_records 85 - * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on 86 - * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to 87 - * @avr_ack_ptr: pointer into @av_buf where this record starts 88 - * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending 89 - * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent 67 + * The list is sorted by avr_ack_seqno 90 68 * 91 - * The list as a whole is sorted in descending order by @avr_ack_seqno. 
69 + * @avr_node - node in av_records 70 + * @avr_ack_seqno - sequence number of the packet this record was sent on 71 + * @avr_ack_ackno - sequence number being acknowledged 72 + * @avr_ack_ptr - pointer into av_buf where this record starts 73 + * @avr_ack_nonce - av_ack_nonce at the time this record was sent 74 + * @avr_sent_len - lenght of the record in av_buf 92 75 */ 93 76 struct dccp_ackvec_record { 94 77 struct list_head avr_node; 95 - u64 avr_ack_seqno:48; 96 - u64 avr_ack_ackno:48; 78 + u64 avr_ack_seqno; 79 + u64 avr_ack_ackno; 97 80 u16 avr_ack_ptr; 98 - u8 avr_ack_runlen; 99 - u8 avr_ack_nonce:1; 81 + u16 avr_sent_len; 82 + u8 avr_ack_nonce; 100 83 }; 101 84 102 - extern int dccp_ackvec_init(void); 85 + struct sock; 86 + struct sk_buff; 87 + 88 + #ifdef CONFIG_IP_DCCP_ACKVEC 89 + extern int dccp_ackvec_init(void); 103 90 extern void dccp_ackvec_exit(void); 104 91 105 92 extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); 106 93 extern void dccp_ackvec_free(struct dccp_ackvec *av); 107 94 108 - extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); 109 - extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); 110 - extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); 111 - extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); 95 + extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, 96 + const u64 ackno, const u8 state); 112 97 113 - static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) 98 + extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, 99 + struct sock *sk, const u64 ackno); 100 + extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, 101 + u64 *ackno, const u8 opt, 102 + const u8 *value, const u8 len); 103 + 104 + extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb); 105 + 106 + static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) 114 107 { 115 - return 
av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; 108 + return av->av_vec_len; 109 + } 110 + #else /* CONFIG_IP_DCCP_ACKVEC */ 111 + static inline int dccp_ackvec_init(void) 112 + { 113 + return 0; 116 114 } 117 115 118 - /** 119 - * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb 120 - * @vec: start of vector (offset into skb) 121 - * @len: length of @vec 122 - * @nonce: whether @vec had an ECN nonce of 0 or 1 123 - * @node: FIFO - arranged in descending order of ack_ackno 124 - * This structure is used by CCIDs to access Ack Vectors in a received skb. 125 - */ 126 - struct dccp_ackvec_parsed { 127 - u8 *vec, 128 - len, 129 - nonce:1; 130 - struct list_head node; 131 - }; 116 + static inline void dccp_ackvec_exit(void) 117 + { 118 + } 132 119 133 - extern int dccp_ackvec_parsed_add(struct list_head *head, 134 - u8 *vec, u8 len, u8 nonce); 135 - extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks); 120 + static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) 121 + { 122 + return NULL; 123 + } 124 + 125 + static inline void dccp_ackvec_free(struct dccp_ackvec *av) 126 + { 127 + } 128 + 129 + static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, 130 + const u64 ackno, const u8 state) 131 + { 132 + return -1; 133 + } 134 + 135 + static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, 136 + struct sock *sk, const u64 ackno) 137 + { 138 + } 139 + 140 + static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, 141 + const u64 *ackno, const u8 opt, 142 + const u8 *value, const u8 len) 143 + { 144 + return -1; 145 + } 146 + 147 + static inline int dccp_insert_option_ackvec(const struct sock *sk, 148 + const struct sk_buff *skb) 149 + { 150 + return -1; 151 + } 152 + 153 + static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) 154 + { 155 + return 0; 156 + } 157 + #endif /* CONFIG_IP_DCCP_ACKVEC */ 136 158 #endif /* _ACKVEC_H */
+24 -77
net/dccp/ccid.c
··· 13 13 14 14 #include "ccid.h" 15 15 16 - static u8 builtin_ccids[] = { 17 - DCCPC_CCID2, /* CCID2 is supported by default */ 18 - #if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE) 19 - DCCPC_CCID3, 20 - #endif 21 - }; 22 - 23 16 static struct ccid_operations *ccids[CCID_MAX]; 24 17 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) 25 18 static atomic_t ccids_lockct = ATOMIC_INIT(0); ··· 86 93 } 87 94 } 88 95 89 - /* check that up to @array_len members in @ccid_array are supported */ 90 - bool ccid_support_check(u8 const *ccid_array, u8 array_len) 91 - { 92 - u8 i, j, found; 93 - 94 - for (i = 0, found = 0; i < array_len; i++, found = 0) { 95 - for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++) 96 - found = (ccid_array[i] == builtin_ccids[j]); 97 - if (!found) 98 - return false; 99 - } 100 - return true; 101 - } 102 - 103 - /** 104 - * ccid_get_builtin_ccids - Provide copy of `builtin' CCID array 105 - * @ccid_array: pointer to copy into 106 - * @array_len: value to return length into 107 - * This function allocates memory - caller must see that it is freed after use. 
108 - */ 109 - int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) 110 - { 111 - *ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any()); 112 - if (*ccid_array == NULL) 113 - return -ENOBUFS; 114 - *array_len = ARRAY_SIZE(builtin_ccids); 115 - return 0; 116 - } 117 - 118 - int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, 119 - char __user *optval, int __user *optlen) 120 - { 121 - if (len < sizeof(builtin_ccids)) 122 - return -EINVAL; 123 - 124 - if (put_user(sizeof(builtin_ccids), optlen) || 125 - copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids))) 126 - return -EFAULT; 127 - return 0; 128 - } 129 - 130 96 int ccid_register(struct ccid_operations *ccid_ops) 131 97 { 132 98 int err = -ENOBUFS; ··· 148 196 149 197 EXPORT_SYMBOL_GPL(ccid_unregister); 150 198 151 - /** 152 - * ccid_request_module - Pre-load CCID module for later use 153 - * This should be called only from process context (e.g. during connection 154 - * setup) and is necessary for later calls to ccid_new (typically in software 155 - * interrupt), so that it has the modules available when they are needed. 
156 - */ 157 - static int ccid_request_module(u8 id) 158 - { 159 - if (!in_atomic()) { 160 - ccids_read_lock(); 161 - if (ccids[id] == NULL) { 162 - ccids_read_unlock(); 163 - return request_module("net-dccp-ccid-%d", id); 164 - } 165 - ccids_read_unlock(); 166 - } 167 - return 0; 168 - } 169 - 170 - int ccid_request_modules(u8 const *ccid_array, u8 array_len) 171 - { 172 - #ifdef CONFIG_KMOD 173 - while (array_len--) 174 - if (ccid_request_module(ccid_array[array_len])) 175 - return -1; 176 - #endif 177 - return 0; 178 - } 179 - 180 199 struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp) 181 200 { 182 201 struct ccid_operations *ccid_ops; 183 202 struct ccid *ccid = NULL; 184 203 185 204 ccids_read_lock(); 205 + #ifdef CONFIG_KMOD 206 + if (ccids[id] == NULL) { 207 + /* We only try to load if in process context */ 208 + ccids_read_unlock(); 209 + if (gfp & GFP_ATOMIC) 210 + goto out; 211 + request_module("net-dccp-ccid-%d", id); 212 + ccids_read_lock(); 213 + } 214 + #endif 186 215 ccid_ops = ccids[id]; 187 216 if (ccid_ops == NULL) 188 217 goto out_unlock; ··· 204 271 } 205 272 206 273 EXPORT_SYMBOL_GPL(ccid_new); 274 + 275 + struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp) 276 + { 277 + return ccid_new(id, sk, 1, gfp); 278 + } 279 + 280 + EXPORT_SYMBOL_GPL(ccid_hc_rx_new); 281 + 282 + struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp) 283 + { 284 + return ccid_new(id, sk, 0, gfp); 285 + } 286 + 287 + EXPORT_SYMBOL_GPL(ccid_hc_tx_new); 207 288 208 289 static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) 209 290 {
+33 -80
net/dccp/ccid.h
··· 60 60 void (*ccid_hc_tx_exit)(struct sock *sk); 61 61 void (*ccid_hc_rx_packet_recv)(struct sock *sk, 62 62 struct sk_buff *skb); 63 - int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, 64 - u8 opt, u8 *val, u8 len); 63 + int (*ccid_hc_rx_parse_options)(struct sock *sk, 64 + unsigned char option, 65 + unsigned char len, u16 idx, 66 + unsigned char* value); 65 67 int (*ccid_hc_rx_insert_options)(struct sock *sk, 66 68 struct sk_buff *skb); 67 69 void (*ccid_hc_tx_packet_recv)(struct sock *sk, 68 70 struct sk_buff *skb); 69 - int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, 70 - u8 opt, u8 *val, u8 len); 71 + int (*ccid_hc_tx_parse_options)(struct sock *sk, 72 + unsigned char option, 73 + unsigned char len, u16 idx, 74 + unsigned char* value); 71 75 int (*ccid_hc_tx_send_packet)(struct sock *sk, 72 76 struct sk_buff *skb); 73 77 void (*ccid_hc_tx_packet_sent)(struct sock *sk, 74 - unsigned int len); 78 + int more, unsigned int len); 75 79 void (*ccid_hc_rx_get_info)(struct sock *sk, 76 80 struct tcp_info *info); 77 81 void (*ccid_hc_tx_get_info)(struct sock *sk, ··· 103 99 return (void *)ccid->ccid_priv; 104 100 } 105 101 106 - extern bool ccid_support_check(u8 const *ccid_array, u8 array_len); 107 - extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); 108 - extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, 109 - char __user *, int __user *); 110 - 111 - extern int ccid_request_modules(u8 const *ccid_array, u8 array_len); 112 102 extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, 113 103 gfp_t gfp); 114 104 115 - static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) 116 - { 117 - struct ccid *ccid = dp->dccps_hc_rx_ccid; 118 - 119 - if (ccid == NULL || ccid->ccid_ops == NULL) 120 - return -1; 121 - return ccid->ccid_ops->ccid_id; 122 - } 123 - 124 - static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) 125 - { 126 - struct ccid *ccid = dp->dccps_hc_tx_ccid; 127 - 128 - if 
(ccid == NULL || ccid->ccid_ops == NULL) 129 - return -1; 130 - return ccid->ccid_ops->ccid_id; 131 - } 105 + extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, 106 + gfp_t gfp); 107 + extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk, 108 + gfp_t gfp); 132 109 133 110 extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); 134 111 extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); 135 112 136 - /* 137 - * Congestion control of queued data packets via CCID decision. 138 - * 139 - * The TX CCID performs its congestion-control by indicating whether and when a 140 - * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). 141 - * The following modes are supported via the symbolic constants below: 142 - * - timer-based pacing (CCID returns a delay value in milliseconds); 143 - * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). 144 - */ 145 - 146 - enum ccid_dequeueing_decision { 147 - CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ 148 - CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ 149 - CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ 150 - CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ 151 - CCID_PACKET_ERR = 0xF0000, /* error condition */ 152 - }; 153 - 154 - static inline int ccid_packet_dequeue_eval(const int return_code) 155 - { 156 - if (return_code < 0) 157 - return CCID_PACKET_ERR; 158 - if (return_code == 0) 159 - return CCID_PACKET_SEND_AT_ONCE; 160 - if (return_code <= CCID_PACKET_DELAY_MAX) 161 - return CCID_PACKET_DELAY; 162 - return return_code; 163 - } 164 - 165 113 static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, 166 114 struct sk_buff *skb) 167 115 { 116 + int rc = 0; 168 117 if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) 169 - return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); 170 - return CCID_PACKET_SEND_AT_ONCE; 118 + rc = 
ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); 119 + return rc; 171 120 } 172 121 173 122 static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, 174 - unsigned int len) 123 + int more, unsigned int len) 175 124 { 176 125 if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) 177 - ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); 126 + ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len); 178 127 } 179 128 180 129 static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, ··· 144 187 ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); 145 188 } 146 189 147 - /** 148 - * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver 149 - * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) 150 - * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) 151 - * @val: value of @opt 152 - * @len: length of @val in bytes 153 - */ 154 190 static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, 155 - u8 pkt, u8 opt, u8 *val, u8 len) 191 + unsigned char option, 192 + unsigned char len, u16 idx, 193 + unsigned char* value) 156 194 { 157 - if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL) 158 - return 0; 159 - return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); 195 + int rc = 0; 196 + if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL) 197 + rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx, 198 + value); 199 + return rc; 160 200 } 161 201 162 - /** 163 - * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender 164 - * Arguments are analogous to ccid_hc_tx_parse_options() 165 - */ 166 202 static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, 167 - u8 pkt, u8 opt, u8 *val, u8 len) 203 + unsigned char option, 204 + unsigned char len, u16 idx, 205 + unsigned char* value) 168 206 { 169 - if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL) 170 - return 0; 171 - return 
ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); 207 + int rc = 0; 208 + if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL) 209 + rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value); 210 + return rc; 172 211 } 173 212 174 213 static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
+6 -24
net/dccp/ccids/Kconfig
··· 1 1 menu "DCCP CCIDs Configuration (EXPERIMENTAL)" 2 + depends on EXPERIMENTAL 2 3 3 4 config IP_DCCP_CCID2 4 - tristate "CCID2 (TCP-Like)" 5 + tristate "CCID2 (TCP-Like) (EXPERIMENTAL)" 5 6 def_tristate IP_DCCP 7 + select IP_DCCP_ACKVEC 6 8 ---help--- 7 9 CCID 2, TCP-like Congestion Control, denotes Additive Increase, 8 10 Multiplicative Decrease (AIMD) congestion control with behavior ··· 36 34 If in doubt, say N. 37 35 38 36 config IP_DCCP_CCID3 39 - tristate "CCID3 (TCP-Friendly)" 37 + tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)" 40 38 def_tristate IP_DCCP 41 39 select IP_DCCP_TFRC_LIB 42 40 ---help--- ··· 64 62 65 63 If in doubt, say M. 66 64 67 - if IP_DCCP_CCID3 68 65 config IP_DCCP_CCID3_DEBUG 69 66 bool "CCID3 debugging messages" 67 + depends on IP_DCCP_CCID3 70 68 ---help--- 71 69 Enable CCID3-specific debugging messages. 72 70 ··· 76 74 77 75 If in doubt, say N. 78 76 79 - choice 80 - prompt "Select method for measuring the packet size s" 81 - default IP_DCCP_CCID3_MEASURE_S_AS_MPS 82 - 83 - config IP_DCCP_CCID3_MEASURE_S_AS_MPS 84 - bool "Always use MPS in place of s" 85 - ---help--- 86 - This use is recommended as it is consistent with the initialisation 87 - of X and suggested when s varies (rfc3448bis, (1) in section 4.1). 88 - config IP_DCCP_CCID3_MEASURE_S_AS_AVG 89 - bool "Use moving average" 90 - ---help--- 91 - An alternative way of tracking s, also supported by rfc3448bis. 92 - This used to be the default for CCID-3 in previous kernels. 93 - config IP_DCCP_CCID3_MEASURE_S_AS_MAX 94 - bool "Track the maximum payload length" 95 - ---help--- 96 - An experimental method based on tracking the maximum packet size. 97 - endchoice 98 - 99 77 config IP_DCCP_CCID3_RTO 100 78 int "Use higher bound for nofeedback timer" 101 79 default 100 80 + depends on IP_DCCP_CCID3 && EXPERIMENTAL 102 81 ---help--- 103 82 Use higher lower bound for nofeedback timer expiration. 
104 83 ··· 106 123 The purpose of the nofeedback timer is to slow DCCP down when there 107 124 is serious network congestion: experimenting with larger values should 108 125 therefore not be performed on WANs. 109 - endif # IP_DCCP_CCID3 110 126 111 127 config IP_DCCP_TFRC_LIB 112 128 tristate
+369 -247
net/dccp/ccids/ccid2.c
··· 25 25 /* 26 26 * This implementation should follow RFC 4341 27 27 */ 28 - #include "../feat.h" 28 + 29 29 #include "../ccid.h" 30 30 #include "../dccp.h" 31 31 #include "ccid2.h" ··· 34 34 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG 35 35 static int ccid2_debug; 36 36 #define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) 37 + 38 + static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx) 39 + { 40 + int len = 0; 41 + int pipe = 0; 42 + struct ccid2_seq *seqp = hctx->ccid2hctx_seqh; 43 + 44 + /* there is data in the chain */ 45 + if (seqp != hctx->ccid2hctx_seqt) { 46 + seqp = seqp->ccid2s_prev; 47 + len++; 48 + if (!seqp->ccid2s_acked) 49 + pipe++; 50 + 51 + while (seqp != hctx->ccid2hctx_seqt) { 52 + struct ccid2_seq *prev = seqp->ccid2s_prev; 53 + 54 + len++; 55 + if (!prev->ccid2s_acked) 56 + pipe++; 57 + 58 + /* packets are sent sequentially */ 59 + BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq, 60 + prev->ccid2s_seq ) >= 0); 61 + BUG_ON(time_before(seqp->ccid2s_sent, 62 + prev->ccid2s_sent)); 63 + 64 + seqp = prev; 65 + } 66 + } 67 + 68 + BUG_ON(pipe != hctx->ccid2hctx_pipe); 69 + ccid2_pr_debug("len of chain=%d\n", len); 70 + 71 + do { 72 + seqp = seqp->ccid2s_prev; 73 + len++; 74 + } while (seqp != hctx->ccid2hctx_seqh); 75 + 76 + ccid2_pr_debug("total len=%d\n", len); 77 + BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN); 78 + } 37 79 #else 38 80 #define ccid2_pr_debug(format, a...) 
81 + #define ccid2_hc_tx_check_sanity(hctx) 39 82 #endif 40 83 41 84 static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) ··· 87 44 int i; 88 45 89 46 /* check if we have space to preserve the pointer to the buffer */ 90 - if (hctx->seqbufc >= sizeof(hctx->seqbuf) / sizeof(struct ccid2_seq *)) 47 + if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) / 48 + sizeof(struct ccid2_seq*))) 91 49 return -ENOMEM; 92 50 93 51 /* allocate buffer and initialize linked list */ ··· 104 60 seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; 105 61 106 62 /* This is the first allocation. Initiate the head and tail. */ 107 - if (hctx->seqbufc == 0) 108 - hctx->seqh = hctx->seqt = seqp; 63 + if (hctx->ccid2hctx_seqbufc == 0) 64 + hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp; 109 65 else { 110 66 /* link the existing list with the one we just created */ 111 - hctx->seqh->ccid2s_next = seqp; 112 - seqp->ccid2s_prev = hctx->seqh; 67 + hctx->ccid2hctx_seqh->ccid2s_next = seqp; 68 + seqp->ccid2s_prev = hctx->ccid2hctx_seqh; 113 69 114 - hctx->seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; 115 - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->seqt; 70 + hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; 71 + seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt; 116 72 } 117 73 118 74 /* store the original pointer to the buffer so we can free it */ 119 - hctx->seqbuf[hctx->seqbufc] = seqp; 120 - hctx->seqbufc++; 75 + hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp; 76 + hctx->ccid2hctx_seqbufc++; 121 77 122 78 return 0; 123 79 } 124 80 125 81 static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) 126 82 { 127 - if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) 128 - return CCID_PACKET_WILL_DEQUEUE_LATER; 129 - return CCID_PACKET_SEND_AT_ONCE; 83 + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 84 + 85 + if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) 86 + return 0; 87 + 88 + return 1; /* XXX CCID should 
dequeue when ready instead of polling */ 130 89 } 131 90 132 91 static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) 133 92 { 134 93 struct dccp_sock *dp = dccp_sk(sk); 135 - u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->cwnd, 2); 94 + u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2); 136 95 137 96 /* 138 97 * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from ··· 147 100 DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); 148 101 val = max_ratio; 149 102 } 150 - if (val > DCCPF_ACK_RATIO_MAX) 151 - val = DCCPF_ACK_RATIO_MAX; 103 + if (val > 0xFFFF) /* RFC 4340, 11.3 */ 104 + val = 0xFFFF; 152 105 153 106 if (val == dp->dccps_l_ack_ratio) 154 107 return; ··· 157 110 dp->dccps_l_ack_ratio = val; 158 111 } 159 112 113 + static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val) 114 + { 115 + ccid2_pr_debug("change SRTT to %ld\n", val); 116 + hctx->ccid2hctx_srtt = val; 117 + } 118 + 119 + static void ccid2_start_rto_timer(struct sock *sk); 120 + 160 121 static void ccid2_hc_tx_rto_expire(unsigned long data) 161 122 { 162 123 struct sock *sk = (struct sock *)data; 163 124 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 164 - const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); 125 + long s; 165 126 166 127 bh_lock_sock(sk); 167 128 if (sock_owned_by_user(sk)) { 168 - sk_reset_timer(sk, &hctx->rtotimer, jiffies + HZ / 5); 129 + sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, 130 + jiffies + HZ / 5); 169 131 goto out; 170 132 } 171 133 172 134 ccid2_pr_debug("RTO_EXPIRE\n"); 173 135 136 + ccid2_hc_tx_check_sanity(hctx); 137 + 174 138 /* back-off timer */ 175 - hctx->rto <<= 1; 176 - if (hctx->rto > DCCP_RTO_MAX) 177 - hctx->rto = DCCP_RTO_MAX; 139 + hctx->ccid2hctx_rto <<= 1; 140 + 141 + s = hctx->ccid2hctx_rto / HZ; 142 + if (s > 60) 143 + hctx->ccid2hctx_rto = 60 * HZ; 144 + 145 + ccid2_start_rto_timer(sk); 178 146 179 147 /* adjust pipe, cwnd etc */ 180 - hctx->ssthresh = 
hctx->cwnd / 2; 181 - if (hctx->ssthresh < 2) 182 - hctx->ssthresh = 2; 183 - hctx->cwnd = 1; 184 - hctx->pipe = 0; 148 + hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2; 149 + if (hctx->ccid2hctx_ssthresh < 2) 150 + hctx->ccid2hctx_ssthresh = 2; 151 + hctx->ccid2hctx_cwnd = 1; 152 + hctx->ccid2hctx_pipe = 0; 185 153 186 154 /* clear state about stuff we sent */ 187 - hctx->seqt = hctx->seqh; 188 - hctx->packets_acked = 0; 155 + hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh; 156 + hctx->ccid2hctx_packets_acked = 0; 189 157 190 158 /* clear ack ratio state. */ 191 - hctx->rpseq = 0; 192 - hctx->rpdupack = -1; 159 + hctx->ccid2hctx_rpseq = 0; 160 + hctx->ccid2hctx_rpdupack = -1; 193 161 ccid2_change_l_ack_ratio(sk, 1); 194 - 195 - /* if we were blocked before, we may now send cwnd=1 packet */ 196 - if (sender_was_blocked) 197 - tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); 198 - /* restart backed-off timer */ 199 - sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); 162 + ccid2_hc_tx_check_sanity(hctx); 200 163 out: 201 164 bh_unlock_sock(sk); 202 165 sock_put(sk); 203 166 } 204 167 205 - static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) 168 + static void ccid2_start_rto_timer(struct sock *sk) 169 + { 170 + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 171 + 172 + ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto); 173 + 174 + BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer)); 175 + sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, 176 + jiffies + hctx->ccid2hctx_rto); 177 + } 178 + 179 + static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) 206 180 { 207 181 struct dccp_sock *dp = dccp_sk(sk); 208 182 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 209 183 struct ccid2_seq *next; 210 184 211 - hctx->pipe++; 185 + hctx->ccid2hctx_pipe++; 212 186 213 - hctx->seqh->ccid2s_seq = dp->dccps_gss; 214 - hctx->seqh->ccid2s_acked = 0; 215 - hctx->seqh->ccid2s_sent = jiffies; 187 + 
hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss; 188 + hctx->ccid2hctx_seqh->ccid2s_acked = 0; 189 + hctx->ccid2hctx_seqh->ccid2s_sent = jiffies; 216 190 217 - next = hctx->seqh->ccid2s_next; 191 + next = hctx->ccid2hctx_seqh->ccid2s_next; 218 192 /* check if we need to alloc more space */ 219 - if (next == hctx->seqt) { 193 + if (next == hctx->ccid2hctx_seqt) { 220 194 if (ccid2_hc_tx_alloc_seq(hctx)) { 221 195 DCCP_CRIT("packet history - out of memory!"); 222 196 /* FIXME: find a more graceful way to bail out */ 223 197 return; 224 198 } 225 - next = hctx->seqh->ccid2s_next; 226 - BUG_ON(next == hctx->seqt); 199 + next = hctx->ccid2hctx_seqh->ccid2s_next; 200 + BUG_ON(next == hctx->ccid2hctx_seqt); 227 201 } 228 - hctx->seqh = next; 202 + hctx->ccid2hctx_seqh = next; 229 203 230 - ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->cwnd, hctx->pipe); 204 + ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd, 205 + hctx->ccid2hctx_pipe); 231 206 232 207 /* 233 208 * FIXME: The code below is broken and the variables have been removed ··· 272 203 */ 273 204 #if 0 274 205 /* Ack Ratio. Need to maintain a concept of how many windows we sent */ 275 - hctx->arsent++; 206 + hctx->ccid2hctx_arsent++; 276 207 /* We had an ack loss in this window... */ 277 - if (hctx->ackloss) { 278 - if (hctx->arsent >= hctx->cwnd) { 279 - hctx->arsent = 0; 280 - hctx->ackloss = 0; 208 + if (hctx->ccid2hctx_ackloss) { 209 + if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) { 210 + hctx->ccid2hctx_arsent = 0; 211 + hctx->ccid2hctx_ackloss = 0; 281 212 } 282 213 } else { 283 214 /* No acks lost up to now... 
*/ ··· 287 218 int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - 288 219 dp->dccps_l_ack_ratio; 289 220 290 - denom = hctx->cwnd * hctx->cwnd / denom; 221 + denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom; 291 222 292 - if (hctx->arsent >= denom) { 223 + if (hctx->ccid2hctx_arsent >= denom) { 293 224 ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); 294 - hctx->arsent = 0; 225 + hctx->ccid2hctx_arsent = 0; 295 226 } 296 227 } else { 297 228 /* we can't increase ack ratio further [1] */ 298 - hctx->arsent = 0; /* or maybe set it to cwnd*/ 229 + hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/ 299 230 } 300 231 } 301 232 #endif 302 233 303 234 /* setup RTO timer */ 304 - if (!timer_pending(&hctx->rtotimer)) 305 - sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); 235 + if (!timer_pending(&hctx->ccid2hctx_rtotimer)) 236 + ccid2_start_rto_timer(sk); 306 237 307 238 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG 308 239 do { 309 - struct ccid2_seq *seqp = hctx->seqt; 240 + struct ccid2_seq *seqp = hctx->ccid2hctx_seqt; 310 241 311 - while (seqp != hctx->seqh) { 242 + while (seqp != hctx->ccid2hctx_seqh) { 312 243 ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", 313 244 (unsigned long long)seqp->ccid2s_seq, 314 245 seqp->ccid2s_acked, seqp->ccid2s_sent); ··· 316 247 } 317 248 } while (0); 318 249 ccid2_pr_debug("=========\n"); 250 + ccid2_hc_tx_check_sanity(hctx); 319 251 #endif 320 252 } 321 253 322 - /** 323 - * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm 324 - * This code is almost identical with TCP's tcp_rtt_estimator(), since 325 - * - it has a higher sampling frequency (recommended by RFC 1323), 326 - * - the RTO does not collapse into RTT due to RTTVAR going towards zero, 327 - * - it is simple (cf. more complex proposals such as Eifel timer or research 328 - * which suggests that the gain should be set according to window size), 329 - * - in tests it was found to work well with CCID2 [gerrit]. 
254 + /* XXX Lame code duplication! 255 + * returns -1 if none was found. 256 + * else returns the next offset to use in the function call. 330 257 */ 331 - static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) 258 + static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset, 259 + unsigned char **vec, unsigned char *veclen) 332 260 { 333 - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 334 - long m = mrtt ? : 1; 261 + const struct dccp_hdr *dh = dccp_hdr(skb); 262 + unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); 263 + unsigned char *opt_ptr; 264 + const unsigned char *opt_end = (unsigned char *)dh + 265 + (dh->dccph_doff * 4); 266 + unsigned char opt, len; 267 + unsigned char *value; 335 268 336 - if (hctx->srtt == 0) { 337 - /* First measurement m */ 338 - hctx->srtt = m << 3; 339 - hctx->mdev = m << 1; 269 + BUG_ON(offset < 0); 270 + options += offset; 271 + opt_ptr = options; 272 + if (opt_ptr >= opt_end) 273 + return -1; 340 274 341 - hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev); 342 - hctx->rttvar = hctx->mdev_max; 343 - hctx->rtt_seq = dccp_sk(sk)->dccps_gss; 344 - } else { 345 - /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ 346 - m -= (hctx->srtt >> 3); 347 - hctx->srtt += m; 275 + while (opt_ptr != opt_end) { 276 + opt = *opt_ptr++; 277 + len = 0; 278 + value = NULL; 348 279 349 - /* Similarly, update scaled mdev with regard to |m| */ 350 - if (m < 0) { 351 - m = -m; 352 - m -= (hctx->mdev >> 2); 280 + /* Check if this isn't a single byte option */ 281 + if (opt > DCCPO_MAX_RESERVED) { 282 + if (opt_ptr == opt_end) 283 + goto out_invalid_option; 284 + 285 + len = *opt_ptr++; 286 + if (len < 3) 287 + goto out_invalid_option; 353 288 /* 354 - * This neutralises RTO increase when RTT < SRTT - mdev 355 - * (see P. Sarolahti, A. Kuznetsov,"Congestion Control 356 - * in Linux TCP", USENIX 2002, pp. 49-62). 
289 + * Remove the type and len fields, leaving 290 + * just the value size 357 291 */ 358 - if (m > 0) 359 - m >>= 3; 360 - } else { 361 - m -= (hctx->mdev >> 2); 362 - } 363 - hctx->mdev += m; 292 + len -= 2; 293 + value = opt_ptr; 294 + opt_ptr += len; 364 295 365 - if (hctx->mdev > hctx->mdev_max) { 366 - hctx->mdev_max = hctx->mdev; 367 - if (hctx->mdev_max > hctx->rttvar) 368 - hctx->rttvar = hctx->mdev_max; 296 + if (opt_ptr > opt_end) 297 + goto out_invalid_option; 369 298 } 370 299 371 - /* 372 - * Decay RTTVAR at most once per flight, exploiting that 373 - * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) 374 - * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) 375 - * GAR is a useful bound for FlightSize = pipe, AWL is probably 376 - * too low as it over-estimates pipe. 377 - */ 378 - if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) { 379 - if (hctx->mdev_max < hctx->rttvar) 380 - hctx->rttvar -= (hctx->rttvar - 381 - hctx->mdev_max) >> 2; 382 - hctx->rtt_seq = dccp_sk(sk)->dccps_gss; 383 - hctx->mdev_max = TCP_RTO_MIN; 300 + switch (opt) { 301 + case DCCPO_ACK_VECTOR_0: 302 + case DCCPO_ACK_VECTOR_1: 303 + *vec = value; 304 + *veclen = len; 305 + return offset + (opt_ptr - options); 384 306 } 385 307 } 386 308 387 - /* 388 - * Set RTO from SRTT and RTTVAR 389 - * Clock granularity is ignored since the minimum error for RTTVAR is 390 - * clamped to 50msec (corresponding to HZ=20). This leads to a minimum 391 - * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP 392 - * does not retransmit data, DCCP does not require TCP's recommended 393 - * minimum timeout of one second". 
394 - */ 395 - hctx->rto = (hctx->srtt >> 3) + hctx->rttvar; 309 + return -1; 396 310 397 - if (hctx->rto > DCCP_RTO_MAX) 398 - hctx->rto = DCCP_RTO_MAX; 311 + out_invalid_option: 312 + DCCP_BUG("Invalid option - this should not happen (previous parsing)!"); 313 + return -1; 399 314 } 400 315 401 - static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, 402 - unsigned int *maxincr) 316 + static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) 403 317 { 404 318 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 405 319 406 - if (hctx->cwnd < hctx->ssthresh) { 407 - if (*maxincr > 0 && ++hctx->packets_acked == 2) { 408 - hctx->cwnd += 1; 409 - *maxincr -= 1; 410 - hctx->packets_acked = 0; 320 + sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer); 321 + ccid2_pr_debug("deleted RTO timer\n"); 322 + } 323 + 324 + static inline void ccid2_new_ack(struct sock *sk, 325 + struct ccid2_seq *seqp, 326 + unsigned int *maxincr) 327 + { 328 + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 329 + 330 + if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) { 331 + if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) { 332 + hctx->ccid2hctx_cwnd += 1; 333 + *maxincr -= 1; 334 + hctx->ccid2hctx_packets_acked = 0; 411 335 } 412 - } else if (++hctx->packets_acked >= hctx->cwnd) { 413 - hctx->cwnd += 1; 414 - hctx->packets_acked = 0; 336 + } else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) { 337 + hctx->ccid2hctx_cwnd += 1; 338 + hctx->ccid2hctx_packets_acked = 0; 415 339 } 416 - /* 417 - * FIXME: RTT is sampled several times per acknowledgment (for each 418 - * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). 419 - * This causes the RTT to be over-estimated, since the older entries 420 - * in the Ack Vector have earlier sending times. 421 - * The cleanest solution is to not use the ccid2s_sent field at all 422 - * and instead use DCCP timestamps - need to be resolved at some time. 
423 - */ 424 - ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent); 340 + 341 + /* update RTO */ 342 + if (hctx->ccid2hctx_srtt == -1 || 343 + time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) { 344 + unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; 345 + int s; 346 + 347 + /* first measurement */ 348 + if (hctx->ccid2hctx_srtt == -1) { 349 + ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", 350 + r, jiffies, 351 + (unsigned long long)seqp->ccid2s_seq); 352 + ccid2_change_srtt(hctx, r); 353 + hctx->ccid2hctx_rttvar = r >> 1; 354 + } else { 355 + /* RTTVAR */ 356 + long tmp = hctx->ccid2hctx_srtt - r; 357 + long srtt; 358 + 359 + if (tmp < 0) 360 + tmp *= -1; 361 + 362 + tmp >>= 2; 363 + hctx->ccid2hctx_rttvar *= 3; 364 + hctx->ccid2hctx_rttvar >>= 2; 365 + hctx->ccid2hctx_rttvar += tmp; 366 + 367 + /* SRTT */ 368 + srtt = hctx->ccid2hctx_srtt; 369 + srtt *= 7; 370 + srtt >>= 3; 371 + tmp = r >> 3; 372 + srtt += tmp; 373 + ccid2_change_srtt(hctx, srtt); 374 + } 375 + s = hctx->ccid2hctx_rttvar << 2; 376 + /* clock granularity is 1 when based on jiffies */ 377 + if (!s) 378 + s = 1; 379 + hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s; 380 + 381 + /* must be at least a second */ 382 + s = hctx->ccid2hctx_rto / HZ; 383 + /* DCCP doesn't require this [but I like it cuz my code sux] */ 384 + #if 1 385 + if (s < 1) 386 + hctx->ccid2hctx_rto = HZ; 387 + #endif 388 + /* max 60 seconds */ 389 + if (s > 60) 390 + hctx->ccid2hctx_rto = HZ * 60; 391 + 392 + hctx->ccid2hctx_lastrtt = jiffies; 393 + 394 + ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", 395 + hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar, 396 + hctx->ccid2hctx_rto, HZ, r); 397 + } 398 + 399 + /* we got a new ack, so re-start RTO timer */ 400 + ccid2_hc_tx_kill_rto_timer(sk); 401 + ccid2_start_rto_timer(sk); 402 + } 403 + 404 + static void ccid2_hc_tx_dec_pipe(struct sock *sk) 405 + { 406 + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 407 + 408 + if 
(hctx->ccid2hctx_pipe == 0) 409 + DCCP_BUG("pipe == 0"); 410 + else 411 + hctx->ccid2hctx_pipe--; 412 + 413 + if (hctx->ccid2hctx_pipe == 0) 414 + ccid2_hc_tx_kill_rto_timer(sk); 425 415 } 426 416 427 417 static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) 428 418 { 429 419 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 430 420 431 - if (time_before(seqp->ccid2s_sent, hctx->last_cong)) { 421 + if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) { 432 422 ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); 433 423 return; 434 424 } 435 425 436 - hctx->last_cong = jiffies; 426 + hctx->ccid2hctx_last_cong = jiffies; 437 427 438 - hctx->cwnd = hctx->cwnd / 2 ? : 1U; 439 - hctx->ssthresh = max(hctx->cwnd, 2U); 428 + hctx->ccid2hctx_cwnd = hctx->ccid2hctx_cwnd / 2 ? : 1U; 429 + hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U); 440 430 441 431 /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ 442 - if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->cwnd) 443 - ccid2_change_l_ack_ratio(sk, hctx->cwnd); 444 - } 445 - 446 - static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, 447 - u8 option, u8 *optval, u8 optlen) 448 - { 449 - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 450 - 451 - switch (option) { 452 - case DCCPO_ACK_VECTOR_0: 453 - case DCCPO_ACK_VECTOR_1: 454 - return dccp_ackvec_parsed_add(&hctx->av_chunks, optval, optlen, 455 - option - DCCPO_ACK_VECTOR_0); 456 - } 457 - return 0; 432 + if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd) 433 + ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd); 458 434 } 459 435 460 436 static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) 461 437 { 462 438 struct dccp_sock *dp = dccp_sk(sk); 463 439 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 464 - const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); 465 - struct dccp_ackvec_parsed *avp; 466 440 u64 ackno, seqno; 467 441 struct ccid2_seq *seqp; 
442 + unsigned char *vector; 443 + unsigned char veclen; 444 + int offset = 0; 468 445 int done = 0; 469 446 unsigned int maxincr = 0; 470 447 448 + ccid2_hc_tx_check_sanity(hctx); 471 449 /* check reverse path congestion */ 472 450 seqno = DCCP_SKB_CB(skb)->dccpd_seq; 473 451 ··· 523 407 * -sorbo. 524 408 */ 525 409 /* need to bootstrap */ 526 - if (hctx->rpdupack == -1) { 527 - hctx->rpdupack = 0; 528 - hctx->rpseq = seqno; 410 + if (hctx->ccid2hctx_rpdupack == -1) { 411 + hctx->ccid2hctx_rpdupack = 0; 412 + hctx->ccid2hctx_rpseq = seqno; 529 413 } else { 530 414 /* check if packet is consecutive */ 531 - if (dccp_delta_seqno(hctx->rpseq, seqno) == 1) 532 - hctx->rpseq = seqno; 415 + if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1) 416 + hctx->ccid2hctx_rpseq = seqno; 533 417 /* it's a later packet */ 534 - else if (after48(seqno, hctx->rpseq)) { 535 - hctx->rpdupack++; 418 + else if (after48(seqno, hctx->ccid2hctx_rpseq)) { 419 + hctx->ccid2hctx_rpdupack++; 536 420 537 421 /* check if we got enough dupacks */ 538 - if (hctx->rpdupack >= NUMDUPACK) { 539 - hctx->rpdupack = -1; /* XXX lame */ 540 - hctx->rpseq = 0; 422 + if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) { 423 + hctx->ccid2hctx_rpdupack = -1; /* XXX lame */ 424 + hctx->ccid2hctx_rpseq = 0; 541 425 542 426 ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); 543 427 } ··· 545 429 } 546 430 547 431 /* check forward path congestion */ 548 - if (dccp_packet_without_ack(skb)) 432 + /* still didn't send out new data packets */ 433 + if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt) 549 434 return; 550 435 551 - /* still didn't send out new data packets */ 552 - if (hctx->seqh == hctx->seqt) 553 - goto done; 436 + switch (DCCP_SKB_CB(skb)->dccpd_type) { 437 + case DCCP_PKT_ACK: 438 + case DCCP_PKT_DATAACK: 439 + break; 440 + default: 441 + return; 442 + } 554 443 555 444 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; 556 - if (after48(ackno, hctx->high_ack)) 557 - hctx->high_ack = ackno; 445 + if 
(after48(ackno, hctx->ccid2hctx_high_ack)) 446 + hctx->ccid2hctx_high_ack = ackno; 558 447 559 - seqp = hctx->seqt; 448 + seqp = hctx->ccid2hctx_seqt; 560 449 while (before48(seqp->ccid2s_seq, ackno)) { 561 450 seqp = seqp->ccid2s_next; 562 - if (seqp == hctx->seqh) { 563 - seqp = hctx->seqh->ccid2s_prev; 451 + if (seqp == hctx->ccid2hctx_seqh) { 452 + seqp = hctx->ccid2hctx_seqh->ccid2s_prev; 564 453 break; 565 454 } 566 455 } ··· 575 454 * packets per acknowledgement. Rounding up avoids that cwnd is not 576 455 * advanced when Ack Ratio is 1 and gives a slight edge otherwise. 577 456 */ 578 - if (hctx->cwnd < hctx->ssthresh) 457 + if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) 579 458 maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); 580 459 581 460 /* go through all ack vectors */ 582 - list_for_each_entry(avp, &hctx->av_chunks, node) { 461 + while ((offset = ccid2_ackvector(sk, skb, offset, 462 + &vector, &veclen)) != -1) { 583 463 /* go through this ack vector */ 584 - for (; avp->len--; avp->vec++) { 585 - u64 ackno_end_rl = SUB48(ackno, 586 - dccp_ackvec_runlen(avp->vec)); 464 + while (veclen--) { 465 + const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; 466 + u64 ackno_end_rl = SUB48(ackno, rl); 587 467 588 - ccid2_pr_debug("ackvec %llu |%u,%u|\n", 468 + ccid2_pr_debug("ackvec start:%llu end:%llu\n", 589 469 (unsigned long long)ackno, 590 - dccp_ackvec_state(avp->vec) >> 6, 591 - dccp_ackvec_runlen(avp->vec)); 470 + (unsigned long long)ackno_end_rl); 592 471 /* if the seqno we are analyzing is larger than the 593 472 * current ackno, then move towards the tail of our 594 473 * seqnos. 
595 474 */ 596 475 while (after48(seqp->ccid2s_seq, ackno)) { 597 - if (seqp == hctx->seqt) { 476 + if (seqp == hctx->ccid2hctx_seqt) { 598 477 done = 1; 599 478 break; 600 479 } ··· 607 486 * run length 608 487 */ 609 488 while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { 610 - const u8 state = dccp_ackvec_state(avp->vec); 489 + const u8 state = *vector & 490 + DCCP_ACKVEC_STATE_MASK; 611 491 612 492 /* new packet received or marked */ 613 - if (state != DCCPAV_NOT_RECEIVED && 493 + if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED && 614 494 !seqp->ccid2s_acked) { 615 - if (state == DCCPAV_ECN_MARKED) 495 + if (state == 496 + DCCP_ACKVEC_STATE_ECN_MARKED) { 616 497 ccid2_congestion_event(sk, 617 498 seqp); 618 - else 499 + } else 619 500 ccid2_new_ack(sk, seqp, 620 501 &maxincr); 621 502 622 503 seqp->ccid2s_acked = 1; 623 504 ccid2_pr_debug("Got ack for %llu\n", 624 505 (unsigned long long)seqp->ccid2s_seq); 625 - hctx->pipe--; 506 + ccid2_hc_tx_dec_pipe(sk); 626 507 } 627 - if (seqp == hctx->seqt) { 508 + if (seqp == hctx->ccid2hctx_seqt) { 628 509 done = 1; 629 510 break; 630 511 } ··· 636 513 break; 637 514 638 515 ackno = SUB48(ackno_end_rl, 1); 516 + vector++; 639 517 } 640 518 if (done) 641 519 break; ··· 645 521 /* The state about what is acked should be correct now 646 522 * Check for NUMDUPACK 647 523 */ 648 - seqp = hctx->seqt; 649 - while (before48(seqp->ccid2s_seq, hctx->high_ack)) { 524 + seqp = hctx->ccid2hctx_seqt; 525 + while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) { 650 526 seqp = seqp->ccid2s_next; 651 - if (seqp == hctx->seqh) { 652 - seqp = hctx->seqh->ccid2s_prev; 527 + if (seqp == hctx->ccid2hctx_seqh) { 528 + seqp = hctx->ccid2hctx_seqh->ccid2s_prev; 653 529 break; 654 530 } 655 531 } ··· 660 536 if (done == NUMDUPACK) 661 537 break; 662 538 } 663 - if (seqp == hctx->seqt) 539 + if (seqp == hctx->ccid2hctx_seqt) 664 540 break; 665 541 seqp = seqp->ccid2s_prev; 666 542 } ··· 681 557 * one ack vector. 
682 558 */ 683 559 ccid2_congestion_event(sk, seqp); 684 - hctx->pipe--; 560 + ccid2_hc_tx_dec_pipe(sk); 685 561 } 686 - if (seqp == hctx->seqt) 562 + if (seqp == hctx->ccid2hctx_seqt) 687 563 break; 688 564 seqp = seqp->ccid2s_prev; 689 565 } 690 566 691 - hctx->seqt = last_acked; 567 + hctx->ccid2hctx_seqt = last_acked; 692 568 } 693 569 694 570 /* trim acked packets in tail */ 695 - while (hctx->seqt != hctx->seqh) { 696 - if (!hctx->seqt->ccid2s_acked) 571 + while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) { 572 + if (!hctx->ccid2hctx_seqt->ccid2s_acked) 697 573 break; 698 574 699 - hctx->seqt = hctx->seqt->ccid2s_next; 575 + hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next; 700 576 } 701 577 702 - /* restart RTO timer if not all outstanding data has been acked */ 703 - if (hctx->pipe == 0) 704 - sk_stop_timer(sk, &hctx->rtotimer); 705 - else 706 - sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); 707 - done: 708 - /* check if incoming Acks allow pending packets to be sent */ 709 - if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx)) 710 - tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); 711 - dccp_ackvec_parsed_cleanup(&hctx->av_chunks); 578 + ccid2_hc_tx_check_sanity(hctx); 712 579 } 713 580 714 581 static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) ··· 709 594 u32 max_ratio; 710 595 711 596 /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ 712 - hctx->ssthresh = ~0U; 597 + hctx->ccid2hctx_ssthresh = ~0U; 713 598 714 - /* Use larger initial windows (RFC 3390, rfc2581bis) */ 715 - hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); 599 + /* 600 + * RFC 4341, 5: "The cwnd parameter is initialized to at most four 601 + * packets for new connections, following the rules from [RFC3390]". 602 + * We need to convert the bytes of RFC3390 into the packets of RFC 4341. 
603 + */ 604 + hctx->ccid2hctx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); 716 605 717 606 /* Make sure that Ack Ratio is enabled and within bounds. */ 718 - max_ratio = DIV_ROUND_UP(hctx->cwnd, 2); 607 + max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2); 719 608 if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) 720 609 dp->dccps_l_ack_ratio = max_ratio; 721 610 ··· 727 608 if (ccid2_hc_tx_alloc_seq(hctx)) 728 609 return -ENOMEM; 729 610 730 - hctx->rto = DCCP_TIMEOUT_INIT; 731 - hctx->rpdupack = -1; 732 - hctx->last_cong = jiffies; 733 - setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); 734 - INIT_LIST_HEAD(&hctx->av_chunks); 611 + hctx->ccid2hctx_rto = 3 * HZ; 612 + ccid2_change_srtt(hctx, -1); 613 + hctx->ccid2hctx_rttvar = -1; 614 + hctx->ccid2hctx_rpdupack = -1; 615 + hctx->ccid2hctx_last_cong = jiffies; 616 + setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire, 617 + (unsigned long)sk); 618 + 619 + ccid2_hc_tx_check_sanity(hctx); 735 620 return 0; 736 621 } 737 622 ··· 744 621 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 745 622 int i; 746 623 747 - sk_stop_timer(sk, &hctx->rtotimer); 624 + ccid2_hc_tx_kill_rto_timer(sk); 748 625 749 - for (i = 0; i < hctx->seqbufc; i++) 750 - kfree(hctx->seqbuf[i]); 751 - hctx->seqbufc = 0; 626 + for (i = 0; i < hctx->ccid2hctx_seqbufc; i++) 627 + kfree(hctx->ccid2hctx_seqbuf[i]); 628 + hctx->ccid2hctx_seqbufc = 0; 752 629 } 753 630 754 631 static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) ··· 759 636 switch (DCCP_SKB_CB(skb)->dccpd_type) { 760 637 case DCCP_PKT_DATA: 761 638 case DCCP_PKT_DATAACK: 762 - hcrx->data++; 763 - if (hcrx->data >= dp->dccps_r_ack_ratio) { 639 + hcrx->ccid2hcrx_data++; 640 + if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) { 764 641 dccp_send_ack(sk); 765 - hcrx->data = 0; 642 + hcrx->ccid2hcrx_data = 0; 766 643 } 767 644 break; 768 645 } 769 646 } 770 647 771 648 static struct ccid_operations ccid2 = { 772 
- .ccid_id = DCCPC_CCID2, 773 - .ccid_name = "TCP-like", 774 - .ccid_owner = THIS_MODULE, 775 - .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), 776 - .ccid_hc_tx_init = ccid2_hc_tx_init, 777 - .ccid_hc_tx_exit = ccid2_hc_tx_exit, 778 - .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, 779 - .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, 780 - .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, 781 - .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, 782 - .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), 783 - .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, 649 + .ccid_id = DCCPC_CCID2, 650 + .ccid_name = "TCP-like", 651 + .ccid_owner = THIS_MODULE, 652 + .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), 653 + .ccid_hc_tx_init = ccid2_hc_tx_init, 654 + .ccid_hc_tx_exit = ccid2_hc_tx_exit, 655 + .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, 656 + .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, 657 + .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, 658 + .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), 659 + .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, 784 660 }; 785 661 786 662 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
+24 -39
net/dccp/ccids/ccid2.h
··· 42 42 43 43 /** struct ccid2_hc_tx_sock - CCID2 TX half connection 44 44 * 45 - * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 46 - * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465) 47 - * @srtt: smoothed RTT estimate, scaled by 2^3 48 - * @mdev: smoothed RTT variation, scaled by 2^2 49 - * @mdev_max: maximum of @mdev during one flight 50 - * @rttvar: moving average/maximum of @mdev_max 51 - * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) 52 - * @rtt_seq: to decay RTTVAR at most once per flight 53 - * @rpseq: last consecutive seqno 54 - * @rpdupack: dupacks since rpseq 55 - * @av_chunks: list of Ack Vectors received on current skb 56 - */ 45 + * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 46 + * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465) 47 + * @ccid2hctx_lastrtt -time RTT was last measured 48 + * @ccid2hctx_rpseq - last consecutive seqno 49 + * @ccid2hctx_rpdupack - dupacks since rpseq 50 + */ 57 51 struct ccid2_hc_tx_sock { 58 - u32 cwnd; 59 - u32 ssthresh; 60 - u32 pipe; 61 - u32 packets_acked; 62 - struct ccid2_seq *seqbuf[CCID2_SEQBUF_MAX]; 63 - int seqbufc; 64 - struct ccid2_seq *seqh; 65 - struct ccid2_seq *seqt; 66 - /* RTT measurement: variables/principles are the same as in TCP */ 67 - u32 srtt, 68 - mdev, 69 - mdev_max, 70 - rttvar, 71 - rto; 72 - u64 rtt_seq:48; 73 - struct timer_list rtotimer; 74 - u64 rpseq; 75 - int rpdupack; 76 - unsigned long last_cong; 77 - u64 high_ack; 78 - struct list_head av_chunks; 52 + u32 ccid2hctx_cwnd; 53 + u32 ccid2hctx_ssthresh; 54 + u32 ccid2hctx_pipe; 55 + u32 ccid2hctx_packets_acked; 56 + struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX]; 57 + int ccid2hctx_seqbufc; 58 + struct ccid2_seq *ccid2hctx_seqh; 59 + struct ccid2_seq *ccid2hctx_seqt; 60 + long ccid2hctx_rto; 61 + long ccid2hctx_srtt; 62 + long ccid2hctx_rttvar; 63 + unsigned long ccid2hctx_lastrtt; 64 + struct timer_list ccid2hctx_rtotimer; 65 + u64 ccid2hctx_rpseq; 66 + 
int ccid2hctx_rpdupack; 67 + unsigned long ccid2hctx_last_cong; 68 + u64 ccid2hctx_high_ack; 79 69 }; 80 70 81 - static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx) 82 - { 83 - return (hctx->pipe >= hctx->cwnd); 84 - } 85 - 86 71 struct ccid2_hc_rx_sock { 87 - int data; 72 + int ccid2hcrx_data; 88 73 }; 89 74 90 75 static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
+465 -305
net/dccp/ccids/ccid3.c
··· 49 49 /* 50 50 * Transmitter Half-Connection Routines 51 51 */ 52 - /* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */ 53 - static int do_osc_prev = true; 52 + #ifdef CONFIG_IP_DCCP_CCID3_DEBUG 53 + static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) 54 + { 55 + static char *ccid3_state_names[] = { 56 + [TFRC_SSTATE_NO_SENT] = "NO_SENT", 57 + [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", 58 + [TFRC_SSTATE_FBACK] = "FBACK", 59 + [TFRC_SSTATE_TERM] = "TERM", 60 + }; 61 + 62 + return ccid3_state_names[state]; 63 + } 64 + #endif 65 + 66 + static void ccid3_hc_tx_set_state(struct sock *sk, 67 + enum ccid3_hc_tx_states state) 68 + { 69 + struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 70 + enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state; 71 + 72 + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", 73 + dccp_role(sk), sk, ccid3_tx_state_name(oldstate), 74 + ccid3_tx_state_name(state)); 75 + WARN_ON(state == oldstate); 76 + hctx->ccid3hctx_state = state; 77 + } 54 78 55 79 /* 56 80 * Compute the initial sending rate X_init in the manner of RFC 3390: 57 81 * 58 - * X_init = min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT 82 + * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT 59 83 * 84 + * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis 85 + * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. 60 86 * For consistency with other parts of the code, X_init is scaled by 2^6. 
61 87 */ 62 88 static inline u64 rfc3390_initial_rate(struct sock *sk) 63 89 { 64 - const u32 mps = dccp_sk(sk)->dccps_mss_cache, 65 - w_init = clamp(4380U, 2 * mps, 4 * mps); 90 + const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 91 + const __u32 w_init = clamp_t(__u32, 4380U, 92 + 2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s); 66 93 67 - return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt); 94 + return scaled_div(w_init << 6, hctx->ccid3hctx_rtt); 68 95 } 69 96 70 - /** 71 - * ccid3_update_send_interval - Calculate new t_ipi = s / X 72 - * This respects the granularity of X (64 * bytes/second) and enforces the 73 - * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342. 97 + /* 98 + * Recalculate t_ipi and delta (should be called whenever X changes) 74 99 */ 75 100 static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) 76 101 { 77 - if (unlikely(hctx->x <= hctx->s)) 78 - hctx->x = hctx->s; 79 - hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x); 102 + /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ 103 + hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6, 104 + hctx->ccid3hctx_x); 105 + 106 + /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ 107 + hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, 108 + TFRC_OPSYS_HALF_TIME_GRAN); 109 + 110 + ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", 111 + hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta, 112 + hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6)); 113 + 80 114 } 81 115 82 116 static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) 83 117 { 84 - u32 delta = ktime_us_delta(now, hctx->t_last_win_count); 118 + u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count); 85 119 86 - return delta / hctx->rtt; 120 + return delta / hctx->ccid3hctx_rtt; 87 121 } 88 122 89 123 /** ··· 133 99 static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) 134 100 { 135 101 
struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 136 - u64 min_rate = 2 * hctx->x_recv; 137 - const u64 old_x = hctx->x; 102 + __u64 min_rate = 2 * hctx->ccid3hctx_x_recv; 103 + const __u64 old_x = hctx->ccid3hctx_x; 138 104 ktime_t now = stamp ? *stamp : ktime_get_real(); 139 105 140 106 /* ··· 145 111 */ 146 112 if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) { 147 113 min_rate = rfc3390_initial_rate(sk); 148 - min_rate = max(min_rate, 2 * hctx->x_recv); 114 + min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv); 149 115 } 150 116 151 - if (hctx->p > 0) { 117 + if (hctx->ccid3hctx_p > 0) { 152 118 153 - hctx->x = min(((u64)hctx->x_calc) << 6, min_rate); 119 + hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6, 120 + min_rate); 121 + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, 122 + (((__u64)hctx->ccid3hctx_s) << 6) / 123 + TFRC_T_MBI); 154 124 155 - } else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) { 125 + } else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld) 126 + - (s64)hctx->ccid3hctx_rtt >= 0) { 156 127 157 - hctx->x = min(2 * hctx->x, min_rate); 158 - hctx->x = max(hctx->x, 159 - scaled_div(((u64)hctx->s) << 6, hctx->rtt)); 160 - hctx->t_ld = now; 128 + hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate); 129 + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, 130 + scaled_div(((__u64)hctx->ccid3hctx_s) << 6, 131 + hctx->ccid3hctx_rtt)); 132 + hctx->ccid3hctx_t_ld = now; 161 133 } 162 134 163 - if (hctx->x != old_x) { 135 + if (hctx->ccid3hctx_x != old_x) { 164 136 ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " 165 137 "X_recv=%u\n", (unsigned)(old_x >> 6), 166 - (unsigned)(hctx->x >> 6), hctx->x_calc, 167 - (unsigned)(hctx->x_recv >> 6)); 138 + (unsigned)(hctx->ccid3hctx_x >> 6), 139 + hctx->ccid3hctx_x_calc, 140 + (unsigned)(hctx->ccid3hctx_x_recv >> 6)); 168 141 169 142 ccid3_update_send_interval(hctx); 170 143 } 171 144 } 172 145 173 146 /* 174 - * ccid3_hc_tx_measure_packet_size - Measuring the packet size `s' (sec 4.1) 175 - * 
@new_len: DCCP payload size in bytes (not used by all methods) 147 + * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1) 148 + * @len: DCCP packet payload size in bytes 176 149 */ 177 - static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len) 150 + static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) 178 151 { 179 - #if defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG) 180 - return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9); 181 - #elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX) 182 - return max(ccid3_hc_tx_sk(sk)->s, new_len); 183 - #else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS */ 184 - return dccp_sk(sk)->dccps_mss_cache; 185 - #endif 152 + const u16 old_s = hctx->ccid3hctx_s; 153 + 154 + hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9); 155 + 156 + if (hctx->ccid3hctx_s != old_s) 157 + ccid3_update_send_interval(hctx); 186 158 } 187 159 188 160 /* ··· 198 158 static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx, 199 159 ktime_t now) 200 160 { 201 - u32 delta = ktime_us_delta(now, hctx->t_last_win_count), 202 - quarter_rtts = (4 * delta) / hctx->rtt; 161 + u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count), 162 + quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt; 203 163 204 164 if (quarter_rtts > 0) { 205 - hctx->t_last_win_count = now; 206 - hctx->last_win_count += min(quarter_rtts, 5U); 207 - hctx->last_win_count &= 0xF; /* mod 16 */ 165 + hctx->ccid3hctx_t_last_win_count = now; 166 + hctx->ccid3hctx_last_win_count += min(quarter_rtts, 5U); 167 + hctx->ccid3hctx_last_win_count &= 0xF; /* mod 16 */ 208 168 } 209 169 } 210 170 ··· 221 181 goto restart_timer; 222 182 } 223 183 224 - ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk, 225 - hctx->feedback ? 
"" : "out"); 184 + ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, 185 + ccid3_tx_state_name(hctx->ccid3hctx_state)); 226 186 227 - /* Ignore and do not restart after leaving the established state */ 228 - if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) 187 + if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK) 188 + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); 189 + else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) 229 190 goto out; 230 - 231 - /* Reset feedback state to "no feedback received" */ 232 - hctx->feedback = false; 233 191 234 192 /* 235 193 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 236 - * RTO is 0 if and only if no feedback has been received yet. 237 194 */ 238 - if (hctx->t_rto == 0 || hctx->p == 0) { 195 + if (hctx->ccid3hctx_t_rto == 0 || /* no feedback received yet */ 196 + hctx->ccid3hctx_p == 0) { 239 197 240 198 /* halve send rate directly */ 241 - hctx->x /= 2; 199 + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2, 200 + (((__u64)hctx->ccid3hctx_s) << 6) / 201 + TFRC_T_MBI); 242 202 ccid3_update_send_interval(hctx); 243 - 244 203 } else { 245 204 /* 246 205 * Modify the cached value of X_recv ··· 251 212 * 252 213 * Note that X_recv is scaled by 2^6 while X_calc is not 253 214 */ 254 - BUG_ON(hctx->p && !hctx->x_calc); 215 + BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); 255 216 256 - if (hctx->x_calc > (hctx->x_recv >> 5)) 257 - hctx->x_recv /= 2; 217 + if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5)) 218 + hctx->ccid3hctx_x_recv = 219 + max(hctx->ccid3hctx_x_recv / 2, 220 + (((__u64)hctx->ccid3hctx_s) << 6) / 221 + (2 * TFRC_T_MBI)); 258 222 else { 259 - hctx->x_recv = hctx->x_calc; 260 - hctx->x_recv <<= 4; 223 + hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc; 224 + hctx->ccid3hctx_x_recv <<= 4; 261 225 } 262 226 ccid3_hc_tx_update_x(sk, NULL); 263 227 } 264 228 ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", 265 - (unsigned long long)hctx->x); 229 + (unsigned 
long long)hctx->ccid3hctx_x); 266 230 267 231 /* 268 232 * Set new timeout for the nofeedback timer. 269 233 * See comments in packet_recv() regarding the value of t_RTO. 270 234 */ 271 - if (unlikely(hctx->t_rto == 0)) /* no feedback received yet */ 235 + if (unlikely(hctx->ccid3hctx_t_rto == 0)) /* no feedback yet */ 272 236 t_nfb = TFRC_INITIAL_TIMEOUT; 273 237 else 274 - t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); 238 + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); 275 239 276 240 restart_timer: 277 - sk_reset_timer(sk, &hctx->no_feedback_timer, 241 + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, 278 242 jiffies + usecs_to_jiffies(t_nfb)); 279 243 out: 280 244 bh_unlock_sock(sk); 281 245 sock_put(sk); 282 246 } 283 247 284 - /** 285 - * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets 286 - * @skb: next packet candidate to send on @sk 287 - * This function uses the convention of ccid_packet_dequeue_eval() and 288 - * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. 
248 + /* 249 + * returns 250 + * > 0: delay (in msecs) that should pass before actually sending 251 + * = 0: can send immediately 252 + * < 0: error condition; do not send packet 289 253 */ 290 254 static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) 291 255 { ··· 305 263 if (unlikely(skb->len == 0)) 306 264 return -EBADMSG; 307 265 308 - if (hctx->s == 0) { 309 - sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies + 266 + switch (hctx->ccid3hctx_state) { 267 + case TFRC_SSTATE_NO_SENT: 268 + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, 269 + (jiffies + 310 270 usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); 311 - hctx->last_win_count = 0; 312 - hctx->t_last_win_count = now; 271 + hctx->ccid3hctx_last_win_count = 0; 272 + hctx->ccid3hctx_t_last_win_count = now; 313 273 314 274 /* Set t_0 for initial packet */ 315 - hctx->t_nom = now; 275 + hctx->ccid3hctx_t_nom = now; 276 + 277 + hctx->ccid3hctx_s = skb->len; 316 278 317 279 /* 318 280 * Use initial RTT sample when available: recommended by erratum ··· 325 279 */ 326 280 if (dp->dccps_syn_rtt) { 327 281 ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); 328 - hctx->rtt = dp->dccps_syn_rtt; 329 - hctx->x = rfc3390_initial_rate(sk); 330 - hctx->t_ld = now; 282 + hctx->ccid3hctx_rtt = dp->dccps_syn_rtt; 283 + hctx->ccid3hctx_x = rfc3390_initial_rate(sk); 284 + hctx->ccid3hctx_t_ld = now; 331 285 } else { 332 286 /* 333 287 * Sender does not have RTT sample: ··· 335 289 * is needed in several parts (e.g. window counter); 336 290 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. 
337 291 */ 338 - hctx->rtt = DCCP_FALLBACK_RTT; 339 - hctx->x = dp->dccps_mss_cache; 340 - hctx->x <<= 6; 292 + hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT; 293 + hctx->ccid3hctx_x = hctx->ccid3hctx_s; 294 + hctx->ccid3hctx_x <<= 6; 341 295 } 342 - 343 - /* Compute t_ipi = s / X */ 344 - hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len); 345 296 ccid3_update_send_interval(hctx); 346 297 347 - /* Seed value for Oscillation Prevention (sec. 4.5) */ 348 - hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt); 349 - 350 - } else { 351 - delay = ktime_us_delta(hctx->t_nom, now); 298 + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); 299 + break; 300 + case TFRC_SSTATE_NO_FBACK: 301 + case TFRC_SSTATE_FBACK: 302 + delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now); 352 303 ccid3_pr_debug("delay=%ld\n", (long)delay); 353 304 /* 354 305 * Scheduling of packet transmissions [RFC 3448, 4.6] ··· 355 312 * else 356 313 * // send the packet in (t_nom - t_now) milliseconds. 357 314 */ 358 - if (delay >= TFRC_T_DELTA) 359 - return (u32)delay / USEC_PER_MSEC; 315 + if (delay - (s64)hctx->ccid3hctx_delta >= 1000) 316 + return (u32)delay / 1000L; 360 317 361 318 ccid3_hc_tx_update_win_count(hctx, now); 319 + break; 320 + case TFRC_SSTATE_TERM: 321 + DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); 322 + return -EINVAL; 362 323 } 363 324 364 325 /* prepare to send now (add options etc.) 
*/ 365 326 dp->dccps_hc_tx_insert_options = 1; 366 - DCCP_SKB_CB(skb)->dccpd_ccval = hctx->last_win_count; 327 + DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; 367 328 368 329 /* set the nominal send time for the next following packet */ 369 - hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi); 370 - return CCID_PACKET_SEND_AT_ONCE; 330 + hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom, 331 + hctx->ccid3hctx_t_ipi); 332 + return 0; 371 333 } 372 334 373 - static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) 335 + static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, 336 + unsigned int len) 374 337 { 375 338 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 376 339 377 - /* Changes to s will become effective the next time X is computed */ 378 - hctx->s = ccid3_hc_tx_measure_packet_size(sk, len); 340 + ccid3_hc_tx_update_s(hctx, len); 379 341 380 - if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss)) 342 + if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss)) 381 343 DCCP_CRIT("packet history - out of memory!"); 382 344 } 383 345 384 346 static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) 385 347 { 386 348 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 387 - struct tfrc_tx_hist_entry *acked; 349 + struct ccid3_options_received *opt_recv; 388 350 ktime_t now; 389 351 unsigned long t_nfb; 390 - u32 r_sample; 352 + u32 pinv, r_sample; 391 353 392 354 /* we are only interested in ACKs */ 393 355 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || 394 356 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) 395 357 return; 396 - /* 397 - * Locate the acknowledged packet in the TX history. 398 - * 399 - * Returning "entry not found" here can for instance happen when 400 - * - the host has not sent out anything (e.g. 
a passive server), 401 - * - the Ack is outdated (packet with higher Ack number was received), 402 - * - it is a bogus Ack (for a packet not sent on this connection). 403 - */ 404 - acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb)); 405 - if (acked == NULL) 358 + /* ... and only in the established state */ 359 + if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK && 360 + hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) 406 361 return; 407 - /* For the sake of RTT sampling, ignore/remove all older entries */ 408 - tfrc_tx_hist_purge(&acked->next); 409 362 410 - /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ 411 - now = ktime_get_real(); 412 - r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); 413 - hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9); 363 + opt_recv = &hctx->ccid3hctx_options_received; 364 + now = ktime_get_real(); 414 365 366 + /* Estimate RTT from history if ACK number is valid */ 367 + r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist, 368 + DCCP_SKB_CB(skb)->dccpd_ack_seq, now); 369 + if (r_sample == 0) { 370 + DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk, 371 + dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type), 372 + (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq); 373 + return; 374 + } 375 + 376 + /* Update receive rate in units of 64 * bytes/second */ 377 + hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; 378 + hctx->ccid3hctx_x_recv <<= 6; 379 + 380 + /* Update loss event rate (which is scaled by 1e6) */ 381 + pinv = opt_recv->ccid3or_loss_event_rate; 382 + if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ 383 + hctx->ccid3hctx_p = 0; 384 + else /* can not exceed 100% */ 385 + hctx->ccid3hctx_p = scaled_div(1, pinv); 386 + /* 387 + * Validate new RTT sample and update moving average 388 + */ 389 + r_sample = dccp_sample_rtt(sk, r_sample); 390 + hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9); 415 391 /* 416 392 * Update allowed sending rate X as per 
draft rfc3448bis-00, 4.2/3 417 393 */ 418 - if (!hctx->feedback) { 419 - hctx->feedback = true; 394 + if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { 395 + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); 420 396 421 - if (hctx->t_rto == 0) { 397 + if (hctx->ccid3hctx_t_rto == 0) { 422 398 /* 423 399 * Initial feedback packet: Larger Initial Windows (4.2) 424 400 */ 425 - hctx->x = rfc3390_initial_rate(sk); 426 - hctx->t_ld = now; 401 + hctx->ccid3hctx_x = rfc3390_initial_rate(sk); 402 + hctx->ccid3hctx_t_ld = now; 427 403 428 404 ccid3_update_send_interval(hctx); 429 405 430 406 goto done_computing_x; 431 - } else if (hctx->p == 0) { 407 + } else if (hctx->ccid3hctx_p == 0) { 432 408 /* 433 409 * First feedback after nofeedback timer expiry (4.3) 434 410 */ ··· 456 394 } 457 395 458 396 /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ 459 - if (hctx->p > 0) 460 - hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p); 397 + if (hctx->ccid3hctx_p > 0) 398 + hctx->ccid3hctx_x_calc = 399 + tfrc_calc_x(hctx->ccid3hctx_s, 400 + hctx->ccid3hctx_rtt, 401 + hctx->ccid3hctx_p); 461 402 ccid3_hc_tx_update_x(sk, &now); 462 403 463 404 done_computing_x: 464 405 ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " 465 406 "p=%u, X_calc=%u, X_recv=%u, X=%u\n", 466 - dccp_role(sk), sk, hctx->rtt, r_sample, 467 - hctx->s, hctx->p, hctx->x_calc, 468 - (unsigned)(hctx->x_recv >> 6), 469 - (unsigned)(hctx->x >> 6)); 470 - /* 471 - * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to 472 - * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This 473 - * can be useful if few connections share a link, avoiding that buffer 474 - * fill levels (RTT) oscillate as a result of frequent adjustments to X. 475 - * A useful presentation with background information is in 476 - * Joerg Widmer, "Equation-Based Congestion Control", 477 - * MSc Thesis, University of Mannheim, Germany, 2000 478 - * (sec. 
3.6.4), who calls this ISM ("Inter-packet Space Modulation"). 479 - */ 480 - if (do_osc_prev) { 481 - r_sample = tfrc_scaled_sqrt(r_sample); 482 - /* 483 - * The modulation can work in both ways: increase/decrease t_ipi 484 - * according to long-term increases/decreases of the RTT. The 485 - * former is a useful measure, since it works against queue 486 - * build-up. The latter temporarily increases the sending rate, 487 - * so that buffers fill up more quickly. This in turn causes 488 - * the RTT to increase, so that either later reduction becomes 489 - * necessary or the RTT stays at a very high level. Decreasing 490 - * t_ipi is therefore not supported. 491 - * Furthermore, during the initial slow-start phase the RTT 492 - * naturally increases, where using the algorithm would cause 493 - * delays. Hence it is disabled during the initial slow-start. 494 - */ 495 - if (r_sample > hctx->r_sqmean && hctx->p > 0) 496 - hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample, 497 - hctx->r_sqmean); 498 - hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI); 499 - /* update R_sqmean _after_ computing the modulation factor */ 500 - hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9); 501 - } 407 + dccp_role(sk), 408 + sk, hctx->ccid3hctx_rtt, r_sample, 409 + hctx->ccid3hctx_s, hctx->ccid3hctx_p, 410 + hctx->ccid3hctx_x_calc, 411 + (unsigned)(hctx->ccid3hctx_x_recv >> 6), 412 + (unsigned)(hctx->ccid3hctx_x >> 6)); 502 413 503 414 /* unschedule no feedback timer */ 504 - sk_stop_timer(sk, &hctx->no_feedback_timer); 415 + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); 505 416 506 417 /* 507 418 * As we have calculated new ipi, delta, t_nom it is possible ··· 488 453 * This can help avoid triggering the nofeedback timer too 489 454 * often ('spinning') on LANs with small RTTs. 
490 455 */ 491 - hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO * 492 - (USEC_PER_SEC / 1000))); 456 + hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, 457 + (CONFIG_IP_DCCP_CCID3_RTO * 458 + (USEC_PER_SEC / 1000))); 493 459 /* 494 460 * Schedule no feedback timer to expire in 495 461 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) 496 462 */ 497 - t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); 463 + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); 498 464 499 465 ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " 500 466 "expire in %lu jiffies (%luus)\n", 501 - dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); 467 + dccp_role(sk), 468 + sk, usecs_to_jiffies(t_nfb), t_nfb); 502 469 503 - sk_reset_timer(sk, &hctx->no_feedback_timer, 470 + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, 504 471 jiffies + usecs_to_jiffies(t_nfb)); 505 472 } 506 473 507 - static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, 508 - u8 option, u8 *optval, u8 optlen) 474 + static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, 475 + unsigned char len, u16 idx, 476 + unsigned char *value) 509 477 { 478 + int rc = 0; 479 + const struct dccp_sock *dp = dccp_sk(sk); 510 480 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 481 + struct ccid3_options_received *opt_recv; 511 482 __be32 opt_val; 512 483 513 - switch (option) { 514 - case TFRC_OPT_RECEIVE_RATE: 515 - case TFRC_OPT_LOSS_EVENT_RATE: 516 - /* Must be ignored on Data packets, cf. 
RFC 4342 8.3 and 8.5 */ 517 - if (packet_type == DCCP_PKT_DATA) 518 - break; 519 - if (unlikely(optlen != 4)) { 520 - DCCP_WARN("%s(%p), invalid len %d for %u\n", 521 - dccp_role(sk), sk, optlen, option); 522 - return -EINVAL; 523 - } 524 - opt_val = ntohl(get_unaligned((__be32 *)optval)); 484 + opt_recv = &hctx->ccid3hctx_options_received; 525 485 526 - if (option == TFRC_OPT_RECEIVE_RATE) { 527 - /* Receive Rate is kept in units of 64 bytes/second */ 528 - hctx->x_recv = opt_val; 529 - hctx->x_recv <<= 6; 530 - 531 - ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", 532 - dccp_role(sk), sk, opt_val); 533 - } else { 534 - /* Update the fixpoint Loss Event Rate fraction */ 535 - hctx->p = tfrc_invert_loss_event_rate(opt_val); 536 - 537 - ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", 538 - dccp_role(sk), sk, opt_val); 539 - } 486 + if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { 487 + opt_recv->ccid3or_seqno = dp->dccps_gsr; 488 + opt_recv->ccid3or_loss_event_rate = ~0; 489 + opt_recv->ccid3or_loss_intervals_idx = 0; 490 + opt_recv->ccid3or_loss_intervals_len = 0; 491 + opt_recv->ccid3or_receive_rate = 0; 540 492 } 541 - return 0; 493 + 494 + switch (option) { 495 + case TFRC_OPT_LOSS_EVENT_RATE: 496 + if (unlikely(len != 4)) { 497 + DCCP_WARN("%s(%p), invalid len %d " 498 + "for TFRC_OPT_LOSS_EVENT_RATE\n", 499 + dccp_role(sk), sk, len); 500 + rc = -EINVAL; 501 + } else { 502 + opt_val = get_unaligned((__be32 *)value); 503 + opt_recv->ccid3or_loss_event_rate = ntohl(opt_val); 504 + ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", 505 + dccp_role(sk), sk, 506 + opt_recv->ccid3or_loss_event_rate); 507 + } 508 + break; 509 + case TFRC_OPT_LOSS_INTERVALS: 510 + opt_recv->ccid3or_loss_intervals_idx = idx; 511 + opt_recv->ccid3or_loss_intervals_len = len; 512 + ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", 513 + dccp_role(sk), sk, 514 + opt_recv->ccid3or_loss_intervals_idx, 515 + opt_recv->ccid3or_loss_intervals_len); 516 + break; 517 + case TFRC_OPT_RECEIVE_RATE: 518 + 
if (unlikely(len != 4)) { 519 + DCCP_WARN("%s(%p), invalid len %d " 520 + "for TFRC_OPT_RECEIVE_RATE\n", 521 + dccp_role(sk), sk, len); 522 + rc = -EINVAL; 523 + } else { 524 + opt_val = get_unaligned((__be32 *)value); 525 + opt_recv->ccid3or_receive_rate = ntohl(opt_val); 526 + ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", 527 + dccp_role(sk), sk, 528 + opt_recv->ccid3or_receive_rate); 529 + } 530 + break; 531 + } 532 + 533 + return rc; 542 534 } 543 535 544 536 static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) 545 537 { 546 538 struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); 547 539 548 - hctx->hist = NULL; 549 - setup_timer(&hctx->no_feedback_timer, 550 - ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); 540 + hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; 541 + hctx->ccid3hctx_hist = NULL; 542 + setup_timer(&hctx->ccid3hctx_no_feedback_timer, 543 + ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); 544 + 551 545 return 0; 552 546 } 553 547 ··· 584 520 { 585 521 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 586 522 587 - sk_stop_timer(sk, &hctx->no_feedback_timer); 588 - tfrc_tx_hist_purge(&hctx->hist); 523 + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); 524 + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); 525 + 526 + tfrc_tx_hist_purge(&hctx->ccid3hctx_hist); 589 527 } 590 528 591 529 static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) 592 530 { 593 - info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto; 594 - info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt; 531 + struct ccid3_hc_tx_sock *hctx; 532 + 533 + /* Listen socks doesn't have a private CCID block */ 534 + if (sk->sk_state == DCCP_LISTEN) 535 + return; 536 + 537 + hctx = ccid3_hc_tx_sk(sk); 538 + info->tcpi_rto = hctx->ccid3hctx_t_rto; 539 + info->tcpi_rtt = hctx->ccid3hctx_rtt; 595 540 } 596 541 597 542 static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, 598 543 u32 __user *optval, int __user *optlen) 599 544 { 600 - const struct ccid3_hc_tx_sock 
*hctx = ccid3_hc_tx_sk(sk); 601 - struct tfrc_tx_info tfrc; 545 + const struct ccid3_hc_tx_sock *hctx; 602 546 const void *val; 603 547 548 + /* Listen socks doesn't have a private CCID block */ 549 + if (sk->sk_state == DCCP_LISTEN) 550 + return -EINVAL; 551 + 552 + hctx = ccid3_hc_tx_sk(sk); 604 553 switch (optname) { 605 554 case DCCP_SOCKOPT_CCID_TX_INFO: 606 - if (len < sizeof(tfrc)) 555 + if (len < sizeof(hctx->ccid3hctx_tfrc)) 607 556 return -EINVAL; 608 - tfrc.tfrctx_x = hctx->x; 609 - tfrc.tfrctx_x_recv = hctx->x_recv; 610 - tfrc.tfrctx_x_calc = hctx->x_calc; 611 - tfrc.tfrctx_rtt = hctx->rtt; 612 - tfrc.tfrctx_p = hctx->p; 613 - tfrc.tfrctx_rto = hctx->t_rto; 614 - tfrc.tfrctx_ipi = hctx->t_ipi; 615 - len = sizeof(tfrc); 616 - val = &tfrc; 557 + len = sizeof(hctx->ccid3hctx_tfrc); 558 + val = &hctx->ccid3hctx_tfrc; 617 559 break; 618 560 default: 619 561 return -ENOPROTOOPT; ··· 634 564 /* 635 565 * Receiver Half-Connection Routines 636 566 */ 567 + 568 + /* CCID3 feedback types */ 569 + enum ccid3_fback_type { 570 + CCID3_FBACK_NONE = 0, 571 + CCID3_FBACK_INITIAL, 572 + CCID3_FBACK_PERIODIC, 573 + CCID3_FBACK_PARAM_CHANGE 574 + }; 575 + 576 + #ifdef CONFIG_IP_DCCP_CCID3_DEBUG 577 + static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) 578 + { 579 + static char *ccid3_rx_state_names[] = { 580 + [TFRC_RSTATE_NO_DATA] = "NO_DATA", 581 + [TFRC_RSTATE_DATA] = "DATA", 582 + [TFRC_RSTATE_TERM] = "TERM", 583 + }; 584 + 585 + return ccid3_rx_state_names[state]; 586 + } 587 + #endif 588 + 589 + static void ccid3_hc_rx_set_state(struct sock *sk, 590 + enum ccid3_hc_rx_states state) 591 + { 592 + struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 593 + enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state; 594 + 595 + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", 596 + dccp_role(sk), sk, ccid3_rx_state_name(oldstate), 597 + ccid3_rx_state_name(state)); 598 + WARN_ON(state == oldstate); 599 + hcrx->ccid3hcrx_state = state; 600 + } 601 + 637 602 static 
void ccid3_hc_rx_send_feedback(struct sock *sk, 638 603 const struct sk_buff *skb, 639 604 enum ccid3_fback_type fbtype) 640 605 { 641 606 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 607 + struct dccp_sock *dp = dccp_sk(sk); 608 + ktime_t now; 609 + s64 delta = 0; 610 + 611 + if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM)) 612 + return; 613 + 614 + now = ktime_get_real(); 642 615 643 616 switch (fbtype) { 644 617 case CCID3_FBACK_INITIAL: 645 - hcrx->x_recv = 0; 646 - hcrx->p_inverse = ~0U; /* see RFC 4342, 8.5 */ 618 + hcrx->ccid3hcrx_x_recv = 0; 619 + hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */ 647 620 break; 648 621 case CCID3_FBACK_PARAM_CHANGE: 649 - if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) { 650 - /* 651 - * rfc3448bis-06, 6.3.1: First packet(s) lost or marked 652 - * FIXME: in rfc3448bis the receiver returns X_recv=0 653 - * here as it normally would in the first feedback packet. 654 - * However this is not possible yet, since the code still 655 - * uses RFC 3448, i.e. 656 - * If (p > 0) 657 - * Calculate X_calc using the TCP throughput equation. 658 - * X = max(min(X_calc, 2*X_recv), s/t_mbi); 659 - * would bring X down to s/t_mbi. That is why we return 660 - * X_recv according to rfc3448bis-06 for the moment. 661 - */ 662 - u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), 663 - rtt = tfrc_rx_hist_rtt(&hcrx->hist); 664 - 665 - hcrx->x_recv = scaled_div32(s, 2 * rtt); 666 - break; 667 - } 668 622 /* 669 623 * When parameters change (new loss or p > p_prev), we do not 670 624 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so 671 - * always check whether at least RTT time units were covered. 625 + * need to reuse the previous value of X_recv. However, when 626 + * X_recv was 0 (due to early loss), this would kill X down to 627 + * s/t_mbi (i.e. one packet in 64 seconds). 628 + * To avoid such drastic reduction, we approximate X_recv as 629 + * the number of bytes since last feedback. 
630 + * This is a safe fallback, since X is bounded above by X_calc. 672 631 */ 673 - hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); 674 - break; 632 + if (hcrx->ccid3hcrx_x_recv > 0) 633 + break; 634 + /* fall through */ 675 635 case CCID3_FBACK_PERIODIC: 676 - /* 677 - * Step (2) of rfc3448bis-06, 6.2: 678 - * - if no data packets have been received, just restart timer 679 - * - if data packets have been received, re-compute X_recv 680 - */ 681 - if (hcrx->hist.bytes_recvd == 0) 682 - goto prepare_for_next_time; 683 - hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); 636 + delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback); 637 + if (delta <= 0) 638 + DCCP_BUG("delta (%ld) <= 0", (long)delta); 639 + else 640 + hcrx->ccid3hcrx_x_recv = 641 + scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); 684 642 break; 685 643 default: 686 644 return; 687 645 } 688 646 689 - ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse); 647 + ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, 648 + hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv); 690 649 691 - dccp_sk(sk)->dccps_hc_rx_insert_options = 1; 650 + hcrx->ccid3hcrx_tstamp_last_feedback = now; 651 + hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval; 652 + hcrx->ccid3hcrx_bytes_recv = 0; 653 + 654 + dp->dccps_hc_rx_insert_options = 1; 692 655 dccp_send_ack(sk); 693 - 694 - prepare_for_next_time: 695 - tfrc_rx_hist_restart_byte_counter(&hcrx->hist); 696 - hcrx->last_counter = dccp_hdr(skb)->dccph_ccval; 697 - hcrx->feedback = fbtype; 698 656 } 699 657 700 658 static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) 701 659 { 702 - const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 660 + const struct ccid3_hc_rx_sock *hcrx; 703 661 __be32 x_recv, pinv; 704 662 705 663 if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) 706 664 return 0; 707 665 666 + hcrx = ccid3_hc_rx_sk(sk); 667 + 708 668 if 
(dccp_packet_without_ack(skb)) 709 669 return 0; 710 670 711 - x_recv = htonl(hcrx->x_recv); 712 - pinv = htonl(hcrx->p_inverse); 671 + x_recv = htonl(hcrx->ccid3hcrx_x_recv); 672 + pinv = htonl(hcrx->ccid3hcrx_pinv); 713 673 714 674 if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, 715 675 &pinv, sizeof(pinv)) || ··· 762 662 static u32 ccid3_first_li(struct sock *sk) 763 663 { 764 664 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 765 - u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), 766 - rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p; 665 + u32 x_recv, p, delta; 767 666 u64 fval; 768 667 769 - /* 770 - * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p 771 - * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p 772 - * is about 20.64%. This yields an interval length of 4.84 (rounded up). 773 - */ 774 - if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) 775 - return 5; 668 + if (hcrx->ccid3hcrx_rtt == 0) { 669 + DCCP_WARN("No RTT estimate available, using fallback RTT\n"); 670 + hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT; 671 + } 776 672 777 - x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); 778 - if (x_recv == 0) 779 - goto failed; 673 + delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback)); 674 + x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); 675 + if (x_recv == 0) { /* would also trigger divide-by-zero */ 676 + DCCP_WARN("X_recv==0\n"); 677 + if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) { 678 + DCCP_BUG("stored value of X_recv is zero"); 679 + return ~0U; 680 + } 681 + } 780 682 781 - fval = scaled_div32(scaled_div(s, rtt), x_recv); 683 + fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt); 684 + fval = scaled_div32(fval, x_recv); 782 685 p = tfrc_calc_x_reverse_lookup(fval); 783 686 784 687 ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " 785 688 "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); 786 689 787 - if (p > 0) 788 - return scaled_div(1, p); 789 - failed: 
790 - return UINT_MAX; 690 + return p == 0 ? ~0U : scaled_div(1, p); 791 691 } 792 692 793 693 static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) 794 694 { 795 695 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 696 + enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; 796 697 const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; 797 698 const bool is_data_packet = dccp_data_packet(skb); 699 + 700 + if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) { 701 + if (is_data_packet) { 702 + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; 703 + do_feedback = CCID3_FBACK_INITIAL; 704 + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); 705 + hcrx->ccid3hcrx_s = payload; 706 + /* 707 + * Not necessary to update ccid3hcrx_bytes_recv here, 708 + * since X_recv = 0 for the first feedback packet (cf. 709 + * RFC 3448, 6.3) -- gerrit 710 + */ 711 + } 712 + goto update_records; 713 + } 714 + 715 + if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb)) 716 + return; /* done receiving */ 717 + 718 + if (is_data_packet) { 719 + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; 720 + /* 721 + * Update moving-average of s and the sum of received payload bytes 722 + */ 723 + hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9); 724 + hcrx->ccid3hcrx_bytes_recv += payload; 725 + } 798 726 799 727 /* 800 728 * Perform loss detection and handle pending losses 801 729 */ 802 - if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist, 803 - skb, ndp, ccid3_first_li, sk)) 804 - ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE); 730 + if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist, 731 + skb, ndp, ccid3_first_li, sk)) { 732 + do_feedback = CCID3_FBACK_PARAM_CHANGE; 733 + goto done_receiving; 734 + } 735 + 736 + if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist)) 737 + return; /* done receiving */ 738 + 805 739 /* 806 - * Feedback for first non-empty data packet (RFC 3448, 6.3) 
740 + * Handle data packets: RTT sampling and monitoring p 807 741 */ 808 - else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet)) 809 - ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL); 742 + if (unlikely(!is_data_packet)) 743 + goto update_records; 744 + 745 + if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) { 746 + const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb); 747 + /* 748 + * Empty loss history: no loss so far, hence p stays 0. 749 + * Sample RTT values, since an RTT estimate is required for the 750 + * computation of p when the first loss occurs; RFC 3448, 6.3.1. 751 + */ 752 + if (sample != 0) 753 + hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9); 754 + 755 + } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) { 756 + /* 757 + * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean 758 + * has decreased (resp. p has increased), send feedback now. 759 + */ 760 + do_feedback = CCID3_FBACK_PARAM_CHANGE; 761 + } 762 + 810 763 /* 811 764 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 812 765 */ 813 - else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet && 814 - SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3) 815 - ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC); 766 + if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3) 767 + do_feedback = CCID3_FBACK_PERIODIC; 768 + 769 + update_records: 770 + tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp); 771 + 772 + done_receiving: 773 + if (do_feedback) 774 + ccid3_hc_rx_send_feedback(sk, skb, do_feedback); 816 775 } 817 776 818 777 static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) 819 778 { 820 779 struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); 821 780 822 - tfrc_lh_init(&hcrx->li_hist); 823 - return tfrc_rx_hist_init(&hcrx->hist, sk); 781 + hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; 782 + tfrc_lh_init(&hcrx->ccid3hcrx_li_hist); 
783 + return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist); 824 784 } 825 785 826 786 static void ccid3_hc_rx_exit(struct sock *sk) 827 787 { 828 788 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 829 789 830 - tfrc_rx_hist_purge(&hcrx->hist); 831 - tfrc_lh_cleanup(&hcrx->li_hist); 790 + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); 791 + 792 + tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist); 793 + tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist); 832 794 } 833 795 834 796 static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) 835 797 { 798 + const struct ccid3_hc_rx_sock *hcrx; 799 + 800 + /* Listen socks doesn't have a private CCID block */ 801 + if (sk->sk_state == DCCP_LISTEN) 802 + return; 803 + 804 + hcrx = ccid3_hc_rx_sk(sk); 805 + info->tcpi_ca_state = hcrx->ccid3hcrx_state; 836 806 info->tcpi_options |= TCPI_OPT_TIMESTAMPS; 837 - info->tcpi_rcv_rtt = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist); 807 + info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; 838 808 } 839 809 840 810 static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, 841 811 u32 __user *optval, int __user *optlen) 842 812 { 843 - const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 813 + const struct ccid3_hc_rx_sock *hcrx; 844 814 struct tfrc_rx_info rx_info; 845 815 const void *val; 846 816 817 + /* Listen socks doesn't have a private CCID block */ 818 + if (sk->sk_state == DCCP_LISTEN) 819 + return -EINVAL; 820 + 821 + hcrx = ccid3_hc_rx_sk(sk); 847 822 switch (optname) { 848 823 case DCCP_SOCKOPT_CCID_RX_INFO: 849 824 if (len < sizeof(rx_info)) 850 825 return -EINVAL; 851 - rx_info.tfrcrx_x_recv = hcrx->x_recv; 852 - rx_info.tfrcrx_rtt = tfrc_rx_hist_rtt(&hcrx->hist); 853 - rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hcrx->p_inverse); 826 + rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv; 827 + rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt; 828 + rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? 
~0U : 829 + scaled_div(1, hcrx->ccid3hcrx_pinv); 854 830 len = sizeof(rx_info); 855 831 val = &rx_info; 856 832 break; ··· 962 786 .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, 963 787 }; 964 788 965 - module_param(do_osc_prev, bool, 0644); 966 - MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)"); 967 - 968 789 #ifdef CONFIG_IP_DCCP_CCID3_DEBUG 969 790 module_param(ccid3_debug, bool, 0644); 970 791 MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); ··· 969 796 970 797 static __init int ccid3_module_init(void) 971 798 { 972 - struct timespec tp; 973 - 974 - /* 975 - * Without a fine-grained clock resolution, RTTs/X_recv are not sampled 976 - * correctly and feedback is sent either too early or too late. 977 - */ 978 - hrtimer_get_res(CLOCK_MONOTONIC, &tp); 979 - if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) { 980 - printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec" 981 - " resolution - check your clocksource.\n", __func__, 982 - tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION); 983 - return -ESOCKTNOSUPPORT; 984 - } 985 799 return ccid_register(&ccid3); 986 800 } 987 801 module_init(ccid3_module_init);
+85 -68
net/dccp/ccids/ccid3.h
··· 47 47 /* Two seconds as per RFC 3448 4.2 */ 48 48 #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) 49 49 50 - /* Maximum backoff interval t_mbi (RFC 3448, 4.3) */ 51 - #define TFRC_T_MBI (64 * USEC_PER_SEC) 50 + /* In usecs - half the scheduling granularity as per RFC3448 4.6 */ 51 + #define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) 52 52 53 - /* 54 - * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are 55 - * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. 56 - * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse 57 - * resolution of HZ < 500 means that the error is below one timer tick (t_gran) 58 - * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). 59 - */ 60 - #if (HZ >= 500) 61 - # define TFRC_T_DELTA USEC_PER_MSEC 62 - #else 63 - # define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) 64 - #warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC. 65 - #endif 53 + /* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ 54 + #define TFRC_T_MBI 64 66 55 67 56 enum ccid3_options { 68 57 TFRC_OPT_LOSS_EVENT_RATE = 192, ··· 59 70 TFRC_OPT_RECEIVE_RATE = 194, 60 71 }; 61 72 73 + struct ccid3_options_received { 74 + u64 ccid3or_seqno:48, 75 + ccid3or_loss_intervals_idx:16; 76 + u16 ccid3or_loss_intervals_len; 77 + u32 ccid3or_loss_event_rate; 78 + u32 ccid3or_receive_rate; 79 + }; 80 + 81 + /* TFRC sender states */ 82 + enum ccid3_hc_tx_states { 83 + TFRC_SSTATE_NO_SENT = 1, 84 + TFRC_SSTATE_NO_FBACK, 85 + TFRC_SSTATE_FBACK, 86 + TFRC_SSTATE_TERM, 87 + }; 88 + 62 89 /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket 63 90 * 64 - * @x - Current sending rate in 64 * bytes per second 65 - * @x_recv - Receive rate in 64 * bytes per second 66 - * @x_calc - Calculated rate in bytes per second 67 - * @rtt - Estimate of current round trip time in usecs 68 - * @r_sqmean - Estimate of long-term RTT (RFC 3448, 4.5) 69 - 
* @p - Current loss event rate (0-1) scaled by 1000000 70 - * @s - Packet size in bytes 71 - * @t_rto - Nofeedback Timer setting in usecs 72 - * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs 73 - * @feedback - Whether feedback has been received or not 74 - * @last_win_count - Last window counter sent 75 - * @t_last_win_count - Timestamp of earliest packet with 76 - * last_win_count value sent 77 - * @no_feedback_timer - Handle to no feedback timer 78 - * @t_ld - Time last doubled during slow start 79 - * @t_nom - Nominal send time of next packet 80 - * @hist - Packet history 91 + * @ccid3hctx_x - Current sending rate in 64 * bytes per second 92 + * @ccid3hctx_x_recv - Receive rate in 64 * bytes per second 93 + * @ccid3hctx_x_calc - Calculated rate in bytes per second 94 + * @ccid3hctx_rtt - Estimate of current round trip time in usecs 95 + * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000 96 + * @ccid3hctx_s - Packet size in bytes 97 + * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs 98 + * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs 99 + * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states 100 + * @ccid3hctx_last_win_count - Last window counter sent 101 + * @ccid3hctx_t_last_win_count - Timestamp of earliest packet 102 + * with last_win_count value sent 103 + * @ccid3hctx_no_feedback_timer - Handle to no feedback timer 104 + * @ccid3hctx_t_ld - Time last doubled during slow start 105 + * @ccid3hctx_t_nom - Nominal send time of next packet 106 + * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs 107 + * @ccid3hctx_hist - Packet history 108 + * @ccid3hctx_options_received - Parsed set of retrieved options 81 109 */ 82 110 struct ccid3_hc_tx_sock { 83 - u64 x; 84 - u64 x_recv; 85 - u32 x_calc; 86 - u32 rtt; 87 - u16 r_sqmean; 88 - u32 p; 89 - u32 t_rto; 90 - u32 t_ipi; 91 - u16 s; 92 - bool feedback:1; 93 - u8 last_win_count; 94 - ktime_t t_last_win_count; 95 - struct timer_list 
no_feedback_timer; 96 - ktime_t t_ld; 97 - ktime_t t_nom; 98 - struct tfrc_tx_hist_entry *hist; 111 + struct tfrc_tx_info ccid3hctx_tfrc; 112 + #define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x 113 + #define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv 114 + #define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc 115 + #define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt 116 + #define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p 117 + #define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto 118 + #define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi 119 + u16 ccid3hctx_s; 120 + enum ccid3_hc_tx_states ccid3hctx_state:8; 121 + u8 ccid3hctx_last_win_count; 122 + ktime_t ccid3hctx_t_last_win_count; 123 + struct timer_list ccid3hctx_no_feedback_timer; 124 + ktime_t ccid3hctx_t_ld; 125 + ktime_t ccid3hctx_t_nom; 126 + u32 ccid3hctx_delta; 127 + struct tfrc_tx_hist_entry *ccid3hctx_hist; 128 + struct ccid3_options_received ccid3hctx_options_received; 99 129 }; 100 130 101 131 static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) ··· 124 116 return hctx; 125 117 } 126 118 127 - 128 - enum ccid3_fback_type { 129 - CCID3_FBACK_NONE = 0, 130 - CCID3_FBACK_INITIAL, 131 - CCID3_FBACK_PERIODIC, 132 - CCID3_FBACK_PARAM_CHANGE 119 + /* TFRC receiver states */ 120 + enum ccid3_hc_rx_states { 121 + TFRC_RSTATE_NO_DATA = 1, 122 + TFRC_RSTATE_DATA, 123 + TFRC_RSTATE_TERM = 127, 133 124 }; 134 125 135 126 /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket 136 127 * 137 - * @last_counter - Tracks window counter (RFC 4342, 8.1) 138 - * @feedback - The type of the feedback last sent 139 - * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) 140 - * @tstamp_last_feedback - Time at which last feedback was sent 141 - * @hist - Packet history (loss detection + RTT sampling) 142 - * @li_hist - Loss Interval database 143 - * @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 
8.5) 128 + * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3) 129 + * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard) 130 + * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4) 131 + * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1) 132 + * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states 133 + * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes 134 + * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) 135 + * @ccid3hcrx_rtt - Receiver estimate of RTT 136 + * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent 137 + * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent 138 + * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling) 139 + * @ccid3hcrx_li_hist - Loss Interval database 140 + * @ccid3hcrx_s - Received packet size in bytes 141 + * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) 144 142 */ 145 143 struct ccid3_hc_rx_sock { 146 - u8 last_counter:4; 147 - enum ccid3_fback_type feedback:4; 148 - u32 x_recv; 149 - ktime_t tstamp_last_feedback; 150 - struct tfrc_rx_hist hist; 151 - struct tfrc_loss_hist li_hist; 152 - #define p_inverse li_hist.i_mean 144 + u8 ccid3hcrx_last_counter:4; 145 + enum ccid3_hc_rx_states ccid3hcrx_state:8; 146 + u32 ccid3hcrx_bytes_recv; 147 + u32 ccid3hcrx_x_recv; 148 + u32 ccid3hcrx_rtt; 149 + ktime_t ccid3hcrx_tstamp_last_feedback; 150 + struct tfrc_rx_hist ccid3hcrx_hist; 151 + struct tfrc_loss_hist ccid3hcrx_li_hist; 152 + u16 ccid3hcrx_s; 153 + #define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean 153 154 }; 154 155 155 156 static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
+14 -16
net/dccp/ccids/lib/loss_interval.c
··· 86 86 87 87 /** 88 88 * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 89 - * This updates I_mean as the sequence numbers increase. As a consequence, the 90 - * open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1) 91 - * decreases, and thus there is no need to send renewed feedback. 89 + * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev 92 90 */ 93 - void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) 91 + u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) 94 92 { 95 93 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); 94 + u32 old_i_mean = lh->i_mean; 96 95 s64 len; 97 96 98 97 if (cur == NULL) /* not initialised */ 99 - return; 100 - 101 - /* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */ 102 - if (!dccp_data_packet(skb)) 103 - return; 98 + return 0; 104 99 105 100 len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; 106 101 107 102 if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ 108 - return; 103 + return 0; 109 104 110 105 if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) 111 106 /* ··· 114 119 cur->li_is_closed = 1; 115 120 116 121 if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ 117 - return; 122 + return 0; 118 123 119 124 cur->li_length = len; 120 125 tfrc_lh_calc_i_mean(lh); 126 + 127 + return (lh->i_mean < old_i_mean); 121 128 } 129 + EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean); 122 130 123 131 /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ 124 132 static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, ··· 138 140 * @sk: Used by @calc_first_li in caller-specific way (subtyping) 139 141 * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. 
140 142 */ 141 - bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, 142 - u32 (*calc_first_li)(struct sock *), struct sock *sk) 143 + int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, 144 + u32 (*calc_first_li)(struct sock *), struct sock *sk) 143 145 { 144 146 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; 145 147 146 148 if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) 147 - return false; 149 + return 0; 148 150 149 151 new = tfrc_lh_demand_next(lh); 150 152 if (unlikely(new == NULL)) { 151 153 DCCP_CRIT("Cannot allocate/add loss record."); 152 - return false; 154 + return 0; 153 155 } 154 156 155 157 new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; ··· 167 169 168 170 tfrc_lh_calc_i_mean(lh); 169 171 } 170 - return true; 172 + return 1; 171 173 } 172 174 EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); 173 175
+2 -2
net/dccp/ccids/lib/loss_interval.h
··· 67 67 68 68 struct tfrc_rx_hist; 69 69 70 - extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, 70 + extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, 71 71 u32 (*first_li)(struct sock *), struct sock *); 72 - extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); 72 + extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); 73 73 extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); 74 74 75 75 #endif /* _DCCP_LI_HIST_ */
+137 -145
net/dccp/ccids/lib/packet_history.c
··· 40 40 #include "packet_history.h" 41 41 #include "../../dccp.h" 42 42 43 + /** 44 + * tfrc_tx_hist_entry - Simple singly-linked TX history list 45 + * @next: next oldest entry (LIFO order) 46 + * @seqno: sequence number of this entry 47 + * @stamp: send time of packet with sequence number @seqno 48 + */ 49 + struct tfrc_tx_hist_entry { 50 + struct tfrc_tx_hist_entry *next; 51 + u64 seqno; 52 + ktime_t stamp; 53 + }; 54 + 43 55 /* 44 56 * Transmitter History Routines 45 57 */ ··· 71 59 kmem_cache_destroy(tfrc_tx_hist_slab); 72 60 tfrc_tx_hist_slab = NULL; 73 61 } 62 + } 63 + 64 + static struct tfrc_tx_hist_entry * 65 + tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) 66 + { 67 + while (head != NULL && head->seqno != seqno) 68 + head = head->next; 69 + 70 + return head; 74 71 } 75 72 76 73 int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) ··· 110 89 *headp = NULL; 111 90 } 112 91 EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); 92 + 93 + u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, 94 + const ktime_t now) 95 + { 96 + u32 rtt = 0; 97 + struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno); 98 + 99 + if (packet != NULL) { 100 + rtt = ktime_us_delta(now, packet->stamp); 101 + /* 102 + * Garbage-collect older (irrelevant) entries: 103 + */ 104 + tfrc_tx_hist_purge(&packet->next); 105 + } 106 + 107 + return rtt; 108 + } 109 + EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt); 110 + 113 111 114 112 /* 115 113 * Receiver History Routines ··· 191 151 } 192 152 EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); 193 153 194 - 195 - static void __tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) 196 - { 197 - struct tfrc_rx_hist_entry *tmp = h->ring[a]; 198 - 199 - h->ring[a] = h->ring[b]; 200 - h->ring[b] = tmp; 201 - } 202 - 203 154 static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) 204 155 { 205 - __tfrc_rx_hist_swap(h, tfrc_rx_hist_index(h, a), 206 - tfrc_rx_hist_index(h, b)); 
207 - } 156 + const u8 idx_a = tfrc_rx_hist_index(h, a), 157 + idx_b = tfrc_rx_hist_index(h, b); 158 + struct tfrc_rx_hist_entry *tmp = h->ring[idx_a]; 208 159 209 - /** 210 - * tfrc_rx_hist_resume_rtt_sampling - Prepare RX history for RTT sampling 211 - * This is called after loss detection has finished, when the history entry 212 - * with the index of `loss_count' holds the highest-received sequence number. 213 - * RTT sampling requires this information at ring[0] (tfrc_rx_hist_sample_rtt). 214 - */ 215 - static inline void tfrc_rx_hist_resume_rtt_sampling(struct tfrc_rx_hist *h) 216 - { 217 - __tfrc_rx_hist_swap(h, 0, tfrc_rx_hist_index(h, h->loss_count)); 218 - h->loss_count = h->loss_start = 0; 160 + h->ring[idx_a] = h->ring[idx_b]; 161 + h->ring[idx_b] = tmp; 219 162 } 220 163 221 164 /* ··· 215 192 u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, 216 193 s1 = DCCP_SKB_CB(skb)->dccpd_seq; 217 194 218 - if (!dccp_loss_free(s0, s1, n1)) /* gap between S0 and S1 */ 195 + if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ 219 196 h->loss_count = 1; 197 + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); 198 + } 220 199 } 221 200 222 201 static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) ··· 240 215 241 216 if (dccp_loss_free(s2, s1, n1)) { 242 217 /* hole is filled: S0, S2, and S1 are consecutive */ 243 - tfrc_rx_hist_resume_rtt_sampling(h); 218 + h->loss_count = 0; 219 + h->loss_start = tfrc_rx_hist_index(h, 1); 244 220 } else 245 221 /* gap between S2 and S1: just update loss_prev */ 246 222 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); ··· 294 268 295 269 if (dccp_loss_free(s1, s2, n2)) { 296 270 /* entire hole filled by S0, S3, S1, S2 */ 297 - tfrc_rx_hist_resume_rtt_sampling(h); 271 + h->loss_start = tfrc_rx_hist_index(h, 2); 272 + h->loss_count = 0; 298 273 } else { 299 274 /* gap remains between S1 and S2 */ 300 275 h->loss_start = tfrc_rx_hist_index(h, 1); ··· 339 312 340 313 if 
(dccp_loss_free(s2, s3, n3)) { 341 314 /* no gap between S2 and S3: entire hole is filled */ 342 - tfrc_rx_hist_resume_rtt_sampling(h); 315 + h->loss_start = tfrc_rx_hist_index(h, 3); 316 + h->loss_count = 0; 343 317 } else { 344 318 /* gap between S2 and S3 */ 345 319 h->loss_start = tfrc_rx_hist_index(h, 2); ··· 354 326 } 355 327 356 328 /** 357 - * tfrc_rx_congestion_event - Loss detection and further processing 358 - * @h: The non-empty RX history object 359 - * @lh: Loss Intervals database to update 360 - * @skb: Currently received packet 361 - * @ndp: The NDP count belonging to @skb 362 - * @first_li: Caller-dependent computation of first loss interval in @lh 363 - * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) 329 + * tfrc_rx_handle_loss - Loss detection and further processing 330 + * @h: The non-empty RX history object 331 + * @lh: Loss Intervals database to update 332 + * @skb: Currently received packet 333 + * @ndp: The NDP count belonging to @skb 334 + * @calc_first_li: Caller-dependent computation of first loss interval in @lh 335 + * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) 364 336 * Chooses action according to pending loss, updates LI database when a new 365 337 * loss was detected, and does required post-processing. Returns 1 when caller 366 338 * should send feedback, 0 otherwise. ··· 368 340 * records accordingly, the caller should not perform any more RX history 369 341 * operations when loss_count is greater than 0 after calling this function. 
370 342 */ 371 - bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, 372 - struct tfrc_loss_hist *lh, 373 - struct sk_buff *skb, const u64 ndp, 374 - u32 (*first_li)(struct sock *), struct sock *sk) 343 + int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, 344 + struct tfrc_loss_hist *lh, 345 + struct sk_buff *skb, const u64 ndp, 346 + u32 (*calc_first_li)(struct sock *), struct sock *sk) 375 347 { 376 - bool new_event = false; 377 - 378 - if (tfrc_rx_hist_duplicate(h, skb)) 379 - return 0; 348 + int is_new_loss = 0; 380 349 381 350 if (h->loss_count == 0) { 382 351 __do_track_loss(h, skb, ndp); 383 - tfrc_rx_hist_sample_rtt(h, skb); 384 - tfrc_rx_hist_add_packet(h, skb, ndp); 385 352 } else if (h->loss_count == 1) { 386 353 __one_after_loss(h, skb, ndp); 387 354 } else if (h->loss_count != 2) { ··· 385 362 /* 386 363 * Update Loss Interval database and recycle RX records 387 364 */ 388 - new_event = tfrc_lh_interval_add(lh, h, first_li, sk); 365 + is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); 389 366 __three_after_loss(h); 390 367 } 391 - 392 - /* 393 - * Update moving-average of `s' and the sum of received payload bytes. 
394 - */ 395 - if (dccp_data_packet(skb)) { 396 - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; 397 - 398 - h->packet_size = tfrc_ewma(h->packet_size, payload, 9); 399 - h->bytes_recvd += payload; 400 - } 401 - 402 - /* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */ 403 - if (!new_event) 404 - tfrc_lh_update_i_mean(lh, skb); 405 - 406 - return new_event; 368 + return is_new_loss; 407 369 } 408 - EXPORT_SYMBOL_GPL(tfrc_rx_congestion_event); 370 + EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); 409 371 410 - /* Compute the sending rate X_recv measured between feedback intervals */ 411 - u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv) 372 + int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) 412 373 { 413 - u64 bytes = h->bytes_recvd, last_rtt = h->rtt_estimate; 414 - s64 delta = ktime_to_us(net_timedelta(h->bytes_start)); 374 + int i; 415 375 416 - WARN_ON(delta <= 0); 417 - /* 418 - * Ensure that the sampling interval for X_recv is at least one RTT, 419 - * by extending the sampling interval backwards in time, over the last 420 - * R_(m-1) seconds, as per rfc3448bis-06, 6.2. 421 - * To reduce noise (e.g. when the RTT changes often), this is only 422 - * done when delta is smaller than RTT/2. 423 - */ 424 - if (last_x_recv > 0 && delta < last_rtt/2) { 425 - tfrc_pr_debug("delta < RTT ==> %ld us < %u us\n", 426 - (long)delta, (unsigned)last_rtt); 427 - 428 - delta = (bytes ? 
delta : 0) + last_rtt; 429 - bytes += div_u64((u64)last_x_recv * last_rtt, USEC_PER_SEC); 376 + for (i = 0; i <= TFRC_NDUPACK; i++) { 377 + h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); 378 + if (h->ring[i] == NULL) 379 + goto out_free; 430 380 } 431 381 432 - if (unlikely(bytes == 0)) { 433 - DCCP_WARN("X_recv == 0, using old value of %u\n", last_x_recv); 434 - return last_x_recv; 382 + h->loss_count = h->loss_start = 0; 383 + return 0; 384 + 385 + out_free: 386 + while (i-- != 0) { 387 + kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); 388 + h->ring[i] = NULL; 435 389 } 436 - return scaled_div32(bytes, delta); 390 + return -ENOBUFS; 437 391 } 438 - EXPORT_SYMBOL_GPL(tfrc_rx_hist_x_recv); 392 + EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc); 439 393 440 394 void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) 441 395 { ··· 426 426 } 427 427 EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); 428 428 429 - static int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) 429 + /** 430 + * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against 431 + */ 432 + static inline struct tfrc_rx_hist_entry * 433 + tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) 430 434 { 431 - int i; 432 - 433 - memset(h, 0, sizeof(*h)); 434 - 435 - for (i = 0; i <= TFRC_NDUPACK; i++) { 436 - h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); 437 - if (h->ring[i] == NULL) { 438 - tfrc_rx_hist_purge(h); 439 - return -ENOBUFS; 440 - } 441 - } 442 - return 0; 435 + return h->ring[0]; 443 436 } 444 437 445 - int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk) 438 + /** 439 + * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry 440 + */ 441 + static inline struct tfrc_rx_hist_entry * 442 + tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) 446 443 { 447 - if (tfrc_rx_hist_alloc(h)) 448 - return -ENOBUFS; 449 - /* 450 - * Initialise first entry with GSR to start loss detection as early as 451 - * possible. 
Code using this must not use any other fields. The entry 452 - * will be overwritten once the CCID updates its received packets. 453 - */ 454 - tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno = dccp_sk(sk)->dccps_gsr; 455 - return 0; 444 + return h->ring[h->rtt_sample_prev]; 456 445 } 457 - EXPORT_SYMBOL_GPL(tfrc_rx_hist_init); 458 446 459 447 /** 460 448 * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal 461 - * Based on ideas presented in RFC 4342, 8.1. This function expects that no loss 462 - * is pending and uses the following history entries (via rtt_sample_prev): 463 - * - h->ring[0] contains the most recent history entry prior to @skb; 464 - * - h->ring[1] is an unused `dummy' entry when the current difference is 0; 449 + * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able 450 + * to compute a sample with given data - calling function should check this. 465 451 */ 466 - void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) 452 + u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) 467 453 { 468 - struct tfrc_rx_hist_entry *last = h->ring[0]; 469 - u32 sample, delta_v; 454 + u32 sample = 0, 455 + delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, 456 + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); 470 457 471 - /* 472 - * When not to sample: 473 - * - on non-data packets 474 - * (RFC 4342, 8.1: CCVal only fully defined for data packets); 475 - * - when no data packets have been received yet 476 - * (FIXME: using sampled packet size as indicator here); 477 - * - as long as there are gaps in the sequence space (pending loss). 
478 - */ 479 - if (!dccp_data_packet(skb) || h->packet_size == 0 || 480 - tfrc_rx_hist_loss_pending(h)) 481 - return; 458 + if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ 459 + if (h->rtt_sample_prev == 2) { /* previous candidate stored */ 460 + sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, 461 + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); 462 + if (sample) 463 + sample = 4 / sample * 464 + ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, 465 + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); 466 + else /* 467 + * FIXME: This condition is in principle not 468 + * possible but occurs when CCID is used for 469 + * two-way data traffic. I have tried to trace 470 + * it, but the cause does not seem to be here. 471 + */ 472 + DCCP_BUG("please report to dccp@vger.kernel.org" 473 + " => prev = %u, last = %u", 474 + tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, 475 + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); 476 + } else if (delta_v < 1) { 477 + h->rtt_sample_prev = 1; 478 + goto keep_ref_for_next_time; 479 + } 482 480 483 - h->rtt_sample_prev = 0; /* reset previous candidate */ 484 - 485 - delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, last->tfrchrx_ccval); 486 - if (delta_v == 0) { /* less than RTT/4 difference */ 487 - h->rtt_sample_prev = 1; 488 - return; 481 + } else if (delta_v == 4) /* optimal match */ 482 + sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); 483 + else { /* suboptimal match */ 484 + h->rtt_sample_prev = 2; 485 + goto keep_ref_for_next_time; 489 486 } 490 - sample = dccp_sane_rtt(ktime_to_us(net_timedelta(last->tfrchrx_tstamp))); 491 487 492 - if (delta_v <= 4) /* between RTT/4 and RTT */ 493 - sample *= 4 / delta_v; 494 - else if (!(sample < h->rtt_estimate && sample > h->rtt_estimate/2)) 495 - /* 496 - * Optimisation: CCVal difference is greater than 1 RTT, yet the 497 - * sample is less than the local RTT estimate; which means that 498 - * the RTT estimate is too high. 
499 - * To avoid noise, it is not done if the sample is below RTT/2. 500 - */ 501 - return; 488 + if (unlikely(sample > DCCP_SANE_RTT_MAX)) { 489 + DCCP_WARN("RTT sample %u too large, using max\n", sample); 490 + sample = DCCP_SANE_RTT_MAX; 491 + } 502 492 503 - /* Use a lower weight than usual to increase responsiveness */ 504 - h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 5); 493 + h->rtt_sample_prev = 0; /* use current entry as next reference */ 494 + keep_ref_for_next_time: 495 + 496 + return sample; 505 497 } 506 498 EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt);
+11 -67
net/dccp/ccids/lib/packet_history.h
··· 40 40 #include <linux/slab.h> 41 41 #include "tfrc.h" 42 42 43 - /** 44 - * tfrc_tx_hist_entry - Simple singly-linked TX history list 45 - * @next: next oldest entry (LIFO order) 46 - * @seqno: sequence number of this entry 47 - * @stamp: send time of packet with sequence number @seqno 48 - */ 49 - struct tfrc_tx_hist_entry { 50 - struct tfrc_tx_hist_entry *next; 51 - u64 seqno; 52 - ktime_t stamp; 53 - }; 54 - 55 - static inline struct tfrc_tx_hist_entry * 56 - tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) 57 - { 58 - while (head != NULL && head->seqno != seqno) 59 - head = head->next; 60 - return head; 61 - } 43 + struct tfrc_tx_hist_entry; 62 44 63 45 extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); 64 46 extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); 47 + extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, 48 + const u64 seqno, const ktime_t now); 65 49 66 50 /* Subtraction a-b modulo-16, respects circular wrap-around */ 67 51 #define SUB16(a, b) (((a) + 16 - (b)) & 0xF) ··· 75 91 * @loss_count: Number of entries in circular history 76 92 * @loss_start: Movable index (for loss detection) 77 93 * @rtt_sample_prev: Used during RTT sampling, points to candidate entry 78 - * @rtt_estimate: Receiver RTT estimate 79 - * @packet_size: Packet size in bytes (as per RFC 3448, 3.1) 80 - * @bytes_recvd: Number of bytes received since @bytes_start 81 - * @bytes_start: Start time for counting @bytes_recvd 82 94 */ 83 95 struct tfrc_rx_hist { 84 96 struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; 85 97 u8 loss_count:2, 86 98 loss_start:2; 87 - /* Receiver RTT sampling */ 88 99 #define rtt_sample_prev loss_start 89 - u32 rtt_estimate; 90 - /* Receiver sampling of application payload lengths */ 91 - u32 packet_size, 92 - bytes_recvd; 93 - ktime_t bytes_start; 94 100 }; 95 101 96 102 /** ··· 124 150 return h->loss_count > 0; 125 151 } 126 152 127 - /* 128 - * Accessor functions to retrieve 
parameters sampled by the RX history 129 - */ 130 - static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h) 131 - { 132 - if (h->packet_size == 0) { 133 - DCCP_WARN("No sample for s, using fallback\n"); 134 - return TCP_MIN_RCVMSS; 135 - } 136 - return h->packet_size; 137 - 138 - } 139 - static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h) 140 - { 141 - if (h->rtt_estimate == 0) { 142 - DCCP_WARN("No RTT estimate available, using fallback RTT\n"); 143 - return DCCP_FALLBACK_RTT; 144 - } 145 - return h->rtt_estimate; 146 - } 147 - 148 - static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h) 149 - { 150 - h->bytes_recvd = 0; 151 - h->bytes_start = ktime_get_real(); 152 - } 153 - 154 - extern u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv); 155 - 156 - 157 153 extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, 158 154 const struct sk_buff *skb, const u64 ndp); 159 155 160 156 extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); 161 157 162 158 struct tfrc_loss_hist; 163 - extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, 164 - struct tfrc_loss_hist *lh, 165 - struct sk_buff *skb, const u64 ndp, 166 - u32 (*first_li)(struct sock *sk), 167 - struct sock *sk); 168 - extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, 169 - const struct sk_buff *skb); 170 - extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk); 159 + extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, 160 + struct tfrc_loss_hist *lh, 161 + struct sk_buff *skb, const u64 ndp, 162 + u32 (*first_li)(struct sock *sk), 163 + struct sock *sk); 164 + extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, 165 + const struct sk_buff *skb); 166 + extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); 171 167 extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); 172 168 173 169 #endif /* _DCCP_PKT_HIST_ */
-16
net/dccp/ccids/lib/tfrc.h
··· 48 48 } 49 49 50 50 /** 51 - * tfrc_scaled_sqrt - Compute scaled integer sqrt(x) for 0 < x < 2^22-1 52 - * Uses scaling to improve accuracy of the integer approximation of sqrt(). The 53 - * scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for 54 - * clamped RTT samples (dccp_sample_rtt). 55 - * Should best be used for expressions of type sqrt(x)/sqrt(y), since then the 56 - * scaling factor is neutralised. For this purpose, it avoids returning zero. 57 - */ 58 - static inline u16 tfrc_scaled_sqrt(const u32 sample) 59 - { 60 - const unsigned long non_zero_sample = sample ? : 1; 61 - 62 - return int_sqrt(non_zero_sample << 10); 63 - } 64 - 65 - /** 66 51 * tfrc_ewma - Exponentially weighted moving average 67 52 * @weight: Weight to be used as damping factor, in units of 1/10 68 53 */ ··· 58 73 59 74 extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); 60 75 extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); 61 - extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); 62 76 63 77 extern int tfrc_tx_packet_history_init(void); 64 78 extern void tfrc_tx_packet_history_exit(void);
+4 -25
net/dccp/ccids/lib/tfrc_equation.c
··· 632 632 633 633 if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ 634 634 if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ 635 - /* 636 - * In the congestion-avoidance phase p decays towards 0 637 - * when there are no further losses, so this case is 638 - * natural. Truncating to p_min = 0.01% means that the 639 - * maximum achievable throughput is limited to about 640 - * X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g. 641 - * with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps. 642 - */ 643 - tfrc_pr_debug("Value of p (%d) below resolution. " 644 - "Substituting %d\n", p, TFRC_SMALLEST_P); 635 + DCCP_WARN("Value of p (%d) below resolution. " 636 + "Substituting %d\n", p, TFRC_SMALLEST_P); 645 637 index = 0; 646 638 } else /* 0.0001 <= p <= 0.05 */ 647 639 index = p/TFRC_SMALLEST_P - 1; ··· 658 666 result = scaled_div(s, R); 659 667 return scaled_div32(result, f); 660 668 } 669 + 661 670 EXPORT_SYMBOL_GPL(tfrc_calc_x); 662 671 663 672 /** ··· 693 700 index = tfrc_binsearch(fvalue, 0); 694 701 return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; 695 702 } 696 - EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); 697 703 698 - /** 699 - * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% 700 - * When @loss_event_rate is large, there is a chance that p is truncated to 0. 701 - * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. 702 - */ 703 - u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) 704 - { 705 - if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ 706 - return 0; 707 - if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ 708 - return 1000000; 709 - return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); 710 - } 711 - EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate); 704 + EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
+27 -77
net/dccp/dccp.h
··· 42 42 extern int dccp_debug; 43 43 #define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) 44 44 #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) 45 - #define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) 46 45 #else 47 46 #define dccp_pr_debug(format, a...) 48 47 #define dccp_pr_debug_cat(format, a...) 49 - #define dccp_debug(format, a...) 50 48 #endif 51 49 52 50 extern struct inet_hashinfo dccp_hashinfo; ··· 61 63 * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields 62 64 * Hence a safe upper bound for the maximum option length is 1020-28 = 992 63 65 */ 64 - #define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) 66 + #define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int)) 65 67 #define DCCP_MAX_PACKET_HDR 28 66 68 #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) 67 69 #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) 68 - 69 - /* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ 70 - #define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) 71 70 72 71 #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT 73 72 * state, about 60 seconds */ ··· 81 86 */ 82 87 #define DCCP_RTO_MAX ((unsigned)(64 * HZ)) 83 88 84 - /* DCCP base time resolution - 10 microseconds (RFC 4340, 13.1 ... 
13.3) */ 85 - #define DCCP_TIME_RESOLUTION 10 86 - 87 89 /* 88 90 * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 89 91 */ 90 - #define DCCP_SANE_RTT_MIN (10 * DCCP_TIME_RESOLUTION) 92 + #define DCCP_SANE_RTT_MIN 100 91 93 #define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) 92 94 #define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) 93 95 ··· 95 103 extern int sysctl_dccp_request_retries; 96 104 extern int sysctl_dccp_retries1; 97 105 extern int sysctl_dccp_retries2; 106 + extern int sysctl_dccp_feat_sequence_window; 107 + extern int sysctl_dccp_feat_rx_ccid; 108 + extern int sysctl_dccp_feat_tx_ccid; 109 + extern int sysctl_dccp_feat_ack_ratio; 110 + extern int sysctl_dccp_feat_send_ack_vector; 111 + extern int sysctl_dccp_feat_send_ndp_count; 98 112 extern int sysctl_dccp_tx_qlen; 99 113 extern int sysctl_dccp_sync_ratelimit; 100 114 ··· 235 237 extern void dccp_send_sync(struct sock *sk, const u64 seq, 236 238 const enum dccp_pkt_type pkt_type); 237 239 238 - /* 239 - * TX Packet Dequeueing Interface 240 - */ 241 - extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); 242 - extern bool dccp_qpolicy_full(struct sock *sk); 243 - extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); 244 - extern struct sk_buff *dccp_qpolicy_top(struct sock *sk); 245 - extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk); 246 - extern bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); 247 - 248 - /* 249 - * TX Packet Output and TX Timers 250 - */ 251 - extern void dccp_write_xmit(struct sock *sk); 240 + extern void dccp_write_xmit(struct sock *sk, int block); 252 241 extern void dccp_write_space(struct sock *sk); 253 - extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); 254 242 255 243 extern void dccp_init_xmit_timers(struct sock *sk); 256 244 static inline void dccp_clear_xmit_timers(struct sock *sk) ··· 252 268 extern void dccp_set_state(struct sock *sk, const int state); 253 269 extern void 
dccp_done(struct sock *sk); 254 270 255 - extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, 256 - struct sk_buff const *skb); 271 + extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb); 257 272 258 273 extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); 259 274 ··· 317 334 extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); 318 335 extern void dccp_send_close(struct sock *sk, const int active); 319 336 extern int dccp_invalid_packet(struct sk_buff *skb); 320 - 321 - static inline u32 dccp_sane_rtt(long usec_sample) 322 - { 323 - if (unlikely(usec_sample <= 0 || usec_sample > DCCP_SANE_RTT_MAX)) 324 - DCCP_WARN("RTT sample %ld out of bounds!\n", usec_sample); 325 - return clamp_val(usec_sample, DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX); 326 - } 327 - extern u32 dccp_sample_rtt(struct sock *sk, long delta); 337 + extern u32 dccp_sample_rtt(struct sock *sk, long delta); 328 338 329 339 static inline int dccp_bad_service_code(const struct sock *sk, 330 340 const __be32 service) ··· 411 435 static inline void dccp_update_gsr(struct sock *sk, u64 seq) 412 436 { 413 437 struct dccp_sock *dp = dccp_sk(sk); 438 + const struct dccp_minisock *dmsk = dccp_msk(sk); 414 439 415 440 dp->dccps_gsr = seq; 416 - /* Sequence validity window depends on remote Sequence Window (7.5.1) */ 417 - dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); 418 - /* 419 - * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, 420 - * 7.5.1 we perform this check beyond the initial handshake: W/W' are 421 - * always > 32, so for the first W/W' packets in the lifetime of a 422 - * connection we always have to adjust SWL. 
423 - * A second reason why we are doing this is that the window depends on 424 - * the feature-remote value of Sequence Window: nothing stops the peer 425 - * from updating this value while we are busy adjusting SWL for the 426 - * first W packets (we would have to count from scratch again then). 427 - * Therefore it is safer to always make sure that the Sequence Window 428 - * is not artificially extended by a peer who grows SWL downwards by 429 - * continually updating the feature-remote Sequence-Window. 430 - * If sequence numbers wrap it is bad luck. But that will take a while 431 - * (48 bit), and this measure prevents Sequence-number attacks. 432 - */ 433 - if (before48(dp->dccps_swl, dp->dccps_isr)) 434 - dp->dccps_swl = dp->dccps_isr; 435 - dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); 441 + dccp_set_seqno(&dp->dccps_swl, 442 + dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4)); 443 + dccp_set_seqno(&dp->dccps_swh, 444 + dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4); 436 445 } 437 446 438 447 static inline void dccp_update_gss(struct sock *sk, u64 seq) 439 448 { 440 449 struct dccp_sock *dp = dccp_sk(sk); 441 450 442 - dp->dccps_gss = seq; 443 - /* Ack validity window depends on local Sequence Window value (7.5.1) */ 444 - dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); 445 - /* Adjust AWL so that it is not below ISS - see comment above for SWL */ 446 - if (before48(dp->dccps_awl, dp->dccps_iss)) 447 - dp->dccps_awl = dp->dccps_iss; 448 - dp->dccps_awh = dp->dccps_gss; 449 - } 450 - 451 - static inline int dccp_ackvec_pending(const struct sock *sk) 452 - { 453 - return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && 454 - !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); 451 + dp->dccps_awh = dp->dccps_gss = seq; 452 + dccp_set_seqno(&dp->dccps_awl, 453 + (dp->dccps_gss - 454 + dccp_msk(sk)->dccpms_sequence_window + 1)); 455 455 } 456 456 457 457 static inline int dccp_ack_pending(const struct 
sock *sk) 458 458 { 459 - return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); 459 + const struct dccp_sock *dp = dccp_sk(sk); 460 + return dp->dccps_timestamp_echo != 0 || 461 + #ifdef CONFIG_IP_DCCP_ACKVEC 462 + (dccp_msk(sk)->dccpms_send_ack_vector && 463 + dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || 464 + #endif 465 + inet_csk_ack_scheduled(sk); 460 466 } 461 - 462 - extern int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); 463 - extern int dccp_feat_finalise_settings(struct dccp_sock *dp); 464 - extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); 465 - extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, 466 - struct sk_buff *skb); 467 - extern int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); 468 - extern void dccp_feat_list_purge(struct list_head *fn_list); 469 467 470 468 extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); 471 469 extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*);
+1 -1
net/dccp/diag.c
··· 29 29 info->tcpi_backoff = icsk->icsk_backoff; 30 30 info->tcpi_pmtu = icsk->icsk_pmtu_cookie; 31 31 32 - if (dp->dccps_hc_rx_ackvec != NULL) 32 + if (dccp_msk(sk)->dccpms_send_ack_vector) 33 33 info->tcpi_options |= TCPI_OPT_SACK; 34 34 35 35 ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
+555 -1438
net/dccp/feat.c
··· 1 1 /* 2 2 * net/dccp/feat.c 3 3 * 4 - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) 5 - * 6 - * Copyright (c) 2008 The University of Aberdeen, Scotland, UK 7 - * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk> 8 - * Rewrote from scratch, some bits from earlier code by 9 - * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> 10 - * 4 + * An implementation of the DCCP protocol 5 + * Andrea Bittau <a.bittau@cs.ucl.ac.uk> 11 6 * 12 7 * ASSUMPTIONS 13 8 * ----------- 14 - * o Feature negotiation is coordinated with connection setup (as in TCP), wild 15 - * changes of parameters of an established connection are not supported. 16 - * o Changing NN values (Ack Ratio only) is supported in state OPEN/PARTOPEN. 17 9 * o All currently known SP features have 1-byte quantities. If in the future 18 10 * extensions of RFCs 4340..42 define features with item lengths larger than 19 11 * one byte, a feature-specific extension of the code will be required. ··· 15 23 * as published by the Free Software Foundation; either version 16 24 * 2 of the License, or (at your option) any later version. 17 25 */ 26 + 18 27 #include <linux/module.h> 28 + 19 29 #include "ccid.h" 20 30 #include "feat.h" 21 31 22 - /* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ 23 - unsigned long sysctl_dccp_sequence_window __read_mostly = 100; 24 - int sysctl_dccp_rx_ccid __read_mostly = 2, 25 - sysctl_dccp_tx_ccid __read_mostly = 2; 32 + #define DCCP_FEAT_SP_NOAGREE (-123) 26 33 27 - /* 28 - * Feature activation handlers. 29 - * 30 - * These all use an u64 argument, to provide enough room for NN/SP features. At 31 - * this stage the negotiated values have been checked to be within their range. 
32 - */ 33 - static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) 34 + int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, 35 + u8 *val, u8 len, gfp_t gfp) 36 + { 37 + struct dccp_opt_pend *opt; 38 + 39 + dccp_feat_debug(type, feature, *val); 40 + 41 + if (len > 3) { 42 + DCCP_WARN("invalid length %d\n", len); 43 + return -EINVAL; 44 + } 45 + /* XXX add further sanity checks */ 46 + 47 + /* check if that feature is already being negotiated */ 48 + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { 49 + /* ok we found a negotiation for this option already */ 50 + if (opt->dccpop_feat == feature && opt->dccpop_type == type) { 51 + dccp_pr_debug("Replacing old\n"); 52 + /* replace */ 53 + BUG_ON(opt->dccpop_val == NULL); 54 + kfree(opt->dccpop_val); 55 + opt->dccpop_val = val; 56 + opt->dccpop_len = len; 57 + opt->dccpop_conf = 0; 58 + return 0; 59 + } 60 + } 61 + 62 + /* negotiation for a new feature */ 63 + opt = kmalloc(sizeof(*opt), gfp); 64 + if (opt == NULL) 65 + return -ENOMEM; 66 + 67 + opt->dccpop_type = type; 68 + opt->dccpop_feat = feature; 69 + opt->dccpop_len = len; 70 + opt->dccpop_val = val; 71 + opt->dccpop_conf = 0; 72 + opt->dccpop_sc = NULL; 73 + 74 + BUG_ON(opt->dccpop_val == NULL); 75 + 76 + list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending); 77 + return 0; 78 + } 79 + 80 + EXPORT_SYMBOL_GPL(dccp_feat_change); 81 + 82 + static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr) 34 83 { 35 84 struct dccp_sock *dp = dccp_sk(sk); 36 - struct ccid *new_ccid = ccid_new(ccid, sk, rx, gfp_any()); 85 + struct dccp_minisock *dmsk = dccp_msk(sk); 86 + /* figure out if we are changing our CCID or the peer's */ 87 + const int rx = type == DCCPO_CHANGE_R; 88 + const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid; 89 + struct ccid *new_ccid; 37 90 91 + /* Check if nothing is being changed. 
*/ 92 + if (ccid_nr == new_ccid_nr) 93 + return 0; 94 + 95 + new_ccid = ccid_new(new_ccid_nr, sk, rx, GFP_ATOMIC); 38 96 if (new_ccid == NULL) 39 97 return -ENOMEM; 40 98 41 99 if (rx) { 42 100 ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); 43 101 dp->dccps_hc_rx_ccid = new_ccid; 102 + dmsk->dccpms_rx_ccid = new_ccid_nr; 44 103 } else { 45 104 ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); 46 105 dp->dccps_hc_tx_ccid = new_ccid; 106 + dmsk->dccpms_tx_ccid = new_ccid_nr; 107 + } 108 + 109 + return 0; 110 + } 111 + 112 + static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val) 113 + { 114 + dccp_feat_debug(type, feat, val); 115 + 116 + switch (feat) { 117 + case DCCPF_CCID: 118 + return dccp_feat_update_ccid(sk, type, val); 119 + default: 120 + dccp_pr_debug("UNIMPLEMENTED: %s(%d, ...)\n", 121 + dccp_feat_typename(type), feat); 122 + break; 47 123 } 48 124 return 0; 49 125 } 50 126 51 - static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) 127 + static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt, 128 + u8 *rpref, u8 rlen) 52 129 { 53 130 struct dccp_sock *dp = dccp_sk(sk); 131 + u8 *spref, slen, *res = NULL; 132 + int i, j, rc, agree = 1; 54 133 55 - if (rx) { 56 - dp->dccps_r_seq_win = seq_win; 57 - /* propagate changes to update SWL/SWH */ 58 - dccp_update_gsr(sk, dp->dccps_gsr); 134 + BUG_ON(rpref == NULL); 135 + 136 + /* check if we are the black sheep */ 137 + if (dp->dccps_role == DCCP_ROLE_CLIENT) { 138 + spref = rpref; 139 + slen = rlen; 140 + rpref = opt->dccpop_val; 141 + rlen = opt->dccpop_len; 59 142 } else { 60 - dp->dccps_l_seq_win = seq_win; 61 - /* propagate changes to update AWL */ 62 - dccp_update_gss(sk, dp->dccps_gss); 143 + spref = opt->dccpop_val; 144 + slen = opt->dccpop_len; 63 145 } 64 - return 0; 65 - } 66 - 67 - static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) 68 - { 69 - #ifndef __CCID2_COPES_GRACEFULLY_WITH_DYNAMIC_ACK_RATIO_UPDATES__ 70 146 /* 71 - * FIXME: This is 
required until several problems in the CCID-2 code are 72 - * resolved. The CCID-2 code currently does not cope well; using dynamic 73 - * Ack Ratios greater than 1 caused instabilities. These were manifest 74 - * in hangups and long RTO timeouts (1...3 seconds). Until this has been 75 - * stabilised, it is safer not to activate dynamic Ack Ratio changes. 147 + * Now we have server preference list in spref and client preference in 148 + * rpref 76 149 */ 77 - dccp_pr_debug("Not changing %s Ack Ratio from 1 to %u\n", 78 - rx ? "RX" : "TX", (u16)ratio); 79 - ratio = 1; 80 - #endif 81 - if (rx) 82 - dccp_sk(sk)->dccps_r_ack_ratio = ratio; 150 + BUG_ON(spref == NULL); 151 + BUG_ON(rpref == NULL); 152 + 153 + /* FIXME sanity check vals */ 154 + 155 + /* Are values in any order? XXX Lame "algorithm" here */ 156 + for (i = 0; i < slen; i++) { 157 + for (j = 0; j < rlen; j++) { 158 + if (spref[i] == rpref[j]) { 159 + res = &spref[i]; 160 + break; 161 + } 162 + } 163 + if (res) 164 + break; 165 + } 166 + 167 + /* we didn't agree on anything */ 168 + if (res == NULL) { 169 + /* confirm previous value */ 170 + switch (opt->dccpop_feat) { 171 + case DCCPF_CCID: 172 + /* XXX did i get this right? =P */ 173 + if (opt->dccpop_type == DCCPO_CHANGE_L) 174 + res = &dccp_msk(sk)->dccpms_tx_ccid; 175 + else 176 + res = &dccp_msk(sk)->dccpms_rx_ccid; 177 + break; 178 + 179 + default: 180 + DCCP_BUG("Fell through, feat=%d", opt->dccpop_feat); 181 + /* XXX implement res */ 182 + return -EFAULT; 183 + } 184 + 185 + dccp_pr_debug("Don't agree... reconfirming %d\n", *res); 186 + agree = 0; /* this is used for mandatory options... 
*/ 187 + } 188 + 189 + /* need to put result and our preference list */ 190 + rlen = 1 + opt->dccpop_len; 191 + rpref = kmalloc(rlen, GFP_ATOMIC); 192 + if (rpref == NULL) 193 + return -ENOMEM; 194 + 195 + *rpref = *res; 196 + memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len); 197 + 198 + /* put it in the "confirm queue" */ 199 + if (opt->dccpop_sc == NULL) { 200 + opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC); 201 + if (opt->dccpop_sc == NULL) { 202 + kfree(rpref); 203 + return -ENOMEM; 204 + } 205 + } else { 206 + /* recycle the confirm slot */ 207 + BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); 208 + kfree(opt->dccpop_sc->dccpoc_val); 209 + dccp_pr_debug("recycling confirm slot\n"); 210 + } 211 + memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc)); 212 + 213 + opt->dccpop_sc->dccpoc_val = rpref; 214 + opt->dccpop_sc->dccpoc_len = rlen; 215 + 216 + /* update the option on our side [we are about to send the confirm] */ 217 + rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res); 218 + if (rc) { 219 + kfree(opt->dccpop_sc->dccpoc_val); 220 + kfree(opt->dccpop_sc); 221 + opt->dccpop_sc = NULL; 222 + return rc; 223 + } 224 + 225 + dccp_pr_debug("Will confirm %d\n", *rpref); 226 + 227 + /* say we want to change to X but we just got a confirm X, suppress our 228 + * change 229 + */ 230 + if (!opt->dccpop_conf) { 231 + if (*opt->dccpop_val == *res) 232 + opt->dccpop_conf = 1; 233 + dccp_pr_debug("won't ask for change of same feature\n"); 234 + } 235 + 236 + return agree ? 0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */ 237 + } 238 + 239 + static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) 240 + { 241 + struct dccp_minisock *dmsk = dccp_msk(sk); 242 + struct dccp_opt_pend *opt; 243 + int rc = 1; 244 + u8 t; 245 + 246 + /* 247 + * We received a CHANGE. We gotta match it against our own preference 248 + * list. 
If we got a CHANGE_R it means it's a change for us, so we need 249 + * to compare our CHANGE_L list. 250 + */ 251 + if (type == DCCPO_CHANGE_L) 252 + t = DCCPO_CHANGE_R; 83 253 else 84 - dccp_sk(sk)->dccps_l_ack_ratio = ratio; 254 + t = DCCPO_CHANGE_L; 255 + 256 + /* find our preference list for this feature */ 257 + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { 258 + if (opt->dccpop_type != t || opt->dccpop_feat != feature) 259 + continue; 260 + 261 + /* find the winner from the two preference lists */ 262 + rc = dccp_feat_reconcile(sk, opt, val, len); 263 + break; 264 + } 265 + 266 + /* We didn't deal with the change. This can happen if we have no 267 + * preference list for the feature. In fact, it just shouldn't 268 + * happen---if we understand a feature, we should have a preference list 269 + * with at least the default value. 270 + */ 271 + BUG_ON(rc == 1); 272 + 273 + return rc; 274 + } 275 + 276 + static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) 277 + { 278 + struct dccp_opt_pend *opt; 279 + struct dccp_minisock *dmsk = dccp_msk(sk); 280 + u8 *copy; 281 + int rc; 282 + 283 + /* NN features must be Change L (sec. 
6.3.2) */ 284 + if (type != DCCPO_CHANGE_L) { 285 + dccp_pr_debug("received %s for NN feature %d\n", 286 + dccp_feat_typename(type), feature); 287 + return -EFAULT; 288 + } 289 + 290 + /* XXX sanity check opt val */ 291 + 292 + /* copy option so we can confirm it */ 293 + opt = kzalloc(sizeof(*opt), GFP_ATOMIC); 294 + if (opt == NULL) 295 + return -ENOMEM; 296 + 297 + copy = kmemdup(val, len, GFP_ATOMIC); 298 + if (copy == NULL) { 299 + kfree(opt); 300 + return -ENOMEM; 301 + } 302 + 303 + opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */ 304 + opt->dccpop_feat = feature; 305 + opt->dccpop_val = copy; 306 + opt->dccpop_len = len; 307 + 308 + /* change feature */ 309 + rc = dccp_feat_update(sk, type, feature, *val); 310 + if (rc) { 311 + kfree(opt->dccpop_val); 312 + kfree(opt); 313 + return rc; 314 + } 315 + 316 + dccp_feat_debug(type, feature, *copy); 317 + 318 + list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); 319 + 85 320 return 0; 86 321 } 87 322 88 - static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) 323 + static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk, 324 + u8 type, u8 feature) 89 325 { 90 - struct dccp_sock *dp = dccp_sk(sk); 326 + /* XXX check if other confirms for that are queued and recycle slot */ 327 + struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC); 91 328 92 - if (rx) { 93 - if (enable && dp->dccps_hc_rx_ackvec == NULL) { 94 - dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); 95 - if (dp->dccps_hc_rx_ackvec == NULL) 96 - return -ENOMEM; 97 - } else if (!enable) { 98 - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); 99 - dp->dccps_hc_rx_ackvec = NULL; 329 + if (opt == NULL) { 330 + /* XXX what do we do? Ignoring should be fine. 
It's a change 331 + * after all =P 332 + */ 333 + return; 334 + } 335 + 336 + switch (type) { 337 + case DCCPO_CHANGE_L: 338 + opt->dccpop_type = DCCPO_CONFIRM_R; 339 + break; 340 + case DCCPO_CHANGE_R: 341 + opt->dccpop_type = DCCPO_CONFIRM_L; 342 + break; 343 + default: 344 + DCCP_WARN("invalid type %d\n", type); 345 + kfree(opt); 346 + return; 347 + } 348 + opt->dccpop_feat = feature; 349 + opt->dccpop_val = NULL; 350 + opt->dccpop_len = 0; 351 + 352 + /* change feature */ 353 + dccp_pr_debug("Empty %s(%d)\n", dccp_feat_typename(type), feature); 354 + 355 + list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); 356 + } 357 + 358 + static void dccp_feat_flush_confirm(struct sock *sk) 359 + { 360 + struct dccp_minisock *dmsk = dccp_msk(sk); 361 + /* Check if there is anything to confirm in the first place */ 362 + int yes = !list_empty(&dmsk->dccpms_conf); 363 + 364 + if (!yes) { 365 + struct dccp_opt_pend *opt; 366 + 367 + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { 368 + if (opt->dccpop_conf) { 369 + yes = 1; 370 + break; 371 + } 100 372 } 101 373 } 102 - return 0; 374 + 375 + if (!yes) 376 + return; 377 + 378 + /* OK there is something to confirm... */ 379 + /* XXX check if packet is in flight? Send delayed ack?? */ 380 + if (sk->sk_state == DCCP_OPEN) 381 + dccp_send_ack(sk); 103 382 } 104 383 105 - static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) 384 + int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) 106 385 { 107 - if (!rx) 108 - dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); 109 - return 0; 110 - } 386 + int rc; 111 387 112 - /* 113 - * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that 114 - * `rx' holds when the sending peer informs about his partial coverage via a 115 - * ChangeR() option. In the other case, we are the sender and the receiver 116 - * announces its coverage via ChangeL() options. 
The policy here is to honour 117 - * such communication by enabling the corresponding partial coverage - but only 118 - * if it has not been set manually before; the warning here means that all 119 - * packets will be dropped. 120 - */ 121 - static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) 122 - { 123 - struct dccp_sock *dp = dccp_sk(sk); 388 + dccp_feat_debug(type, feature, *val); 124 389 125 - if (rx) 126 - dp->dccps_pcrlen = cscov; 127 - else { 128 - if (dp->dccps_pcslen == 0) 129 - dp->dccps_pcslen = cscov; 130 - else if (cscov > dp->dccps_pcslen) 131 - DCCP_WARN("CsCov %u too small, peer requires >= %u\n", 132 - dp->dccps_pcslen, (u8)cscov); 390 + /* figure out if it's SP or NN feature */ 391 + switch (feature) { 392 + /* deal with SP features */ 393 + case DCCPF_CCID: 394 + rc = dccp_feat_sp(sk, type, feature, val, len); 395 + break; 396 + 397 + /* deal with NN features */ 398 + case DCCPF_ACK_RATIO: 399 + rc = dccp_feat_nn(sk, type, feature, val, len); 400 + break; 401 + 402 + /* XXX implement other features */ 403 + default: 404 + dccp_pr_debug("UNIMPLEMENTED: not handling %s(%d, ...)\n", 405 + dccp_feat_typename(type), feature); 406 + rc = -EFAULT; 407 + break; 133 408 } 134 - return 0; 409 + 410 + /* check if there were problems changing features */ 411 + if (rc) { 412 + /* If we don't agree on SP, we sent a confirm for old value. 
413 + * However we propagate rc to caller in case option was 414 + * mandatory 415 + */ 416 + if (rc != DCCP_FEAT_SP_NOAGREE) 417 + dccp_feat_empty_confirm(dccp_msk(sk), type, feature); 418 + } 419 + 420 + /* generate the confirm [if required] */ 421 + dccp_feat_flush_confirm(sk); 422 + 423 + return rc; 135 424 } 136 425 137 - static const struct { 138 - u8 feat_num; /* DCCPF_xxx */ 139 - enum dccp_feat_type rxtx; /* RX or TX */ 140 - enum dccp_feat_type reconciliation; /* SP or NN */ 141 - u8 default_value; /* as in 6.4 */ 142 - int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); 143 - /* 144 - * Lookup table for location and type of features (from RFC 4340/4342) 145 - * +--------------------------+----+-----+----+----+---------+-----------+ 146 - * | Feature | Location | Reconc. | Initial | Section | 147 - * | | RX | TX | SP | NN | Value | Reference | 148 - * +--------------------------+----+-----+----+----+---------+-----------+ 149 - * | DCCPF_CCID | | X | X | | 2 | 10 | 150 - * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | 151 - * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | 152 - * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | 153 - * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | 154 - * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | 155 - * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | 156 - * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | 157 - * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | 158 - * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | 159 - * +--------------------------+----+-----+----+----+---------+-----------+ 160 - */ 161 - } dccp_feat_table[] = { 162 - { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, 163 - { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, 164 - { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, 165 - { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, 166 - { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, 167 - { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, 
FEAT_SP, 0, dccp_hdlr_ackvec }, 168 - { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, 169 - { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, 170 - { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, 171 - { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, 172 - }; 173 - #define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) 426 + EXPORT_SYMBOL_GPL(dccp_feat_change_recv); 174 427 175 - /** 176 - * dccp_feat_index - Hash function to map feature number into array position 177 - * Returns consecutive array index or -1 if the feature is not understood. 178 - */ 179 - static int dccp_feat_index(u8 feat_num) 428 + int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, 429 + u8 *val, u8 len) 180 430 { 181 - /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ 182 - if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) 183 - return feat_num - 1; 431 + u8 t; 432 + struct dccp_opt_pend *opt; 433 + struct dccp_minisock *dmsk = dccp_msk(sk); 434 + int found = 0; 435 + int all_confirmed = 1; 184 436 185 - /* 186 - * Other features: add cases for new feature types here after adding 187 - * them to the above table. 
437 + dccp_feat_debug(type, feature, *val); 438 + 439 + /* locate our change request */ 440 + switch (type) { 441 + case DCCPO_CONFIRM_L: t = DCCPO_CHANGE_R; break; 442 + case DCCPO_CONFIRM_R: t = DCCPO_CHANGE_L; break; 443 + default: DCCP_WARN("invalid type %d\n", type); 444 + return 1; 445 + 446 + } 447 + /* XXX sanity check feature value */ 448 + 449 + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { 450 + if (!opt->dccpop_conf && opt->dccpop_type == t && 451 + opt->dccpop_feat == feature) { 452 + found = 1; 453 + dccp_pr_debug("feature %d found\n", opt->dccpop_feat); 454 + 455 + /* XXX do sanity check */ 456 + 457 + opt->dccpop_conf = 1; 458 + 459 + /* We got a confirmation---change the option */ 460 + dccp_feat_update(sk, opt->dccpop_type, 461 + opt->dccpop_feat, *val); 462 + 463 + /* XXX check the return value of dccp_feat_update */ 464 + break; 465 + } 466 + 467 + if (!opt->dccpop_conf) 468 + all_confirmed = 0; 469 + } 470 + 471 + /* fix re-transmit timer */ 472 + /* XXX gotta make sure that no option negotiation occurs during 473 + * connection shutdown. Consider that the CLOSEREQ is sent and timer is 474 + * on. if all options are confirmed it might kill timer which should 475 + * remain alive until close is received. 188 476 */ 189 - switch (feat_num) { 190 - case DCCPF_SEND_LEV_RATE: 191 - return DCCP_FEAT_SUPPORTED_MAX - 1; 477 + if (all_confirmed) { 478 + dccp_pr_debug("clear feat negotiation timer %p\n", sk); 479 + inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 192 480 } 193 - return -1; 481 + 482 + if (!found) 483 + dccp_pr_debug("%s(%d, ...) 
never requested\n", 484 + dccp_feat_typename(type), feature); 485 + return 0; 194 486 } 195 487 196 - static u8 dccp_feat_type(u8 feat_num) 488 + EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv); 489 + 490 + void dccp_feat_clean(struct dccp_minisock *dmsk) 197 491 { 198 - int idx = dccp_feat_index(feat_num); 492 + struct dccp_opt_pend *opt, *next; 199 493 200 - if (idx < 0) 201 - return FEAT_UNKNOWN; 202 - return dccp_feat_table[idx].reconciliation; 494 + list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending, 495 + dccpop_node) { 496 + BUG_ON(opt->dccpop_val == NULL); 497 + kfree(opt->dccpop_val); 498 + 499 + if (opt->dccpop_sc != NULL) { 500 + BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); 501 + kfree(opt->dccpop_sc->dccpoc_val); 502 + kfree(opt->dccpop_sc); 503 + } 504 + 505 + kfree(opt); 506 + } 507 + INIT_LIST_HEAD(&dmsk->dccpms_pending); 508 + 509 + list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { 510 + BUG_ON(opt == NULL); 511 + if (opt->dccpop_val != NULL) 512 + kfree(opt->dccpop_val); 513 + kfree(opt); 514 + } 515 + INIT_LIST_HEAD(&dmsk->dccpms_conf); 203 516 } 204 517 205 - static int dccp_feat_default_value(u8 feat_num) 206 - { 207 - int idx = dccp_feat_index(feat_num); 518 + EXPORT_SYMBOL_GPL(dccp_feat_clean); 208 519 209 - return idx < 0 ? : dccp_feat_table[idx].default_value; 210 - } 211 - 212 - /* 213 - * Debugging and verbose-printing section 520 + /* this is to be called only when a listening sock creates its child. It is 521 + * assumed by the function---the confirm is not duplicated, but rather it is 522 + * "passed on". 
214 523 */ 215 - static const char *dccp_feat_fname(const u8 feat) 524 + int dccp_feat_clone(struct sock *oldsk, struct sock *newsk) 525 + { 526 + struct dccp_minisock *olddmsk = dccp_msk(oldsk); 527 + struct dccp_minisock *newdmsk = dccp_msk(newsk); 528 + struct dccp_opt_pend *opt; 529 + int rc = 0; 530 + 531 + INIT_LIST_HEAD(&newdmsk->dccpms_pending); 532 + INIT_LIST_HEAD(&newdmsk->dccpms_conf); 533 + 534 + list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) { 535 + struct dccp_opt_pend *newopt; 536 + /* copy the value of the option */ 537 + u8 *val = kmemdup(opt->dccpop_val, opt->dccpop_len, GFP_ATOMIC); 538 + 539 + if (val == NULL) 540 + goto out_clean; 541 + 542 + newopt = kmemdup(opt, sizeof(*newopt), GFP_ATOMIC); 543 + if (newopt == NULL) { 544 + kfree(val); 545 + goto out_clean; 546 + } 547 + 548 + /* insert the option */ 549 + newopt->dccpop_val = val; 550 + list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending); 551 + 552 + /* XXX what happens with backlogs and multiple connections at 553 + * once... 
554 + */ 555 + /* the master socket no longer needs to worry about confirms */ 556 + opt->dccpop_sc = NULL; /* it's not a memleak---new socket has it */ 557 + 558 + /* reset state for a new socket */ 559 + opt->dccpop_conf = 0; 560 + } 561 + 562 + /* XXX not doing anything about the conf queue */ 563 + 564 + out: 565 + return rc; 566 + 567 + out_clean: 568 + dccp_feat_clean(newdmsk); 569 + rc = -ENOMEM; 570 + goto out; 571 + } 572 + 573 + EXPORT_SYMBOL_GPL(dccp_feat_clone); 574 + 575 + static int __dccp_feat_init(struct dccp_minisock *dmsk, u8 type, u8 feat, 576 + u8 *val, u8 len) 577 + { 578 + int rc = -ENOMEM; 579 + u8 *copy = kmemdup(val, len, GFP_KERNEL); 580 + 581 + if (copy != NULL) { 582 + rc = dccp_feat_change(dmsk, type, feat, copy, len, GFP_KERNEL); 583 + if (rc) 584 + kfree(copy); 585 + } 586 + return rc; 587 + } 588 + 589 + int dccp_feat_init(struct dccp_minisock *dmsk) 590 + { 591 + int rc; 592 + 593 + INIT_LIST_HEAD(&dmsk->dccpms_pending); 594 + INIT_LIST_HEAD(&dmsk->dccpms_conf); 595 + 596 + /* CCID L */ 597 + rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_CCID, 598 + &dmsk->dccpms_tx_ccid, 1); 599 + if (rc) 600 + goto out; 601 + 602 + /* CCID R */ 603 + rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_R, DCCPF_CCID, 604 + &dmsk->dccpms_rx_ccid, 1); 605 + if (rc) 606 + goto out; 607 + 608 + /* Ack ratio */ 609 + rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO, 610 + &dmsk->dccpms_ack_ratio, 1); 611 + out: 612 + return rc; 613 + } 614 + 615 + EXPORT_SYMBOL_GPL(dccp_feat_init); 616 + 617 + #ifdef CONFIG_IP_DCCP_DEBUG 618 + const char *dccp_feat_typename(const u8 type) 619 + { 620 + switch(type) { 621 + case DCCPO_CHANGE_L: return("ChangeL"); 622 + case DCCPO_CONFIRM_L: return("ConfirmL"); 623 + case DCCPO_CHANGE_R: return("ChangeR"); 624 + case DCCPO_CONFIRM_R: return("ConfirmR"); 625 + /* the following case must not appear in feature negotation */ 626 + default: dccp_pr_debug("unknown type %d [BUG!]\n", type); 627 + } 628 + return NULL; 629 + } 
630 + 631 + EXPORT_SYMBOL_GPL(dccp_feat_typename); 632 + 633 + const char *dccp_feat_name(const u8 feat) 216 634 { 217 635 static const char *feature_names[] = { 218 636 [DCCPF_RESERVED] = "Reserved", ··· 639 237 if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) 640 238 return feature_names[DCCPF_RESERVED]; 641 239 642 - if (feat == DCCPF_SEND_LEV_RATE) 643 - return "Send Loss Event Rate"; 644 240 if (feat >= DCCPF_MIN_CCID_SPECIFIC) 645 241 return "CCID-specific"; 646 242 647 243 return feature_names[feat]; 648 244 } 649 245 650 - static const char *dccp_feat_sname[] = { "DEFAULT", "INITIALISING", "CHANGING", 651 - "UNSTABLE", "STABLE" }; 652 - 653 - #ifdef CONFIG_IP_DCCP_DEBUG 654 - static const char *dccp_feat_oname(const u8 opt) 655 - { 656 - switch (opt) { 657 - case DCCPO_CHANGE_L: return "Change_L"; 658 - case DCCPO_CONFIRM_L: return "Confirm_L"; 659 - case DCCPO_CHANGE_R: return "Change_R"; 660 - case DCCPO_CONFIRM_R: return "Confirm_R"; 661 - } 662 - return NULL; 663 - } 664 - 665 - static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) 666 - { 667 - u8 i, type = dccp_feat_type(feat_num); 668 - 669 - if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) 670 - dccp_pr_debug_cat("(NULL)"); 671 - else if (type == FEAT_SP) 672 - for (i = 0; i < val->sp.len; i++) 673 - dccp_pr_debug_cat("%s%u", i ? 
" " : "", val->sp.vec[i]); 674 - else if (type == FEAT_NN) 675 - dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); 676 - else 677 - dccp_pr_debug_cat("unknown type %u", type); 678 - } 679 - 680 - static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) 681 - { 682 - u8 type = dccp_feat_type(feat_num); 683 - dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; 684 - 685 - if (type == FEAT_NN) 686 - fval.nn = dccp_decode_value_var(list, len); 687 - dccp_feat_printval(feat_num, &fval); 688 - } 689 - 690 - static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) 691 - { 692 - dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", 693 - dccp_feat_fname(entry->feat_num)); 694 - dccp_feat_printval(entry->feat_num, &entry->val); 695 - dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], 696 - entry->needs_confirm ? "(Confirm pending)" : ""); 697 - } 698 - 699 - #define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ 700 - dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ 701 - dccp_feat_printvals(feat, val, len); \ 702 - dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) 703 - 704 - #define dccp_feat_print_fnlist(fn_list) { \ 705 - const struct dccp_feat_entry *___entry; \ 706 - \ 707 - dccp_pr_debug("List Dump:\n"); \ 708 - list_for_each_entry(___entry, fn_list, node) \ 709 - dccp_feat_print_entry(___entry); \ 710 - } 711 - #else /* ! 
CONFIG_IP_DCCP_DEBUG */ 712 - #define dccp_feat_print_opt(opt, feat, val, len, mandatory) 713 - #define dccp_feat_print_fnlist(fn_list) 714 - #endif 715 - 716 - static int __dccp_feat_activate(struct sock *sk, const int idx, 717 - const bool is_local, dccp_feat_val const *fval) 718 - { 719 - bool rx; 720 - u64 val; 721 - 722 - if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) 723 - return -1; 724 - if (dccp_feat_table[idx].activation_hdlr == NULL) 725 - return 0; 726 - 727 - if (fval == NULL) { 728 - val = dccp_feat_table[idx].default_value; 729 - } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { 730 - if (fval->sp.vec == NULL) { 731 - /* 732 - * This can happen when an empty Confirm is sent 733 - * for an SP (i.e. known) feature. In this case 734 - * we would be using the default anyway. 735 - */ 736 - DCCP_CRIT("Feature #%d undefined: using default", idx); 737 - val = dccp_feat_table[idx].default_value; 738 - } else { 739 - val = fval->sp.vec[0]; 740 - } 741 - } else { 742 - val = fval->nn; 743 - } 744 - 745 - /* Location is RX if this is a local-RX or remote-TX feature */ 746 - rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); 747 - 748 - dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", 749 - dccp_feat_fname(dccp_feat_table[idx].feat_num), 750 - fval ? "" : "default ", (unsigned long long)val); 751 - 752 - return dccp_feat_table[idx].activation_hdlr(sk, val, rx); 753 - } 754 - 755 - /** 756 - * dccp_feat_activate - Activate feature value on socket 757 - * @sk: fully connected DCCP socket (after handshake is complete) 758 - * @feat_num: feature to activate, one of %dccp_feature_numbers 759 - * @local: whether local (1) or remote (0) @feat_num is meant 760 - * @fval: the value (SP or NN) to activate, or NULL to use the default value 761 - * For general use this function is preferable over __dccp_feat_activate(). 
762 - */ 763 - static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local, 764 - dccp_feat_val const *fval) 765 - { 766 - return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval); 767 - } 768 - 769 - /* Test for "Req'd" feature (RFC 4340, 6.4) */ 770 - static inline int dccp_feat_must_be_understood(u8 feat_num) 771 - { 772 - return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || 773 - feat_num == DCCPF_SEQUENCE_WINDOW; 774 - } 775 - 776 - /* copy constructor, fval must not already contain allocated memory */ 777 - static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) 778 - { 779 - fval->sp.len = len; 780 - if (fval->sp.len > 0) { 781 - fval->sp.vec = kmemdup(val, len, gfp_any()); 782 - if (fval->sp.vec == NULL) { 783 - fval->sp.len = 0; 784 - return -ENOBUFS; 785 - } 786 - } 787 - return 0; 788 - } 789 - 790 - static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) 791 - { 792 - if (unlikely(val == NULL)) 793 - return; 794 - if (dccp_feat_type(feat_num) == FEAT_SP) 795 - kfree(val->sp.vec); 796 - memset(val, 0, sizeof(*val)); 797 - } 798 - 799 - static struct dccp_feat_entry * 800 - dccp_feat_clone_entry(struct dccp_feat_entry const *original) 801 - { 802 - struct dccp_feat_entry *new; 803 - u8 type = dccp_feat_type(original->feat_num); 804 - 805 - if (type == FEAT_UNKNOWN) 806 - return NULL; 807 - 808 - new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); 809 - if (new == NULL) 810 - return NULL; 811 - 812 - if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, 813 - original->val.sp.vec, 814 - original->val.sp.len)) { 815 - kfree(new); 816 - return NULL; 817 - } 818 - return new; 819 - } 820 - 821 - static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) 822 - { 823 - if (entry != NULL) { 824 - dccp_feat_val_destructor(entry->feat_num, &entry->val); 825 - kfree(entry); 826 - } 827 - } 828 - 829 - /* 830 - * List management functions 831 - * 832 - * Feature 
negotiation lists rely on and maintain the following invariants: 833 - * - each feat_num in the list is known, i.e. we know its type and default value 834 - * - each feat_num/is_local combination is unique (old entries are overwritten) 835 - * - SP values are always freshly allocated 836 - * - list is sorted in increasing order of feature number (faster lookup) 837 - */ 838 - static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, 839 - u8 feat_num, bool is_local) 840 - { 841 - struct dccp_feat_entry *entry; 842 - 843 - list_for_each_entry(entry, fn_list, node) 844 - if (entry->feat_num == feat_num && entry->is_local == is_local) 845 - return entry; 846 - else if (entry->feat_num > feat_num) 847 - break; 848 - return NULL; 849 - } 850 - 851 - /** 852 - * dccp_feat_entry_new - Central list update routine (called by all others) 853 - * @head: list to add to 854 - * @feat: feature number 855 - * @local: whether the local (1) or remote feature with number @feat is meant 856 - * This is the only constructor and serves to ensure the above invariants. 
857 - */ 858 - static struct dccp_feat_entry * 859 - dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) 860 - { 861 - struct dccp_feat_entry *entry; 862 - 863 - list_for_each_entry(entry, head, node) 864 - if (entry->feat_num == feat && entry->is_local == local) { 865 - dccp_feat_val_destructor(entry->feat_num, &entry->val); 866 - return entry; 867 - } else if (entry->feat_num > feat) { 868 - head = &entry->node; 869 - break; 870 - } 871 - 872 - entry = kmalloc(sizeof(*entry), gfp_any()); 873 - if (entry != NULL) { 874 - entry->feat_num = feat; 875 - entry->is_local = local; 876 - list_add_tail(&entry->node, head); 877 - } 878 - return entry; 879 - } 880 - 881 - /** 882 - * dccp_feat_push_change - Add/overwrite a Change option in the list 883 - * @fn_list: feature-negotiation list to update 884 - * @feat: one of %dccp_feature_numbers 885 - * @local: whether local (1) or remote (0) @feat_num is meant 886 - * @needs_mandatory: whether to use Mandatory feature negotiation options 887 - * @fval: pointer to NN/SP value to be inserted (will be copied) 888 - */ 889 - static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, 890 - u8 mandatory, dccp_feat_val *fval) 891 - { 892 - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); 893 - 894 - if (new == NULL) 895 - return -ENOMEM; 896 - 897 - new->feat_num = feat; 898 - new->is_local = local; 899 - new->state = FEAT_INITIALISING; 900 - new->needs_confirm = 0; 901 - new->empty_confirm = 0; 902 - new->val = *fval; 903 - new->needs_mandatory = mandatory; 904 - 905 - return 0; 906 - } 907 - 908 - /** 909 - * dccp_feat_push_confirm - Add a Confirm entry to the FN list 910 - * @fn_list: feature-negotiation list to add to 911 - * @feat: one of %dccp_feature_numbers 912 - * @local: whether local (1) or remote (0) @feat_num is being confirmed 913 - * @fval: pointer to NN/SP value to be inserted or NULL 914 - * Returns 0 on success, a Reset code for further processing otherwise. 
915 - */ 916 - static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, 917 - dccp_feat_val *fval) 918 - { 919 - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); 920 - 921 - if (new == NULL) 922 - return DCCP_RESET_CODE_TOO_BUSY; 923 - 924 - new->feat_num = feat; 925 - new->is_local = local; 926 - new->state = FEAT_STABLE; /* transition in 6.6.2 */ 927 - new->needs_confirm = 1; 928 - new->empty_confirm = (fval == NULL); 929 - new->val.nn = 0; /* zeroes the whole structure */ 930 - if (!new->empty_confirm) 931 - new->val = *fval; 932 - new->needs_mandatory = 0; 933 - 934 - return 0; 935 - } 936 - 937 - static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) 938 - { 939 - return dccp_feat_push_confirm(fn_list, feat, local, NULL); 940 - } 941 - 942 - static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) 943 - { 944 - list_del(&entry->node); 945 - dccp_feat_entry_destructor(entry); 946 - } 947 - 948 - void dccp_feat_list_purge(struct list_head *fn_list) 949 - { 950 - struct dccp_feat_entry *entry, *next; 951 - 952 - list_for_each_entry_safe(entry, next, fn_list, node) 953 - dccp_feat_entry_destructor(entry); 954 - INIT_LIST_HEAD(fn_list); 955 - } 956 - EXPORT_SYMBOL_GPL(dccp_feat_list_purge); 957 - 958 - /* generate @to as full clone of @from - @to must not contain any nodes */ 959 - int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) 960 - { 961 - struct dccp_feat_entry *entry, *new; 962 - 963 - INIT_LIST_HEAD(to); 964 - list_for_each_entry(entry, from, node) { 965 - new = dccp_feat_clone_entry(entry); 966 - if (new == NULL) 967 - goto cloning_failed; 968 - list_add_tail(&new->node, to); 969 - } 970 - return 0; 971 - 972 - cloning_failed: 973 - dccp_feat_list_purge(to); 974 - return -ENOMEM; 975 - } 976 - 977 - /** 978 - * dccp_feat_valid_nn_length - Enforce length constraints on NN options 979 - * Length is between 0 and %DCCP_OPTVAL_MAXLEN. 
Used for outgoing packets only, 980 - * incoming options are accepted as long as their values are valid. 981 - */ 982 - static u8 dccp_feat_valid_nn_length(u8 feat_num) 983 - { 984 - if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ 985 - return 2; 986 - if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ 987 - return 6; 988 - return 0; 989 - } 990 - 991 - static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) 992 - { 993 - switch (feat_num) { 994 - case DCCPF_ACK_RATIO: 995 - return val <= DCCPF_ACK_RATIO_MAX; 996 - case DCCPF_SEQUENCE_WINDOW: 997 - return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; 998 - } 999 - return 0; /* feature unknown - so we can't tell */ 1000 - } 1001 - 1002 - /* check that SP values are within the ranges defined in RFC 4340 */ 1003 - static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) 1004 - { 1005 - switch (feat_num) { 1006 - case DCCPF_CCID: 1007 - return val == DCCPC_CCID2 || val == DCCPC_CCID3; 1008 - /* Type-check Boolean feature values: */ 1009 - case DCCPF_SHORT_SEQNOS: 1010 - case DCCPF_ECN_INCAPABLE: 1011 - case DCCPF_SEND_ACK_VECTOR: 1012 - case DCCPF_SEND_NDP_COUNT: 1013 - case DCCPF_DATA_CHECKSUM: 1014 - case DCCPF_SEND_LEV_RATE: 1015 - return val < 2; 1016 - case DCCPF_MIN_CSUM_COVER: 1017 - return val < 16; 1018 - } 1019 - return 0; /* feature unknown */ 1020 - } 1021 - 1022 - static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) 1023 - { 1024 - if (sp_list == NULL || sp_len < 1) 1025 - return 0; 1026 - while (sp_len--) 1027 - if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) 1028 - return 0; 1029 - return 1; 1030 - } 1031 - 1032 - /** 1033 - * dccp_feat_insert_opts - Generate FN options from current list state 1034 - * @skb: next sk_buff to be sent to the peer 1035 - * @dp: for client during handshake and general negotiation 1036 - * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) 1037 - */ 1038 - int dccp_feat_insert_opts(struct 
dccp_sock *dp, struct dccp_request_sock *dreq, 1039 - struct sk_buff *skb) 1040 - { 1041 - struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; 1042 - struct dccp_feat_entry *pos, *next; 1043 - u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; 1044 - bool rpt; 1045 - 1046 - /* put entries into @skb in the order they appear in the list */ 1047 - list_for_each_entry_safe_reverse(pos, next, fn, node) { 1048 - opt = dccp_feat_genopt(pos); 1049 - type = dccp_feat_type(pos->feat_num); 1050 - rpt = false; 1051 - 1052 - if (pos->empty_confirm) { 1053 - len = 0; 1054 - ptr = NULL; 1055 - } else { 1056 - if (type == FEAT_SP) { 1057 - len = pos->val.sp.len; 1058 - ptr = pos->val.sp.vec; 1059 - rpt = pos->needs_confirm; 1060 - } else if (type == FEAT_NN) { 1061 - len = dccp_feat_valid_nn_length(pos->feat_num); 1062 - ptr = nn_in_nbo; 1063 - dccp_encode_value_var(pos->val.nn, ptr, len); 1064 - } else { 1065 - DCCP_BUG("unknown feature %u", pos->feat_num); 1066 - return -1; 1067 - } 1068 - } 1069 - dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); 1070 - 1071 - if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) 1072 - return -1; 1073 - if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) 1074 - return -1; 1075 - /* 1076 - * Enter CHANGING after transmitting the Change option (6.6.2). 1077 - */ 1078 - if (pos->state == FEAT_INITIALISING) 1079 - pos->state = FEAT_CHANGING; 1080 - } 1081 - return 0; 1082 - } 1083 - 1084 - /** 1085 - * __feat_register_nn - Register new NN value on socket 1086 - * @fn: feature-negotiation list to register with 1087 - * @feat: an NN feature from %dccp_feature_numbers 1088 - * @mandatory: use Mandatory option if 1 1089 - * @nn_val: value to register (restricted to 4 bytes) 1090 - * Note that NN features are local by definition (RFC 4340, 6.3.2). 
1091 - */ 1092 - static int __feat_register_nn(struct list_head *fn, u8 feat, 1093 - u8 mandatory, u64 nn_val) 1094 - { 1095 - dccp_feat_val fval = { .nn = nn_val }; 1096 - 1097 - if (dccp_feat_type(feat) != FEAT_NN || 1098 - !dccp_feat_is_valid_nn_val(feat, nn_val)) 1099 - return -EINVAL; 1100 - 1101 - /* Don't bother with default values, they will be activated anyway. */ 1102 - if (nn_val - (u64)dccp_feat_default_value(feat) == 0) 1103 - return 0; 1104 - 1105 - return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); 1106 - } 1107 - 1108 - /** 1109 - * __feat_register_sp - Register new SP value/list on socket 1110 - * @fn: feature-negotiation list to register with 1111 - * @feat: an SP feature from %dccp_feature_numbers 1112 - * @is_local: whether the local (1) or the remote (0) @feat is meant 1113 - * @mandatory: use Mandatory option if 1 1114 - * @sp_val: SP value followed by optional preference list 1115 - * @sp_len: length of @sp_val in bytes 1116 - */ 1117 - static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, 1118 - u8 mandatory, u8 const *sp_val, u8 sp_len) 1119 - { 1120 - dccp_feat_val fval; 1121 - 1122 - if (dccp_feat_type(feat) != FEAT_SP || 1123 - !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) 1124 - return -EINVAL; 1125 - 1126 - /* Avoid negotiating alien CCIDs by only advertising supported ones */ 1127 - if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) 1128 - return -EOPNOTSUPP; 1129 - 1130 - if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) 1131 - return -ENOMEM; 1132 - 1133 - return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval); 1134 - } 1135 - 1136 - /** 1137 - * dccp_feat_register_sp - Register requests to change SP feature values 1138 - * @sk: client or listening socket 1139 - * @feat: one of %dccp_feature_numbers 1140 - * @is_local: whether the local (1) or remote (0) @feat is meant 1141 - * @list: array of preferred values, in descending order of preference 1142 - * @len: length of @list in 
bytes 1143 - */ 1144 - int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, 1145 - u8 const *list, u8 len) 1146 - { /* any changes must be registered before establishing the connection */ 1147 - if (sk->sk_state != DCCP_CLOSED) 1148 - return -EISCONN; 1149 - if (dccp_feat_type(feat) != FEAT_SP) 1150 - return -EINVAL; 1151 - return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, 1152 - 0, list, len); 1153 - } 1154 - 1155 - /* Analogous to dccp_feat_register_sp(), but for non-negotiable values */ 1156 - int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val) 1157 - { 1158 - /* any changes must be registered before establishing the connection */ 1159 - if (sk->sk_state != DCCP_CLOSED) 1160 - return -EISCONN; 1161 - if (dccp_feat_type(feat) != FEAT_NN) 1162 - return -EINVAL; 1163 - return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val); 1164 - } 1165 - 1166 - /** 1167 - * dccp_feat_signal_nn_change - Update NN values for an established connection 1168 - * @sk: DCCP socket of an established connection 1169 - * @feat: NN feature number from %dccp_feature_numbers 1170 - * @nn_val: the new value to use 1171 - * This function is used to communicate NN updates out-of-band. The difference 1172 - * to feature negotiation during connection setup is that values are activated 1173 - * immediately after validation, i.e. we don't wait for the Confirm: either the 1174 - * value is accepted by the peer (and then the waiting is futile), or it is not 1175 - * (Reset or empty Confirm). We don't accept empty Confirms - transmitted values 1176 - * are validated, and the peer "MUST accept any valid value" (RFC 4340, 6.3.2). 
1177 - */ 1178 - int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val) 1179 - { 1180 - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; 1181 - dccp_feat_val fval = { .nn = nn_val }; 1182 - struct dccp_feat_entry *entry; 1183 - 1184 - if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN) 1185 - return 0; 1186 - 1187 - if (dccp_feat_type(feat) != FEAT_NN || 1188 - !dccp_feat_is_valid_nn_val(feat, nn_val)) 1189 - return -EINVAL; 1190 - 1191 - entry = dccp_feat_list_lookup(fn, feat, 1); 1192 - if (entry != NULL) { 1193 - dccp_pr_debug("Ignoring %llu, entry %llu exists in state %s\n", 1194 - (unsigned long long)nn_val, 1195 - (unsigned long long)entry->val.nn, 1196 - dccp_feat_sname[entry->state]); 1197 - return 0; 1198 - } 1199 - 1200 - if (dccp_feat_activate(sk, feat, 1, &fval)) 1201 - return -EADV; 1202 - 1203 - inet_csk_schedule_ack(sk); 1204 - return dccp_feat_push_change(fn, feat, 1, 0, &fval); 1205 - } 1206 - EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change); 1207 - 1208 - /* 1209 - * Tracking features whose value depend on the choice of CCID 1210 - * 1211 - * This is designed with an extension in mind so that a list walk could be done 1212 - * before activating any features. However, the existing framework was found to 1213 - * work satisfactorily up until now, the automatic verification is left open. 1214 - * When adding new CCIDs, add a corresponding dependency table here. 1215 - */ 1216 - static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) 1217 - { 1218 - static const struct ccid_dependency ccid2_dependencies[2][2] = { 1219 - /* 1220 - * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX 1221 - * feature and Send Ack Vector is an RX feature, `is_local' 1222 - * needs to be reversed. 
1223 - */ 1224 - { /* Dependencies of the receiver-side (remote) CCID2 */ 1225 - { 1226 - .dependent_feat = DCCPF_SEND_ACK_VECTOR, 1227 - .is_local = true, 1228 - .is_mandatory = true, 1229 - .val = 1 1230 - }, 1231 - { 0, 0, 0, 0 } 1232 - }, 1233 - { /* Dependencies of the sender-side (local) CCID2 */ 1234 - { 1235 - .dependent_feat = DCCPF_SEND_ACK_VECTOR, 1236 - .is_local = false, 1237 - .is_mandatory = true, 1238 - .val = 1 1239 - }, 1240 - { 0, 0, 0, 0 } 1241 - } 1242 - }; 1243 - static const struct ccid_dependency ccid3_dependencies[2][5] = { 1244 - { /* 1245 - * Dependencies of the receiver-side CCID3 1246 - */ 1247 - { /* locally disable Ack Vectors */ 1248 - .dependent_feat = DCCPF_SEND_ACK_VECTOR, 1249 - .is_local = true, 1250 - .is_mandatory = false, 1251 - .val = 0 1252 - }, 1253 - { /* see below why Send Loss Event Rate is on */ 1254 - .dependent_feat = DCCPF_SEND_LEV_RATE, 1255 - .is_local = true, 1256 - .is_mandatory = true, 1257 - .val = 1 1258 - }, 1259 - { /* NDP Count is needed as per RFC 4342, 6.1.1 */ 1260 - .dependent_feat = DCCPF_SEND_NDP_COUNT, 1261 - .is_local = false, 1262 - .is_mandatory = true, 1263 - .val = 1 1264 - }, 1265 - { 0, 0, 0, 0 }, 1266 - }, 1267 - { /* 1268 - * CCID3 at the TX side: we request that the HC-receiver 1269 - * will not send Ack Vectors (they will be ignored, so 1270 - * Mandatory is not set); we enable Send Loss Event Rate 1271 - * (Mandatory since the implementation does not support 1272 - * the Loss Intervals option of RFC 4342, 8.6). 1273 - * The last two options are for peer's information only. 
1274 - */ 1275 - { 1276 - .dependent_feat = DCCPF_SEND_ACK_VECTOR, 1277 - .is_local = false, 1278 - .is_mandatory = false, 1279 - .val = 0 1280 - }, 1281 - { 1282 - .dependent_feat = DCCPF_SEND_LEV_RATE, 1283 - .is_local = false, 1284 - .is_mandatory = true, 1285 - .val = 1 1286 - }, 1287 - { /* this CCID does not support Ack Ratio */ 1288 - .dependent_feat = DCCPF_ACK_RATIO, 1289 - .is_local = true, 1290 - .is_mandatory = false, 1291 - .val = 0 1292 - }, 1293 - { /* tell receiver we are sending NDP counts */ 1294 - .dependent_feat = DCCPF_SEND_NDP_COUNT, 1295 - .is_local = true, 1296 - .is_mandatory = false, 1297 - .val = 1 1298 - }, 1299 - { 0, 0, 0, 0 } 1300 - } 1301 - }; 1302 - switch (ccid) { 1303 - case DCCPC_CCID2: 1304 - return ccid2_dependencies[is_local]; 1305 - case DCCPC_CCID3: 1306 - return ccid3_dependencies[is_local]; 1307 - default: 1308 - return NULL; 1309 - } 1310 - } 1311 - 1312 - /** 1313 - * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID 1314 - * @fn: feature-negotiation list to update 1315 - * @id: CCID number to track 1316 - * @is_local: whether TX CCID (1) or RX CCID (0) is meant 1317 - * This function needs to be called after registering all other features. 
1318 - */ 1319 - static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) 1320 - { 1321 - const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); 1322 - int i, rc = (table == NULL); 1323 - 1324 - for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) 1325 - if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) 1326 - rc = __feat_register_sp(fn, table[i].dependent_feat, 1327 - table[i].is_local, 1328 - table[i].is_mandatory, 1329 - &table[i].val, 1); 1330 - else 1331 - rc = __feat_register_nn(fn, table[i].dependent_feat, 1332 - table[i].is_mandatory, 1333 - table[i].val); 1334 - return rc; 1335 - } 1336 - 1337 - /** 1338 - * dccp_feat_finalise_settings - Finalise settings before starting negotiation 1339 - * @dp: client or listening socket (settings will be inherited) 1340 - * This is called after all registrations (socket initialisation, sysctls, and 1341 - * sockopt calls), and before sending the first packet containing Change options 1342 - * (ie. client-Request or server-Response), to ensure internal consistency. 1343 - */ 1344 - int dccp_feat_finalise_settings(struct dccp_sock *dp) 1345 - { 1346 - struct list_head *fn = &dp->dccps_featneg; 1347 - struct dccp_feat_entry *entry; 1348 - int i = 2, ccids[2] = { -1, -1 }; 1349 - 1350 - /* 1351 - * Propagating CCIDs: 1352 - * 1) not useful to propagate CCID settings if this host advertises more 1353 - * than one CCID: the choice of CCID may still change - if this is 1354 - * the client, or if this is the server and the client sends 1355 - * singleton CCID values. 1356 - * 2) since is that propagate_ccid changes the list, we defer changing 1357 - * the sorted list until after the traversal. 
1358 - */ 1359 - list_for_each_entry(entry, fn, node) 1360 - if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) 1361 - ccids[entry->is_local] = entry->val.sp.vec[0]; 1362 - while (i--) 1363 - if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) 1364 - return -1; 1365 - dccp_feat_print_fnlist(fn); 1366 - return 0; 1367 - } 1368 - 1369 - /** 1370 - * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features 1371 - * It is the server which resolves the dependencies once the CCID has been 1372 - * fully negotiated. If no CCID has been negotiated, it uses the default CCID. 1373 - */ 1374 - int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) 1375 - { 1376 - struct list_head *fn = &dreq->dreq_featneg; 1377 - struct dccp_feat_entry *entry; 1378 - u8 is_local, ccid; 1379 - 1380 - for (is_local = 0; is_local <= 1; is_local++) { 1381 - entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); 1382 - 1383 - if (entry != NULL && !entry->empty_confirm) 1384 - ccid = entry->val.sp.vec[0]; 1385 - else 1386 - ccid = dccp_feat_default_value(DCCPF_CCID); 1387 - 1388 - if (dccp_feat_propagate_ccid(fn, ccid, is_local)) 1389 - return -1; 1390 - } 1391 - return 0; 1392 - } 1393 - 1394 - /* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ 1395 - static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) 1396 - { 1397 - u8 c, s; 1398 - 1399 - for (s = 0; s < slen; s++) 1400 - for (c = 0; c < clen; c++) 1401 - if (servlist[s] == clilist[c]) 1402 - return servlist[s]; 1403 - return -1; 1404 - } 1405 - 1406 - /** 1407 - * dccp_feat_prefer - Move preferred entry to the start of array 1408 - * Reorder the @array_len elements in @array so that @preferred_value comes 1409 - * first. Returns >0 to indicate that @preferred_value does occur in @array. 
1410 - */ 1411 - static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) 1412 - { 1413 - u8 i, does_occur = 0; 1414 - 1415 - if (array != NULL) { 1416 - for (i = 0; i < array_len; i++) 1417 - if (array[i] == preferred_value) { 1418 - array[i] = array[0]; 1419 - does_occur++; 1420 - } 1421 - if (does_occur) 1422 - array[0] = preferred_value; 1423 - } 1424 - return does_occur; 1425 - } 1426 - 1427 - /** 1428 - * dccp_feat_reconcile - Reconcile SP preference lists 1429 - * @fval: SP list to reconcile into 1430 - * @arr: received SP preference list 1431 - * @len: length of @arr in bytes 1432 - * @is_server: whether this side is the server (and @fv is the server's list) 1433 - * @reorder: whether to reorder the list in @fv after reconciling with @arr 1434 - * When successful, > 0 is returned and the reconciled list is in @fval. 1435 - * A value of 0 means that negotiation failed (no shared entry). 1436 - */ 1437 - static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, 1438 - bool is_server, bool reorder) 1439 - { 1440 - int rc; 1441 - 1442 - if (!fv->sp.vec || !arr) { 1443 - DCCP_CRIT("NULL feature value or array"); 1444 - return 0; 1445 - } 1446 - 1447 - if (is_server) 1448 - rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); 1449 - else 1450 - rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); 1451 - 1452 - if (!reorder) 1453 - return rc; 1454 - if (rc < 0) 1455 - return 0; 1456 - 1457 - /* 1458 - * Reorder list: used for activating features and in dccp_insert_fn_opt. 
1459 - */ 1460 - return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); 1461 - } 1462 - 1463 - /** 1464 - * dccp_feat_change_recv - Process incoming ChangeL/R options 1465 - * @fn: feature-negotiation list to update 1466 - * @is_mandatory: whether the Change was preceded by a Mandatory option 1467 - * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R 1468 - * @feat: one of %dccp_feature_numbers 1469 - * @val: NN value or SP value/preference list 1470 - * @len: length of @val in bytes 1471 - * @server: whether this node is the server (1) or the client (0) 1472 - */ 1473 - static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, 1474 - u8 feat, u8 *val, u8 len, const bool server) 1475 - { 1476 - u8 defval, type = dccp_feat_type(feat); 1477 - const bool local = (opt == DCCPO_CHANGE_R); 1478 - struct dccp_feat_entry *entry; 1479 - dccp_feat_val fval; 1480 - 1481 - if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ 1482 - goto unknown_feature_or_value; 1483 - 1484 - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); 1485 - 1486 - /* 1487 - * Negotiation of NN features: Change R is invalid, so there is no 1488 - * simultaneous negotiation; hence we do not look up in the list. 1489 - */ 1490 - if (type == FEAT_NN) { 1491 - if (local || len > sizeof(fval.nn)) 1492 - goto unknown_feature_or_value; 1493 - 1494 - /* 6.3.2: "The feature remote MUST accept any valid value..." */ 1495 - fval.nn = dccp_decode_value_var(val, len); 1496 - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) 1497 - goto unknown_feature_or_value; 1498 - 1499 - return dccp_feat_push_confirm(fn, feat, local, &fval); 1500 - } 1501 - 1502 - /* 1503 - * Unidirectional/simultaneous negotiation of SP features (6.3.1) 1504 - */ 1505 - entry = dccp_feat_list_lookup(fn, feat, local); 1506 - if (entry == NULL) { 1507 - /* 1508 - * No particular preferences have been registered. 
We deal with 1509 - * this situation by assuming that all valid values are equally 1510 - * acceptable, and apply the following checks: 1511 - * - if the peer's list is a singleton, we accept a valid value; 1512 - * - if we are the server, we first try to see if the peer (the 1513 - * client) advertises the default value. If yes, we use it, 1514 - * otherwise we accept the preferred value; 1515 - * - else if we are the client, we use the first list element. 1516 - */ 1517 - if (dccp_feat_clone_sp_val(&fval, val, 1)) 1518 - return DCCP_RESET_CODE_TOO_BUSY; 1519 - 1520 - if (len > 1 && server) { 1521 - defval = dccp_feat_default_value(feat); 1522 - if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) 1523 - fval.sp.vec[0] = defval; 1524 - } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { 1525 - kfree(fval.sp.vec); 1526 - goto unknown_feature_or_value; 1527 - } 1528 - 1529 - /* Treat unsupported CCIDs like invalid values */ 1530 - if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { 1531 - kfree(fval.sp.vec); 1532 - goto not_valid_or_not_known; 1533 - } 1534 - 1535 - return dccp_feat_push_confirm(fn, feat, local, &fval); 1536 - 1537 - } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ 1538 - return 0; 1539 - } 1540 - 1541 - if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { 1542 - entry->empty_confirm = 0; 1543 - } else if (is_mandatory) { 1544 - return DCCP_RESET_CODE_MANDATORY_ERROR; 1545 - } else if (entry->state == FEAT_INITIALISING) { 1546 - /* 1547 - * Failed simultaneous negotiation (server only): try to `save' 1548 - * the connection by checking whether entry contains the default 1549 - * value for @feat. If yes, send an empty Confirm to signal that 1550 - * the received Change was not understood - which implies using 1551 - * the default value. 1552 - * If this also fails, we use Reset as the last resort. 
1553 - */ 1554 - WARN_ON(!server); 1555 - defval = dccp_feat_default_value(feat); 1556 - if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) 1557 - return DCCP_RESET_CODE_OPTION_ERROR; 1558 - entry->empty_confirm = 1; 1559 - } 1560 - entry->needs_confirm = 1; 1561 - entry->needs_mandatory = 0; 1562 - entry->state = FEAT_STABLE; 1563 - return 0; 1564 - 1565 - unknown_feature_or_value: 1566 - if (!is_mandatory) 1567 - return dccp_push_empty_confirm(fn, feat, local); 1568 - 1569 - not_valid_or_not_known: 1570 - return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR 1571 - : DCCP_RESET_CODE_OPTION_ERROR; 1572 - } 1573 - 1574 - /** 1575 - * dccp_feat_confirm_recv - Process received Confirm options 1576 - * @fn: feature-negotiation list to update 1577 - * @is_mandatory: whether @opt was preceded by a Mandatory option 1578 - * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R 1579 - * @feat: one of %dccp_feature_numbers 1580 - * @val: NN value or SP value/preference list 1581 - * @len: length of @val in bytes 1582 - * @server: whether this node is server (1) or client (0) 1583 - */ 1584 - static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, 1585 - u8 feat, u8 *val, u8 len, const bool server) 1586 - { 1587 - u8 *plist, plen, type = dccp_feat_type(feat); 1588 - const bool local = (opt == DCCPO_CONFIRM_R); 1589 - struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); 1590 - 1591 - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); 1592 - 1593 - if (entry == NULL) { /* nothing queued: ignore or handle error */ 1594 - if (is_mandatory && type == FEAT_UNKNOWN) 1595 - return DCCP_RESET_CODE_MANDATORY_ERROR; 1596 - 1597 - if (!local && type == FEAT_NN) /* 6.3.2 */ 1598 - goto confirmation_failed; 1599 - return 0; 1600 - } 1601 - 1602 - if (entry->state != FEAT_CHANGING) /* 6.6.2 */ 1603 - return 0; 1604 - 1605 - if (len == 0) { 1606 - if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ 1607 - goto confirmation_failed; 1608 - 
/* 1609 - * Empty Confirm during connection setup: this means reverting 1610 - * to the `old' value, which in this case is the default. Since 1611 - * we handle default values automatically when no other values 1612 - * have been set, we revert to the old value by removing this 1613 - * entry from the list. 1614 - */ 1615 - dccp_feat_list_pop(entry); 1616 - return 0; 1617 - } 1618 - 1619 - if (type == FEAT_NN) { 1620 - if (len > sizeof(entry->val.nn)) 1621 - goto confirmation_failed; 1622 - 1623 - if (entry->val.nn == dccp_decode_value_var(val, len)) 1624 - goto confirmation_succeeded; 1625 - 1626 - DCCP_WARN("Bogus Confirm for non-existing value\n"); 1627 - goto confirmation_failed; 1628 - } 1629 - 1630 - /* 1631 - * Parsing SP Confirms: the first element of @val is the preferred 1632 - * SP value which the peer confirms, the remainder depends on @len. 1633 - * Note that only the confirmed value need to be a valid SP value. 1634 - */ 1635 - if (!dccp_feat_is_valid_sp_val(feat, *val)) 1636 - goto confirmation_failed; 1637 - 1638 - if (len == 1) { /* peer didn't supply a preference list */ 1639 - plist = val; 1640 - plen = len; 1641 - } else { /* preferred value + preference list */ 1642 - plist = val + 1; 1643 - plen = len - 1; 1644 - } 1645 - 1646 - /* Check whether the peer got the reconciliation right (6.6.8) */ 1647 - if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { 1648 - DCCP_WARN("Confirm selected the wrong value %u\n", *val); 1649 - return DCCP_RESET_CODE_OPTION_ERROR; 1650 - } 1651 - entry->val.sp.vec[0] = *val; 1652 - 1653 - confirmation_succeeded: 1654 - entry->state = FEAT_STABLE; 1655 - return 0; 1656 - 1657 - confirmation_failed: 1658 - DCCP_WARN("Confirmation failed\n"); 1659 - return is_mandatory ? 
DCCP_RESET_CODE_MANDATORY_ERROR 1660 - : DCCP_RESET_CODE_OPTION_ERROR; 1661 - } 1662 - 1663 - /** 1664 - * dccp_feat_handle_nn_established - Fast-path reception of NN options 1665 - * @sk: socket of an established DCCP connection 1666 - * @mandatory: whether @opt was preceded by a Mandatory option 1667 - * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only) 1668 - * @feat: NN number, one of %dccp_feature_numbers 1669 - * @val: NN value 1670 - * @len: length of @val in bytes 1671 - * This function combines the functionality of change_recv/confirm_recv, with 1672 - * the following differences (reset codes are the same): 1673 - * - cleanup after receiving the Confirm; 1674 - * - values are directly activated after successful parsing; 1675 - * - deliberately restricted to NN features. 1676 - * The restriction to NN features is essential since SP features can have non- 1677 - * predictable outcomes (depending on the remote configuration), and are inter- 1678 - * dependent (CCIDs for instance cause further dependencies). 1679 - */ 1680 - static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt, 1681 - u8 feat, u8 *val, u8 len) 1682 - { 1683 - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; 1684 - const bool local = (opt == DCCPO_CONFIRM_R); 1685 - struct dccp_feat_entry *entry; 1686 - u8 type = dccp_feat_type(feat); 1687 - dccp_feat_val fval; 1688 - 1689 - dccp_feat_print_opt(opt, feat, val, len, mandatory); 1690 - 1691 - /* Ignore non-mandatory unknown and non-NN features */ 1692 - if (type == FEAT_UNKNOWN) { 1693 - if (local && !mandatory) 1694 - return 0; 1695 - goto fast_path_unknown; 1696 - } else if (type != FEAT_NN) { 1697 - return 0; 1698 - } 1699 - 1700 - /* 1701 - * We don't accept empty Confirms, since in fast-path feature 1702 - * negotiation the values are enabled immediately after sending 1703 - * the Change option. 1704 - * Empty Changes on the other hand are invalid (RFC 4340, 6.1). 
1705 - */ 1706 - if (len == 0 || len > sizeof(fval.nn)) 1707 - goto fast_path_unknown; 1708 - 1709 - if (opt == DCCPO_CHANGE_L) { 1710 - fval.nn = dccp_decode_value_var(val, len); 1711 - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) 1712 - goto fast_path_unknown; 1713 - 1714 - if (dccp_feat_push_confirm(fn, feat, local, &fval) || 1715 - dccp_feat_activate(sk, feat, local, &fval)) 1716 - return DCCP_RESET_CODE_TOO_BUSY; 1717 - 1718 - /* set the `Ack Pending' flag to piggyback a Confirm */ 1719 - inet_csk_schedule_ack(sk); 1720 - 1721 - } else if (opt == DCCPO_CONFIRM_R) { 1722 - entry = dccp_feat_list_lookup(fn, feat, local); 1723 - if (entry == NULL || entry->state != FEAT_CHANGING) 1724 - return 0; 1725 - 1726 - fval.nn = dccp_decode_value_var(val, len); 1727 - if (fval.nn != entry->val.nn) { 1728 - DCCP_WARN("Bogus Confirm for non-existing value\n"); 1729 - goto fast_path_failed; 1730 - } 1731 - 1732 - /* It has been confirmed - so remove the entry */ 1733 - dccp_feat_list_pop(entry); 1734 - 1735 - } else { 1736 - DCCP_WARN("Received illegal option %u\n", opt); 1737 - goto fast_path_failed; 1738 - } 1739 - return 0; 1740 - 1741 - fast_path_unknown: 1742 - if (!mandatory) 1743 - return dccp_push_empty_confirm(fn, feat, local); 1744 - 1745 - fast_path_failed: 1746 - return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR 1747 - : DCCP_RESET_CODE_OPTION_ERROR; 1748 - } 1749 - 1750 - /** 1751 - * dccp_feat_parse_options - Process Feature-Negotiation Options 1752 - * @sk: for general use and used by the client during connection setup 1753 - * @dreq: used by the server during connection setup 1754 - * @mandatory: whether @opt was preceded by a Mandatory option 1755 - * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R 1756 - * @feat: one of %dccp_feature_numbers 1757 - * @val: value contents of @opt 1758 - * @len: length of @val in bytes 1759 - * Returns 0 on success, a Reset code for ending the connection otherwise. 
1760 - */ 1761 - int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, 1762 - u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) 1763 - { 1764 - struct dccp_sock *dp = dccp_sk(sk); 1765 - struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; 1766 - bool server = false; 1767 - 1768 - switch (sk->sk_state) { 1769 - /* 1770 - * Negotiation during connection setup 1771 - */ 1772 - case DCCP_LISTEN: 1773 - server = true; /* fall through */ 1774 - case DCCP_REQUESTING: 1775 - switch (opt) { 1776 - case DCCPO_CHANGE_L: 1777 - case DCCPO_CHANGE_R: 1778 - return dccp_feat_change_recv(fn, mandatory, opt, feat, 1779 - val, len, server); 1780 - case DCCPO_CONFIRM_R: 1781 - case DCCPO_CONFIRM_L: 1782 - return dccp_feat_confirm_recv(fn, mandatory, opt, feat, 1783 - val, len, server); 1784 - } 1785 - break; 1786 - /* 1787 - * Support for exchanging NN options on an established connection 1788 - * This is currently restricted to Ack Ratio (RFC 4341, 6.1.2) 1789 - */ 1790 - case DCCP_OPEN: 1791 - case DCCP_PARTOPEN: 1792 - return dccp_feat_handle_nn_established(sk, mandatory, opt, feat, 1793 - val, len); 1794 - } 1795 - return 0; /* ignore FN options in all other states */ 1796 - } 1797 - 1798 - /** 1799 - * dccp_feat_init - Seed feature negotiation with host-specific defaults 1800 - * This initialises global defaults, depending on the value of the sysctls. 1801 - * These can later be overridden by registering changes via setsockopt calls. 1802 - * The last link in the chain is finalise_settings, to make sure that between 1803 - * here and the start of actual feature negotiation no inconsistencies enter. 1804 - * 1805 - * All features not appearing below use either defaults or are otherwise 1806 - * later adjusted through dccp_feat_finalise_settings(). 
1807 - */ 1808 - int dccp_feat_init(struct sock *sk) 1809 - { 1810 - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; 1811 - u8 on = 1, off = 0; 1812 - int rc; 1813 - struct { 1814 - u8 *val; 1815 - u8 len; 1816 - } tx, rx; 1817 - 1818 - /* Non-negotiable (NN) features */ 1819 - rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, 1820 - sysctl_dccp_sequence_window); 1821 - if (rc) 1822 - return rc; 1823 - 1824 - /* Server-priority (SP) features */ 1825 - 1826 - /* Advertise that short seqnos are not supported (7.6.1) */ 1827 - rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); 1828 - if (rc) 1829 - return rc; 1830 - 1831 - /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ 1832 - rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); 1833 - if (rc) 1834 - return rc; 1835 - 1836 - /* 1837 - * We advertise the available list of CCIDs and reorder according to 1838 - * preferences, to avoid failure resulting from negotiating different 1839 - * singleton values (which always leads to failure). 1840 - * These settings can still (later) be overridden via sockopts. 
1841 - */ 1842 - if (ccid_get_builtin_ccids(&tx.val, &tx.len) || 1843 - ccid_get_builtin_ccids(&rx.val, &rx.len)) 1844 - return -ENOBUFS; 1845 - 1846 - /* Pre-load all CCID modules that are going to be advertised */ 1847 - rc = -EUNATCH; 1848 - if (ccid_request_modules(tx.val, tx.len)) 1849 - goto free_ccid_lists; 1850 - 1851 - if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || 1852 - !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) 1853 - goto free_ccid_lists; 1854 - 1855 - rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); 1856 - if (rc) 1857 - goto free_ccid_lists; 1858 - 1859 - rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); 1860 - 1861 - free_ccid_lists: 1862 - kfree(tx.val); 1863 - kfree(rx.val); 1864 - return rc; 1865 - } 1866 - 1867 - int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) 1868 - { 1869 - struct dccp_sock *dp = dccp_sk(sk); 1870 - struct dccp_feat_entry *cur, *next; 1871 - int idx; 1872 - dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { 1873 - [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } 1874 - }; 1875 - 1876 - list_for_each_entry(cur, fn_list, node) { 1877 - /* 1878 - * An empty Confirm means that either an unknown feature type 1879 - * or an invalid value was present. In the first case there is 1880 - * nothing to activate, in the other the default value is used. 1881 - */ 1882 - if (cur->empty_confirm) 1883 - continue; 1884 - 1885 - idx = dccp_feat_index(cur->feat_num); 1886 - if (idx < 0) { 1887 - DCCP_BUG("Unknown feature %u", cur->feat_num); 1888 - goto activation_failed; 1889 - } 1890 - if (cur->state != FEAT_STABLE) { 1891 - DCCP_CRIT("Negotiation of %s %s failed in state %s", 1892 - cur->is_local ? 
"local" : "remote", 1893 - dccp_feat_fname(cur->feat_num), 1894 - dccp_feat_sname[cur->state]); 1895 - goto activation_failed; 1896 - } 1897 - fvals[idx][cur->is_local] = &cur->val; 1898 - } 1899 - 1900 - /* 1901 - * Activate in decreasing order of index, so that the CCIDs are always 1902 - * activated as the last feature. This avoids the case where a CCID 1903 - * relies on the initialisation of one or more features that it depends 1904 - * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). 1905 - */ 1906 - for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) 1907 - if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || 1908 - __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { 1909 - DCCP_CRIT("Could not activate %d", idx); 1910 - goto activation_failed; 1911 - } 1912 - 1913 - /* Clean up Change options which have been confirmed already */ 1914 - list_for_each_entry_safe(cur, next, fn_list, node) 1915 - if (!cur->needs_confirm) 1916 - dccp_feat_list_pop(cur); 1917 - 1918 - dccp_pr_debug("Activation OK\n"); 1919 - return 0; 1920 - 1921 - activation_failed: 1922 - /* 1923 - * We clean up everything that may have been allocated, since 1924 - * it is difficult to track at which stage negotiation failed. 1925 - * This is ok, since all allocation functions below are robust 1926 - * against NULL arguments. 1927 - */ 1928 - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); 1929 - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); 1930 - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; 1931 - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); 1932 - dp->dccps_hc_rx_ackvec = NULL; 1933 - return -1; 1934 - } 246 + EXPORT_SYMBOL_GPL(dccp_feat_name); 247 + #endif /* CONFIG_IP_DCCP_DEBUG */
+23 -119
net/dccp/feat.h
··· 3 3 /* 4 4 * net/dccp/feat.h 5 5 * 6 - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) 7 - * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk> 6 + * An implementation of the DCCP protocol 8 7 * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> 9 8 * 10 - * This program is free software; you can redistribute it and/or modify it 11 - * under the terms of the GNU General Public License version 2 as 12 - * published by the Free Software Foundation. 9 + * This program is free software; you can redistribute it and/or modify it 10 + * under the terms of the GNU General Public License version 2 as 11 + * published by the Free Software Foundation. 13 12 */ 13 + 14 14 #include <linux/types.h> 15 15 #include "dccp.h" 16 16 17 - /* 18 - * Known limit values 19 - */ 20 - /* Ack Ratio takes 2-byte integer values (11.3) */ 21 - #define DCCPF_ACK_RATIO_MAX 0xFFFF 22 - /* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ 23 - #define DCCPF_SEQ_WMIN 32 24 - #define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull 25 - /* Maximum number of SP values that fit in a single (Confirm) option */ 26 - #define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) 17 + #ifdef CONFIG_IP_DCCP_DEBUG 18 + extern const char *dccp_feat_typename(const u8 type); 19 + extern const char *dccp_feat_name(const u8 feat); 27 20 28 - enum dccp_feat_type { 29 - FEAT_AT_RX = 1, /* located at RX side of half-connection */ 30 - FEAT_AT_TX = 2, /* located at TX side of half-connection */ 31 - FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */ 32 - FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */ 33 - FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */ 34 - }; 35 - 36 - enum dccp_feat_state { 37 - FEAT_DEFAULT = 0, /* using default values from 6.4 */ 38 - FEAT_INITIALISING, /* feature is being initialised */ 39 - FEAT_CHANGING, /* Change sent but not confirmed yet */ 40 - FEAT_UNSTABLE, /* local modification in state CHANGING */ 41 - FEAT_STABLE /* both ends (think they) agree */ 42 - 
}; 43 - 44 - /** 45 - * dccp_feat_val - Container for SP or NN feature values 46 - * @nn: single NN value 47 - * @sp.vec: single SP value plus optional preference list 48 - * @sp.len: length of @sp.vec in bytes 49 - */ 50 - typedef union { 51 - u64 nn; 52 - struct { 53 - u8 *vec; 54 - u8 len; 55 - } sp; 56 - } dccp_feat_val; 57 - 58 - /** 59 - * struct feat_entry - Data structure to perform feature negotiation 60 - * @feat_num: one of %dccp_feature_numbers 61 - * @val: feature's current value (SP features may have preference list) 62 - * @state: feature's current state 63 - * @needs_mandatory: whether Mandatory options should be sent 64 - * @needs_confirm: whether to send a Confirm instead of a Change 65 - * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm) 66 - * @is_local: feature location (1) or feature-remote (0) 67 - * @node: list pointers, entries arranged in FIFO order 68 - */ 69 - struct dccp_feat_entry { 70 - u8 feat_num; 71 - dccp_feat_val val; 72 - enum dccp_feat_state state:8; 73 - bool needs_mandatory:1, 74 - needs_confirm:1, 75 - empty_confirm:1, 76 - is_local:1; 77 - 78 - struct list_head node; 79 - }; 80 - 81 - static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) 21 + static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) 82 22 { 83 - if (entry->needs_confirm) 84 - return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; 85 - return entry->is_local ? 
DCCPO_CHANGE_L : DCCPO_CHANGE_R; 23 + dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type), 24 + dccp_feat_name(feat), feat, val); 86 25 } 26 + #else 27 + #define dccp_feat_debug(type, feat, val) 28 + #endif /* CONFIG_IP_DCCP_DEBUG */ 87 29 88 - /** 89 - * struct ccid_dependency - Track changes resulting from choosing a CCID 90 - * @dependent_feat: one of %dccp_feature_numbers 91 - * @is_local: local (1) or remote (0) @dependent_feat 92 - * @is_mandatory: whether presence of @dependent_feat is mission-critical or not 93 - * @val: corresponding default value for @dependent_feat (u8 is sufficient here) 94 - */ 95 - struct ccid_dependency { 96 - u8 dependent_feat; 97 - bool is_local:1, 98 - is_mandatory:1; 99 - u8 val; 100 - }; 30 + extern int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, 31 + u8 *val, u8 len, gfp_t gfp); 32 + extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, 33 + u8 *val, u8 len); 34 + extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, 35 + u8 *val, u8 len); 36 + extern void dccp_feat_clean(struct dccp_minisock *dmsk); 37 + extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); 38 + extern int dccp_feat_init(struct dccp_minisock *dmsk); 101 39 102 - /* 103 - * Sysctls to seed defaults for feature negotiation 104 - */ 105 - extern unsigned long sysctl_dccp_sequence_window; 106 - extern int sysctl_dccp_rx_ccid; 107 - extern int sysctl_dccp_tx_ccid; 108 - 109 - extern int dccp_feat_init(struct sock *sk); 110 - extern void dccp_feat_initialise_sysctls(void); 111 - extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, 112 - u8 const *list, u8 len); 113 - extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); 114 - extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, 115 - u8 mand, u8 opt, u8 feat, u8 *val, u8 len); 116 - extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); 117 - 118 - /* 119 - 
* Encoding variable-length options and their maximum length. 120 - * 121 - * This affects NN options (SP options are all u8) and other variable-length 122 - * options (see table 3 in RFC 4340). The limit is currently given the Sequence 123 - * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other 124 - * options consume less than 6 bytes (timestamps are 4 bytes). 125 - * When updating this constant (e.g. due to new internet drafts / RFCs), make 126 - * sure that you also update all code which refers to it. 127 - */ 128 - #define DCCP_OPTVAL_MAXLEN 6 129 - 130 - extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); 131 - extern u64 dccp_decode_value_var(const u8 *bf, const u8 len); 132 - 133 - extern int dccp_insert_option_mandatory(struct sk_buff *skb); 134 - extern int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, 135 - u8 *val, u8 len, bool repeat_first); 136 40 #endif /* _DCCP_FEAT_H */
+91 -73
net/dccp/input.c
··· 159 159 dccp_time_wait(sk, DCCP_TIME_WAIT, 0); 160 160 } 161 161 162 - static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) 162 + static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) 163 163 { 164 - struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; 164 + struct dccp_sock *dp = dccp_sk(sk); 165 165 166 - if (av == NULL) 167 - return; 168 - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) 169 - dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); 170 - dccp_ackvec_input(av, skb); 166 + if (dccp_msk(sk)->dccpms_send_ack_vector) 167 + dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, 168 + DCCP_SKB_CB(skb)->dccpd_ack_seq); 171 169 } 172 170 173 171 static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) ··· 364 366 int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, 365 367 const struct dccp_hdr *dh, const unsigned len) 366 368 { 369 + struct dccp_sock *dp = dccp_sk(sk); 370 + 367 371 if (dccp_check_seqno(sk, skb)) 368 372 goto discard; 369 373 370 374 if (dccp_parse_options(sk, NULL, skb)) 371 375 return 1; 372 376 373 - dccp_handle_ackvec_processing(sk, skb); 377 + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) 378 + dccp_event_ack_recv(sk, skb); 379 + 380 + if (dccp_msk(sk)->dccpms_send_ack_vector && 381 + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, 382 + DCCP_SKB_CB(skb)->dccpd_seq, 383 + DCCP_ACKVEC_STATE_RECEIVED)) 384 + goto discard; 374 385 dccp_deliver_input_to_ccids(sk, skb); 375 386 376 387 return __dccp_rcv_established(sk, skb, dh, len); ··· 421 414 goto out_invalid_packet; 422 415 } 423 416 424 - /* 425 - * If option processing (Step 8) failed, return 1 here so that 426 - * dccp_v4_do_rcv() sends a Reset. The Reset code depends on 427 - * the option type and is set in dccp_parse_options(). 
428 - */ 429 417 if (dccp_parse_options(sk, NULL, skb)) 430 - return 1; 418 + goto out_invalid_packet; 431 419 432 420 /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ 433 421 if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) 434 422 dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - 435 423 dp->dccps_options_received.dccpor_timestamp_echo)); 424 + 425 + if (dccp_msk(sk)->dccpms_send_ack_vector && 426 + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, 427 + DCCP_SKB_CB(skb)->dccpd_seq, 428 + DCCP_ACKVEC_STATE_RECEIVED)) 429 + goto out_invalid_packet; /* FIXME: change error code */ 436 430 437 431 /* Stop the REQUEST timer */ 438 432 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); ··· 441 433 kfree_skb(sk->sk_send_head); 442 434 sk->sk_send_head = NULL; 443 435 436 + dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; 437 + dccp_update_gsr(sk, dp->dccps_isr); 444 438 /* 445 - * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect 446 - * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH 447 - * is done as part of activating the feature values below, since 448 - * these settings depend on the local/remote Sequence Window 449 - * features, which were undefined or not confirmed until now. 439 + * SWL and AWL are initially adjusted so that they are not less than 440 + * the initial Sequence Numbers received and sent, respectively: 441 + * SWL := max(GSR + 1 - floor(W/4), ISR), 442 + * AWL := max(GSS - W' + 1, ISS). 443 + * These adjustments MUST be applied only at the beginning of the 444 + * connection. 
445 + * 446 + * AWL was adjusted in dccp_v4_connect -acme 450 447 */ 451 - dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; 448 + dccp_set_seqno(&dp->dccps_swl, 449 + max48(dp->dccps_swl, dp->dccps_isr)); 452 450 453 451 dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); 454 452 ··· 474 460 * three-way handshake * / 475 461 */ 476 462 dccp_set_state(sk, DCCP_PARTOPEN); 477 - 478 - /* 479 - * If feature negotiation was successful, activate features now; 480 - * an activation failure means that this host could not activate 481 - * one ore more features (e.g. insufficient memory), which would 482 - * leave at least one feature in an undefined state. 483 - */ 484 - if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) 485 - goto unable_to_proceed; 486 463 487 464 /* Make sure socket is routed, for correct metrics. */ 488 465 icsk->icsk_af_ops->rebuild_header(sk); ··· 508 503 out_invalid_packet: 509 504 /* dccp_v4_do_rcv will send a reset */ 510 505 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; 511 - return 1; 512 - 513 - unable_to_proceed: 514 - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; 515 - /* 516 - * We mark this socket as no longer usable, so that the loop in 517 - * dccp_sendmsg() terminates and the application gets notified. 
518 - */ 519 - dccp_set_state(sk, DCCP_CLOSED); 520 - sk->sk_err = ECOMM; 521 506 return 1; 522 507 } 523 508 ··· 590 595 if (inet_csk(sk)->icsk_af_ops->conn_request(sk, 591 596 skb) < 0) 592 597 return 1; 598 + 599 + /* FIXME: do congestion control initialization */ 593 600 goto discard; 594 601 } 595 602 if (dh->dccph_type == DCCP_PKT_RESET) ··· 600 603 /* Caller (dccp_v4_do_rcv) will send Reset */ 601 604 dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; 602 605 return 1; 603 - } else if (sk->sk_state == DCCP_CLOSED) { 604 - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; 605 - return 1; 606 606 } 607 607 608 - /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ 609 - if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) 610 - goto discard; 608 + if (sk->sk_state != DCCP_REQUESTING) { 609 + if (dccp_check_seqno(sk, skb)) 610 + goto discard; 611 611 612 - /* 613 - * Step 7: Check for unexpected packet types 614 - * If (S.is_server and P.type == Response) 615 - * or (S.is_client and P.type == Request) 616 - * or (S.state == RESPOND and P.type == Data), 617 - * Send Sync packet acknowledging P.seqno 618 - * Drop packet and return 619 - */ 620 - if ((dp->dccps_role != DCCP_ROLE_CLIENT && 621 - dh->dccph_type == DCCP_PKT_RESPONSE) || 622 - (dp->dccps_role == DCCP_ROLE_CLIENT && 623 - dh->dccph_type == DCCP_PKT_REQUEST) || 624 - (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { 625 - dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); 626 - goto discard; 612 + /* 613 + * Step 8: Process options and mark acknowledgeable 614 + */ 615 + if (dccp_parse_options(sk, NULL, skb)) 616 + return 1; 617 + 618 + if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) 619 + dccp_event_ack_recv(sk, skb); 620 + 621 + if (dccp_msk(sk)->dccpms_send_ack_vector && 622 + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, 623 + DCCP_SKB_CB(skb)->dccpd_seq, 624 + DCCP_ACKVEC_STATE_RECEIVED)) 625 + goto discard; 626 + 627 + 
dccp_deliver_input_to_ccids(sk, skb); 627 628 } 628 - 629 - /* Step 8: Process options */ 630 - if (dccp_parse_options(sk, NULL, skb)) 631 - return 1; 632 629 633 630 /* 634 631 * Step 9: Process Reset ··· 631 640 * S.state := TIMEWAIT 632 641 * Set TIMEWAIT timer 633 642 * Drop packet and return 634 - */ 643 + */ 635 644 if (dh->dccph_type == DCCP_PKT_RESET) { 636 645 dccp_rcv_reset(sk, skb); 637 646 return 0; 638 - } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ 647 + /* 648 + * Step 7: Check for unexpected packet types 649 + * If (S.is_server and P.type == Response) 650 + * or (S.is_client and P.type == Request) 651 + * or (S.state == RESPOND and P.type == Data), 652 + * Send Sync packet acknowledging P.seqno 653 + * Drop packet and return 654 + */ 655 + } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && 656 + dh->dccph_type == DCCP_PKT_RESPONSE) || 657 + (dp->dccps_role == DCCP_ROLE_CLIENT && 658 + dh->dccph_type == DCCP_PKT_REQUEST) || 659 + (sk->sk_state == DCCP_RESPOND && 660 + dh->dccph_type == DCCP_PKT_DATA)) { 661 + dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); 662 + goto discard; 663 + } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { 639 664 if (dccp_rcv_closereq(sk, skb)) 640 665 return 0; 641 666 goto discard; 642 - } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ 667 + } else if (dh->dccph_type == DCCP_PKT_CLOSE) { 643 668 if (dccp_rcv_close(sk, skb)) 644 669 return 0; 645 670 goto discard; 646 671 } 647 672 648 673 switch (sk->sk_state) { 674 + case DCCP_CLOSED: 675 + dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; 676 + return 1; 677 + 649 678 case DCCP_REQUESTING: 679 + /* FIXME: do congestion control initialization */ 680 + 650 681 queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); 651 682 if (queued >= 0) 652 683 return queued; ··· 676 663 __kfree_skb(skb); 677 664 return 0; 678 665 679 - case DCCP_PARTOPEN: 680 - /* Step 8: if using Ack Vectors, mark packet acknowledgeable */ 681 - 
dccp_handle_ackvec_processing(sk, skb); 682 - dccp_deliver_input_to_ccids(sk, skb); 683 - /* fall through */ 684 666 case DCCP_RESPOND: 667 + case DCCP_PARTOPEN: 685 668 queued = dccp_rcv_respond_partopen_state_process(sk, skb, 686 669 dh, len); 687 670 break; ··· 716 707 /* dccpor_elapsed_time is either zeroed out or set and > 0 */ 717 708 delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; 718 709 719 - return dccp_sane_rtt(delta); 710 + if (unlikely(delta <= 0)) { 711 + DCCP_WARN("unusable RTT sample %ld, using min\n", delta); 712 + return DCCP_SANE_RTT_MIN; 713 + } 714 + if (unlikely(delta > DCCP_SANE_RTT_MAX)) { 715 + DCCP_WARN("RTT sample %ld too large, using max\n", delta); 716 + return DCCP_SANE_RTT_MAX; 717 + } 718 + 719 + return delta; 720 720 } 721 721 722 722 EXPORT_SYMBOL_GPL(dccp_sample_rtt);
+1 -3
net/dccp/ipv4.c
··· 545 545 546 546 static void dccp_v4_reqsk_destructor(struct request_sock *req) 547 547 { 548 - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); 549 548 kfree(inet_rsk(req)->opt); 550 549 } 551 550 ··· 595 596 if (req == NULL) 596 597 goto drop; 597 598 598 - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) 599 - goto drop_and_free; 599 + dccp_reqsk_init(req, skb); 600 600 601 601 dreq = dccp_rsk(req); 602 602 if (dccp_parse_options(sk, dreq, skb))
+1 -3
net/dccp/ipv6.c
··· 302 302 303 303 static void dccp_v6_reqsk_destructor(struct request_sock *req) 304 304 { 305 - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); 306 305 if (inet6_rsk(req)->pktopts != NULL) 307 306 kfree_skb(inet6_rsk(req)->pktopts); 308 307 } ··· 424 425 if (req == NULL) 425 426 goto drop; 426 427 427 - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) 428 - goto drop_and_free; 428 + dccp_reqsk_init(req, skb); 429 429 430 430 dreq = dccp_rsk(req); 431 431 if (dccp_parse_options(sk, dreq, skb))
+66 -25
net/dccp/minisocks.c
··· 42 42 43 43 EXPORT_SYMBOL_GPL(dccp_death_row); 44 44 45 + void dccp_minisock_init(struct dccp_minisock *dmsk) 46 + { 47 + dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; 48 + dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; 49 + dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; 50 + dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio; 51 + dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; 52 + dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; 53 + } 54 + 45 55 void dccp_time_wait(struct sock *sk, int state, int timeo) 46 56 { 47 57 struct inet_timewait_sock *tw = NULL; ··· 112 102 struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); 113 103 114 104 if (newsk != NULL) { 115 - struct dccp_request_sock *dreq = dccp_rsk(req); 105 + const struct dccp_request_sock *dreq = dccp_rsk(req); 116 106 struct inet_connection_sock *newicsk = inet_csk(newsk); 117 107 struct dccp_sock *newdp = dccp_sk(newsk); 108 + struct dccp_minisock *newdmsk = dccp_msk(newsk); 118 109 119 110 newdp->dccps_role = DCCP_ROLE_SERVER; 120 111 newdp->dccps_hc_rx_ackvec = NULL; ··· 125 114 newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; 126 115 newicsk->icsk_rto = DCCP_TIMEOUT_INIT; 127 116 128 - INIT_LIST_HEAD(&newdp->dccps_featneg); 129 - /* 130 - * Step 3: Process LISTEN state 131 - * 132 - * Choose S.ISS (initial seqno) or set from Init Cookies 133 - * Initialize S.GAR := S.ISS 134 - * Set S.ISR, S.GSR from packet (or Init Cookies) 135 - * 136 - * Setting AWL/AWH and SWL/SWH happens as part of the feature 137 - * activation below, as these windows all depend on the local 138 - * and remote Sequence Window feature values (7.5.2). 139 - */ 140 - newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss; 141 - newdp->dccps_gar = newdp->dccps_iss; 142 - newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr; 117 + if (dccp_feat_clone(sk, newsk)) 118 + goto out_free; 143 119 144 - /* 145 - * Activate features: initialise CCIDs, sequence windows etc. 
146 - */ 147 - if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { 120 + if (newdmsk->dccpms_send_ack_vector) { 121 + newdp->dccps_hc_rx_ackvec = 122 + dccp_ackvec_alloc(GFP_ATOMIC); 123 + if (unlikely(newdp->dccps_hc_rx_ackvec == NULL)) 124 + goto out_free; 125 + } 126 + 127 + newdp->dccps_hc_rx_ccid = 128 + ccid_hc_rx_new(newdmsk->dccpms_rx_ccid, 129 + newsk, GFP_ATOMIC); 130 + newdp->dccps_hc_tx_ccid = 131 + ccid_hc_tx_new(newdmsk->dccpms_tx_ccid, 132 + newsk, GFP_ATOMIC); 133 + if (unlikely(newdp->dccps_hc_rx_ccid == NULL || 134 + newdp->dccps_hc_tx_ccid == NULL)) { 135 + dccp_ackvec_free(newdp->dccps_hc_rx_ackvec); 136 + ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk); 137 + ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk); 138 + out_free: 148 139 /* It is still raw copy of parent, so invalidate 149 140 * destructor and make plain sk_free() */ 150 141 newsk->sk_destruct = NULL; 151 142 sk_free(newsk); 152 143 return NULL; 153 144 } 145 + 146 + /* 147 + * Step 3: Process LISTEN state 148 + * 149 + * Choose S.ISS (initial seqno) or set from Init Cookies 150 + * Initialize S.GAR := S.ISS 151 + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies 152 + */ 153 + 154 + /* See dccp_v4_conn_request */ 155 + newdmsk->dccpms_sequence_window = req->rcv_wnd; 156 + 157 + newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; 158 + dccp_update_gss(newsk, dreq->dreq_iss); 159 + 160 + newdp->dccps_isr = dreq->dreq_isr; 161 + dccp_update_gsr(newsk, dreq->dreq_isr); 162 + 163 + /* 164 + * SWL and AWL are initially adjusted so that they are not less than 165 + * the initial Sequence Numbers received and sent, respectively: 166 + * SWL := max(GSR + 1 - floor(W/4), ISR), 167 + * AWL := max(GSS - W' + 1, ISS). 168 + * These adjustments MUST be applied only at the beginning of the 169 + * connection. 
170 + */ 171 + dccp_set_seqno(&newdp->dccps_swl, 172 + max48(newdp->dccps_swl, newdp->dccps_isr)); 173 + dccp_set_seqno(&newdp->dccps_awl, 174 + max48(newdp->dccps_awl, newdp->dccps_iss)); 175 + 154 176 dccp_init_xmit_timers(newsk); 155 177 156 178 DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); ··· 304 260 305 261 EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); 306 262 307 - int dccp_reqsk_init(struct request_sock *req, 308 - struct dccp_sock const *dp, struct sk_buff const *skb) 263 + void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) 309 264 { 310 265 struct dccp_request_sock *dreq = dccp_rsk(req); 311 266 312 267 inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; 313 268 inet_rsk(req)->acked = 0; 269 + req->rcv_wnd = sysctl_dccp_feat_sequence_window; 314 270 dreq->dreq_timestamp_echo = 0; 315 - 316 - /* inherit feature negotiation options from listening socket */ 317 - return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); 318 271 } 319 272 320 273 EXPORT_SYMBOL_GPL(dccp_reqsk_init);
+171 -186
net/dccp/options.c
··· 23 23 #include "dccp.h" 24 24 #include "feat.h" 25 25 26 - u64 dccp_decode_value_var(const u8 *bf, const u8 len) 27 - { 28 - u64 value = 0; 26 + int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; 27 + int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; 28 + int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; 29 + int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO; 30 + int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; 31 + int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; 29 32 30 - if (len >= DCCP_OPTVAL_MAXLEN) 31 - value += ((u64)*bf++) << 40; 32 - if (len > 4) 33 - value += ((u64)*bf++) << 32; 33 + static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) 34 + { 35 + u32 value = 0; 36 + 34 37 if (len > 3) 35 - value += ((u64)*bf++) << 24; 38 + value += *bf++ << 24; 36 39 if (len > 2) 37 - value += ((u64)*bf++) << 16; 40 + value += *bf++ << 16; 38 41 if (len > 1) 39 - value += ((u64)*bf++) << 8; 42 + value += *bf++ << 8; 40 43 if (len > 0) 41 44 value += *bf; 42 45 ··· 57 54 struct dccp_sock *dp = dccp_sk(sk); 58 55 const struct dccp_hdr *dh = dccp_hdr(skb); 59 56 const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; 57 + u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; 60 58 unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); 61 59 unsigned char *opt_ptr = options; 62 60 const unsigned char *opt_end = (unsigned char *)dh + ··· 99 95 } 100 96 101 97 /* 98 + * CCID-Specific Options (from RFC 4340, sec. 10.3): 99 + * 100 + * Option numbers 128 through 191 are for options sent from the 101 + * HC-Sender to the HC-Receiver; option numbers 192 through 255 102 + * are for options sent from the HC-Receiver to the HC-Sender. 103 + * 102 104 * CCID-specific options are ignored during connection setup, as 103 105 * negotiation may still be in progress (see RFC 4340, 10.3). 104 106 * The same applies to Ack Vectors, as these depend on the CCID. 
107 + * 105 108 */ 106 - if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || 109 + if (dreq != NULL && (opt >= 128 || 107 110 opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) 108 111 goto ignore_option; 109 112 ··· 131 120 dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), 132 121 (unsigned long long)opt_recv->dccpor_ndp); 133 122 break; 134 - case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: 135 - if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ 123 + case DCCPO_CHANGE_L: 124 + /* fall through */ 125 + case DCCPO_CHANGE_R: 126 + if (pkt_type == DCCP_PKT_DATA) 136 127 break; 137 - rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, 138 - *value, value + 1, len - 1); 139 - if (rc) 140 - goto out_featneg_failed; 128 + if (len < 2) 129 + goto out_invalid_option; 130 + rc = dccp_feat_change_recv(sk, opt, *value, value + 1, 131 + len - 1); 132 + /* 133 + * When there is a change error, change_recv is 134 + * responsible for dealing with it. i.e. reply with an 135 + * empty confirm. 136 + * If the change was mandatory, then we need to die. 
137 + */ 138 + if (rc && mandatory) 139 + goto out_invalid_option; 140 + break; 141 + case DCCPO_CONFIRM_L: 142 + /* fall through */ 143 + case DCCPO_CONFIRM_R: 144 + if (pkt_type == DCCP_PKT_DATA) 145 + break; 146 + if (len < 2) /* FIXME this disallows empty confirm */ 147 + goto out_invalid_option; 148 + if (dccp_feat_confirm_recv(sk, opt, *value, 149 + value + 1, len - 1)) 150 + goto out_invalid_option; 151 + break; 152 + case DCCPO_ACK_VECTOR_0: 153 + case DCCPO_ACK_VECTOR_1: 154 + if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ 155 + break; 156 + 157 + if (dccp_msk(sk)->dccpms_send_ack_vector && 158 + dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) 159 + goto out_invalid_option; 141 160 break; 142 161 case DCCPO_TIMESTAMP: 143 162 if (len != 4) ··· 195 154 dccp_role(sk), ntohl(opt_val), 196 155 (unsigned long long) 197 156 DCCP_SKB_CB(skb)->dccpd_ack_seq); 198 - /* schedule an Ack in case this sender is quiescent */ 199 - inet_csk_schedule_ack(sk); 200 157 break; 201 158 case DCCPO_TIMESTAMP_ECHO: 202 159 if (len != 4 && len != 6 && len != 8) ··· 251 212 dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", 252 213 dccp_role(sk), elapsed_time); 253 214 break; 254 - case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC: 215 + case 128 ... 191: { 216 + const u16 idx = value - options; 217 + 255 218 if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, 256 - pkt_type, opt, value, len)) 219 + opt, len, idx, 220 + value) != 0) 257 221 goto out_invalid_option; 222 + } 258 223 break; 259 - case DCCPO_ACK_VECTOR_0: 260 - case DCCPO_ACK_VECTOR_1: 261 - if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ 262 - break; 263 - /* 264 - * Ack vectors are processed by the TX CCID if it is 265 - * interested. The RX CCID need not parse Ack Vectors, 266 - * since it is only interested in clearing old state. 267 - * Fall through. 268 - */ 269 - case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: 224 + case 192 ... 
255: { 225 + const u16 idx = value - options; 226 + 270 227 if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, 271 - pkt_type, opt, value, len)) 228 + opt, len, idx, 229 + value) != 0) 272 230 goto out_invalid_option; 231 + } 273 232 break; 274 233 default: 275 234 DCCP_CRIT("DCCP(%p): option %d(len=%d) not " ··· 289 252 290 253 out_invalid_option: 291 254 DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); 292 - rc = DCCP_RESET_CODE_OPTION_ERROR; 293 - out_featneg_failed: 294 - DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); 295 - DCCP_SKB_CB(skb)->dccpd_reset_code = rc; 255 + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR; 256 + DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len); 296 257 DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; 297 258 DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; 298 259 DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; ··· 299 264 300 265 EXPORT_SYMBOL_GPL(dccp_parse_options); 301 266 302 - void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) 267 + static void dccp_encode_value_var(const u32 value, unsigned char *to, 268 + const unsigned int len) 303 269 { 304 - if (len >= DCCP_OPTVAL_MAXLEN) 305 - *to++ = (value & 0xFF0000000000ull) >> 40; 306 - if (len > 4) 307 - *to++ = (value & 0xFF00000000ull) >> 32; 308 270 if (len > 3) 309 271 *to++ = (value & 0xFF000000) >> 24; 310 272 if (len > 2) ··· 461 429 return 0; 462 430 } 463 431 464 - static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) 432 + static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat, 433 + u8 *val, u8 len) 465 434 { 466 - struct dccp_sock *dp = dccp_sk(sk); 467 - struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; 468 - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); 469 - const u16 buflen = dccp_ackvec_buflen(av); 470 - /* Figure out how many options do we need to represent the ackvec */ 471 - const u8 nr_opts = DIV_ROUND_UP(buflen, 
DCCP_SINGLE_OPT_MAXLEN); 472 - u16 len = buflen + 2 * nr_opts; 473 - u8 i, nonce = 0; 474 - const unsigned char *tail, *from; 475 - unsigned char *to; 435 + u8 *to; 476 436 477 - if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { 478 - DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, 479 - dccp_packet_name(dcb->dccpd_type)); 480 - return -1; 481 - } 482 - /* 483 - * Since Ack Vectors are variable-length, we can not always predict 484 - * their size. To catch exception cases where the space is running out 485 - * on the skb, a separate Sync is scheduled to carry the Ack Vector. 486 - */ 487 - if (len > DCCPAV_MIN_OPTLEN && 488 - len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { 489 - DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " 490 - "MPS=%u ==> reduce payload size?\n", len, skb->len, 491 - dcb->dccpd_opt_len, dp->dccps_mss_cache); 492 - dp->dccps_sync_scheduled = 1; 493 - return 0; 494 - } 495 - dcb->dccpd_opt_len += len; 496 - 497 - to = skb_push(skb, len); 498 - len = buflen; 499 - from = av->av_buf + av->av_buf_head; 500 - tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; 501 - 502 - for (i = 0; i < nr_opts; ++i) { 503 - int copylen = len; 504 - 505 - if (len > DCCP_SINGLE_OPT_MAXLEN) 506 - copylen = DCCP_SINGLE_OPT_MAXLEN; 507 - 508 - /* 509 - * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via 510 - * its type; ack_nonce is the sum of all individual buf_nonce's. 
511 - */ 512 - nonce ^= av->av_buf_nonce[i]; 513 - 514 - *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; 515 - *to++ = copylen + 2; 516 - 517 - /* Check if buf_head wraps */ 518 - if (from + copylen > tail) { 519 - const u16 tailsize = tail - from; 520 - 521 - memcpy(to, from, tailsize); 522 - to += tailsize; 523 - len -= tailsize; 524 - copylen -= tailsize; 525 - from = av->av_buf; 526 - } 527 - 528 - memcpy(to, from, copylen); 529 - from += copylen; 530 - to += copylen; 531 - len -= copylen; 532 - } 533 - /* 534 - * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. 535 - */ 536 - if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) 537 - return -ENOBUFS; 538 - return 0; 539 - } 540 - 541 - /** 542 - * dccp_insert_option_mandatory - Mandatory option (5.8.2) 543 - * Note that since we are using skb_push, this function needs to be called 544 - * _after_ inserting the option it is supposed to influence (stack order). 545 - */ 546 - int dccp_insert_option_mandatory(struct sk_buff *skb) 547 - { 548 - if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) 549 - return -1; 550 - 551 - DCCP_SKB_CB(skb)->dccpd_opt_len++; 552 - *skb_push(skb, 1) = DCCPO_MANDATORY; 553 - return 0; 554 - } 555 - 556 - /** 557 - * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb 558 - * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R 559 - * @feat: one out of %dccp_feature_numbers 560 - * @val: NN value or SP array (preferred element first) to copy 561 - * @len: true length of @val in bytes (excluding first element repetition) 562 - * @repeat_first: whether to copy the first element of @val twice 563 - * The last argument is used to construct Confirm options, where the preferred 564 - * value and the preference list appear separately (RFC 4340, 6.3.1). 
Preference 565 - * lists are kept such that the preferred entry is always first, so we only need 566 - * to copy twice, and avoid the overhead of cloning into a bigger array. 567 - */ 568 - int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, 569 - u8 *val, u8 len, bool repeat_first) 570 - { 571 - u8 tot_len, *to; 572 - 573 - /* take the `Feature' field and possible repetition into account */ 574 - if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { 575 - DCCP_WARN("length %u for feature %u too large\n", len, feat); 576 - return -1; 577 - } 578 - 579 - if (unlikely(val == NULL || len == 0)) 580 - len = repeat_first = 0; 581 - tot_len = 3 + repeat_first + len; 582 - 583 - if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { 437 + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) { 584 438 DCCP_WARN("packet too small for feature %d option!\n", feat); 585 439 return -1; 586 440 } 587 - DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; 588 441 589 - to = skb_push(skb, tot_len); 442 + DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3; 443 + 444 + to = skb_push(skb, len + 3); 590 445 *to++ = type; 591 - *to++ = tot_len; 446 + *to++ = len + 3; 592 447 *to++ = feat; 593 448 594 - if (repeat_first) 595 - *to++ = *val; 596 449 if (len) 597 450 memcpy(to, val, len); 451 + 452 + dccp_pr_debug("%s(%s (%d), ...), length %d\n", 453 + dccp_feat_typename(type), 454 + dccp_feat_name(feat), feat, len); 455 + return 0; 456 + } 457 + 458 + static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb) 459 + { 460 + struct dccp_sock *dp = dccp_sk(sk); 461 + struct dccp_minisock *dmsk = dccp_msk(sk); 462 + struct dccp_opt_pend *opt, *next; 463 + int change = 0; 464 + 465 + /* confirm any options [NN opts] */ 466 + list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { 467 + dccp_insert_feat_opt(skb, opt->dccpop_type, 468 + opt->dccpop_feat, opt->dccpop_val, 469 + opt->dccpop_len); 470 + /* fear empty confirms */ 471 + if (opt->dccpop_val) 472 + 
kfree(opt->dccpop_val); 473 + kfree(opt); 474 + } 475 + INIT_LIST_HEAD(&dmsk->dccpms_conf); 476 + 477 + /* see which features we need to send */ 478 + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { 479 + /* see if we need to send any confirm */ 480 + if (opt->dccpop_sc) { 481 + dccp_insert_feat_opt(skb, opt->dccpop_type + 1, 482 + opt->dccpop_feat, 483 + opt->dccpop_sc->dccpoc_val, 484 + opt->dccpop_sc->dccpoc_len); 485 + 486 + BUG_ON(!opt->dccpop_sc->dccpoc_val); 487 + kfree(opt->dccpop_sc->dccpoc_val); 488 + kfree(opt->dccpop_sc); 489 + opt->dccpop_sc = NULL; 490 + } 491 + 492 + /* any option not confirmed, re-send it */ 493 + if (!opt->dccpop_conf) { 494 + dccp_insert_feat_opt(skb, opt->dccpop_type, 495 + opt->dccpop_feat, opt->dccpop_val, 496 + opt->dccpop_len); 497 + change++; 498 + } 499 + } 500 + 501 + /* Retransmit timer. 502 + * If this is the master listening sock, we don't set a timer on it. It 503 + * should be fine because if the dude doesn't receive our RESPONSE 504 + * [which will contain the CHANGE] he will send another REQUEST which 505 + * will "retrnasmit" the change. 506 + */ 507 + if (change && dp->dccps_role != DCCP_ROLE_LISTEN) { 508 + dccp_pr_debug("reset feat negotiation timer %p\n", sk); 509 + 510 + /* XXX don't reset the timer on re-transmissions. I.e. reset it 511 + * only when sending new stuff i guess. Currently the timer 512 + * never backs off because on re-transmission it just resets it! 
513 + */ 514 + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 515 + inet_csk(sk)->icsk_rto, DCCP_RTO_MAX); 516 + } 517 + 598 518 return 0; 599 519 } 600 520 ··· 565 581 int dccp_insert_options(struct sock *sk, struct sk_buff *skb) 566 582 { 567 583 struct dccp_sock *dp = dccp_sk(sk); 584 + struct dccp_minisock *dmsk = dccp_msk(sk); 568 585 569 586 DCCP_SKB_CB(skb)->dccpd_opt_len = 0; 570 587 571 - if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) 588 + if (dmsk->dccpms_send_ndp_count && 589 + dccp_insert_option_ndp(sk, skb)) 572 590 return -1; 573 591 574 - if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { 575 - 576 - /* Feature Negotiation */ 577 - if (dccp_feat_insert_opts(dp, NULL, skb)) 592 + if (!dccp_packet_without_ack(skb)) { 593 + if (dmsk->dccpms_send_ack_vector && 594 + dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && 595 + dccp_insert_option_ackvec(sk, skb)) 578 596 return -1; 579 - 580 - if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { 581 - /* 582 - * Obtain RTT sample from Request/Response exchange. 583 - * This is currently used in CCID 3 initialisation. 584 - */ 585 - if (dccp_insert_option_timestamp(sk, skb)) 586 - return -1; 587 - 588 - } else if (dccp_ackvec_pending(sk) && 589 - dccp_insert_option_ackvec(sk, skb)) { 590 - return -1; 591 - } 592 597 } 593 598 594 599 if (dp->dccps_hc_rx_insert_options) { ··· 585 612 return -1; 586 613 dp->dccps_hc_rx_insert_options = 0; 587 614 } 615 + 616 + /* Feature negotiation */ 617 + /* Data packets can't do feat negotiation */ 618 + if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA && 619 + DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK && 620 + dccp_insert_options_feat(sk, skb)) 621 + return -1; 622 + 623 + /* 624 + * Obtain RTT sample from Request/Response exchange. 625 + * This is currently used in CCID 3 initialisation. 
626 + */ 627 + if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST && 628 + dccp_insert_option_timestamp(sk, skb)) 629 + return -1; 588 630 589 631 if (dp->dccps_timestamp_echo != 0 && 590 632 dccp_insert_option_timestamp_echo(dp, NULL, skb)) ··· 612 624 int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) 613 625 { 614 626 DCCP_SKB_CB(skb)->dccpd_opt_len = 0; 615 - 616 - if (dccp_feat_insert_opts(NULL, dreq, skb)) 617 - return -1; 618 627 619 628 if (dreq->dreq_timestamp_echo != 0 && 620 629 dccp_insert_option_timestamp_echo(NULL, dreq, skb))
+102 -177
net/dccp/output.c
··· 26 26 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); 27 27 } 28 28 29 - /* enqueue @skb on sk_send_head for retransmission, return clone to send now */ 30 - static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb) 29 + static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb) 31 30 { 32 31 skb_set_owner_w(skb, sk); 33 32 WARN_ON(sk->sk_send_head); 34 33 sk->sk_send_head = skb; 35 - return skb_clone(sk->sk_send_head, gfp_any()); 36 34 } 37 35 38 36 /* ··· 161 163 struct inet_connection_sock *icsk = inet_csk(sk); 162 164 struct dccp_sock *dp = dccp_sk(sk); 163 165 u32 ccmps = dccp_determine_ccmps(dp); 164 - u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; 166 + int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; 165 167 166 168 /* Account for header lengths and IPv4/v6 option overhead */ 167 169 cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + 168 170 sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); 169 171 170 172 /* 171 - * Leave enough headroom for common DCCP header options. 172 - * This only considers options which may appear on DCCP-Data packets, as 173 - * per table 3 in RFC 4340, 5.8. When running out of space for other 174 - * options (eg. Ack Vector which can take up to 255 bytes), it is better 175 - * to schedule a separate Ack. 
Thus we leave headroom for the following: 176 - * - 1 byte for Slow Receiver (11.6) 177 - * - 6 bytes for Timestamp (13.1) 178 - * - 10 bytes for Timestamp Echo (13.3) 179 - * - 8 bytes for NDP count (7.7, when activated) 180 - * - 6 bytes for Data Checksum (9.3) 181 - * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) 173 + * FIXME: this should come from the CCID infrastructure, where, say, 174 + * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets 175 + * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED 176 + * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to 177 + * make it a multiple of 4 182 178 */ 183 - cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + 184 - (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4); 179 + 180 + cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; 185 181 186 182 /* And store cached results */ 187 183 icsk->icsk_pmtu_cookie = pmtu; ··· 200 208 } 201 209 202 210 /** 203 - * dccp_wait_for_ccid - Await CCID send permission 211 + * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet 204 212 * @sk: socket to wait for 205 - * @delay: timeout in jiffies 206 - * This is used by CCIDs which need to delay the send time in process context. 213 + * @skb: current skb to pass on for waiting 214 + * @delay: sleep timeout in milliseconds (> 0) 215 + * This function is called by default when the socket is closed, and 216 + * when a non-zero linger time is set on the socket. 
For consistency 207 217 */ 208 - static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) 218 + static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay) 209 219 { 220 + struct dccp_sock *dp = dccp_sk(sk); 210 221 DEFINE_WAIT(wait); 211 - long remaining; 222 + unsigned long jiffdelay; 223 + int rc; 212 224 213 - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 214 - sk->sk_write_pending++; 215 - release_sock(sk); 225 + do { 226 + dccp_pr_debug("delayed send by %d msec\n", delay); 227 + jiffdelay = msecs_to_jiffies(delay); 216 228 217 - remaining = schedule_timeout(delay); 229 + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 218 230 219 - lock_sock(sk); 220 - sk->sk_write_pending--; 221 - finish_wait(sk->sk_sleep, &wait); 231 + sk->sk_write_pending++; 232 + release_sock(sk); 233 + schedule_timeout(jiffdelay); 234 + lock_sock(sk); 235 + sk->sk_write_pending--; 222 236 223 - if (signal_pending(current) || sk->sk_err) 224 - return -1; 225 - return remaining; 226 - } 237 + if (sk->sk_err) 238 + goto do_error; 239 + if (signal_pending(current)) 240 + goto do_interrupted; 227 241 228 - /** 229 - * dccp_xmit_packet - Send data packet under control of CCID 230 - * Transmits next-queued payload and informs CCID to account for the packet. 231 - */ 232 - static void dccp_xmit_packet(struct sock *sk) 233 - { 234 - int err, len; 235 - struct dccp_sock *dp = dccp_sk(sk); 236 - struct sk_buff *skb = dccp_qpolicy_pop(sk); 237 - 238 - if (unlikely(skb == NULL)) 239 - return; 240 - len = skb->len; 241 - 242 - if (sk->sk_state == DCCP_PARTOPEN) { 243 - const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; 244 - /* 245 - * See 8.1.5 - Handshake Completion. 246 - * 247 - * For robustness we resend Confirm options until the client has 248 - * entered OPEN. During the initial feature negotiation, the MPS 249 - * is smaller than usual, reduced by the Change/Confirm options. 
250 - */ 251 - if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { 252 - DCCP_WARN("Payload too large (%d) for featneg.\n", len); 253 - dccp_send_ack(sk); 254 - dccp_feat_list_purge(&dp->dccps_featneg); 255 - } 256 - 257 - inet_csk_schedule_ack(sk); 258 - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 259 - inet_csk(sk)->icsk_rto, 260 - DCCP_RTO_MAX); 261 - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; 262 - } else if (dccp_ack_pending(sk)) { 263 - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; 264 - } else { 265 - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; 266 - } 267 - 268 - err = dccp_transmit_skb(sk, skb); 269 - if (err) 270 - dccp_pr_debug("transmit_skb() returned err=%d\n", err); 271 - /* 272 - * Register this one as sent even if an error occurred. To the remote 273 - * end a local packet drop is indistinguishable from network loss, i.e. 274 - * any local drop will eventually be reported via receiver feedback. 275 - */ 276 - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); 277 - 278 - /* 279 - * If the CCID needs to transfer additional header options out-of-band 280 - * (e.g. Ack Vectors or feature-negotiation options), it activates this 281 - * flag to schedule a Sync. The Sync will automatically incorporate all 282 - * currently pending header options, thus clearing the backlog. 283 - */ 284 - if (dp->dccps_sync_scheduled) 285 - dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); 286 - } 287 - 288 - /** 289 - * dccp_flush_write_queue - Drain queue at end of connection 290 - * Since dccp_sendmsg queues packets without waiting for them to be sent, it may 291 - * happen that the TX queue is not empty at the end of a connection. We give the 292 - * HC-sender CCID a grace period of up to @time_budget jiffies. If this function 293 - * returns with a non-empty write queue, it will be purged later. 
294 - */ 295 - void dccp_flush_write_queue(struct sock *sk, long *time_budget) 296 - { 297 - struct dccp_sock *dp = dccp_sk(sk); 298 - struct sk_buff *skb; 299 - long delay, rc; 300 - 301 - while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { 302 242 rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); 243 + } while ((delay = rc) > 0); 244 + out: 245 + finish_wait(sk->sk_sleep, &wait); 246 + return rc; 303 247 304 - switch (ccid_packet_dequeue_eval(rc)) { 305 - case CCID_PACKET_WILL_DEQUEUE_LATER: 306 - /* 307 - * If the CCID determines when to send, the next sending 308 - * time is unknown or the CCID may not even send again 309 - * (e.g. remote host crashes or lost Ack packets). 310 - */ 311 - DCCP_WARN("CCID did not manage to send all packets\n"); 312 - return; 313 - case CCID_PACKET_DELAY: 314 - delay = msecs_to_jiffies(rc); 315 - if (delay > *time_budget) 316 - return; 317 - rc = dccp_wait_for_ccid(sk, delay); 318 - if (rc < 0) 319 - return; 320 - *time_budget -= (delay - rc); 321 - /* check again if we can send now */ 322 - break; 323 - case CCID_PACKET_SEND_AT_ONCE: 324 - dccp_xmit_packet(sk); 325 - break; 326 - case CCID_PACKET_ERR: 327 - skb_dequeue(&sk->sk_write_queue); 328 - kfree_skb(skb); 329 - dccp_pr_debug("packet discarded due to err=%ld\n", rc); 330 - } 331 - } 248 + do_error: 249 + rc = -EPIPE; 250 + goto out; 251 + do_interrupted: 252 + rc = -EINTR; 253 + goto out; 332 254 } 333 255 334 - void dccp_write_xmit(struct sock *sk) 256 + void dccp_write_xmit(struct sock *sk, int block) 335 257 { 336 258 struct dccp_sock *dp = dccp_sk(sk); 337 259 struct sk_buff *skb; 338 260 339 - while ((skb = dccp_qpolicy_top(sk))) { 340 - int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); 261 + while ((skb = skb_peek(&sk->sk_write_queue))) { 262 + int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); 341 263 342 - switch (ccid_packet_dequeue_eval(rc)) { 343 - case CCID_PACKET_WILL_DEQUEUE_LATER: 344 - return; 345 - case 
CCID_PACKET_DELAY: 346 - sk_reset_timer(sk, &dp->dccps_xmit_timer, 347 - jiffies + msecs_to_jiffies(rc)); 348 - return; 349 - case CCID_PACKET_SEND_AT_ONCE: 350 - dccp_xmit_packet(sk); 351 - break; 352 - case CCID_PACKET_ERR: 353 - dccp_qpolicy_drop(sk, skb); 354 - dccp_pr_debug("packet discarded due to err=%d\n", rc); 264 + if (err > 0) { 265 + if (!block) { 266 + sk_reset_timer(sk, &dp->dccps_xmit_timer, 267 + msecs_to_jiffies(err)+jiffies); 268 + break; 269 + } else 270 + err = dccp_wait_for_ccid(sk, skb, err); 271 + if (err && err != -EINTR) 272 + DCCP_BUG("err=%d after dccp_wait_for_ccid", err); 273 + } 274 + 275 + skb_dequeue(&sk->sk_write_queue); 276 + if (err == 0) { 277 + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); 278 + const int len = skb->len; 279 + 280 + if (sk->sk_state == DCCP_PARTOPEN) { 281 + /* See 8.1.5. Handshake Completion */ 282 + inet_csk_schedule_ack(sk); 283 + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 284 + inet_csk(sk)->icsk_rto, 285 + DCCP_RTO_MAX); 286 + dcb->dccpd_type = DCCP_PKT_DATAACK; 287 + } else if (dccp_ack_pending(sk)) 288 + dcb->dccpd_type = DCCP_PKT_DATAACK; 289 + else 290 + dcb->dccpd_type = DCCP_PKT_DATA; 291 + 292 + err = dccp_transmit_skb(sk, skb); 293 + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); 294 + if (err) 295 + DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", 296 + err); 297 + } else { 298 + dccp_pr_debug("packet discarded due to err=%d\n", err); 299 + kfree_skb(skb); 355 300 } 356 301 } 357 302 } ··· 339 410 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; 340 411 DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; 341 412 342 - /* Resolve feature dependencies resulting from choice of CCID */ 343 - if (dccp_feat_server_ccid_dependencies(dreq)) 344 - goto response_failed; 345 - 346 - if (dccp_insert_options_rsk(dreq, skb)) 347 - goto response_failed; 413 + if (dccp_insert_options_rsk(dreq, skb)) { 414 + kfree_skb(skb); 415 + return NULL; 416 + } 348 417 349 418 /* Build and checksum header */ 350 419 dh 
= dccp_zeroed_hdr(skb, dccp_header_size); ··· 363 436 inet_rsk(req)->acked = 1; 364 437 DCCP_INC_STATS(DCCP_MIB_OUTSEGS); 365 438 return skb; 366 - response_failed: 367 - kfree_skb(skb); 368 - return NULL; 369 439 } 370 440 371 441 EXPORT_SYMBOL_GPL(dccp_make_response); ··· 447 523 /* 448 524 * Do all connect socket setups that can be done AF independent. 449 525 */ 450 - int dccp_connect(struct sock *sk) 526 + static inline void dccp_connect_init(struct sock *sk) 451 527 { 452 - struct sk_buff *skb; 453 528 struct dccp_sock *dp = dccp_sk(sk); 454 529 struct dst_entry *dst = __sk_dst_get(sk); 455 530 struct inet_connection_sock *icsk = inet_csk(sk); ··· 458 535 459 536 dccp_sync_mss(sk, dst_mtu(dst)); 460 537 461 - /* do not connect if feature negotiation setup fails */ 462 - if (dccp_feat_finalise_settings(dccp_sk(sk))) 463 - return -EPROTO; 464 - 465 538 /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ 466 539 dp->dccps_gar = dp->dccps_iss; 540 + 541 + icsk->icsk_retransmits = 0; 542 + } 543 + 544 + int dccp_connect(struct sock *sk) 545 + { 546 + struct sk_buff *skb; 547 + struct inet_connection_sock *icsk = inet_csk(sk); 548 + 549 + dccp_connect_init(sk); 467 550 468 551 skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); 469 552 if (unlikely(skb == NULL)) ··· 480 551 481 552 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; 482 553 483 - dccp_transmit_skb(sk, dccp_skb_entail(sk, skb)); 554 + dccp_skb_entail(sk, skb); 555 + dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); 484 556 DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); 485 557 486 558 /* Timer for repeating the REQUEST until an answer. 
*/ 487 - icsk->icsk_retransmits = 0; 488 559 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 489 560 icsk->icsk_rto, DCCP_RTO_MAX); 490 561 return 0; ··· 571 642 DCCP_SKB_CB(skb)->dccpd_type = pkt_type; 572 643 DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; 573 644 574 - /* 575 - * Clear the flag in case the Sync was scheduled for out-of-band data, 576 - * such as carrying a long Ack Vector. 577 - */ 578 - dccp_sk(sk)->dccps_sync_scheduled = 0; 579 - 580 645 dccp_transmit_skb(sk, skb); 581 646 } 582 647 ··· 599 676 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; 600 677 601 678 if (active) { 602 - skb = dccp_skb_entail(sk, skb); 679 + dccp_write_xmit(sk, 1); 680 + dccp_skb_entail(sk, skb); 681 + dccp_transmit_skb(sk, skb_clone(skb, prio)); 603 682 /* 604 683 * Retransmission timer for active-close: RFC 4340, 8.3 requires 605 684 * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ ··· 614 689 */ 615 690 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 616 691 DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); 617 - } 618 - dccp_transmit_skb(sk, skb); 692 + } else 693 + dccp_transmit_skb(sk, skb); 619 694 }
+48 -27
net/dccp/probe.c
··· 46 46 struct kfifo *fifo; 47 47 spinlock_t lock; 48 48 wait_queue_head_t wait; 49 - ktime_t start; 49 + struct timespec tstart; 50 50 } dccpw; 51 51 52 - static void jdccp_write_xmit(struct sock *sk) 52 + static void printl(const char *fmt, ...) 53 53 { 54 + va_list args; 55 + int len; 56 + struct timespec now; 57 + char tbuf[256]; 58 + 59 + va_start(args, fmt); 60 + getnstimeofday(&now); 61 + 62 + now = timespec_sub(now, dccpw.tstart); 63 + 64 + len = sprintf(tbuf, "%lu.%06lu ", 65 + (unsigned long) now.tv_sec, 66 + (unsigned long) now.tv_nsec / NSEC_PER_USEC); 67 + len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); 68 + va_end(args); 69 + 70 + kfifo_put(dccpw.fifo, tbuf, len); 71 + wake_up(&dccpw.wait); 72 + } 73 + 74 + static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk, 75 + struct msghdr *msg, size_t size) 76 + { 77 + const struct dccp_minisock *dmsk = dccp_msk(sk); 54 78 const struct inet_sock *inet = inet_sk(sk); 55 - struct ccid3_hc_tx_sock *hctx = NULL; 56 - struct timespec tv; 57 - char buf[256]; 58 - int len, ccid = ccid_get_current_tx_ccid(dccp_sk(sk)); 79 + const struct ccid3_hc_tx_sock *hctx; 59 80 60 - if (ccid == DCCPC_CCID3) 81 + if (dmsk->dccpms_tx_ccid == DCCPC_CCID3) 61 82 hctx = ccid3_hc_tx_sk(sk); 83 + else 84 + hctx = NULL; 62 85 63 - if (!port || ntohs(inet->dport) == port || ntohs(inet->sport) == port) { 64 - 65 - tv = ktime_to_timespec(ktime_sub(ktime_get(), dccpw.start)); 66 - len = sprintf(buf, "%lu.%09lu %d.%d.%d.%d:%u %d.%d.%d.%d:%u %d", 67 - (unsigned long)tv.tv_sec, 68 - (unsigned long)tv.tv_nsec, 69 - NIPQUAD(inet->saddr), ntohs(inet->sport), 70 - NIPQUAD(inet->daddr), ntohs(inet->dport), ccid); 71 - 86 + if (port == 0 || ntohs(inet->dport) == port || 87 + ntohs(inet->sport) == port) { 72 88 if (hctx) 73 - len += sprintf(buf + len, " %d %d %d %u %u %u %d", 74 - hctx->s, hctx->rtt, hctx->p, hctx->x_calc, 75 - (unsigned)(hctx->x_recv >> 6), 76 - (unsigned)(hctx->x >> 6), hctx->t_ipi); 77 - 78 - len += sprintf(buf + 
len, "\n"); 79 - kfifo_put(dccpw.fifo, buf, len); 80 - wake_up(&dccpw.wait); 89 + printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %u " 90 + "%llu %llu %d\n", 91 + NIPQUAD(inet->saddr), ntohs(inet->sport), 92 + NIPQUAD(inet->daddr), ntohs(inet->dport), size, 93 + hctx->ccid3hctx_s, hctx->ccid3hctx_rtt, 94 + hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc, 95 + hctx->ccid3hctx_x_recv >> 6, 96 + hctx->ccid3hctx_x >> 6, hctx->ccid3hctx_t_ipi); 97 + else 98 + printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n", 99 + NIPQUAD(inet->saddr), ntohs(inet->sport), 100 + NIPQUAD(inet->daddr), ntohs(inet->dport), size); 81 101 } 82 102 83 103 jprobe_return(); 104 + return 0; 84 105 } 85 106 86 107 static struct jprobe dccp_send_probe = { 87 108 .kp = { 88 - .symbol_name = "dccp_write_xmit", 109 + .symbol_name = "dccp_sendmsg", 89 110 }, 90 - .entry = jdccp_write_xmit, 111 + .entry = jdccp_sendmsg, 91 112 }; 92 113 93 114 static int dccpprobe_open(struct inode *inode, struct file *file) 94 115 { 95 116 kfifo_reset(dccpw.fifo); 96 - dccpw.start = ktime_get(); 117 + getnstimeofday(&dccpw.tstart); 97 118 return 0; 98 119 } 99 120
+108 -177
net/dccp/proto.c
··· 67 67 case DCCP_OPEN: 68 68 if (oldstate != DCCP_OPEN) 69 69 DCCP_INC_STATS(DCCP_MIB_CURRESTAB); 70 - /* Client retransmits all Confirm options until entering OPEN */ 71 - if (oldstate == DCCP_PARTOPEN) 72 - dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); 73 70 break; 74 71 75 72 case DCCP_CLOSED: ··· 175 178 int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) 176 179 { 177 180 struct dccp_sock *dp = dccp_sk(sk); 181 + struct dccp_minisock *dmsk = dccp_msk(sk); 178 182 struct inet_connection_sock *icsk = inet_csk(sk); 183 + 184 + dccp_minisock_init(&dp->dccps_minisock); 179 185 180 186 icsk->icsk_rto = DCCP_TIMEOUT_INIT; 181 187 icsk->icsk_syn_retries = sysctl_dccp_request_retries; 182 188 sk->sk_state = DCCP_CLOSED; 183 189 sk->sk_write_space = dccp_write_space; 184 190 icsk->icsk_sync_mss = dccp_sync_mss; 185 - dp->dccps_mss_cache = TCP_MIN_RCVMSS; 191 + dp->dccps_mss_cache = 536; 186 192 dp->dccps_rate_last = jiffies; 187 193 dp->dccps_role = DCCP_ROLE_UNDEFINED; 188 194 dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; 189 - dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; 195 + dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1; 190 196 191 197 dccp_init_xmit_timers(sk); 192 198 193 - INIT_LIST_HEAD(&dp->dccps_featneg); 194 - /* control socket doesn't need feat nego */ 195 - if (likely(ctl_sock_initialized)) 196 - return dccp_feat_init(sk); 199 + /* 200 + * FIXME: We're hardcoding the CCID, and doing this at this point makes 201 + * the listening (master) sock get CCID control blocks, which is not 202 + * necessary, but for now, to not mess with the test userspace apps, 203 + * lets leave it here, later the real solution is to do this in a 204 + * setsockopt(CCIDs-I-want/accept). 
-acme 205 + */ 206 + if (likely(ctl_sock_initialized)) { 207 + int rc = dccp_feat_init(dmsk); 208 + 209 + if (rc) 210 + return rc; 211 + 212 + if (dmsk->dccpms_send_ack_vector) { 213 + dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL); 214 + if (dp->dccps_hc_rx_ackvec == NULL) 215 + return -ENOMEM; 216 + } 217 + dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid, 218 + sk, GFP_KERNEL); 219 + dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid, 220 + sk, GFP_KERNEL); 221 + if (unlikely(dp->dccps_hc_rx_ccid == NULL || 222 + dp->dccps_hc_tx_ccid == NULL)) { 223 + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); 224 + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); 225 + if (dmsk->dccpms_send_ack_vector) { 226 + dccp_ackvec_free(dp->dccps_hc_rx_ackvec); 227 + dp->dccps_hc_rx_ackvec = NULL; 228 + } 229 + dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; 230 + return -ENOMEM; 231 + } 232 + } else { 233 + /* control socket doesn't need feat nego */ 234 + INIT_LIST_HEAD(&dmsk->dccpms_pending); 235 + INIT_LIST_HEAD(&dmsk->dccpms_conf); 236 + } 237 + 197 238 return 0; 198 239 } 199 240 ··· 240 205 void dccp_destroy_sock(struct sock *sk) 241 206 { 242 207 struct dccp_sock *dp = dccp_sk(sk); 208 + struct dccp_minisock *dmsk = dccp_msk(sk); 243 209 244 210 /* 245 211 * DCCP doesn't use sk_write_queue, just sk_send_head ··· 258 222 kfree(dp->dccps_service_list); 259 223 dp->dccps_service_list = NULL; 260 224 261 - if (dp->dccps_hc_rx_ackvec != NULL) { 225 + if (dmsk->dccpms_send_ack_vector) { 262 226 dccp_ackvec_free(dp->dccps_hc_rx_ackvec); 263 227 dp->dccps_hc_rx_ackvec = NULL; 264 228 } ··· 267 231 dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; 268 232 269 233 /* clean up feature negotiation state */ 270 - dccp_feat_list_purge(&dp->dccps_featneg); 234 + dccp_feat_clean(dmsk); 271 235 } 272 236 273 237 EXPORT_SYMBOL_GPL(dccp_destroy_sock); ··· 277 241 struct dccp_sock *dp = dccp_sk(sk); 278 242 279 243 dp->dccps_role = DCCP_ROLE_LISTEN; 280 - /* do not 
start to listen if feature negotiation setup fails */ 281 - if (dccp_feat_finalise_settings(dp)) 282 - return -EPROTO; 283 244 return inet_csk_listen_start(sk, backlog); 284 245 } 285 246 ··· 466 433 return 0; 467 434 } 468 435 469 - static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) 436 + /* byte 1 is feature. the rest is the preference list */ 437 + static int dccp_setsockopt_change(struct sock *sk, int type, 438 + struct dccp_so_feat __user *optval) 470 439 { 471 - u8 *list, len; 472 - int i, rc; 473 - 474 - if (cscov < 0 || cscov > 15) 475 - return -EINVAL; 476 - /* 477 - * Populate a list of permissible values, in the range cscov...15. This 478 - * is necessary since feature negotiation of single values only works if 479 - * both sides incidentally choose the same value. Since the list starts 480 - * lowest-value first, negotiation will pick the smallest shared value. 481 - */ 482 - if (cscov == 0) 483 - return 0; 484 - len = 16 - cscov; 485 - 486 - list = kmalloc(len, GFP_KERNEL); 487 - if (list == NULL) 488 - return -ENOBUFS; 489 - 490 - for (i = 0; i < len; i++) 491 - list[i] = cscov++; 492 - 493 - rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); 494 - 495 - if (rc == 0) { 496 - if (rx) 497 - dccp_sk(sk)->dccps_pcrlen = cscov; 498 - else 499 - dccp_sk(sk)->dccps_pcslen = cscov; 500 - } 501 - kfree(list); 502 - return rc; 503 - } 504 - 505 - static int dccp_setsockopt_ccid(struct sock *sk, int type, 506 - char __user *optval, int optlen) 507 - { 440 + struct dccp_so_feat opt; 508 441 u8 *val; 509 - int rc = 0; 442 + int rc; 510 443 511 - if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) 444 + if (copy_from_user(&opt, optval, sizeof(opt))) 445 + return -EFAULT; 446 + /* 447 + * rfc4340: 6.1. 
Change Options 448 + */ 449 + if (opt.dccpsf_len < 1) 512 450 return -EINVAL; 513 451 514 - val = kmalloc(optlen, GFP_KERNEL); 515 - if (val == NULL) 452 + val = kmalloc(opt.dccpsf_len, GFP_KERNEL); 453 + if (!val) 516 454 return -ENOMEM; 517 455 518 - if (copy_from_user(val, optval, optlen)) { 519 - kfree(val); 520 - return -EFAULT; 456 + if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) { 457 + rc = -EFAULT; 458 + goto out_free_val; 521 459 } 522 460 523 - lock_sock(sk); 524 - if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) 525 - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); 461 + rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat, 462 + val, opt.dccpsf_len, GFP_KERNEL); 463 + if (rc) 464 + goto out_free_val; 526 465 527 - if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) 528 - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); 529 - release_sock(sk); 530 - 531 - kfree(val); 466 + out: 532 467 return rc; 468 + 469 + out_free_val: 470 + kfree(val); 471 + goto out; 533 472 } 534 473 535 474 static int do_dccp_setsockopt(struct sock *sk, int level, int optname, ··· 510 505 struct dccp_sock *dp = dccp_sk(sk); 511 506 int val, err = 0; 512 507 513 - switch (optname) { 514 - case DCCP_SOCKOPT_PACKET_SIZE: 515 - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); 516 - return 0; 517 - case DCCP_SOCKOPT_CHANGE_L: 518 - case DCCP_SOCKOPT_CHANGE_R: 519 - DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); 520 - return 0; 521 - case DCCP_SOCKOPT_CCID: 522 - case DCCP_SOCKOPT_RX_CCID: 523 - case DCCP_SOCKOPT_TX_CCID: 524 - return dccp_setsockopt_ccid(sk, optname, optval, optlen); 525 - } 526 - 527 - if (optlen < (int)sizeof(int)) 508 + if (optlen < sizeof(int)) 528 509 return -EINVAL; 529 510 530 511 if (get_user(val, (int __user *)optval)) ··· 521 530 522 531 lock_sock(sk); 523 532 switch (optname) { 533 + case DCCP_SOCKOPT_PACKET_SIZE: 534 + DCCP_WARN("sockopt(PACKET_SIZE) is 
deprecated: fix your app\n"); 535 + err = 0; 536 + break; 537 + case DCCP_SOCKOPT_CHANGE_L: 538 + if (optlen != sizeof(struct dccp_so_feat)) 539 + err = -EINVAL; 540 + else 541 + err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L, 542 + (struct dccp_so_feat __user *) 543 + optval); 544 + break; 545 + case DCCP_SOCKOPT_CHANGE_R: 546 + if (optlen != sizeof(struct dccp_so_feat)) 547 + err = -EINVAL; 548 + else 549 + err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R, 550 + (struct dccp_so_feat __user *) 551 + optval); 552 + break; 524 553 case DCCP_SOCKOPT_SERVER_TIMEWAIT: 525 554 if (dp->dccps_role != DCCP_ROLE_SERVER) 526 555 err = -EOPNOTSUPP; 527 556 else 528 557 dp->dccps_server_timewait = (val != 0); 529 558 break; 530 - case DCCP_SOCKOPT_SEND_CSCOV: 531 - err = dccp_setsockopt_cscov(sk, val, false); 532 - break; 533 - case DCCP_SOCKOPT_RECV_CSCOV: 534 - err = dccp_setsockopt_cscov(sk, val, true); 535 - break; 536 - case DCCP_SOCKOPT_QPOLICY_ID: 537 - if (sk->sk_state != DCCP_CLOSED) 538 - err = -EISCONN; 539 - else if (val < 0 || val >= DCCPQ_POLICY_MAX) 559 + case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */ 560 + if (val < 0 || val > 15) 540 561 err = -EINVAL; 541 562 else 542 - dp->dccps_qpolicy = val; 563 + dp->dccps_pcslen = val; 543 564 break; 544 - case DCCP_SOCKOPT_QPOLICY_TXQLEN: 545 - if (val < 0) 565 + case DCCP_SOCKOPT_RECV_CSCOV: /* receiver side, RFC 4340 sec. 
9.2.1 */ 566 + if (val < 0 || val > 15) 546 567 err = -EINVAL; 547 - else 548 - dp->dccps_tx_qlen = val; 568 + else { 569 + dp->dccps_pcrlen = val; 570 + /* FIXME: add feature negotiation, 571 + * ChangeL(MinimumChecksumCoverage, val) */ 572 + } 549 573 break; 550 574 default: 551 575 err = -ENOPROTOOPT; 552 576 break; 553 577 } 554 - release_sock(sk); 555 578 579 + release_sock(sk); 556 580 return err; 557 581 } 558 582 ··· 648 642 case DCCP_SOCKOPT_GET_CUR_MPS: 649 643 val = dp->dccps_mss_cache; 650 644 break; 651 - case DCCP_SOCKOPT_AVAILABLE_CCIDS: 652 - return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); 653 - case DCCP_SOCKOPT_TX_CCID: 654 - val = ccid_get_current_tx_ccid(dp); 655 - if (val < 0) 656 - return -ENOPROTOOPT; 657 - break; 658 - case DCCP_SOCKOPT_RX_CCID: 659 - val = ccid_get_current_rx_ccid(dp); 660 - if (val < 0) 661 - return -ENOPROTOOPT; 662 - break; 663 645 case DCCP_SOCKOPT_SERVER_TIMEWAIT: 664 646 val = dp->dccps_server_timewait; 665 647 break; ··· 656 662 break; 657 663 case DCCP_SOCKOPT_RECV_CSCOV: 658 664 val = dp->dccps_pcrlen; 659 - break; 660 - case DCCP_SOCKOPT_QPOLICY_ID: 661 - val = dp->dccps_qpolicy; 662 - break; 663 - case DCCP_SOCKOPT_QPOLICY_TXQLEN: 664 - val = dp->dccps_tx_qlen; 665 665 break; 666 666 case 128 ... 191: 667 667 return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, ··· 699 711 EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); 700 712 #endif 701 713 702 - static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) 703 - { 704 - struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); 705 - 706 - /* 707 - * Assign an (opaque) qpolicy priority value to skb->priority. 708 - * 709 - * We are overloading this skb field for use with the qpolicy subystem. 710 - * The skb->priority is normally used for the SO_PRIORITY option, which 711 - * is initialised from sk_priority. 
Since the assignment of sk_priority 712 - * to skb->priority happens later (on layer 3), we overload this field 713 - * for use with queueing priorities as long as the skb is on layer 4. 714 - * The default priority value (if nothing is set) is 0. 715 - */ 716 - skb->priority = 0; 717 - 718 - for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { 719 - 720 - if (!CMSG_OK(msg, cmsg)) 721 - return -EINVAL; 722 - 723 - if (cmsg->cmsg_level != SOL_DCCP) 724 - continue; 725 - 726 - if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && 727 - !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) 728 - return -EINVAL; 729 - 730 - switch (cmsg->cmsg_type) { 731 - case DCCP_SCM_PRIORITY: 732 - if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) 733 - return -EINVAL; 734 - skb->priority = *(__u32 *)CMSG_DATA(cmsg); 735 - break; 736 - default: 737 - return -EINVAL; 738 - } 739 - } 740 - return 0; 741 - } 742 - 743 714 int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 744 715 size_t len) 745 716 { ··· 714 767 715 768 lock_sock(sk); 716 769 717 - if (dccp_qpolicy_full(sk)) { 770 + if (sysctl_dccp_tx_qlen && 771 + (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) { 718 772 rc = -EAGAIN; 719 773 goto out_release; 720 774 } ··· 743 795 if (rc != 0) 744 796 goto out_discard; 745 797 746 - rc = dccp_msghdr_parse(msg, skb); 747 - if (rc != 0) 748 - goto out_discard; 749 - 750 - dccp_qpolicy_push(sk, skb); 751 - dccp_write_xmit(sk); 798 + skb_queue_tail(&sk->sk_write_queue, skb); 799 + dccp_write_xmit(sk,0); 752 800 out_release: 753 801 release_sock(sk); 754 802 return rc ? : len; ··· 967 1023 /* Check zero linger _after_ checking for unread data. */ 968 1024 sk->sk_prot->disconnect(sk, 0); 969 1025 } else if (sk->sk_state != DCCP_CLOSED) { 970 - /* 971 - * Normal connection termination. May need to wait if there are 972 - * still packets in the TX queue that are delayed by the CCID. 
973 - */ 974 - dccp_flush_write_queue(sk, &timeout); 975 1026 dccp_terminate_connection(sk); 976 1027 } 977 - 978 - /* 979 - * Flush write queue. This may be necessary in several cases: 980 - * - we have been closed by the peer but still have application data; 981 - * - abortive termination (unread data or zero linger time), 982 - * - normal termination but queue could not be flushed within time limit 983 - */ 984 - __skb_queue_purge(&sk->sk_write_queue); 985 1028 986 1029 sk_stream_wait_close(sk, timeout); 987 1030
-137
net/dccp/qpolicy.c
··· 1 - /* 2 - * net/dccp/qpolicy.c 3 - * 4 - * Policy-based packet dequeueing interface for DCCP. 5 - * 6 - * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net> 7 - * 8 - * This program is free software; you can redistribute it and/or 9 - * modify it under the terms of the GNU General Public License v2 10 - * as published by the Free Software Foundation. 11 - */ 12 - #include "dccp.h" 13 - 14 - /* 15 - * Simple Dequeueing Policy: 16 - * If tx_qlen is different from 0, enqueue up to tx_qlen elements. 17 - */ 18 - static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) 19 - { 20 - skb_queue_tail(&sk->sk_write_queue, skb); 21 - } 22 - 23 - static bool qpolicy_simple_full(struct sock *sk) 24 - { 25 - return dccp_sk(sk)->dccps_tx_qlen && 26 - sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; 27 - } 28 - 29 - static struct sk_buff *qpolicy_simple_top(struct sock *sk) 30 - { 31 - return skb_peek(&sk->sk_write_queue); 32 - } 33 - 34 - /* 35 - * Priority-based Dequeueing Policy: 36 - * If tx_qlen is different from 0 and the queue has reached its upper bound 37 - * of tx_qlen elements, replace older packets lowest-priority-first. 
38 - */ 39 - static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) 40 - { 41 - struct sk_buff *skb, *best = NULL; 42 - 43 - skb_queue_walk(&sk->sk_write_queue, skb) 44 - if (best == NULL || skb->priority > best->priority) 45 - best = skb; 46 - return best; 47 - } 48 - 49 - static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) 50 - { 51 - struct sk_buff *skb, *worst = NULL; 52 - 53 - skb_queue_walk(&sk->sk_write_queue, skb) 54 - if (worst == NULL || skb->priority < worst->priority) 55 - worst = skb; 56 - return worst; 57 - } 58 - 59 - static bool qpolicy_prio_full(struct sock *sk) 60 - { 61 - if (qpolicy_simple_full(sk)) 62 - dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); 63 - return false; 64 - } 65 - 66 - /** 67 - * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface 68 - * @push: add a new @skb to the write queue 69 - * @full: indicates that no more packets will be admitted 70 - * @top: peeks at whatever the queueing policy defines as its `top' 71 - */ 72 - static struct dccp_qpolicy_operations { 73 - void (*push) (struct sock *sk, struct sk_buff *skb); 74 - bool (*full) (struct sock *sk); 75 - struct sk_buff* (*top) (struct sock *sk); 76 - __be32 params; 77 - 78 - } qpol_table[DCCPQ_POLICY_MAX] = { 79 - [DCCPQ_POLICY_SIMPLE] = { 80 - .push = qpolicy_simple_push, 81 - .full = qpolicy_simple_full, 82 - .top = qpolicy_simple_top, 83 - .params = 0, 84 - }, 85 - [DCCPQ_POLICY_PRIO] = { 86 - .push = qpolicy_simple_push, 87 - .full = qpolicy_prio_full, 88 - .top = qpolicy_prio_best_skb, 89 - .params = DCCP_SCM_PRIORITY, 90 - }, 91 - }; 92 - 93 - /* 94 - * Externally visible interface 95 - */ 96 - void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) 97 - { 98 - qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); 99 - } 100 - 101 - bool dccp_qpolicy_full(struct sock *sk) 102 - { 103 - return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); 104 - } 105 - 106 - void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) 107 - { 
108 - if (skb != NULL) { 109 - skb_unlink(skb, &sk->sk_write_queue); 110 - kfree_skb(skb); 111 - } 112 - } 113 - 114 - struct sk_buff *dccp_qpolicy_top(struct sock *sk) 115 - { 116 - return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); 117 - } 118 - 119 - struct sk_buff *dccp_qpolicy_pop(struct sock *sk) 120 - { 121 - struct sk_buff *skb = dccp_qpolicy_top(sk); 122 - 123 - /* Clear any skb fields that we used internally */ 124 - skb->priority = 0; 125 - 126 - if (skb) 127 - skb_unlink(skb, &sk->sk_write_queue); 128 - return skb; 129 - } 130 - 131 - bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) 132 - { 133 - /* check if exactly one bit is set */ 134 - if (!param || (param & (param - 1))) 135 - return false; 136 - return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; 137 - }
+34 -30
net/dccp/sysctl.c
··· 18 18 #error This file should not be compiled without CONFIG_SYSCTL defined 19 19 #endif 20 20 21 - /* Boundary values */ 22 - static int zero = 0, 23 - u8_max = 0xFF; 24 - static unsigned long seqw_min = 32; 25 - 26 21 static struct ctl_table dccp_default_table[] = { 27 22 { 28 23 .procname = "seq_window", 29 - .data = &sysctl_dccp_sequence_window, 30 - .maxlen = sizeof(sysctl_dccp_sequence_window), 24 + .data = &sysctl_dccp_feat_sequence_window, 25 + .maxlen = sizeof(sysctl_dccp_feat_sequence_window), 31 26 .mode = 0644, 32 - .proc_handler = proc_doulongvec_minmax, 33 - .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ 27 + .proc_handler = proc_dointvec, 34 28 }, 35 29 { 36 30 .procname = "rx_ccid", 37 - .data = &sysctl_dccp_rx_ccid, 38 - .maxlen = sizeof(sysctl_dccp_rx_ccid), 31 + .data = &sysctl_dccp_feat_rx_ccid, 32 + .maxlen = sizeof(sysctl_dccp_feat_rx_ccid), 39 33 .mode = 0644, 40 - .proc_handler = proc_dointvec_minmax, 41 - .extra1 = &zero, 42 - .extra2 = &u8_max, /* RFC 4340, 10. */ 34 + .proc_handler = proc_dointvec, 43 35 }, 44 36 { 45 37 .procname = "tx_ccid", 46 - .data = &sysctl_dccp_tx_ccid, 47 - .maxlen = sizeof(sysctl_dccp_tx_ccid), 38 + .data = &sysctl_dccp_feat_tx_ccid, 39 + .maxlen = sizeof(sysctl_dccp_feat_tx_ccid), 48 40 .mode = 0644, 49 - .proc_handler = proc_dointvec_minmax, 50 - .extra1 = &zero, 51 - .extra2 = &u8_max, /* RFC 4340, 10. 
*/ 41 + .proc_handler = proc_dointvec, 42 + }, 43 + { 44 + .procname = "ack_ratio", 45 + .data = &sysctl_dccp_feat_ack_ratio, 46 + .maxlen = sizeof(sysctl_dccp_feat_ack_ratio), 47 + .mode = 0644, 48 + .proc_handler = proc_dointvec, 49 + }, 50 + { 51 + .procname = "send_ackvec", 52 + .data = &sysctl_dccp_feat_send_ack_vector, 53 + .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector), 54 + .mode = 0644, 55 + .proc_handler = proc_dointvec, 56 + }, 57 + { 58 + .procname = "send_ndp", 59 + .data = &sysctl_dccp_feat_send_ndp_count, 60 + .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count), 61 + .mode = 0644, 62 + .proc_handler = proc_dointvec, 52 63 }, 53 64 { 54 65 .procname = "request_retries", 55 66 .data = &sysctl_dccp_request_retries, 56 67 .maxlen = sizeof(sysctl_dccp_request_retries), 57 68 .mode = 0644, 58 - .proc_handler = proc_dointvec_minmax, 59 - .extra1 = &zero, 60 - .extra2 = &u8_max, 69 + .proc_handler = proc_dointvec, 61 70 }, 62 71 { 63 72 .procname = "retries1", 64 73 .data = &sysctl_dccp_retries1, 65 74 .maxlen = sizeof(sysctl_dccp_retries1), 66 75 .mode = 0644, 67 - .proc_handler = proc_dointvec_minmax, 68 - .extra1 = &zero, 69 - .extra2 = &u8_max, 76 + .proc_handler = proc_dointvec, 70 77 }, 71 78 { 72 79 .procname = "retries2", 73 80 .data = &sysctl_dccp_retries2, 74 81 .maxlen = sizeof(sysctl_dccp_retries2), 75 82 .mode = 0644, 76 - .proc_handler = proc_dointvec_minmax, 77 - .extra1 = &zero, 78 - .extra2 = &u8_max, 83 + .proc_handler = proc_dointvec, 79 84 }, 80 85 { 81 86 .procname = "tx_qlen", 82 87 .data = &sysctl_dccp_tx_qlen, 83 88 .maxlen = sizeof(sysctl_dccp_tx_qlen), 84 89 .mode = 0644, 85 - .proc_handler = proc_dointvec_minmax, 86 - .extra1 = &zero, 90 + .proc_handler = proc_dointvec, 87 91 }, 88 92 { 89 93 .procname = "sync_ratelimit",
+26 -16
net/dccp/timer.c
··· 87 87 { 88 88 struct inet_connection_sock *icsk = inet_csk(sk); 89 89 90 + /* retransmit timer is used for feature negotiation throughout 91 + * connection. In this case, no packet is re-transmitted, but rather an 92 + * ack is generated and pending changes are placed into its options. 93 + */ 94 + if (sk->sk_send_head == NULL) { 95 + dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk); 96 + if (sk->sk_state == DCCP_OPEN) 97 + dccp_send_ack(sk); 98 + goto backoff; 99 + } 100 + 90 101 /* 91 102 * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was 92 103 * sent, no need to retransmit, this sock is dead. ··· 126 115 return; 127 116 } 128 117 118 + backoff: 129 119 icsk->icsk_backoff++; 130 120 131 121 icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); ··· 249 237 sock_put(sk); 250 238 } 251 239 252 - /** 253 - * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface 254 - * See the comments above %ccid_dequeueing_decision for supported modes. 255 - */ 256 - static void dccp_write_xmitlet(unsigned long data) 240 + /* Transmit-delay timer: used by the CCIDs to delay actual send time */ 241 + static void dccp_write_xmit_timer(unsigned long data) 257 242 { 258 243 struct sock *sk = (struct sock *)data; 244 + struct dccp_sock *dp = dccp_sk(sk); 259 245 260 246 bh_lock_sock(sk); 261 247 if (sock_owned_by_user(sk)) 262 - sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); 248 + sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1); 263 249 else 264 - dccp_write_xmit(sk); 250 + dccp_write_xmit(sk, 0); 265 251 bh_unlock_sock(sk); 252 + sock_put(sk); 266 253 } 267 254 268 - static void dccp_write_xmit_timer(unsigned long data) 255 + static void dccp_init_write_xmit_timer(struct sock *sk) 269 256 { 270 - dccp_write_xmitlet(data); 271 - sock_put((struct sock *)data); 257 + struct dccp_sock *dp = dccp_sk(sk); 258 + 259 + setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 260 + (unsigned long)sk); 272 261 } 273 262 
274 263 void dccp_init_xmit_timers(struct sock *sk) 275 264 { 276 - struct dccp_sock *dp = dccp_sk(sk); 277 - 278 - tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); 279 - setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 280 - (unsigned long)sk); 265 + dccp_init_write_xmit_timer(sk); 281 266 inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, 282 267 &dccp_keepalive_timer); 283 268 } ··· 290 281 { 291 282 s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); 292 283 293 - return div_u64(delta, DCCP_TIME_RESOLUTION); 284 + do_div(delta, 10); 285 + return delta; 294 286 } 295 287 EXPORT_SYMBOL_GPL(dccp_timestamp); 296 288
+15 -2
net/ipv4/tcp_input.c
··· 811 811 } 812 812 } 813 813 814 + /* Numbers are taken from RFC3390. 815 + * 816 + * John Heffner states: 817 + * 818 + * The RFC specifies a window of no more than 4380 bytes 819 + * unless 2*MSS > 4380. Reading the pseudocode in the RFC 820 + * is a bit misleading because they use a clamp at 4380 bytes 821 + * rather than use a multiplier in the relevant range. 822 + */ 814 823 __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) 815 824 { 816 825 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 817 826 818 - if (!cwnd) 819 - cwnd = rfc3390_bytes_to_packets(tp->mss_cache); 827 + if (!cwnd) { 828 + if (tp->mss_cache > 1460) 829 + cwnd = 2; 830 + else 831 + cwnd = (tp->mss_cache > 1095) ? 3 : 4; 832 + } 820 833 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 821 834 } 822 835