diff -ur ../vger3-020314/linux/Documentation/networking/ip-sysctl.txt linux/Documentation/networking/ip-sysctl.txt --- ../vger3-020314/linux/Documentation/networking/ip-sysctl.txt Fri Mar 15 22:22:02 2002 +++ linux/Documentation/networking/ip-sysctl.txt Fri Mar 15 22:24:55 2002 @@ -126,7 +126,13 @@ if network conditions require more than default value. tcp_tw_recycle - BOOLEAN - Enable fast recycling TIME-WAIT sockets. Default value is 1. + Enable fast recycling of TIME-WAIT sockets. Default value is 0. + It should not be changed without advice/request of technical + experts. + +tcp_tw_reuse - BOOLEAN + Allow reuse of TIME-WAIT sockets for new connections when it is + safe from the protocol viewpoint. Default value is 0. It should not be changed without advice/request of technical experts. diff -ur ../vger3-020314/linux/include/linux/sysctl.h linux/include/linux/sysctl.h --- ../vger3-020314/linux/include/linux/sysctl.h Mon Dec 3 20:24:00 2001 +++ linux/include/linux/sysctl.h Fri Mar 15 22:24:55 2002 @@ -289,7 +289,8 @@ NET_TCP_ADV_WIN_SCALE=87, NET_IPV4_NONLOCAL_BIND=88, NET_IPV4_ICMP_RATELIMIT=89, - NET_IPV4_ICMP_RATEMASK=90 + NET_IPV4_ICMP_RATEMASK=90, + NET_TCP_TW_REUSE=91 }; enum { diff -ur ../vger3-020314/linux/include/net/tcp.h linux/include/net/tcp.h --- ../vger3-020314/linux/include/net/tcp.h Fri Mar 15 22:22:02 2002 +++ linux/include/net/tcp.h Fri Mar 15 22:24:55 2002 @@ -75,7 +75,7 @@ */ struct tcp_bind_bucket { unsigned short port; - unsigned short fastreuse; + signed short fastreuse; struct tcp_bind_bucket *next; struct sock *owners; struct tcp_bind_bucket **pprev; @@ -460,6 +460,7 @@ extern int sysctl_tcp_rmem[3]; extern int sysctl_tcp_app_win; extern int sysctl_tcp_adv_win_scale; +extern int sysctl_tcp_tw_reuse; extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; @@ -568,9 +569,7 @@ struct sk_buff *skb, struct open_request *req, struct dst_entry *dst); - - int (*hash_connecting) (struct sock *sk); - + int (*remember_stamp) (struct 
sock *sk); __u16 net_header_len; @@ -772,8 +771,7 @@ struct sockaddr *uaddr, int addr_len); -extern int tcp_connect(struct sock *sk, - struct sk_buff *skb); +extern int tcp_connect(struct sock *sk); extern struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, diff -ur ../vger3-020314/linux/net/ipv4/af_inet.c linux/net/ipv4/af_inet.c --- ../vger3-020314/linux/net/ipv4/af_inet.c Sat Nov 10 21:45:08 2001 +++ linux/net/ipv4/af_inet.c Fri Mar 15 22:24:55 2002 @@ -620,13 +620,6 @@ if (sk->state != TCP_CLOSE) goto out; - err = -EAGAIN; - if (sk->num == 0) { - if (sk->prot->get_port(sk, 0) != 0) - goto out; - sk->sport = htons(sk->num); - } - err = sk->prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; diff -ur ../vger3-020314/linux/net/ipv4/sysctl_net_ipv4.c linux/net/ipv4/sysctl_net_ipv4.c --- ../vger3-020314/linux/net/ipv4/sysctl_net_ipv4.c Sat Oct 27 21:44:30 2001 +++ linux/net/ipv4/sysctl_net_ipv4.c Fri Mar 15 22:24:56 2002 @@ -219,6 +219,8 @@ &sysctl_icmp_ratelimit, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ICMP_RATEMASK, "icmp_ratemask", &sysctl_icmp_ratemask, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_TW_REUSE, "tcp_tw_reuse", + &sysctl_tcp_tw_reuse, sizeof(int), 0644, NULL, &proc_dointvec}, {0} }; diff -ur ../vger3-020314/linux/net/ipv4/tcp_ipv4.c linux/net/ipv4/tcp_ipv4.c --- ../vger3-020314/linux/net/ipv4/tcp_ipv4.c Sat Feb 2 22:18:06 2002 +++ linux/net/ipv4/tcp_ipv4.c Fri Mar 15 22:24:56 2002 @@ -64,6 +64,7 @@ #include extern int sysctl_ip_dynaddr; +int sysctl_tcp_tw_reuse = 0; /* Check TCP sequence numbers in ICMP packets. 
*/ #define ICMP_MIN_LENGTH 8 @@ -162,23 +163,24 @@ local_bh_enable(); } -static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum) -{ - sk->num = snum; +static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum) +{ + sk->num = snum; if ((sk->bind_next = tb->owners) != NULL) tb->owners->bind_pprev = &sk->bind_next; tb->owners = sk; sk->bind_pprev = &tb->owners; sk->prev = (struct sock *) tb; -} +} static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) -{ +{ struct sock *sk2 = tb->owners; int sk_reuse = sk->reuse; for( ; sk2 != NULL; sk2 = sk2->bind_next) { if (sk != sk2 && + sk2->reuse <= 1 && sk->bound_dev_if == sk2->bound_dev_if) { if (!sk_reuse || !sk2->reuse || @@ -190,8 +192,8 @@ } } } - return sk2 != NULL; -} + return sk2 != NULL; +} /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. @@ -244,12 +246,14 @@ break; } if (tb != NULL && tb->owners != NULL) { - if (tb->fastreuse != 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) { + if (sk->reuse > 1) + goto success; + if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) { goto success; } else { - ret = 1; + ret = 1; if (tcp_bind_conflict(sk, tb)) - goto fail_unlock; + goto fail_unlock; } } ret = 1; @@ -266,7 +270,7 @@ tb->fastreuse = 0; success: if (sk->prev == NULL) - tcp_bind_hash(sk, tb, snum); + tcp_bind_hash(sk, tb, snum); BUG_TRAP(sk->prev == (struct sock *) tb); ret = 0; @@ -337,13 +341,13 @@ } } -static __inline__ void __tcp_v4_hash(struct sock *sk) +static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) { struct sock **skp; rwlock_t *lock; BUG_TRAP(sk->pprev==NULL); - if(sk->state == TCP_LISTEN) { + if(listen_possible && sk->state == TCP_LISTEN) { skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; lock = &tcp_lhash_lock; tcp_listen_wlock(); @@ -358,7 +362,7 @@ sk->pprev = skp; 
sock_prot_inc_use(sk->prot); write_unlock(lock); - if (sk->state == TCP_LISTEN) + if (listen_possible && sk->state == TCP_LISTEN) wake_up(&tcp_lhash_wait); } @@ -366,7 +370,7 @@ { if (sk->state != TCP_CLOSE) { local_bh_disable(); - __tcp_v4_hash(sk); + __tcp_v4_hash(sk, 1); local_bh_enable(); } } @@ -375,6 +379,9 @@ { rwlock_t *lock; + if (!sk->pprev) + goto ende; + if (sk->state == TCP_LISTEN) { local_bh_disable(); tcp_listen_wlock(); @@ -393,6 +400,8 @@ sock_prot_dec_use(sk->prot); } write_unlock_bh(lock); + + ende: if (sk->state == TCP_LISTEN) wake_up(&tcp_lhash_wait); } @@ -530,19 +539,21 @@ skb->h.th->source); } -static int tcp_v4_check_established(struct sock *sk) +/* called with local bh disabled */ +static int __tcp_v4_check_established(struct sock *sk, __u16 lport, + struct tcp_tw_bucket **twp) { u32 daddr = sk->rcv_saddr; u32 saddr = sk->daddr; int dif = sk->bound_dev_if; TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) - __u32 ports = TCP_COMBINED_PORTS(sk->dport, sk->num); - int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport); + __u32 ports = TCP_COMBINED_PORTS(sk->dport, lport); + int hash = tcp_hashfn(daddr, lport, saddr, sk->dport); struct tcp_ehash_bucket *head = &tcp_ehash[hash]; struct sock *sk2, **skp; struct tcp_tw_bucket *tw; - write_lock_bh(&head->lock); + write_lock(&head->lock); /* Check TIME-WAIT sockets first. */ for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL; @@ -566,7 +577,9 @@ fall back to VJ's scheme and use initial timestamp retrieved from peer table. */ - if (tw->ts_recent_stamp) { + if (tw->ts_recent_stamp && + (!twp || (sysctl_tcp_tw_reuse && + xtime.tv_sec - tw->ts_recent_stamp > 1))) { if ((tp->write_seq = tw->snd_nxt+65535+2) == 0) tp->write_seq = 1; tp->ts_recent = tw->ts_recent; @@ -587,6 +600,10 @@ } unique: + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. 
*/ + sk->num = lport; + sk->sport = htons(lport); BUG_TRAP(sk->pprev==NULL); if ((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; @@ -595,15 +612,16 @@ sk->pprev = skp; sk->hashent = hash; sock_prot_inc_use(sk->prot); - write_unlock_bh(&head->lock); + write_unlock(&head->lock); - if (tw) { + if (twp) { + *twp = tw; + NET_INC_STATS_BH(TimeWaitRecycled); + } else if (tw) { /* Silly. Should hash-dance instead... */ - local_bh_disable(); tcp_tw_deschedule(tw); tcp_timewait_kill(tw); NET_INC_STATS_BH(TimeWaitRecycled); - local_bh_enable(); tcp_tw_put(tw); } @@ -611,34 +629,120 @@ return 0; not_unique: - write_unlock_bh(&head->lock); + write_unlock(&head->lock); return -EADDRNOTAVAIL; } -/* Hash SYN-SENT socket to established hash table after - * checking that it is unique. Note, that without kernel lock - * we MUST make these two operations atomically. - * - * Optimization: if it is bound and tcp_bind_bucket has the only - * owner (us), we need not to scan established bucket. +/* + * Bind a port for a connect operation and hash it. */ - -int tcp_v4_hash_connecting(struct sock *sk) +static int tcp_v4_hash_connect(struct sock *sk) { unsigned short snum = sk->num; - struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)]; - struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev; + struct tcp_bind_hashbucket *head; + struct tcp_bind_bucket *tb; + + if (snum == 0) { + int rover; + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + struct tcp_tw_bucket *tw = NULL; + + local_bh_disable(); + + /* TODO. Actually it is not so bad idea to remove + * tcp_portalloc_lock before next submission to Linus. + * As soon as we touch this place at all it is time to think. + * + * Now it protects single _advisory_ variable tcp_port_rover, + * hence it is mostly useless. 
+ * Code will work nicely if we just delete it, but + * I am afraid that in the contended case it will work no better or + * even worse: another cpu will just hit the same bucket + * and spin there. + * So some cpu salt could remove both contention and + * memory pingpong. Any ideas how to do this in a nice way? + */ + spin_lock(&tcp_portalloc_lock); + rover = tcp_port_rover; + + do { + rover++; + if ((rover < low) || (rover > high)) + rover = low; + head = &tcp_bhash[tcp_bhashfn(rover)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + for (tb = head->chain; tb; tb = tb->next) { + if (tb->port == rover) { + BUG_TRAP(tb->owners != NULL); + if (tb->fastreuse >= 0) + goto next_port; + if (!__tcp_v4_check_established(sk, rover, &tw)) + goto ok; + goto next_port; + } + } + + tb = tcp_bucket_create(head, rover); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } while (--remaining > 0); + tcp_port_rover = rover; + spin_unlock(&tcp_portalloc_lock); + + local_bh_enable(); + + return -EADDRNOTAVAIL; + + ok: + /* All locks still held and bhs disabled */ + tcp_port_rover = rover; + spin_unlock(&tcp_portalloc_lock); + + tcp_bind_hash(sk, tb, rover); + if (!sk->pprev) { + sk->sport = htons(rover); + __tcp_v4_hash(sk, 0); + } + spin_unlock(&head->lock); + if (tw) { + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); + tcp_tw_put(tw); + } + + local_bh_enable(); + return 0; + } + + head = &tcp_bhash[tcp_bhashfn(snum)]; + tb = (struct tcp_bind_bucket *)sk->prev; spin_lock_bh(&head->lock); if (tb->owners == sk && sk->bind_next == NULL) { - __tcp_v4_hash(sk); + __tcp_v4_hash(sk, 0); spin_unlock_bh(&head->lock); return 0; } else { - spin_unlock_bh(&head->lock); - + int ret; + spin_unlock(&head->lock); /* No definite answer... 
Walk to established hash table */ - return tcp_v4_check_established(sk); + ret = __tcp_v4_check_established(sk, snum, NULL); + local_bh_enable(); + return ret; } } @@ -647,7 +751,6 @@ { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; - struct sk_buff *buff; struct rtable *rt; u32 daddr, nexthop; int tmp; @@ -682,12 +785,6 @@ if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr) daddr = rt->rt_dst; - err = -ENOBUFS; - buff = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation); - - if (buff == NULL) - goto failure; - if (!sk->saddr) sk->saddr = rt->rt_src; sk->rcv_saddr = sk->saddr; @@ -718,22 +815,36 @@ sk->dport = usin->sin_port; sk->daddr = daddr; + tp->ext_header_len = 0; + if (sk->protinfo.af_inet.opt) + tp->ext_header_len = sk->protinfo.af_inet.opt->optlen; + + tp->mss_clamp = 536; + + /* Socket identity is still unknown (sport may be zero). + * However we set state to SYN-SENT and, without releasing the socket + * lock, select a source port, enter ourselves into the hash tables and + * complete initialization after this. 
+ */ + tcp_set_state(sk, TCP_SYN_SENT); + err = tcp_v4_hash_connect(sk); + if (err) + goto failure; + if (!tp->write_seq) tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, sk->sport, usin->sin_port); - tp->ext_header_len = 0; - if (sk->protinfo.af_inet.opt) - tp->ext_header_len = sk->protinfo.af_inet.opt->optlen; sk->protinfo.af_inet.id = tp->write_seq^jiffies; - tp->mss_clamp = 536; + err = tcp_connect(sk); + if (err) + goto failure; - err = tcp_connect(sk, buff); - if (err == 0) - return 0; + return 0; failure: + tcp_set_state(sk, TCP_CLOSE); __sk_dst_reset(sk); sk->route_caps = 0; sk->dport = 0; @@ -786,7 +897,6 @@ req->expires = jiffies + TCP_TIMEOUT_INIT; req->retrans = 0; req->sk = NULL; - req->index = h; req->dl_next = lopt->syn_table[h]; write_lock(&tp->syn_wait_lock); @@ -1456,7 +1566,7 @@ newtp->advmss = dst->advmss; tcp_initialize_rcv_mss(newsk); - __tcp_v4_hash(newsk); + __tcp_v4_hash(newsk, 0); __tcp_inherit_port(sk, newsk); return newsk; @@ -1876,7 +1986,6 @@ tcp_v4_rebuild_header, tcp_v4_conn_request, tcp_v4_syn_recv_sock, - tcp_v4_hash_connecting, tcp_v4_remember_stamp, sizeof(struct iphdr), diff -ur ../vger3-020314/linux/net/ipv4/tcp_output.c linux/net/ipv4/tcp_output.c --- ../vger3-020314/linux/net/ipv4/tcp_output.c Sat Nov 10 21:45:08 2001 +++ linux/net/ipv4/tcp_output.c Fri Mar 15 22:24:56 2002 @@ -38,6 +38,7 @@ #include +#include #include /* People can turn this off for buggy TCP's found in printers etc. */ @@ -1157,14 +1158,14 @@ return skb; } -int tcp_connect(struct sock *sk, struct sk_buff *buff) +/* + * Do all connect socket setups that can be done AF independent. + */ +static inline void tcp_connect_init(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* Reserve space for headers. */ - skb_reserve(buff, MAX_TCP_HEADER); - /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. 
*/ @@ -1191,14 +1192,6 @@ tp->rcv_ssthresh = tp->rcv_wnd; - /* Socket identity change complete, no longer - * in TCP_CLOSE, so enter ourselves into the - * hash tables. - */ - tcp_set_state(sk,TCP_SYN_SENT); - if (tp->af_specific->hash_connecting(sk)) - goto err_out; - sk->err = 0; sk->done = 0; tp->snd_wnd = 0; @@ -1212,6 +1205,24 @@ tp->rto = TCP_TIMEOUT_INIT; tp->retransmits = 0; tcp_clear_retrans(tp); +} + +/* + * Build a SYN and send it off. + */ +int tcp_connect(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *buff; + + tcp_connect_init(sk); + + buff = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation); + if (unlikely(buff == NULL)) + return -ENOBUFS; + + /* Reserve space for headers. */ + skb_reserve(buff, MAX_TCP_HEADER); TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; TCP_ECN_send_syn(tp, buff); @@ -1234,11 +1245,6 @@ /* Timer for repeating the SYN until an answer. */ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); return 0; - -err_out: - tcp_set_state(sk,TCP_CLOSE); - kfree_skb(buff); - return -EADDRNOTAVAIL; } /* Send out a delayed ack, the caller does the policy checking diff -ur ../vger3-020314/linux/net/ipv6/tcp_ipv6.c linux/net/ipv6/tcp_ipv6.c --- ../vger3-020314/linux/net/ipv6/tcp_ipv6.c Fri Mar 15 22:22:02 2002 +++ linux/net/ipv6/tcp_ipv6.c Fri Mar 15 22:24:56 2002 @@ -133,7 +133,7 @@ break; } if (tb != NULL && tb->owners != NULL) { - if (tb->fastreuse != 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) { + if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) { goto success; } else { struct sock *sk2 = tb->owners; @@ -486,11 +486,21 @@ return -EADDRNOTAVAIL; } -static int tcp_v6_hash_connecting(struct sock *sk) +static int tcp_v6_hash_connect(struct sock *sk) { - unsigned short snum = sk->num; - struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)]; - struct tcp_bind_bucket *tb = head->chain; + struct tcp_bind_hashbucket *head; + struct tcp_bind_bucket *tb; + + /* XXX */ + if (sk->num == 0) { 
+ int err = tcp_v6_get_port(sk, sk->num); + if (err) + return err; + sk->sport = htons(sk->num); + } + + head = &tcp_bhash[tcp_bhashfn(sk->num)]; + tb = head->chain; spin_lock_bh(&head->lock); @@ -520,7 +530,6 @@ struct in6_addr saddr_buf; struct flowi fl; struct dst_entry *dst; - struct sk_buff *buff; int addr_type; int err; @@ -660,27 +669,25 @@ tp->ext_header_len = np->opt->opt_flen+np->opt->opt_nflen; tp->mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); - err = -ENOBUFS; - buff = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation); - - if (buff == NULL) - goto failure; - sk->dport = usin->sin6_port; - /* - * Init variables - */ + tcp_set_state(sk, TCP_SYN_SENT); + err = tcp_v6_hash_connect(sk); + if (err) + goto late_failure; if (!tp->write_seq) tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, np->daddr.s6_addr32, sk->sport, sk->dport); + err = tcp_connect(sk); + if (err) + goto late_failure; - err = tcp_connect(sk, buff); - if (err == 0) - return 0; + return 0; +late_failure: + tcp_set_state(sk, TCP_CLOSE); failure: __sk_dst_reset(sk); sk->dport = 0; @@ -1757,7 +1764,6 @@ tcp_v6_rebuild_header, tcp_v6_conn_request, tcp_v6_syn_recv_sock, - tcp_v6_hash_connecting, tcp_v6_remember_stamp, sizeof(struct ipv6hdr), @@ -1777,7 +1783,6 @@ tcp_v4_rebuild_header, tcp_v6_conn_request, tcp_v6_syn_recv_sock, - tcp_v4_hash_connecting, tcp_v4_remember_stamp, sizeof(struct iphdr), diff -ur ../vger3-020314/linux/net/netsyms.c linux/net/netsyms.c --- ../vger3-020314/linux/net/netsyms.c Mon Mar 11 20:56:34 2002 +++ linux/net/netsyms.c Fri Mar 15 22:24:56 2002 @@ -364,7 +364,6 @@ EXPORT_SYMBOL(tcp_v4_syn_recv_sock); EXPORT_SYMBOL(tcp_v4_do_rcv); EXPORT_SYMBOL(tcp_v4_connect); -EXPORT_SYMBOL(tcp_v4_hash_connecting); EXPORT_SYMBOL(tcp_unhash); EXPORT_SYMBOL(udp_prot); EXPORT_SYMBOL(tcp_prot);