patch-2.4.19 linux-2.4.19/net/ipv4/tcp_ipv4.c

diff -urN linux-2.4.18/net/ipv4/tcp_ipv4.c linux-2.4.19/net/ipv4/tcp_ipv4.c
@@ -64,6 +64,8 @@
 #include <linux/ipsec.h>
 
 extern int sysctl_ip_dynaddr;
+extern int sysctl_ip_default_ttl;
+int sysctl_tcp_tw_reuse = 0;
 
 /* Check TCP sequence numbers in ICMP packets. */
 #define ICMP_MIN_LENGTH 8
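
The hunk above declares the new sysctl_tcp_tw_reuse knob (default off) next to
the existing sysctl_ip_dynaddr import. Assuming the companion hunk in
sysctl_net_ipv4.c registers it as net.ipv4.tcp_tw_reuse (the registration is
not part of this file), an administrator could flip it from userspace; a
minimal sketch:

    /* Sketch: enable TIME-WAIT reuse for outgoing connections.  The /proc
     * path assumes the sysctl registration mentioned above. */
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");

        if (!f) {
            perror("tcp_tw_reuse");
            return EXIT_FAILURE;
        }
        fputs("1\n", f);        /* 0 = off (the compiled-in default), 1 = on */
        fclose(f);
        return EXIT_SUCCESS;
    }
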
@@ -162,23 +164,24 @@
 	local_bh_enable();
 }
 
-static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum) 
-{ 
-	sk->num = snum; 
+static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum)
+{
+	sk->num = snum;
 	if ((sk->bind_next = tb->owners) != NULL)
 		tb->owners->bind_pprev = &sk->bind_next;
 	tb->owners = sk;
 	sk->bind_pprev = &tb->owners;
 	sk->prev = (struct sock *) tb;
-} 
+}
 
 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
-{ 
+{
 	struct sock *sk2 = tb->owners;
 	int sk_reuse = sk->reuse;
 	
 	for( ; sk2 != NULL; sk2 = sk2->bind_next) {
 		if (sk != sk2 &&
+		    sk2->reuse <= 1 &&
 		    sk->bound_dev_if == sk2->bound_dev_if) {
 			if (!sk_reuse	||
 			    !sk2->reuse	||
@@ -190,8 +193,8 @@
 			}
 		}
 	}
-	return sk2 != NULL; 
-} 
+	return sk2 != NULL;
+}
 
 /* Obtain a reference to a local port for the given sock,
  * if snum is zero it means select any available local port.
@@ -244,12 +247,14 @@
 				break;
 	}
 	if (tb != NULL && tb->owners != NULL) {
-		if (tb->fastreuse != 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
+		if (sk->reuse > 1)
+			goto success;
+		if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
 			goto success;
 		} else {
-			ret = 1; 
+			ret = 1;
 			if (tcp_bind_conflict(sk, tb))
-				goto fail_unlock; 
+				goto fail_unlock;
 		}
 	}
 	ret = 1;
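
The two hunks above change bind-time conflict handling: a socket whose reuse
flag is greater than 1 skips the conflict scan entirely in tcp_v4_get_port
(the new "goto success"), and the "sk2->reuse <= 1" filter in
tcp_bind_conflict makes such sockets invisible to everyone else's scan. A
self-contained paraphrase of the combined rule (stub types and names such as
stub_sock are illustrative, and the rcv_saddr comparison from the elided
context is omitted):

    #include <stdio.h>

    enum st { ST_ESTABLISHED, ST_LISTEN };

    struct stub_sock {
        int reuse;              /* 0, 1 (SO_REUSEADDR), or > 1 ("always reuse") */
        int bound_dev_if;
        enum st state;
    };

    /* Does sk2 block sk from binding to the same port? */
    static int bind_conflict(const struct stub_sock *sk,
                             const struct stub_sock *sk2)
    {
        if (sk == sk2 || sk->reuse > 1 || sk2->reuse > 1)
            return 0;       /* reuse > 1 neither causes nor suffers conflicts */
        if (sk->bound_dev_if != sk2->bound_dev_if)
            return 0;       /* different devices never clash */
        return !sk->reuse || !sk2->reuse || sk2->state == ST_LISTEN;
    }

    int main(void)
    {
        struct stub_sock listener = { 0, 0, ST_LISTEN };
        struct stub_sock forced   = { 2, 0, ST_ESTABLISHED };

        printf("%d\n", bind_conflict(&forced, &listener));  /* 0: no conflict */
        return 0;
    }
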
@@ -266,7 +271,7 @@
 		tb->fastreuse = 0;
 success:
 	if (sk->prev == NULL)
-		tcp_bind_hash(sk, tb, snum); 
+		tcp_bind_hash(sk, tb, snum);
 	BUG_TRAP(sk->prev == (struct sock *) tb);
  	ret = 0;
 
@@ -337,13 +342,13 @@
 	}
 }
 
-static __inline__ void __tcp_v4_hash(struct sock *sk)
+static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
 {
 	struct sock **skp;
 	rwlock_t *lock;
 
 	BUG_TRAP(sk->pprev==NULL);
-	if(sk->state == TCP_LISTEN) {
+	if(listen_possible && sk->state == TCP_LISTEN) {
 		skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
 		lock = &tcp_lhash_lock;
 		tcp_listen_wlock();
@@ -358,7 +363,7 @@
 	sk->pprev = skp;
 	sock_prot_inc_use(sk->prot);
 	write_unlock(lock);
-	if (sk->state == TCP_LISTEN)
+	if (listen_possible && sk->state == TCP_LISTEN)
 		wake_up(&tcp_lhash_wait);
 }
 
@@ -366,7 +371,7 @@
 {
 	if (sk->state != TCP_CLOSE) {
 		local_bh_disable();
-		__tcp_v4_hash(sk);
+		__tcp_v4_hash(sk, 1);
 		local_bh_enable();
 	}
 }
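
__tcp_v4_hash now takes a listen_possible flag: tcp_v4_hash still passes 1,
while the connect and accept paths later in this patch pass a constant 0,
letting the compiler discard the listening-hash branch and its wake_up. A toy
illustration of the constant-folding pattern (stub names, not kernel code):

    #include <stdio.h>

    enum st { S_ESTABLISHED, S_LISTEN };

    struct stub_sock { enum st state; };

    static inline void hash_sock(struct stub_sock *sk, const int listen_possible)
    {
        if (listen_possible && sk->state == S_LISTEN)
            puts("-> listening hash (listen_wlock + wake_up)");
        else
            puts("-> established hash");
    }

    int main(void)
    {
        struct stub_sock sk = { S_LISTEN };

        hash_sock(&sk, 1);      /* tcp_v4_hash(): state must be checked */
        hash_sock(&sk, 0);      /* connect/accept: whole branch folds away */
        return 0;
    }
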
@@ -375,6 +380,9 @@
 {
 	rwlock_t *lock;
 
+	if (!sk->pprev)
+		goto ende;
+
 	if (sk->state == TCP_LISTEN) {
 		local_bh_disable();
 		tcp_listen_wlock();
@@ -393,6 +401,8 @@
 		sock_prot_dec_use(sk->prot);
 	}
 	write_unlock_bh(lock);
+
+ ende:
 	if (sk->state == TCP_LISTEN)
 		wake_up(&tcp_lhash_wait);
 }
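
tcp_unhash must now tolerate sockets that were never hashed, because the
reworked connect path below can fail before the socket enters any table; a
NULL sk->pprev means "not hashed" and the function jumps straight to the end
without taking a lock. A sketch of the pprev unlink convention the hash
chains use (locking omitted, names illustrative):

    #include <stdio.h>

    /* pprev points at whatever pointer currently points at this node. */
    struct node { struct node *next, **pprev; };

    static void unhash(struct node *n)
    {
        if (!n->pprev)          /* never hashed: nothing to do */
            return;
        if (n->next)
            n->next->pprev = n->pprev;
        *n->pprev = n->next;    /* unlink without knowing the list head */
        n->pprev = NULL;        /* a later call is now a no-op */
    }

    int main(void)
    {
        struct node a = { NULL, NULL }, *head = &a;

        a.pprev = &head;
        unhash(&a);
        unhash(&a);             /* safe second call, like the new guard */
        printf("head = %p\n", (void *)head);
        return 0;
    }
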
@@ -530,19 +540,21 @@
 					  skb->h.th->source);
 }
 
-static int tcp_v4_check_established(struct sock *sk)
+/* called with local bh disabled */
+static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
+				      struct tcp_tw_bucket **twp)
 {
 	u32 daddr = sk->rcv_saddr;
 	u32 saddr = sk->daddr;
 	int dif = sk->bound_dev_if;
 	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
-	__u32 ports = TCP_COMBINED_PORTS(sk->dport, sk->num);
-	int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport);
+	__u32 ports = TCP_COMBINED_PORTS(sk->dport, lport);
+	int hash = tcp_hashfn(daddr, lport, saddr, sk->dport);
 	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
 	struct sock *sk2, **skp;
 	struct tcp_tw_bucket *tw;
 
-	write_lock_bh(&head->lock);
+	write_lock(&head->lock);
 
 	/* Check TIME-WAIT sockets first. */
 	for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
@@ -566,7 +578,9 @@
 			   fall back to VJ's scheme and use initial
 			   timestamp retrieved from peer table.
 			 */
-			if (tw->ts_recent_stamp) {
+			if (tw->ts_recent_stamp &&
+			    (!twp || (sysctl_tcp_tw_reuse &&
+				      xtime.tv_sec - tw->ts_recent_stamp > 1))) {
 				if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
 					tp->write_seq = 1;
 				tp->ts_recent = tw->ts_recent;
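
The condition above decides when an existing TIME-WAIT bucket may be taken
over. A twp of NULL means the caller already owns the port (the explicit-bind
path), where reuse works as before whenever a peer timestamp was recorded;
the automatic port search additionally requires sysctl_tcp_tw_reuse and a
timestamp more than one second old, which makes it safe to assume fresh
timestamps from the new connection will be strictly larger than anything the
old one saw. A paraphrase with a hypothetical helper name:

    #include <stdio.h>
    #include <time.h>

    struct tw_stub { long ts_recent_stamp; }; /* last timestamp seen from peer */

    static int may_take_over_tw(const struct tw_stub *tw, int automatic_search,
                                int tcp_tw_reuse, long now_sec)
    {
        if (!tw->ts_recent_stamp)
            return 0;           /* no timestamp: ordering cannot be proven */
        if (!automatic_search)
            return 1;           /* twp == NULL: the pre-patch behaviour */
        return tcp_tw_reuse && now_sec - tw->ts_recent_stamp > 1;
    }

    int main(void)
    {
        struct tw_stub tw = { time(NULL) - 5 };

        printf("%d\n", may_take_over_tw(&tw, 1, 1, time(NULL)));   /* 1 */
        return 0;
    }
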
@@ -587,6 +601,10 @@
 	}
 
 unique:
+	/* Must record num and sport now. Otherwise we will see
+	 * in the hash table a socket with a funny identity. */
+	sk->num = lport;
+	sk->sport = htons(lport);
 	BUG_TRAP(sk->pprev==NULL);
 	if ((sk->next = *skp) != NULL)
 		(*skp)->pprev = &sk->next;
@@ -595,15 +613,16 @@
 	sk->pprev = skp;
 	sk->hashent = hash;
 	sock_prot_inc_use(sk->prot);
-	write_unlock_bh(&head->lock);
+	write_unlock(&head->lock);
 
-	if (tw) {
+	if (twp) {
+		*twp = tw;
+		NET_INC_STATS_BH(TimeWaitRecycled);
+	} else if (tw) {
 		/* Silly. Should hash-dance instead... */
-		local_bh_disable();
 		tcp_tw_deschedule(tw);
 		tcp_timewait_kill(tw);
 		NET_INC_STATS_BH(TimeWaitRecycled);
-		local_bh_enable();
 
 		tcp_tw_put(tw);
 	}
@@ -611,34 +630,120 @@
 	return 0;
 
 not_unique:
-	write_unlock_bh(&head->lock);
+	write_unlock(&head->lock);
 	return -EADDRNOTAVAIL;
 }
 
-/* Hash SYN-SENT socket to established hash table after
- * checking that it is unique. Note, that without kernel lock
- * we MUST make these two operations atomically.
- *
- * Optimization: if it is bound and tcp_bind_bucket has the only
- * owner (us), we need not to scan established bucket.
+/*
+ * Bind a port for a connect operation and hash it.
  */
-
-int tcp_v4_hash_connecting(struct sock *sk)
+static int tcp_v4_hash_connect(struct sock *sk)
 {
 	unsigned short snum = sk->num;
-	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
-	struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;
+	struct tcp_bind_hashbucket *head;
+	struct tcp_bind_bucket *tb;
+
+	if (snum == 0) {
+		int rover;
+		int low = sysctl_local_port_range[0];
+		int high = sysctl_local_port_range[1];
+		int remaining = (high - low) + 1;
+		struct tcp_tw_bucket *tw = NULL;
+
+		local_bh_disable();
 
+		/* TODO. Actually it is not such a bad idea to remove
+		 * tcp_portalloc_lock before the next submission to Linus.
+		 * As soon as we touch this place at all, it is time to think.
+		 *
+		 * Right now it protects a single _advisory_ variable,
+		 * tcp_port_rover, hence it is mostly useless.
+		 * The code will work nicely if we just delete it, but
+		 * I am afraid that in the contended case it will work no
+		 * better, or even worse: another cpu will just hit the same
+		 * bucket and spin there.
+		 * So some per-cpu salt could remove both the contention and
+		 * the memory pingpong. Any ideas how to do this in a nice way?
+		 */
+		spin_lock(&tcp_portalloc_lock);
+		rover = tcp_port_rover;
+
+		do {
+			rover++;
+			if ((rover < low) || (rover > high))
+				rover = low;
+			head = &tcp_bhash[tcp_bhashfn(rover)];
+			spin_lock(&head->lock);
+
+			/* Does not bother with rcv_saddr checks,
+			 * because the established check is already
+			 * unique enough.
+			 */
+			for (tb = head->chain; tb; tb = tb->next) {
+				if (tb->port == rover) {
+					BUG_TRAP(tb->owners != NULL);
+					if (tb->fastreuse >= 0)
+						goto next_port;
+					if (!__tcp_v4_check_established(sk, rover, &tw))
+						goto ok;
+					goto next_port;
+				}
+			}
+
+			tb = tcp_bucket_create(head, rover);
+			if (!tb) {
+				spin_unlock(&head->lock);
+				break;
+			}
+			tb->fastreuse = -1;
+			goto ok;
+
+		next_port:
+			spin_unlock(&head->lock);
+		} while (--remaining > 0);
+		tcp_port_rover = rover;
+		spin_unlock(&tcp_portalloc_lock);
+
+		local_bh_enable();
+
+		return -EADDRNOTAVAIL;
+
+	ok:
+		/* All locks still held and bhs disabled */
+		tcp_port_rover = rover;
+		spin_unlock(&tcp_portalloc_lock);
+
+		tcp_bind_hash(sk, tb, rover);
+		if (!sk->pprev) {
+			sk->sport = htons(rover);
+			__tcp_v4_hash(sk, 0);
+		}
+		spin_unlock(&head->lock);
+
+		if (tw) {
+			tcp_tw_deschedule(tw);
+			tcp_timewait_kill(tw);
+			tcp_tw_put(tw);
+		}
+
+		local_bh_enable();
+		return 0;
+	}
+
+	head  = &tcp_bhash[tcp_bhashfn(snum)];
+	tb  = (struct tcp_bind_bucket *)sk->prev;
 	spin_lock_bh(&head->lock);
 	if (tb->owners == sk && sk->bind_next == NULL) {
-		__tcp_v4_hash(sk);
+		__tcp_v4_hash(sk, 0);
 		spin_unlock_bh(&head->lock);
 		return 0;
 	} else {
-		spin_unlock_bh(&head->lock);
-
+		int ret;
+		spin_unlock(&head->lock);
 		/* No definite answer... Walk to established hash table */
-		return tcp_v4_check_established(sk);
+		ret = __tcp_v4_check_established(sk, snum, NULL);
+		local_bh_enable();
+		return ret;
 	}
 }
 
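tcp_v4_hash_connect replaces tcp_v4_hash_connecting: when the socket has no
port yet, it walks the ephemeral range under tcp_portalloc_lock. Ports whose
bucket has fastreuse >= 0 (ordinary bind() owners) are skipped outright;
buckets created by earlier connects (fastreuse == -1) are acceptable when the
established-table check proves the 4-tuple unique, and any bucket created
here is itself marked fastreuse = -1 so plain binders always run the conflict
scan against it. The search order in miniature, with the locking and the two
hash-table checks replaced by a hypothetical port_is_usable():

    #include <stdio.h>

    /* Hypothetical stand-in for the bind-hash and established-table checks. */
    static int port_is_usable(int port)
    {
        return port != 1024 && port != 1025;    /* pretend these are taken */
    }

    static int pick_port(int low, int high, int *rover)
    {
        int remaining = high - low + 1;

        do {
            if (++*rover < low || *rover > high)
                *rover = low;                   /* wrap, like tcp_port_rover */
            if (port_is_usable(*rover))
                return *rover;
        } while (--remaining > 0);

        return -1;      /* whole range tried: -EADDRNOTAVAIL in the kernel */
    }

    int main(void)
    {
        int rover = 1023;

        printf("picked %d\n", pick_port(1024, 4999, &rover));   /* 1026 */
        return 0;
    }
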
@@ -647,7 +752,6 @@
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
-	struct sk_buff *buff;
 	struct rtable *rt;
 	u32 daddr, nexthop;
 	int tmp;
@@ -682,12 +786,6 @@
 	if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
 		daddr = rt->rt_dst;
 
-	err = -ENOBUFS;
-	buff = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation);
-
-	if (buff == NULL)
-		goto failure;
-
 	if (!sk->saddr)
 		sk->saddr = rt->rt_src;
 	sk->rcv_saddr = sk->saddr;
@@ -718,22 +816,36 @@
 	sk->dport = usin->sin_port;
 	sk->daddr = daddr;
 
+	tp->ext_header_len = 0;
+	if (sk->protinfo.af_inet.opt)
+		tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
+
+	tp->mss_clamp = 536;
+
+	/* Socket identity is still unknown (sport may be zero).
+	 * However we set state to SYN-SENT and, without releasing the
+	 * socket lock, select a source port, enter ourselves into the
+	 * hash tables and complete initialization after this.
+	 */
+	tcp_set_state(sk, TCP_SYN_SENT);
+	err = tcp_v4_hash_connect(sk);
+	if (err)
+		goto failure;
+
 	if (!tp->write_seq)
 		tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
 							   sk->sport, usin->sin_port);
 
-	tp->ext_header_len = 0;
-	if (sk->protinfo.af_inet.opt)
-		tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
 	sk->protinfo.af_inet.id = tp->write_seq^jiffies;
 
-	tp->mss_clamp = 536;
+	err = tcp_connect(sk);
+	if (err)
+		goto failure;
 
-	err = tcp_connect(sk, buff);
-	if (err == 0)
-		return 0;
+	return 0;
 
 failure:
+	tcp_set_state(sk, TCP_CLOSE);
 	__sk_dst_reset(sk);
 	sk->route_caps = 0;
 	sk->dport = 0;
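
tcp_v4_connect is reordered around the new helper: the early alloc_skb
disappears (tcp_connect now builds its own buffer), the socket is moved to
SYN-SENT and hashed before the initial sequence number is chosen, because
secure_tcp_sequence_number needs the source port that tcp_v4_hash_connect has
just fixed, and every failure path resets the state to TCP_CLOSE. The new
ordering in skeleton form (stub types and helpers, not the kernel functions):

    #include <stdio.h>

    enum st { CLOSED, SYN_SENT };

    struct conn { enum st state; unsigned sport; unsigned write_seq; };

    /* Stand-ins for tcp_v4_hash_connect(), secure_tcp_sequence_number()
     * and tcp_connect(). */
    static int pick_port_and_hash(struct conn *c) { c->sport = 32768; return 0; }
    static unsigned isn(unsigned sport) { return sport * 2654435761u; }
    static int send_syn(struct conn *c) { (void)c; return 0; }

    static int connect_sketch(struct conn *c)
    {
        c->state = SYN_SENT;            /* identity still incomplete here */
        if (pick_port_and_hash(c))      /* fixes the source port */
            goto failure;
        if (!c->write_seq)
            c->write_seq = isn(c->sport);   /* ISN needs the final 4-tuple */
        if (send_syn(c))                /* tcp_connect() allocates its own skb */
            goto failure;
        return 0;
    failure:
        c->state = CLOSED;              /* as the patched failure path does */
        return -1;
    }

    int main(void)
    {
        struct conn c = { CLOSED, 0, 0 };

        printf("%d (sport %u)\n", connect_sketch(&c), c.sport);
        return 0;
    }
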
@@ -786,7 +898,6 @@
 	req->expires = jiffies + TCP_TIMEOUT_INIT;
 	req->retrans = 0;
 	req->sk = NULL;
-	req->index = h;
 	req->dl_next = lopt->syn_table[h];
 
 	write_lock(&tp->syn_wait_lock);
@@ -1072,6 +1183,7 @@
 	arg.n_iov = 1;
 	arg.csumoffset = offsetof(struct tcphdr, check) / 2; 
 
+	tcp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl;
 	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
 
 	TCP_INC_STATS_BH(TcpOutSegs);
@@ -1387,7 +1499,6 @@
 			NETDEBUG(if (net_ratelimit()) \
 				printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", \
 					NIPQUAD(saddr), ntohs(skb->h.th->source)));
-			TCP_INC_STATS_BH(TcpAttemptFails);
 			dst_release(dst);
 			goto drop_and_free;
 		}
@@ -1456,7 +1567,7 @@
 	newtp->advmss = dst->advmss;
 	tcp_initialize_rcv_mss(newsk);
 
-	__tcp_v4_hash(newsk);
+	__tcp_v4_hash(newsk, 0);
 	__tcp_inherit_port(sk, newsk);
 
 	return newsk;
@@ -1876,7 +1987,6 @@
 	tcp_v4_rebuild_header,
 	tcp_v4_conn_request,
 	tcp_v4_syn_recv_sock,
-	tcp_v4_hash_connecting,
 	tcp_v4_remember_stamp,
 	sizeof(struct iphdr),
 
