Blame - net/ipv4/tcp_ipv4.c - linux

blob: 5d91213d34c06c2f52a9a3217cb1dc9a2ef4c5cc [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
				9	*
				10	* IPv4 specific functions
				11	*
				12	*
				13	* code split from:
				14	* linux/ipv4/tcp.c
				15	* linux/ipv4/tcp_input.c
				16	* linux/ipv4/tcp_output.c
				17	*
				18	* See tcp.c for author information
				19	*
				20	* This program is free software; you can redistribute it and/or
				21	* modify it under the terms of the GNU General Public License
				22	* as published by the Free Software Foundation; either version
				23	* 2 of the License, or (at your option) any later version.
				24	*/
				25
				26	/*
				27	* Changes:
				28	* David S. Miller : New socket lookup architecture.
				29	* This code is dedicated to John Dyson.
				30	* David S. Miller : Change semantics of established hash,
				31	* half is devoted to TIME_WAIT sockets
				32	* and the rest go in the other half.
				33	* Andi Kleen : Add support for syncookies and fixed
				34	* some bugs: ip options weren't passed to
				35	* the TCP layer, missed a check for an
				36	* ACK bit.
				37	* Andi Kleen : Implemented fast path mtu discovery.
				38	* Fixed many serious bugs in the
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	39	* request_sock handling and moved
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	40	* most of it into the af independent code.
				41	* Added tail drop and some other bugfixes.
				42	* Added new listen semantics.
				43	* Mike McLagan : Routing by source
				44	* Juan Jose Ciarlante: ip_dynaddr bits
				45	* Andi Kleen: various fixes.
				46	* Vitaly E. Lavrov : Transparent proxy revived after year
				47	* coma.
				48	* Andi Kleen : Fix new listen.
				49	* Andi Kleen : Fix accept error reporting.
				50	* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
				51	* Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
				52	* a single port at the same time.
				53	*/
				54
				55	#include <linux/config.h>
				56
				57	#include <linux/types.h>
				58	#include <linux/fcntl.h>
				59	#include <linux/module.h>
				60	#include <linux/random.h>
				61	#include <linux/cache.h>
				62	#include <linux/jhash.h>
				63	#include <linux/init.h>
				64	#include <linux/times.h>
				65
				66	#include <net/icmp.h>
				67	#include <net/tcp.h>
				68	#include <net/ipv6.h>
				69	#include <net/inet_common.h>
				70	#include <net/xfrm.h>
				71
				72	#include <linux/inet.h>
				73	#include <linux/ipv6.h>
				74	#include <linux/stddef.h>
				75	#include <linux/proc_fs.h>
				76	#include <linux/seq_file.h>
				77
				78	extern int sysctl_ip_dynaddr;
				79	int sysctl_tcp_tw_reuse;
				80	int sysctl_tcp_low_latency;
				81
				82	/* Check TCP sequence numbers in ICMP packets. */
				83	#define ICMP_MIN_LENGTH 8
				84
				85	/* Socket used for sending RSTs */
				86	static struct socket *tcp_socket;
				87
				88	void tcp_v4_send_check(struct sock sk, struct tcphdr th, int len,
				89	struct sk_buff *skb);
				90
				91	struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
				92	.__tcp_lhash_lock = RW_LOCK_UNLOCKED,
				93	.__tcp_lhash_users = ATOMIC_INIT(0),
				94	.__tcp_lhash_wait
				95	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
				96	.__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
				97	};
				98
				99	/*
				100	* This array holds the first and last local port number.
				101	* For high-usage systems, use sysctl to change this to
				102	* 32768-61000
				103	*/
				104	int sysctl_local_port_range[2] = { 1024, 4999 };
				105	int tcp_port_rover = 1024 - 1;
				106
				107	static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				108	__u32 faddr, __u16 fport)
				109	{
				110	int h = (laddr ^ lport) ^ (faddr ^ fport);
				111	h ^= h >> 16;
				112	h ^= h >> 8;
				113	return h & (tcp_ehash_size - 1);
				114	}
				115
				116	static __inline__ int tcp_sk_hashfn(struct sock *sk)
				117	{
				118	struct inet_sock *inet = inet_sk(sk);
				119	__u32 laddr = inet->rcv_saddr;
				120	__u16 lport = inet->num;
				121	__u32 faddr = inet->daddr;
				122	__u16 fport = inet->dport;
				123
				124	return tcp_hashfn(laddr, lport, faddr, fport);
				125	}
				126
				127	/* Allocate and initialize a new TCP local port bind bucket.
				128	* The bindhash mutex for snum's hash chain must be held here.
				129	*/
				130	struct tcp_bind_bucket tcp_bucket_create(struct tcp_bind_hashbucket head,
				131	unsigned short snum)
				132	{
				133	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
				134	SLAB_ATOMIC);
				135	if (tb) {
				136	tb->port = snum;
				137	tb->fastreuse = 0;
				138	INIT_HLIST_HEAD(&tb->owners);
				139	hlist_add_head(&tb->node, &head->chain);
				140	}
				141	return tb;
				142	}
				143
				144	/* Caller must hold hashbucket lock for this tb with local BH disabled */
				145	void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
				146	{
				147	if (hlist_empty(&tb->owners)) {
				148	__hlist_del(&tb->node);
				149	kmem_cache_free(tcp_bucket_cachep, tb);
				150	}
				151	}
				152
				153	/* Caller must disable local BH processing. */
				154	static __inline__ void __tcp_inherit_port(struct sock sk, struct sock child)
				155	{
				156	struct tcp_bind_hashbucket *head =
				157	&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
				158	struct tcp_bind_bucket *tb;
				159
				160	spin_lock(&head->lock);
				161	tb = tcp_sk(sk)->bind_hash;
				162	sk_add_bind_node(child, &tb->owners);
				163	tcp_sk(child)->bind_hash = tb;
				164	spin_unlock(&head->lock);
				165	}
				166
				167	inline void tcp_inherit_port(struct sock sk, struct sock child)
				168	{
				169	local_bh_disable();
				170	__tcp_inherit_port(sk, child);
				171	local_bh_enable();
				172	}
				173
				174	void tcp_bind_hash(struct sock sk, struct tcp_bind_bucket tb,
				175	unsigned short snum)
				176	{
				177	inet_sk(sk)->num = snum;
				178	sk_add_bind_node(sk, &tb->owners);
				179	tcp_sk(sk)->bind_hash = tb;
				180	}
				181
				182	static inline int tcp_bind_conflict(struct sock sk, struct tcp_bind_bucket tb)
				183	{
				184	const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
				185	struct sock *sk2;
				186	struct hlist_node *node;
				187	int reuse = sk->sk_reuse;
				188
				189	sk_for_each_bound(sk2, node, &tb->owners) {
				190	if (sk != sk2 &&
				191	!tcp_v6_ipv6only(sk2) &&
				192	(!sk->sk_bound_dev_if \|\|
				193	!sk2->sk_bound_dev_if \|\|
				194	sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
				195	if (!reuse \|\| !sk2->sk_reuse \|\|
				196	sk2->sk_state == TCP_LISTEN) {
				197	const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
				198	if (!sk2_rcv_saddr \|\| !sk_rcv_saddr \|\|
				199	sk2_rcv_saddr == sk_rcv_saddr)
				200	break;
				201	}
				202	}
				203	}
				204	return node != NULL;
				205	}
				206
				207	/* Obtain a reference to a local port for the given sock,
				208	* if snum is zero it means select any available local port.
				209	*/
				210	static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
				211	{
				212	struct tcp_bind_hashbucket *head;
				213	struct hlist_node *node;
				214	struct tcp_bind_bucket *tb;
				215	int ret;
				216
				217	local_bh_disable();
				218	if (!snum) {
				219	int low = sysctl_local_port_range[0];
				220	int high = sysctl_local_port_range[1];
				221	int remaining = (high - low) + 1;
				222	int rover;
				223
				224	spin_lock(&tcp_portalloc_lock);
Folkert van Heusden	0b2531b	2005-05-03 14:36:08 -0700	[diff] [blame]	225	if (tcp_port_rover < low)
				226	rover = low;
				227	else
				228	rover = tcp_port_rover;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	229	do {
				230	rover++;
Folkert van Heusden	0b2531b	2005-05-03 14:36:08 -0700	[diff] [blame]	231	if (rover > high)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	232	rover = low;
				233	head = &tcp_bhash[tcp_bhashfn(rover)];
				234	spin_lock(&head->lock);
				235	tb_for_each(tb, node, &head->chain)
				236	if (tb->port == rover)
				237	goto next;
				238	break;
				239	next:
				240	spin_unlock(&head->lock);
				241	} while (--remaining > 0);
				242	tcp_port_rover = rover;
				243	spin_unlock(&tcp_portalloc_lock);
				244
				245	/* Exhausted local port range during search? */
				246	ret = 1;
				247	if (remaining <= 0)
				248	goto fail;
				249
				250	/* OK, here is the one we will use. HEAD is
				251	* non-NULL and we hold it's mutex.
				252	*/
				253	snum = rover;
				254	} else {
				255	head = &tcp_bhash[tcp_bhashfn(snum)];
				256	spin_lock(&head->lock);
				257	tb_for_each(tb, node, &head->chain)
				258	if (tb->port == snum)
				259	goto tb_found;
				260	}
				261	tb = NULL;
				262	goto tb_not_found;
				263	tb_found:
				264	if (!hlist_empty(&tb->owners)) {
				265	if (sk->sk_reuse > 1)
				266	goto success;
				267	if (tb->fastreuse > 0 &&
				268	sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
				269	goto success;
				270	} else {
				271	ret = 1;
				272	if (tcp_bind_conflict(sk, tb))
				273	goto fail_unlock;
				274	}
				275	}
				276	tb_not_found:
				277	ret = 1;
				278	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
				279	goto fail_unlock;
				280	if (hlist_empty(&tb->owners)) {
				281	if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
				282	tb->fastreuse = 1;
				283	else
				284	tb->fastreuse = 0;
				285	} else if (tb->fastreuse &&
				286	(!sk->sk_reuse \|\| sk->sk_state == TCP_LISTEN))
				287	tb->fastreuse = 0;
				288	success:
				289	if (!tcp_sk(sk)->bind_hash)
				290	tcp_bind_hash(sk, tb, snum);
				291	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
				292	ret = 0;
				293
				294	fail_unlock:
				295	spin_unlock(&head->lock);
				296	fail:
				297	local_bh_enable();
				298	return ret;
				299	}
				300
				301	/* Get rid of any references to a local port held by the
				302	* given sock.
				303	*/
				304	static void __tcp_put_port(struct sock *sk)
				305	{
				306	struct inet_sock *inet = inet_sk(sk);
				307	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
				308	struct tcp_bind_bucket *tb;
				309
				310	spin_lock(&head->lock);
				311	tb = tcp_sk(sk)->bind_hash;
				312	__sk_del_bind_node(sk);
				313	tcp_sk(sk)->bind_hash = NULL;
				314	inet->num = 0;
				315	tcp_bucket_destroy(tb);
				316	spin_unlock(&head->lock);
				317	}
				318
				319	void tcp_put_port(struct sock *sk)
				320	{
				321	local_bh_disable();
				322	__tcp_put_port(sk);
				323	local_bh_enable();
				324	}
				325
				326	/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
				327	* Look, when several writers sleep and reader wakes them up, all but one
				328	* immediately hit write lock and grab all the cpus. Exclusive sleep solves
				329	* this, _but_ remember, it adds useless work on UP machines (wake up each
				330	* exclusive lock release). It should be ifdefed really.
				331	*/
				332
				333	void tcp_listen_wlock(void)
				334	{
				335	write_lock(&tcp_lhash_lock);
				336
				337	if (atomic_read(&tcp_lhash_users)) {
				338	DEFINE_WAIT(wait);
				339
				340	for (;;) {
				341	prepare_to_wait_exclusive(&tcp_lhash_wait,
				342	&wait, TASK_UNINTERRUPTIBLE);
				343	if (!atomic_read(&tcp_lhash_users))
				344	break;
				345	write_unlock_bh(&tcp_lhash_lock);
				346	schedule();
				347	write_lock_bh(&tcp_lhash_lock);
				348	}
				349
				350	finish_wait(&tcp_lhash_wait, &wait);
				351	}
				352	}
				353
				354	static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
				355	{
				356	struct hlist_head *list;
				357	rwlock_t *lock;
				358
				359	BUG_TRAP(sk_unhashed(sk));
				360	if (listen_possible && sk->sk_state == TCP_LISTEN) {
				361	list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
				362	lock = &tcp_lhash_lock;
				363	tcp_listen_wlock();
				364	} else {
				365	list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
				366	lock = &tcp_ehash[sk->sk_hashent].lock;
				367	write_lock(lock);
				368	}
				369	__sk_add_node(sk, list);
				370	sock_prot_inc_use(sk->sk_prot);
				371	write_unlock(lock);
				372	if (listen_possible && sk->sk_state == TCP_LISTEN)
				373	wake_up(&tcp_lhash_wait);
				374	}
				375
				376	static void tcp_v4_hash(struct sock *sk)
				377	{
				378	if (sk->sk_state != TCP_CLOSE) {
				379	local_bh_disable();
				380	__tcp_v4_hash(sk, 1);
				381	local_bh_enable();
				382	}
				383	}
				384
				385	void tcp_unhash(struct sock *sk)
				386	{
				387	rwlock_t *lock;
				388
				389	if (sk_unhashed(sk))
				390	goto ende;
				391
				392	if (sk->sk_state == TCP_LISTEN) {
				393	local_bh_disable();
				394	tcp_listen_wlock();
				395	lock = &tcp_lhash_lock;
				396	} else {
				397	struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
				398	lock = &head->lock;
				399	write_lock_bh(&head->lock);
				400	}
				401
				402	if (__sk_del_node_init(sk))
				403	sock_prot_dec_use(sk->sk_prot);
				404	write_unlock_bh(lock);
				405
				406	ende:
				407	if (sk->sk_state == TCP_LISTEN)
				408	wake_up(&tcp_lhash_wait);
				409	}
				410
				411	/* Don't inline this cruft. Here are some nice properties to
				412	* exploit here. The BSD API does not allow a listening TCP
				413	* to specify the remote port nor the remote address for the
				414	* connection. So always assume those are both wildcarded
				415	* during the search since they can never be otherwise.
				416	*/
				417	static struct sock __tcp_v4_lookup_listener(struct hlist_head head, u32 daddr,
				418	unsigned short hnum, int dif)
				419	{
				420	struct sock result = NULL, sk;
				421	struct hlist_node *node;
				422	int score, hiscore;
				423
				424	hiscore=-1;
				425	sk_for_each(sk, node, head) {
				426	struct inet_sock *inet = inet_sk(sk);
				427
				428	if (inet->num == hnum && !ipv6_only_sock(sk)) {
				429	__u32 rcv_saddr = inet->rcv_saddr;
				430
				431	score = (sk->sk_family == PF_INET ? 1 : 0);
				432	if (rcv_saddr) {
				433	if (rcv_saddr != daddr)
				434	continue;
				435	score+=2;
				436	}
				437	if (sk->sk_bound_dev_if) {
				438	if (sk->sk_bound_dev_if != dif)
				439	continue;
				440	score+=2;
				441	}
				442	if (score == 5)
				443	return sk;
				444	if (score > hiscore) {
				445	hiscore = score;
				446	result = sk;
				447	}
				448	}
				449	}
				450	return result;
				451	}
				452
				453	/* Optimize the common listener case. */
				454	static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
				455	unsigned short hnum, int dif)
				456	{
				457	struct sock *sk = NULL;
				458	struct hlist_head *head;
				459
				460	read_lock(&tcp_lhash_lock);
				461	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
				462	if (!hlist_empty(head)) {
				463	struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
				464
				465	if (inet->num == hnum && !sk->sk_node.next &&
				466	(!inet->rcv_saddr \|\| inet->rcv_saddr == daddr) &&
				467	(sk->sk_family == PF_INET \|\| !ipv6_only_sock(sk)) &&
				468	!sk->sk_bound_dev_if)
				469	goto sherry_cache;
				470	sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
				471	}
				472	if (sk) {
				473	sherry_cache:
				474	sock_hold(sk);
				475	}
				476	read_unlock(&tcp_lhash_lock);
				477	return sk;
				478	}
				479
				480	/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
				481	* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
				482	*
				483	* Local BH must be disabled here.
				484	*/
				485
				486	static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
				487	u32 daddr, u16 hnum,
				488	int dif)
				489	{
				490	struct tcp_ehash_bucket *head;
				491	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
				492	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
				493	struct sock *sk;
				494	struct hlist_node *node;
				495	/* Optimize here for direct hit, only listening connections can
				496	* have wildcards anyways.
				497	*/
				498	int hash = tcp_hashfn(daddr, hnum, saddr, sport);
				499	head = &tcp_ehash[hash];
				500	read_lock(&head->lock);
				501	sk_for_each(sk, node, &head->chain) {
				502	if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
				503	goto hit; /* You sunk my battleship! */
				504	}
				505
				506	/* Must check for a TIME_WAIT'er before going to listener hash. */
				507	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
				508	if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
				509	goto hit;
				510	}
				511	sk = NULL;
				512	out:
				513	read_unlock(&head->lock);
				514	return sk;
				515	hit:
				516	sock_hold(sk);
				517	goto out;
				518	}
				519
				520	static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
				521	u32 daddr, u16 hnum, int dif)
				522	{
				523	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
				524	daddr, hnum, dif);
				525
				526	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
				527	}
				528
				529	inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
				530	u16 dport, int dif)
				531	{
				532	struct sock *sk;
				533
				534	local_bh_disable();
				535	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
				536	local_bh_enable();
				537
				538	return sk;
				539	}
				540
				541	EXPORT_SYMBOL_GPL(tcp_v4_lookup);
				542
				543	static inline __u32 tcp_v4_init_sequence(struct sock sk, struct sk_buff skb)
				544	{
				545	return secure_tcp_sequence_number(skb->nh.iph->daddr,
				546	skb->nh.iph->saddr,
				547	skb->h.th->dest,
				548	skb->h.th->source);
				549	}
				550
				551	/* called with local bh disabled */
				552	static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				553	struct tcp_tw_bucket **twp)
				554	{
				555	struct inet_sock *inet = inet_sk(sk);
				556	u32 daddr = inet->rcv_saddr;
				557	u32 saddr = inet->daddr;
				558	int dif = sk->sk_bound_dev_if;
				559	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
				560	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
				561	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
				562	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
				563	struct sock *sk2;
				564	struct hlist_node *node;
				565	struct tcp_tw_bucket *tw;
				566
				567	write_lock(&head->lock);
				568
				569	/* Check TIME-WAIT sockets first. */
				570	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
				571	tw = (struct tcp_tw_bucket *)sk2;
				572
				573	if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
				574	struct tcp_sock *tp = tcp_sk(sk);
				575
				576	/* With PAWS, it is safe from the viewpoint
				577	of data integrity. Even without PAWS it
				578	is safe provided sequence spaces do not
				579	overlap i.e. at data rates <= 80Mbit/sec.
				580
				581	Actually, the idea is close to VJ's one,
				582	only timestamp cache is held not per host,
				583	but per port pair and TW bucket is used
				584	as state holder.
				585
				586	If TW bucket has been already destroyed we
				587	fall back to VJ's scheme and use initial
				588	timestamp retrieved from peer table.
				589	*/
				590	if (tw->tw_ts_recent_stamp &&
				591	(!twp \|\| (sysctl_tcp_tw_reuse &&
				592	xtime.tv_sec -
				593	tw->tw_ts_recent_stamp > 1))) {
				594	if ((tp->write_seq =
				595	tw->tw_snd_nxt + 65535 + 2) == 0)
				596	tp->write_seq = 1;
				597	tp->rx_opt.ts_recent = tw->tw_ts_recent;
				598	tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
				599	sock_hold(sk2);
				600	goto unique;
				601	} else
				602	goto not_unique;
				603	}
				604	}
				605	tw = NULL;
				606
				607	/* And established part... */
				608	sk_for_each(sk2, node, &head->chain) {
				609	if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
				610	goto not_unique;
				611	}
				612
				613	unique:
				614	/* Must record num and sport now. Otherwise we will see
				615	* in hash table socket with a funny identity. */
				616	inet->num = lport;
				617	inet->sport = htons(lport);
				618	sk->sk_hashent = hash;
				619	BUG_TRAP(sk_unhashed(sk));
				620	__sk_add_node(sk, &head->chain);
				621	sock_prot_inc_use(sk->sk_prot);
				622	write_unlock(&head->lock);
				623
				624	if (twp) {
				625	*twp = tw;
				626	NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
				627	} else if (tw) {
				628	/* Silly. Should hash-dance instead... */
				629	tcp_tw_deschedule(tw);
				630	NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
				631
				632	tcp_tw_put(tw);
				633	}
				634
				635	return 0;
				636
				637	not_unique:
				638	write_unlock(&head->lock);
				639	return -EADDRNOTAVAIL;
				640	}
				641
				642	static inline u32 connect_port_offset(const struct sock *sk)
				643	{
				644	const struct inet_sock *inet = inet_sk(sk);
				645
				646	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
				647	inet->dport);
				648	}
				649
				650	/*
				651	* Bind a port for a connect operation and hash it.
				652	*/
				653	static inline int tcp_v4_hash_connect(struct sock *sk)
				654	{
				655	unsigned short snum = inet_sk(sk)->num;
				656	struct tcp_bind_hashbucket *head;
				657	struct tcp_bind_bucket *tb;
				658	int ret;
				659
				660	if (!snum) {
				661	int low = sysctl_local_port_range[0];
				662	int high = sysctl_local_port_range[1];
				663	int range = high - low;
				664	int i;
				665	int port;
				666	static u32 hint;
				667	u32 offset = hint + connect_port_offset(sk);
				668	struct hlist_node *node;
				669	struct tcp_tw_bucket *tw = NULL;
				670
				671	local_bh_disable();
				672	for (i = 1; i <= range; i++) {
				673	port = low + (i + offset) % range;
				674	head = &tcp_bhash[tcp_bhashfn(port)];
				675	spin_lock(&head->lock);
				676
				677	/* Does not bother with rcv_saddr checks,
				678	* because the established check is already
				679	* unique enough.
				680	*/
				681	tb_for_each(tb, node, &head->chain) {
				682	if (tb->port == port) {
				683	BUG_TRAP(!hlist_empty(&tb->owners));
				684	if (tb->fastreuse >= 0)
				685	goto next_port;
				686	if (!__tcp_v4_check_established(sk,
				687	port,
				688	&tw))
				689	goto ok;
				690	goto next_port;
				691	}
				692	}
				693
				694	tb = tcp_bucket_create(head, port);
				695	if (!tb) {
				696	spin_unlock(&head->lock);
				697	break;
				698	}
				699	tb->fastreuse = -1;
				700	goto ok;
				701
				702	next_port:
				703	spin_unlock(&head->lock);
				704	}
				705	local_bh_enable();
				706
				707	return -EADDRNOTAVAIL;
				708
				709	ok:
				710	hint += i;
				711
				712	/* Head lock still held and bh's disabled */
				713	tcp_bind_hash(sk, tb, port);
				714	if (sk_unhashed(sk)) {
				715	inet_sk(sk)->sport = htons(port);
				716	__tcp_v4_hash(sk, 0);
				717	}
				718	spin_unlock(&head->lock);
				719
				720	if (tw) {
				721	tcp_tw_deschedule(tw);
				722	tcp_tw_put(tw);
				723	}
				724
				725	ret = 0;
				726	goto out;
				727	}
				728
				729	head = &tcp_bhash[tcp_bhashfn(snum)];
				730	tb = tcp_sk(sk)->bind_hash;
				731	spin_lock_bh(&head->lock);
				732	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
				733	__tcp_v4_hash(sk, 0);
				734	spin_unlock_bh(&head->lock);
				735	return 0;
				736	} else {
				737	spin_unlock(&head->lock);
				738	/* No definite answer... Walk to established hash table */
				739	ret = __tcp_v4_check_established(sk, snum, NULL);
				740	out:
				741	local_bh_enable();
				742	return ret;
				743	}
				744	}
				745
				746	/* This will initiate an outgoing connection. */
				747	int tcp_v4_connect(struct sock sk, struct sockaddr uaddr, int addr_len)
				748	{
				749	struct inet_sock *inet = inet_sk(sk);
				750	struct tcp_sock *tp = tcp_sk(sk);
				751	struct sockaddr_in usin = (struct sockaddr_in )uaddr;
				752	struct rtable *rt;
				753	u32 daddr, nexthop;
				754	int tmp;
				755	int err;
				756
				757	if (addr_len < sizeof(struct sockaddr_in))
				758	return -EINVAL;
				759
				760	if (usin->sin_family != AF_INET)
				761	return -EAFNOSUPPORT;
				762
				763	nexthop = daddr = usin->sin_addr.s_addr;
				764	if (inet->opt && inet->opt->srr) {
				765	if (!daddr)
				766	return -EINVAL;
				767	nexthop = inet->opt->faddr;
				768	}
				769
				770	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
				771	RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
				772	IPPROTO_TCP,
				773	inet->sport, usin->sin_port, sk);
				774	if (tmp < 0)
				775	return tmp;
				776
				777	if (rt->rt_flags & (RTCF_MULTICAST \| RTCF_BROADCAST)) {
				778	ip_rt_put(rt);
				779	return -ENETUNREACH;
				780	}
				781
				782	if (!inet->opt \|\| !inet->opt->srr)
				783	daddr = rt->rt_dst;
				784
				785	if (!inet->saddr)
				786	inet->saddr = rt->rt_src;
				787	inet->rcv_saddr = inet->saddr;
				788
				789	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
				790	/* Reset inherited state */
				791	tp->rx_opt.ts_recent = 0;
				792	tp->rx_opt.ts_recent_stamp = 0;
				793	tp->write_seq = 0;
				794	}
				795
				796	if (sysctl_tcp_tw_recycle &&
				797	!tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
				798	struct inet_peer *peer = rt_get_peer(rt);
				799
				800	/* VJ's idea. We save last timestamp seen from
				801	* the destination in peer table, when entering state TIME-WAIT
				802	* and initialize rx_opt.ts_recent from it, when trying new connection.
				803	*/
				804
				805	if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
				806	tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
				807	tp->rx_opt.ts_recent = peer->tcp_ts;
				808	}
				809	}
				810
				811	inet->dport = usin->sin_port;
				812	inet->daddr = daddr;
				813
				814	tp->ext_header_len = 0;
				815	if (inet->opt)
				816	tp->ext_header_len = inet->opt->optlen;
				817
				818	tp->rx_opt.mss_clamp = 536;
				819
				820	/* Socket identity is still unknown (sport may be zero).
				821	* However we set state to SYN-SENT and not releasing socket
				822	* lock select source port, enter ourselves into the hash tables and
				823	* complete initialization after this.
				824	*/
				825	tcp_set_state(sk, TCP_SYN_SENT);
				826	err = tcp_v4_hash_connect(sk);
				827	if (err)
				828	goto failure;
				829
				830	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
				831	if (err)
				832	goto failure;
				833
				834	/* OK, now commit destination to socket. */
				835	__sk_dst_set(sk, &rt->u.dst);
				836	tcp_v4_setup_caps(sk, &rt->u.dst);
				837
				838	if (!tp->write_seq)
				839	tp->write_seq = secure_tcp_sequence_number(inet->saddr,
				840	inet->daddr,
				841	inet->sport,
				842	usin->sin_port);
				843
				844	inet->id = tp->write_seq ^ jiffies;
				845
				846	err = tcp_connect(sk);
				847	rt = NULL;
				848	if (err)
				849	goto failure;
				850
				851	return 0;
				852
				853	failure:
				854	/* This unhashes the socket and releases the local port, if necessary. */
				855	tcp_set_state(sk, TCP_CLOSE);
				856	ip_rt_put(rt);
				857	sk->sk_route_caps = 0;
				858	inet->dport = 0;
				859	return err;
				860	}
				861
				862	static __inline__ int tcp_v4_iif(struct sk_buff *skb)
				863	{
				864	return ((struct rtable *)skb->dst)->rt_iif;
				865	}
				866
				867	static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
				868	{
				869	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
				870	}
				871
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	872	static struct request_sock tcp_v4_search_req(struct tcp_sock tp,
				873	struct request_sock ***prevp,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	874	__u16 rport,
				875	__u32 raddr, __u32 laddr)
				876	{
Arnaldo Carvalho de Melo	2ad69c5	2005-06-18 22:48:55 -0700	[diff] [blame]	877	struct listen_sock *lopt = tp->accept_queue.listen_opt;
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	878	struct request_sock req, *prev;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	879
				880	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
				881	(req = *prev) != NULL;
				882	prev = &req->dl_next) {
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	883	const struct inet_request_sock *ireq = inet_rsk(req);
				884
				885	if (ireq->rmt_port == rport &&
				886	ireq->rmt_addr == raddr &&
				887	ireq->loc_addr == laddr &&
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	888	TCP_INET_FAMILY(req->rsk_ops->family)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	889	BUG_TRAP(!req->sk);
				890	*prevp = prev;
				891	break;
				892	}
				893	}
				894
				895	return req;
				896	}
				897
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	898	static void tcp_v4_synq_add(struct sock sk, struct request_sock req)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	899	{
				900	struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	2ad69c5	2005-06-18 22:48:55 -0700	[diff] [blame]	901	struct listen_sock *lopt = tp->accept_queue.listen_opt;
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	902	u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	903
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	904	reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	905	tcp_synq_added(sk);
				906	}
				907
				908
				909	/*
				910	* This routine does path mtu discovery as defined in RFC1191.
				911	*/
				912	static inline void do_pmtu_discovery(struct sock sk, struct iphdr iph,
				913	u32 mtu)
				914	{
				915	struct dst_entry *dst;
				916	struct inet_sock *inet = inet_sk(sk);
				917	struct tcp_sock *tp = tcp_sk(sk);
				918
				919	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
				920	* send out by Linux are always <576bytes so they should go through
				921	* unfragmented).
				922	*/
				923	if (sk->sk_state == TCP_LISTEN)
				924	return;
				925
				926	/* We don't check in the destentry if pmtu discovery is forbidden
				927	* on this route. We just assume that no packet_to_big packets
				928	* are send back when pmtu discovery is not active.
				929	* There is a small race when the user changes this flag in the
				930	* route, but I think that's acceptable.
				931	*/
				932	if ((dst = __sk_dst_check(sk, 0)) == NULL)
				933	return;
				934
				935	dst->ops->update_pmtu(dst, mtu);
				936
				937	/* Something is about to be wrong... Remember soft error
				938	* for the case, if this connection will not able to recover.
				939	*/
				940	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
				941	sk->sk_err_soft = EMSGSIZE;
				942
				943	mtu = dst_mtu(dst);
				944
				945	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
				946	tp->pmtu_cookie > mtu) {
				947	tcp_sync_mss(sk, mtu);
				948
				949	/* Resend the TCP packet because it's
				950	* clear that the old packet has been
				951	* dropped. This is the new "fast" path mtu
				952	* discovery.
				953	*/
				954	tcp_simple_retransmit(sk);
				955	} /* else let the usual retransmit timer handle it */
				956	}
				957
				958	/*
				959	* This routine is called by the ICMP module when it gets some
				960	* sort of error condition. If err < 0 then the socket should
				961	* be closed and the error returned to the user. If err > 0
				962	* it's just the icmp type << 8 | icmp code. After adjustment
				963	* header points to the first 8 bytes of the tcp header. We need
				964	* to find the appropriate port.
				965	*
				966	* The locking strategy used here is very "optimistic". When
				967	* someone else accesses the socket the ICMP is just dropped
				968	* and for some paths there is no check at all.
				969	* A more general error queue to queue errors for later handling
				970	* is probably better.
				971	*
				972	*/
				973
				974	void tcp_v4_err(struct sk_buff *skb, u32 info)
				975	{
				976	struct iphdr iph = (struct iphdr )skb->data;
				977	struct tcphdr th = (struct tcphdr )(skb->data + (iph->ihl << 2));
				978	struct tcp_sock *tp;
				979	struct inet_sock *inet;
				980	int type = skb->h.icmph->type;
				981	int code = skb->h.icmph->code;
				982	struct sock *sk;
				983	__u32 seq;
				984	int err;
				985
				986	if (skb->len < (iph->ihl << 2) + 8) {
				987	ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
				988	return;
				989	}
				990
				991	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
				992	th->source, tcp_v4_iif(skb));
				993	if (!sk) {
				994	ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
				995	return;
				996	}
				997	if (sk->sk_state == TCP_TIME_WAIT) {
				998	tcp_tw_put((struct tcp_tw_bucket *)sk);
				999	return;
				1000	}
				1001
				1002	bh_lock_sock(sk);
				1003	/* If too many ICMPs get dropped on busy
				1004	* servers this needs to be solved differently.
				1005	*/
				1006	if (sock_owned_by_user(sk))
				1007	NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
				1008
				1009	if (sk->sk_state == TCP_CLOSE)
				1010	goto out;
				1011
				1012	tp = tcp_sk(sk);
				1013	seq = ntohl(th->seq);
				1014	if (sk->sk_state != TCP_LISTEN &&
				1015	!between(seq, tp->snd_una, tp->snd_nxt)) {
				1016	NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
				1017	goto out;
				1018	}
				1019
				1020	switch (type) {
				1021	case ICMP_SOURCE_QUENCH:
				1022	/* Just silently ignore these. */
				1023	goto out;
				1024	case ICMP_PARAMETERPROB:
				1025	err = EPROTO;
				1026	break;
				1027	case ICMP_DEST_UNREACH:
				1028	if (code > NR_ICMP_UNREACH)
				1029	goto out;
				1030
				1031	if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
				1032	if (!sock_owned_by_user(sk))
				1033	do_pmtu_discovery(sk, iph, info);
				1034	goto out;
				1035	}
				1036
				1037	err = icmp_err_convert[code].errno;
				1038	break;
				1039	case ICMP_TIME_EXCEEDED:
				1040	err = EHOSTUNREACH;
				1041	break;
				1042	default:
				1043	goto out;
				1044	}
				1045
				1046	switch (sk->sk_state) {
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1047	struct request_sock req, *prev;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1048	case TCP_LISTEN:
				1049	if (sock_owned_by_user(sk))
				1050	goto out;
				1051
				1052	req = tcp_v4_search_req(tp, &prev, th->dest,
				1053	iph->daddr, iph->saddr);
				1054	if (!req)
				1055	goto out;
				1056
				1057	/* ICMPs are not backlogged, hence we cannot get
				1058	an established socket here.
				1059	*/
				1060	BUG_TRAP(!req->sk);
				1061
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1062	if (seq != tcp_rsk(req)->snt_isn) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1063	NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
				1064	goto out;
				1065	}
				1066
				1067	/*
				1068	* Still in SYN_RECV, just remove it silently.
				1069	* There is no good way to pass the error to the newly
				1070	* created socket, and POSIX does not want network
				1071	* errors returned from accept().
				1072	*/
				1073	tcp_synq_drop(sk, req, prev);
				1074	goto out;
				1075
				1076	case TCP_SYN_SENT:
				1077	case TCP_SYN_RECV: /* Cannot happen.
				1078	It can f.e. if SYNs crossed.
				1079	*/
				1080	if (!sock_owned_by_user(sk)) {
				1081	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
				1082	sk->sk_err = err;
				1083
				1084	sk->sk_error_report(sk);
				1085
				1086	tcp_done(sk);
				1087	} else {
				1088	sk->sk_err_soft = err;
				1089	}
				1090	goto out;
				1091	}
				1092
				1093	/* If we've already connected we will keep trying
				1094	* until we time out, or the user gives up.
				1095	*
				1096	* rfc1122 4.2.3.9 allows to consider as hard errors
				1097	* only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
				1098	* but it is obsoleted by pmtu discovery).
				1099	*
				1100	* Note, that in modern internet, where routing is unreliable
				1101	* and in each dark corner broken firewalls sit, sending random
				1102	* errors ordered by their masters even this two messages finally lose
				1103	* their original sense (even Linux sends invalid PORT_UNREACHs)
				1104	*
				1105	* Now we are in compliance with RFCs.
				1106	* --ANK (980905)
				1107	*/
				1108
				1109	inet = inet_sk(sk);
				1110	if (!sock_owned_by_user(sk) && inet->recverr) {
				1111	sk->sk_err = err;
				1112	sk->sk_error_report(sk);
				1113	} else { /* Only an error on timeout */
				1114	sk->sk_err_soft = err;
				1115	}
				1116
				1117	out:
				1118	bh_unlock_sock(sk);
				1119	sock_put(sk);
				1120	}
				1121
				1122	/* This routine computes an IPv4 TCP checksum. */
				1123	void tcp_v4_send_check(struct sock sk, struct tcphdr th, int len,
				1124	struct sk_buff *skb)
				1125	{
				1126	struct inet_sock *inet = inet_sk(sk);
				1127
				1128	if (skb->ip_summed == CHECKSUM_HW) {
				1129	th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
				1130	skb->csum = offsetof(struct tcphdr, check);
				1131	} else {
				1132	th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
				1133	csum_partial((char *)th,
				1134	th->doff << 2,
				1135	skb->csum));
				1136	}
				1137	}
				1138
				1139	/*
				1140	* This routine will send an RST to the other tcp.
				1141	*
				1142	* Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
				1143	* for reset.
				1144	* Answer: if a packet caused RST, it is not for a socket
				1145	* existing in our system, if it is matched to a socket,
				1146	* it is just duplicate segment or bug in other side's TCP.
				1147	* So that we build reply only basing on parameters
				1148	* arrived with segment.
				1149	* Exception: precedence violation. We do not implement it in any case.
				1150	*/
				1151
				1152	static void tcp_v4_send_reset(struct sk_buff *skb)
				1153	{
				1154	struct tcphdr *th = skb->h.th;
				1155	struct tcphdr rth;
				1156	struct ip_reply_arg arg;
				1157
				1158	/* Never send a reset in response to a reset. */
				1159	if (th->rst)
				1160	return;
				1161
				1162	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
				1163	return;
				1164
				1165	/* Swap the send and the receive. */
				1166	memset(&rth, 0, sizeof(struct tcphdr));
				1167	rth.dest = th->source;
				1168	rth.source = th->dest;
				1169	rth.doff = sizeof(struct tcphdr) / 4;
				1170	rth.rst = 1;
				1171
				1172	if (th->ack) {
				1173	rth.seq = th->ack_seq;
				1174	} else {
				1175	rth.ack = 1;
				1176	rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				1177	skb->len - (th->doff << 2));
				1178	}
				1179
				1180	memset(&arg, 0, sizeof arg);
				1181	arg.iov[0].iov_base = (unsigned char *)&rth;
				1182	arg.iov[0].iov_len = sizeof rth;
				1183	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				1184	skb->nh.iph->saddr, /XXX/
				1185	sizeof(struct tcphdr), IPPROTO_TCP, 0);
				1186	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
				1187
				1188	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
				1189
				1190	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
				1191	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
				1192	}
				1193
				1194	/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
				1195	outside socket context is ugly, certainly. What can I do?
				1196	*/
				1197
				1198	static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
				1199	u32 win, u32 ts)
				1200	{
				1201	struct tcphdr *th = skb->h.th;
				1202	struct {
				1203	struct tcphdr th;
				1204	u32 tsopt[3];
				1205	} rep;
				1206	struct ip_reply_arg arg;
				1207
				1208	memset(&rep.th, 0, sizeof(struct tcphdr));
				1209	memset(&arg, 0, sizeof arg);
				1210
				1211	arg.iov[0].iov_base = (unsigned char *)&rep;
				1212	arg.iov[0].iov_len = sizeof(rep.th);
				1213	if (ts) {
				1214	rep.tsopt[0] = htonl((TCPOPT_NOP << 24) \| (TCPOPT_NOP << 16) \|
				1215	(TCPOPT_TIMESTAMP << 8) \|
				1216	TCPOLEN_TIMESTAMP);
				1217	rep.tsopt[1] = htonl(tcp_time_stamp);
				1218	rep.tsopt[2] = htonl(ts);
				1219	arg.iov[0].iov_len = sizeof(rep);
				1220	}
				1221
				1222	/* Swap the send and the receive. */
				1223	rep.th.dest = th->source;
				1224	rep.th.source = th->dest;
				1225	rep.th.doff = arg.iov[0].iov_len / 4;
				1226	rep.th.seq = htonl(seq);
				1227	rep.th.ack_seq = htonl(ack);
				1228	rep.th.ack = 1;
				1229	rep.th.window = htons(win);
				1230
				1231	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				1232	skb->nh.iph->saddr, /XXX/
				1233	arg.iov[0].iov_len, IPPROTO_TCP, 0);
				1234	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
				1235
				1236	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
				1237
				1238	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
				1239	}
				1240
				1241	static void tcp_v4_timewait_ack(struct sock sk, struct sk_buff skb)
				1242	{
				1243	struct tcp_tw_bucket tw = (struct tcp_tw_bucket )sk;
				1244
				1245	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
				1246	tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
				1247
				1248	tcp_tw_put(tw);
				1249	}
				1250
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1251	static void tcp_v4_reqsk_send_ack(struct sk_buff skb, struct request_sock req)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1252	{
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1253	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1254	req->ts_recent);
				1255	}
				1256
				1257	static struct dst_entry* tcp_v4_route_req(struct sock *sk,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1258	struct request_sock *req)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1259	{
				1260	struct rtable *rt;
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1261	const struct inet_request_sock *ireq = inet_rsk(req);
				1262	struct ip_options *opt = inet_rsk(req)->opt;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1263	struct flowi fl = { .oif = sk->sk_bound_dev_if,
				1264	.nl_u = { .ip4_u =
				1265	{ .daddr = ((opt && opt->srr) ?
				1266	opt->faddr :
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1267	ireq->rmt_addr),
				1268	.saddr = ireq->loc_addr,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1269	.tos = RT_CONN_FLAGS(sk) } },
				1270	.proto = IPPROTO_TCP,
				1271	.uli_u = { .ports =
				1272	{ .sport = inet_sk(sk)->sport,
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1273	.dport = ireq->rmt_port } } };
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1274
				1275	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
				1276	IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
				1277	return NULL;
				1278	}
				1279	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
				1280	ip_rt_put(rt);
				1281	IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
				1282	return NULL;
				1283	}
				1284	return &rt->u.dst;
				1285	}
				1286
				1287	/*
				1288	* Send a SYN-ACK after having received an ACK.
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1289	* This still operates on a request_sock only, not on a big
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1290	* socket.
				1291	*/
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1292	static int tcp_v4_send_synack(struct sock sk, struct request_sock req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1293	struct dst_entry *dst)
				1294	{
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1295	const struct inet_request_sock *ireq = inet_rsk(req);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1296	int err = -1;
				1297	struct sk_buff * skb;
				1298
				1299	/* First, grab a route. */
				1300	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
				1301	goto out;
				1302
				1303	skb = tcp_make_synack(sk, dst, req);
				1304
				1305	if (skb) {
				1306	struct tcphdr *th = skb->h.th;
				1307
				1308	th->check = tcp_v4_check(th, skb->len,
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1309	ireq->loc_addr,
				1310	ireq->rmt_addr,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1311	csum_partial((char *)th, skb->len,
				1312	skb->csum));
				1313
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1314	err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
				1315	ireq->rmt_addr,
				1316	ireq->opt);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1317	if (err == NET_XMIT_CN)
				1318	err = 0;
				1319	}
				1320
				1321	out:
				1322	dst_release(dst);
				1323	return err;
				1324	}
				1325
				1326	/*
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1327	* IPv4 request_sock destructor.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1328	*/
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1329	static void tcp_v4_reqsk_destructor(struct request_sock *req)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1330	{
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1331	if (inet_rsk(req)->opt)
				1332	kfree(inet_rsk(req)->opt);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1333	}
				1334
				1335	static inline void syn_flood_warning(struct sk_buff *skb)
				1336	{
				1337	static unsigned long warntime;
				1338
				1339	if (time_after(jiffies, (warntime + HZ * 60))) {
				1340	warntime = jiffies;
				1341	printk(KERN_INFO
				1342	"possible SYN flooding on port %d. Sending cookies.\n",
				1343	ntohs(skb->h.th->dest));
				1344	}
				1345	}
				1346
				1347	/*
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1348	* Save and compile IPv4 options into the request_sock if needed.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1349	*/
				1350	static inline struct ip_options tcp_v4_save_options(struct sock sk,
				1351	struct sk_buff *skb)
				1352	{
				1353	struct ip_options *opt = &(IPCB(skb)->opt);
				1354	struct ip_options *dopt = NULL;
				1355
				1356	if (opt && opt->optlen) {
				1357	int opt_size = optlength(opt);
				1358	dopt = kmalloc(opt_size, GFP_ATOMIC);
				1359	if (dopt) {
				1360	if (ip_options_echo(dopt, skb)) {
				1361	kfree(dopt);
				1362	dopt = NULL;
				1363	}
				1364	}
				1365	}
				1366	return dopt;
				1367	}
				1368
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1369	struct request_sock_ops tcp_request_sock_ops = {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1370	.family = PF_INET,
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1371	.obj_size = sizeof(struct tcp_request_sock),
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1372	.rtx_syn_ack = tcp_v4_send_synack,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1373	.send_ack = tcp_v4_reqsk_send_ack,
				1374	.destructor = tcp_v4_reqsk_destructor,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1375	.send_reset = tcp_v4_send_reset,
				1376	};
				1377
				1378	int tcp_v4_conn_request(struct sock sk, struct sk_buff skb)
				1379	{
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1380	struct inet_request_sock *ireq;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1381	struct tcp_options_received tmp_opt;
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1382	struct request_sock *req;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1383	__u32 saddr = skb->nh.iph->saddr;
				1384	__u32 daddr = skb->nh.iph->daddr;
				1385	__u32 isn = TCP_SKB_CB(skb)->when;
				1386	struct dst_entry *dst = NULL;
				1387	#ifdef CONFIG_SYN_COOKIES
				1388	int want_cookie = 0;
				1389	#else
				1390	#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
				1391	#endif
				1392
				1393	/* Never answer to SYNs send to broadcast or multicast */
				1394	if (((struct rtable *)skb->dst)->rt_flags &
				1395	(RTCF_BROADCAST \| RTCF_MULTICAST))
				1396	goto drop;
				1397
				1398	/* TW buckets are converted to open requests without
				1399	* limitations, they conserve resources and peer is
				1400	* evidently real one.
				1401	*/
				1402	if (tcp_synq_is_full(sk) && !isn) {
				1403	#ifdef CONFIG_SYN_COOKIES
				1404	if (sysctl_tcp_syncookies) {
				1405	want_cookie = 1;
				1406	} else
				1407	#endif
				1408	goto drop;
				1409	}
				1410
				1411	/* Accept backlog is full. If we have already queued enough
				1412	* of warm entries in syn queue, drop request. It is better than
				1413	* clogging syn queue with openreqs with exponentially increasing
				1414	* timeout.
				1415	*/
				1416	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
				1417	goto drop;
				1418
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1419	req = reqsk_alloc(&tcp_request_sock_ops);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1420	if (!req)
				1421	goto drop;
				1422
				1423	tcp_clear_options(&tmp_opt);
				1424	tmp_opt.mss_clamp = 536;
				1425	tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
				1426
				1427	tcp_parse_options(skb, &tmp_opt, 0);
				1428
				1429	if (want_cookie) {
				1430	tcp_clear_options(&tmp_opt);
				1431	tmp_opt.saw_tstamp = 0;
				1432	}
				1433
				1434	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
				1435	/* Some OSes (unknown ones, but I see them on web server, which
				1436	* contains information interesting only for windows'
				1437	* users) do not send their stamp in SYN. It is easy case.
				1438	* We simply do not advertise TS support.
				1439	*/
				1440	tmp_opt.saw_tstamp = 0;
				1441	tmp_opt.tstamp_ok = 0;
				1442	}
				1443	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
				1444
				1445	tcp_openreq_init(req, &tmp_opt, skb);
				1446
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1447	ireq = inet_rsk(req);
				1448	ireq->loc_addr = daddr;
				1449	ireq->rmt_addr = saddr;
				1450	ireq->opt = tcp_v4_save_options(sk, skb);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1451	if (!want_cookie)
				1452	TCP_ECN_create_request(req, skb->h.th);
				1453
				1454	if (want_cookie) {
				1455	#ifdef CONFIG_SYN_COOKIES
				1456	syn_flood_warning(skb);
				1457	#endif
				1458	isn = cookie_v4_init_sequence(sk, skb, &req->mss);
				1459	} else if (!isn) {
				1460	struct inet_peer *peer = NULL;
				1461
				1462	/* VJ's idea. We save last timestamp seen
				1463	* from the destination in peer table, when entering
				1464	* state TIME-WAIT, and check against it before
				1465	* accepting new connection request.
				1466	*
				1467	* If "isn" is not zero, this request hit alive
				1468	* timewait bucket, so that all the necessary checks
				1469	* are made in the function processing timewait state.
				1470	*/
				1471	if (tmp_opt.saw_tstamp &&
				1472	sysctl_tcp_tw_recycle &&
				1473	(dst = tcp_v4_route_req(sk, req)) != NULL &&
				1474	(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
				1475	peer->v4daddr == saddr) {
				1476	if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
				1477	(s32)(peer->tcp_ts - req->ts_recent) >
				1478	TCP_PAWS_WINDOW) {
				1479	NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				1480	dst_release(dst);
				1481	goto drop_and_free;
				1482	}
				1483	}
				1484	/* Kill the following clause, if you dislike this way. */
				1485	else if (!sysctl_tcp_syncookies &&
				1486	(sysctl_max_syn_backlog - tcp_synq_len(sk) <
				1487	(sysctl_max_syn_backlog >> 2)) &&
				1488	(!peer \|\| !peer->tcp_ts_stamp) &&
				1489	(!dst \|\| !dst_metric(dst, RTAX_RTT))) {
				1490	/* Without syncookies last quarter of
				1491	* backlog is filled with destinations,
				1492	* proven to be alive.
				1493	* It means that we continue to communicate
				1494	* to destinations, already remembered
				1495	* to the moment of synflood.
				1496	*/
Heikki Orsila	ca93345	2005-08-08 14:26:52 -0700	[diff] [blame]	1497	LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
				1498	"request from %u.%u."
				1499	"%u.%u/%u\n",
				1500	NIPQUAD(saddr),
				1501	ntohs(skb->h.th->source)));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1502	dst_release(dst);
				1503	goto drop_and_free;
				1504	}
				1505
				1506	isn = tcp_v4_init_sequence(sk, skb);
				1507	}
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1508	tcp_rsk(req)->snt_isn = isn;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1509
				1510	if (tcp_v4_send_synack(sk, req, dst))
				1511	goto drop_and_free;
				1512
				1513	if (want_cookie) {
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1514	reqsk_free(req);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1515	} else {
				1516	tcp_v4_synq_add(sk, req);
				1517	}
				1518	return 0;
				1519
				1520	drop_and_free:
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1521	reqsk_free(req);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1522	drop:
				1523	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
				1524	return 0;
				1525	}
				1526
				1527
				1528	/*
				1529	* The three way handshake has completed - we got a valid synack -
				1530	* now create the new socket.
				1531	*/
				1532	struct sock tcp_v4_syn_recv_sock(struct sock sk, struct sk_buff *skb,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1533	struct request_sock *req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1534	struct dst_entry *dst)
				1535	{
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1536	struct inet_request_sock *ireq;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1537	struct inet_sock *newinet;
				1538	struct tcp_sock *newtp;
				1539	struct sock *newsk;
				1540
				1541	if (sk_acceptq_is_full(sk))
				1542	goto exit_overflow;
				1543
				1544	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
				1545	goto exit;
				1546
				1547	newsk = tcp_create_openreq_child(sk, req, skb);
				1548	if (!newsk)
				1549	goto exit;
				1550
				1551	newsk->sk_dst_cache = dst;
				1552	tcp_v4_setup_caps(newsk, dst);
				1553
				1554	newtp = tcp_sk(newsk);
				1555	newinet = inet_sk(newsk);
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1556	ireq = inet_rsk(req);
				1557	newinet->daddr = ireq->rmt_addr;
				1558	newinet->rcv_saddr = ireq->loc_addr;
				1559	newinet->saddr = ireq->loc_addr;
				1560	newinet->opt = ireq->opt;
				1561	ireq->opt = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1562	newinet->mc_index = tcp_v4_iif(skb);
				1563	newinet->mc_ttl = skb->nh.iph->ttl;
				1564	newtp->ext_header_len = 0;
				1565	if (newinet->opt)
				1566	newtp->ext_header_len = newinet->opt->optlen;
				1567	newinet->id = newtp->write_seq ^ jiffies;
				1568
				1569	tcp_sync_mss(newsk, dst_mtu(dst));
				1570	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
				1571	tcp_initialize_rcv_mss(newsk);
				1572
				1573	__tcp_v4_hash(newsk, 0);
				1574	__tcp_inherit_port(sk, newsk);
				1575
				1576	return newsk;
				1577
				1578	exit_overflow:
				1579	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
				1580	exit:
				1581	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
				1582	dst_release(dst);
				1583	return NULL;
				1584	}
				1585
				1586	static struct sock tcp_v4_hnd_req(struct sock sk, struct sk_buff *skb)
				1587	{
				1588	struct tcphdr *th = skb->h.th;
				1589	struct iphdr *iph = skb->nh.iph;
				1590	struct tcp_sock *tp = tcp_sk(sk);
				1591	struct sock *nsk;
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1592	struct request_sock **prev;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1593	/* Find possible connection requests. */
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1594	struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1595	iph->saddr, iph->daddr);
				1596	if (req)
				1597	return tcp_check_req(sk, skb, req, prev);
				1598
				1599	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
				1600	th->source,
				1601	skb->nh.iph->daddr,
				1602	ntohs(th->dest),
				1603	tcp_v4_iif(skb));
				1604
				1605	if (nsk) {
				1606	if (nsk->sk_state != TCP_TIME_WAIT) {
				1607	bh_lock_sock(nsk);
				1608	return nsk;
				1609	}
				1610	tcp_tw_put((struct tcp_tw_bucket *)nsk);
				1611	return NULL;
				1612	}
				1613
				1614	#ifdef CONFIG_SYN_COOKIES
				1615	if (!th->rst && !th->syn && th->ack)
				1616	sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
				1617	#endif
				1618	return sk;
				1619	}
				1620
				1621	static int tcp_v4_checksum_init(struct sk_buff *skb)
				1622	{
				1623	if (skb->ip_summed == CHECKSUM_HW) {
				1624	skb->ip_summed = CHECKSUM_UNNECESSARY;
				1625	if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				1626	skb->nh.iph->daddr, skb->csum))
				1627	return 0;
				1628
Heikki Orsila	ca93345	2005-08-08 14:26:52 -0700	[diff] [blame]	1629	LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1630	skb->ip_summed = CHECKSUM_NONE;
				1631	}
				1632	if (skb->len <= 76) {
				1633	if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				1634	skb->nh.iph->daddr,
				1635	skb_checksum(skb, 0, skb->len, 0)))
				1636	return -1;
				1637	skb->ip_summed = CHECKSUM_UNNECESSARY;
				1638	} else {
				1639	skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
				1640	skb->nh.iph->saddr,
				1641	skb->nh.iph->daddr, 0);
				1642	}
				1643	return 0;
				1644	}
				1645
				1646
				1647	/* The socket must have it's spinlock held when we get
				1648	* here.
				1649	*
				1650	* We have a potential double-lock case here, so even when
				1651	* doing backlog processing we use the BH locking scheme.
				1652	* This is because we cannot sleep with the original spinlock
				1653	* held.
				1654	*/
				1655	int tcp_v4_do_rcv(struct sock sk, struct sk_buff skb)
				1656	{
				1657	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
				1658	TCP_CHECK_TIMER(sk);
				1659	if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
				1660	goto reset;
				1661	TCP_CHECK_TIMER(sk);
				1662	return 0;
				1663	}
				1664
				1665	if (skb->len < (skb->h.th->doff << 2) \|\| tcp_checksum_complete(skb))
				1666	goto csum_err;
				1667
				1668	if (sk->sk_state == TCP_LISTEN) {
				1669	struct sock *nsk = tcp_v4_hnd_req(sk, skb);
				1670	if (!nsk)
				1671	goto discard;
				1672
				1673	if (nsk != sk) {
				1674	if (tcp_child_process(sk, nsk, skb))
				1675	goto reset;
				1676	return 0;
				1677	}
				1678	}
				1679
				1680	TCP_CHECK_TIMER(sk);
				1681	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
				1682	goto reset;
				1683	TCP_CHECK_TIMER(sk);
				1684	return 0;
				1685
				1686	reset:
				1687	tcp_v4_send_reset(skb);
				1688	discard:
				1689	kfree_skb(skb);
				1690	/* Be careful here. If this function gets more complicated and
				1691	* gcc suffers from register pressure on the x86, sk (in %ebx)
				1692	* might be destroyed here. This current version compiles correctly,
				1693	* but you have been warned.
				1694	*/
				1695	return 0;
				1696
				1697	csum_err:
				1698	TCP_INC_STATS_BH(TCP_MIB_INERRS);
				1699	goto discard;
				1700	}
				1701
				1702	/*
				1703	* From tcp_input.c
				1704	*/
				1705
				1706	int tcp_v4_rcv(struct sk_buff *skb)
				1707	{
				1708	struct tcphdr *th;
				1709	struct sock *sk;
				1710	int ret;
				1711
				1712	if (skb->pkt_type != PACKET_HOST)
				1713	goto discard_it;
				1714
				1715	/* Count it even if it's bad */
				1716	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
				1717
				1718	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
				1719	goto discard_it;
				1720
				1721	th = skb->h.th;
				1722
				1723	if (th->doff < sizeof(struct tcphdr) / 4)
				1724	goto bad_packet;
				1725	if (!pskb_may_pull(skb, th->doff * 4))
				1726	goto discard_it;
				1727
				1728	/* An explanation is required here, I think.
				1729	* Packet length and doff are validated by header prediction,
				1730	* provided case of th->doff==0 is elimineted.
				1731	* So, we defer the checks. */
				1732	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
				1733	tcp_v4_checksum_init(skb) < 0))
				1734	goto bad_packet;
				1735
				1736	th = skb->h.th;
				1737	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
				1738	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				1739	skb->len - th->doff * 4);
				1740	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
				1741	TCP_SKB_CB(skb)->when = 0;
				1742	TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
				1743	TCP_SKB_CB(skb)->sacked = 0;
				1744
				1745	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
				1746	skb->nh.iph->daddr, ntohs(th->dest),
				1747	tcp_v4_iif(skb));
				1748
				1749	if (!sk)
				1750	goto no_tcp_socket;
				1751
				1752	process:
				1753	if (sk->sk_state == TCP_TIME_WAIT)
				1754	goto do_time_wait;
				1755
				1756	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
				1757	goto discard_and_relse;
				1758
				1759	if (sk_filter(sk, skb, 0))
				1760	goto discard_and_relse;
				1761
				1762	skb->dev = NULL;
				1763
				1764	bh_lock_sock(sk);
				1765	ret = 0;
				1766	if (!sock_owned_by_user(sk)) {
				1767	if (!tcp_prequeue(sk, skb))
				1768	ret = tcp_v4_do_rcv(sk, skb);
				1769	} else
				1770	sk_add_backlog(sk, skb);
				1771	bh_unlock_sock(sk);
				1772
				1773	sock_put(sk);
				1774
				1775	return ret;
				1776
				1777	no_tcp_socket:
				1778	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
				1779	goto discard_it;
				1780
				1781	if (skb->len < (th->doff << 2) \|\| tcp_checksum_complete(skb)) {
				1782	bad_packet:
				1783	TCP_INC_STATS_BH(TCP_MIB_INERRS);
				1784	} else {
				1785	tcp_v4_send_reset(skb);
				1786	}
				1787
				1788	discard_it:
				1789	/* Discard frame. */
				1790	kfree_skb(skb);
				1791	return 0;
				1792
				1793	discard_and_relse:
				1794	sock_put(sk);
				1795	goto discard_it;
				1796
				1797	do_time_wait:
				1798	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				1799	tcp_tw_put((struct tcp_tw_bucket *) sk);
				1800	goto discard_it;
				1801	}
				1802
				1803	if (skb->len < (th->doff << 2) \|\| tcp_checksum_complete(skb)) {
				1804	TCP_INC_STATS_BH(TCP_MIB_INERRS);
				1805	tcp_tw_put((struct tcp_tw_bucket *) sk);
				1806	goto discard_it;
				1807	}
				1808	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
				1809	skb, th, skb->len)) {
				1810	case TCP_TW_SYN: {
				1811	struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
				1812	ntohs(th->dest),
				1813	tcp_v4_iif(skb));
				1814	if (sk2) {
				1815	tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
				1816	tcp_tw_put((struct tcp_tw_bucket *)sk);
				1817	sk = sk2;
				1818	goto process;
				1819	}
				1820	/* Fall through to ACK */
				1821	}
				1822	case TCP_TW_ACK:
				1823	tcp_v4_timewait_ack(sk, skb);
				1824	break;
				1825	case TCP_TW_RST:
				1826	goto no_tcp_socket;
				1827	case TCP_TW_SUCCESS:;
				1828	}
				1829	goto discard_it;
				1830	}
				1831
				1832	/* With per-bucket locks this operation is not-atomic, so that
				1833	* this version is not worse.
				1834	*/
				1835	static void __tcp_v4_rehash(struct sock *sk)
				1836	{
				1837	sk->sk_prot->unhash(sk);
				1838	sk->sk_prot->hash(sk);
				1839	}
				1840
				1841	static int tcp_v4_reselect_saddr(struct sock *sk)
				1842	{
				1843	struct inet_sock *inet = inet_sk(sk);
				1844	int err;
				1845	struct rtable *rt;
				1846	__u32 old_saddr = inet->saddr;
				1847	__u32 new_saddr;
				1848	__u32 daddr = inet->daddr;
				1849
				1850	if (inet->opt && inet->opt->srr)
				1851	daddr = inet->opt->faddr;
				1852
				1853	/* Query new route. */
				1854	err = ip_route_connect(&rt, daddr, 0,
				1855	RT_CONN_FLAGS(sk),
				1856	sk->sk_bound_dev_if,
				1857	IPPROTO_TCP,
				1858	inet->sport, inet->dport, sk);
				1859	if (err)
				1860	return err;
				1861
				1862	__sk_dst_set(sk, &rt->u.dst);
				1863	tcp_v4_setup_caps(sk, &rt->u.dst);
				1864
				1865	new_saddr = rt->rt_src;
				1866
				1867	if (new_saddr == old_saddr)
				1868	return 0;
				1869
				1870	if (sysctl_ip_dynaddr > 1) {
				1871	printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
				1872	"saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
				1873	NIPQUAD(old_saddr),
				1874	NIPQUAD(new_saddr));
				1875	}
				1876
				1877	inet->saddr = new_saddr;
				1878	inet->rcv_saddr = new_saddr;
				1879
				1880	/* XXX The only one ugly spot where we need to
				1881	* XXX really change the sockets identity after
				1882	* XXX it has entered the hashes. -DaveM
				1883	*
				1884	* Besides that, it does not check for connection
				1885	* uniqueness. Wait for troubles.
				1886	*/
				1887	__tcp_v4_rehash(sk);
				1888	return 0;
				1889	}
				1890
				1891	int tcp_v4_rebuild_header(struct sock *sk)
				1892	{
				1893	struct inet_sock *inet = inet_sk(sk);
				1894	struct rtable rt = (struct rtable )__sk_dst_check(sk, 0);
				1895	u32 daddr;
				1896	int err;
				1897
				1898	/* Route is OK, nothing to do. */
				1899	if (rt)
				1900	return 0;
				1901
				1902	/* Reroute. */
				1903	daddr = inet->daddr;
				1904	if (inet->opt && inet->opt->srr)
				1905	daddr = inet->opt->faddr;
				1906
				1907	{
				1908	struct flowi fl = { .oif = sk->sk_bound_dev_if,
				1909	.nl_u = { .ip4_u =
				1910	{ .daddr = daddr,
				1911	.saddr = inet->saddr,
				1912	.tos = RT_CONN_FLAGS(sk) } },
				1913	.proto = IPPROTO_TCP,
				1914	.uli_u = { .ports =
				1915	{ .sport = inet->sport,
				1916	.dport = inet->dport } } };
				1917
				1918	err = ip_route_output_flow(&rt, &fl, sk, 0);
				1919	}
				1920	if (!err) {
				1921	__sk_dst_set(sk, &rt->u.dst);
				1922	tcp_v4_setup_caps(sk, &rt->u.dst);
				1923	return 0;
				1924	}
				1925
				1926	/* Routing failed... */
				1927	sk->sk_route_caps = 0;
				1928
				1929	if (!sysctl_ip_dynaddr \|\|
				1930	sk->sk_state != TCP_SYN_SENT \|\|
				1931	(sk->sk_userlocks & SOCK_BINDADDR_LOCK) \|\|
				1932	(err = tcp_v4_reselect_saddr(sk)) != 0)
				1933	sk->sk_err_soft = -err;
				1934
				1935	return err;
				1936	}
				1937
				1938	static void v4_addr2sockaddr(struct sock sk, struct sockaddr uaddr)
				1939	{
				1940	struct sockaddr_in sin = (struct sockaddr_in ) uaddr;
				1941	struct inet_sock *inet = inet_sk(sk);
				1942
				1943	sin->sin_family = AF_INET;
				1944	sin->sin_addr.s_addr = inet->daddr;
				1945	sin->sin_port = inet->dport;
				1946	}
				1947
				1948	/* VJ's idea. Save last timestamp seen from this destination
				1949	* and hold it at least for normal timewait interval to use for duplicate
				1950	* segment detection in subsequent connections, before they enter synchronized
				1951	* state.
				1952	*/
				1953
				1954	int tcp_v4_remember_stamp(struct sock *sk)
				1955	{
				1956	struct inet_sock *inet = inet_sk(sk);
				1957	struct tcp_sock *tp = tcp_sk(sk);
				1958	struct rtable rt = (struct rtable )__sk_dst_get(sk);
				1959	struct inet_peer *peer = NULL;
				1960	int release_it = 0;
				1961
				1962	if (!rt \|\| rt->rt_dst != inet->daddr) {
				1963	peer = inet_getpeer(inet->daddr, 1);
				1964	release_it = 1;
				1965	} else {
				1966	if (!rt->peer)
				1967	rt_bind_peer(rt, 1);
				1968	peer = rt->peer;
				1969	}
				1970
				1971	if (peer) {
				1972	if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 \|\|
				1973	(peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
				1974	peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
				1975	peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
				1976	peer->tcp_ts = tp->rx_opt.ts_recent;
				1977	}
				1978	if (release_it)
				1979	inet_putpeer(peer);
				1980	return 1;
				1981	}
				1982
				1983	return 0;
				1984	}
				1985
				1986	int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
				1987	{
				1988	struct inet_peer *peer = NULL;
				1989
				1990	peer = inet_getpeer(tw->tw_daddr, 1);
				1991
				1992	if (peer) {
				1993	if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 \|\|
				1994	(peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
				1995	peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
				1996	peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
				1997	peer->tcp_ts = tw->tw_ts_recent;
				1998	}
				1999	inet_putpeer(peer);
				2000	return 1;
				2001	}
				2002
				2003	return 0;
				2004	}
				2005
				2006	struct tcp_func ipv4_specific = {
				2007	.queue_xmit = ip_queue_xmit,
				2008	.send_check = tcp_v4_send_check,
				2009	.rebuild_header = tcp_v4_rebuild_header,
				2010	.conn_request = tcp_v4_conn_request,
				2011	.syn_recv_sock = tcp_v4_syn_recv_sock,
				2012	.remember_stamp = tcp_v4_remember_stamp,
				2013	.net_header_len = sizeof(struct iphdr),
				2014	.setsockopt = ip_setsockopt,
				2015	.getsockopt = ip_getsockopt,
				2016	.addr2sockaddr = v4_addr2sockaddr,
				2017	.sockaddr_len = sizeof(struct sockaddr_in),
				2018	};
				2019
				2020	/* NOTE: A lot of things set to zero explicitly by call to
				2021	* sk_alloc() so need not be done here.
				2022	*/
				2023	static int tcp_v4_init_sock(struct sock *sk)
				2024	{
				2025	struct tcp_sock *tp = tcp_sk(sk);
				2026
				2027	skb_queue_head_init(&tp->out_of_order_queue);
				2028	tcp_init_xmit_timers(sk);
				2029	tcp_prequeue_init(tp);
				2030
				2031	tp->rto = TCP_TIMEOUT_INIT;
				2032	tp->mdev = TCP_TIMEOUT_INIT;
				2033
				2034	/* So many TCP implementations out there (incorrectly) count the
				2035	* initial SYN frame in their delayed-ACK and congestion control
				2036	* algorithms that we must have the following bandaid to talk
				2037	* efficiently to them. -DaveM
				2038	*/
				2039	tp->snd_cwnd = 2;
				2040
				2041	/* See draft-stevens-tcpca-spec-01 for discussion of the
				2042	* initialization of these values.
				2043	*/
				2044	tp->snd_ssthresh = 0x7fffffff; /* Infinity */
				2045	tp->snd_cwnd_clamp = ~0;
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	2046	tp->mss_cache = 536;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2047
				2048	tp->reordering = sysctl_tcp_reordering;
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2049	tp->ca_ops = &tcp_init_congestion_ops;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2050
				2051	sk->sk_state = TCP_CLOSE;
				2052
				2053	sk->sk_write_space = sk_stream_write_space;
				2054	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
				2055
				2056	tp->af_specific = &ipv4_specific;
				2057
				2058	sk->sk_sndbuf = sysctl_tcp_wmem[1];
				2059	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
				2060
				2061	atomic_inc(&tcp_sockets_allocated);
				2062
				2063	return 0;
				2064	}
				2065
				2066	int tcp_v4_destroy_sock(struct sock *sk)
				2067	{
				2068	struct tcp_sock *tp = tcp_sk(sk);
				2069
				2070	tcp_clear_xmit_timers(sk);
				2071
Stephen Hemminger	317a76f	2005-06-23 12:19:55 -0700	[diff] [blame]	2072	tcp_cleanup_congestion_control(tp);
				2073
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2074	/* Cleanup up the write buffer. */
				2075	sk_stream_writequeue_purge(sk);
				2076
				2077	/* Cleans up our, hopefully empty, out_of_order_queue. */
				2078	__skb_queue_purge(&tp->out_of_order_queue);
				2079
				2080	/* Clean prequeue, it must be empty really */
				2081	__skb_queue_purge(&tp->ucopy.prequeue);
				2082
				2083	/* Clean up a referenced TCP bind bucket. */
				2084	if (tp->bind_hash)
				2085	tcp_put_port(sk);
				2086
				2087	/*
				2088	* If sendmsg cached page exists, toss it.
				2089	*/
				2090	if (sk->sk_sndmsg_page) {
				2091	__free_page(sk->sk_sndmsg_page);
				2092	sk->sk_sndmsg_page = NULL;
				2093	}
				2094
				2095	atomic_dec(&tcp_sockets_allocated);
				2096
				2097	return 0;
				2098	}
				2099
				2100	EXPORT_SYMBOL(tcp_v4_destroy_sock);
				2101
				2102	#ifdef CONFIG_PROC_FS
				2103	/* Proc filesystem TCP sock list dumping. */
				2104
				2105	static inline struct tcp_tw_bucket tw_head(struct hlist_head head)
				2106	{
				2107	return hlist_empty(head) ? NULL :
				2108	list_entry(head->first, struct tcp_tw_bucket, tw_node);
				2109	}
				2110
				2111	static inline struct tcp_tw_bucket tw_next(struct tcp_tw_bucket tw)
				2112	{
				2113	return tw->tw_node.next ?
				2114	hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
				2115	}
				2116
				2117	static void listening_get_next(struct seq_file seq, void *cur)
				2118	{
				2119	struct tcp_sock *tp;
				2120	struct hlist_node *node;
				2121	struct sock *sk = cur;
				2122	struct tcp_iter_state* st = seq->private;
				2123
				2124	if (!sk) {
				2125	st->bucket = 0;
				2126	sk = sk_head(&tcp_listening_hash[0]);
				2127	goto get_sk;
				2128	}
				2129
				2130	++st->num;
				2131
				2132	if (st->state == TCP_SEQ_STATE_OPENREQ) {
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	2133	struct request_sock *req = cur;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2134
				2135	tp = tcp_sk(st->syn_wait_sk);
				2136	req = req->dl_next;
				2137	while (1) {
				2138	while (req) {
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	2139	if (req->rsk_ops->family == st->family) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2140	cur = req;
				2141	goto out;
				2142	}
				2143	req = req->dl_next;
				2144	}
				2145	if (++st->sbucket >= TCP_SYNQ_HSIZE)
				2146	break;
				2147	get_req:
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	2148	req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2149	}
				2150	sk = sk_next(st->syn_wait_sk);
				2151	st->state = TCP_SEQ_STATE_LISTENING;
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	2152	read_unlock_bh(&tp->accept_queue.syn_wait_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2153	} else {
				2154	tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	2155	read_lock_bh(&tp->accept_queue.syn_wait_lock);
				2156	if (reqsk_queue_len(&tp->accept_queue))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2157	goto start_req;
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	2158	read_unlock_bh(&tp->accept_queue.syn_wait_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2159	sk = sk_next(sk);
				2160	}
				2161	get_sk:
				2162	sk_for_each_from(sk, node) {
				2163	if (sk->sk_family == st->family) {
				2164	cur = sk;
				2165	goto out;
				2166	}
				2167	tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	2168	read_lock_bh(&tp->accept_queue.syn_wait_lock);
				2169	if (reqsk_queue_len(&tp->accept_queue)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2170	start_req:
				2171	st->uid = sock_i_uid(sk);
				2172	st->syn_wait_sk = sk;
				2173	st->state = TCP_SEQ_STATE_OPENREQ;
				2174	st->sbucket = 0;
				2175	goto get_req;
				2176	}
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	2177	read_unlock_bh(&tp->accept_queue.syn_wait_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2178	}
				2179	if (++st->bucket < TCP_LHTABLE_SIZE) {
				2180	sk = sk_head(&tcp_listening_hash[st->bucket]);
				2181	goto get_sk;
				2182	}
				2183	cur = NULL;
				2184	out:
				2185	return cur;
				2186	}
				2187
				2188	static void listening_get_idx(struct seq_file seq, loff_t *pos)
				2189	{
				2190	void *rc = listening_get_next(seq, NULL);
				2191
				2192	while (rc && *pos) {
				2193	rc = listening_get_next(seq, rc);
				2194	--*pos;
				2195	}
				2196	return rc;
				2197	}
				2198
				2199	static void established_get_first(struct seq_file seq)
				2200	{
				2201	struct tcp_iter_state* st = seq->private;
				2202	void *rc = NULL;
				2203
				2204	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
				2205	struct sock *sk;
				2206	struct hlist_node *node;
				2207	struct tcp_tw_bucket *tw;
				2208
				2209	/* We can reschedule _before_ having picked the target: */
				2210	cond_resched_softirq();
				2211
				2212	read_lock(&tcp_ehash[st->bucket].lock);
				2213	sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
				2214	if (sk->sk_family != st->family) {
				2215	continue;
				2216	}
				2217	rc = sk;
				2218	goto out;
				2219	}
				2220	st->state = TCP_SEQ_STATE_TIME_WAIT;
				2221	tw_for_each(tw, node,
				2222	&tcp_ehash[st->bucket + tcp_ehash_size].chain) {
				2223	if (tw->tw_family != st->family) {
				2224	continue;
				2225	}
				2226	rc = tw;
				2227	goto out;
				2228	}
				2229	read_unlock(&tcp_ehash[st->bucket].lock);
				2230	st->state = TCP_SEQ_STATE_ESTABLISHED;
				2231	}
				2232	out:
				2233	return rc;
				2234	}
				2235
				2236	static void established_get_next(struct seq_file seq, void *cur)
				2237	{
				2238	struct sock *sk = cur;
				2239	struct tcp_tw_bucket *tw;
				2240	struct hlist_node *node;
				2241	struct tcp_iter_state* st = seq->private;
				2242
				2243	++st->num;
				2244
				2245	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
				2246	tw = cur;
				2247	tw = tw_next(tw);
				2248	get_tw:
				2249	while (tw && tw->tw_family != st->family) {
				2250	tw = tw_next(tw);
				2251	}
				2252	if (tw) {
				2253	cur = tw;
				2254	goto out;
				2255	}
				2256	read_unlock(&tcp_ehash[st->bucket].lock);
				2257	st->state = TCP_SEQ_STATE_ESTABLISHED;
				2258
				2259	/* We can reschedule between buckets: */
				2260	cond_resched_softirq();
				2261
				2262	if (++st->bucket < tcp_ehash_size) {
				2263	read_lock(&tcp_ehash[st->bucket].lock);
				2264	sk = sk_head(&tcp_ehash[st->bucket].chain);
				2265	} else {
				2266	cur = NULL;
				2267	goto out;
				2268	}
				2269	} else
				2270	sk = sk_next(sk);
				2271
				2272	sk_for_each_from(sk, node) {
				2273	if (sk->sk_family == st->family)
				2274	goto found;
				2275	}
				2276
				2277	st->state = TCP_SEQ_STATE_TIME_WAIT;
				2278	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
				2279	goto get_tw;
				2280	found:
				2281	cur = sk;
				2282	out:
				2283	return cur;
				2284	}
				2285
				2286	static void established_get_idx(struct seq_file seq, loff_t pos)
				2287	{
				2288	void *rc = established_get_first(seq);
				2289
				2290	while (rc && pos) {
				2291	rc = established_get_next(seq, rc);
				2292	--pos;
				2293	}
				2294	return rc;
				2295	}
				2296
				2297	static void tcp_get_idx(struct seq_file seq, loff_t pos)
				2298	{
				2299	void *rc;
				2300	struct tcp_iter_state* st = seq->private;
				2301
				2302	tcp_listen_lock();
				2303	st->state = TCP_SEQ_STATE_LISTENING;
				2304	rc = listening_get_idx(seq, &pos);
				2305
				2306	if (!rc) {
				2307	tcp_listen_unlock();
				2308	local_bh_disable();
				2309	st->state = TCP_SEQ_STATE_ESTABLISHED;
				2310	rc = established_get_idx(seq, pos);
				2311	}
				2312
				2313	return rc;
				2314	}
				2315
				2316	static void tcp_seq_start(struct seq_file seq, loff_t *pos)
				2317	{
				2318	struct tcp_iter_state* st = seq->private;
				2319	st->state = TCP_SEQ_STATE_LISTENING;
				2320	st->num = 0;
				2321	return pos ? tcp_get_idx(seq, pos - 1) : SEQ_START_TOKEN;
				2322	}
				2323
				2324	static void tcp_seq_next(struct seq_file seq, void v, loff_t pos)
				2325	{
				2326	void *rc = NULL;
				2327	struct tcp_iter_state* st;
				2328
				2329	if (v == SEQ_START_TOKEN) {
				2330	rc = tcp_get_idx(seq, 0);
				2331	goto out;
				2332	}
				2333	st = seq->private;
				2334
				2335	switch (st->state) {
				2336	case TCP_SEQ_STATE_OPENREQ:
				2337	case TCP_SEQ_STATE_LISTENING:
				2338	rc = listening_get_next(seq, v);
				2339	if (!rc) {
				2340	tcp_listen_unlock();
				2341	local_bh_disable();
				2342	st->state = TCP_SEQ_STATE_ESTABLISHED;
				2343	rc = established_get_first(seq);
				2344	}
				2345	break;
				2346	case TCP_SEQ_STATE_ESTABLISHED:
				2347	case TCP_SEQ_STATE_TIME_WAIT:
				2348	rc = established_get_next(seq, v);
				2349	break;
				2350	}
				2351	out:
				2352	++*pos;
				2353	return rc;
				2354	}
				2355
				2356	static void tcp_seq_stop(struct seq_file seq, void v)
				2357	{
				2358	struct tcp_iter_state* st = seq->private;
				2359
				2360	switch (st->state) {
				2361	case TCP_SEQ_STATE_OPENREQ:
				2362	if (v) {
				2363	struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	2364	read_unlock_bh(&tp->accept_queue.syn_wait_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2365	}
				2366	case TCP_SEQ_STATE_LISTENING:
				2367	if (v != SEQ_START_TOKEN)
				2368	tcp_listen_unlock();
				2369	break;
				2370	case TCP_SEQ_STATE_TIME_WAIT:
				2371	case TCP_SEQ_STATE_ESTABLISHED:
				2372	if (v)
				2373	read_unlock(&tcp_ehash[st->bucket].lock);
				2374	local_bh_enable();
				2375	break;
				2376	}
				2377	}
				2378
				2379	static int tcp_seq_open(struct inode inode, struct file file)
				2380	{
				2381	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
				2382	struct seq_file *seq;
				2383	struct tcp_iter_state *s;
				2384	int rc;
				2385
				2386	if (unlikely(afinfo == NULL))
				2387	return -EINVAL;
				2388
				2389	s = kmalloc(sizeof(*s), GFP_KERNEL);
				2390	if (!s)
				2391	return -ENOMEM;
				2392	memset(s, 0, sizeof(*s));
				2393	s->family = afinfo->family;
				2394	s->seq_ops.start = tcp_seq_start;
				2395	s->seq_ops.next = tcp_seq_next;
				2396	s->seq_ops.show = afinfo->seq_show;
				2397	s->seq_ops.stop = tcp_seq_stop;
				2398
				2399	rc = seq_open(file, &s->seq_ops);
				2400	if (rc)
				2401	goto out_kfree;
				2402	seq = file->private_data;
				2403	seq->private = s;
				2404	out:
				2405	return rc;
				2406	out_kfree:
				2407	kfree(s);
				2408	goto out;
				2409	}
				2410
				2411	int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
				2412	{
				2413	int rc = 0;
				2414	struct proc_dir_entry *p;
				2415
				2416	if (!afinfo)
				2417	return -EINVAL;
				2418	afinfo->seq_fops->owner = afinfo->owner;
				2419	afinfo->seq_fops->open = tcp_seq_open;
				2420	afinfo->seq_fops->read = seq_read;
				2421	afinfo->seq_fops->llseek = seq_lseek;
				2422	afinfo->seq_fops->release = seq_release_private;
				2423
				2424	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
				2425	if (p)
				2426	p->data = afinfo;
				2427	else
				2428	rc = -ENOMEM;
				2429	return rc;
				2430	}
				2431
				2432	void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
				2433	{
				2434	if (!afinfo)
				2435	return;
				2436	proc_net_remove(afinfo->name);
				2437	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
				2438	}
				2439
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	2440	static void get_openreq4(struct sock sk, struct request_sock req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2441	char *tmpbuf, int i, int uid)
				2442	{
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	2443	const struct inet_request_sock *ireq = inet_rsk(req);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2444	int ttd = req->expires - jiffies;
				2445
				2446	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
				2447	" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
				2448	i,
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	2449	ireq->loc_addr,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2450	ntohs(inet_sk(sk)->sport),
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	2451	ireq->rmt_addr,
				2452	ntohs(ireq->rmt_port),
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2453	TCP_SYN_RECV,
				2454	0, 0, /* could print option size, but that is af dependent. */
				2455	1, /* timers active (only the expire timer) */
				2456	jiffies_to_clock_t(ttd),
				2457	req->retrans,
				2458	uid,
				2459	0, /* non standard timer */
				2460	0, /* open_requests have no inode */
				2461	atomic_read(&sk->sk_refcnt),
				2462	req);
				2463	}
				2464
				2465	static void get_tcp4_sock(struct sock sp, char tmpbuf, int i)
				2466	{
				2467	int timer_active;
				2468	unsigned long timer_expires;
				2469	struct tcp_sock *tp = tcp_sk(sp);
				2470	struct inet_sock *inet = inet_sk(sp);
				2471	unsigned int dest = inet->daddr;
				2472	unsigned int src = inet->rcv_saddr;
				2473	__u16 destp = ntohs(inet->dport);
				2474	__u16 srcp = ntohs(inet->sport);
				2475
				2476	if (tp->pending == TCP_TIME_RETRANS) {
				2477	timer_active = 1;
				2478	timer_expires = tp->timeout;
				2479	} else if (tp->pending == TCP_TIME_PROBE0) {
				2480	timer_active = 4;
				2481	timer_expires = tp->timeout;
				2482	} else if (timer_pending(&sp->sk_timer)) {
				2483	timer_active = 2;
				2484	timer_expires = sp->sk_timer.expires;
				2485	} else {
				2486	timer_active = 0;
				2487	timer_expires = jiffies;
				2488	}
				2489
				2490	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
				2491	"%08X %5d %8d %lu %d %p %u %u %u %u %d",
				2492	i, src, srcp, dest, destp, sp->sk_state,
				2493	tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
				2494	timer_active,
				2495	jiffies_to_clock_t(timer_expires - jiffies),
				2496	tp->retransmits,
				2497	sock_i_uid(sp),
				2498	tp->probes_out,
				2499	sock_i_ino(sp),
				2500	atomic_read(&sp->sk_refcnt), sp,
				2501	tp->rto, tp->ack.ato, (tp->ack.quick << 1) \| tp->ack.pingpong,
				2502	tp->snd_cwnd,
				2503	tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
				2504	}
				2505
				2506	static void get_timewait4_sock(struct tcp_tw_bucket tw, char tmpbuf, int i)
				2507	{
				2508	unsigned int dest, src;
				2509	__u16 destp, srcp;
				2510	int ttd = tw->tw_ttd - jiffies;
				2511
				2512	if (ttd < 0)
				2513	ttd = 0;
				2514
				2515	dest = tw->tw_daddr;
				2516	src = tw->tw_rcv_saddr;
				2517	destp = ntohs(tw->tw_dport);
				2518	srcp = ntohs(tw->tw_sport);
				2519
				2520	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
				2521	" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
				2522	i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
				2523	3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
				2524	atomic_read(&tw->tw_refcnt), tw);
				2525	}
				2526
				2527	#define TMPSZ 150
				2528
				2529	static int tcp4_seq_show(struct seq_file seq, void v)
				2530	{
				2531	struct tcp_iter_state* st;
				2532	char tmpbuf[TMPSZ + 1];
				2533
				2534	if (v == SEQ_START_TOKEN) {
				2535	seq_printf(seq, "%-*s\n", TMPSZ - 1,
				2536	" sl local_address rem_address st tx_queue "
				2537	"rx_queue tr tm->when retrnsmt uid timeout "
				2538	"inode");
				2539	goto out;
				2540	}
				2541	st = seq->private;
				2542
				2543	switch (st->state) {
				2544	case TCP_SEQ_STATE_LISTENING:
				2545	case TCP_SEQ_STATE_ESTABLISHED:
				2546	get_tcp4_sock(v, tmpbuf, st->num);
				2547	break;
				2548	case TCP_SEQ_STATE_OPENREQ:
				2549	get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
				2550	break;
				2551	case TCP_SEQ_STATE_TIME_WAIT:
				2552	get_timewait4_sock(v, tmpbuf, st->num);
				2553	break;
				2554	}
				2555	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
				2556	out:
				2557	return 0;
				2558	}
				2559
				2560	static struct file_operations tcp4_seq_fops;
				2561	static struct tcp_seq_afinfo tcp4_seq_afinfo = {
				2562	.owner = THIS_MODULE,
				2563	.name = "tcp",
				2564	.family = AF_INET,
				2565	.seq_show = tcp4_seq_show,
				2566	.seq_fops = &tcp4_seq_fops,
				2567	};
				2568
				2569	int __init tcp4_proc_init(void)
				2570	{
				2571	return tcp_proc_register(&tcp4_seq_afinfo);
				2572	}
				2573
				2574	void tcp4_proc_exit(void)
				2575	{
				2576	tcp_proc_unregister(&tcp4_seq_afinfo);
				2577	}
				2578	#endif /* CONFIG_PROC_FS */
				2579
				2580	struct proto tcp_prot = {
				2581	.name = "TCP",
				2582	.owner = THIS_MODULE,
				2583	.close = tcp_close,
				2584	.connect = tcp_v4_connect,
				2585	.disconnect = tcp_disconnect,
				2586	.accept = tcp_accept,
				2587	.ioctl = tcp_ioctl,
				2588	.init = tcp_v4_init_sock,
				2589	.destroy = tcp_v4_destroy_sock,
				2590	.shutdown = tcp_shutdown,
				2591	.setsockopt = tcp_setsockopt,
				2592	.getsockopt = tcp_getsockopt,
				2593	.sendmsg = tcp_sendmsg,
				2594	.recvmsg = tcp_recvmsg,
				2595	.backlog_rcv = tcp_v4_do_rcv,
				2596	.hash = tcp_v4_hash,
				2597	.unhash = tcp_unhash,
				2598	.get_port = tcp_v4_get_port,
				2599	.enter_memory_pressure = tcp_enter_memory_pressure,
				2600	.sockets_allocated = &tcp_sockets_allocated,
				2601	.memory_allocated = &tcp_memory_allocated,
				2602	.memory_pressure = &tcp_memory_pressure,
				2603	.sysctl_mem = sysctl_tcp_mem,
				2604	.sysctl_wmem = sysctl_tcp_wmem,
				2605	.sysctl_rmem = sysctl_tcp_rmem,
				2606	.max_header = MAX_TCP_HEADER,
				2607	.obj_size = sizeof(struct tcp_sock),
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	2608	.rsk_prot = &tcp_request_sock_ops,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2609	};
				2610
				2611
				2612
				2613	void __init tcp_v4_init(struct net_proto_family *ops)
				2614	{
				2615	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
				2616	if (err < 0)
				2617	panic("Failed to create the TCP control socket.\n");
				2618	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
				2619	inet_sk(tcp_socket->sk)->uc_ttl = -1;
				2620
				2621	/* Unhash it so that IP input processing does not even
				2622	* see it, we do not wish this socket to see incoming
				2623	* packets.
				2624	*/
				2625	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
				2626	}
				2627
				/* Symbols made available to other kernel code. */

				/* operation tables */
				EXPORT_SYMBOL(ipv4_specific);

				/* bind/listen hash handling */
				EXPORT_SYMBOL(tcp_bind_hash);
				EXPORT_SYMBOL(tcp_bucket_create);
				EXPORT_SYMBOL(tcp_hashinfo);
				EXPORT_SYMBOL(tcp_inherit_port);
				EXPORT_SYMBOL(tcp_listen_wlock);
				EXPORT_SYMBOL(tcp_port_rover);
				EXPORT_SYMBOL(tcp_prot);
				EXPORT_SYMBOL(tcp_put_port);
				EXPORT_SYMBOL(tcp_unhash);

				/* IPv4 connection handling */
				EXPORT_SYMBOL(tcp_v4_conn_request);
				EXPORT_SYMBOL(tcp_v4_connect);
				EXPORT_SYMBOL(tcp_v4_do_rcv);
				EXPORT_SYMBOL(tcp_v4_rebuild_header);
				EXPORT_SYMBOL(tcp_v4_remember_stamp);
				EXPORT_SYMBOL(tcp_v4_send_check);
				EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

				/* /proc registration helpers */
				#ifdef CONFIG_PROC_FS
				EXPORT_SYMBOL(tcp_proc_register);
				EXPORT_SYMBOL(tcp_proc_unregister);
				#endif

				/* sysctl variables */
				EXPORT_SYMBOL(sysctl_local_port_range);
				EXPORT_SYMBOL(sysctl_tcp_low_latency);
				EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
				2653