0001-Implement-autosizing-TCP-socket-buffers.patch

pavalos, 07/26/2009 12:16 AM

Download (20.3 KB)

View differences:

sys/kern/uipc_socket.c
197 197
	ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE];
198 198
	ai.p_ucred = p->p_ucred;
199 199
	ai.fd_rdir = p->p_fd->fd_rdir;
200
	/*
201
	 * Auto-sizing of socket buffers is managed by the protocols and
202
	 * the appropriate flags must be set in the pru_attach function.
203
	 */
200 204
	error = so_pru_attach(so, proto, &ai);
201 205
	if (error) {
202 206
		so->so_state |= SS_NOFDREF;
......
1371 1375
					error = ENOBUFS;
1372 1376
					goto bad;
1373 1377
				}
1378
				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
1379
				    &so->so_rcv)->ssb_flags &= ~SSB_AUTOSIZE;
1374 1380
				break;
1375 1381

  
1376 1382
			/*
sys/kern/uipc_socket2.c
245 245
	so->so_linger = head->so_linger;
246 246
	so->so_state = head->so_state | SS_NOFDREF;
247 247
	so->so_proto = head->so_proto;
248
	so->so_timeo = head->so_timeo;
249 248
	so->so_cred = crhold(head->so_cred);
250 249
	ai.sb_rlimit = NULL;
251 250
	ai.p_ucred = NULL;
......
256 255
		sodealloc(so);
257 256
		return (NULL);
258 257
	}
259

  
258
	so->so_rcv.ssb_lowat = head->so_rcv.ssb_lowat;
259
	so->so_snd.ssb_lowat = head->so_snd.ssb_lowat;
260
	so->so_rcv.ssb_timeo = head->so_rcv.ssb_timeo;
261
	so->so_snd.ssb_timeo = head->so_snd.ssb_timeo;
262
	so->so_rcv.ssb_flags |= head->so_rcv.ssb_flags & SSB_AUTOSIZE;
263
	so->so_snd.ssb_flags |= head->so_snd.ssb_flags & SSB_AUTOSIZE;
260 264
	if (connstatus) {
261 265
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
262 266
		so->so_state |= SS_COMP;
sys/kern/uipc_usrreq.c
1353 1353
	    !(so->so_proto->pr_flags & PR_RIGHTS))
1354 1354
		return(0);
1355 1355
#ifdef notdef
1356
	if (so->so_rcv.sb_flags & SB_LOCK) {
1356
	if (so->so_rcv.ssb_flags & SSB_LOCK) {
1357 1357
		/*
1358 1358
		 * This is problematical; it's not clear
1359 1359
		 * we need to wait for the sockbuf to be
sys/netinet/tcp.h
126 126
#define	TCP_MSS	1460
127 127

  
128 128
/*
129
 * TCP_MINMSS is defined to be 256 which is fine for the smallest
130
 * link MTU (296 bytes, SLIP interface) in the Internet.
131
 * However it is very unlikely to come across such low MTU interfaces
132
 * these days (anno dato 2003).
133
 * Probably it can be set to 512 without ill effects. But we play safe.
134
 * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments.
135
 * Setting this to "0" disables the minmss check.
136
 */
137
#define	TCP_MINMSS 256
138
/*
139
 * TCP_MINMSSOVERLOAD is defined to be 1000 which should cover any type
140
 * of interactive TCP session.
141
 * See tcp_subr.c tcp_minmssoverload SYSCTL declaration and tcp_input.c
142
 * for more comments.
143
 * Setting this to "0" disables the minmssoverload check.
144
 */
145
#define	TCP_MINMSSOVERLOAD 1000
146

  
147
/*
129 148
 * Default maximum segment size for TCP6.
130 149
 * With an IP6 MSS of 1280, this is 1220,
131 150
 * but 1024 is probably more convenient. (xxx kazu in doubt)
sys/netinet/tcp_input.c
212 212
    &tcp_reass_overflows, 0,
213 213
    "Global number of TCP Segment Reassembly Queue Overflows");
214 214

  
215
int tcp_do_autorcvbuf = 1;
216
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
217
    &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
218

  
219
int tcp_autorcvbuf_inc = 16*1024;
220
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
221
    &tcp_autorcvbuf_inc, 0,
222
    "Incrementor step size of automatic receive buffer");
223

  
224
int tcp_autorcvbuf_max = 16*1024*1024;
225
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
226
    &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
227

  
228

  
215 229
static void	 tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t);
216 230
static void	 tcp_pulloutofband(struct socket *,
217 231
		     struct tcphdr *, struct mbuf *, int);
......
1057 1071
	KASSERT(tp->t_state != TCPS_LISTEN, ("tcp_input: TCPS_LISTEN state"));
1058 1072

  
1059 1073
	/*
1074
	 * This is the second part of the MSS DoS prevention code (after
1075
	 * minmss on the sending side) and it deals with too many too small
1076
	 * tcp packets in a too short timeframe (1 second).
1077
	 *
1078
	 * For every full second we count the number of received packets
1079
	 * and bytes. If we get a lot of packets per second for this connection
1080
	 * (tcp_minmssoverload) we take a closer look at it and compute the
1081
	 * average packet size for the past second. If that is less than
1082
	 * tcp_minmss we get too many packets with very small payload which
1083
	 * is not good and burdens our system (and every packet generates
1084
	 * a wakeup to the process connected to our socket). We can reasonably
1085
	 * expect this to be a small packet DoS attack to exhaust our CPU
1086
	 * cycles.
1087
	 *
1088
	 * Care has to be taken for the minimum packet overload value. This
1089
	 * value defines the minimum number of packets per second before we
1090
	 * start to worry. This must not be too low to avoid killing for
1091
	 * example interactive connections with many small packets like
1092
	 * telnet or SSH.
1093
	 *
1094
	 * Setting either tcp_minmssoverload or tcp_minmss to "0" disables
1095
	 * this check.
1096
	 *
1097
	 * Account for packet if payload packet, skip over ACK, etc.
1098
	 */
1099
	if (tcp_minmss && tcp_minmssoverload &&
1100
	    tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
1101
		if (tp->rcv_second > ticks) {
1102
			tp->rcv_pps++;
1103
			tp->rcv_byps += tlen + off;
1104
			if (tp->rcv_pps > tcp_minmssoverload) {
1105
				if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) {
1106
					kprintf("too many small tcp packets from "
1107
					       "%s:%u, av. %lubyte/packet, "
1108
					       "dropping connection\n",
1109
#ifdef INET6
1110
						isipv6 ?
1111
						ip6_sprintf(&inp->inp_inc.inc6_faddr) :
1112
#endif
1113
						inet_ntoa(inp->inp_inc.inc_faddr),
1114
						inp->inp_inc.inc_fport,
1115
						tp->rcv_byps / tp->rcv_pps);
1116
					tp = tcp_drop(tp, ECONNRESET);
1117
					tcpstat.tcps_minmssdrops++;
1118
					goto drop;
1119
				}
1120
			}
1121
		} else {
1122
			tp->rcv_second = ticks + hz;
1123
			tp->rcv_pps = 1;
1124
			tp->rcv_byps = tlen + off;
1125
		}
1126
	}
1127

  
1128
	/*
1060 1129
	 * Segment received on connection.
1061 1130
	 * Reset idle time and keep-alive timer.
1062 1131
	 */
......
1235 1304
		    th->th_ack == tp->snd_una &&
1236 1305
		    LIST_EMPTY(&tp->t_segq) &&
1237 1306
		    tlen <= ssb_space(&so->so_rcv)) {
1307
			int newsize = 0;	/* automatic sockbuf scaling */
1238 1308
			/*
1239 1309
			 * This is a pure, in-sequence data packet
1240 1310
			 * with nothing on the reassembly queue and
......
1245 1315
			tcpstat.tcps_rcvpack++;
1246 1316
			tcpstat.tcps_rcvbyte += tlen;
1247 1317
			ND6_HINT(tp);	/* some progress has been done */
1318
		/*
1319
		 * Automatic sizing of receive socket buffer.  Often the send
1320
		 * buffer size is not optimally adjusted to the actual network
1321
		 * conditions at hand (delay bandwidth product).  Setting the
1322
		 * buffer size too small limits throughput on links with high
1323
		 * bandwidth and high delay (eg. trans-continental/oceanic links).
1324
		 *
1325
		 * On the receive side the socket buffer memory is only rarely
1326
		 * used to any significant extent.  This allows us to be much
1327
		 * more aggressive in scaling the receive socket buffer.  For
1328
		 * the case that the buffer space is actually used to a large
1329
		 * extent and we run out of kernel memory we can simply drop
1330
		 * the new segments; TCP on the sender will just retransmit it
1331
		 * later.  Setting the buffer size too big may only consume too
1332
		 * much kernel memory if the application doesn't read() from
1333
		 * the socket or packet loss or reordering makes use of the
1334
		 * reassembly queue.
1335
		 *
1336
		 * The criteria to step up the receive buffer one notch are:
1337
		 *  1. the number of bytes received during the time it takes
1338
		 *     one timestamp to be reflected back to us (the RTT);
1339
		 *  2. received bytes per RTT is within seven eighths of the
1340
		 *     current socket buffer size;
1341
		 *  3. receive buffer size has not hit maximal automatic size;
1342
		 *
1343
		 * This algorithm does one step per RTT at most and only if
1344
		 * we receive a bulk stream w/o packet losses or reorderings.
1345
		 * Shrinking the buffer during idle times is not necessary as
1346
		 * it doesn't consume any memory when idle.
1347
		 *
1348
		 * TODO: Only step up if the application is actually serving
1349
		 * the buffer to better manage the socket buffer resources.
1350
		 */
1351
			if (tcp_do_autorcvbuf &&
1352
			    to.to_tsecr &&
1353
			    (so->so_rcv.ssb_flags & SSB_AUTOSIZE)) {
1354
				if (to.to_tsecr > tp->rfbuf_ts &&
1355
				    to.to_tsecr - tp->rfbuf_ts < hz) {
1356
					if (tp->rfbuf_cnt >
1357
					    (so->so_rcv.ssb_hiwat / 8 * 7) &&
1358
					    so->so_rcv.ssb_hiwat <
1359
					    tcp_autorcvbuf_max) {
1360
						newsize =
1361
						    min(so->so_rcv.ssb_hiwat +
1362
						    tcp_autorcvbuf_inc,
1363
						    tcp_autorcvbuf_max);
1364
					}
1365
					/* Start over with next RTT. */
1366
					tp->rfbuf_ts = 0;
1367
					tp->rfbuf_cnt = 0;
1368
				} else
1369
					tp->rfbuf_cnt += tlen;	/* add up */
1370
			}
1248 1371
			/*
1249 1372
			 * Add data to socket buffer.
1250 1373
			 */
1251 1374
			if (so->so_state & SS_CANTRCVMORE) {
1252 1375
				m_freem(m);
1253 1376
			} else {
1377
				/*
1378
				 * Set new socket buffer size.
1379
				 * Give up when limit is reached.
1380
				 */
1381
				if (newsize)
1382
					if (!ssb_reserve(&so->so_rcv, newsize,
1383
					    so, NULL))
1384
						so->so_rcv.ssb_flags &= ~SSB_AUTOSIZE;
1254 1385
				m_adj(m, drop_hdrlen); /* delayed header drop */
1255 1386
				ssb_appendstream(&so->so_rcv, m);
1256 1387
			}
......
1309 1440
		recvwin = 0;
1310 1441
	tp->rcv_wnd = imax(recvwin, (int)(tp->rcv_adv - tp->rcv_nxt));
1311 1442

  
1443
	/* Reset receive buffer auto scaling when not in bulk receive mode. */
1444
	tp->rfbuf_ts = 0;
1445
	tp->rfbuf_cnt = 0;
1446

  
1312 1447
	switch (tp->t_state) {
1313 1448
	/*
1314 1449
	 * If the state is SYN_RECEIVED:
......
2943 3078
	 * Offer == 0 means that there was no MSS on the SYN segment,
2944 3079
	 * in this case we use tcp_mssdflt.
2945 3080
	 */
2946
	if (offer == 0)
3081
	if (offer == 0) {
2947 3082
		offer = (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
2948
	else
3083
	} else {
3084
		/*
3085
		 * Prevent DoS attack with too small MSS. Round up
3086
		 * to at least minmss.
3087
		 */
3088
		offer = max(offer, tcp_minmss);
2949 3089
		/*
2950 3090
		 * Sanity check: make sure that maxopd will be large
2951 3091
		 * enough to allow some data on segments even if the
......
2953 3093
		 * funny things may happen in tcp_output.
2954 3094
		 */
2955 3095
		offer = max(offer, 64);
3096
	}
2956 3097
	taop->tao_mssopt = offer;
2957 3098

  
2958 3099
	/*
sys/netinet/tcp_output.c
128 128
SYSCTL_INT(_net_inet_tcp, OID_AUTO, avoid_pure_win_update, CTLFLAG_RW,
129 129
	&avoid_pure_win_update, 1, "Avoid pure window updates when possible");
130 130

  
131
int tcp_do_autosndbuf = 1;
132
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
133
    &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
134

  
135
int tcp_autosndbuf_inc = 8*1024;
136
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
137
    &tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
138

  
139
int tcp_autosndbuf_max = 16*1024*1024;
140
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
141
    &tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
142

  
131 143
/*
132 144
 * Tcp output routine: figure out what should be sent and send it.
133 145
 */
......
315 327
		}
316 328
	}
317 329

  
330
	KASSERT(len >= 0, ("%s: len < 0", __func__));
318 331
	/*
319
	 * len will be >= 0 after this point.  Truncate to the maximum
320
	 * segment length and ensure that FIN is removed if the length
321
	 * no longer contains the last data byte.
332
	 * Automatic sizing of send socket buffer.  Often the send buffer
333
	 * size is not optimally adjusted to the actual network conditions
334
	 * at hand (delay bandwidth product).  Setting the buffer size too
335
	 * small limits throughput on links with high bandwidth and high
336
	 * delay (eg. trans-continental/oceanic links).  Setting the
337
	 * buffer size too big consumes too much real kernel memory,
338
	 * especially with many connections on busy servers.
339
	 *
340
	 * The criteria to step up the send buffer one notch are:
341
	 *  1. receive window of remote host is larger than send buffer
342
	 *     (with a fudge factor of 5/4th);
343
	 *  2. send buffer is filled to 7/8th with data (so we actually
344
	 *     have data to make use of it);
345
	 *  3. send buffer fill has not hit maximal automatic size;
346
	 *  4. our send window (slow start and congestion controlled) is
347
	 *     larger than sent but unacknowledged data in send buffer.
348
	 *
349
	 * The remote host receive window scaling factor may limit the
350
	 * growing of the send buffer before it reaches its allowed
351
	 * maximum.
352
	 *
353
	 * It scales directly with slow start or congestion window
354
	 * and does at most one step per received ACK.  This fast
355
	 * scaling has the drawback of growing the send buffer beyond
356
	 * what is strictly necessary to make full use of a given
357
	 * delay*bandwidth product.  However testing has shown this not
358
	 * to be much of a problem.  At worst we are trading wasting
359
	 * of available bandwidth (the non-use of it) for wasting some
360
	 * socket buffer memory.
361
	 *
362
	 * TODO: Shrink send buffer during idle periods together
363
	 * with congestion window.  Requires another timer.  Has to
364
	 * wait for upcoming tcp timer rewrite.
365
	 */
366
	if (tcp_do_autosndbuf && so->so_snd.ssb_flags & SSB_AUTOSIZE) {
367
		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.ssb_hiwat &&
368
		    so->so_snd.ssb_cc >= (so->so_snd.ssb_hiwat / 8 * 7) &&
369
		    so->so_snd.ssb_cc < tcp_autosndbuf_max &&
370
		    sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) {
371
			if (!ssb_reserve(&so->so_snd,
372
			    min(so->so_snd.ssb_hiwat + tcp_autosndbuf_inc,
373
			     tcp_autosndbuf_max), so, NULL))
374
				so->so_snd.ssb_flags &= ~SSB_AUTOSIZE;
375
		}
376
	}
377

  
378
	/*
379
	 * Truncate to the maximum segment length and ensure that FIN is
380
	 * removed if the length no longer contains the last data byte.
322 381
	 */
323 382
	if (len > tp->t_maxseg) {
324 383
		len = tp->t_maxseg;
......
520 579
		optlen += TCPOLEN_TSTAMP_APPA;
521 580
	}
522 581

  
582
	/* Set receive buffer autosizing timestamp. */
583
	if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE))
584
		tp->rfbuf_ts = ticks;
585

  
523 586
	/*
524 587
	 * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
525 588
	 * options are allowed (!TF_NOOPT) and it's not a RST.
sys/netinet/tcp_subr.c
178 178
    &tcp_v6mssdflt, 0, "Default TCP Maximum Segment Size for IPv6");
179 179
#endif
180 180

  
181
/*
182
 * Minimum MSS we accept and use. This prevents DoS attacks where
183
 * we are forced to a ridiculously low MSS like 20 and send hundreds
184
 * of packets instead of one. The effect scales with the available
185
 * bandwidth and quickly saturates the CPU and network interface
186
 * with packet generation and sending. Set to zero to disable MINMSS
187
 * checking. This setting prevents us from sending too small packets.
188
 */
189
int tcp_minmss = TCP_MINMSS;
190
SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
191
    &tcp_minmss , 0, "Minmum TCP Maximum Segment Size");
192
/*
193
 * Number of TCP segments per second we accept from remote host
194
 * before we start to calculate average segment size. If average
195
 * segment size drops below the minimum TCP MSS we assume a DoS
196
 * attack and reset+drop the connection. Care has to be taken not to
197
 * set this value too small to not kill interactive type connections
198
 * (telnet, SSH) which send many small packets.
199
 */
200
int tcp_minmssoverload = TCP_MINMSSOVERLOAD;
201
SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW,
202
    &tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to"
203
    "be under the MINMSS Size");
204

  
181 205
#if 0
182 206
static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
183 207
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
sys/netinet/tcp_usrreq.c
1239 1239
			break;
1240 1240

  
1241 1241
		case TCP_MAXSEG:
1242
			if (optval > 0 && optval <= tp->t_maxseg)
1242
			if (optval > 0 && optval <= tp->t_maxseg &&
1243
			    optval + 40 >= tcp_minmss)
1243 1244
				tp->t_maxseg = optval;
1244 1245
			else
1245 1246
				error = EINVAL;
......
1314 1315
		if (error)
1315 1316
			return (error);
1316 1317
	}
1318
	so->so_rcv.ssb_flags |= SSB_AUTOSIZE;
1319
	so->so_snd.ssb_flags |= SSB_AUTOSIZE;
1317 1320
	cpu = mycpu->gd_cpuid;
1318 1321
	error = in_pcballoc(so, &tcbinfo[cpu]);
1319 1322
	if (error)
sys/netinet/tcp_var.h
264 264
	u_long	snd_bwnd;		/* bandwidth-controlled window */
265 265
	int	t_bw_rtttime;		/* used for bandwidth calculation */
266 266
	tcp_seq	t_bw_rtseq;		/* used for bandwidth calculation */
267

  
268
/* anti DoS counters */
269
	u_long	rcv_second;		/* start of interval second */
270
	u_long	rcv_pps;		/* received packets per second */
271
	u_long	rcv_byps;		/* received bytes per second */
272

  
273
	u_int32_t	rfbuf_ts;	/* recv buffer autoscaling timestamp */
274
	int	rfbuf_cnt;		/* recv buffer autoscaling byte count */
267 275
};
268 276

  
269 277
#define	IN_FASTRECOVERY(tp)	(tp->t_flags & TF_FASTRECOVERY)
......
279 287
	u_long	tcps_connects;		/* connections established */
280 288
	u_long	tcps_drops;		/* connections dropped */
281 289
	u_long	tcps_conndrops;		/* embryonic connections dropped */
290
	u_long	tcps_minmssdrops;	/* average minmss too low drops */
282 291
	u_long	tcps_closed;		/* conn. closed (includes drops) */
283 292
	u_long	tcps_segstimed;		/* segs where we tried to get rtt */
284 293
	u_long	tcps_rttupdated;	/* times we succeeded */
......
564 573
extern	struct tcpcbackqhead tcpcbackq[];
565 574

  
566 575
extern	int tcp_mssdflt;	/* XXX */
576
extern	int tcp_minmss;
577
extern	int tcp_minmssoverload;
567 578
extern	int tcp_delack_enabled;
568 579
extern	int path_mtu_discovery;
569 580

  
sys/sys/socketvar.h
84 84
#define SSB_KNOTE	0x100		/* kernel note attached */
85 85
#define SSB_MEVENT	0x200		/* need message event notification */
86 86
#define SSB_STOP	0x400		/* backpressure indicator */
87
#define	SSB_AUTOSIZE	0x800		/* automatically size socket buffer */
87 88

  
88 89
/*
89 90
 * Per-socket kernel structure.  Contains universal send and receive queues,
90
-