Fri Jul 24 07:44:35 2015 UTC
Pull up following revision(s) (requested by matt in ticket #1973):
	sys/netinet/tcp_output.c: revision 1.184
	sys/netinet/tcp_input.c: revision 1.343

If we are sending a window probe and there's unacked data in the
socket, make sure at least the persist timer is running.
Make sure that snd_win doesn't go negative.


(martin)
diff -r1.291.4.5 -r1.291.4.5.6.1 src/sys/netinet/tcp_input.c
diff -r1.167.10.1 -r1.167.10.1.2.1 src/sys/netinet/tcp_output.c

cvs diff -r1.291.4.5 -r1.291.4.5.6.1 src/sys/netinet/tcp_input.c

--- src/sys/netinet/tcp_input.c 2010/06/11 23:36:07 1.291.4.5
+++ src/sys/netinet/tcp_input.c 2015/07/24 07:44:35 1.291.4.5.6.1
@@ -1,1147 +1,1147 @@ @@ -1,1147 +1,1147 @@
1/* $NetBSD: tcp_input.c,v 1.291.4.5 2010/06/11 23:36:07 riz Exp $ */ 1/* $NetBSD: tcp_input.c,v 1.291.4.5.6.1 2015/07/24 07:44:35 martin Exp $ */
2 2
3/* 3/*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors 15 * 3. Neither the name of the project nor the names of its contributors
16 * may be used to endorse or promote products derived from this software 16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission. 17 * without specific prior written permission.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE. 29 * SUCH DAMAGE.
30 */ 30 */
31 31
32/* 32/*
33 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
34 * 34 *
35 * NRL grants permission for redistribution and use in source and binary 35 * NRL grants permission for redistribution and use in source and binary
36 * forms, with or without modification, of the software and documentation 36 * forms, with or without modification, of the software and documentation
37 * created at NRL provided that the following conditions are met: 37 * created at NRL provided that the following conditions are met:
38 * 38 *
39 * 1. Redistributions of source code must retain the above copyright 39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer. 40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright 41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the 42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution. 43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software 44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgements: 45 * must display the following acknowledgements:
46 * This product includes software developed by the University of 46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors. 47 * California, Berkeley and its contributors.
48 * This product includes software developed at the Information 48 * This product includes software developed at the Information
49 * Technology Division, US Naval Research Laboratory. 49 * Technology Division, US Naval Research Laboratory.
50 * 4. Neither the name of the NRL nor the names of its contributors 50 * 4. Neither the name of the NRL nor the names of its contributors
51 * may be used to endorse or promote products derived from this software 51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission. 52 * without specific prior written permission.
53 * 53 *
54 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
55 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
56 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
57 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
58 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
59 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
61 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
62 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
63 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
64 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65 * 65 *
66 * The views and conclusions contained in the software and documentation 66 * The views and conclusions contained in the software and documentation
67 * are those of the authors and should not be interpreted as representing 67 * are those of the authors and should not be interpreted as representing
68 * official policies, either expressed or implied, of the US Naval 68 * official policies, either expressed or implied, of the US Naval
69 * Research Laboratory (NRL). 69 * Research Laboratory (NRL).
70 */ 70 */
71 71
72/*- 72/*-
73 * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc. 73 * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
74 * All rights reserved. 74 * All rights reserved.
75 * 75 *
76 * This code is derived from software contributed to The NetBSD Foundation 76 * This code is derived from software contributed to The NetBSD Foundation
77 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 77 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
78 * Facility, NASA Ames Research Center. 78 * Facility, NASA Ames Research Center.
79 * This code is derived from software contributed to The NetBSD Foundation 79 * This code is derived from software contributed to The NetBSD Foundation
80 * by Charles M. Hannum. 80 * by Charles M. Hannum.
81 * This code is derived from software contributed to The NetBSD Foundation 81 * This code is derived from software contributed to The NetBSD Foundation
82 * by Rui Paulo. 82 * by Rui Paulo.
83 * 83 *
84 * Redistribution and use in source and binary forms, with or without 84 * Redistribution and use in source and binary forms, with or without
85 * modification, are permitted provided that the following conditions 85 * modification, are permitted provided that the following conditions
86 * are met: 86 * are met:
87 * 1. Redistributions of source code must retain the above copyright 87 * 1. Redistributions of source code must retain the above copyright
88 * notice, this list of conditions and the following disclaimer. 88 * notice, this list of conditions and the following disclaimer.
89 * 2. Redistributions in binary form must reproduce the above copyright 89 * 2. Redistributions in binary form must reproduce the above copyright
90 * notice, this list of conditions and the following disclaimer in the 90 * notice, this list of conditions and the following disclaimer in the
91 * documentation and/or other materials provided with the distribution. 91 * documentation and/or other materials provided with the distribution.
92 * 92 *
93 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 93 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
94 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 94 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
95 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 95 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
96 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 96 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
97 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 97 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
98 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 98 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
99 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 99 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
100 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 100 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
101 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 101 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
103 * POSSIBILITY OF SUCH DAMAGE. 103 * POSSIBILITY OF SUCH DAMAGE.
104 */ 104 */
105 105
106/* 106/*
107 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 107 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
108 * The Regents of the University of California. All rights reserved. 108 * The Regents of the University of California. All rights reserved.
109 * 109 *
110 * Redistribution and use in source and binary forms, with or without 110 * Redistribution and use in source and binary forms, with or without
111 * modification, are permitted provided that the following conditions 111 * modification, are permitted provided that the following conditions
112 * are met: 112 * are met:
113 * 1. Redistributions of source code must retain the above copyright 113 * 1. Redistributions of source code must retain the above copyright
114 * notice, this list of conditions and the following disclaimer. 114 * notice, this list of conditions and the following disclaimer.
115 * 2. Redistributions in binary form must reproduce the above copyright 115 * 2. Redistributions in binary form must reproduce the above copyright
116 * notice, this list of conditions and the following disclaimer in the 116 * notice, this list of conditions and the following disclaimer in the
117 * documentation and/or other materials provided with the distribution. 117 * documentation and/or other materials provided with the distribution.
118 * 3. Neither the name of the University nor the names of its contributors 118 * 3. Neither the name of the University nor the names of its contributors
119 * may be used to endorse or promote products derived from this software 119 * may be used to endorse or promote products derived from this software
120 * without specific prior written permission. 120 * without specific prior written permission.
121 * 121 *
122 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 122 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
123 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 123 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
124 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 124 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
125 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 125 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
126 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 126 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
127 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 127 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
128 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 128 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
129 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 129 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
130 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 130 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
131 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 131 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
132 * SUCH DAMAGE. 132 * SUCH DAMAGE.
133 * 133 *
134 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 134 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
135 */ 135 */
136 136
137/* 137/*
138 * TODO list for SYN cache stuff: 138 * TODO list for SYN cache stuff:
139 * 139 *
140 * Find room for a "state" field, which is needed to keep a 140 * Find room for a "state" field, which is needed to keep a
141 * compressed state for TIME_WAIT TCBs. It's been noted already 141 * compressed state for TIME_WAIT TCBs. It's been noted already
142 * that this is fairly important for very high-volume web and 142 * that this is fairly important for very high-volume web and
143 * mail servers, which use a large number of short-lived 143 * mail servers, which use a large number of short-lived
144 * connections. 144 * connections.
145 */ 145 */
146 146
147#include <sys/cdefs.h> 147#include <sys/cdefs.h>
148__KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.291.4.5 2010/06/11 23:36:07 riz Exp $"); 148__KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.291.4.5.6.1 2015/07/24 07:44:35 martin Exp $");
149 149
150#include "opt_inet.h" 150#include "opt_inet.h"
151#include "opt_ipsec.h" 151#include "opt_ipsec.h"
152#include "opt_inet_csum.h" 152#include "opt_inet_csum.h"
153#include "opt_tcp_debug.h" 153#include "opt_tcp_debug.h"
154 154
155#include <sys/param.h> 155#include <sys/param.h>
156#include <sys/systm.h> 156#include <sys/systm.h>
157#include <sys/malloc.h> 157#include <sys/malloc.h>
158#include <sys/mbuf.h> 158#include <sys/mbuf.h>
159#include <sys/protosw.h> 159#include <sys/protosw.h>
160#include <sys/socket.h> 160#include <sys/socket.h>
161#include <sys/socketvar.h> 161#include <sys/socketvar.h>
162#include <sys/errno.h> 162#include <sys/errno.h>
163#include <sys/syslog.h> 163#include <sys/syslog.h>
164#include <sys/pool.h> 164#include <sys/pool.h>
165#include <sys/domain.h> 165#include <sys/domain.h>
166#include <sys/kernel.h> 166#include <sys/kernel.h>
167#ifdef TCP_SIGNATURE 167#ifdef TCP_SIGNATURE
168#include <sys/md5.h> 168#include <sys/md5.h>
169#endif 169#endif
170#include <sys/lwp.h> /* for lwp0 */ 170#include <sys/lwp.h> /* for lwp0 */
171 171
172#include <net/if.h> 172#include <net/if.h>
173#include <net/route.h> 173#include <net/route.h>
174#include <net/if_types.h> 174#include <net/if_types.h>
175 175
176#include <netinet/in.h> 176#include <netinet/in.h>
177#include <netinet/in_systm.h> 177#include <netinet/in_systm.h>
178#include <netinet/ip.h> 178#include <netinet/ip.h>
179#include <netinet/in_pcb.h> 179#include <netinet/in_pcb.h>
180#include <netinet/in_var.h> 180#include <netinet/in_var.h>
181#include <netinet/ip_var.h> 181#include <netinet/ip_var.h>
182#include <netinet/in_offload.h> 182#include <netinet/in_offload.h>
183 183
184#ifdef INET6 184#ifdef INET6
185#ifndef INET 185#ifndef INET
186#include <netinet/in.h> 186#include <netinet/in.h>
187#endif 187#endif
188#include <netinet/ip6.h> 188#include <netinet/ip6.h>
189#include <netinet6/ip6_var.h> 189#include <netinet6/ip6_var.h>
190#include <netinet6/in6_pcb.h> 190#include <netinet6/in6_pcb.h>
191#include <netinet6/ip6_var.h> 191#include <netinet6/ip6_var.h>
192#include <netinet6/in6_var.h> 192#include <netinet6/in6_var.h>
193#include <netinet/icmp6.h> 193#include <netinet/icmp6.h>
194#include <netinet6/nd6.h> 194#include <netinet6/nd6.h>
195#ifdef TCP_SIGNATURE 195#ifdef TCP_SIGNATURE
196#include <netinet6/scope6_var.h> 196#include <netinet6/scope6_var.h>
197#endif 197#endif
198#endif 198#endif
199 199
200#ifndef INET6 200#ifndef INET6
201/* always need ip6.h for IP6_EXTHDR_GET */ 201/* always need ip6.h for IP6_EXTHDR_GET */
202#include <netinet/ip6.h> 202#include <netinet/ip6.h>
203#endif 203#endif
204 204
205#include <netinet/tcp.h> 205#include <netinet/tcp.h>
206#include <netinet/tcp_fsm.h> 206#include <netinet/tcp_fsm.h>
207#include <netinet/tcp_seq.h> 207#include <netinet/tcp_seq.h>
208#include <netinet/tcp_timer.h> 208#include <netinet/tcp_timer.h>
209#include <netinet/tcp_var.h> 209#include <netinet/tcp_var.h>
210#include <netinet/tcp_private.h> 210#include <netinet/tcp_private.h>
211#include <netinet/tcpip.h> 211#include <netinet/tcpip.h>
212#include <netinet/tcp_congctl.h> 212#include <netinet/tcp_congctl.h>
213#include <netinet/tcp_debug.h> 213#include <netinet/tcp_debug.h>
214 214
215#include <machine/stdarg.h> 215#include <machine/stdarg.h>
216 216
217#ifdef IPSEC 217#ifdef IPSEC
218#include <netinet6/ipsec.h> 218#include <netinet6/ipsec.h>
219#include <netinet6/ipsec_private.h> 219#include <netinet6/ipsec_private.h>
220#include <netkey/key.h> 220#include <netkey/key.h>
221#endif /*IPSEC*/ 221#endif /*IPSEC*/
222#ifdef INET6 222#ifdef INET6
223#include "faith.h" 223#include "faith.h"
224#if defined(NFAITH) && NFAITH > 0 224#if defined(NFAITH) && NFAITH > 0
225#include <net/if_faith.h> 225#include <net/if_faith.h>
226#endif 226#endif
227#endif /* IPSEC */ 227#endif /* IPSEC */
228 228
229#ifdef FAST_IPSEC 229#ifdef FAST_IPSEC
230#include <netipsec/ipsec.h> 230#include <netipsec/ipsec.h>
231#include <netipsec/ipsec_var.h> 231#include <netipsec/ipsec_var.h>
232#include <netipsec/ipsec_private.h> 232#include <netipsec/ipsec_private.h>
233#include <netipsec/key.h> 233#include <netipsec/key.h>
234#ifdef INET6 234#ifdef INET6
235#include <netipsec/ipsec6.h> 235#include <netipsec/ipsec6.h>
236#endif 236#endif
237#endif /* FAST_IPSEC*/ 237#endif /* FAST_IPSEC*/
238 238
239int tcprexmtthresh = 3; 239int tcprexmtthresh = 3;
240int tcp_log_refused; 240int tcp_log_refused;
241 241
242int tcp_do_autorcvbuf = 0; 242int tcp_do_autorcvbuf = 0;
243int tcp_autorcvbuf_inc = 16 * 1024; 243int tcp_autorcvbuf_inc = 16 * 1024;
244int tcp_autorcvbuf_max = 256 * 1024; 244int tcp_autorcvbuf_max = 256 * 1024;
245int tcp_msl = (TCPTV_MSL / PR_SLOWHZ); 245int tcp_msl = (TCPTV_MSL / PR_SLOWHZ);
246 246
247static int tcp_rst_ppslim_count = 0; 247static int tcp_rst_ppslim_count = 0;
248static struct timeval tcp_rst_ppslim_last; 248static struct timeval tcp_rst_ppslim_last;
249static int tcp_ackdrop_ppslim_count = 0; 249static int tcp_ackdrop_ppslim_count = 0;
250static struct timeval tcp_ackdrop_ppslim_last; 250static struct timeval tcp_ackdrop_ppslim_last;
251 251
252#define TCP_PAWS_IDLE (24U * 24 * 60 * 60 * PR_SLOWHZ) 252#define TCP_PAWS_IDLE (24U * 24 * 60 * 60 * PR_SLOWHZ)
253 253
254/* for modulo comparisons of timestamps */ 254/* for modulo comparisons of timestamps */
255#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) 255#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0)
256#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) 256#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0)
257 257
258/* 258/*
259 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. 259 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
260 */ 260 */
261#ifdef INET6 261#ifdef INET6
262static inline void 262static inline void
263nd6_hint(struct tcpcb *tp) 263nd6_hint(struct tcpcb *tp)
264{ 264{
265 struct rtentry *rt; 265 struct rtentry *rt;
266 266
267 if (tp != NULL && tp->t_in6pcb != NULL && tp->t_family == AF_INET6 && 267 if (tp != NULL && tp->t_in6pcb != NULL && tp->t_family == AF_INET6 &&
268 (rt = rtcache_validate(&tp->t_in6pcb->in6p_route)) != NULL) 268 (rt = rtcache_validate(&tp->t_in6pcb->in6p_route)) != NULL)
269 nd6_nud_hint(rt, NULL, 0); 269 nd6_nud_hint(rt, NULL, 0);
270} 270}
271#else 271#else
272static inline void 272static inline void
273nd6_hint(struct tcpcb *tp) 273nd6_hint(struct tcpcb *tp)
274{ 274{
275} 275}
276#endif 276#endif
277 277
278/* 278/*
279 * Compute ACK transmission behavior. Delay the ACK unless 279 * Compute ACK transmission behavior. Delay the ACK unless
280 * we have already delayed an ACK (must send an ACK every two segments). 280 * we have already delayed an ACK (must send an ACK every two segments).
281 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH 281 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
282 * option is enabled. 282 * option is enabled.
283 */ 283 */
284static void 284static void
285tcp_setup_ack(struct tcpcb *tp, const struct tcphdr *th) 285tcp_setup_ack(struct tcpcb *tp, const struct tcphdr *th)
286{ 286{
287 287
288 if (tp->t_flags & TF_DELACK || 288 if (tp->t_flags & TF_DELACK ||
289 (tcp_ack_on_push && th->th_flags & TH_PUSH)) 289 (tcp_ack_on_push && th->th_flags & TH_PUSH))
290 tp->t_flags |= TF_ACKNOW; 290 tp->t_flags |= TF_ACKNOW;
291 else 291 else
292 TCP_SET_DELACK(tp); 292 TCP_SET_DELACK(tp);
293} 293}
294 294
295static void 295static void
296icmp_check(struct tcpcb *tp, const struct tcphdr *th, int acked) 296icmp_check(struct tcpcb *tp, const struct tcphdr *th, int acked)
297{ 297{
298 298
299 /* 299 /*
300 * If we had a pending ICMP message that refers to data that have 300 * If we had a pending ICMP message that refers to data that have
301 * just been acknowledged, disregard the recorded ICMP message. 301 * just been acknowledged, disregard the recorded ICMP message.
302 */ 302 */
303 if ((tp->t_flags & TF_PMTUD_PEND) && 303 if ((tp->t_flags & TF_PMTUD_PEND) &&
304 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq)) 304 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
305 tp->t_flags &= ~TF_PMTUD_PEND; 305 tp->t_flags &= ~TF_PMTUD_PEND;
306 306
307 /* 307 /*
308 * Keep track of the largest chunk of data 308 * Keep track of the largest chunk of data
309 * acknowledged since last PMTU update 309 * acknowledged since last PMTU update
310 */ 310 */
311 if (tp->t_pmtud_mss_acked < acked) 311 if (tp->t_pmtud_mss_acked < acked)
312 tp->t_pmtud_mss_acked = acked; 312 tp->t_pmtud_mss_acked = acked;
313} 313}
314 314
315/* 315/*
316 * Convert TCP protocol fields to host order for easier processing. 316 * Convert TCP protocol fields to host order for easier processing.
317 */ 317 */
318static void 318static void
319tcp_fields_to_host(struct tcphdr *th) 319tcp_fields_to_host(struct tcphdr *th)
320{ 320{
321 321
322 NTOHL(th->th_seq); 322 NTOHL(th->th_seq);
323 NTOHL(th->th_ack); 323 NTOHL(th->th_ack);
324 NTOHS(th->th_win); 324 NTOHS(th->th_win);
325 NTOHS(th->th_urp); 325 NTOHS(th->th_urp);
326} 326}
327 327
328/* 328/*
329 * ... and reverse the above. 329 * ... and reverse the above.
330 */ 330 */
331static void 331static void
332tcp_fields_to_net(struct tcphdr *th) 332tcp_fields_to_net(struct tcphdr *th)
333{ 333{
334 334
335 HTONL(th->th_seq); 335 HTONL(th->th_seq);
336 HTONL(th->th_ack); 336 HTONL(th->th_ack);
337 HTONS(th->th_win); 337 HTONS(th->th_win);
338 HTONS(th->th_urp); 338 HTONS(th->th_urp);
339} 339}
340 340
341#ifdef TCP_CSUM_COUNTERS 341#ifdef TCP_CSUM_COUNTERS
342#include <sys/device.h> 342#include <sys/device.h>
343 343
344#if defined(INET) 344#if defined(INET)
345extern struct evcnt tcp_hwcsum_ok; 345extern struct evcnt tcp_hwcsum_ok;
346extern struct evcnt tcp_hwcsum_bad; 346extern struct evcnt tcp_hwcsum_bad;
347extern struct evcnt tcp_hwcsum_data; 347extern struct evcnt tcp_hwcsum_data;
348extern struct evcnt tcp_swcsum; 348extern struct evcnt tcp_swcsum;
349#endif /* defined(INET) */ 349#endif /* defined(INET) */
350#if defined(INET6) 350#if defined(INET6)
351extern struct evcnt tcp6_hwcsum_ok; 351extern struct evcnt tcp6_hwcsum_ok;
352extern struct evcnt tcp6_hwcsum_bad; 352extern struct evcnt tcp6_hwcsum_bad;
353extern struct evcnt tcp6_hwcsum_data; 353extern struct evcnt tcp6_hwcsum_data;
354extern struct evcnt tcp6_swcsum; 354extern struct evcnt tcp6_swcsum;
355#endif /* defined(INET6) */ 355#endif /* defined(INET6) */
356 356
357#define TCP_CSUM_COUNTER_INCR(ev) (ev)->ev_count++ 357#define TCP_CSUM_COUNTER_INCR(ev) (ev)->ev_count++
358 358
359#else 359#else
360 360
361#define TCP_CSUM_COUNTER_INCR(ev) /* nothing */ 361#define TCP_CSUM_COUNTER_INCR(ev) /* nothing */
362 362
363#endif /* TCP_CSUM_COUNTERS */ 363#endif /* TCP_CSUM_COUNTERS */
364 364
365#ifdef TCP_REASS_COUNTERS 365#ifdef TCP_REASS_COUNTERS
366#include <sys/device.h> 366#include <sys/device.h>
367 367
368extern struct evcnt tcp_reass_; 368extern struct evcnt tcp_reass_;
369extern struct evcnt tcp_reass_empty; 369extern struct evcnt tcp_reass_empty;
370extern struct evcnt tcp_reass_iteration[8]; 370extern struct evcnt tcp_reass_iteration[8];
371extern struct evcnt tcp_reass_prependfirst; 371extern struct evcnt tcp_reass_prependfirst;
372extern struct evcnt tcp_reass_prepend; 372extern struct evcnt tcp_reass_prepend;
373extern struct evcnt tcp_reass_insert; 373extern struct evcnt tcp_reass_insert;
374extern struct evcnt tcp_reass_inserttail; 374extern struct evcnt tcp_reass_inserttail;
375extern struct evcnt tcp_reass_append; 375extern struct evcnt tcp_reass_append;
376extern struct evcnt tcp_reass_appendtail; 376extern struct evcnt tcp_reass_appendtail;
377extern struct evcnt tcp_reass_overlaptail; 377extern struct evcnt tcp_reass_overlaptail;
378extern struct evcnt tcp_reass_overlapfront; 378extern struct evcnt tcp_reass_overlapfront;
379extern struct evcnt tcp_reass_segdup; 379extern struct evcnt tcp_reass_segdup;
380extern struct evcnt tcp_reass_fragdup; 380extern struct evcnt tcp_reass_fragdup;
381 381
382#define TCP_REASS_COUNTER_INCR(ev) (ev)->ev_count++ 382#define TCP_REASS_COUNTER_INCR(ev) (ev)->ev_count++
383 383
384#else 384#else
385 385
386#define TCP_REASS_COUNTER_INCR(ev) /* nothing */ 386#define TCP_REASS_COUNTER_INCR(ev) /* nothing */
387 387
388#endif /* TCP_REASS_COUNTERS */ 388#endif /* TCP_REASS_COUNTERS */
389 389
390static int tcp_reass(struct tcpcb *, const struct tcphdr *, struct mbuf *, 390static int tcp_reass(struct tcpcb *, const struct tcphdr *, struct mbuf *,
391 int *); 391 int *);
392static int tcp_dooptions(struct tcpcb *, const u_char *, int, 392static int tcp_dooptions(struct tcpcb *, const u_char *, int,
393 struct tcphdr *, struct mbuf *, int, struct tcp_opt_info *); 393 struct tcphdr *, struct mbuf *, int, struct tcp_opt_info *);
394 394
395#ifdef INET 395#ifdef INET
396static void tcp4_log_refused(const struct ip *, const struct tcphdr *); 396static void tcp4_log_refused(const struct ip *, const struct tcphdr *);
397#endif 397#endif
398#ifdef INET6 398#ifdef INET6
399static void tcp6_log_refused(const struct ip6_hdr *, const struct tcphdr *); 399static void tcp6_log_refused(const struct ip6_hdr *, const struct tcphdr *);
400#endif 400#endif
401 401
402#define TRAVERSE(x) while ((x)->m_next) (x) = (x)->m_next 402#define TRAVERSE(x) while ((x)->m_next) (x) = (x)->m_next
403 403
404#if defined(MBUFTRACE) 404#if defined(MBUFTRACE)
405struct mowner tcp_reass_mowner = MOWNER_INIT("tcp", "reass"); 405struct mowner tcp_reass_mowner = MOWNER_INIT("tcp", "reass");
406#endif /* defined(MBUFTRACE) */ 406#endif /* defined(MBUFTRACE) */
407 407
408static POOL_INIT(tcpipqent_pool, sizeof(struct ipqent), 0, 0, 0, "tcpipqepl", 408static POOL_INIT(tcpipqent_pool, sizeof(struct ipqent), 0, 0, 0, "tcpipqepl",
409 NULL, IPL_VM); 409 NULL, IPL_VM);
410 410
411struct ipqent * 411struct ipqent *
412tcpipqent_alloc(void) 412tcpipqent_alloc(void)
413{ 413{
414 struct ipqent *ipqe; 414 struct ipqent *ipqe;
415 int s; 415 int s;
416 416
417 s = splvm(); 417 s = splvm();
418 ipqe = pool_get(&tcpipqent_pool, PR_NOWAIT); 418 ipqe = pool_get(&tcpipqent_pool, PR_NOWAIT);
419 splx(s); 419 splx(s);
420 420
421 return ipqe; 421 return ipqe;
422} 422}
423 423
424void 424void
425tcpipqent_free(struct ipqent *ipqe) 425tcpipqent_free(struct ipqent *ipqe)
426{ 426{
427 int s; 427 int s;
428 428
429 s = splvm(); 429 s = splvm();
430 pool_put(&tcpipqent_pool, ipqe); 430 pool_put(&tcpipqent_pool, ipqe);
431 splx(s); 431 splx(s);
432} 432}
433 433
434static int 434static int
435tcp_reass(struct tcpcb *tp, const struct tcphdr *th, struct mbuf *m, int *tlen) 435tcp_reass(struct tcpcb *tp, const struct tcphdr *th, struct mbuf *m, int *tlen)
436{ 436{
437 struct ipqent *p, *q, *nq, *tiqe = NULL; 437 struct ipqent *p, *q, *nq, *tiqe = NULL;
438 struct socket *so = NULL; 438 struct socket *so = NULL;
439 int pkt_flags; 439 int pkt_flags;
440 tcp_seq pkt_seq; 440 tcp_seq pkt_seq;
441 unsigned pkt_len; 441 unsigned pkt_len;
442 u_long rcvpartdupbyte = 0; 442 u_long rcvpartdupbyte = 0;
443 u_long rcvoobyte; 443 u_long rcvoobyte;
444#ifdef TCP_REASS_COUNTERS 444#ifdef TCP_REASS_COUNTERS
445 u_int count = 0; 445 u_int count = 0;
446#endif 446#endif
447 uint64_t *tcps; 447 uint64_t *tcps;
448 448
449 if (tp->t_inpcb) 449 if (tp->t_inpcb)
450 so = tp->t_inpcb->inp_socket; 450 so = tp->t_inpcb->inp_socket;
451#ifdef INET6 451#ifdef INET6
452 else if (tp->t_in6pcb) 452 else if (tp->t_in6pcb)
453 so = tp->t_in6pcb->in6p_socket; 453 so = tp->t_in6pcb->in6p_socket;
454#endif 454#endif
455 455
456 TCP_REASS_LOCK_CHECK(tp); 456 TCP_REASS_LOCK_CHECK(tp);
457 457
458 /* 458 /*
459 * Call with th==0 after become established to 459 * Call with th==0 after become established to
460 * force pre-ESTABLISHED data up to user socket. 460 * force pre-ESTABLISHED data up to user socket.
461 */ 461 */
462 if (th == 0) 462 if (th == 0)
463 goto present; 463 goto present;
464 464
465 m_claimm(m, &tcp_reass_mowner); 465 m_claimm(m, &tcp_reass_mowner);
466 466
467 rcvoobyte = *tlen; 467 rcvoobyte = *tlen;
468 /* 468 /*
469 * Copy these to local variables because the tcpiphdr 469 * Copy these to local variables because the tcpiphdr
470 * gets munged while we are collapsing mbufs. 470 * gets munged while we are collapsing mbufs.
471 */ 471 */
472 pkt_seq = th->th_seq; 472 pkt_seq = th->th_seq;
473 pkt_len = *tlen; 473 pkt_len = *tlen;
474 pkt_flags = th->th_flags; 474 pkt_flags = th->th_flags;
475 475
476 TCP_REASS_COUNTER_INCR(&tcp_reass_); 476 TCP_REASS_COUNTER_INCR(&tcp_reass_);
477 477
478 if ((p = TAILQ_LAST(&tp->segq, ipqehead)) != NULL) { 478 if ((p = TAILQ_LAST(&tp->segq, ipqehead)) != NULL) {
479 /* 479 /*
480 * When we miss a packet, the vast majority of time we get 480 * When we miss a packet, the vast majority of time we get
481 * packets that follow it in order. So optimize for that. 481 * packets that follow it in order. So optimize for that.
482 */ 482 */
483 if (pkt_seq == p->ipqe_seq + p->ipqe_len) { 483 if (pkt_seq == p->ipqe_seq + p->ipqe_len) {
484 p->ipqe_len += pkt_len; 484 p->ipqe_len += pkt_len;
485 p->ipqe_flags |= pkt_flags; 485 p->ipqe_flags |= pkt_flags;
486 m_cat(p->ipre_mlast, m); 486 m_cat(p->ipre_mlast, m);
487 TRAVERSE(p->ipre_mlast); 487 TRAVERSE(p->ipre_mlast);
488 m = NULL; 488 m = NULL;
489 tiqe = p; 489 tiqe = p;
490 TAILQ_REMOVE(&tp->timeq, p, ipqe_timeq); 490 TAILQ_REMOVE(&tp->timeq, p, ipqe_timeq);
491 TCP_REASS_COUNTER_INCR(&tcp_reass_appendtail); 491 TCP_REASS_COUNTER_INCR(&tcp_reass_appendtail);
492 goto skip_replacement; 492 goto skip_replacement;
493 } 493 }
494 /* 494 /*
495 * While we're here, if the pkt is completely beyond 495 * While we're here, if the pkt is completely beyond
496 * anything we have, just insert it at the tail. 496 * anything we have, just insert it at the tail.
497 */ 497 */
498 if (SEQ_GT(pkt_seq, p->ipqe_seq + p->ipqe_len)) { 498 if (SEQ_GT(pkt_seq, p->ipqe_seq + p->ipqe_len)) {
499 TCP_REASS_COUNTER_INCR(&tcp_reass_inserttail); 499 TCP_REASS_COUNTER_INCR(&tcp_reass_inserttail);
500 goto insert_it; 500 goto insert_it;
501 } 501 }
502 } 502 }
503 503
504 q = TAILQ_FIRST(&tp->segq); 504 q = TAILQ_FIRST(&tp->segq);
505 505
506 if (q != NULL) { 506 if (q != NULL) {
507 /* 507 /*
508 * If this segment immediately precedes the first out-of-order 508 * If this segment immediately precedes the first out-of-order
509 * block, simply slap the segment in front of it and (mostly) 509 * block, simply slap the segment in front of it and (mostly)
510 * skip the complicated logic. 510 * skip the complicated logic.
511 */ 511 */
512 if (pkt_seq + pkt_len == q->ipqe_seq) { 512 if (pkt_seq + pkt_len == q->ipqe_seq) {
513 q->ipqe_seq = pkt_seq; 513 q->ipqe_seq = pkt_seq;
514 q->ipqe_len += pkt_len; 514 q->ipqe_len += pkt_len;
515 q->ipqe_flags |= pkt_flags; 515 q->ipqe_flags |= pkt_flags;
516 m_cat(m, q->ipqe_m); 516 m_cat(m, q->ipqe_m);
517 q->ipqe_m = m; 517 q->ipqe_m = m;
518 q->ipre_mlast = m; /* last mbuf may have changed */ 518 q->ipre_mlast = m; /* last mbuf may have changed */
519 TRAVERSE(q->ipre_mlast); 519 TRAVERSE(q->ipre_mlast);
520 tiqe = q; 520 tiqe = q;
521 TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq); 521 TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
522 TCP_REASS_COUNTER_INCR(&tcp_reass_prependfirst); 522 TCP_REASS_COUNTER_INCR(&tcp_reass_prependfirst);
523 goto skip_replacement; 523 goto skip_replacement;
524 } 524 }
525 } else { 525 } else {
526 TCP_REASS_COUNTER_INCR(&tcp_reass_empty); 526 TCP_REASS_COUNTER_INCR(&tcp_reass_empty);
527 } 527 }
528 528
529 /* 529 /*
530 * Find a segment which begins after this one does. 530 * Find a segment which begins after this one does.
531 */ 531 */
532 for (p = NULL; q != NULL; q = nq) { 532 for (p = NULL; q != NULL; q = nq) {
533 nq = TAILQ_NEXT(q, ipqe_q); 533 nq = TAILQ_NEXT(q, ipqe_q);
534#ifdef TCP_REASS_COUNTERS 534#ifdef TCP_REASS_COUNTERS
535 count++; 535 count++;
536#endif 536#endif
537 /* 537 /*
538 * If the received segment is just right after this 538 * If the received segment is just right after this
539 * fragment, merge the two together and then check 539 * fragment, merge the two together and then check
540 * for further overlaps. 540 * for further overlaps.
541 */ 541 */
542 if (q->ipqe_seq + q->ipqe_len == pkt_seq) { 542 if (q->ipqe_seq + q->ipqe_len == pkt_seq) {
543#ifdef TCPREASS_DEBUG 543#ifdef TCPREASS_DEBUG
544 printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n", 544 printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n",
545 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 545 tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
546 q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len); 546 q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len);
547#endif 547#endif
548 pkt_len += q->ipqe_len; 548 pkt_len += q->ipqe_len;
549 pkt_flags |= q->ipqe_flags; 549 pkt_flags |= q->ipqe_flags;
550 pkt_seq = q->ipqe_seq; 550 pkt_seq = q->ipqe_seq;
551 m_cat(q->ipre_mlast, m); 551 m_cat(q->ipre_mlast, m);
552 TRAVERSE(q->ipre_mlast); 552 TRAVERSE(q->ipre_mlast);
553 m = q->ipqe_m; 553 m = q->ipqe_m;
554 TCP_REASS_COUNTER_INCR(&tcp_reass_append); 554 TCP_REASS_COUNTER_INCR(&tcp_reass_append);
555 goto free_ipqe; 555 goto free_ipqe;
556 } 556 }
557 /* 557 /*
558 * If the received segment is completely past this 558 * If the received segment is completely past this
559 * fragment, we need to go the next fragment. 559 * fragment, we need to go the next fragment.
560 */ 560 */
561 if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 561 if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
562 p = q; 562 p = q;
563 continue; 563 continue;
564 } 564 }
565 /* 565 /*
566 * If the fragment is past the received segment, 566 * If the fragment is past the received segment,
567 * it (or any following) can't be concatenated. 567 * it (or any following) can't be concatenated.
568 */ 568 */
569 if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) { 569 if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) {
570 TCP_REASS_COUNTER_INCR(&tcp_reass_insert); 570 TCP_REASS_COUNTER_INCR(&tcp_reass_insert);
571 break; 571 break;
572 } 572 }
573 573
574 /* 574 /*
575 * We've received all the data in this segment before. 575 * We've received all the data in this segment before.
576 * mark it as a duplicate and return. 576 * mark it as a duplicate and return.
577 */ 577 */
578 if (SEQ_LEQ(q->ipqe_seq, pkt_seq) && 578 if (SEQ_LEQ(q->ipqe_seq, pkt_seq) &&
579 SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 579 SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
580 tcps = TCP_STAT_GETREF(); 580 tcps = TCP_STAT_GETREF();
581 tcps[TCP_STAT_RCVDUPPACK]++; 581 tcps[TCP_STAT_RCVDUPPACK]++;
582 tcps[TCP_STAT_RCVDUPBYTE] += pkt_len; 582 tcps[TCP_STAT_RCVDUPBYTE] += pkt_len;
583 TCP_STAT_PUTREF(); 583 TCP_STAT_PUTREF();
584 tcp_new_dsack(tp, pkt_seq, pkt_len); 584 tcp_new_dsack(tp, pkt_seq, pkt_len);
585 m_freem(m); 585 m_freem(m);
586 if (tiqe != NULL) { 586 if (tiqe != NULL) {
587 tcpipqent_free(tiqe); 587 tcpipqent_free(tiqe);
588 } 588 }
589 TCP_REASS_COUNTER_INCR(&tcp_reass_segdup); 589 TCP_REASS_COUNTER_INCR(&tcp_reass_segdup);
590 return (0); 590 return (0);
591 } 591 }
592 /* 592 /*
593 * Received segment completely overlaps this fragment 593 * Received segment completely overlaps this fragment
594 * so we drop the fragment (this keeps the temporal 594 * so we drop the fragment (this keeps the temporal
595 * ordering of segments correct). 595 * ordering of segments correct).
596 */ 596 */
597 if (SEQ_GEQ(q->ipqe_seq, pkt_seq) && 597 if (SEQ_GEQ(q->ipqe_seq, pkt_seq) &&
598 SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 598 SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
599 rcvpartdupbyte += q->ipqe_len; 599 rcvpartdupbyte += q->ipqe_len;
600 m_freem(q->ipqe_m); 600 m_freem(q->ipqe_m);
601 TCP_REASS_COUNTER_INCR(&tcp_reass_fragdup); 601 TCP_REASS_COUNTER_INCR(&tcp_reass_fragdup);
602 goto free_ipqe; 602 goto free_ipqe;
603 } 603 }
604 /* 604 /*
605 * RX'ed segment extends past the end of the 605 * RX'ed segment extends past the end of the
606 * fragment. Drop the overlapping bytes. Then 606 * fragment. Drop the overlapping bytes. Then
607 * merge the fragment and segment then treat as 607 * merge the fragment and segment then treat as
608 * a longer received packet. 608 * a longer received packet.
609 */ 609 */
610 if (SEQ_LT(q->ipqe_seq, pkt_seq) && 610 if (SEQ_LT(q->ipqe_seq, pkt_seq) &&
611 SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 611 SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
612 int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq; 612 int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq;
613#ifdef TCPREASS_DEBUG 613#ifdef TCPREASS_DEBUG
614 printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n", 614 printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n",
615 tp, overlap, 615 tp, overlap,
616 pkt_seq, pkt_seq + pkt_len, pkt_len); 616 pkt_seq, pkt_seq + pkt_len, pkt_len);
617#endif 617#endif
618 m_adj(m, overlap); 618 m_adj(m, overlap);
619 rcvpartdupbyte += overlap; 619 rcvpartdupbyte += overlap;
620 m_cat(q->ipre_mlast, m); 620 m_cat(q->ipre_mlast, m);
621 TRAVERSE(q->ipre_mlast); 621 TRAVERSE(q->ipre_mlast);
622 m = q->ipqe_m; 622 m = q->ipqe_m;
623 pkt_seq = q->ipqe_seq; 623 pkt_seq = q->ipqe_seq;
624 pkt_len += q->ipqe_len - overlap; 624 pkt_len += q->ipqe_len - overlap;
625 rcvoobyte -= overlap; 625 rcvoobyte -= overlap;
626 TCP_REASS_COUNTER_INCR(&tcp_reass_overlaptail); 626 TCP_REASS_COUNTER_INCR(&tcp_reass_overlaptail);
627 goto free_ipqe; 627 goto free_ipqe;
628 } 628 }
629 /* 629 /*
630 * RX'ed segment extends past the front of the 630 * RX'ed segment extends past the front of the
631 * fragment. Drop the overlapping bytes on the 631 * fragment. Drop the overlapping bytes on the
632 * received packet. The packet will then be 632 * received packet. The packet will then be
633 * contatentated with this fragment a bit later. 633 * contatentated with this fragment a bit later.
634 */ 634 */
635 if (SEQ_GT(q->ipqe_seq, pkt_seq) && 635 if (SEQ_GT(q->ipqe_seq, pkt_seq) &&
636 SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) { 636 SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) {
637 int overlap = pkt_seq + pkt_len - q->ipqe_seq; 637 int overlap = pkt_seq + pkt_len - q->ipqe_seq;
638#ifdef TCPREASS_DEBUG 638#ifdef TCPREASS_DEBUG
639 printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n", 639 printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n",
640 tp, overlap, 640 tp, overlap,
641 pkt_seq, pkt_seq + pkt_len, pkt_len); 641 pkt_seq, pkt_seq + pkt_len, pkt_len);
642#endif 642#endif
643 m_adj(m, -overlap); 643 m_adj(m, -overlap);
644 pkt_len -= overlap; 644 pkt_len -= overlap;
645 rcvpartdupbyte += overlap; 645 rcvpartdupbyte += overlap;
646 TCP_REASS_COUNTER_INCR(&tcp_reass_overlapfront); 646 TCP_REASS_COUNTER_INCR(&tcp_reass_overlapfront);
647 rcvoobyte -= overlap; 647 rcvoobyte -= overlap;
648 } 648 }
649 /* 649 /*
650 * If the received segment immediates precedes this 650 * If the received segment immediates precedes this
651 * fragment then tack the fragment onto this segment 651 * fragment then tack the fragment onto this segment
652 * and reinsert the data. 652 * and reinsert the data.
653 */ 653 */
654 if (q->ipqe_seq == pkt_seq + pkt_len) { 654 if (q->ipqe_seq == pkt_seq + pkt_len) {
655#ifdef TCPREASS_DEBUG 655#ifdef TCPREASS_DEBUG
656 printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n", 656 printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n",
657 tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len, 657 tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len,
658 pkt_seq, pkt_seq + pkt_len, pkt_len); 658 pkt_seq, pkt_seq + pkt_len, pkt_len);
659#endif 659#endif
660 pkt_len += q->ipqe_len; 660 pkt_len += q->ipqe_len;
661 pkt_flags |= q->ipqe_flags; 661 pkt_flags |= q->ipqe_flags;
662 m_cat(m, q->ipqe_m); 662 m_cat(m, q->ipqe_m);
663 TAILQ_REMOVE(&tp->segq, q, ipqe_q); 663 TAILQ_REMOVE(&tp->segq, q, ipqe_q);
664 TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq); 664 TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
665 tp->t_segqlen--; 665 tp->t_segqlen--;
666 KASSERT(tp->t_segqlen >= 0); 666 KASSERT(tp->t_segqlen >= 0);
667 KASSERT(tp->t_segqlen != 0 || 667 KASSERT(tp->t_segqlen != 0 ||
668 (TAILQ_EMPTY(&tp->segq) && 668 (TAILQ_EMPTY(&tp->segq) &&
669 TAILQ_EMPTY(&tp->timeq))); 669 TAILQ_EMPTY(&tp->timeq)));
670 if (tiqe == NULL) { 670 if (tiqe == NULL) {
671 tiqe = q; 671 tiqe = q;
672 } else { 672 } else {
673 tcpipqent_free(q); 673 tcpipqent_free(q);
674 } 674 }
675 TCP_REASS_COUNTER_INCR(&tcp_reass_prepend); 675 TCP_REASS_COUNTER_INCR(&tcp_reass_prepend);
676 break; 676 break;
677 } 677 }
678 /* 678 /*
679 * If the fragment is before the segment, remember it. 679 * If the fragment is before the segment, remember it.
680 * When this loop is terminated, p will contain the 680 * When this loop is terminated, p will contain the
681 * pointer to fragment that is right before the received 681 * pointer to fragment that is right before the received
682 * segment. 682 * segment.
683 */ 683 */
684 if (SEQ_LEQ(q->ipqe_seq, pkt_seq)) 684 if (SEQ_LEQ(q->ipqe_seq, pkt_seq))
685 p = q; 685 p = q;
686 686
687 continue; 687 continue;
688 688
689 /* 689 /*
690 * This is a common operation. It also will allow 690 * This is a common operation. It also will allow
691 * to save doing a malloc/free in most instances. 691 * to save doing a malloc/free in most instances.
692 */ 692 */
693 free_ipqe: 693 free_ipqe:
694 TAILQ_REMOVE(&tp->segq, q, ipqe_q); 694 TAILQ_REMOVE(&tp->segq, q, ipqe_q);
695 TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq); 695 TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
696 tp->t_segqlen--; 696 tp->t_segqlen--;
697 KASSERT(tp->t_segqlen >= 0); 697 KASSERT(tp->t_segqlen >= 0);
698 KASSERT(tp->t_segqlen != 0 || 698 KASSERT(tp->t_segqlen != 0 ||
699 (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq))); 699 (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
700 if (tiqe == NULL) { 700 if (tiqe == NULL) {
701 tiqe = q; 701 tiqe = q;
702 } else { 702 } else {
703 tcpipqent_free(q); 703 tcpipqent_free(q);
704 } 704 }
705 } 705 }
706 706
707#ifdef TCP_REASS_COUNTERS 707#ifdef TCP_REASS_COUNTERS
708 if (count > 7) 708 if (count > 7)
709 TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[0]); 709 TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[0]);
710 else if (count > 0) 710 else if (count > 0)
711 TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[count]); 711 TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[count]);
712#endif 712#endif
713 713
714 insert_it: 714 insert_it:
715 715
716 /* 716 /*
717 * Allocate a new queue entry since the received segment did not 717 * Allocate a new queue entry since the received segment did not
718 * collapse onto any other out-of-order block; thus we are allocating 718 * collapse onto any other out-of-order block; thus we are allocating
719 * a new block. If it had collapsed, tiqe would not be NULL and 719 * a new block. If it had collapsed, tiqe would not be NULL and
720 * we would be reusing it. 720 * we would be reusing it.
721 * XXX If we can't, just drop the packet. XXX 721 * XXX If we can't, just drop the packet. XXX
722 */ 722 */
723 if (tiqe == NULL) { 723 if (tiqe == NULL) {
724 tiqe = tcpipqent_alloc(); 724 tiqe = tcpipqent_alloc();
725 if (tiqe == NULL) { 725 if (tiqe == NULL) {
726 TCP_STATINC(TCP_STAT_RCVMEMDROP); 726 TCP_STATINC(TCP_STAT_RCVMEMDROP);
727 m_freem(m); 727 m_freem(m);
728 return (0); 728 return (0);
729 } 729 }
730 } 730 }
731 731
732 /* 732 /*
733 * Update the counters. 733 * Update the counters.
734 */ 734 */
735 tcps = TCP_STAT_GETREF(); 735 tcps = TCP_STAT_GETREF();
736 tcps[TCP_STAT_RCVOOPACK]++; 736 tcps[TCP_STAT_RCVOOPACK]++;
737 tcps[TCP_STAT_RCVOOBYTE] += rcvoobyte; 737 tcps[TCP_STAT_RCVOOBYTE] += rcvoobyte;
738 if (rcvpartdupbyte) { 738 if (rcvpartdupbyte) {
739 tcps[TCP_STAT_RCVPARTDUPPACK]++; 739 tcps[TCP_STAT_RCVPARTDUPPACK]++;
740 tcps[TCP_STAT_RCVPARTDUPBYTE] += rcvpartdupbyte; 740 tcps[TCP_STAT_RCVPARTDUPBYTE] += rcvpartdupbyte;
741 } 741 }
742 TCP_STAT_PUTREF(); 742 TCP_STAT_PUTREF();
743 743
744 /* 744 /*
745 * Insert the new fragment queue entry into both queues. 745 * Insert the new fragment queue entry into both queues.
746 */ 746 */
747 tiqe->ipqe_m = m; 747 tiqe->ipqe_m = m;
748 tiqe->ipre_mlast = m; 748 tiqe->ipre_mlast = m;
749 tiqe->ipqe_seq = pkt_seq; 749 tiqe->ipqe_seq = pkt_seq;
750 tiqe->ipqe_len = pkt_len; 750 tiqe->ipqe_len = pkt_len;
751 tiqe->ipqe_flags = pkt_flags; 751 tiqe->ipqe_flags = pkt_flags;
752 if (p == NULL) { 752 if (p == NULL) {
753 TAILQ_INSERT_HEAD(&tp->segq, tiqe, ipqe_q); 753 TAILQ_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
754#ifdef TCPREASS_DEBUG 754#ifdef TCPREASS_DEBUG
755 if (tiqe->ipqe_seq != tp->rcv_nxt) 755 if (tiqe->ipqe_seq != tp->rcv_nxt)
756 printf("tcp_reass[%p]: insert %u:%u(%u) at front\n", 756 printf("tcp_reass[%p]: insert %u:%u(%u) at front\n",
757 tp, pkt_seq, pkt_seq + pkt_len, pkt_len); 757 tp, pkt_seq, pkt_seq + pkt_len, pkt_len);
758#endif 758#endif
759 } else { 759 } else {
760 TAILQ_INSERT_AFTER(&tp->segq, p, tiqe, ipqe_q); 760 TAILQ_INSERT_AFTER(&tp->segq, p, tiqe, ipqe_q);
761#ifdef TCPREASS_DEBUG 761#ifdef TCPREASS_DEBUG
762 printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n", 762 printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n",
763 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 763 tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
764 p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len); 764 p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len);
765#endif 765#endif
766 } 766 }
767 tp->t_segqlen++; 767 tp->t_segqlen++;
768 768
769skip_replacement: 769skip_replacement:
770 770
771 TAILQ_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq); 771 TAILQ_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq);
772 772
773present: 773present:
774 /* 774 /*
775 * Present data to user, advancing rcv_nxt through 775 * Present data to user, advancing rcv_nxt through
776 * completed sequence space. 776 * completed sequence space.
777 */ 777 */
778 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 778 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
779 return (0); 779 return (0);
780 q = TAILQ_FIRST(&tp->segq); 780 q = TAILQ_FIRST(&tp->segq);
781 if (q == NULL || q->ipqe_seq != tp->rcv_nxt) 781 if (q == NULL || q->ipqe_seq != tp->rcv_nxt)
782 return (0); 782 return (0);
783 if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len) 783 if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len)
784 return (0); 784 return (0);
785 785
786 tp->rcv_nxt += q->ipqe_len; 786 tp->rcv_nxt += q->ipqe_len;
787 pkt_flags = q->ipqe_flags & TH_FIN; 787 pkt_flags = q->ipqe_flags & TH_FIN;
788 nd6_hint(tp); 788 nd6_hint(tp);
789 789
790 TAILQ_REMOVE(&tp->segq, q, ipqe_q); 790 TAILQ_REMOVE(&tp->segq, q, ipqe_q);
791 TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq); 791 TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
792 tp->t_segqlen--; 792 tp->t_segqlen--;
793 KASSERT(tp->t_segqlen >= 0); 793 KASSERT(tp->t_segqlen >= 0);
794 KASSERT(tp->t_segqlen != 0 || 794 KASSERT(tp->t_segqlen != 0 ||
795 (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq))); 795 (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
796 if (so->so_state & SS_CANTRCVMORE) 796 if (so->so_state & SS_CANTRCVMORE)
797 m_freem(q->ipqe_m); 797 m_freem(q->ipqe_m);
798 else 798 else
799 sbappendstream(&so->so_rcv, q->ipqe_m); 799 sbappendstream(&so->so_rcv, q->ipqe_m);
800 tcpipqent_free(q); 800 tcpipqent_free(q);
801 sorwakeup(so); 801 sorwakeup(so);
802 return (pkt_flags); 802 return (pkt_flags);
803} 803}
804 804
#ifdef INET6
/*
 * IPv6 entry point for TCP.
 *
 * A packet whose mbuf is flagged M_ANYCAST6 is not given to TCP at all:
 * an ICMPv6 "destination unreachable / address" error is generated for it
 * instead, with the offset of the ip6_dst field as the final argument
 * (see draft-itojun-ipv6-tcp-to-anycast).  Everything else is passed to
 * tcp_input().  The return value is always IPPROTO_DONE.
 */
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;
	struct ip6_hdr *ip6;

	/* Ordinary case: let the common input routine handle it. */
	if ((m->m_flags & M_ANYCAST6) == 0) {
		tcp_input(m, *offp, proto);
		return IPPROTO_DONE;
	}

	/*
	 * XXX is there a better place for this check?
	 * Make sure the whole IPv6 header is in the first mbuf before
	 * taking the address of a field inside it.
	 */
	if (m->m_len < sizeof(struct ip6_hdr)) {
		m = m_pullup(m, sizeof(struct ip6_hdr));
		if (m == NULL) {
			TCP_STATINC(TCP_STAT_RCVSHORT);
			return IPPROTO_DONE;
		}
	}
	ip6 = mtod(m, struct ip6_hdr *);
	icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
	    (char *)&ip6->ip6_dst - (char *)ip6);
	return IPPROTO_DONE;
}
#endif
833 833
#ifdef INET
/*
 * Log, at LOG_INFO, a TCP connection attempt over IPv4.  The addresses are
 * taken from ip, or shown as "(unknown)" when ip is NULL; the ports come
 * from th and are converted to host byte order for printing.
 */
static void
tcp4_log_refused(const struct ip *ip, const struct tcphdr *th)
{
	/* Four groups of "123" plus separator/terminator: 16 bytes each. */
	char src[4 * sizeof("123")];
	char dst[4 * sizeof("123")];

	if (ip == NULL) {
		strlcpy(src, "(unknown)", sizeof(src));
		strlcpy(dst, "(unknown)", sizeof(dst));
	} else {
		/* Copy each result out before inet_ntoa() is called again. */
		strlcpy(src, inet_ntoa(ip->ip_src), sizeof(src));
		strlcpy(dst, inet_ntoa(ip->ip_dst), sizeof(dst));
	}

	log(LOG_INFO,
	    "Connection attempt to TCP %s:%d from %s:%d\n",
	    dst, ntohs(th->th_dport),
	    src, ntohs(th->th_sport));
}
#endif
855 855
#ifdef INET6
/*
 * Log, at LOG_INFO, a TCP connection attempt over IPv6.  The addresses are
 * taken from ip6, or shown as "(unknown v6)" when ip6 is NULL, and are
 * printed in brackets; the ports come from th and are converted to host
 * byte order for printing.
 */
static void
tcp6_log_refused(const struct ip6_hdr *ip6, const struct tcphdr *th)
{
	char src[INET6_ADDRSTRLEN];
	char dst[INET6_ADDRSTRLEN];

	if (ip6 == NULL) {
		strlcpy(src, "(unknown v6)", sizeof(src));
		strlcpy(dst, "(unknown v6)", sizeof(dst));
	} else {
		strlcpy(src, ip6_sprintf(&ip6->ip6_src), sizeof(src));
		strlcpy(dst, ip6_sprintf(&ip6->ip6_dst), sizeof(dst));
	}

	log(LOG_INFO,
	    "Connection attempt to TCP [%s]:%d from [%s]:%d\n",
	    dst, ntohs(th->th_dport),
	    src, ntohs(th->th_sport));
}
#endif
877 877
878/* 878/*
879 * Checksum extended TCP header and data. 879 * Checksum extended TCP header and data.
880 */ 880 */
881int 881int
882tcp_input_checksum(int af, struct mbuf *m, const struct tcphdr *th, 882tcp_input_checksum(int af, struct mbuf *m, const struct tcphdr *th,
883 int toff, int off, int tlen) 883 int toff, int off, int tlen)
884{ 884{
885 885
886 /* 886 /*
887 * XXX it's better to record and check if this mbuf is 887 * XXX it's better to record and check if this mbuf is
888 * already checked. 888 * already checked.
889 */ 889 */
890 890
891 switch (af) { 891 switch (af) {
892#ifdef INET 892#ifdef INET
893 case AF_INET: 893 case AF_INET:
894 switch (m->m_pkthdr.csum_flags & 894 switch (m->m_pkthdr.csum_flags &
895 ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_TCPv4) | 895 ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_TCPv4) |
896 M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) { 896 M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
897 case M_CSUM_TCPv4|M_CSUM_TCP_UDP_BAD: 897 case M_CSUM_TCPv4|M_CSUM_TCP_UDP_BAD:
898 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad); 898 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad);
899 goto badcsum; 899 goto badcsum;
900 900
901 case M_CSUM_TCPv4|M_CSUM_DATA: { 901 case M_CSUM_TCPv4|M_CSUM_DATA: {
902 u_int32_t hw_csum = m->m_pkthdr.csum_data; 902 u_int32_t hw_csum = m->m_pkthdr.csum_data;
903 903
904 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data); 904 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data);
905 if (m->m_pkthdr.csum_flags & M_CSUM_NO_PSEUDOHDR) { 905 if (m->m_pkthdr.csum_flags & M_CSUM_NO_PSEUDOHDR) {
906 const struct ip *ip = 906 const struct ip *ip =
907 mtod(m, const struct ip *); 907 mtod(m, const struct ip *);
908 908
909 hw_csum = in_cksum_phdr(ip->ip_src.s_addr, 909 hw_csum = in_cksum_phdr(ip->ip_src.s_addr,
910 ip->ip_dst.s_addr, 910 ip->ip_dst.s_addr,
911 htons(hw_csum + tlen + off + IPPROTO_TCP)); 911 htons(hw_csum + tlen + off + IPPROTO_TCP));
912 } 912 }
913 if ((hw_csum ^ 0xffff) != 0) 913 if ((hw_csum ^ 0xffff) != 0)
914 goto badcsum; 914 goto badcsum;
915 break; 915 break;
916 } 916 }
917 917
918 case M_CSUM_TCPv4: 918 case M_CSUM_TCPv4:
919 /* Checksum was okay. */ 919 /* Checksum was okay. */
920 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok); 920 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok);
921 break; 921 break;
922 922
923 default: 923 default:
924 /* 924 /*
925 * Must compute it ourselves. Maybe skip checksum 925 * Must compute it ourselves. Maybe skip checksum
926 * on loopback interfaces. 926 * on loopback interfaces.
927 */ 927 */
928 if (__predict_true(!(m->m_pkthdr.rcvif->if_flags & 928 if (__predict_true(!(m->m_pkthdr.rcvif->if_flags &
929 IFF_LOOPBACK) || 929 IFF_LOOPBACK) ||
930 tcp_do_loopback_cksum)) { 930 tcp_do_loopback_cksum)) {
931 TCP_CSUM_COUNTER_INCR(&tcp_swcsum); 931 TCP_CSUM_COUNTER_INCR(&tcp_swcsum);
932 if (in4_cksum(m, IPPROTO_TCP, toff, 932 if (in4_cksum(m, IPPROTO_TCP, toff,
933 tlen + off) != 0) 933 tlen + off) != 0)
934 goto badcsum; 934 goto badcsum;
935 } 935 }
936 break; 936 break;
937 } 937 }
938 break; 938 break;
939#endif /* INET4 */ 939#endif /* INET4 */
940 940
941#ifdef INET6 941#ifdef INET6
942 case AF_INET6: 942 case AF_INET6:
943 switch (m->m_pkthdr.csum_flags & 943 switch (m->m_pkthdr.csum_flags &
944 ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_TCPv6) | 944 ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_TCPv6) |
945 M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) { 945 M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
946 case M_CSUM_TCPv6|M_CSUM_TCP_UDP_BAD: 946 case M_CSUM_TCPv6|M_CSUM_TCP_UDP_BAD:
947 TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_bad); 947 TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_bad);
948 goto badcsum; 948 goto badcsum;
949 949
950#if 0 /* notyet */ 950#if 0 /* notyet */
951 case M_CSUM_TCPv6|M_CSUM_DATA: 951 case M_CSUM_TCPv6|M_CSUM_DATA:
952#endif 952#endif
953 953
954 case M_CSUM_TCPv6: 954 case M_CSUM_TCPv6:
955 /* Checksum was okay. */ 955 /* Checksum was okay. */
956 TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_ok); 956 TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_ok);
957 break; 957 break;
958 958
959 default: 959 default:
960 /* 960 /*
961 * Must compute it ourselves. Maybe skip checksum 961 * Must compute it ourselves. Maybe skip checksum
962 * on loopback interfaces. 962 * on loopback interfaces.
963 */ 963 */
964 if (__predict_true((m->m_flags & M_LOOP) == 0 || 964 if (__predict_true((m->m_flags & M_LOOP) == 0 ||
965 tcp_do_loopback_cksum)) { 965 tcp_do_loopback_cksum)) {
966 TCP_CSUM_COUNTER_INCR(&tcp6_swcsum); 966 TCP_CSUM_COUNTER_INCR(&tcp6_swcsum);
967 if (in6_cksum(m, IPPROTO_TCP, toff, 967 if (in6_cksum(m, IPPROTO_TCP, toff,
968 tlen + off) != 0) 968 tlen + off) != 0)
969 goto badcsum; 969 goto badcsum;
970 } 970 }
971 } 971 }
972 break; 972 break;
973#endif /* INET6 */ 973#endif /* INET6 */
974 } 974 }
975 975
976 return 0; 976 return 0;
977 977
978badcsum: 978badcsum:
979 TCP_STATINC(TCP_STAT_RCVBADSUM); 979 TCP_STATINC(TCP_STAT_RCVBADSUM);
980 return -1; 980 return -1;
981} 981}
982 982
983/* 983/*
984 * TCP input routine, follows pages 65-76 of RFC 793 very closely. 984 * TCP input routine, follows pages 65-76 of RFC 793 very closely.
985 */ 985 */
986void 986void
987tcp_input(struct mbuf *m, ...) 987tcp_input(struct mbuf *m, ...)
988{ 988{
989 struct tcphdr *th; 989 struct tcphdr *th;
990 struct ip *ip; 990 struct ip *ip;
991 struct inpcb *inp; 991 struct inpcb *inp;
992#ifdef INET6 992#ifdef INET6
993 struct ip6_hdr *ip6; 993 struct ip6_hdr *ip6;
994 struct in6pcb *in6p; 994 struct in6pcb *in6p;
995#endif 995#endif
996 u_int8_t *optp = NULL; 996 u_int8_t *optp = NULL;
997 int optlen = 0; 997 int optlen = 0;
998 int len, tlen, toff, hdroptlen = 0; 998 int len, tlen, toff, hdroptlen = 0;
999 struct tcpcb *tp = 0; 999 struct tcpcb *tp = 0;
1000 int tiflags; 1000 int tiflags;
1001 struct socket *so = NULL; 1001 struct socket *so = NULL;
1002 int todrop, dupseg, acked, ourfinisacked, needoutput = 0; 1002 int todrop, dupseg, acked, ourfinisacked, needoutput = 0;
1003#ifdef TCP_DEBUG 1003#ifdef TCP_DEBUG
1004 short ostate = 0; 1004 short ostate = 0;
1005#endif 1005#endif
1006 u_long tiwin; 1006 u_long tiwin;
1007 struct tcp_opt_info opti; 1007 struct tcp_opt_info opti;
1008 int off, iphlen; 1008 int off, iphlen;
1009 va_list ap; 1009 va_list ap;
1010 int af; /* af on the wire */ 1010 int af; /* af on the wire */
1011 struct mbuf *tcp_saveti = NULL; 1011 struct mbuf *tcp_saveti = NULL;
1012 uint32_t ts_rtt; 1012 uint32_t ts_rtt;
1013 uint8_t iptos; 1013 uint8_t iptos;
1014 uint64_t *tcps; 1014 uint64_t *tcps;
1015 1015
1016 MCLAIM(m, &tcp_rx_mowner); 1016 MCLAIM(m, &tcp_rx_mowner);
1017 va_start(ap, m); 1017 va_start(ap, m);
1018 toff = va_arg(ap, int); 1018 toff = va_arg(ap, int);
1019 (void)va_arg(ap, int); /* ignore value, advance ap */ 1019 (void)va_arg(ap, int); /* ignore value, advance ap */
1020 va_end(ap); 1020 va_end(ap);
1021 1021
1022 TCP_STATINC(TCP_STAT_RCVTOTAL); 1022 TCP_STATINC(TCP_STAT_RCVTOTAL);
1023 1023
1024 bzero(&opti, sizeof(opti)); 1024 bzero(&opti, sizeof(opti));
1025 opti.ts_present = 0; 1025 opti.ts_present = 0;
1026 opti.maxseg = 0; 1026 opti.maxseg = 0;
1027 1027
1028 /* 1028 /*
1029 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN. 1029 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN.
1030 * 1030 *
1031 * TCP is, by definition, unicast, so we reject all 1031 * TCP is, by definition, unicast, so we reject all
1032 * multicast outright. 1032 * multicast outright.
1033 * 1033 *
1034 * Note, there are additional src/dst address checks in 1034 * Note, there are additional src/dst address checks in
1035 * the AF-specific code below. 1035 * the AF-specific code below.
1036 */ 1036 */
1037 if (m->m_flags & (M_BCAST|M_MCAST)) { 1037 if (m->m_flags & (M_BCAST|M_MCAST)) {
1038 /* XXX stat */ 1038 /* XXX stat */
1039 goto drop; 1039 goto drop;
1040 } 1040 }
1041#ifdef INET6 1041#ifdef INET6
1042 if (m->m_flags & M_ANYCAST6) { 1042 if (m->m_flags & M_ANYCAST6) {
1043 /* XXX stat */ 1043 /* XXX stat */
1044 goto drop; 1044 goto drop;
1045 } 1045 }
1046#endif 1046#endif
1047 1047
1048 /* 1048 /*
1049 * Get IP and TCP header. 1049 * Get IP and TCP header.
1050 * Note: IP leaves IP header in first mbuf. 1050 * Note: IP leaves IP header in first mbuf.
1051 */ 1051 */
1052 ip = mtod(m, struct ip *); 1052 ip = mtod(m, struct ip *);
1053#ifdef INET6 1053#ifdef INET6
1054 ip6 = NULL; 1054 ip6 = NULL;
1055#endif 1055#endif
1056 switch (ip->ip_v) { 1056 switch (ip->ip_v) {
1057#ifdef INET 1057#ifdef INET
1058 case 4: 1058 case 4:
1059 af = AF_INET; 1059 af = AF_INET;
1060 iphlen = sizeof(struct ip); 1060 iphlen = sizeof(struct ip);
1061 ip = mtod(m, struct ip *); 1061 ip = mtod(m, struct ip *);
1062 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 1062 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
1063 sizeof(struct tcphdr)); 1063 sizeof(struct tcphdr));
1064 if (th == NULL) { 1064 if (th == NULL) {
1065 TCP_STATINC(TCP_STAT_RCVSHORT); 1065 TCP_STATINC(TCP_STAT_RCVSHORT);
1066 return; 1066 return;
1067 } 1067 }
1068 /* We do the checksum after PCB lookup... */ 1068 /* We do the checksum after PCB lookup... */
1069 len = ntohs(ip->ip_len); 1069 len = ntohs(ip->ip_len);
1070 tlen = len - toff; 1070 tlen = len - toff;
1071 iptos = ip->ip_tos; 1071 iptos = ip->ip_tos;
1072 break; 1072 break;
1073#endif 1073#endif
1074#ifdef INET6 1074#ifdef INET6
1075 case 6: 1075 case 6:
1076 ip = NULL; 1076 ip = NULL;
1077 iphlen = sizeof(struct ip6_hdr); 1077 iphlen = sizeof(struct ip6_hdr);
1078 af = AF_INET6; 1078 af = AF_INET6;
1079 ip6 = mtod(m, struct ip6_hdr *); 1079 ip6 = mtod(m, struct ip6_hdr *);
1080 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 1080 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
1081 sizeof(struct tcphdr)); 1081 sizeof(struct tcphdr));
1082 if (th == NULL) { 1082 if (th == NULL) {
1083 TCP_STATINC(TCP_STAT_RCVSHORT); 1083 TCP_STATINC(TCP_STAT_RCVSHORT);
1084 return; 1084 return;
1085 } 1085 }
1086 1086
1087 /* Be proactive about malicious use of IPv4 mapped address */ 1087 /* Be proactive about malicious use of IPv4 mapped address */
1088 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || 1088 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
1089 IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { 1089 IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
1090 /* XXX stat */ 1090 /* XXX stat */
1091 goto drop; 1091 goto drop;
1092 } 1092 }
1093 1093
1094 /* 1094 /*
1095 * Be proactive about unspecified IPv6 address in source. 1095 * Be proactive about unspecified IPv6 address in source.
1096 * As we use all-zero to indicate unbounded/unconnected pcb, 1096 * As we use all-zero to indicate unbounded/unconnected pcb,
1097 * unspecified IPv6 address can be used to confuse us. 1097 * unspecified IPv6 address can be used to confuse us.
1098 * 1098 *
1099 * Note that packets with unspecified IPv6 destination is 1099 * Note that packets with unspecified IPv6 destination is
1100 * already dropped in ip6_input. 1100 * already dropped in ip6_input.
1101 */ 1101 */
1102 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 1102 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
1103 /* XXX stat */ 1103 /* XXX stat */
1104 goto drop; 1104 goto drop;
1105 } 1105 }
1106 1106
1107 /* 1107 /*
1108 * Make sure destination address is not multicast. 1108 * Make sure destination address is not multicast.
1109 * Source address checked in ip6_input(). 1109 * Source address checked in ip6_input().
1110 */ 1110 */
1111 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { 1111 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
1112 /* XXX stat */ 1112 /* XXX stat */
1113 goto drop; 1113 goto drop;
1114 } 1114 }
1115 1115
1116 /* We do the checksum after PCB lookup... */ 1116 /* We do the checksum after PCB lookup... */
1117 len = m->m_pkthdr.len; 1117 len = m->m_pkthdr.len;
1118 tlen = len - toff; 1118 tlen = len - toff;
1119 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 1119 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
1120 break; 1120 break;
1121#endif 1121#endif
1122 default: 1122 default:
1123 m_freem(m); 1123 m_freem(m);
1124 return; 1124 return;
1125 } 1125 }
1126 1126
1127 KASSERT(TCP_HDR_ALIGNED_P(th)); 1127 KASSERT(TCP_HDR_ALIGNED_P(th));
1128 1128
1129 /* 1129 /*
1130 * Check that TCP offset makes sense, 1130 * Check that TCP offset makes sense,
1131 * pull out TCP options and adjust length. XXX 1131 * pull out TCP options and adjust length. XXX
1132 */ 1132 */
1133 off = th->th_off << 2; 1133 off = th->th_off << 2;
1134 if (off < sizeof (struct tcphdr) || off > tlen) { 1134 if (off < sizeof (struct tcphdr) || off > tlen) {
1135 TCP_STATINC(TCP_STAT_RCVBADOFF); 1135 TCP_STATINC(TCP_STAT_RCVBADOFF);
1136 goto drop; 1136 goto drop;
1137 } 1137 }
1138 tlen -= off; 1138 tlen -= off;
1139 1139
1140 /* 1140 /*
1141 * tcp_input() has been modified to use tlen to mean the TCP data 1141 * tcp_input() has been modified to use tlen to mean the TCP data
1142 * length throughout the function. Other functions can use 1142 * length throughout the function. Other functions can use
1143 * m->m_pkthdr.len as the basis for calculating the TCP data length. 1143 * m->m_pkthdr.len as the basis for calculating the TCP data length.
1144 * rja 1144 * rja
1145 */ 1145 */
1146 1146
1147 if (off > sizeof (struct tcphdr)) { 1147 if (off > sizeof (struct tcphdr)) {
@@ -1442,1999 +1442,2002 @@ findpcb: @@ -1442,1999 +1442,2002 @@ findpcb:
1442 */ 1442 */
1443 m = NULL; 1443 m = NULL;
1444 } else { 1444 } else {
1445 /* 1445 /*
1446 * We have created a 1446 * We have created a
1447 * full-blown connection. 1447 * full-blown connection.
1448 */ 1448 */
1449 tp = NULL; 1449 tp = NULL;
1450 inp = NULL; 1450 inp = NULL;
1451#ifdef INET6 1451#ifdef INET6
1452 in6p = NULL; 1452 in6p = NULL;
1453#endif 1453#endif
1454 switch (so->so_proto->pr_domain->dom_family) { 1454 switch (so->so_proto->pr_domain->dom_family) {
1455#ifdef INET 1455#ifdef INET
1456 case AF_INET: 1456 case AF_INET:
1457 inp = sotoinpcb(so); 1457 inp = sotoinpcb(so);
1458 tp = intotcpcb(inp); 1458 tp = intotcpcb(inp);
1459 break; 1459 break;
1460#endif 1460#endif
1461#ifdef INET6 1461#ifdef INET6
1462 case AF_INET6: 1462 case AF_INET6:
1463 in6p = sotoin6pcb(so); 1463 in6p = sotoin6pcb(so);
1464 tp = in6totcpcb(in6p); 1464 tp = in6totcpcb(in6p);
1465 break; 1465 break;
1466#endif 1466#endif
1467 } 1467 }
1468 if (tp == NULL) 1468 if (tp == NULL)
1469 goto badsyn; /*XXX*/ 1469 goto badsyn; /*XXX*/
1470 tiwin <<= tp->snd_scale; 1470 tiwin <<= tp->snd_scale;
1471 goto after_listen; 1471 goto after_listen;
1472 } 1472 }
1473 } else { 1473 } else {
1474 /* 1474 /*
1475 * None of RST, SYN or ACK was set. 1475 * None of RST, SYN or ACK was set.
1476 * This is an invalid packet for a 1476 * This is an invalid packet for a
1477 * TCB in LISTEN state. Send a RST. 1477 * TCB in LISTEN state. Send a RST.
1478 */ 1478 */
1479 goto badsyn; 1479 goto badsyn;
1480 } 1480 }
1481 } else { 1481 } else {
1482 /* 1482 /*
1483 * Received a SYN. 1483 * Received a SYN.
1484 * 1484 *
1485 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 1485 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
1486 */ 1486 */
1487 if (m->m_flags & (M_BCAST|M_MCAST)) 1487 if (m->m_flags & (M_BCAST|M_MCAST))
1488 goto drop; 1488 goto drop;
1489 1489
1490 switch (af) { 1490 switch (af) {
1491#ifdef INET6 1491#ifdef INET6
1492 case AF_INET6: 1492 case AF_INET6:
1493 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) 1493 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
1494 goto drop; 1494 goto drop;
1495 break; 1495 break;
1496#endif /* INET6 */ 1496#endif /* INET6 */
1497 case AF_INET: 1497 case AF_INET:
1498 if (IN_MULTICAST(ip->ip_dst.s_addr) || 1498 if (IN_MULTICAST(ip->ip_dst.s_addr) ||
1499 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 1499 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
1500 goto drop; 1500 goto drop;
1501 break; 1501 break;
1502 } 1502 }
1503 1503
1504#ifdef INET6 1504#ifdef INET6
1505 /* 1505 /*
1506 * If deprecated address is forbidden, we do 1506 * If deprecated address is forbidden, we do
1507 * not accept SYN to deprecated interface 1507 * not accept SYN to deprecated interface
1508 * address to prevent any new inbound 1508 * address to prevent any new inbound
1509 * connection from getting established. 1509 * connection from getting established.
1510 * When we do not accept SYN, we send a TCP 1510 * When we do not accept SYN, we send a TCP
1511 * RST, with deprecated source address (instead 1511 * RST, with deprecated source address (instead
1512 * of dropping it). We compromise it as it is 1512 * of dropping it). We compromise it as it is
1513 * much better for peer to send a RST, and 1513 * much better for peer to send a RST, and
1514 * RST will be the final packet for the 1514 * RST will be the final packet for the
1515 * exchange. 1515 * exchange.
1516 * 1516 *
1517 * If we do not forbid deprecated addresses, we 1517 * If we do not forbid deprecated addresses, we
1518 * accept the SYN packet. RFC2462 does not 1518 * accept the SYN packet. RFC2462 does not
1519 * suggest dropping SYN in this case. 1519 * suggest dropping SYN in this case.
1520 * If we decipher RFC2462 5.5.4, it says like 1520 * If we decipher RFC2462 5.5.4, it says like
1521 * this: 1521 * this:
1522 * 1. use of deprecated addr with existing 1522 * 1. use of deprecated addr with existing
1523 * communication is okay - "SHOULD continue 1523 * communication is okay - "SHOULD continue
1524 * to be used" 1524 * to be used"
1525 * 2. use of it with new communication: 1525 * 2. use of it with new communication:
1526 * (2a) "SHOULD NOT be used if alternate 1526 * (2a) "SHOULD NOT be used if alternate
1527 * address with sufficient scope is 1527 * address with sufficient scope is
1528 * available" 1528 * available"
1529 * (2b) nothing mentioned otherwise. 1529 * (2b) nothing mentioned otherwise.
1530 * Here we fall into (2b) case as we have no 1530 * Here we fall into (2b) case as we have no
1531 * choice in our source address selection - we 1531 * choice in our source address selection - we
1532 * must obey the peer. 1532 * must obey the peer.
1533 * 1533 *
1534 * The wording in RFC2462 is confusing, and 1534 * The wording in RFC2462 is confusing, and
1535 * there are multiple description text for 1535 * there are multiple description text for
1536 * deprecated address handling - worse, they 1536 * deprecated address handling - worse, they
1537 * are not exactly the same. I believe 5.5.4 1537 * are not exactly the same. I believe 5.5.4
1538 * is the best one, so we follow 5.5.4. 1538 * is the best one, so we follow 5.5.4.
1539 */ 1539 */
1540 if (af == AF_INET6 && !ip6_use_deprecated) { 1540 if (af == AF_INET6 && !ip6_use_deprecated) {
1541 struct in6_ifaddr *ia6; 1541 struct in6_ifaddr *ia6;
1542 if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif, 1542 if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
1543 &ip6->ip6_dst)) && 1543 &ip6->ip6_dst)) &&
1544 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { 1544 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
1545 tp = NULL; 1545 tp = NULL;
1546 goto dropwithreset; 1546 goto dropwithreset;
1547 } 1547 }
1548 } 1548 }
1549#endif 1549#endif
1550 1550
1551#if defined(IPSEC) || defined(FAST_IPSEC) 1551#if defined(IPSEC) || defined(FAST_IPSEC)
1552 switch (af) { 1552 switch (af) {
1553#ifdef INET 1553#ifdef INET
1554 case AF_INET: 1554 case AF_INET:
1555 if (ipsec4_in_reject_so(m, so)) { 1555 if (ipsec4_in_reject_so(m, so)) {
1556 IPSEC_STATINC(IPSEC_STAT_IN_POLVIO); 1556 IPSEC_STATINC(IPSEC_STAT_IN_POLVIO);
1557 tp = NULL; 1557 tp = NULL;
1558 goto dropwithreset; 1558 goto dropwithreset;
1559 } 1559 }
1560 break; 1560 break;
1561#endif 1561#endif
1562#ifdef INET6 1562#ifdef INET6
1563 case AF_INET6: 1563 case AF_INET6:
1564 if (ipsec6_in_reject_so(m, so)) { 1564 if (ipsec6_in_reject_so(m, so)) {
1565 IPSEC6_STATINC(IPSEC_STAT_IN_POLVIO); 1565 IPSEC6_STATINC(IPSEC_STAT_IN_POLVIO);
1566 tp = NULL; 1566 tp = NULL;
1567 goto dropwithreset; 1567 goto dropwithreset;
1568 } 1568 }
1569 break; 1569 break;
1570#endif /*INET6*/ 1570#endif /*INET6*/
1571 } 1571 }
1572#endif /*IPSEC*/ 1572#endif /*IPSEC*/
1573 1573
1574 /* 1574 /*
1575 * LISTEN socket received a SYN 1575 * LISTEN socket received a SYN
1576 * from itself? This can't possibly 1576 * from itself? This can't possibly
1577 * be valid; drop the packet. 1577 * be valid; drop the packet.
1578 */ 1578 */
1579 if (th->th_sport == th->th_dport) { 1579 if (th->th_sport == th->th_dport) {
1580 int i; 1580 int i;
1581 1581
1582 switch (af) { 1582 switch (af) {
1583#ifdef INET 1583#ifdef INET
1584 case AF_INET: 1584 case AF_INET:
1585 i = in_hosteq(ip->ip_src, ip->ip_dst); 1585 i = in_hosteq(ip->ip_src, ip->ip_dst);
1586 break; 1586 break;
1587#endif 1587#endif
1588#ifdef INET6 1588#ifdef INET6
1589 case AF_INET6: 1589 case AF_INET6:
1590 i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst); 1590 i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst);
1591 break; 1591 break;
1592#endif 1592#endif
1593 default: 1593 default:
1594 i = 1; 1594 i = 1;
1595 } 1595 }
1596 if (i) { 1596 if (i) {
1597 TCP_STATINC(TCP_STAT_BADSYN); 1597 TCP_STATINC(TCP_STAT_BADSYN);
1598 goto drop; 1598 goto drop;
1599 } 1599 }
1600 } 1600 }
1601 1601
1602 /* 1602 /*
1603 * SYN looks ok; create compressed TCP 1603 * SYN looks ok; create compressed TCP
1604 * state for it. 1604 * state for it.
1605 */ 1605 */
1606 if (so->so_qlen <= so->so_qlimit && 1606 if (so->so_qlen <= so->so_qlimit &&
1607 syn_cache_add(&src.sa, &dst.sa, th, tlen, 1607 syn_cache_add(&src.sa, &dst.sa, th, tlen,
1608 so, m, optp, optlen, &opti)) 1608 so, m, optp, optlen, &opti))
1609 m = NULL; 1609 m = NULL;
1610 } 1610 }
1611 goto drop; 1611 goto drop;
1612 } 1612 }
1613 } 1613 }
1614 1614
1615after_listen: 1615after_listen:
1616#ifdef DIAGNOSTIC 1616#ifdef DIAGNOSTIC
1617 /* 1617 /*
1618 * Should not happen now that all embryonic connections 1618 * Should not happen now that all embryonic connections
1619 * are handled with compressed state. 1619 * are handled with compressed state.
1620 */ 1620 */
1621 if (tp->t_state == TCPS_LISTEN) 1621 if (tp->t_state == TCPS_LISTEN)
1622 panic("tcp_input: TCPS_LISTEN"); 1622 panic("tcp_input: TCPS_LISTEN");
1623#endif 1623#endif
1624 1624
1625 /* 1625 /*
1626 * Segment received on connection. 1626 * Segment received on connection.
1627 * Reset idle time and keep-alive timer. 1627 * Reset idle time and keep-alive timer.
1628 */ 1628 */
1629 tp->t_rcvtime = tcp_now; 1629 tp->t_rcvtime = tcp_now;
1630 if (TCPS_HAVEESTABLISHED(tp->t_state)) 1630 if (TCPS_HAVEESTABLISHED(tp->t_state))
1631 TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle); 1631 TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);
1632 1632
1633 /* 1633 /*
1634 * Process options. 1634 * Process options.
1635 */ 1635 */
1636#ifdef TCP_SIGNATURE 1636#ifdef TCP_SIGNATURE
1637 if (optp || (tp->t_flags & TF_SIGNATURE)) 1637 if (optp || (tp->t_flags & TF_SIGNATURE))
1638#else 1638#else
1639 if (optp) 1639 if (optp)
1640#endif 1640#endif
1641 if (tcp_dooptions(tp, optp, optlen, th, m, toff, &opti) < 0) 1641 if (tcp_dooptions(tp, optp, optlen, th, m, toff, &opti) < 0)
1642 goto drop; 1642 goto drop;
1643 1643
1644 if (TCP_SACK_ENABLED(tp)) { 1644 if (TCP_SACK_ENABLED(tp)) {
1645 tcp_del_sackholes(tp, th); 1645 tcp_del_sackholes(tp, th);
1646 } 1646 }
1647 1647
1648 if (TCP_ECN_ALLOWED(tp)) { 1648 if (TCP_ECN_ALLOWED(tp)) {
1649 switch (iptos & IPTOS_ECN_MASK) { 1649 switch (iptos & IPTOS_ECN_MASK) {
1650 case IPTOS_ECN_CE: 1650 case IPTOS_ECN_CE:
1651 tp->t_flags |= TF_ECN_SND_ECE; 1651 tp->t_flags |= TF_ECN_SND_ECE;
1652 TCP_STATINC(TCP_STAT_ECN_CE); 1652 TCP_STATINC(TCP_STAT_ECN_CE);
1653 break; 1653 break;
1654 case IPTOS_ECN_ECT0: 1654 case IPTOS_ECN_ECT0:
1655 TCP_STATINC(TCP_STAT_ECN_ECT); 1655 TCP_STATINC(TCP_STAT_ECN_ECT);
1656 break; 1656 break;
1657 case IPTOS_ECN_ECT1: 1657 case IPTOS_ECN_ECT1:
1658 /* XXX: ignore for now -- rpaulo */ 1658 /* XXX: ignore for now -- rpaulo */
1659 break; 1659 break;
1660 } 1660 }
1661 1661
1662 if (tiflags & TH_CWR) 1662 if (tiflags & TH_CWR)
1663 tp->t_flags &= ~TF_ECN_SND_ECE; 1663 tp->t_flags &= ~TF_ECN_SND_ECE;
1664 1664
1665 /* 1665 /*
1666 * Congestion experienced. 1666 * Congestion experienced.
1667 * Ignore if we are already trying to recover. 1667 * Ignore if we are already trying to recover.
1668 */ 1668 */
1669 if ((tiflags & TH_ECE) && SEQ_GEQ(tp->snd_una, tp->snd_recover)) 1669 if ((tiflags & TH_ECE) && SEQ_GEQ(tp->snd_una, tp->snd_recover))
1670 tp->t_congctl->cong_exp(tp); 1670 tp->t_congctl->cong_exp(tp);
1671 } 1671 }
1672 1672
1673 if (opti.ts_present && opti.ts_ecr) { 1673 if (opti.ts_present && opti.ts_ecr) {
1674 /* 1674 /*
1675 * Calculate the RTT from the returned time stamp and the 1675 * Calculate the RTT from the returned time stamp and the
1676 * connection's time base. If the time stamp is later than 1676 * connection's time base. If the time stamp is later than
1677 * the current time, or is extremely old, fall back to non-1323 1677 * the current time, or is extremely old, fall back to non-1323
1678 * RTT calculation. Since ts_ecr is unsigned, we can test both 1678 * RTT calculation. Since ts_ecr is unsigned, we can test both
1679 * at the same time. 1679 * at the same time.
1680 */ 1680 */
1681 ts_rtt = TCP_TIMESTAMP(tp) - opti.ts_ecr + 1; 1681 ts_rtt = TCP_TIMESTAMP(tp) - opti.ts_ecr + 1;
1682 if (ts_rtt > TCP_PAWS_IDLE) 1682 if (ts_rtt > TCP_PAWS_IDLE)
1683 ts_rtt = 0; 1683 ts_rtt = 0;
1684 } else { 1684 } else {
1685 ts_rtt = 0; 1685 ts_rtt = 0;
1686 } 1686 }
1687 1687
1688 /* 1688 /*
1689 * Header prediction: check for the two common cases 1689 * Header prediction: check for the two common cases
1690 * of a uni-directional data xfer. If the packet has 1690 * of a uni-directional data xfer. If the packet has
1691 * no control flags, is in-sequence, the window didn't 1691 * no control flags, is in-sequence, the window didn't
1692 * change and we're not retransmitting, it's a 1692 * change and we're not retransmitting, it's a
1693 * candidate. If the length is zero and the ack moved 1693 * candidate. If the length is zero and the ack moved
1694 * forward, we're the sender side of the xfer. Just 1694 * forward, we're the sender side of the xfer. Just
1695 * free the data acked & wake any higher level process 1695 * free the data acked & wake any higher level process
1696 * that was blocked waiting for space. If the length 1696 * that was blocked waiting for space. If the length
1697 * is non-zero and the ack didn't move, we're the 1697 * is non-zero and the ack didn't move, we're the
1698 * receiver side. If we're getting packets in-order 1698 * receiver side. If we're getting packets in-order
1699 * (the reassembly queue is empty), add the data to 1699 * (the reassembly queue is empty), add the data to
1700 * the socket buffer and note that we need a delayed ack. 1700 * the socket buffer and note that we need a delayed ack.
1701 */ 1701 */
1702 if (tp->t_state == TCPS_ESTABLISHED && 1702 if (tp->t_state == TCPS_ESTABLISHED &&
1703 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) 1703 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK))
1704 == TH_ACK && 1704 == TH_ACK &&
1705 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) && 1705 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
1706 th->th_seq == tp->rcv_nxt && 1706 th->th_seq == tp->rcv_nxt &&
1707 tiwin && tiwin == tp->snd_wnd && 1707 tiwin && tiwin == tp->snd_wnd &&
1708 tp->snd_nxt == tp->snd_max) { 1708 tp->snd_nxt == tp->snd_max) {
1709 1709
1710 /* 1710 /*
1711 * If last ACK falls within this segment's sequence numbers, 1711 * If last ACK falls within this segment's sequence numbers,
1712 * record the timestamp. 1712 * record the timestamp.
1713 * NOTE that the test is modified according to the latest 1713 * NOTE that the test is modified according to the latest
1714 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1714 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1715 * 1715 *
1716 * note that we already know 1716 * note that we already know
1717 * TSTMP_GEQ(opti.ts_val, tp->ts_recent) 1717 * TSTMP_GEQ(opti.ts_val, tp->ts_recent)
1718 */ 1718 */
1719 if (opti.ts_present && 1719 if (opti.ts_present &&
1720 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1720 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1721 tp->ts_recent_age = tcp_now; 1721 tp->ts_recent_age = tcp_now;
1722 tp->ts_recent = opti.ts_val; 1722 tp->ts_recent = opti.ts_val;
1723 } 1723 }
1724 1724
1725 if (tlen == 0) { 1725 if (tlen == 0) {
1726 /* Ack prediction. */ 1726 /* Ack prediction. */
1727 if (SEQ_GT(th->th_ack, tp->snd_una) && 1727 if (SEQ_GT(th->th_ack, tp->snd_una) &&
1728 SEQ_LEQ(th->th_ack, tp->snd_max) && 1728 SEQ_LEQ(th->th_ack, tp->snd_max) &&
1729 tp->snd_cwnd >= tp->snd_wnd && 1729 tp->snd_cwnd >= tp->snd_wnd &&
1730 tp->t_partialacks < 0) { 1730 tp->t_partialacks < 0) {
1731 /* 1731 /*
1732 * this is a pure ack for outstanding data. 1732 * this is a pure ack for outstanding data.
1733 */ 1733 */
1734 if (ts_rtt) 1734 if (ts_rtt)
1735 tcp_xmit_timer(tp, ts_rtt); 1735 tcp_xmit_timer(tp, ts_rtt);
1736 else if (tp->t_rtttime && 1736 else if (tp->t_rtttime &&
1737 SEQ_GT(th->th_ack, tp->t_rtseq)) 1737 SEQ_GT(th->th_ack, tp->t_rtseq))
1738 tcp_xmit_timer(tp, 1738 tcp_xmit_timer(tp,
1739 tcp_now - tp->t_rtttime); 1739 tcp_now - tp->t_rtttime);
1740 acked = th->th_ack - tp->snd_una; 1740 acked = th->th_ack - tp->snd_una;
1741 tcps = TCP_STAT_GETREF(); 1741 tcps = TCP_STAT_GETREF();
1742 tcps[TCP_STAT_PREDACK]++; 1742 tcps[TCP_STAT_PREDACK]++;
1743 tcps[TCP_STAT_RCVACKPACK]++; 1743 tcps[TCP_STAT_RCVACKPACK]++;
1744 tcps[TCP_STAT_RCVACKBYTE] += acked; 1744 tcps[TCP_STAT_RCVACKBYTE] += acked;
1745 TCP_STAT_PUTREF(); 1745 TCP_STAT_PUTREF();
1746 nd6_hint(tp); 1746 nd6_hint(tp);
1747 1747
1748 if (acked > (tp->t_lastoff - tp->t_inoff)) 1748 if (acked > (tp->t_lastoff - tp->t_inoff))
1749 tp->t_lastm = NULL; 1749 tp->t_lastm = NULL;
1750 sbdrop(&so->so_snd, acked); 1750 sbdrop(&so->so_snd, acked);
1751 tp->t_lastoff -= acked; 1751 tp->t_lastoff -= acked;
1752 1752
1753 icmp_check(tp, th, acked); 1753 icmp_check(tp, th, acked);
1754 1754
1755 tp->snd_una = th->th_ack; 1755 tp->snd_una = th->th_ack;
1756 tp->snd_fack = tp->snd_una; 1756 tp->snd_fack = tp->snd_una;
1757 if (SEQ_LT(tp->snd_high, tp->snd_una)) 1757 if (SEQ_LT(tp->snd_high, tp->snd_una))
1758 tp->snd_high = tp->snd_una; 1758 tp->snd_high = tp->snd_una;
1759 m_freem(m); 1759 m_freem(m);
1760 1760
1761 /* 1761 /*
1762 * If all outstanding data are acked, stop 1762 * If all outstanding data are acked, stop
1763 * retransmit timer, otherwise restart timer 1763 * retransmit timer, otherwise restart timer
1764 * using current (possibly backed-off) value. 1764 * using current (possibly backed-off) value.
1765 * If process is waiting for space, 1765 * If process is waiting for space,
1766 * wakeup/selnotify/signal. If data 1766 * wakeup/selnotify/signal. If data
1767 * are ready to send, let tcp_output 1767 * are ready to send, let tcp_output
1768 * decide between more output or persist. 1768 * decide between more output or persist.
1769 */ 1769 */
1770 if (tp->snd_una == tp->snd_max) 1770 if (tp->snd_una == tp->snd_max)
1771 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1771 TCP_TIMER_DISARM(tp, TCPT_REXMT);
1772 else if (TCP_TIMER_ISARMED(tp, 1772 else if (TCP_TIMER_ISARMED(tp,
1773 TCPT_PERSIST) == 0) 1773 TCPT_PERSIST) == 0)
1774 TCP_TIMER_ARM(tp, TCPT_REXMT, 1774 TCP_TIMER_ARM(tp, TCPT_REXMT,
1775 tp->t_rxtcur); 1775 tp->t_rxtcur);
1776 1776
1777 sowwakeup(so); 1777 sowwakeup(so);
1778 if (so->so_snd.sb_cc) 1778 if (so->so_snd.sb_cc)
1779 (void) tcp_output(tp); 1779 (void) tcp_output(tp);
1780 if (tcp_saveti) 1780 if (tcp_saveti)
1781 m_freem(tcp_saveti); 1781 m_freem(tcp_saveti);
1782 return; 1782 return;
1783 } 1783 }
1784 } else if (th->th_ack == tp->snd_una && 1784 } else if (th->th_ack == tp->snd_una &&
1785 TAILQ_FIRST(&tp->segq) == NULL && 1785 TAILQ_FIRST(&tp->segq) == NULL &&
1786 tlen <= sbspace(&so->so_rcv)) { 1786 tlen <= sbspace(&so->so_rcv)) {
1787 int newsize = 0; /* automatic sockbuf scaling */ 1787 int newsize = 0; /* automatic sockbuf scaling */
1788 1788
1789 /* 1789 /*
1790 * this is a pure, in-sequence data packet 1790 * this is a pure, in-sequence data packet
1791 * with nothing on the reassembly queue and 1791 * with nothing on the reassembly queue and
1792 * we have enough buffer space to take it. 1792 * we have enough buffer space to take it.
1793 */ 1793 */
1794 tp->rcv_nxt += tlen; 1794 tp->rcv_nxt += tlen;
1795 tcps = TCP_STAT_GETREF(); 1795 tcps = TCP_STAT_GETREF();
1796 tcps[TCP_STAT_PREDDAT]++; 1796 tcps[TCP_STAT_PREDDAT]++;
1797 tcps[TCP_STAT_RCVPACK]++; 1797 tcps[TCP_STAT_RCVPACK]++;
1798 tcps[TCP_STAT_RCVBYTE] += tlen; 1798 tcps[TCP_STAT_RCVBYTE] += tlen;
1799 TCP_STAT_PUTREF(); 1799 TCP_STAT_PUTREF();
1800 nd6_hint(tp); 1800 nd6_hint(tp);
1801 1801
1802 /* 1802 /*
1803 * Automatic sizing enables the performance of large buffers 1803 * Automatic sizing enables the performance of large buffers
1804 * and most of the efficiency of small ones by only allocating 1804 * and most of the efficiency of small ones by only allocating
1805 * space when it is needed. 1805 * space when it is needed.
1806 * 1806 *
1807 * On the receive side the socket buffer memory is only rarely 1807 * On the receive side the socket buffer memory is only rarely
1808 * used to any significant extent. This allows us to be much 1808 * used to any significant extent. This allows us to be much
1809 * more aggressive in scaling the receive socket buffer. For 1809 * more aggressive in scaling the receive socket buffer. For
1810 * the case that the buffer space is actually used to a large 1810 * the case that the buffer space is actually used to a large
1811 * extent and we run out of kernel memory we can simply drop 1811 * extent and we run out of kernel memory we can simply drop
1812 * the new segments; TCP on the sender will just retransmit it 1812 * the new segments; TCP on the sender will just retransmit it
1813 * later. Setting the buffer size too big may only consume too 1813 * later. Setting the buffer size too big may only consume too
1814 * much kernel memory if the application doesn't read() from 1814 * much kernel memory if the application doesn't read() from
1815 * the socket or packet loss or reordering makes use of the 1815 * the socket or packet loss or reordering makes use of the
1816 * reassembly queue. 1816 * reassembly queue.
1817 * 1817 *
1818 * The criteria to step up the receive buffer one notch are: 1818 * The criteria to step up the receive buffer one notch are:
1819 * 1. the number of bytes received during the time it takes 1819 * 1. the number of bytes received during the time it takes
1820 * one timestamp to be reflected back to us (the RTT); 1820 * one timestamp to be reflected back to us (the RTT);
1821 * 2. received bytes per RTT is within seven eighth of the 1821 * 2. received bytes per RTT is within seven eighth of the
1822 * current socket buffer size; 1822 * current socket buffer size;
1823 * 3. receive buffer size has not hit maximal automatic size; 1823 * 3. receive buffer size has not hit maximal automatic size;
1824 * 1824 *
1825 * This algorithm does one step per RTT at most and only if 1825 * This algorithm does one step per RTT at most and only if
1826 * we receive a bulk stream w/o packet losses or reorderings. 1826 * we receive a bulk stream w/o packet losses or reorderings.
1827 * Shrinking the buffer during idle times is not necessary as 1827 * Shrinking the buffer during idle times is not necessary as
1828 * it doesn't consume any memory when idle. 1828 * it doesn't consume any memory when idle.
1829 * 1829 *
1830 * TODO: Only step up if the application is actually serving 1830 * TODO: Only step up if the application is actually serving
1831 * the buffer to better manage the socket buffer resources. 1831 * the buffer to better manage the socket buffer resources.
1832 */ 1832 */
1833 if (tcp_do_autorcvbuf && 1833 if (tcp_do_autorcvbuf &&
1834 opti.ts_ecr && 1834 opti.ts_ecr &&
1835 (so->so_rcv.sb_flags & SB_AUTOSIZE)) { 1835 (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
1836 if (opti.ts_ecr > tp->rfbuf_ts && 1836 if (opti.ts_ecr > tp->rfbuf_ts &&
1837 opti.ts_ecr - tp->rfbuf_ts < PR_SLOWHZ) { 1837 opti.ts_ecr - tp->rfbuf_ts < PR_SLOWHZ) {
1838 if (tp->rfbuf_cnt > 1838 if (tp->rfbuf_cnt >
1839 (so->so_rcv.sb_hiwat / 8 * 7) && 1839 (so->so_rcv.sb_hiwat / 8 * 7) &&
1840 so->so_rcv.sb_hiwat < 1840 so->so_rcv.sb_hiwat <
1841 tcp_autorcvbuf_max) { 1841 tcp_autorcvbuf_max) {
1842 newsize = 1842 newsize =
1843 min(so->so_rcv.sb_hiwat + 1843 min(so->so_rcv.sb_hiwat +
1844 tcp_autorcvbuf_inc, 1844 tcp_autorcvbuf_inc,
1845 tcp_autorcvbuf_max); 1845 tcp_autorcvbuf_max);
1846 } 1846 }
1847 /* Start over with next RTT. */ 1847 /* Start over with next RTT. */
1848 tp->rfbuf_ts = 0; 1848 tp->rfbuf_ts = 0;
1849 tp->rfbuf_cnt = 0; 1849 tp->rfbuf_cnt = 0;
1850 } else 1850 } else
1851 tp->rfbuf_cnt += tlen; /* add up */ 1851 tp->rfbuf_cnt += tlen; /* add up */
1852 } 1852 }
1853 1853
1854 /* 1854 /*
1855 * Drop TCP, IP headers and TCP options then add data 1855 * Drop TCP, IP headers and TCP options then add data
1856 * to socket buffer. 1856 * to socket buffer.
1857 */ 1857 */
1858 if (so->so_state & SS_CANTRCVMORE) 1858 if (so->so_state & SS_CANTRCVMORE)
1859 m_freem(m); 1859 m_freem(m);
1860 else { 1860 else {
1861 /* 1861 /*
1862 * Set new socket buffer size. 1862 * Set new socket buffer size.
1863 * Give up when limit is reached. 1863 * Give up when limit is reached.
1864 */ 1864 */
1865 if (newsize) 1865 if (newsize)
1866 if (!sbreserve(&so->so_rcv, 1866 if (!sbreserve(&so->so_rcv,
1867 newsize, so)) 1867 newsize, so))
1868 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 1868 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
1869 m_adj(m, toff + off); 1869 m_adj(m, toff + off);
1870 sbappendstream(&so->so_rcv, m); 1870 sbappendstream(&so->so_rcv, m);
1871 } 1871 }
1872 sorwakeup(so); 1872 sorwakeup(so);
1873 tcp_setup_ack(tp, th); 1873 tcp_setup_ack(tp, th);
1874 if (tp->t_flags & TF_ACKNOW) 1874 if (tp->t_flags & TF_ACKNOW)
1875 (void) tcp_output(tp); 1875 (void) tcp_output(tp);
1876 if (tcp_saveti) 1876 if (tcp_saveti)
1877 m_freem(tcp_saveti); 1877 m_freem(tcp_saveti);
1878 return; 1878 return;
1879 } 1879 }
1880 } 1880 }
1881 1881
1882 /* 1882 /*
1883 * Compute mbuf offset to TCP data segment. 1883 * Compute mbuf offset to TCP data segment.
1884 */ 1884 */
1885 hdroptlen = toff + off; 1885 hdroptlen = toff + off;
1886 1886
1887 /* 1887 /*
1888 * Calculate amount of space in receive window, 1888 * Calculate amount of space in receive window,
1889 * and then do TCP input processing. 1889 * and then do TCP input processing.
1890 * Receive window is amount of space in rcv queue, 1890 * Receive window is amount of space in rcv queue,
1891 * but not less than advertised window. 1891 * but not less than advertised window.
1892 */ 1892 */
1893 { int win; 1893 { int win;
1894 1894
1895 win = sbspace(&so->so_rcv); 1895 win = sbspace(&so->so_rcv);
1896 if (win < 0) 1896 if (win < 0)
1897 win = 0; 1897 win = 0;
1898 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1898 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1899 } 1899 }
1900 1900
1901 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 1901 /* Reset receive buffer auto scaling when not in bulk receive mode. */
1902 tp->rfbuf_ts = 0; 1902 tp->rfbuf_ts = 0;
1903 tp->rfbuf_cnt = 0; 1903 tp->rfbuf_cnt = 0;
1904 1904
1905 switch (tp->t_state) { 1905 switch (tp->t_state) {
1906 /* 1906 /*
1907 * If the state is SYN_SENT: 1907 * If the state is SYN_SENT:
1908 * if seg contains an ACK, but not for our SYN, drop the input. 1908 * if seg contains an ACK, but not for our SYN, drop the input.
1909 * if seg contains a RST, then drop the connection. 1909 * if seg contains a RST, then drop the connection.
1910 * if seg does not contain SYN, then drop it. 1910 * if seg does not contain SYN, then drop it.
1911 * Otherwise this is an acceptable SYN segment 1911 * Otherwise this is an acceptable SYN segment
1912 * initialize tp->rcv_nxt and tp->irs 1912 * initialize tp->rcv_nxt and tp->irs
1913 * if seg contains ack then advance tp->snd_una 1913 * if seg contains ack then advance tp->snd_una
1914 * if seg contains a ECE and ECN support is enabled, the stream 1914 * if seg contains a ECE and ECN support is enabled, the stream
1915 * is ECN capable. 1915 * is ECN capable.
1916 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1916 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1917 * arrange for segment to be acked (eventually) 1917 * arrange for segment to be acked (eventually)
1918 * continue processing rest of data/controls, beginning with URG 1918 * continue processing rest of data/controls, beginning with URG
1919 */ 1919 */
1920 case TCPS_SYN_SENT: 1920 case TCPS_SYN_SENT:
1921 if ((tiflags & TH_ACK) && 1921 if ((tiflags & TH_ACK) &&
1922 (SEQ_LEQ(th->th_ack, tp->iss) || 1922 (SEQ_LEQ(th->th_ack, tp->iss) ||
1923 SEQ_GT(th->th_ack, tp->snd_max))) 1923 SEQ_GT(th->th_ack, tp->snd_max)))
1924 goto dropwithreset; 1924 goto dropwithreset;
1925 if (tiflags & TH_RST) { 1925 if (tiflags & TH_RST) {
1926 if (tiflags & TH_ACK) 1926 if (tiflags & TH_ACK)
1927 tp = tcp_drop(tp, ECONNREFUSED); 1927 tp = tcp_drop(tp, ECONNREFUSED);
1928 goto drop; 1928 goto drop;
1929 } 1929 }
1930 if ((tiflags & TH_SYN) == 0) 1930 if ((tiflags & TH_SYN) == 0)
1931 goto drop; 1931 goto drop;
1932 if (tiflags & TH_ACK) { 1932 if (tiflags & TH_ACK) {
1933 tp->snd_una = th->th_ack; 1933 tp->snd_una = th->th_ack;
1934 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1934 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1935 tp->snd_nxt = tp->snd_una; 1935 tp->snd_nxt = tp->snd_una;
1936 if (SEQ_LT(tp->snd_high, tp->snd_una)) 1936 if (SEQ_LT(tp->snd_high, tp->snd_una))
1937 tp->snd_high = tp->snd_una; 1937 tp->snd_high = tp->snd_una;
1938 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1938 TCP_TIMER_DISARM(tp, TCPT_REXMT);
1939 1939
1940 if ((tiflags & TH_ECE) && tcp_do_ecn) { 1940 if ((tiflags & TH_ECE) && tcp_do_ecn) {
1941 tp->t_flags |= TF_ECN_PERMIT; 1941 tp->t_flags |= TF_ECN_PERMIT;
1942 TCP_STATINC(TCP_STAT_ECN_SHS); 1942 TCP_STATINC(TCP_STAT_ECN_SHS);
1943 } 1943 }
1944 1944
1945 } 1945 }
1946 tp->irs = th->th_seq; 1946 tp->irs = th->th_seq;
1947 tcp_rcvseqinit(tp); 1947 tcp_rcvseqinit(tp);
1948 tp->t_flags |= TF_ACKNOW; 1948 tp->t_flags |= TF_ACKNOW;
1949 tcp_mss_from_peer(tp, opti.maxseg); 1949 tcp_mss_from_peer(tp, opti.maxseg);
1950 1950
1951 /* 1951 /*
1952 * Initialize the initial congestion window. If we 1952 * Initialize the initial congestion window. If we
1953 * had to retransmit the SYN, we must initialize cwnd 1953 * had to retransmit the SYN, we must initialize cwnd
1954 * to 1 segment (i.e. the Loss Window). 1954 * to 1 segment (i.e. the Loss Window).
1955 */ 1955 */
1956 if (tp->t_flags & TF_SYN_REXMT) 1956 if (tp->t_flags & TF_SYN_REXMT)
1957 tp->snd_cwnd = tp->t_peermss; 1957 tp->snd_cwnd = tp->t_peermss;
1958 else { 1958 else {
1959 int ss = tcp_init_win; 1959 int ss = tcp_init_win;
1960#ifdef INET 1960#ifdef INET
1961 if (inp != NULL && in_localaddr(inp->inp_faddr)) 1961 if (inp != NULL && in_localaddr(inp->inp_faddr))
1962 ss = tcp_init_win_local; 1962 ss = tcp_init_win_local;
1963#endif 1963#endif
1964#ifdef INET6 1964#ifdef INET6
1965 if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr)) 1965 if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
1966 ss = tcp_init_win_local; 1966 ss = tcp_init_win_local;
1967#endif 1967#endif
1968 tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss); 1968 tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
1969 } 1969 }
1970 1970
1971 tcp_rmx_rtt(tp); 1971 tcp_rmx_rtt(tp);
1972 if (tiflags & TH_ACK) { 1972 if (tiflags & TH_ACK) {
1973 TCP_STATINC(TCP_STAT_CONNECTS); 1973 TCP_STATINC(TCP_STAT_CONNECTS);
1974 soisconnected(so); 1974 soisconnected(so);
1975 tcp_established(tp); 1975 tcp_established(tp);
1976 /* Do window scaling on this connection? */ 1976 /* Do window scaling on this connection? */
1977 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1977 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1978 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1978 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1979 tp->snd_scale = tp->requested_s_scale; 1979 tp->snd_scale = tp->requested_s_scale;
1980 tp->rcv_scale = tp->request_r_scale; 1980 tp->rcv_scale = tp->request_r_scale;
1981 } 1981 }
1982 TCP_REASS_LOCK(tp); 1982 TCP_REASS_LOCK(tp);
1983 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen); 1983 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
1984 TCP_REASS_UNLOCK(tp); 1984 TCP_REASS_UNLOCK(tp);
1985 /* 1985 /*
1986 * if we didn't have to retransmit the SYN, 1986 * if we didn't have to retransmit the SYN,
1987 * use its rtt as our initial srtt & rtt var. 1987 * use its rtt as our initial srtt & rtt var.
1988 */ 1988 */
1989 if (tp->t_rtttime) 1989 if (tp->t_rtttime)
1990 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 1990 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
1991 } else 1991 } else
1992 tp->t_state = TCPS_SYN_RECEIVED; 1992 tp->t_state = TCPS_SYN_RECEIVED;
1993 1993
1994 /* 1994 /*
1995 * Advance th->th_seq to correspond to first data byte. 1995 * Advance th->th_seq to correspond to first data byte.
1996 * If data, trim to stay within window, 1996 * If data, trim to stay within window,
1997 * dropping FIN if necessary. 1997 * dropping FIN if necessary.
1998 */ 1998 */
1999 th->th_seq++; 1999 th->th_seq++;
2000 if (tlen > tp->rcv_wnd) { 2000 if (tlen > tp->rcv_wnd) {
2001 todrop = tlen - tp->rcv_wnd; 2001 todrop = tlen - tp->rcv_wnd;
2002 m_adj(m, -todrop); 2002 m_adj(m, -todrop);
2003 tlen = tp->rcv_wnd; 2003 tlen = tp->rcv_wnd;
2004 tiflags &= ~TH_FIN; 2004 tiflags &= ~TH_FIN;
2005 tcps = TCP_STAT_GETREF(); 2005 tcps = TCP_STAT_GETREF();
2006 tcps[TCP_STAT_RCVPACKAFTERWIN]++; 2006 tcps[TCP_STAT_RCVPACKAFTERWIN]++;
2007 tcps[TCP_STAT_RCVBYTEAFTERWIN] += todrop; 2007 tcps[TCP_STAT_RCVBYTEAFTERWIN] += todrop;
2008 TCP_STAT_PUTREF(); 2008 TCP_STAT_PUTREF();
2009 } 2009 }
2010 tp->snd_wl1 = th->th_seq - 1; 2010 tp->snd_wl1 = th->th_seq - 1;
2011 tp->rcv_up = th->th_seq; 2011 tp->rcv_up = th->th_seq;
2012 goto step6; 2012 goto step6;
2013 2013
2014 /* 2014 /*
2015 * If the state is SYN_RECEIVED: 2015 * If the state is SYN_RECEIVED:
2016 * If seg contains an ACK, but not for our SYN, drop the input 2016 * If seg contains an ACK, but not for our SYN, drop the input
2017 * and generate an RST. See page 36, rfc793 2017 * and generate an RST. See page 36, rfc793
2018 */ 2018 */
2019 case TCPS_SYN_RECEIVED: 2019 case TCPS_SYN_RECEIVED:
2020 if ((tiflags & TH_ACK) && 2020 if ((tiflags & TH_ACK) &&
2021 (SEQ_LEQ(th->th_ack, tp->iss) || 2021 (SEQ_LEQ(th->th_ack, tp->iss) ||
2022 SEQ_GT(th->th_ack, tp->snd_max))) 2022 SEQ_GT(th->th_ack, tp->snd_max)))
2023 goto dropwithreset; 2023 goto dropwithreset;
2024 break; 2024 break;
2025 } 2025 }
2026 2026
2027 /* 2027 /*
2028 * States other than LISTEN or SYN_SENT. 2028 * States other than LISTEN or SYN_SENT.
2029 * First check timestamp, if present. 2029 * First check timestamp, if present.
2030 * Then check that at least some bytes of segment are within 2030 * Then check that at least some bytes of segment are within
2031 * receive window. If segment begins before rcv_nxt, 2031 * receive window. If segment begins before rcv_nxt,
2032 * drop leading data (and SYN); if nothing left, just ack. 2032 * drop leading data (and SYN); if nothing left, just ack.
2033 * 2033 *
2034 * RFC 1323 PAWS: If we have a timestamp reply on this segment 2034 * RFC 1323 PAWS: If we have a timestamp reply on this segment
2035 * and it's less than ts_recent, drop it. 2035 * and it's less than ts_recent, drop it.
2036 */ 2036 */
2037 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 2037 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
2038 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 2038 TSTMP_LT(opti.ts_val, tp->ts_recent)) {
2039 2039
2040 /* Check to see if ts_recent is over 24 days old. */ 2040 /* Check to see if ts_recent is over 24 days old. */
2041 if (tcp_now - tp->ts_recent_age > TCP_PAWS_IDLE) { 2041 if (tcp_now - tp->ts_recent_age > TCP_PAWS_IDLE) {
2042 /* 2042 /*
2043 * Invalidate ts_recent. If this segment updates 2043 * Invalidate ts_recent. If this segment updates
2044 * ts_recent, the age will be reset later and ts_recent 2044 * ts_recent, the age will be reset later and ts_recent
2045 * will get a valid value. If it does not, setting 2045 * will get a valid value. If it does not, setting
2046 * ts_recent to zero will at least satisfy the 2046 * ts_recent to zero will at least satisfy the
2047 * requirement that zero be placed in the timestamp 2047 * requirement that zero be placed in the timestamp
2048 * echo reply when ts_recent isn't valid. The 2048 * echo reply when ts_recent isn't valid. The
2049 * age isn't reset until we get a valid ts_recent 2049 * age isn't reset until we get a valid ts_recent
2050 * because we don't want out-of-order segments to be 2050 * because we don't want out-of-order segments to be
2051 * dropped when ts_recent is old. 2051 * dropped when ts_recent is old.
2052 */ 2052 */
2053 tp->ts_recent = 0; 2053 tp->ts_recent = 0;
2054 } else { 2054 } else {
2055 tcps = TCP_STAT_GETREF(); 2055 tcps = TCP_STAT_GETREF();
2056 tcps[TCP_STAT_RCVDUPPACK]++; 2056 tcps[TCP_STAT_RCVDUPPACK]++;
2057 tcps[TCP_STAT_RCVDUPBYTE] += tlen; 2057 tcps[TCP_STAT_RCVDUPBYTE] += tlen;
2058 tcps[TCP_STAT_PAWSDROP]++; 2058 tcps[TCP_STAT_PAWSDROP]++;
2059 TCP_STAT_PUTREF(); 2059 TCP_STAT_PUTREF();
2060 tcp_new_dsack(tp, th->th_seq, tlen); 2060 tcp_new_dsack(tp, th->th_seq, tlen);
2061 goto dropafterack; 2061 goto dropafterack;
2062 } 2062 }
2063 } 2063 }
2064 2064
2065 todrop = tp->rcv_nxt - th->th_seq; 2065 todrop = tp->rcv_nxt - th->th_seq;
2066 dupseg = false; 2066 dupseg = false;
2067 if (todrop > 0) { 2067 if (todrop > 0) {
2068 if (tiflags & TH_SYN) { 2068 if (tiflags & TH_SYN) {
2069 tiflags &= ~TH_SYN; 2069 tiflags &= ~TH_SYN;
2070 th->th_seq++; 2070 th->th_seq++;
2071 if (th->th_urp > 1) 2071 if (th->th_urp > 1)
2072 th->th_urp--; 2072 th->th_urp--;
2073 else { 2073 else {
2074 tiflags &= ~TH_URG; 2074 tiflags &= ~TH_URG;
2075 th->th_urp = 0; 2075 th->th_urp = 0;
2076 } 2076 }
2077 todrop--; 2077 todrop--;
2078 } 2078 }
2079 if (todrop > tlen || 2079 if (todrop > tlen ||
2080 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 2080 (todrop == tlen && (tiflags & TH_FIN) == 0)) {
2081 /* 2081 /*
2082 * Any valid FIN or RST must be to the left of the 2082 * Any valid FIN or RST must be to the left of the
2083 * window. At this point the FIN or RST must be a 2083 * window. At this point the FIN or RST must be a
2084 * duplicate or out of sequence; drop it. 2084 * duplicate or out of sequence; drop it.
2085 */ 2085 */
2086 if (tiflags & TH_RST) 2086 if (tiflags & TH_RST)
2087 goto drop; 2087 goto drop;
2088 tiflags &= ~(TH_FIN|TH_RST); 2088 tiflags &= ~(TH_FIN|TH_RST);
2089 /* 2089 /*
2090 * Send an ACK to resynchronize and drop any data. 2090 * Send an ACK to resynchronize and drop any data.
2091 * But keep on processing for RST or ACK. 2091 * But keep on processing for RST or ACK.
2092 */ 2092 */
2093 tp->t_flags |= TF_ACKNOW; 2093 tp->t_flags |= TF_ACKNOW;
2094 todrop = tlen; 2094 todrop = tlen;
2095 dupseg = true; 2095 dupseg = true;
2096 tcps = TCP_STAT_GETREF(); 2096 tcps = TCP_STAT_GETREF();
2097 tcps[TCP_STAT_RCVDUPPACK]++; 2097 tcps[TCP_STAT_RCVDUPPACK]++;
2098 tcps[TCP_STAT_RCVDUPBYTE] += todrop; 2098 tcps[TCP_STAT_RCVDUPBYTE] += todrop;
2099 TCP_STAT_PUTREF(); 2099 TCP_STAT_PUTREF();
2100 } else if ((tiflags & TH_RST) && 2100 } else if ((tiflags & TH_RST) &&
2101 th->th_seq != tp->rcv_nxt) { 2101 th->th_seq != tp->rcv_nxt) {
2102 /* 2102 /*
2103 * Test for reset before adjusting the sequence 2103 * Test for reset before adjusting the sequence
2104 * number for overlapping data. 2104 * number for overlapping data.
2105 */ 2105 */
2106 goto dropafterack_ratelim; 2106 goto dropafterack_ratelim;
2107 } else { 2107 } else {
2108 tcps = TCP_STAT_GETREF(); 2108 tcps = TCP_STAT_GETREF();
2109 tcps[TCP_STAT_RCVPARTDUPPACK]++; 2109 tcps[TCP_STAT_RCVPARTDUPPACK]++;
2110 tcps[TCP_STAT_RCVPARTDUPBYTE] += todrop; 2110 tcps[TCP_STAT_RCVPARTDUPBYTE] += todrop;
2111 TCP_STAT_PUTREF(); 2111 TCP_STAT_PUTREF();
2112 } 2112 }
2113 tcp_new_dsack(tp, th->th_seq, todrop); 2113 tcp_new_dsack(tp, th->th_seq, todrop);
2114 hdroptlen += todrop; /*drop from head afterwards*/ 2114 hdroptlen += todrop; /*drop from head afterwards*/
2115 th->th_seq += todrop; 2115 th->th_seq += todrop;
2116 tlen -= todrop; 2116 tlen -= todrop;
2117 if (th->th_urp > todrop) 2117 if (th->th_urp > todrop)
2118 th->th_urp -= todrop; 2118 th->th_urp -= todrop;
2119 else { 2119 else {
2120 tiflags &= ~TH_URG; 2120 tiflags &= ~TH_URG;
2121 th->th_urp = 0; 2121 th->th_urp = 0;
2122 } 2122 }
2123 } 2123 }
2124 2124
2125 /* 2125 /*
2126 * If new data are received on a connection after the 2126 * If new data are received on a connection after the
2127 * user processes are gone, then RST the other end. 2127 * user processes are gone, then RST the other end.
2128 */ 2128 */
2129 if ((so->so_state & SS_NOFDREF) && 2129 if ((so->so_state & SS_NOFDREF) &&
2130 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 2130 tp->t_state > TCPS_CLOSE_WAIT && tlen) {
2131 tp = tcp_close(tp); 2131 tp = tcp_close(tp);
2132 TCP_STATINC(TCP_STAT_RCVAFTERCLOSE); 2132 TCP_STATINC(TCP_STAT_RCVAFTERCLOSE);
2133 goto dropwithreset; 2133 goto dropwithreset;
2134 } 2134 }
2135 2135
2136 /* 2136 /*
2137 * If segment ends after window, drop trailing data 2137 * If segment ends after window, drop trailing data
2138 * (and PUSH and FIN); if nothing left, just ACK. 2138 * (and PUSH and FIN); if nothing left, just ACK.
2139 */ 2139 */
2140 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 2140 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
2141 if (todrop > 0) { 2141 if (todrop > 0) {
2142 TCP_STATINC(TCP_STAT_RCVPACKAFTERWIN); 2142 TCP_STATINC(TCP_STAT_RCVPACKAFTERWIN);
2143 if (todrop >= tlen) { 2143 if (todrop >= tlen) {
2144 /* 2144 /*
2145 * The segment actually starts after the window. 2145 * The segment actually starts after the window.
2146 * th->th_seq + tlen - tp->rcv_nxt - tp->rcv_wnd >= tlen 2146 * th->th_seq + tlen - tp->rcv_nxt - tp->rcv_wnd >= tlen
2147 * th->th_seq - tp->rcv_nxt - tp->rcv_wnd >= 0 2147 * th->th_seq - tp->rcv_nxt - tp->rcv_wnd >= 0
2148 * th->th_seq >= tp->rcv_nxt + tp->rcv_wnd 2148 * th->th_seq >= tp->rcv_nxt + tp->rcv_wnd
2149 */ 2149 */
2150 TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, tlen); 2150 TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, tlen);
2151 /* 2151 /*
2152 * If a new connection request is received 2152 * If a new connection request is received
2153 * while in TIME_WAIT, drop the old connection 2153 * while in TIME_WAIT, drop the old connection
2154 * and start over if the sequence numbers 2154 * and start over if the sequence numbers
2155 * are above the previous ones. 2155 * are above the previous ones.
2156 * 2156 *
2157 * NOTE: We will checksum the packet again, and 2157 * NOTE: We will checksum the packet again, and
2158 * so we need to put the header fields back into 2158 * so we need to put the header fields back into
2159 * network order! 2159 * network order!
2160 * XXX This kind of sucks, but we don't expect 2160 * XXX This kind of sucks, but we don't expect
2161 * XXX this to happen very often, so maybe it 2161 * XXX this to happen very often, so maybe it
2162 * XXX doesn't matter so much. 2162 * XXX doesn't matter so much.
2163 */ 2163 */
2164 if (tiflags & TH_SYN && 2164 if (tiflags & TH_SYN &&
2165 tp->t_state == TCPS_TIME_WAIT && 2165 tp->t_state == TCPS_TIME_WAIT &&
2166 SEQ_GT(th->th_seq, tp->rcv_nxt)) { 2166 SEQ_GT(th->th_seq, tp->rcv_nxt)) {
2167 tp = tcp_close(tp); 2167 tp = tcp_close(tp);
2168 tcp_fields_to_net(th); 2168 tcp_fields_to_net(th);
2169 goto findpcb; 2169 goto findpcb;
2170 } 2170 }
2171 /* 2171 /*
2172 * If window is closed can only take segments at 2172 * If window is closed can only take segments at
2173 * window edge, and have to drop data and PUSH from 2173 * window edge, and have to drop data and PUSH from
2174 * incoming segments. Continue processing, but 2174 * incoming segments. Continue processing, but
2175 * remember to ack. Otherwise, drop segment 2175 * remember to ack. Otherwise, drop segment
2176 * and (if not RST) ack. 2176 * and (if not RST) ack.
2177 */ 2177 */
2178 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 2178 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
2179 tp->t_flags |= TF_ACKNOW; 2179 tp->t_flags |= TF_ACKNOW;
2180 TCP_STATINC(TCP_STAT_RCVWINPROBE); 2180 TCP_STATINC(TCP_STAT_RCVWINPROBE);
2181 } else 2181 } else
2182 goto dropafterack; 2182 goto dropafterack;
2183 } else 2183 } else
2184 TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, todrop); 2184 TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, todrop);
2185 m_adj(m, -todrop); 2185 m_adj(m, -todrop);
2186 tlen -= todrop; 2186 tlen -= todrop;
2187 tiflags &= ~(TH_PUSH|TH_FIN); 2187 tiflags &= ~(TH_PUSH|TH_FIN);
2188 } 2188 }
2189 2189
2190 /* 2190 /*
2191 * If last ACK falls within this segment's sequence numbers, 2191 * If last ACK falls within this segment's sequence numbers,
2192 * record the timestamp. 2192 * record the timestamp.
2193 * NOTE:  2193 * NOTE:
2194 * 1) That the test incorporates suggestions from the latest 2194 * 1) That the test incorporates suggestions from the latest
2195 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 2195 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2196 * 2) That updating only on newer timestamps interferes with 2196 * 2) That updating only on newer timestamps interferes with
2197 * our earlier PAWS tests, so this check should be solely 2197 * our earlier PAWS tests, so this check should be solely
2198 * predicated on the sequence space of this segment. 2198 * predicated on the sequence space of this segment.
2199 * 3) That we modify the segment boundary check to be  2199 * 3) That we modify the segment boundary check to be
2200 * Last.ACK.Sent <= SEG.SEQ + SEG.Len  2200 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
2201 * instead of RFC1323's 2201 * instead of RFC1323's
2202 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 2202 * Last.ACK.Sent < SEG.SEQ + SEG.Len,
2203 * This modified check allows us to overcome RFC1323's 2203 * This modified check allows us to overcome RFC1323's
2204 * limitations as described in Stevens TCP/IP Illustrated 2204 * limitations as described in Stevens TCP/IP Illustrated
2205 * Vol. 2 p.869. In such cases, we can still calculate the 2205 * Vol. 2 p.869. In such cases, we can still calculate the
2206 * RTT correctly when RCV.NXT == Last.ACK.Sent. 2206 * RTT correctly when RCV.NXT == Last.ACK.Sent.
2207 */ 2207 */
2208 if (opti.ts_present && 2208 if (opti.ts_present &&
2209 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 2209 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
2210 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 2210 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
2211 ((tiflags & (TH_SYN|TH_FIN)) != 0))) { 2211 ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
2212 tp->ts_recent_age = tcp_now; 2212 tp->ts_recent_age = tcp_now;
2213 tp->ts_recent = opti.ts_val; 2213 tp->ts_recent = opti.ts_val;
2214 } 2214 }
2215 2215
2216 /* 2216 /*
2217 * If the RST bit is set examine the state: 2217 * If the RST bit is set examine the state:
2218 * SYN_RECEIVED STATE: 2218 * SYN_RECEIVED STATE:
2219 * If passive open, return to LISTEN state. 2219 * If passive open, return to LISTEN state.
2220 * If active open, inform user that connection was refused. 2220 * If active open, inform user that connection was refused.
2221 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 2221 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
2222 * Inform user that connection was reset, and close tcb. 2222 * Inform user that connection was reset, and close tcb.
2223 * CLOSING, LAST_ACK, TIME_WAIT STATES 2223 * CLOSING, LAST_ACK, TIME_WAIT STATES
2224 * Close the tcb. 2224 * Close the tcb.
2225 */ 2225 */
2226 if (tiflags & TH_RST) { 2226 if (tiflags & TH_RST) {
2227 if (th->th_seq != tp->rcv_nxt) 2227 if (th->th_seq != tp->rcv_nxt)
2228 goto dropafterack_ratelim; 2228 goto dropafterack_ratelim;
2229 2229
2230 switch (tp->t_state) { 2230 switch (tp->t_state) {
2231 case TCPS_SYN_RECEIVED: 2231 case TCPS_SYN_RECEIVED:
2232 so->so_error = ECONNREFUSED; 2232 so->so_error = ECONNREFUSED;
2233 goto close; 2233 goto close;
2234 2234
2235 case TCPS_ESTABLISHED: 2235 case TCPS_ESTABLISHED:
2236 case TCPS_FIN_WAIT_1: 2236 case TCPS_FIN_WAIT_1:
2237 case TCPS_FIN_WAIT_2: 2237 case TCPS_FIN_WAIT_2:
2238 case TCPS_CLOSE_WAIT: 2238 case TCPS_CLOSE_WAIT:
2239 so->so_error = ECONNRESET; 2239 so->so_error = ECONNRESET;
2240 close: 2240 close:
2241 tp->t_state = TCPS_CLOSED; 2241 tp->t_state = TCPS_CLOSED;
2242 TCP_STATINC(TCP_STAT_DROPS); 2242 TCP_STATINC(TCP_STAT_DROPS);
2243 tp = tcp_close(tp); 2243 tp = tcp_close(tp);
2244 goto drop; 2244 goto drop;
2245 2245
2246 case TCPS_CLOSING: 2246 case TCPS_CLOSING:
2247 case TCPS_LAST_ACK: 2247 case TCPS_LAST_ACK:
2248 case TCPS_TIME_WAIT: 2248 case TCPS_TIME_WAIT:
2249 tp = tcp_close(tp); 2249 tp = tcp_close(tp);
2250 goto drop; 2250 goto drop;
2251 } 2251 }
2252 } 2252 }
2253 2253
2254 /* 2254 /*
2255 * Since we've covered the SYN-SENT and SYN-RECEIVED states above 2255 * Since we've covered the SYN-SENT and SYN-RECEIVED states above
2256 * we must be in a synchronized state. RFC793 states (under RST 2256 * we must be in a synchronized state. RFC793 states (under RST
2257 * generation) that any unacceptable segment (an out-of-order SYN 2257 * generation) that any unacceptable segment (an out-of-order SYN
2258 * qualifies) received in a synchronized state must elicit only an 2258 * qualifies) received in a synchronized state must elicit only an
2259 * empty acknowledgment segment ... and the connection remains in 2259 * empty acknowledgment segment ... and the connection remains in
2260 * the same state. 2260 * the same state.
2261 */ 2261 */
2262 if (tiflags & TH_SYN) { 2262 if (tiflags & TH_SYN) {
2263 if (tp->rcv_nxt == th->th_seq) { 2263 if (tp->rcv_nxt == th->th_seq) {
2264 tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack - 1, 2264 tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack - 1,
2265 TH_ACK); 2265 TH_ACK);
2266 if (tcp_saveti) 2266 if (tcp_saveti)
2267 m_freem(tcp_saveti); 2267 m_freem(tcp_saveti);
2268 return; 2268 return;
2269 } 2269 }
2270 2270
2271 goto dropafterack_ratelim; 2271 goto dropafterack_ratelim;
2272 } 2272 }
2273 2273
2274 /* 2274 /*
2275 * If the ACK bit is off we drop the segment and return. 2275 * If the ACK bit is off we drop the segment and return.
2276 */ 2276 */
2277 if ((tiflags & TH_ACK) == 0) { 2277 if ((tiflags & TH_ACK) == 0) {
2278 if (tp->t_flags & TF_ACKNOW) 2278 if (tp->t_flags & TF_ACKNOW)
2279 goto dropafterack; 2279 goto dropafterack;
2280 else 2280 else
2281 goto drop; 2281 goto drop;
2282 } 2282 }
2283 2283
2284 /* 2284 /*
2285 * Ack processing. 2285 * Ack processing.
2286 */ 2286 */
2287 switch (tp->t_state) { 2287 switch (tp->t_state) {
2288 2288
2289 /* 2289 /*
2290 * In SYN_RECEIVED state if the ack ACKs our SYN then enter 2290 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
2291 * ESTABLISHED state and continue processing, otherwise 2291 * ESTABLISHED state and continue processing, otherwise
2292 * send an RST. 2292 * send an RST.
2293 */ 2293 */
2294 case TCPS_SYN_RECEIVED: 2294 case TCPS_SYN_RECEIVED:
2295 if (SEQ_GT(tp->snd_una, th->th_ack) || 2295 if (SEQ_GT(tp->snd_una, th->th_ack) ||
2296 SEQ_GT(th->th_ack, tp->snd_max)) 2296 SEQ_GT(th->th_ack, tp->snd_max))
2297 goto dropwithreset; 2297 goto dropwithreset;
2298 TCP_STATINC(TCP_STAT_CONNECTS); 2298 TCP_STATINC(TCP_STAT_CONNECTS);
2299 soisconnected(so); 2299 soisconnected(so);
2300 tcp_established(tp); 2300 tcp_established(tp);
2301 /* Do window scaling? */ 2301 /* Do window scaling? */
2302 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2302 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2303 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2303 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2304 tp->snd_scale = tp->requested_s_scale; 2304 tp->snd_scale = tp->requested_s_scale;
2305 tp->rcv_scale = tp->request_r_scale; 2305 tp->rcv_scale = tp->request_r_scale;
2306 } 2306 }
2307 TCP_REASS_LOCK(tp); 2307 TCP_REASS_LOCK(tp);
2308 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen); 2308 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
2309 TCP_REASS_UNLOCK(tp); 2309 TCP_REASS_UNLOCK(tp);
2310 tp->snd_wl1 = th->th_seq - 1; 2310 tp->snd_wl1 = th->th_seq - 1;
2311 /* fall into ... */ 2311 /* fall into ... */
2312 2312
2313 /* 2313 /*
2314 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 2314 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
2315 * ACKs. If the ack is in the range 2315 * ACKs. If the ack is in the range
2316 * tp->snd_una < th->th_ack <= tp->snd_max 2316 * tp->snd_una < th->th_ack <= tp->snd_max
2317 * then advance tp->snd_una to th->th_ack and drop 2317 * then advance tp->snd_una to th->th_ack and drop
2318 * data from the retransmission queue. If this ACK reflects 2318 * data from the retransmission queue. If this ACK reflects
2319 * more up to date window information we update our window information. 2319 * more up to date window information we update our window information.
2320 */ 2320 */
2321 case TCPS_ESTABLISHED: 2321 case TCPS_ESTABLISHED:
2322 case TCPS_FIN_WAIT_1: 2322 case TCPS_FIN_WAIT_1:
2323 case TCPS_FIN_WAIT_2: 2323 case TCPS_FIN_WAIT_2:
2324 case TCPS_CLOSE_WAIT: 2324 case TCPS_CLOSE_WAIT:
2325 case TCPS_CLOSING: 2325 case TCPS_CLOSING:
2326 case TCPS_LAST_ACK: 2326 case TCPS_LAST_ACK:
2327 case TCPS_TIME_WAIT: 2327 case TCPS_TIME_WAIT:
2328 2328
2329 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 2329 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
2330 if (tlen == 0 && !dupseg && tiwin == tp->snd_wnd) { 2330 if (tlen == 0 && !dupseg && tiwin == tp->snd_wnd) {
2331 TCP_STATINC(TCP_STAT_RCVDUPPACK); 2331 TCP_STATINC(TCP_STAT_RCVDUPPACK);
2332 /* 2332 /*
2333 * If we have outstanding data (other than 2333 * If we have outstanding data (other than
2334 * a window probe), this is a completely 2334 * a window probe), this is a completely
2335 * duplicate ack (ie, window info didn't 2335 * duplicate ack (ie, window info didn't
2336 * change), the ack is the biggest we've 2336 * change), the ack is the biggest we've
2337 * seen and we've seen exactly our rexmt 2337 * seen and we've seen exactly our rexmt
2338 * threshold of them, assume a packet 2338 * threshold of them, assume a packet
2339 * has been dropped and retransmit it. 2339 * has been dropped and retransmit it.
2340 * Kludge snd_nxt & the congestion 2340 * Kludge snd_nxt & the congestion
2341 * window so we send only this one 2341 * window so we send only this one
2342 * packet. 2342 * packet.
2343 */ 2343 */
2344 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 || 2344 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 ||
2345 th->th_ack != tp->snd_una) 2345 th->th_ack != tp->snd_una)
2346 tp->t_dupacks = 0; 2346 tp->t_dupacks = 0;
2347 else if (tp->t_partialacks < 0 && 2347 else if (tp->t_partialacks < 0 &&
2348 (++tp->t_dupacks == tcprexmtthresh || 2348 (++tp->t_dupacks == tcprexmtthresh ||
2349 TCP_FACK_FASTRECOV(tp))) { 2349 TCP_FACK_FASTRECOV(tp))) {
2350 /* 2350 /*
2351 * Do the fast retransmit, and adjust 2351 * Do the fast retransmit, and adjust
2352 * congestion control parameters. 2352 * congestion control parameters.
2353 */ 2353 */
2354 if (tp->t_congctl->fast_retransmit(tp, th)) { 2354 if (tp->t_congctl->fast_retransmit(tp, th)) {
2355 /* False fast retransmit */ 2355 /* False fast retransmit */
2356 break; 2356 break;
2357 } else 2357 } else
2358 goto drop; 2358 goto drop;
2359 } else if (tp->t_dupacks > tcprexmtthresh) { 2359 } else if (tp->t_dupacks > tcprexmtthresh) {
2360 tp->snd_cwnd += tp->t_segsz; 2360 tp->snd_cwnd += tp->t_segsz;
2361 (void) tcp_output(tp); 2361 (void) tcp_output(tp);
2362 goto drop; 2362 goto drop;
2363 } 2363 }
2364 } else { 2364 } else {
2365 /* 2365 /*
2366 * If the ack appears to be very old, only 2366 * If the ack appears to be very old, only
2367 * allow data that is in-sequence. This 2367 * allow data that is in-sequence. This
2368 * makes it somewhat more difficult to insert 2368 * makes it somewhat more difficult to insert
2369 * forged data by guessing sequence numbers. 2369 * forged data by guessing sequence numbers.
2370 * Send an ack to try to update the send 2370 * Send an ack to try to update the send
2371 * sequence number on the other side. 2371 * sequence number on the other side.
2372 */ 2372 */
2373 if (tlen && th->th_seq != tp->rcv_nxt && 2373 if (tlen && th->th_seq != tp->rcv_nxt &&
2374 SEQ_LT(th->th_ack, 2374 SEQ_LT(th->th_ack,
2375 tp->snd_una - tp->max_sndwnd)) 2375 tp->snd_una - tp->max_sndwnd))
2376 goto dropafterack; 2376 goto dropafterack;
2377 } 2377 }
2378 break; 2378 break;
2379 } 2379 }
2380 /* 2380 /*
2381 * If the congestion window was inflated to account 2381 * If the congestion window was inflated to account
2382 * for the other side's cached packets, retract it. 2382 * for the other side's cached packets, retract it.
2383 */ 2383 */
2384 /* XXX: make SACK have his own congestion control 2384 /* XXX: make SACK have his own congestion control
2385 * struct -- rpaulo */ 2385 * struct -- rpaulo */
2386 if (TCP_SACK_ENABLED(tp)) 2386 if (TCP_SACK_ENABLED(tp))
2387 tcp_sack_newack(tp, th); 2387 tcp_sack_newack(tp, th);
2388 else 2388 else
2389 tp->t_congctl->fast_retransmit_newack(tp, th); 2389 tp->t_congctl->fast_retransmit_newack(tp, th);
2390 if (SEQ_GT(th->th_ack, tp->snd_max)) { 2390 if (SEQ_GT(th->th_ack, tp->snd_max)) {
2391 TCP_STATINC(TCP_STAT_RCVACKTOOMUCH); 2391 TCP_STATINC(TCP_STAT_RCVACKTOOMUCH);
2392 goto dropafterack; 2392 goto dropafterack;
2393 } 2393 }
2394 acked = th->th_ack - tp->snd_una; 2394 acked = th->th_ack - tp->snd_una;
2395 tcps = TCP_STAT_GETREF(); 2395 tcps = TCP_STAT_GETREF();
2396 tcps[TCP_STAT_RCVACKPACK]++; 2396 tcps[TCP_STAT_RCVACKPACK]++;
2397 tcps[TCP_STAT_RCVACKBYTE] += acked; 2397 tcps[TCP_STAT_RCVACKBYTE] += acked;
2398 TCP_STAT_PUTREF(); 2398 TCP_STAT_PUTREF();
2399 2399
2400 /* 2400 /*
2401 * If we have a timestamp reply, update smoothed 2401 * If we have a timestamp reply, update smoothed
2402 * round trip time. If no timestamp is present but 2402 * round trip time. If no timestamp is present but
2403 * transmit timer is running and timed sequence 2403 * transmit timer is running and timed sequence
2404 * number was acked, update smoothed round trip time. 2404 * number was acked, update smoothed round trip time.
2405 * Since we now have an rtt measurement, cancel the 2405 * Since we now have an rtt measurement, cancel the
2406 * timer backoff (cf., Phil Karn's retransmit alg.). 2406 * timer backoff (cf., Phil Karn's retransmit alg.).
2407 * Recompute the initial retransmit timer. 2407 * Recompute the initial retransmit timer.
2408 */ 2408 */
2409 if (ts_rtt) 2409 if (ts_rtt)
2410 tcp_xmit_timer(tp, ts_rtt); 2410 tcp_xmit_timer(tp, ts_rtt);
2411 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) 2411 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
2412 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 2412 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
2413 2413
2414 /* 2414 /*
2415 * If all outstanding data is acked, stop retransmit 2415 * If all outstanding data is acked, stop retransmit
2416 * timer and remember to restart (more output or persist). 2416 * timer and remember to restart (more output or persist).
2417 * If there is more data to be acked, restart retransmit 2417 * If there is more data to be acked, restart retransmit
2418 * timer, using current (possibly backed-off) value. 2418 * timer, using current (possibly backed-off) value.
2419 */ 2419 */
2420 if (th->th_ack == tp->snd_max) { 2420 if (th->th_ack == tp->snd_max) {
2421 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2421 TCP_TIMER_DISARM(tp, TCPT_REXMT);
2422 needoutput = 1; 2422 needoutput = 1;
2423 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 2423 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
2424 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 2424 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
2425 2425
2426 /* 2426 /*
2427 * New data has been acked, adjust the congestion window. 2427 * New data has been acked, adjust the congestion window.
2428 */ 2428 */
2429 tp->t_congctl->newack(tp, th); 2429 tp->t_congctl->newack(tp, th);
2430 2430
2431 nd6_hint(tp); 2431 nd6_hint(tp);
2432 if (acked > so->so_snd.sb_cc) { 2432 if (acked > so->so_snd.sb_cc) {
2433 tp->snd_wnd -= so->so_snd.sb_cc; 2433 tp->snd_wnd -= so->so_snd.sb_cc;
2434 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 2434 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
2435 ourfinisacked = 1; 2435 ourfinisacked = 1;
2436 } else { 2436 } else {
2437 if (acked > (tp->t_lastoff - tp->t_inoff)) 2437 if (acked > (tp->t_lastoff - tp->t_inoff))
2438 tp->t_lastm = NULL; 2438 tp->t_lastm = NULL;
2439 sbdrop(&so->so_snd, acked); 2439 sbdrop(&so->so_snd, acked);
2440 tp->t_lastoff -= acked; 2440 tp->t_lastoff -= acked;
2441 tp->snd_wnd -= acked; 2441 if (tp->snd_wnd > acked)
 2442 tp->snd_wnd -= acked;
 2443 else
 2444 tp->snd_wnd = 0;
2442 ourfinisacked = 0; 2445 ourfinisacked = 0;
2443 } 2446 }
2444 sowwakeup(so); 2447 sowwakeup(so);
2445 2448
2446 icmp_check(tp, th, acked); 2449 icmp_check(tp, th, acked);
2447 2450
2448 tp->snd_una = th->th_ack; 2451 tp->snd_una = th->th_ack;
2449 if (SEQ_GT(tp->snd_una, tp->snd_fack)) 2452 if (SEQ_GT(tp->snd_una, tp->snd_fack))
2450 tp->snd_fack = tp->snd_una; 2453 tp->snd_fack = tp->snd_una;
2451 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2454 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2452 tp->snd_nxt = tp->snd_una; 2455 tp->snd_nxt = tp->snd_una;
2453 if (SEQ_LT(tp->snd_high, tp->snd_una)) 2456 if (SEQ_LT(tp->snd_high, tp->snd_una))
2454 tp->snd_high = tp->snd_una; 2457 tp->snd_high = tp->snd_una;
2455 2458
2456 switch (tp->t_state) { 2459 switch (tp->t_state) {
2457 2460
2458 /* 2461 /*
2459 * In FIN_WAIT_1 STATE in addition to the processing 2462 * In FIN_WAIT_1 STATE in addition to the processing
2460 * for the ESTABLISHED state if our FIN is now acknowledged 2463 * for the ESTABLISHED state if our FIN is now acknowledged
2461 * then enter FIN_WAIT_2. 2464 * then enter FIN_WAIT_2.
2462 */ 2465 */
2463 case TCPS_FIN_WAIT_1: 2466 case TCPS_FIN_WAIT_1:
2464 if (ourfinisacked) { 2467 if (ourfinisacked) {
2465 /* 2468 /*
2466 * If we can't receive any more 2469 * If we can't receive any more
2467 * data, then closing user can proceed. 2470 * data, then closing user can proceed.
2468 * Starting the timer is contrary to the 2471 * Starting the timer is contrary to the
2469 * specification, but if we don't get a FIN 2472 * specification, but if we don't get a FIN
2470 * we'll hang forever. 2473 * we'll hang forever.
2471 */ 2474 */
2472 if (so->so_state & SS_CANTRCVMORE) { 2475 if (so->so_state & SS_CANTRCVMORE) {
2473 soisdisconnected(so); 2476 soisdisconnected(so);
2474 if (tp->t_maxidle > 0) 2477 if (tp->t_maxidle > 0)
2475 TCP_TIMER_ARM(tp, TCPT_2MSL, 2478 TCP_TIMER_ARM(tp, TCPT_2MSL,
2476 tp->t_maxidle); 2479 tp->t_maxidle);
2477 } 2480 }
2478 tp->t_state = TCPS_FIN_WAIT_2; 2481 tp->t_state = TCPS_FIN_WAIT_2;
2479 } 2482 }
2480 break; 2483 break;
2481 2484
2482 /* 2485 /*
2483 * In CLOSING STATE in addition to the processing for 2486 * In CLOSING STATE in addition to the processing for
2484 * the ESTABLISHED state if the ACK acknowledges our FIN 2487 * the ESTABLISHED state if the ACK acknowledges our FIN
2485 * then enter the TIME-WAIT state, otherwise ignore 2488 * then enter the TIME-WAIT state, otherwise ignore
2486 * the segment. 2489 * the segment.
2487 */ 2490 */
2488 case TCPS_CLOSING: 2491 case TCPS_CLOSING:
2489 if (ourfinisacked) { 2492 if (ourfinisacked) {
2490 tp->t_state = TCPS_TIME_WAIT; 2493 tp->t_state = TCPS_TIME_WAIT;
2491 tcp_canceltimers(tp); 2494 tcp_canceltimers(tp);
2492 TCP_TIMER_ARM(tp, TCPT_2MSL,  2495 TCP_TIMER_ARM(tp, TCPT_2MSL,
2493 2 * PR_SLOWHZ * tcp_msl); 2496 2 * PR_SLOWHZ * tcp_msl);
2494 soisdisconnected(so); 2497 soisdisconnected(so);
2495 } 2498 }
2496 break; 2499 break;
2497 2500
2498 /* 2501 /*
2499 * In LAST_ACK, we may still be waiting for data to drain 2502 * In LAST_ACK, we may still be waiting for data to drain
2500 * and/or to be acked, as well as for the ack of our FIN. 2503 * and/or to be acked, as well as for the ack of our FIN.
2501 * If our FIN is now acknowledged, delete the TCB, 2504 * If our FIN is now acknowledged, delete the TCB,
2502 * enter the closed state and return. 2505 * enter the closed state and return.
2503 */ 2506 */
2504 case TCPS_LAST_ACK: 2507 case TCPS_LAST_ACK:
2505 if (ourfinisacked) { 2508 if (ourfinisacked) {
2506 tp = tcp_close(tp); 2509 tp = tcp_close(tp);
2507 goto drop; 2510 goto drop;
2508 } 2511 }
2509 break; 2512 break;
2510 2513
2511 /* 2514 /*
2512 * In TIME_WAIT state the only thing that should arrive 2515 * In TIME_WAIT state the only thing that should arrive
2513 * is a retransmission of the remote FIN. Acknowledge 2516 * is a retransmission of the remote FIN. Acknowledge
2514 * it and restart the finack timer. 2517 * it and restart the finack timer.
2515 */ 2518 */
2516 case TCPS_TIME_WAIT: 2519 case TCPS_TIME_WAIT:
2517 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * PR_SLOWHZ * tcp_msl); 2520 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * PR_SLOWHZ * tcp_msl);
2518 goto dropafterack; 2521 goto dropafterack;
2519 } 2522 }
2520 } 2523 }
2521 2524
2522step6: 2525step6:
2523 /* 2526 /*
2524 * Update window information. 2527 * Update window information.
2525 * Don't look at window if no ACK: TAC's send garbage on first SYN. 2528 * Don't look at window if no ACK: TAC's send garbage on first SYN.
2526 */ 2529 */
2527 if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || 2530 if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2528 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 2531 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2529 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 2532 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2530 /* keep track of pure window updates */ 2533 /* keep track of pure window updates */
2531 if (tlen == 0 && 2534 if (tlen == 0 &&
2532 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 2535 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2533 TCP_STATINC(TCP_STAT_RCVWINUPD); 2536 TCP_STATINC(TCP_STAT_RCVWINUPD);
2534 tp->snd_wnd = tiwin; 2537 tp->snd_wnd = tiwin;
2535 tp->snd_wl1 = th->th_seq; 2538 tp->snd_wl1 = th->th_seq;
2536 tp->snd_wl2 = th->th_ack; 2539 tp->snd_wl2 = th->th_ack;
2537 if (tp->snd_wnd > tp->max_sndwnd) 2540 if (tp->snd_wnd > tp->max_sndwnd)
2538 tp->max_sndwnd = tp->snd_wnd; 2541 tp->max_sndwnd = tp->snd_wnd;
2539 needoutput = 1; 2542 needoutput = 1;
2540 } 2543 }
2541 2544
2542 /* 2545 /*
2543 * Process segments with URG. 2546 * Process segments with URG.
2544 */ 2547 */
2545 if ((tiflags & TH_URG) && th->th_urp && 2548 if ((tiflags & TH_URG) && th->th_urp &&
2546 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2549 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2547 /* 2550 /*
2548 * This is a kludge, but if we receive and accept 2551 * This is a kludge, but if we receive and accept
2549 * random urgent pointers, we'll crash in 2552 * random urgent pointers, we'll crash in
2550 * soreceive. It's hard to imagine someone 2553 * soreceive. It's hard to imagine someone
2551 * actually wanting to send this much urgent data. 2554 * actually wanting to send this much urgent data.
2552 */ 2555 */
2553 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 2556 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2554 th->th_urp = 0; /* XXX */ 2557 th->th_urp = 0; /* XXX */
2555 tiflags &= ~TH_URG; /* XXX */ 2558 tiflags &= ~TH_URG; /* XXX */
2556 goto dodata; /* XXX */ 2559 goto dodata; /* XXX */
2557 } 2560 }
2558 /* 2561 /*
2559 * If this segment advances the known urgent pointer, 2562 * If this segment advances the known urgent pointer,
2560 * then mark the data stream. This should not happen 2563 * then mark the data stream. This should not happen
2561 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2564 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2562 * a FIN has been received from the remote side. 2565 * a FIN has been received from the remote side.
2563 * In these states we ignore the URG. 2566 * In these states we ignore the URG.
2564 * 2567 *
2565 * According to RFC961 (Assigned Protocols), 2568 * According to RFC961 (Assigned Protocols),
2566 * the urgent pointer points to the last octet 2569 * the urgent pointer points to the last octet
2567 * of urgent data. We continue, however, 2570 * of urgent data. We continue, however,
2568 * to consider it to indicate the first octet 2571 * to consider it to indicate the first octet
2569 * of data past the urgent section as the original 2572 * of data past the urgent section as the original
2570 * spec states (in one of two places). 2573 * spec states (in one of two places).
2571 */ 2574 */
2572 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2575 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2573 tp->rcv_up = th->th_seq + th->th_urp; 2576 tp->rcv_up = th->th_seq + th->th_urp;
2574 so->so_oobmark = so->so_rcv.sb_cc + 2577 so->so_oobmark = so->so_rcv.sb_cc +
2575 (tp->rcv_up - tp->rcv_nxt) - 1; 2578 (tp->rcv_up - tp->rcv_nxt) - 1;
2576 if (so->so_oobmark == 0) 2579 if (so->so_oobmark == 0)
2577 so->so_state |= SS_RCVATMARK; 2580 so->so_state |= SS_RCVATMARK;
2578 sohasoutofband(so); 2581 sohasoutofband(so);
2579 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2582 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2580 } 2583 }
2581 /* 2584 /*
2582 * Remove out of band data so doesn't get presented to user. 2585 * Remove out of band data so doesn't get presented to user.
2583 * This can happen independent of advancing the URG pointer, 2586 * This can happen independent of advancing the URG pointer,
2584 * but if two URG's are pending at once, some out-of-band 2587 * but if two URG's are pending at once, some out-of-band
2585 * data may creep in... ick. 2588 * data may creep in... ick.
2586 */ 2589 */
2587 if (th->th_urp <= (u_int16_t) tlen 2590 if (th->th_urp <= (u_int16_t) tlen
2588#ifdef SO_OOBINLINE 2591#ifdef SO_OOBINLINE
2589 && (so->so_options & SO_OOBINLINE) == 0 2592 && (so->so_options & SO_OOBINLINE) == 0
2590#endif 2593#endif
2591 ) 2594 )
2592 tcp_pulloutofband(so, th, m, hdroptlen); 2595 tcp_pulloutofband(so, th, m, hdroptlen);
2593 } else 2596 } else
2594 /* 2597 /*
2595 * If no out of band data is expected, 2598 * If no out of band data is expected,
2596 * pull receive urgent pointer along 2599 * pull receive urgent pointer along
2597 * with the receive window. 2600 * with the receive window.
2598 */ 2601 */
2599 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2602 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2600 tp->rcv_up = tp->rcv_nxt; 2603 tp->rcv_up = tp->rcv_nxt;
2601dodata: /* XXX */ 2604dodata: /* XXX */
2602 2605
2603 /* 2606 /*
2604 * Process the segment text, merging it into the TCP sequencing queue, 2607 * Process the segment text, merging it into the TCP sequencing queue,
2605 * and arranging for acknowledgement of receipt if necessary. 2608 * and arranging for acknowledgement of receipt if necessary.
2606 * This process logically involves adjusting tp->rcv_wnd as data 2609 * This process logically involves adjusting tp->rcv_wnd as data
2607 * is presented to the user (this happens in tcp_usrreq.c, 2610 * is presented to the user (this happens in tcp_usrreq.c,
2608 * case PRU_RCVD). If a FIN has already been received on this 2611 * case PRU_RCVD). If a FIN has already been received on this
2609 * connection then we just ignore the text. 2612 * connection then we just ignore the text.
2610 */ 2613 */
2611 if ((tlen || (tiflags & TH_FIN)) && 2614 if ((tlen || (tiflags & TH_FIN)) &&
2612 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2615 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2613 /* 2616 /*
2614 * Insert segment ti into reassembly queue of tcp with 2617 * Insert segment ti into reassembly queue of tcp with
2615 * control block tp. Return TH_FIN if reassembly now includes 2618 * control block tp. Return TH_FIN if reassembly now includes
2616 * a segment with FIN. The macro form does the common case 2619 * a segment with FIN. The macro form does the common case
2617 * inline (segment is the next to be received on an 2620 * inline (segment is the next to be received on an
2618 * established connection, and the queue is empty), 2621 * established connection, and the queue is empty),
2619 * avoiding linkage into and removal from the queue and 2622 * avoiding linkage into and removal from the queue and
2620 * repetition of various conversions. 2623 * repetition of various conversions.
2621 * Set DELACK for segments received in order, but ack 2624 * Set DELACK for segments received in order, but ack
2622 * immediately when segments are out of order 2625 * immediately when segments are out of order
2623 * (so fast retransmit can work). 2626 * (so fast retransmit can work).
2624 */ 2627 */
2625 /* NOTE: this was TCP_REASS() macro, but used only once */ 2628 /* NOTE: this was TCP_REASS() macro, but used only once */
2626 TCP_REASS_LOCK(tp); 2629 TCP_REASS_LOCK(tp);
2627 if (th->th_seq == tp->rcv_nxt && 2630 if (th->th_seq == tp->rcv_nxt &&
2628 TAILQ_FIRST(&tp->segq) == NULL && 2631 TAILQ_FIRST(&tp->segq) == NULL &&
2629 tp->t_state == TCPS_ESTABLISHED) { 2632 tp->t_state == TCPS_ESTABLISHED) {
2630 tcp_setup_ack(tp, th); 2633 tcp_setup_ack(tp, th);
2631 tp->rcv_nxt += tlen; 2634 tp->rcv_nxt += tlen;
2632 tiflags = th->th_flags & TH_FIN; 2635 tiflags = th->th_flags & TH_FIN;
2633 tcps = TCP_STAT_GETREF(); 2636 tcps = TCP_STAT_GETREF();
2634 tcps[TCP_STAT_RCVPACK]++; 2637 tcps[TCP_STAT_RCVPACK]++;
2635 tcps[TCP_STAT_RCVBYTE] += tlen; 2638 tcps[TCP_STAT_RCVBYTE] += tlen;
2636 TCP_STAT_PUTREF(); 2639 TCP_STAT_PUTREF();
2637 nd6_hint(tp); 2640 nd6_hint(tp);
2638 if (so->so_state & SS_CANTRCVMORE) 2641 if (so->so_state & SS_CANTRCVMORE)
2639 m_freem(m); 2642 m_freem(m);
2640 else { 2643 else {
2641 m_adj(m, hdroptlen); 2644 m_adj(m, hdroptlen);
2642 sbappendstream(&(so)->so_rcv, m); 2645 sbappendstream(&(so)->so_rcv, m);
2643 } 2646 }
2644 TCP_REASS_UNLOCK(tp); 2647 TCP_REASS_UNLOCK(tp);
2645 sorwakeup(so); 2648 sorwakeup(so);
2646 } else { 2649 } else {
2647 m_adj(m, hdroptlen); 2650 m_adj(m, hdroptlen);
2648 tiflags = tcp_reass(tp, th, m, &tlen); 2651 tiflags = tcp_reass(tp, th, m, &tlen);
2649 tp->t_flags |= TF_ACKNOW; 2652 tp->t_flags |= TF_ACKNOW;
2650 TCP_REASS_UNLOCK(tp); 2653 TCP_REASS_UNLOCK(tp);
2651 } 2654 }
2652 2655
2653 /* 2656 /*
2654 * Note the amount of data that peer has sent into 2657 * Note the amount of data that peer has sent into
2655 * our window, in order to estimate the sender's 2658 * our window, in order to estimate the sender's
2656 * buffer size. 2659 * buffer size.
2657 */ 2660 */
2658 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2661 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2659 } else { 2662 } else {
2660 m_freem(m); 2663 m_freem(m);
2661 m = NULL; 2664 m = NULL;
2662 tiflags &= ~TH_FIN; 2665 tiflags &= ~TH_FIN;
2663 } 2666 }
2664 2667
2665 /* 2668 /*
2666 * If FIN is received ACK the FIN and let the user know 2669 * If FIN is received ACK the FIN and let the user know
2667 * that the connection is closing. Ignore a FIN received before 2670 * that the connection is closing. Ignore a FIN received before
2668 * the connection is fully established. 2671 * the connection is fully established.
2669 */ 2672 */
2670 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2673 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
2671 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2674 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2672 socantrcvmore(so); 2675 socantrcvmore(so);
2673 tp->t_flags |= TF_ACKNOW; 2676 tp->t_flags |= TF_ACKNOW;
2674 tp->rcv_nxt++; 2677 tp->rcv_nxt++;
2675 } 2678 }
2676 switch (tp->t_state) { 2679 switch (tp->t_state) {
2677 2680
2678 /* 2681 /*
2679 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2682 * In ESTABLISHED STATE enter the CLOSE_WAIT state.
2680 */ 2683 */
2681 case TCPS_ESTABLISHED: 2684 case TCPS_ESTABLISHED:
2682 tp->t_state = TCPS_CLOSE_WAIT; 2685 tp->t_state = TCPS_CLOSE_WAIT;
2683 break; 2686 break;
2684 2687
2685 /* 2688 /*
2686 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2689 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2687 * enter the CLOSING state. 2690 * enter the CLOSING state.
2688 */ 2691 */
2689 case TCPS_FIN_WAIT_1: 2692 case TCPS_FIN_WAIT_1:
2690 tp->t_state = TCPS_CLOSING; 2693 tp->t_state = TCPS_CLOSING;
2691 break; 2694 break;
2692 2695
2693 /* 2696 /*
2694 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2697 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2695 * starting the time-wait timer, turning off the other 2698 * starting the time-wait timer, turning off the other
2696 * standard timers. 2699 * standard timers.
2697 */ 2700 */
2698 case TCPS_FIN_WAIT_2: 2701 case TCPS_FIN_WAIT_2:
2699 tp->t_state = TCPS_TIME_WAIT; 2702 tp->t_state = TCPS_TIME_WAIT;
2700 tcp_canceltimers(tp); 2703 tcp_canceltimers(tp);
2701 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * PR_SLOWHZ * tcp_msl); 2704 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * PR_SLOWHZ * tcp_msl);
2702 soisdisconnected(so); 2705 soisdisconnected(so);
2703 break; 2706 break;
2704 2707
2705 /* 2708 /*
2706 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2709 * In TIME_WAIT state restart the 2 MSL time_wait timer.
2707 */ 2710 */
2708 case TCPS_TIME_WAIT: 2711 case TCPS_TIME_WAIT:
2709 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * PR_SLOWHZ * tcp_msl); 2712 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * PR_SLOWHZ * tcp_msl);
2710 break; 2713 break;
2711 } 2714 }
2712 } 2715 }
2713#ifdef TCP_DEBUG 2716#ifdef TCP_DEBUG
2714 if (so->so_options & SO_DEBUG) 2717 if (so->so_options & SO_DEBUG)
2715 tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0); 2718 tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0);
2716#endif 2719#endif
2717 2720
2718 /* 2721 /*
2719 * Return any desired output. 2722 * Return any desired output.
2720 */ 2723 */
2721 if (needoutput || (tp->t_flags & TF_ACKNOW)) { 2724 if (needoutput || (tp->t_flags & TF_ACKNOW)) {
2722 (void) tcp_output(tp); 2725 (void) tcp_output(tp);
2723 } 2726 }
2724 if (tcp_saveti) 2727 if (tcp_saveti)
2725 m_freem(tcp_saveti); 2728 m_freem(tcp_saveti);
2726 return; 2729 return;
2727 2730
2728badsyn: 2731badsyn:
2729 /* 2732 /*
2730 * Received a bad SYN. Increment counters and dropwithreset. 2733 * Received a bad SYN. Increment counters and dropwithreset.
2731 */ 2734 */
2732 TCP_STATINC(TCP_STAT_BADSYN); 2735 TCP_STATINC(TCP_STAT_BADSYN);
2733 tp = NULL; 2736 tp = NULL;
2734 goto dropwithreset; 2737 goto dropwithreset;
2735 2738
2736dropafterack: 2739dropafterack:
2737 /* 2740 /*
2738 * Generate an ACK dropping incoming segment if it occupies 2741 * Generate an ACK dropping incoming segment if it occupies
2739 * sequence space, where the ACK reflects our state. 2742 * sequence space, where the ACK reflects our state.
2740 */ 2743 */
2741 if (tiflags & TH_RST) 2744 if (tiflags & TH_RST)
2742 goto drop; 2745 goto drop;
2743 goto dropafterack2; 2746 goto dropafterack2;
2744 2747
2745dropafterack_ratelim: 2748dropafterack_ratelim:
2746 /* 2749 /*
2747 * We may want to rate-limit ACKs against SYN/RST attack. 2750 * We may want to rate-limit ACKs against SYN/RST attack.
2748 */ 2751 */
2749 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2752 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
2750 tcp_ackdrop_ppslim) == 0) { 2753 tcp_ackdrop_ppslim) == 0) {
2751 /* XXX stat */ 2754 /* XXX stat */
2752 goto drop; 2755 goto drop;
2753 } 2756 }
2754 /* ...fall into dropafterack2... */ 2757 /* ...fall into dropafterack2... */
2755 2758
2756dropafterack2: 2759dropafterack2:
2757 m_freem(m); 2760 m_freem(m);
2758 tp->t_flags |= TF_ACKNOW; 2761 tp->t_flags |= TF_ACKNOW;
2759 (void) tcp_output(tp); 2762 (void) tcp_output(tp);
2760 if (tcp_saveti) 2763 if (tcp_saveti)
2761 m_freem(tcp_saveti); 2764 m_freem(tcp_saveti);
2762 return; 2765 return;
2763 2766
2764dropwithreset_ratelim: 2767dropwithreset_ratelim:
2765 /* 2768 /*
2766 * We may want to rate-limit RSTs in certain situations, 2769 * We may want to rate-limit RSTs in certain situations,
2767 * particularly if we are sending an RST in response to 2770 * particularly if we are sending an RST in response to
2768 * an attempt to connect to or otherwise communicate with 2771 * an attempt to connect to or otherwise communicate with
2769 * a port for which we have no socket. 2772 * a port for which we have no socket.
2770 */ 2773 */
2771 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2774 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
2772 tcp_rst_ppslim) == 0) { 2775 tcp_rst_ppslim) == 0) {
2773 /* XXX stat */ 2776 /* XXX stat */
2774 goto drop; 2777 goto drop;
2775 } 2778 }
2776 /* ...fall into dropwithreset... */ 2779 /* ...fall into dropwithreset... */
2777 2780
2778dropwithreset: 2781dropwithreset:
2779 /* 2782 /*
2780 * Generate a RST, dropping incoming segment. 2783 * Generate a RST, dropping incoming segment.
2781 * Make ACK acceptable to originator of segment. 2784 * Make ACK acceptable to originator of segment.
2782 */ 2785 */
2783 if (tiflags & TH_RST) 2786 if (tiflags & TH_RST)
2784 goto drop; 2787 goto drop;
2785 2788
2786 switch (af) { 2789 switch (af) {
2787#ifdef INET6 2790#ifdef INET6
2788 case AF_INET6: 2791 case AF_INET6:
2789 /* For following calls to tcp_respond */ 2792 /* For following calls to tcp_respond */
2790 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) 2793 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
2791 goto drop; 2794 goto drop;
2792 break; 2795 break;
2793#endif /* INET6 */ 2796#endif /* INET6 */
2794 case AF_INET: 2797 case AF_INET:
2795 if (IN_MULTICAST(ip->ip_dst.s_addr) || 2798 if (IN_MULTICAST(ip->ip_dst.s_addr) ||
2796 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 2799 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2797 goto drop; 2800 goto drop;
2798 } 2801 }
2799 2802
2800 if (tiflags & TH_ACK) 2803 if (tiflags & TH_ACK)
2801 (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); 2804 (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
2802 else { 2805 else {
2803 if (tiflags & TH_SYN) 2806 if (tiflags & TH_SYN)
2804 tlen++; 2807 tlen++;
2805 (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0, 2808 (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0,
2806 TH_RST|TH_ACK); 2809 TH_RST|TH_ACK);
2807 } 2810 }
2808 if (tcp_saveti) 2811 if (tcp_saveti)
2809 m_freem(tcp_saveti); 2812 m_freem(tcp_saveti);
2810 return; 2813 return;
2811 2814
2812badcsum: 2815badcsum:
2813drop: 2816drop:
2814 /* 2817 /*
2815 * Drop space held by incoming segment and return. 2818 * Drop space held by incoming segment and return.
2816 */ 2819 */
2817 if (tp) { 2820 if (tp) {
2818 if (tp->t_inpcb) 2821 if (tp->t_inpcb)
2819 so = tp->t_inpcb->inp_socket; 2822 so = tp->t_inpcb->inp_socket;
2820#ifdef INET6 2823#ifdef INET6
2821 else if (tp->t_in6pcb) 2824 else if (tp->t_in6pcb)
2822 so = tp->t_in6pcb->in6p_socket; 2825 so = tp->t_in6pcb->in6p_socket;
2823#endif 2826#endif
2824 else 2827 else
2825 so = NULL; 2828 so = NULL;
2826#ifdef TCP_DEBUG 2829#ifdef TCP_DEBUG
2827 if (so && (so->so_options & SO_DEBUG) != 0) 2830 if (so && (so->so_options & SO_DEBUG) != 0)
2828 tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0); 2831 tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0);
2829#endif 2832#endif
2830 } 2833 }
2831 if (tcp_saveti) 2834 if (tcp_saveti)
2832 m_freem(tcp_saveti); 2835 m_freem(tcp_saveti);
2833 m_freem(m); 2836 m_freem(m);
2834 return; 2837 return;
2835} 2838}
2836 2839
2837#ifdef TCP_SIGNATURE 2840#ifdef TCP_SIGNATURE
2838int 2841int
2839tcp_signature_apply(void *fstate, void *data, u_int len) 2842tcp_signature_apply(void *fstate, void *data, u_int len)
2840{ 2843{
2841 2844
2842 MD5Update(fstate, (u_char *)data, len); 2845 MD5Update(fstate, (u_char *)data, len);
2843 return (0); 2846 return (0);
2844} 2847}
2845 2848
2846struct secasvar * 2849struct secasvar *
2847tcp_signature_getsav(struct mbuf *m, struct tcphdr *th) 2850tcp_signature_getsav(struct mbuf *m, struct tcphdr *th)
2848{ 2851{
2849 struct secasvar *sav; 2852 struct secasvar *sav;
2850#ifdef FAST_IPSEC 2853#ifdef FAST_IPSEC
2851 union sockaddr_union dst; 2854 union sockaddr_union dst;
2852#endif 2855#endif
2853 struct ip *ip; 2856 struct ip *ip;
2854 struct ip6_hdr *ip6; 2857 struct ip6_hdr *ip6;
2855 2858
2856 ip = mtod(m, struct ip *); 2859 ip = mtod(m, struct ip *);
2857 switch (ip->ip_v) { 2860 switch (ip->ip_v) {
2858 case 4: 2861 case 4:
2859 ip = mtod(m, struct ip *); 2862 ip = mtod(m, struct ip *);
2860 ip6 = NULL; 2863 ip6 = NULL;
2861 break; 2864 break;
2862 case 6: 2865 case 6:
2863 ip = NULL; 2866 ip = NULL;
2864 ip6 = mtod(m, struct ip6_hdr *); 2867 ip6 = mtod(m, struct ip6_hdr *);
2865 break; 2868 break;
2866 default: 2869 default:
2867 return (NULL); 2870 return (NULL);
2868 } 2871 }
2869 2872
2870#ifdef FAST_IPSEC 2873#ifdef FAST_IPSEC
2871 /* Extract the destination from the IP header in the mbuf. */ 2874 /* Extract the destination from the IP header in the mbuf. */
2872 bzero(&dst, sizeof(union sockaddr_union)); 2875 bzero(&dst, sizeof(union sockaddr_union));
2873 if (ip !=NULL) { 2876 if (ip !=NULL) {
2874 dst.sa.sa_len = sizeof(struct sockaddr_in); 2877 dst.sa.sa_len = sizeof(struct sockaddr_in);
2875 dst.sa.sa_family = AF_INET; 2878 dst.sa.sa_family = AF_INET;
2876 dst.sin.sin_addr = ip->ip_dst; 2879 dst.sin.sin_addr = ip->ip_dst;
2877 } else { 2880 } else {
2878 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2881 dst.sa.sa_len = sizeof(struct sockaddr_in6);
2879 dst.sa.sa_family = AF_INET6; 2882 dst.sa.sa_family = AF_INET6;
2880 dst.sin6.sin6_addr = ip6->ip6_dst; 2883 dst.sin6.sin6_addr = ip6->ip6_dst;
2881 } 2884 }
2882 2885
2883 /* 2886 /*
2884 * Look up an SADB entry which matches the address of the peer. 2887 * Look up an SADB entry which matches the address of the peer.
2885 */ 2888 */
2886 sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); 2889 sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI));
2887#else 2890#else
2888 if (ip) 2891 if (ip)
2889 sav = key_allocsa(AF_INET, (void *)&ip->ip_src, 2892 sav = key_allocsa(AF_INET, (void *)&ip->ip_src,
2890 (void *)&ip->ip_dst, IPPROTO_TCP, 2893 (void *)&ip->ip_dst, IPPROTO_TCP,
2891 htonl(TCP_SIG_SPI), 0, 0); 2894 htonl(TCP_SIG_SPI), 0, 0);
2892 else 2895 else
2893 sav = key_allocsa(AF_INET6, (void *)&ip6->ip6_src, 2896 sav = key_allocsa(AF_INET6, (void *)&ip6->ip6_src,
2894 (void *)&ip6->ip6_dst, IPPROTO_TCP, 2897 (void *)&ip6->ip6_dst, IPPROTO_TCP,
2895 htonl(TCP_SIG_SPI), 0, 0); 2898 htonl(TCP_SIG_SPI), 0, 0);
2896#endif 2899#endif
2897 2900
2898 return (sav); /* freesav must be performed by caller */ 2901 return (sav); /* freesav must be performed by caller */
2899} 2902}
2900 2903
2901int 2904int
2902tcp_signature(struct mbuf *m, struct tcphdr *th, int thoff, 2905tcp_signature(struct mbuf *m, struct tcphdr *th, int thoff,
2903 struct secasvar *sav, char *sig) 2906 struct secasvar *sav, char *sig)
2904{ 2907{
2905 MD5_CTX ctx; 2908 MD5_CTX ctx;
2906 struct ip *ip; 2909 struct ip *ip;
2907 struct ipovly *ipovly; 2910 struct ipovly *ipovly;
2908 struct ip6_hdr *ip6; 2911 struct ip6_hdr *ip6;
2909 struct ippseudo ippseudo; 2912 struct ippseudo ippseudo;
2910 struct ip6_hdr_pseudo ip6pseudo; 2913 struct ip6_hdr_pseudo ip6pseudo;
2911 struct tcphdr th0; 2914 struct tcphdr th0;
2912 int l, tcphdrlen; 2915 int l, tcphdrlen;
2913 2916
2914 if (sav == NULL) 2917 if (sav == NULL)
2915 return (-1); 2918 return (-1);
2916 2919
2917 tcphdrlen = th->th_off * 4; 2920 tcphdrlen = th->th_off * 4;
2918 2921
2919 switch (mtod(m, struct ip *)->ip_v) { 2922 switch (mtod(m, struct ip *)->ip_v) {
2920 case 4: 2923 case 4:
2921 ip = mtod(m, struct ip *); 2924 ip = mtod(m, struct ip *);
2922 ip6 = NULL; 2925 ip6 = NULL;
2923 break; 2926 break;
2924 case 6: 2927 case 6:
2925 ip = NULL; 2928 ip = NULL;
2926 ip6 = mtod(m, struct ip6_hdr *); 2929 ip6 = mtod(m, struct ip6_hdr *);
2927 break; 2930 break;
2928 default: 2931 default:
2929 return (-1); 2932 return (-1);
2930 } 2933 }
2931 2934
2932 MD5Init(&ctx); 2935 MD5Init(&ctx);
2933 2936
2934 if (ip) { 2937 if (ip) {
2935 memset(&ippseudo, 0, sizeof(ippseudo)); 2938 memset(&ippseudo, 0, sizeof(ippseudo));
2936 ipovly = (struct ipovly *)ip; 2939 ipovly = (struct ipovly *)ip;
2937 ippseudo.ippseudo_src = ipovly->ih_src; 2940 ippseudo.ippseudo_src = ipovly->ih_src;
2938 ippseudo.ippseudo_dst = ipovly->ih_dst; 2941 ippseudo.ippseudo_dst = ipovly->ih_dst;
2939 ippseudo.ippseudo_pad = 0; 2942 ippseudo.ippseudo_pad = 0;
2940 ippseudo.ippseudo_p = IPPROTO_TCP; 2943 ippseudo.ippseudo_p = IPPROTO_TCP;
2941 ippseudo.ippseudo_len = htons(m->m_pkthdr.len - thoff); 2944 ippseudo.ippseudo_len = htons(m->m_pkthdr.len - thoff);
2942 MD5Update(&ctx, (char *)&ippseudo, sizeof(ippseudo)); 2945 MD5Update(&ctx, (char *)&ippseudo, sizeof(ippseudo));
2943 } else { 2946 } else {
2944 memset(&ip6pseudo, 0, sizeof(ip6pseudo)); 2947 memset(&ip6pseudo, 0, sizeof(ip6pseudo));
2945 ip6pseudo.ip6ph_src = ip6->ip6_src; 2948 ip6pseudo.ip6ph_src = ip6->ip6_src;
2946 in6_clearscope(&ip6pseudo.ip6ph_src); 2949 in6_clearscope(&ip6pseudo.ip6ph_src);
2947 ip6pseudo.ip6ph_dst = ip6->ip6_dst; 2950 ip6pseudo.ip6ph_dst = ip6->ip6_dst;
2948 in6_clearscope(&ip6pseudo.ip6ph_dst); 2951 in6_clearscope(&ip6pseudo.ip6ph_dst);
2949 ip6pseudo.ip6ph_len = htons(m->m_pkthdr.len - thoff); 2952 ip6pseudo.ip6ph_len = htons(m->m_pkthdr.len - thoff);
2950 ip6pseudo.ip6ph_nxt = IPPROTO_TCP; 2953 ip6pseudo.ip6ph_nxt = IPPROTO_TCP;
2951 MD5Update(&ctx, (char *)&ip6pseudo, sizeof(ip6pseudo)); 2954 MD5Update(&ctx, (char *)&ip6pseudo, sizeof(ip6pseudo));
2952 } 2955 }
2953 2956
2954 th0 = *th; 2957 th0 = *th;
2955 th0.th_sum = 0; 2958 th0.th_sum = 0;
2956 MD5Update(&ctx, (char *)&th0, sizeof(th0)); 2959 MD5Update(&ctx, (char *)&th0, sizeof(th0));
2957 2960
2958 l = m->m_pkthdr.len - thoff - tcphdrlen; 2961 l = m->m_pkthdr.len - thoff - tcphdrlen;
2959 if (l > 0) 2962 if (l > 0)
2960 m_apply(m, thoff + tcphdrlen, 2963 m_apply(m, thoff + tcphdrlen,
2961 m->m_pkthdr.len - thoff - tcphdrlen, 2964 m->m_pkthdr.len - thoff - tcphdrlen,
2962 tcp_signature_apply, &ctx); 2965 tcp_signature_apply, &ctx);
2963 2966
2964 MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth)); 2967 MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth));
2965 MD5Final(sig, &ctx); 2968 MD5Final(sig, &ctx);
2966 2969
2967 return (0); 2970 return (0);
2968} 2971}
2969#endif 2972#endif
2970 2973
2971static int 2974static int
2972tcp_dooptions(struct tcpcb *tp, const u_char *cp, int cnt, 2975tcp_dooptions(struct tcpcb *tp, const u_char *cp, int cnt,
2973 struct tcphdr *th, 2976 struct tcphdr *th,
2974 struct mbuf *m, int toff, struct tcp_opt_info *oi) 2977 struct mbuf *m, int toff, struct tcp_opt_info *oi)
2975{ 2978{
2976 u_int16_t mss; 2979 u_int16_t mss;
2977 int opt, optlen = 0; 2980 int opt, optlen = 0;
2978#ifdef TCP_SIGNATURE 2981#ifdef TCP_SIGNATURE
2979 void *sigp = NULL; 2982 void *sigp = NULL;
2980 char sigbuf[TCP_SIGLEN]; 2983 char sigbuf[TCP_SIGLEN];
2981 struct secasvar *sav = NULL; 2984 struct secasvar *sav = NULL;
2982#endif 2985#endif
2983 2986
2984 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2987 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) {
2985 opt = cp[0]; 2988 opt = cp[0];
2986 if (opt == TCPOPT_EOL) 2989 if (opt == TCPOPT_EOL)
2987 break; 2990 break;
2988 if (opt == TCPOPT_NOP) 2991 if (opt == TCPOPT_NOP)
2989 optlen = 1; 2992 optlen = 1;
2990 else { 2993 else {
2991 if (cnt < 2) 2994 if (cnt < 2)
2992 break; 2995 break;
2993 optlen = cp[1]; 2996 optlen = cp[1];
2994 if (optlen < 2 || optlen > cnt) 2997 if (optlen < 2 || optlen > cnt)
2995 break; 2998 break;
2996 } 2999 }
2997 switch (opt) { 3000 switch (opt) {
2998 3001
2999 default: 3002 default:
3000 continue; 3003 continue;
3001 3004
3002 case TCPOPT_MAXSEG: 3005 case TCPOPT_MAXSEG:
3003 if (optlen != TCPOLEN_MAXSEG) 3006 if (optlen != TCPOLEN_MAXSEG)
3004 continue; 3007 continue;
3005 if (!(th->th_flags & TH_SYN)) 3008 if (!(th->th_flags & TH_SYN))
3006 continue; 3009 continue;
3007 if (TCPS_HAVERCVDSYN(tp->t_state)) 3010 if (TCPS_HAVERCVDSYN(tp->t_state))
3008 continue; 3011 continue;
3009 bcopy(cp + 2, &mss, sizeof(mss)); 3012 bcopy(cp + 2, &mss, sizeof(mss));
3010 oi->maxseg = ntohs(mss); 3013 oi->maxseg = ntohs(mss);
3011 break; 3014 break;
3012 3015
3013 case TCPOPT_WINDOW: 3016 case TCPOPT_WINDOW:
3014 if (optlen != TCPOLEN_WINDOW) 3017 if (optlen != TCPOLEN_WINDOW)
3015 continue; 3018 continue;
3016 if (!(th->th_flags & TH_SYN)) 3019 if (!(th->th_flags & TH_SYN))
3017 continue; 3020 continue;
3018 if (TCPS_HAVERCVDSYN(tp->t_state)) 3021 if (TCPS_HAVERCVDSYN(tp->t_state))
3019 continue; 3022 continue;
3020 tp->t_flags |= TF_RCVD_SCALE; 3023 tp->t_flags |= TF_RCVD_SCALE;
3021 tp->requested_s_scale = cp[2]; 3024 tp->requested_s_scale = cp[2];
3022 if (tp->requested_s_scale > TCP_MAX_WINSHIFT) { 3025 if (tp->requested_s_scale > TCP_MAX_WINSHIFT) {
3023#if 0 /*XXX*/ 3026#if 0 /*XXX*/
3024 char *p; 3027 char *p;
3025 3028
3026 if (ip) 3029 if (ip)
3027 p = ntohl(ip->ip_src); 3030 p = ntohl(ip->ip_src);
3028#ifdef INET6 3031#ifdef INET6
3029 else if (ip6) 3032 else if (ip6)
3030 p = ip6_sprintf(&ip6->ip6_src); 3033 p = ip6_sprintf(&ip6->ip6_src);
3031#endif 3034#endif
3032 else 3035 else
3033 p = "(unknown)"; 3036 p = "(unknown)";
3034 log(LOG_ERR, "TCP: invalid wscale %d from %s, " 3037 log(LOG_ERR, "TCP: invalid wscale %d from %s, "
3035 "assuming %d\n", 3038 "assuming %d\n",
3036 tp->requested_s_scale, p, 3039 tp->requested_s_scale, p,
3037 TCP_MAX_WINSHIFT); 3040 TCP_MAX_WINSHIFT);
3038#else 3041#else
3039 log(LOG_ERR, "TCP: invalid wscale %d, " 3042 log(LOG_ERR, "TCP: invalid wscale %d, "
3040 "assuming %d\n", 3043 "assuming %d\n",
3041 tp->requested_s_scale, 3044 tp->requested_s_scale,
3042 TCP_MAX_WINSHIFT); 3045 TCP_MAX_WINSHIFT);
3043#endif 3046#endif
3044 tp->requested_s_scale = TCP_MAX_WINSHIFT; 3047 tp->requested_s_scale = TCP_MAX_WINSHIFT;
3045 } 3048 }
3046 break; 3049 break;
3047 3050
3048 case TCPOPT_TIMESTAMP: 3051 case TCPOPT_TIMESTAMP:
3049 if (optlen != TCPOLEN_TIMESTAMP) 3052 if (optlen != TCPOLEN_TIMESTAMP)
3050 continue; 3053 continue;
3051 oi->ts_present = 1; 3054 oi->ts_present = 1;
3052 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 3055 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
3053 NTOHL(oi->ts_val); 3056 NTOHL(oi->ts_val);
3054 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 3057 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
3055 NTOHL(oi->ts_ecr); 3058 NTOHL(oi->ts_ecr);
3056 3059
3057 if (!(th->th_flags & TH_SYN)) 3060 if (!(th->th_flags & TH_SYN))
3058 continue; 3061 continue;
3059 if (TCPS_HAVERCVDSYN(tp->t_state)) 3062 if (TCPS_HAVERCVDSYN(tp->t_state))
3060 continue; 3063 continue;
3061 /* 3064 /*
3062 * A timestamp received in a SYN makes 3065 * A timestamp received in a SYN makes
3063 * it ok to send timestamp requests and replies. 3066 * it ok to send timestamp requests and replies.
3064 */ 3067 */
3065 tp->t_flags |= TF_RCVD_TSTMP; 3068 tp->t_flags |= TF_RCVD_TSTMP;
3066 tp->ts_recent = oi->ts_val; 3069 tp->ts_recent = oi->ts_val;
3067 tp->ts_recent_age = tcp_now; 3070 tp->ts_recent_age = tcp_now;
3068 break; 3071 break;
3069 3072
3070 case TCPOPT_SACK_PERMITTED: 3073 case TCPOPT_SACK_PERMITTED:
3071 if (optlen != TCPOLEN_SACK_PERMITTED) 3074 if (optlen != TCPOLEN_SACK_PERMITTED)
3072 continue; 3075 continue;
3073 if (!(th->th_flags & TH_SYN)) 3076 if (!(th->th_flags & TH_SYN))
3074 continue; 3077 continue;
3075 if (TCPS_HAVERCVDSYN(tp->t_state)) 3078 if (TCPS_HAVERCVDSYN(tp->t_state))
3076 continue; 3079 continue;
3077 if (tcp_do_sack) { 3080 if (tcp_do_sack) {
3078 tp->t_flags |= TF_SACK_PERMIT; 3081 tp->t_flags |= TF_SACK_PERMIT;
3079 tp->t_flags |= TF_WILL_SACK; 3082 tp->t_flags |= TF_WILL_SACK;
3080 } 3083 }
3081 break; 3084 break;
3082 3085
3083 case TCPOPT_SACK: 3086 case TCPOPT_SACK:
3084 tcp_sack_option(tp, th, cp, optlen); 3087 tcp_sack_option(tp, th, cp, optlen);
3085 break; 3088 break;
3086#ifdef TCP_SIGNATURE 3089#ifdef TCP_SIGNATURE
3087 case TCPOPT_SIGNATURE: 3090 case TCPOPT_SIGNATURE:
3088 if (optlen != TCPOLEN_SIGNATURE) 3091 if (optlen != TCPOLEN_SIGNATURE)
3089 continue; 3092 continue;
3090 if (sigp && bcmp(sigp, cp + 2, TCP_SIGLEN)) 3093 if (sigp && bcmp(sigp, cp + 2, TCP_SIGLEN))
3091 return (-1); 3094 return (-1);
3092 3095
3093 sigp = sigbuf; 3096 sigp = sigbuf;
3094 memcpy(sigbuf, cp + 2, TCP_SIGLEN); 3097 memcpy(sigbuf, cp + 2, TCP_SIGLEN);
3095 tp->t_flags |= TF_SIGNATURE; 3098 tp->t_flags |= TF_SIGNATURE;
3096 break; 3099 break;
3097#endif 3100#endif
3098 } 3101 }
3099 } 3102 }
3100 3103
3101#ifdef TCP_SIGNATURE 3104#ifdef TCP_SIGNATURE
3102 if (tp->t_flags & TF_SIGNATURE) { 3105 if (tp->t_flags & TF_SIGNATURE) {
3103 3106
3104 sav = tcp_signature_getsav(m, th); 3107 sav = tcp_signature_getsav(m, th);
3105 3108
3106 if (sav == NULL && tp->t_state == TCPS_LISTEN) 3109 if (sav == NULL && tp->t_state == TCPS_LISTEN)
3107 return (-1); 3110 return (-1);
3108 } 3111 }
3109 3112
3110 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 3113 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) {
3111 if (sav == NULL) 3114 if (sav == NULL)
3112 return (-1); 3115 return (-1);
3113#ifdef FAST_IPSEC 3116#ifdef FAST_IPSEC
3114 KEY_FREESAV(&sav); 3117 KEY_FREESAV(&sav);
3115#else 3118#else
3116 key_freesav(sav); 3119 key_freesav(sav);
3117#endif 3120#endif
3118 return (-1); 3121 return (-1);
3119 } 3122 }
3120 3123
3121 if (sigp) { 3124 if (sigp) {
3122 char sig[TCP_SIGLEN]; 3125 char sig[TCP_SIGLEN];
3123 3126
3124 tcp_fields_to_net(th); 3127 tcp_fields_to_net(th);
3125 if (tcp_signature(m, th, toff, sav, sig) < 0) { 3128 if (tcp_signature(m, th, toff, sav, sig) < 0) {
3126 tcp_fields_to_host(th); 3129 tcp_fields_to_host(th);
3127 if (sav == NULL) 3130 if (sav == NULL)
3128 return (-1); 3131 return (-1);
3129#ifdef FAST_IPSEC 3132#ifdef FAST_IPSEC
3130 KEY_FREESAV(&sav); 3133 KEY_FREESAV(&sav);
3131#else 3134#else
3132 key_freesav(sav); 3135 key_freesav(sav);
3133#endif 3136#endif
3134 return (-1); 3137 return (-1);
3135 } 3138 }
3136 tcp_fields_to_host(th); 3139 tcp_fields_to_host(th);
3137 3140
3138 if (bcmp(sig, sigp, TCP_SIGLEN)) { 3141 if (bcmp(sig, sigp, TCP_SIGLEN)) {
3139 TCP_STATINC(TCP_STAT_BADSIG); 3142 TCP_STATINC(TCP_STAT_BADSIG);
3140 if (sav == NULL) 3143 if (sav == NULL)
3141 return (-1); 3144 return (-1);
3142#ifdef FAST_IPSEC 3145#ifdef FAST_IPSEC
3143 KEY_FREESAV(&sav); 3146 KEY_FREESAV(&sav);
3144#else 3147#else
3145 key_freesav(sav); 3148 key_freesav(sav);
3146#endif 3149#endif
3147 return (-1); 3150 return (-1);
3148 } else 3151 } else
3149 TCP_STATINC(TCP_STAT_GOODSIG); 3152 TCP_STATINC(TCP_STAT_GOODSIG);
3150 3153
3151 key_sa_recordxfer(sav, m); 3154 key_sa_recordxfer(sav, m);
3152#ifdef FAST_IPSEC 3155#ifdef FAST_IPSEC
3153 KEY_FREESAV(&sav); 3156 KEY_FREESAV(&sav);
3154#else 3157#else
3155 key_freesav(sav); 3158 key_freesav(sav);
3156#endif 3159#endif
3157 } 3160 }
3158#endif 3161#endif
3159 3162
3160 return (0); 3163 return (0);
3161} 3164}
3162 3165
3163/* 3166/*
3164 * Pull out of band byte out of a segment so 3167 * Pull out of band byte out of a segment so
3165 * it doesn't appear in the user's data queue. 3168 * it doesn't appear in the user's data queue.
3166 * It is still reflected in the segment length for 3169 * It is still reflected in the segment length for
3167 * sequencing purposes. 3170 * sequencing purposes.
3168 */ 3171 */
3169void 3172void
3170tcp_pulloutofband(struct socket *so, struct tcphdr *th, 3173tcp_pulloutofband(struct socket *so, struct tcphdr *th,
3171 struct mbuf *m, int off) 3174 struct mbuf *m, int off)
3172{ 3175{
3173 int cnt = off + th->th_urp - 1; 3176 int cnt = off + th->th_urp - 1;
3174 3177
3175 while (cnt >= 0) { 3178 while (cnt >= 0) {
3176 if (m->m_len > cnt) { 3179 if (m->m_len > cnt) {
3177 char *cp = mtod(m, char *) + cnt; 3180 char *cp = mtod(m, char *) + cnt;
3178 struct tcpcb *tp = sototcpcb(so); 3181 struct tcpcb *tp = sototcpcb(so);
3179 3182
3180 tp->t_iobc = *cp; 3183 tp->t_iobc = *cp;
3181 tp->t_oobflags |= TCPOOB_HAVEDATA; 3184 tp->t_oobflags |= TCPOOB_HAVEDATA;
3182 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 3185 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
3183 m->m_len--; 3186 m->m_len--;
3184 return; 3187 return;
3185 } 3188 }
3186 cnt -= m->m_len; 3189 cnt -= m->m_len;
3187 m = m->m_next; 3190 m = m->m_next;
3188 if (m == 0) 3191 if (m == 0)
3189 break; 3192 break;
3190 } 3193 }
3191 panic("tcp_pulloutofband"); 3194 panic("tcp_pulloutofband");
3192} 3195}
3193 3196
3194/* 3197/*
3195 * Collect new round-trip time estimate 3198 * Collect new round-trip time estimate
3196 * and update averages and current timeout. 3199 * and update averages and current timeout.
3197 */ 3200 */
3198void 3201void
3199tcp_xmit_timer(struct tcpcb *tp, uint32_t rtt) 3202tcp_xmit_timer(struct tcpcb *tp, uint32_t rtt)
3200{ 3203{
3201 int32_t delta; 3204 int32_t delta;
3202 3205
3203 TCP_STATINC(TCP_STAT_RTTUPDATED); 3206 TCP_STATINC(TCP_STAT_RTTUPDATED);
3204 if (tp->t_srtt != 0) { 3207 if (tp->t_srtt != 0) {
3205 /* 3208 /*
3206 * srtt is stored as fixed point with 3 bits after the 3209 * srtt is stored as fixed point with 3 bits after the
3207 * binary point (i.e., scaled by 8). The following magic 3210 * binary point (i.e., scaled by 8). The following magic
3208 * is equivalent to the smoothing algorithm in rfc793 with 3211 * is equivalent to the smoothing algorithm in rfc793 with
3209 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 3212 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
3210 * point). Adjust rtt to origin 0. 3213 * point). Adjust rtt to origin 0.
3211 */ 3214 */
3212 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT); 3215 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT);
3213 if ((tp->t_srtt += delta) <= 0) 3216 if ((tp->t_srtt += delta) <= 0)
3214 tp->t_srtt = 1 << 2; 3217 tp->t_srtt = 1 << 2;
3215 /* 3218 /*
3216 * We accumulate a smoothed rtt variance (actually, a 3219 * We accumulate a smoothed rtt variance (actually, a
3217 * smoothed mean difference), then set the retransmit 3220 * smoothed mean difference), then set the retransmit
3218 * timer to smoothed rtt + 4 times the smoothed variance. 3221 * timer to smoothed rtt + 4 times the smoothed variance.
3219 * rttvar is stored as fixed point with 2 bits after the 3222 * rttvar is stored as fixed point with 2 bits after the
3220 * binary point (scaled by 4). The following is 3223 * binary point (scaled by 4). The following is
3221 * equivalent to rfc793 smoothing with an alpha of .75 3224 * equivalent to rfc793 smoothing with an alpha of .75
3222 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 3225 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
3223 * rfc793's wired-in beta. 3226 * rfc793's wired-in beta.
3224 */ 3227 */
3225 if (delta < 0) 3228 if (delta < 0)
3226 delta = -delta; 3229 delta = -delta;
3227 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 3230 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
3228 if ((tp->t_rttvar += delta) <= 0) 3231 if ((tp->t_rttvar += delta) <= 0)
3229 tp->t_rttvar = 1 << 2; 3232 tp->t_rttvar = 1 << 2;
3230 } else { 3233 } else {
3231 /* 3234 /*
3232 * No rtt measurement yet - use the unsmoothed rtt. 3235 * No rtt measurement yet - use the unsmoothed rtt.
3233 * Set the variance to half the rtt (so our first 3236 * Set the variance to half the rtt (so our first
3234 * retransmit happens at 3*rtt). 3237 * retransmit happens at 3*rtt).
3235 */ 3238 */
3236 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2); 3239 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2);
3237 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1); 3240 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1);
3238 } 3241 }
3239 tp->t_rtttime = 0; 3242 tp->t_rtttime = 0;
3240 tp->t_rxtshift = 0; 3243 tp->t_rxtshift = 0;
3241 3244
3242 /* 3245 /*
3243 * the retransmit should happen at rtt + 4 * rttvar. 3246 * the retransmit should happen at rtt + 4 * rttvar.
3244 * Because of the way we do the smoothing, srtt and rttvar 3247 * Because of the way we do the smoothing, srtt and rttvar
3245 * will each average +1/2 tick of bias. When we compute 3248 * will each average +1/2 tick of bias. When we compute
3246 * the retransmit timer, we want 1/2 tick of rounding and 3249 * the retransmit timer, we want 1/2 tick of rounding and
3247 * 1 extra tick because of +-1/2 tick uncertainty in the 3250 * 1 extra tick because of +-1/2 tick uncertainty in the
3248 * firing of the timer. The bias will give us exactly the 3251 * firing of the timer. The bias will give us exactly the
3249 * 1.5 tick we need. But, because the bias is 3252 * 1.5 tick we need. But, because the bias is
3250 * statistical, we have to test that we don't drop below 3253 * statistical, we have to test that we don't drop below
3251 * the minimum feasible timer (which is 2 ticks). 3254 * the minimum feasible timer (which is 2 ticks).
3252 */ 3255 */
3253 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 3256 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3254 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); 3257 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
3255 3258
3256 /* 3259 /*
3257 * We received an ack for a packet that wasn't retransmitted; 3260 * We received an ack for a packet that wasn't retransmitted;
3258 * it is probably safe to discard any error indications we've 3261 * it is probably safe to discard any error indications we've
3259 * received recently. This isn't quite right, but close enough 3262 * received recently. This isn't quite right, but close enough
3260 * for now (a route might have failed after we sent a segment, 3263 * for now (a route might have failed after we sent a segment,
3261 * and the return path might not be symmetrical). 3264 * and the return path might not be symmetrical).
3262 */ 3265 */
3263 tp->t_softerror = 0; 3266 tp->t_softerror = 0;
3264} 3267}
3265 3268
3266 3269
3267/* 3270/*
3268 * TCP compressed state engine. Currently used to hold compressed 3271 * TCP compressed state engine. Currently used to hold compressed
3269 * state for SYN_RECEIVED. 3272 * state for SYN_RECEIVED.
3270 */ 3273 */
3271 3274
3272u_long syn_cache_count; 3275u_long syn_cache_count;
3273u_int32_t syn_hash1, syn_hash2; 3276u_int32_t syn_hash1, syn_hash2;
3274 3277
3275#define SYN_HASH(sa, sp, dp) \ 3278#define SYN_HASH(sa, sp, dp) \
3276 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 3279 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
3277 ((u_int32_t)(sp)))^syn_hash2))) 3280 ((u_int32_t)(sp)))^syn_hash2)))
3278#ifndef INET6 3281#ifndef INET6
3279#define SYN_HASHALL(hash, src, dst) \ 3282#define SYN_HASHALL(hash, src, dst) \
3280do { \ 3283do { \
3281 hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \ 3284 hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
3282 ((const struct sockaddr_in *)(src))->sin_port, \ 3285 ((const struct sockaddr_in *)(src))->sin_port, \
3283 ((const struct sockaddr_in *)(dst))->sin_port); \ 3286 ((const struct sockaddr_in *)(dst))->sin_port); \
3284} while (/*CONSTCOND*/ 0) 3287} while (/*CONSTCOND*/ 0)
3285#else 3288#else
3286#define SYN_HASH6(sa, sp, dp) \ 3289#define SYN_HASH6(sa, sp, dp) \
3287 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 3290 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
3288 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 3291 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
3289 & 0x7fffffff) 3292 & 0x7fffffff)
3290 3293
3291#define SYN_HASHALL(hash, src, dst) \ 3294#define SYN_HASHALL(hash, src, dst) \
3292do { \ 3295do { \
3293 switch ((src)->sa_family) { \ 3296 switch ((src)->sa_family) { \
3294 case AF_INET: \ 3297 case AF_INET: \
3295 hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \ 3298 hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
3296 ((const struct sockaddr_in *)(src))->sin_port, \ 3299 ((const struct sockaddr_in *)(src))->sin_port, \
3297 ((const struct sockaddr_in *)(dst))->sin_port); \ 3300 ((const struct sockaddr_in *)(dst))->sin_port); \
3298 break; \ 3301 break; \
3299 case AF_INET6: \ 3302 case AF_INET6: \
3300 hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \ 3303 hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
3301 ((const struct sockaddr_in6 *)(src))->sin6_port, \ 3304 ((const struct sockaddr_in6 *)(src))->sin6_port, \
3302 ((const struct sockaddr_in6 *)(dst))->sin6_port); \ 3305 ((const struct sockaddr_in6 *)(dst))->sin6_port); \
3303 break; \ 3306 break; \
3304 default: \ 3307 default: \
3305 hash = 0; \ 3308 hash = 0; \
3306 } \ 3309 } \
3307} while (/*CONSTCOND*/0) 3310} while (/*CONSTCOND*/0)
3308#endif /* INET6 */ 3311#endif /* INET6 */
3309 3312
3310POOL_INIT(syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, "synpl", NULL, 3313POOL_INIT(syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, "synpl", NULL,
3311 IPL_SOFTNET); 3314 IPL_SOFTNET);
3312 3315
3313/* 3316/*
3314 * We don't estimate RTT with SYNs, so each packet starts with the default 3317 * We don't estimate RTT with SYNs, so each packet starts with the default
3315 * RTT and each timer step has a fixed timeout value. 3318 * RTT and each timer step has a fixed timeout value.
3316 */ 3319 */
3317#define SYN_CACHE_TIMER_ARM(sc) \ 3320#define SYN_CACHE_TIMER_ARM(sc) \
3318do { \ 3321do { \
3319 TCPT_RANGESET((sc)->sc_rxtcur, \ 3322 TCPT_RANGESET((sc)->sc_rxtcur, \
3320 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3323 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
3321 TCPTV_REXMTMAX); \ 3324 TCPTV_REXMTMAX); \
3322 callout_reset(&(sc)->sc_timer, \ 3325 callout_reset(&(sc)->sc_timer, \
3323 (sc)->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, (sc)); \ 3326 (sc)->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, (sc)); \
3324} while (/*CONSTCOND*/0) 3327} while (/*CONSTCOND*/0)
3325 3328
3326#define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase) 3329#define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase)
3327 3330
3328static inline void 3331static inline void
3329syn_cache_rm(struct syn_cache *sc) 3332syn_cache_rm(struct syn_cache *sc)
3330{ 3333{
3331 TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket, 3334 TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket,
3332 sc, sc_bucketq); 3335 sc, sc_bucketq);
3333 sc->sc_tp = NULL; 3336 sc->sc_tp = NULL;
3334 LIST_REMOVE(sc, sc_tpq); 3337 LIST_REMOVE(sc, sc_tpq);
3335 tcp_syn_cache[sc->sc_bucketidx].sch_length--; 3338 tcp_syn_cache[sc->sc_bucketidx].sch_length--;
3336 callout_stop(&sc->sc_timer); 3339 callout_stop(&sc->sc_timer);
3337 syn_cache_count--; 3340 syn_cache_count--;
3338} 3341}
3339 3342
3340static inline void 3343static inline void
3341syn_cache_put(struct syn_cache *sc) 3344syn_cache_put(struct syn_cache *sc)
3342{ 3345{
3343 if (sc->sc_ipopts) 3346 if (sc->sc_ipopts)
3344 (void) m_free(sc->sc_ipopts); 3347 (void) m_free(sc->sc_ipopts);
3345 rtcache_free(&sc->sc_route); 3348 rtcache_free(&sc->sc_route);
3346 sc->sc_flags |= SCF_DEAD; 3349 sc->sc_flags |= SCF_DEAD;
3347 if (!callout_invoking(&sc->sc_timer)) 3350 if (!callout_invoking(&sc->sc_timer))
3348 callout_schedule(&(sc)->sc_timer, 1); 3351 callout_schedule(&(sc)->sc_timer, 1);
3349} 3352}
3350 3353
3351void 3354void
3352syn_cache_init(void) 3355syn_cache_init(void)
3353{ 3356{
3354 int i; 3357 int i;
3355 3358
3356 /* Initialize the hash buckets. */ 3359 /* Initialize the hash buckets. */
3357 for (i = 0; i < tcp_syn_cache_size; i++) 3360 for (i = 0; i < tcp_syn_cache_size; i++)
3358 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); 3361 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
3359} 3362}
3360 3363
3361void 3364void
3362syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3365syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
3363{ 3366{
3364 struct syn_cache_head *scp; 3367 struct syn_cache_head *scp;
3365 struct syn_cache *sc2; 3368 struct syn_cache *sc2;
3366 int s; 3369 int s;
3367 3370
3368 /* 3371 /*
3369 * If there are no entries in the hash table, reinitialize 3372 * If there are no entries in the hash table, reinitialize
3370 * the hash secrets. 3373 * the hash secrets.
3371 */ 3374 */
3372 if (syn_cache_count == 0) { 3375 if (syn_cache_count == 0) {
3373 syn_hash1 = arc4random(); 3376 syn_hash1 = arc4random();
3374 syn_hash2 = arc4random(); 3377 syn_hash2 = arc4random();
3375 } 3378 }
3376 3379
3377 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 3380 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
3378 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 3381 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
3379 scp = &tcp_syn_cache[sc->sc_bucketidx]; 3382 scp = &tcp_syn_cache[sc->sc_bucketidx];
3380 3383
3381 /* 3384 /*
3382 * Make sure that we don't overflow the per-bucket 3385 * Make sure that we don't overflow the per-bucket
3383 * limit or the total cache size limit. 3386 * limit or the total cache size limit.
3384 */ 3387 */
3385 s = splsoftnet(); 3388 s = splsoftnet();
3386 if (scp->sch_length >= tcp_syn_bucket_limit) { 3389 if (scp->sch_length >= tcp_syn_bucket_limit) {
3387 TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW); 3390 TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW);
3388 /* 3391 /*
3389 * The bucket is full. Toss the oldest element in the 3392 * The bucket is full. Toss the oldest element in the
3390 * bucket. This will be the first entry in the bucket. 3393 * bucket. This will be the first entry in the bucket.
3391 */ 3394 */
3392 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3395 sc2 = TAILQ_FIRST(&scp->sch_bucket);
3393#ifdef DIAGNOSTIC 3396#ifdef DIAGNOSTIC
3394 /* 3397 /*
3395 * This should never happen; we should always find an 3398 * This should never happen; we should always find an
3396 * entry in our bucket. 3399 * entry in our bucket.
3397 */ 3400 */
3398 if (sc2 == NULL) 3401 if (sc2 == NULL)
3399 panic("syn_cache_insert: bucketoverflow: impossible"); 3402 panic("syn_cache_insert: bucketoverflow: impossible");
3400#endif 3403#endif
3401 syn_cache_rm(sc2); 3404 syn_cache_rm(sc2);
3402 syn_cache_put(sc2); /* calls pool_put but see spl above */ 3405 syn_cache_put(sc2); /* calls pool_put but see spl above */
3403 } else if (syn_cache_count >= tcp_syn_cache_limit) { 3406 } else if (syn_cache_count >= tcp_syn_cache_limit) {
3404 struct syn_cache_head *scp2, *sce; 3407 struct syn_cache_head *scp2, *sce;
3405 3408
3406 TCP_STATINC(TCP_STAT_SC_OVERFLOWED); 3409 TCP_STATINC(TCP_STAT_SC_OVERFLOWED);
3407 /* 3410 /*
3408 * The cache is full. Toss the oldest entry in the 3411 * The cache is full. Toss the oldest entry in the
3409 * first non-empty bucket we can find. 3412 * first non-empty bucket we can find.
3410 * 3413 *
3411 * XXX We would really like to toss the oldest 3414 * XXX We would really like to toss the oldest
3412 * entry in the cache, but we hope that this 3415 * entry in the cache, but we hope that this
3413 * condition doesn't happen very often. 3416 * condition doesn't happen very often.
3414 */ 3417 */
3415 scp2 = scp; 3418 scp2 = scp;
3416 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3419 if (TAILQ_EMPTY(&scp2->sch_bucket)) {
3417 sce = &tcp_syn_cache[tcp_syn_cache_size]; 3420 sce = &tcp_syn_cache[tcp_syn_cache_size];
3418 for (++scp2; scp2 != scp; scp2++) { 3421 for (++scp2; scp2 != scp; scp2++) {
3419 if (scp2 >= sce) 3422 if (scp2 >= sce)
3420 scp2 = &tcp_syn_cache[0]; 3423 scp2 = &tcp_syn_cache[0];
3421 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3424 if (! TAILQ_EMPTY(&scp2->sch_bucket))
3422 break; 3425 break;
3423 } 3426 }
3424#ifdef DIAGNOSTIC 3427#ifdef DIAGNOSTIC
3425 /* 3428 /*
3426 * This should never happen; we should always find a 3429 * This should never happen; we should always find a
3427 * non-empty bucket. 3430 * non-empty bucket.
3428 */ 3431 */
3429 if (scp2 == scp) 3432 if (scp2 == scp)
3430 panic("syn_cache_insert: cacheoverflow: " 3433 panic("syn_cache_insert: cacheoverflow: "
3431 "impossible"); 3434 "impossible");
3432#endif 3435#endif
3433 } 3436 }
3434 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3437 sc2 = TAILQ_FIRST(&scp2->sch_bucket);
3435 syn_cache_rm(sc2); 3438 syn_cache_rm(sc2);
3436 syn_cache_put(sc2); /* calls pool_put but see spl above */ 3439 syn_cache_put(sc2); /* calls pool_put but see spl above */
3437 } 3440 }
3438 3441
3439 /* 3442 /*
3440 * Initialize the entry's timer. 3443 * Initialize the entry's timer.

cvs diff -r1.167.10.1 -r1.167.10.1.2.1 src/sys/netinet/tcp_output.c (switch to unified diff)

--- src/sys/netinet/tcp_output.c 2011/03/29 20:12:14 1.167.10.1
+++ src/sys/netinet/tcp_output.c 2015/07/24 07:44:35 1.167.10.1.2.1
@@ -1,1710 +1,1720 @@ @@ -1,1710 +1,1720 @@
1/* $NetBSD: tcp_output.c,v 1.167.10.1 2011/03/29 20:12:14 riz Exp $ */ 1/* $NetBSD: tcp_output.c,v 1.167.10.1.2.1 2015/07/24 07:44:35 martin Exp $ */
2 2
3/* 3/*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors 15 * 3. Neither the name of the project nor the names of its contributors
16 * may be used to endorse or promote products derived from this software 16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission. 17 * without specific prior written permission.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE. 29 * SUCH DAMAGE.
30 */ 30 */
31 31
32/* 32/*
33 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
34 * 34 *
35 * NRL grants permission for redistribution and use in source and binary 35 * NRL grants permission for redistribution and use in source and binary
36 * forms, with or without modification, of the software and documentation 36 * forms, with or without modification, of the software and documentation
37 * created at NRL provided that the following conditions are met: 37 * created at NRL provided that the following conditions are met:
38 * 38 *
39 * 1. Redistributions of source code must retain the above copyright 39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer. 40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright 41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the 42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution. 43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software 44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgements: 45 * must display the following acknowledgements:
46 * This product includes software developed by the University of 46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors. 47 * California, Berkeley and its contributors.
48 * This product includes software developed at the Information 48 * This product includes software developed at the Information
49 * Technology Division, US Naval Research Laboratory. 49 * Technology Division, US Naval Research Laboratory.
50 * 4. Neither the name of the NRL nor the names of its contributors 50 * 4. Neither the name of the NRL nor the names of its contributors
51 * may be used to endorse or promote products derived from this software 51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission. 52 * without specific prior written permission.
53 * 53 *
54 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
55 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
56 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
57 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
58 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
59 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
61 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
62 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
63 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
64 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65 * 65 *
66 * The views and conclusions contained in the software and documentation 66 * The views and conclusions contained in the software and documentation
67 * are those of the authors and should not be interpreted as representing 67 * are those of the authors and should not be interpreted as representing
68 * official policies, either expressed or implied, of the US Naval 68 * official policies, either expressed or implied, of the US Naval
69 * Research Laboratory (NRL). 69 * Research Laboratory (NRL).
70 */ 70 */
71 71
72/*- 72/*-
73 * Copyright (c) 1997, 1998, 2001, 2005, 2006 The NetBSD Foundation, Inc. 73 * Copyright (c) 1997, 1998, 2001, 2005, 2006 The NetBSD Foundation, Inc.
74 * All rights reserved. 74 * All rights reserved.
75 * 75 *
76 * This code is derived from software contributed to The NetBSD Foundation 76 * This code is derived from software contributed to The NetBSD Foundation
77 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 77 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
78 * Facility, NASA Ames Research Center. 78 * Facility, NASA Ames Research Center.
79 * This code is derived from software contributed to The NetBSD Foundation 79 * This code is derived from software contributed to The NetBSD Foundation
80 * by Charles M. Hannum. 80 * by Charles M. Hannum.
81 * This code is derived from software contributed to The NetBSD Foundation 81 * This code is derived from software contributed to The NetBSD Foundation
82 * by Rui Paulo. 82 * by Rui Paulo.
83 * 83 *
84 * Redistribution and use in source and binary forms, with or without 84 * Redistribution and use in source and binary forms, with or without
85 * modification, are permitted provided that the following conditions 85 * modification, are permitted provided that the following conditions
86 * are met: 86 * are met:
87 * 1. Redistributions of source code must retain the above copyright 87 * 1. Redistributions of source code must retain the above copyright
88 * notice, this list of conditions and the following disclaimer. 88 * notice, this list of conditions and the following disclaimer.
89 * 2. Redistributions in binary form must reproduce the above copyright 89 * 2. Redistributions in binary form must reproduce the above copyright
90 * notice, this list of conditions and the following disclaimer in the 90 * notice, this list of conditions and the following disclaimer in the
91 * documentation and/or other materials provided with the distribution. 91 * documentation and/or other materials provided with the distribution.
92 * 92 *
93 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 93 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
94 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 94 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
95 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 95 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
96 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 96 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
97 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 97 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
98 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 98 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
99 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 99 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
100 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 100 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
101 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 101 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
103 * POSSIBILITY OF SUCH DAMAGE. 103 * POSSIBILITY OF SUCH DAMAGE.
104 */ 104 */
105 105
106/* 106/*
107 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 107 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
108 * The Regents of the University of California. All rights reserved. 108 * The Regents of the University of California. All rights reserved.
109 * 109 *
110 * Redistribution and use in source and binary forms, with or without 110 * Redistribution and use in source and binary forms, with or without
111 * modification, are permitted provided that the following conditions 111 * modification, are permitted provided that the following conditions
112 * are met: 112 * are met:
113 * 1. Redistributions of source code must retain the above copyright 113 * 1. Redistributions of source code must retain the above copyright
114 * notice, this list of conditions and the following disclaimer. 114 * notice, this list of conditions and the following disclaimer.
115 * 2. Redistributions in binary form must reproduce the above copyright 115 * 2. Redistributions in binary form must reproduce the above copyright
116 * notice, this list of conditions and the following disclaimer in the 116 * notice, this list of conditions and the following disclaimer in the
117 * documentation and/or other materials provided with the distribution. 117 * documentation and/or other materials provided with the distribution.
118 * 3. Neither the name of the University nor the names of its contributors 118 * 3. Neither the name of the University nor the names of its contributors
119 * may be used to endorse or promote products derived from this software 119 * may be used to endorse or promote products derived from this software
120 * without specific prior written permission. 120 * without specific prior written permission.
121 * 121 *
122 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 122 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
123 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 123 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
124 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 124 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
125 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 125 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
126 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 126 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
127 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 127 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
128 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 128 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
129 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 129 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
130 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 130 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
131 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 131 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
132 * SUCH DAMAGE. 132 * SUCH DAMAGE.
133 * 133 *
134 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 134 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
135 */ 135 */
136 136
137#include <sys/cdefs.h> 137#include <sys/cdefs.h>
138__KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.167.10.1 2011/03/29 20:12:14 riz Exp $"); 138__KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.167.10.1.2.1 2015/07/24 07:44:35 martin Exp $");
139 139
140#include "opt_inet.h" 140#include "opt_inet.h"
141#include "opt_ipsec.h" 141#include "opt_ipsec.h"
142#include "opt_tcp_debug.h" 142#include "opt_tcp_debug.h"
143 143
144#include <sys/param.h> 144#include <sys/param.h>
145#include <sys/systm.h> 145#include <sys/systm.h>
146#include <sys/malloc.h> 146#include <sys/malloc.h>
147#include <sys/mbuf.h> 147#include <sys/mbuf.h>
148#include <sys/protosw.h> 148#include <sys/protosw.h>
149#include <sys/socket.h> 149#include <sys/socket.h>
150#include <sys/socketvar.h> 150#include <sys/socketvar.h>
151#include <sys/errno.h> 151#include <sys/errno.h>
152#include <sys/domain.h> 152#include <sys/domain.h>
153#include <sys/kernel.h> 153#include <sys/kernel.h>
154#ifdef TCP_SIGNATURE 154#ifdef TCP_SIGNATURE
155#include <sys/md5.h> 155#include <sys/md5.h>
156#endif 156#endif
157 157
158#include <net/if.h> 158#include <net/if.h>
159#include <net/route.h> 159#include <net/route.h>
160 160
161#include <netinet/in.h> 161#include <netinet/in.h>
162#include <netinet/in_systm.h> 162#include <netinet/in_systm.h>
163#include <netinet/ip.h> 163#include <netinet/ip.h>
164#include <netinet/in_pcb.h> 164#include <netinet/in_pcb.h>
165#include <netinet/ip_var.h> 165#include <netinet/ip_var.h>
166 166
167#ifdef INET6 167#ifdef INET6
168#ifndef INET 168#ifndef INET
169#include <netinet/in.h> 169#include <netinet/in.h>
170#endif 170#endif
171#include <netinet/ip6.h> 171#include <netinet/ip6.h>
172#include <netinet6/in6_var.h> 172#include <netinet6/in6_var.h>
173#include <netinet6/ip6_var.h> 173#include <netinet6/ip6_var.h>
174#include <netinet6/in6_pcb.h> 174#include <netinet6/in6_pcb.h>
175#include <netinet6/nd6.h> 175#include <netinet6/nd6.h>
176#endif 176#endif
177 177
178#ifdef FAST_IPSEC 178#ifdef FAST_IPSEC
179#include <netipsec/ipsec.h> 179#include <netipsec/ipsec.h>
180#include <netipsec/key.h> 180#include <netipsec/key.h>
181#ifdef INET6 181#ifdef INET6
182#include <netipsec/ipsec6.h> 182#include <netipsec/ipsec6.h>
183#endif 183#endif
184#endif /* FAST_IPSEC*/ 184#endif /* FAST_IPSEC*/
185#ifdef IPSEC 185#ifdef IPSEC
186#include <netinet6/ipsec.h> 186#include <netinet6/ipsec.h>
187#endif 187#endif
188 188
189#include <netinet/tcp.h> 189#include <netinet/tcp.h>
190#define TCPOUTFLAGS 190#define TCPOUTFLAGS
191#include <netinet/tcp_fsm.h> 191#include <netinet/tcp_fsm.h>
192#include <netinet/tcp_seq.h> 192#include <netinet/tcp_seq.h>
193#include <netinet/tcp_timer.h> 193#include <netinet/tcp_timer.h>
194#include <netinet/tcp_var.h> 194#include <netinet/tcp_var.h>
195#include <netinet/tcp_private.h> 195#include <netinet/tcp_private.h>
196#include <netinet/tcp_congctl.h> 196#include <netinet/tcp_congctl.h>
197#include <netinet/tcpip.h> 197#include <netinet/tcpip.h>
198#include <netinet/tcp_debug.h> 198#include <netinet/tcp_debug.h>
199#include <netinet/in_offload.h> 199#include <netinet/in_offload.h>
200#include <netinet6/in6_offload.h> 200#include <netinet6/in6_offload.h>
201 201
202#ifdef IPSEC 202#ifdef IPSEC
203#include <netkey/key.h> 203#include <netkey/key.h>
204#endif 204#endif
205 205
206#ifdef notyet 206#ifdef notyet
207extern struct mbuf *m_copypack(); 207extern struct mbuf *m_copypack();
208#endif 208#endif
209 209
210/* 210/*
211 * Knob to enable Congestion Window Monitoring, and control 211 * Knob to enable Congestion Window Monitoring, and control
212 * the burst size it allows. Default burst is 4 packets, per 212 * the burst size it allows. Default burst is 4 packets, per
213 * the Internet draft. 213 * the Internet draft.
214 */ 214 */
215int tcp_cwm = 0; 215int tcp_cwm = 0;
216int tcp_cwm_burstsize = 4; 216int tcp_cwm_burstsize = 4;
217 217
/*
 * Send-buffer auto-sizing knobs.  Defaults: disabled (0), 8 KiB for
 * tcp_autosndbuf_inc and 256 KiB for tcp_autosndbuf_max.
 * NOTE(review): judging by the names these are the enable flag, the
 * growth step and the upper bound for automatic so_snd sizing; the
 * consumer is not visible in this part of the file -- confirm.
 */
218int tcp_do_autosndbuf = 0; 218int tcp_do_autosndbuf = 0;
219int tcp_autosndbuf_inc = 8 * 1024; 219int tcp_autosndbuf_inc = 8 * 1024;
220int tcp_autosndbuf_max = 256 * 1024; 220int tcp_autosndbuf_max = 256 * 1024;
221 221
/*
 * Optional event counters for the TCP output path.
 *
 * When TCP_OUTPUT_COUNTERS is defined, the evcnt objects below are
 * declared (they are defined elsewhere) and TCP_OUTPUT_COUNTER_INCR(ev)
 * bumps ev->ev_count.  Otherwise the macro expands to nothing, so call
 * sites need no #ifdef of their own.
 */
222#ifdef TCP_OUTPUT_COUNTERS 222#ifdef TCP_OUTPUT_COUNTERS
223#include <sys/device.h> 223#include <sys/device.h>
224 224
225extern struct evcnt tcp_output_bigheader; 225extern struct evcnt tcp_output_bigheader;
226extern struct evcnt tcp_output_predict_hit; 226extern struct evcnt tcp_output_predict_hit;
227extern struct evcnt tcp_output_predict_miss; 227extern struct evcnt tcp_output_predict_miss;
228extern struct evcnt tcp_output_copysmall; 228extern struct evcnt tcp_output_copysmall;
229extern struct evcnt tcp_output_copybig; 229extern struct evcnt tcp_output_copybig;
230extern struct evcnt tcp_output_refbig; 230extern struct evcnt tcp_output_refbig;
231 231
232#define TCP_OUTPUT_COUNTER_INCR(ev) (ev)->ev_count++ 232#define TCP_OUTPUT_COUNTER_INCR(ev) (ev)->ev_count++
233#else 233#else
234 234
235#define TCP_OUTPUT_COUNTER_INCR(ev) /* nothing */ 235#define TCP_OUTPUT_COUNTER_INCR(ev) /* nothing */
236 236
237#endif /* TCP_OUTPUT_COUNTERS */ 237#endif /* TCP_OUTPUT_COUNTERS */
238 238
239static 239static
240#ifndef GPROF 240#ifndef GPROF
241inline 241inline
242#endif 242#endif
243int 243int
244tcp_segsize(struct tcpcb *tp, int *txsegsizep, int *rxsegsizep, 244tcp_segsize(struct tcpcb *tp, int *txsegsizep, int *rxsegsizep,
245 bool *alwaysfragp) 245 bool *alwaysfragp)
246{ 246{
247#ifdef INET 247#ifdef INET
248 struct inpcb *inp = tp->t_inpcb; 248 struct inpcb *inp = tp->t_inpcb;
249#endif 249#endif
250#ifdef INET6 250#ifdef INET6
251 struct in6pcb *in6p = tp->t_in6pcb; 251 struct in6pcb *in6p = tp->t_in6pcb;
252#endif 252#endif
253 struct socket *so = NULL; 253 struct socket *so = NULL;
254 struct rtentry *rt; 254 struct rtentry *rt;
255 struct ifnet *ifp; 255 struct ifnet *ifp;
256 int size; 256 int size;
257 int hdrlen; 257 int hdrlen;
258 int optlen; 258 int optlen;
259 259
260 *alwaysfragp = false; 260 *alwaysfragp = false;
261 261
262#ifdef DIAGNOSTIC 262#ifdef DIAGNOSTIC
263 if (tp->t_inpcb && tp->t_in6pcb) 263 if (tp->t_inpcb && tp->t_in6pcb)
264 panic("tcp_segsize: both t_inpcb and t_in6pcb are set"); 264 panic("tcp_segsize: both t_inpcb and t_in6pcb are set");
265#endif 265#endif
266 switch (tp->t_family) { 266 switch (tp->t_family) {
267#ifdef INET 267#ifdef INET
268 case AF_INET: 268 case AF_INET:
269 hdrlen = sizeof(struct ip) + sizeof(struct tcphdr); 269 hdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
270 break; 270 break;
271#endif 271#endif
272#ifdef INET6 272#ifdef INET6
273 case AF_INET6: 273 case AF_INET6:
274 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 274 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
275 break; 275 break;
276#endif 276#endif
277 default: 277 default:
278 size = tcp_mssdflt; 278 size = tcp_mssdflt;
279 goto out; 279 goto out;
280 } 280 }
281 281
282 rt = NULL; 282 rt = NULL;
283#ifdef INET 283#ifdef INET
284 if (inp) { 284 if (inp) {
285 rt = in_pcbrtentry(inp); 285 rt = in_pcbrtentry(inp);
286 so = inp->inp_socket; 286 so = inp->inp_socket;
287 } 287 }
288#endif 288#endif
289#ifdef INET6 289#ifdef INET6
290 if (in6p) { 290 if (in6p) {
291 rt = in6_pcbrtentry(in6p); 291 rt = in6_pcbrtentry(in6p);
292 so = in6p->in6p_socket; 292 so = in6p->in6p_socket;
293 } 293 }
294#endif 294#endif
295 if (rt == NULL) { 295 if (rt == NULL) {
296 size = tcp_mssdflt; 296 size = tcp_mssdflt;
297 goto out; 297 goto out;
298 } 298 }
299 299
300 ifp = rt->rt_ifp; 300 ifp = rt->rt_ifp;
301 301
302 size = tcp_mssdflt; 302 size = tcp_mssdflt;
303 if (tp->t_mtudisc && rt->rt_rmx.rmx_mtu != 0) { 303 if (tp->t_mtudisc && rt->rt_rmx.rmx_mtu != 0) {
304#ifdef INET6 304#ifdef INET6
305 if (in6p && rt->rt_rmx.rmx_mtu < IPV6_MMTU) { 305 if (in6p && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
306 /* 306 /*
307 * RFC2460 section 5, last paragraph: if path MTU is 307 * RFC2460 section 5, last paragraph: if path MTU is
308 * smaller than 1280, use 1280 as packet size and 308 * smaller than 1280, use 1280 as packet size and
309 * attach fragment header. 309 * attach fragment header.
310 */ 310 */
311 size = IPV6_MMTU - hdrlen - sizeof(struct ip6_frag); 311 size = IPV6_MMTU - hdrlen - sizeof(struct ip6_frag);
312 *alwaysfragp = true; 312 *alwaysfragp = true;
313 } else 313 } else
314 size = rt->rt_rmx.rmx_mtu - hdrlen; 314 size = rt->rt_rmx.rmx_mtu - hdrlen;
315#else 315#else
316 size = rt->rt_rmx.rmx_mtu - hdrlen; 316 size = rt->rt_rmx.rmx_mtu - hdrlen;
317#endif 317#endif
318 } else if (ifp->if_flags & IFF_LOOPBACK) 318 } else if (ifp->if_flags & IFF_LOOPBACK)
319 size = ifp->if_mtu - hdrlen; 319 size = ifp->if_mtu - hdrlen;
320#ifdef INET 320#ifdef INET
321 else if (inp && tp->t_mtudisc) 321 else if (inp && tp->t_mtudisc)
322 size = ifp->if_mtu - hdrlen; 322 size = ifp->if_mtu - hdrlen;
323 else if (inp && in_localaddr(inp->inp_faddr)) 323 else if (inp && in_localaddr(inp->inp_faddr))
324 size = ifp->if_mtu - hdrlen; 324 size = ifp->if_mtu - hdrlen;
325#endif 325#endif
326#ifdef INET6 326#ifdef INET6
327 else if (in6p) { 327 else if (in6p) {
328#ifdef INET 328#ifdef INET
329 if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) { 329 if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) {
330 /* mapped addr case */ 330 /* mapped addr case */
331 struct in_addr d; 331 struct in_addr d;
332 bcopy(&in6p->in6p_faddr.s6_addr32[3], &d, sizeof(d)); 332 bcopy(&in6p->in6p_faddr.s6_addr32[3], &d, sizeof(d));
333 if (tp->t_mtudisc || in_localaddr(d)) 333 if (tp->t_mtudisc || in_localaddr(d))
334 size = ifp->if_mtu - hdrlen; 334 size = ifp->if_mtu - hdrlen;
335 } else 335 } else
336#endif 336#endif
337 { 337 {
338 /* 338 /*
339 * for IPv6, path MTU discovery is always turned on, 339 * for IPv6, path MTU discovery is always turned on,
340 * or the node must use packet size <= 1280. 340 * or the node must use packet size <= 1280.
341 */ 341 */
342 size = tp->t_mtudisc ? IN6_LINKMTU(ifp) : IPV6_MMTU; 342 size = tp->t_mtudisc ? IN6_LINKMTU(ifp) : IPV6_MMTU;
343 size -= hdrlen; 343 size -= hdrlen;
344 } 344 }
345 } 345 }
346#endif 346#endif
347 out: 347 out:
348 /* 348 /*
349 * Now we must make room for whatever extra TCP/IP options are in 349 * Now we must make room for whatever extra TCP/IP options are in
350 * the packet. 350 * the packet.
351 */ 351 */
352 optlen = tcp_optlen(tp); 352 optlen = tcp_optlen(tp);
353 353
354 /* 354 /*
355 * XXX tp->t_ourmss should have the right size, but without this code 355 * XXX tp->t_ourmss should have the right size, but without this code
356 * fragmentation will occur... need more investigation 356 * fragmentation will occur... need more investigation
357 */ 357 */
358#ifdef INET 358#ifdef INET
359 if (inp) { 359 if (inp) {
360#if defined(IPSEC) || defined(FAST_IPSEC) 360#if defined(IPSEC) || defined(FAST_IPSEC)
361 if (! IPSEC_PCB_SKIP_IPSEC(inp->inp_sp, IPSEC_DIR_OUTBOUND)) 361 if (! IPSEC_PCB_SKIP_IPSEC(inp->inp_sp, IPSEC_DIR_OUTBOUND))
362 optlen += ipsec4_hdrsiz_tcp(tp); 362 optlen += ipsec4_hdrsiz_tcp(tp);
363#endif 363#endif
364 optlen += ip_optlen(inp); 364 optlen += ip_optlen(inp);
365 } 365 }
366#endif 366#endif
367#ifdef INET6 367#ifdef INET6
368#ifdef INET 368#ifdef INET
369 if (in6p && tp->t_family == AF_INET) { 369 if (in6p && tp->t_family == AF_INET) {
370#if defined(IPSEC) || defined(FAST_IPSEC) 370#if defined(IPSEC) || defined(FAST_IPSEC)
371 if (! IPSEC_PCB_SKIP_IPSEC(in6p->in6p_sp, IPSEC_DIR_OUTBOUND)) 371 if (! IPSEC_PCB_SKIP_IPSEC(in6p->in6p_sp, IPSEC_DIR_OUTBOUND))
372 optlen += ipsec4_hdrsiz_tcp(tp); 372 optlen += ipsec4_hdrsiz_tcp(tp);
373#endif 373#endif
374 /* XXX size -= ip_optlen(in6p); */ 374 /* XXX size -= ip_optlen(in6p); */
375 } else 375 } else
376#endif 376#endif
377 if (in6p && tp->t_family == AF_INET6) { 377 if (in6p && tp->t_family == AF_INET6) {
378#if defined(IPSEC) || defined(FAST_IPSEC) 378#if defined(IPSEC) || defined(FAST_IPSEC)
379 if (! IPSEC_PCB_SKIP_IPSEC(in6p->in6p_sp, IPSEC_DIR_OUTBOUND)) 379 if (! IPSEC_PCB_SKIP_IPSEC(in6p->in6p_sp, IPSEC_DIR_OUTBOUND))
380 optlen += ipsec6_hdrsiz_tcp(tp); 380 optlen += ipsec6_hdrsiz_tcp(tp);
381#endif 381#endif
382 optlen += ip6_optlen(in6p); 382 optlen += ip6_optlen(in6p);
383 } 383 }
384#endif 384#endif
385 size -= optlen; 385 size -= optlen;
386 386
387 /* there may not be any room for data if mtu is too small */ 387 /* there may not be any room for data if mtu is too small */
388 if (size < 0) 388 if (size < 0)
389 return (EMSGSIZE); 389 return (EMSGSIZE);
390 390
391 /* 391 /*
392 * *rxsegsizep holds *estimated* inbound segment size (estimation 392 * *rxsegsizep holds *estimated* inbound segment size (estimation
393 * assumes that path MTU is the same for both ways). this is only 393 * assumes that path MTU is the same for both ways). this is only
394 * for silly window avoidance, do not use the value for other purposes. 394 * for silly window avoidance, do not use the value for other purposes.
395 * 395 *
396 * ipseclen is subtracted from both sides, this may not be right. 396 * ipseclen is subtracted from both sides, this may not be right.
397 * I'm not quite sure about this (could someone comment). 397 * I'm not quite sure about this (could someone comment).
398 */ 398 */
399 *txsegsizep = min(tp->t_peermss - optlen, size); 399 *txsegsizep = min(tp->t_peermss - optlen, size);
400 /* 400 /*
401 * Never send more than half a buffer full. This insures that we can 401 * Never send more than half a buffer full. This insures that we can
402 * always keep 2 packets on the wire, no matter what SO_SNDBUF is, and 402 * always keep 2 packets on the wire, no matter what SO_SNDBUF is, and
403 * therefore acks will never be delayed unless we run out of data to 403 * therefore acks will never be delayed unless we run out of data to
404 * transmit. 404 * transmit.
405 */ 405 */
406 if (so) 406 if (so)
407 *txsegsizep = min(so->so_snd.sb_hiwat >> 1, *txsegsizep); 407 *txsegsizep = min(so->so_snd.sb_hiwat >> 1, *txsegsizep);
408 *rxsegsizep = min(tp->t_ourmss - optlen, size); 408 *rxsegsizep = min(tp->t_ourmss - optlen, size);
409 409
410 if (*txsegsizep != tp->t_segsz) { 410 if (*txsegsizep != tp->t_segsz) {
411 /* 411 /*
412 * If the new segment size is larger, we don't want to 412 * If the new segment size is larger, we don't want to
413 * mess up the congestion window, but if it is smaller 413 * mess up the congestion window, but if it is smaller
414 * we'll have to reduce the congestion window to ensure 414 * we'll have to reduce the congestion window to ensure
415 * that we don't get into trouble with initial windows 415 * that we don't get into trouble with initial windows
416 * and the rest. In any case, if the segment size 416 * and the rest. In any case, if the segment size
417 * has changed, chances are the path has, too, and 417 * has changed, chances are the path has, too, and
418 * our congestion window will be different. 418 * our congestion window will be different.
419 */ 419 */
420 if (*txsegsizep < tp->t_segsz) { 420 if (*txsegsizep < tp->t_segsz) {
421 tp->snd_cwnd = max((tp->snd_cwnd / tp->t_segsz) 421 tp->snd_cwnd = max((tp->snd_cwnd / tp->t_segsz)
422 * *txsegsizep, *txsegsizep); 422 * *txsegsizep, *txsegsizep);
423 tp->snd_ssthresh = max((tp->snd_ssthresh / tp->t_segsz) 423 tp->snd_ssthresh = max((tp->snd_ssthresh / tp->t_segsz)
424 * *txsegsizep, *txsegsizep); 424 * *txsegsizep, *txsegsizep);
425 } 425 }
426 tp->t_segsz = *txsegsizep; 426 tp->t_segsz = *txsegsizep;
427 } 427 }
428 428
429 return (0); 429 return (0);
430} 430}
431 431
/*
 * tcp_build_datapkt: allocate the mbuf (chain) for an outgoing data segment.
 *
 * Gets a packet-header mbuf, leaves max_linkhdr bytes free in front of it,
 * sets its length to hdrlen, and then attaches len bytes of payload taken
 * from the socket send buffer (so->so_snd) starting at byte offset off.
 * The header bytes themselves are not written here.
 *
 * tp      connection control block; supplies the statistics decision
 *         (t_force, snd_nxt, snd_max) and the t_lastm/t_lastoff/t_lastlen/
 *         t_inoff cursor that is read and updated by this function.
 * so      socket whose send buffer holds the data.
 * off     offset of the first payload byte within the send buffer.
 * len     number of payload bytes (cast to int for the mbuf copy calls).
 * hdrlen  number of header bytes to reserve in the first mbuf.
 * mp      out: receives the new mbuf; stored only on success.
 *
 * Returns 0 on success.  Returns ENOBUFS when the header mbuf, the cluster
 * needed for an oversized header, or the payload copy cannot be allocated;
 * on the latter two the partially built mbuf is released with m_freem().
 * Panics if the send-buffer chain ends before the requested offset is found.
 */
432static 432static
433#ifndef GPROF 433#ifndef GPROF
434inline 434inline
435#endif 435#endif
436int 436int
437tcp_build_datapkt(struct tcpcb *tp, struct socket *so, int off, 437tcp_build_datapkt(struct tcpcb *tp, struct socket *so, int off,
438 long len, int hdrlen, struct mbuf **mp) 438 long len, int hdrlen, struct mbuf **mp)
439{ 439{
440 struct mbuf *m, *m0; 440 struct mbuf *m, *m0;
441 uint64_t *tcps; 441 uint64_t *tcps;
442 442
/*
 * Account for this segment: a forced 1-byte send counts as a probe, a send
 * below snd_max counts as a retransmission, anything else as a normal send.
 */
443 tcps = TCP_STAT_GETREF(); 443 tcps = TCP_STAT_GETREF();
444 if (tp->t_force && len == 1) 444 if (tp->t_force && len == 1)
445 tcps[TCP_STAT_SNDPROBE]++; 445 tcps[TCP_STAT_SNDPROBE]++;
446 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 446 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
447 tcps[TCP_STAT_SNDREXMITPACK]++; 447 tcps[TCP_STAT_SNDREXMITPACK]++;
448 tcps[TCP_STAT_SNDREXMITBYTE] += len; 448 tcps[TCP_STAT_SNDREXMITBYTE] += len;
449 } else { 449 } else {
450 tcps[TCP_STAT_SNDPACK]++; 450 tcps[TCP_STAT_SNDPACK]++;
451 tcps[TCP_STAT_SNDBYTE] += len; 451 tcps[TCP_STAT_SNDBYTE] += len;
452 } 452 }
453 TCP_STAT_PUTREF(); 453 TCP_STAT_PUTREF();
454#ifdef notyet 454#ifdef notyet
455 if ((m = m_copypack(so->so_snd.sb_mb, off, 455 if ((m = m_copypack(so->so_snd.sb_mb, off,
456 (int)len, max_linkhdr + hdrlen)) == 0) 456 (int)len, max_linkhdr + hdrlen)) == 0)
457 return (ENOBUFS); 457 return (ENOBUFS);
458 /* 458 /*
459 * m_copypack left space for our hdr; use it. 459 * m_copypack left space for our hdr; use it.
460 */ 460 */
461 m->m_len += hdrlen; 461 m->m_len += hdrlen;
462 m->m_data -= hdrlen; 462 m->m_data -= hdrlen;
463#else 463#else
464 MGETHDR(m, M_DONTWAIT, MT_HEADER); 464 MGETHDR(m, M_DONTWAIT, MT_HEADER);
465 if (__predict_false(m == NULL)) 465 if (__predict_false(m == NULL))
466 return (ENOBUFS); 466 return (ENOBUFS);
467 MCLAIM(m, &tcp_tx_mowner); 467 MCLAIM(m, &tcp_tx_mowner);
468 468
469 /* 469 /*
470 * XXX Because other code assumes headers will fit in 470 * XXX Because other code assumes headers will fit in
471 * XXX one header mbuf. 471 * XXX one header mbuf.
472 * 472 *
473 * (This code should almost *never* be run.) 473 * (This code should almost *never* be run.)
474 */ 474 */
475 if (__predict_false((max_linkhdr + hdrlen) > MHLEN)) { 475 if (__predict_false((max_linkhdr + hdrlen) > MHLEN)) {
476 TCP_OUTPUT_COUNTER_INCR(&tcp_output_bigheader); 476 TCP_OUTPUT_COUNTER_INCR(&tcp_output_bigheader);
477 MCLGET(m, M_DONTWAIT); 477 MCLGET(m, M_DONTWAIT);
478 if ((m->m_flags & M_EXT) == 0) { 478 if ((m->m_flags & M_EXT) == 0) {
479 m_freem(m); 479 m_freem(m);
480 return (ENOBUFS); 480 return (ENOBUFS);
481 } 481 }
482 } 482 }
483 483
/*
 * Leave max_linkhdr bytes unused at the front of the mbuf and make the
 * mbuf exactly hdrlen bytes long; payload, if copied inline below, is
 * placed right after those hdrlen bytes.
 */
484 m->m_data += max_linkhdr; 484 m->m_data += max_linkhdr;
485 m->m_len = hdrlen; 485 m->m_len = hdrlen;
486 486
487 /* 487 /*
488 * To avoid traversing the whole sb_mb chain for correct 488 * To avoid traversing the whole sb_mb chain for correct
489 * data to send, remember last sent mbuf, its offset and 489 * data to send, remember last sent mbuf, its offset and
490 * the sent size. When called the next time, see if the 490 * the sent size. When called the next time, see if the
491 * data to send is directly following the previous transfer. 491 * data to send is directly following the previous transfer.
492 * This is important for large TCP windows. 492 * This is important for large TCP windows.
493 */ 493 */
494 if (off == 0 || tp->t_lastm == NULL || 494 if (off == 0 || tp->t_lastm == NULL ||
495 (tp->t_lastoff + tp->t_lastlen) != off) { 495 (tp->t_lastoff + tp->t_lastlen) != off) {
496 TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_miss); 496 TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_miss);
497 /* 497 /*
498 * Either a new packet or a retransmit. 498 * Either a new packet or a retransmit.
499 * Start from the beginning. 499 * Start from the beginning.
500 */ 500 */
501 tp->t_lastm = so->so_snd.sb_mb; 501 tp->t_lastm = so->so_snd.sb_mb;
502 tp->t_inoff = off; 502 tp->t_inoff = off;
503 } else { 503 } else {
504 TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_hit); 504 TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_hit);
505 tp->t_inoff += tp->t_lastlen; 505 tp->t_inoff += tp->t_lastlen;
506 } 506 }
507 507
508 /* Traverse forward to next packet */ 508 /* Traverse forward to next packet */
509 while (tp->t_inoff > 0) { 509 while (tp->t_inoff > 0) {
510 if (tp->t_lastm == NULL) 510 if (tp->t_lastm == NULL)
511 panic("tp->t_lastm == NULL"); 511 panic("tp->t_lastm == NULL");
512 if (tp->t_inoff < tp->t_lastm->m_len) 512 if (tp->t_inoff < tp->t_lastm->m_len)
513 break; 513 break;
514 tp->t_inoff -= tp->t_lastm->m_len; 514 tp->t_inoff -= tp->t_lastm->m_len;
515 tp->t_lastm = tp->t_lastm->m_next; 515 tp->t_lastm = tp->t_lastm->m_next;
516 } 516 }
517 517
/*
 * Record this request (send-buffer offset and length) for the next call's
 * continuation test.  From here on, m0 is the mbuf located above and off is
 * reused as the offset inside that mbuf rather than inside the send buffer.
 */
518 tp->t_lastoff = off; 518 tp->t_lastoff = off;
519 tp->t_lastlen = len; 519 tp->t_lastlen = len;
520 m0 = tp->t_lastm; 520 m0 = tp->t_lastm;
521 off = tp->t_inoff; 521 off = tp->t_inoff;
522 522
/*
 * If the payload fits in the space left behind the header, copy it into
 * the header mbuf; otherwise hang an m_copym() copy of the data off m_next.
 */
523 if (len <= M_TRAILINGSPACE(m)) { 523 if (len <= M_TRAILINGSPACE(m)) {
524 m_copydata(m0, off, (int) len, mtod(m, char *) + hdrlen); 524 m_copydata(m0, off, (int) len, mtod(m, char *) + hdrlen);
525 m->m_len += len; 525 m->m_len += len;
526 TCP_OUTPUT_COUNTER_INCR(&tcp_output_copysmall); 526 TCP_OUTPUT_COUNTER_INCR(&tcp_output_copysmall);
527 } else { 527 } else {
528 m->m_next = m_copym(m0, off, (int) len, M_DONTWAIT); 528 m->m_next = m_copym(m0, off, (int) len, M_DONTWAIT);
529 if (m->m_next == NULL) { 529 if (m->m_next == NULL) {
530 m_freem(m); 530 m_freem(m);
531 return (ENOBUFS); 531 return (ENOBUFS);
532 } 532 }
533#ifdef TCP_OUTPUT_COUNTERS 533#ifdef TCP_OUTPUT_COUNTERS
534 if (m->m_next->m_flags & M_EXT) 534 if (m->m_next->m_flags & M_EXT)
535 TCP_OUTPUT_COUNTER_INCR(&tcp_output_refbig); 535 TCP_OUTPUT_COUNTER_INCR(&tcp_output_refbig);
536 else 536 else
537 TCP_OUTPUT_COUNTER_INCR(&tcp_output_copybig); 537 TCP_OUTPUT_COUNTER_INCR(&tcp_output_copybig);
538#endif /* TCP_OUTPUT_COUNTERS */ 538#endif /* TCP_OUTPUT_COUNTERS */
539 } 539 }
540#endif 540#endif
541 541
542 *mp = m; 542 *mp = m;
543 return (0); 543 return (0);
544} 544}
545 545
546/* 546/*
547 * Tcp output routine: figure out what should be sent and send it. 547 * Tcp output routine: figure out what should be sent and send it.
548 */ 548 */
549int 549int
550tcp_output(struct tcpcb *tp) 550tcp_output(struct tcpcb *tp)
551{ 551{
552 struct rtentry *rt; 552 struct rtentry *rt;
553 struct socket *so; 553 struct socket *so;
554 struct route *ro; 554 struct route *ro;
555 long len, win; 555 long len, win;
556 int off, flags, error; 556 int off, flags, error;
557 struct mbuf *m; 557 struct mbuf *m;
558 struct ip *ip; 558 struct ip *ip;
559#ifdef INET6 559#ifdef INET6
560 struct ip6_hdr *ip6; 560 struct ip6_hdr *ip6;
561#endif 561#endif
562 struct tcphdr *th; 562 struct tcphdr *th;
563 u_char opt[MAX_TCPOPTLEN]; 563 u_char opt[MAX_TCPOPTLEN];
564 unsigned optlen, hdrlen, packetlen; 564 unsigned optlen, hdrlen, packetlen;
565 unsigned int sack_numblks; 565 unsigned int sack_numblks;
566 int idle, sendalot, txsegsize, rxsegsize; 566 int idle, sendalot, txsegsize, rxsegsize;
567 int txsegsize_nosack; 567 int txsegsize_nosack;
568 int maxburst = TCP_MAXBURST; 568 int maxburst = TCP_MAXBURST;
569 int af; /* address family on the wire */ 569 int af; /* address family on the wire */
570 int iphdrlen; 570 int iphdrlen;
571 int has_tso4, has_tso6; 571 int has_tso4, has_tso6;
572 int has_tso, use_tso; 572 int has_tso, use_tso;
573 bool alwaysfrag; 573 bool alwaysfrag;
574 int sack_rxmit; 574 int sack_rxmit;
575 int sack_bytes_rxmt; 575 int sack_bytes_rxmt;
576 int ecn_tos; 576 int ecn_tos;
577 struct sackhole *p; 577 struct sackhole *p;
578#ifdef TCP_SIGNATURE 578#ifdef TCP_SIGNATURE
579 int sigoff = 0; 579 int sigoff = 0;
580#endif 580#endif
581 uint64_t *tcps; 581 uint64_t *tcps;
582 582
583#ifdef DIAGNOSTIC 583#ifdef DIAGNOSTIC
584 if (tp->t_inpcb && tp->t_in6pcb) 584 if (tp->t_inpcb && tp->t_in6pcb)
585 panic("tcp_output: both t_inpcb and t_in6pcb are set"); 585 panic("tcp_output: both t_inpcb and t_in6pcb are set");
586#endif 586#endif
587 so = NULL; 587 so = NULL;
588 ro = NULL; 588 ro = NULL;
589 if (tp->t_inpcb) { 589 if (tp->t_inpcb) {
590 so = tp->t_inpcb->inp_socket; 590 so = tp->t_inpcb->inp_socket;
591 ro = &tp->t_inpcb->inp_route; 591 ro = &tp->t_inpcb->inp_route;
592 } 592 }
593#ifdef INET6 593#ifdef INET6
594 else if (tp->t_in6pcb) { 594 else if (tp->t_in6pcb) {
595 so = tp->t_in6pcb->in6p_socket; 595 so = tp->t_in6pcb->in6p_socket;
596 ro = &tp->t_in6pcb->in6p_route; 596 ro = &tp->t_in6pcb->in6p_route;
597 } 597 }
598#endif 598#endif
599 599
600 switch (af = tp->t_family) { 600 switch (af = tp->t_family) {
601#ifdef INET 601#ifdef INET
602 case AF_INET: 602 case AF_INET:
603 if (tp->t_inpcb) 603 if (tp->t_inpcb)
604 break; 604 break;
605#ifdef INET6 605#ifdef INET6
606 /* mapped addr case */ 606 /* mapped addr case */
607 if (tp->t_in6pcb) 607 if (tp->t_in6pcb)
608 break; 608 break;
609#endif 609#endif
610 return (EINVAL); 610 return (EINVAL);
611#endif 611#endif
612#ifdef INET6 612#ifdef INET6
613 case AF_INET6: 613 case AF_INET6:
614 if (tp->t_in6pcb) 614 if (tp->t_in6pcb)
615 break; 615 break;
616 return (EINVAL); 616 return (EINVAL);
617#endif 617#endif
618 default: 618 default:
619 return (EAFNOSUPPORT); 619 return (EAFNOSUPPORT);
620 } 620 }
621 621
622 if (tcp_segsize(tp, &txsegsize, &rxsegsize, &alwaysfrag)) 622 if (tcp_segsize(tp, &txsegsize, &rxsegsize, &alwaysfrag))
623 return (EMSGSIZE); 623 return (EMSGSIZE);
624 624
625 idle = (tp->snd_max == tp->snd_una); 625 idle = (tp->snd_max == tp->snd_una);
626 626
627 /* 627 /*
628 * Determine if we can use TCP segmentation offload: 628 * Determine if we can use TCP segmentation offload:
629 * - If we're using IPv4 629 * - If we're using IPv4
630 * - If there is not an IPsec policy that prevents it 630 * - If there is not an IPsec policy that prevents it
631 * - If the interface can do it 631 * - If the interface can do it
632 */ 632 */
633 has_tso4 = has_tso6 = false; 633 has_tso4 = has_tso6 = false;
634#if defined(INET) 634#if defined(INET)
635 has_tso4 = tp->t_inpcb != NULL && 635 has_tso4 = tp->t_inpcb != NULL &&
636#if defined(IPSEC) || defined(FAST_IPSEC) 636#if defined(IPSEC) || defined(FAST_IPSEC)
637 IPSEC_PCB_SKIP_IPSEC(tp->t_inpcb->inp_sp, 637 IPSEC_PCB_SKIP_IPSEC(tp->t_inpcb->inp_sp,
638 IPSEC_DIR_OUTBOUND) && 638 IPSEC_DIR_OUTBOUND) &&
639#endif 639#endif
640 (rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL && 640 (rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL &&
641 (rt->rt_ifp->if_capenable & IFCAP_TSOv4) != 0; 641 (rt->rt_ifp->if_capenable & IFCAP_TSOv4) != 0;
642#endif /* defined(INET) */ 642#endif /* defined(INET) */
643#if defined(INET6) 643#if defined(INET6)
644 has_tso6 = tp->t_in6pcb != NULL && 644 has_tso6 = tp->t_in6pcb != NULL &&
645#if defined(IPSEC) || defined(FAST_IPSEC) 645#if defined(IPSEC) || defined(FAST_IPSEC)
646 IPSEC_PCB_SKIP_IPSEC(tp->t_in6pcb->in6p_sp, 646 IPSEC_PCB_SKIP_IPSEC(tp->t_in6pcb->in6p_sp,
647 IPSEC_DIR_OUTBOUND) && 647 IPSEC_DIR_OUTBOUND) &&
648#endif 648#endif
649 (rt = rtcache_validate(&tp->t_in6pcb->in6p_route)) != NULL && 649 (rt = rtcache_validate(&tp->t_in6pcb->in6p_route)) != NULL &&
650 (rt->rt_ifp->if_capenable & IFCAP_TSOv6) != 0; 650 (rt->rt_ifp->if_capenable & IFCAP_TSOv6) != 0;
651#endif /* defined(INET6) */ 651#endif /* defined(INET6) */
652 has_tso = (has_tso4 || has_tso6) && !alwaysfrag; 652 has_tso = (has_tso4 || has_tso6) && !alwaysfrag;
653 653
654 /* 654 /*
655 * Restart Window computation. From draft-floyd-incr-init-win-03: 655 * Restart Window computation. From draft-floyd-incr-init-win-03:
656 * 656 *
657 * Optionally, a TCP MAY set the restart window to the 657 * Optionally, a TCP MAY set the restart window to the
658 * minimum of the value used for the initial window and 658 * minimum of the value used for the initial window and
659 * the current value of cwnd (in other words, using a 659 * the current value of cwnd (in other words, using a
660 * larger value for the restart window should never increase 660 * larger value for the restart window should never increase
661 * the size of cwnd). 661 * the size of cwnd).
662 */ 662 */
663 if (tcp_cwm) { 663 if (tcp_cwm) {
664 /* 664 /*
665 * Hughes/Touch/Heidemann Congestion Window Monitoring. 665 * Hughes/Touch/Heidemann Congestion Window Monitoring.
666 * Count the number of packets currently pending 666 * Count the number of packets currently pending
667 * acknowledgement, and limit our congestion window 667 * acknowledgement, and limit our congestion window
668 * to a pre-determined allowed burst size plus that count. 668 * to a pre-determined allowed burst size plus that count.
669 * This prevents bursting once all pending packets have 669 * This prevents bursting once all pending packets have
670 * been acknowledged (i.e. transmission is idle). 670 * been acknowledged (i.e. transmission is idle).
671 * 671 *
672 * XXX Link this to Initial Window? 672 * XXX Link this to Initial Window?
673 */ 673 */
674 tp->snd_cwnd = min(tp->snd_cwnd, 674 tp->snd_cwnd = min(tp->snd_cwnd,
675 (tcp_cwm_burstsize * txsegsize) + 675 (tcp_cwm_burstsize * txsegsize) +
676 (tp->snd_nxt - tp->snd_una)); 676 (tp->snd_nxt - tp->snd_una));
677 } else { 677 } else {
678 if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur) { 678 if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur) {
679 /* 679 /*
680 * We have been idle for "a while" and no acks are 680 * We have been idle for "a while" and no acks are
681 * expected to clock out any data we send -- 681 * expected to clock out any data we send --
682 * slow start to get ack "clock" running again. 682 * slow start to get ack "clock" running again.
683 */ 683 */
684 int ss = tcp_init_win; 684 int ss = tcp_init_win;
685#ifdef INET 685#ifdef INET
686 if (tp->t_inpcb && 686 if (tp->t_inpcb &&
687 in_localaddr(tp->t_inpcb->inp_faddr)) 687 in_localaddr(tp->t_inpcb->inp_faddr))
688 ss = tcp_init_win_local; 688 ss = tcp_init_win_local;
689#endif 689#endif
690#ifdef INET6 690#ifdef INET6
691 if (tp->t_in6pcb && 691 if (tp->t_in6pcb &&
692 in6_localaddr(&tp->t_in6pcb->in6p_faddr)) 692 in6_localaddr(&tp->t_in6pcb->in6p_faddr))
693 ss = tcp_init_win_local; 693 ss = tcp_init_win_local;
694#endif 694#endif
695 tp->snd_cwnd = min(tp->snd_cwnd, 695 tp->snd_cwnd = min(tp->snd_cwnd,
696 TCP_INITIAL_WINDOW(ss, txsegsize)); 696 TCP_INITIAL_WINDOW(ss, txsegsize));
697 } 697 }
698 } 698 }
699 699
700 txsegsize_nosack = txsegsize; 700 txsegsize_nosack = txsegsize;
701again: 701again:
702 ecn_tos = 0; 702 ecn_tos = 0;
703 use_tso = has_tso; 703 use_tso = has_tso;
704 if ((tp->t_flags & (TF_ECN_SND_CWR|TF_ECN_SND_ECE)) != 0) { 704 if ((tp->t_flags & (TF_ECN_SND_CWR|TF_ECN_SND_ECE)) != 0) {
705 /* don't duplicate CWR/ECE. */ 705 /* don't duplicate CWR/ECE. */
706 use_tso = 0; 706 use_tso = 0;
707 } 707 }
708 TCP_REASS_LOCK(tp); 708 TCP_REASS_LOCK(tp);
709 sack_numblks = tcp_sack_numblks(tp); 709 sack_numblks = tcp_sack_numblks(tp);
710 if (sack_numblks) { 710 if (sack_numblks) {
711 int sackoptlen; 711 int sackoptlen;
712 712
713 sackoptlen = TCP_SACK_OPTLEN(sack_numblks); 713 sackoptlen = TCP_SACK_OPTLEN(sack_numblks);
714 if (sackoptlen > txsegsize_nosack) { 714 if (sackoptlen > txsegsize_nosack) {
715 sack_numblks = 0; /* give up SACK */ 715 sack_numblks = 0; /* give up SACK */
716 txsegsize = txsegsize_nosack; 716 txsegsize = txsegsize_nosack;
717 } else { 717 } else {
718 if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) { 718 if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
719 /* don't duplicate D-SACK. */ 719 /* don't duplicate D-SACK. */
720 use_tso = 0; 720 use_tso = 0;
721 } 721 }
722 txsegsize = txsegsize_nosack - sackoptlen; 722 txsegsize = txsegsize_nosack - sackoptlen;
723 } 723 }
724 } else { 724 } else {
725 txsegsize = txsegsize_nosack; 725 txsegsize = txsegsize_nosack;
726 } 726 }
727 727
728 /* 728 /*
729 * Determine length of data that should be transmitted, and 729 * Determine length of data that should be transmitted, and
730 * flags that should be used. If there is some data or critical 730 * flags that should be used. If there is some data or critical
731 * controls (SYN, RST) to send, then transmit; otherwise, 731 * controls (SYN, RST) to send, then transmit; otherwise,
732 * investigate further. 732 * investigate further.
733 * 733 *
734 * Readjust SACK information to avoid resending duplicate data. 734 * Readjust SACK information to avoid resending duplicate data.
735 */ 735 */
736 if (TCP_SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max)) 736 if (TCP_SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
737 tcp_sack_adjust(tp); 737 tcp_sack_adjust(tp);
738 sendalot = 0; 738 sendalot = 0;
739 off = tp->snd_nxt - tp->snd_una; 739 off = tp->snd_nxt - tp->snd_una;
740 win = min(tp->snd_wnd, tp->snd_cwnd); 740 win = min(tp->snd_wnd, tp->snd_cwnd);
741 741
742 flags = tcp_outflags[tp->t_state]; 742 flags = tcp_outflags[tp->t_state];
743 743
744 /* 744 /*
745 * Send any SACK-generated retransmissions. If we're explicitly trying 745 * Send any SACK-generated retransmissions. If we're explicitly trying
746 * to send out new data (when sendalot is 1), bypass this function. 746 * to send out new data (when sendalot is 1), bypass this function.
747 * If we retransmit in fast recovery mode, decrement snd_cwnd, since 747 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
748 * we're replacing a (future) new transmission with a retransmission 748 * we're replacing a (future) new transmission with a retransmission
749 * now, and we previously incremented snd_cwnd in tcp_input(). 749 * now, and we previously incremented snd_cwnd in tcp_input().
750 */ 750 */
751 /* 751 /*
752 * Still in sack recovery , reset rxmit flag to zero. 752 * Still in sack recovery , reset rxmit flag to zero.
753 */ 753 */
754 sack_rxmit = 0; 754 sack_rxmit = 0;
755 sack_bytes_rxmt = 0; 755 sack_bytes_rxmt = 0;
756 len = 0; 756 len = 0;
757 p = NULL; 757 p = NULL;
758 do { 758 do {
759 long cwin; 759 long cwin;
760 if (!TCP_SACK_ENABLED(tp)) 760 if (!TCP_SACK_ENABLED(tp))
761 break; 761 break;
762 if (tp->t_partialacks < 0)  762 if (tp->t_partialacks < 0)
763 break; 763 break;
764 p = tcp_sack_output(tp, &sack_bytes_rxmt); 764 p = tcp_sack_output(tp, &sack_bytes_rxmt);
765 if (p == NULL) 765 if (p == NULL)
766 break; 766 break;
767  767
768 cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; 768 cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
769 if (cwin < 0) 769 if (cwin < 0)
770 cwin = 0; 770 cwin = 0;
771 /* Do not retransmit SACK segments beyond snd_recover */ 771 /* Do not retransmit SACK segments beyond snd_recover */
772 if (SEQ_GT(p->end, tp->snd_recover)) { 772 if (SEQ_GT(p->end, tp->snd_recover)) {
773 /* 773 /*
774 * (At least) part of sack hole extends beyond 774 * (At least) part of sack hole extends beyond
775 * snd_recover. Check to see if we can rexmit data 775 * snd_recover. Check to see if we can rexmit data
776 * for this hole. 776 * for this hole.
777 */ 777 */
778 if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { 778 if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
779 /* 779 /*
780 * Can't rexmit any more data for this hole. 780 * Can't rexmit any more data for this hole.
781 * That data will be rexmitted in the next 781 * That data will be rexmitted in the next
782 * sack recovery episode, when snd_recover 782 * sack recovery episode, when snd_recover
783 * moves past p->rxmit. 783 * moves past p->rxmit.
784 */ 784 */
785 p = NULL; 785 p = NULL;
786 break; 786 break;
787 } 787 }
788 /* Can rexmit part of the current hole */ 788 /* Can rexmit part of the current hole */
789 len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit)); 789 len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit));
790 } else 790 } else
791 len = ((long)ulmin(cwin, p->end - p->rxmit)); 791 len = ((long)ulmin(cwin, p->end - p->rxmit));
792 off = p->rxmit - tp->snd_una; 792 off = p->rxmit - tp->snd_una;
793 if (off + len > so->so_snd.sb_cc) { 793 if (off + len > so->so_snd.sb_cc) {
794 /* 1 for TH_FIN */ 794 /* 1 for TH_FIN */
795 KASSERT(off + len == so->so_snd.sb_cc + 1); 795 KASSERT(off + len == so->so_snd.sb_cc + 1);
796 KASSERT(p->rxmit + len == tp->snd_max); 796 KASSERT(p->rxmit + len == tp->snd_max);
797 len = so->so_snd.sb_cc - off; 797 len = so->so_snd.sb_cc - off;
798 } 798 }
799 if (len > 0) { 799 if (len > 0) {
800 sack_rxmit = 1; 800 sack_rxmit = 1;
801 sendalot = 1; 801 sendalot = 1;
802 } 802 }
803 } while (/*CONSTCOND*/0); 803 } while (/*CONSTCOND*/0);
804 804
805 /* 805 /*
806 * If in persist timeout with window of 0, send 1 byte. 806 * If in persist timeout with window of 0, send 1 byte.
807 * Otherwise, if window is small but nonzero 807 * Otherwise, if window is small but nonzero
808 * and timer expired, we will send what we can 808 * and timer expired, we will send what we can
809 * and go to transmit state. 809 * and go to transmit state.
810 */ 810 */
811 if (tp->t_force) { 811 if (tp->t_force) {
812 if (win == 0) { 812 if (win == 0) {
813 /* 813 /*
814 * If we still have some data to send, then 814 * If we still have some data to send, then
815 * clear the FIN bit. Usually this would 815 * clear the FIN bit. Usually this would
816 * happen below when it realizes that we 816 * happen below when it realizes that we
817 * aren't sending all the data. However, 817 * aren't sending all the data. However,
818 * if we have exactly 1 byte of unsent data, 818 * if we have exactly 1 byte of unsent data,
819 * then it won't clear the FIN bit below, 819 * then it won't clear the FIN bit below,
820 * and if we are in persist state, we wind 820 * and if we are in persist state, we wind
821 * up sending the packet without recording 821 * up sending the packet without recording
822 * that we sent the FIN bit. 822 * that we sent the FIN bit.
823 * 823 *
824 * We can't just blindly clear the FIN bit, 824 * We can't just blindly clear the FIN bit,
825 * because if we don't have any more data 825 * because if we don't have any more data
826 * to send then the probe will be the FIN 826 * to send then the probe will be the FIN
827 * itself. 827 * itself.
828 */ 828 */
829 if (off < so->so_snd.sb_cc) 829 if (off < so->so_snd.sb_cc)
830 flags &= ~TH_FIN; 830 flags &= ~TH_FIN;
831 win = 1; 831 win = 1;
832 } else { 832 } else {
833 TCP_TIMER_DISARM(tp, TCPT_PERSIST); 833 TCP_TIMER_DISARM(tp, TCPT_PERSIST);
834 tp->t_rxtshift = 0; 834 tp->t_rxtshift = 0;
835 } 835 }
836 } 836 }
837 837
838 if (sack_rxmit == 0) { 838 if (sack_rxmit == 0) {
839 if (TCP_SACK_ENABLED(tp) && tp->t_partialacks >= 0) { 839 if (TCP_SACK_ENABLED(tp) && tp->t_partialacks >= 0) {
840 long cwin; 840 long cwin;
841 841
842 /* 842 /*
843 * We are inside of a SACK recovery episode and are 843 * We are inside of a SACK recovery episode and are
844 * sending new data, having retransmitted all the 844 * sending new data, having retransmitted all the
845 * data possible in the scoreboard. 845 * data possible in the scoreboard.
846 */ 846 */
847 if (tp->snd_wnd < so->so_snd.sb_cc) { 847 if (tp->snd_wnd < so->so_snd.sb_cc) {
848 len = tp->snd_wnd - off; 848 len = tp->snd_wnd - off;
849 flags &= ~TH_FIN; 849 flags &= ~TH_FIN;
850 } else { 850 } else {
851 len = so->so_snd.sb_cc - off; 851 len = so->so_snd.sb_cc - off;
852 } 852 }
853 853
854 /* 854 /*
855 * From FreeBSD: 855 * From FreeBSD:
856 * Don't remove this (len > 0) check ! 856 * Don't remove this (len > 0) check !
857 * We explicitly check for len > 0 here (although it  857 * We explicitly check for len > 0 here (although it
858 * isn't really necessary), to work around a gcc  858 * isn't really necessary), to work around a gcc
859 * optimization issue - to force gcc to compute 859 * optimization issue - to force gcc to compute
860 * len above. Without this check, the computation 860 * len above. Without this check, the computation
861 * of len is bungled by the optimizer. 861 * of len is bungled by the optimizer.
862 */ 862 */
863 if (len > 0) { 863 if (len > 0) {
864 cwin = tp->snd_cwnd -  864 cwin = tp->snd_cwnd -
865 (tp->snd_nxt - tp->sack_newdata) - 865 (tp->snd_nxt - tp->sack_newdata) -
866 sack_bytes_rxmt; 866 sack_bytes_rxmt;
867 if (cwin < 0) 867 if (cwin < 0)
868 cwin = 0; 868 cwin = 0;
869 if (cwin < len) { 869 if (cwin < len) {
870 len = cwin; 870 len = cwin;
871 flags &= ~TH_FIN; 871 flags &= ~TH_FIN;
872 } 872 }
873 } 873 }
874 } else if (win < so->so_snd.sb_cc) { 874 } else if (win < so->so_snd.sb_cc) {
875 len = win - off; 875 len = win - off;
876 flags &= ~TH_FIN; 876 flags &= ~TH_FIN;
877 } else { 877 } else {
878 len = so->so_snd.sb_cc - off; 878 len = so->so_snd.sb_cc - off;
879 } 879 }
880 } 880 }
881 881
882 if (len < 0) { 882 if (len < 0) {
883 /* 883 /*
884 * If FIN has been sent but not acked, 884 * If FIN has been sent but not acked,
885 * but we haven't been called to retransmit, 885 * but we haven't been called to retransmit,
886 * len will be -1. Otherwise, window shrank 886 * len will be -1. Otherwise, window shrank
887 * after we sent into it. If window shrank to 0, 887 * after we sent into it. If window shrank to 0,
888 * cancel pending retransmit, pull snd_nxt back 888 * cancel pending retransmit, pull snd_nxt back
889 * to (closed) window, and set the persist timer 889 * to (closed) window, and set the persist timer
890 * if it isn't already going. If the window didn't 890 * if it isn't already going. If the window didn't
891 * close completely, just wait for an ACK. 891 * close completely, just wait for an ACK.
892 * 892 *
893 * If we have a pending FIN, either it has already been 893 * If we have a pending FIN, either it has already been
894 * transmitted or it is outside the window, so drop it. 894 * transmitted or it is outside the window, so drop it.
895 * If the FIN has been transmitted, but this is not a 895 * If the FIN has been transmitted, but this is not a
896 * retransmission, then len must be -1. Therefore we also 896 * retransmission, then len must be -1. Therefore we also
897 * prevent here the sending of `gratuitous FINs'. This 897 * prevent here the sending of `gratuitous FINs'. This
898 * eliminates the need to check for that case below (e.g. 898 * eliminates the need to check for that case below (e.g.
899 * to back up snd_nxt before the FIN so that the sequence 899 * to back up snd_nxt before the FIN so that the sequence
900 * number is correct). 900 * number is correct).
901 */ 901 */
902 len = 0; 902 len = 0;
903 flags &= ~TH_FIN; 903 flags &= ~TH_FIN;
904 if (win == 0) { 904 if (win == 0) {
905 TCP_TIMER_DISARM(tp, TCPT_REXMT); 905 TCP_TIMER_DISARM(tp, TCPT_REXMT);
906 tp->t_rxtshift = 0; 906 tp->t_rxtshift = 0;
907 tp->snd_nxt = tp->snd_una; 907 tp->snd_nxt = tp->snd_una;
908 if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 908 if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
909 tcp_setpersist(tp); 909 tcp_setpersist(tp);
910 } 910 }
911 } 911 }
912 912
913 /* 913 /*
914 * Automatic sizing enables the performance of large buffers 914 * Automatic sizing enables the performance of large buffers
915 * and most of the efficiency of small ones by only allocating 915 * and most of the efficiency of small ones by only allocating
916 * space when it is needed. 916 * space when it is needed.
917 * 917 *
918 * The criteria to step up the send buffer one notch are: 918 * The criteria to step up the send buffer one notch are:
919 * 1. receive window of remote host is larger than send buffer 919 * 1. receive window of remote host is larger than send buffer
920 * (with a fudge factor of 5/4th); 920 * (with a fudge factor of 5/4th);
921 * 2. send buffer is filled to 7/8th with data (so we actually 921 * 2. send buffer is filled to 7/8th with data (so we actually
922 * have data to make use of it); 922 * have data to make use of it);
923 * 3. send buffer fill has not hit maximal automatic size; 923 * 3. send buffer fill has not hit maximal automatic size;
924 * 4. our send window (slow start and congestion controlled) is 924 * 4. our send window (slow start and congestion controlled) is
925 * larger than sent but unacknowledged data in send buffer. 925 * larger than sent but unacknowledged data in send buffer.
926 * 926 *
927 * The remote host receive window scaling factor may limit the 927 * The remote host receive window scaling factor may limit the
928 * growing of the send buffer before it reaches its allowed 928 * growing of the send buffer before it reaches its allowed
929 * maximum. 929 * maximum.
930 * 930 *
931 * It scales directly with slow start or congestion window 931 * It scales directly with slow start or congestion window
932 * and does at most one step per received ACK. This fast 932 * and does at most one step per received ACK. This fast
933 * scaling has the drawback of growing the send buffer beyond 933 * scaling has the drawback of growing the send buffer beyond
934 * what is strictly necessary to make full use of a given 934 * what is strictly necessary to make full use of a given
935 * delay*bandwidth product. However testing has shown this not 935 * delay*bandwidth product. However testing has shown this not
936 * to be much of a problem. At worst we are trading wasting 936 * to be much of a problem. At worst we are trading wasting
937 * of available bandwidth (the non-use of it) for wasting some 937 * of available bandwidth (the non-use of it) for wasting some
938 * socket buffer memory. 938 * socket buffer memory.
939 * 939 *
940 * TODO: Shrink send buffer during idle periods together 940 * TODO: Shrink send buffer during idle periods together
941 * with congestion window. Requires another timer. 941 * with congestion window. Requires another timer.
942 */ 942 */
943 if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { 943 if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
944 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && 944 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
945 so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && 945 so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
946 so->so_snd.sb_cc < tcp_autosndbuf_max && 946 so->so_snd.sb_cc < tcp_autosndbuf_max &&
947 win >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { 947 win >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
948 if (!sbreserve(&so->so_snd, 948 if (!sbreserve(&so->so_snd,
949 min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc, 949 min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
950 tcp_autosndbuf_max), so)) 950 tcp_autosndbuf_max), so))
951 so->so_snd.sb_flags &= ~SB_AUTOSIZE; 951 so->so_snd.sb_flags &= ~SB_AUTOSIZE;
952 } 952 }
953 } 953 }
954 954
955 if (len > txsegsize) { 955 if (len > txsegsize) {
956 if (use_tso) { 956 if (use_tso) {
957 /* 957 /*
958 * Truncate TSO transfers to IP_MAXPACKET, and make 958 * Truncate TSO transfers to IP_MAXPACKET, and make
959 * sure that we send equal size transfers down the 959 * sure that we send equal size transfers down the
960 * stack (rather than big-small-big-small-...). 960 * stack (rather than big-small-big-small-...).
961 */ 961 */
962#ifdef INET6 962#ifdef INET6
963#if IPV6_MAXPACKET != IP_MAXPACKET 963#if IPV6_MAXPACKET != IP_MAXPACKET
964#error IPV6_MAXPACKET != IP_MAXPACKET 964#error IPV6_MAXPACKET != IP_MAXPACKET
965#endif 965#endif
966#endif 966#endif
967 len = (min(len, IP_MAXPACKET) / txsegsize) * txsegsize; 967 len = (min(len, IP_MAXPACKET) / txsegsize) * txsegsize;
968 if (len <= txsegsize) { 968 if (len <= txsegsize) {
969 use_tso = 0; 969 use_tso = 0;
970 } 970 }
971 } else 971 } else
972 len = txsegsize; 972 len = txsegsize;
973 flags &= ~TH_FIN; 973 flags &= ~TH_FIN;
974 sendalot = 1; 974 sendalot = 1;
975 } else 975 } else
976 use_tso = 0; 976 use_tso = 0;
977 if (sack_rxmit) { 977 if (sack_rxmit) {
978 if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) 978 if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
979 flags &= ~TH_FIN; 979 flags &= ~TH_FIN;
980 } 980 }
981 981
982 win = sbspace(&so->so_rcv); 982 win = sbspace(&so->so_rcv);
983 983
984 /* 984 /*
985 * Sender silly window avoidance. If connection is idle 985 * Sender silly window avoidance. If connection is idle
986 * and can send all data, a maximum segment, 986 * and can send all data, a maximum segment,
987 * at least a maximum default-size segment do it, 987 * at least a maximum default-size segment do it,
988 * or are forced, do it; otherwise don't bother. 988 * or are forced, do it; otherwise don't bother.
989 * If peer's buffer is tiny, then send 989 * If peer's buffer is tiny, then send
990 * when window is at least half open. 990 * when window is at least half open.
991 * If retransmitting (possibly after persist timer forced us 991 * If retransmitting (possibly after persist timer forced us
992 * to send into a small window), then must resend. 992 * to send into a small window), then must resend.
993 */ 993 */
994 if (len) { 994 if (len) {
995 if (len >= txsegsize) 995 if (len >= txsegsize)
996 goto send; 996 goto send;
997 if ((so->so_state & SS_MORETOCOME) == 0 && 997 if ((so->so_state & SS_MORETOCOME) == 0 &&
998 ((idle || tp->t_flags & TF_NODELAY) && 998 ((idle || tp->t_flags & TF_NODELAY) &&
999 len + off >= so->so_snd.sb_cc)) 999 len + off >= so->so_snd.sb_cc))
1000 goto send; 1000 goto send;
1001 if (tp->t_force) 1001 if (tp->t_force)
1002 goto send; 1002 goto send;
1003 if (len >= tp->max_sndwnd / 2) 1003 if (len >= tp->max_sndwnd / 2)
1004 goto send; 1004 goto send;
1005 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 1005 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
1006 goto send; 1006 goto send;
1007 if (sack_rxmit) 1007 if (sack_rxmit)
1008 goto send; 1008 goto send;
1009 } 1009 }
1010 1010
1011 /* 1011 /*
1012 * Compare available window to amount of window known to peer 1012 * Compare available window to amount of window known to peer
1013 * (as advertised window less next expected input). If the 1013 * (as advertised window less next expected input). If the
1014 * difference is at least twice the size of the largest segment 1014 * difference is at least twice the size of the largest segment
1015 * we expect to receive (i.e. two segments) or at least 50% of 1015 * we expect to receive (i.e. two segments) or at least 50% of
1016 * the maximum possible window, then want to send a window update 1016 * the maximum possible window, then want to send a window update
1017 * to peer. 1017 * to peer.
1018 */ 1018 */
1019 if (win > 0) { 1019 if (win > 0) {
1020 /* 1020 /*
1021 * "adv" is the amount we can increase the window, 1021 * "adv" is the amount we can increase the window,
1022 * taking into account that we are limited by 1022 * taking into account that we are limited by
1023 * TCP_MAXWIN << tp->rcv_scale. 1023 * TCP_MAXWIN << tp->rcv_scale.
1024 */ 1024 */
1025 long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) - 1025 long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
1026 (tp->rcv_adv - tp->rcv_nxt); 1026 (tp->rcv_adv - tp->rcv_nxt);
1027 1027
1028 if (adv >= (long) (2 * rxsegsize)) 1028 if (adv >= (long) (2 * rxsegsize))
1029 goto send; 1029 goto send;
1030 if (2 * adv >= (long) so->so_rcv.sb_hiwat) 1030 if (2 * adv >= (long) so->so_rcv.sb_hiwat)
1031 goto send; 1031 goto send;
1032 } 1032 }
1033 1033
1034 /* 1034 /*
1035 * Send if we owe peer an ACK. 1035 * Send if we owe peer an ACK.
1036 */ 1036 */
1037 if (tp->t_flags & TF_ACKNOW) 1037 if (tp->t_flags & TF_ACKNOW)
1038 goto send; 1038 goto send;
1039 if (flags & (TH_SYN|TH_FIN|TH_RST)) 1039 if (flags & (TH_SYN|TH_FIN|TH_RST))
1040 goto send; 1040 goto send;
1041 if (SEQ_GT(tp->snd_up, tp->snd_una)) 1041 if (SEQ_GT(tp->snd_up, tp->snd_una))
1042 goto send; 1042 goto send;
1043 /* 1043 /*
1044 * In SACK, it is possible for tcp_output to fail to send a segment 1044 * In SACK, it is possible for tcp_output to fail to send a segment
1045 * after the retransmission timer has been turned off. Make sure 1045 * after the retransmission timer has been turned off. Make sure
1046 * that the retransmission timer is set. 1046 * that the retransmission timer is set.
1047 */ 1047 */
1048 if (TCP_SACK_ENABLED(tp) && SEQ_GT(tp->snd_max, tp->snd_una) && 1048 if (TCP_SACK_ENABLED(tp) && SEQ_GT(tp->snd_max, tp->snd_una) &&
1049 !TCP_TIMER_ISARMED(tp, TCPT_REXMT) && 1049 !TCP_TIMER_ISARMED(tp, TCPT_REXMT) &&
1050 !TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) { 1050 !TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
1051 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1051 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
1052 goto just_return; 1052 goto just_return;
1053 } 1053 }
1054 1054
1055 /* 1055 /*
1056 * TCP window updates are not reliable, rather a polling protocol 1056 * TCP window updates are not reliable, rather a polling protocol
1057 * using ``persist'' packets is used to insure receipt of window 1057 * using ``persist'' packets is used to insure receipt of window
1058 * updates. The three ``states'' for the output side are: 1058 * updates. The three ``states'' for the output side are:
1059 * idle not doing retransmits or persists 1059 * idle not doing retransmits or persists
1060 * persisting to move a small or zero window 1060 * persisting to move a small or zero window
1061 * (re)transmitting and thereby not persisting 1061 * (re)transmitting and thereby not persisting
1062 * 1062 *
1063 * tp->t_timer[TCPT_PERSIST] 1063 * tp->t_timer[TCPT_PERSIST]
1064 * is set when we are in persist state. 1064 * is set when we are in persist state.
1065 * tp->t_force 1065 * tp->t_force
1066 * is set when we are called to send a persist packet. 1066 * is set when we are called to send a persist packet.
1067 * tp->t_timer[TCPT_REXMT] 1067 * tp->t_timer[TCPT_REXMT]
1068 * is set when we are retransmitting 1068 * is set when we are retransmitting
1069 * The output side is idle when both timers are zero. 1069 * The output side is idle when both timers are zero.
1070 * 1070 *
1071 * If send window is too small, there is data to transmit, and no 1071 * If send window is too small, there is data to transmit, and no
1072 * retransmit or persist is pending, then go to persist state. 1072 * retransmit or persist is pending, then go to persist state.
1073 * If nothing happens soon, send when timer expires: 1073 * If nothing happens soon, send when timer expires:
1074 * if window is nonzero, transmit what we can, 1074 * if window is nonzero, transmit what we can,
1075 * otherwise force out a byte. 1075 * otherwise force out a byte.
1076 */ 1076 */
1077 if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 && 1077 if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
1078 TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { 1078 TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
1079 tp->t_rxtshift = 0; 1079 tp->t_rxtshift = 0;
1080 tcp_setpersist(tp); 1080 tcp_setpersist(tp);
1081 } 1081 }
1082 1082
1083 /* 1083 /*
1084 * No reason to send a segment, just return. 1084 * No reason to send a segment, just return.
1085 */ 1085 */
1086just_return: 1086just_return:
1087 TCP_REASS_UNLOCK(tp); 1087 TCP_REASS_UNLOCK(tp);
1088 return (0); 1088 return (0);
1089 1089
1090send: 1090send:
1091 /* 1091 /*
1092 * Before ESTABLISHED, force sending of initial options 1092 * Before ESTABLISHED, force sending of initial options
1093 * unless TCP set not to do any options. 1093 * unless TCP set not to do any options.
1094 * NOTE: we assume that the IP/TCP header plus TCP options 1094 * NOTE: we assume that the IP/TCP header plus TCP options
1095 * always fit in a single mbuf, leaving room for a maximum 1095 * always fit in a single mbuf, leaving room for a maximum
1096 * link header, i.e. 1096 * link header, i.e.
1097 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES 1097 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
1098 */ 1098 */
1099 optlen = 0; 1099 optlen = 0;
1100 switch (af) { 1100 switch (af) {
1101#ifdef INET 1101#ifdef INET
1102 case AF_INET: 1102 case AF_INET:
1103 iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr); 1103 iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
1104 break; 1104 break;
1105#endif 1105#endif
1106#ifdef INET6 1106#ifdef INET6
1107 case AF_INET6: 1107 case AF_INET6:
1108 iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 1108 iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1109 break; 1109 break;
1110#endif 1110#endif
1111 default: /*pacify gcc*/ 1111 default: /*pacify gcc*/
1112 iphdrlen = 0; 1112 iphdrlen = 0;
1113 break; 1113 break;
1114 } 1114 }
1115 hdrlen = iphdrlen; 1115 hdrlen = iphdrlen;
1116 if (flags & TH_SYN) { 1116 if (flags & TH_SYN) {
1117 struct rtentry *synrt; 1117 struct rtentry *synrt;
1118 1118
1119 synrt = NULL; 1119 synrt = NULL;
1120#ifdef INET 1120#ifdef INET
1121 if (tp->t_inpcb) 1121 if (tp->t_inpcb)
1122 synrt = in_pcbrtentry(tp->t_inpcb); 1122 synrt = in_pcbrtentry(tp->t_inpcb);
1123#endif 1123#endif
1124#ifdef INET6 1124#ifdef INET6
1125 if (tp->t_in6pcb) 1125 if (tp->t_in6pcb)
1126 synrt = in6_pcbrtentry(tp->t_in6pcb); 1126 synrt = in6_pcbrtentry(tp->t_in6pcb);
1127#endif 1127#endif
1128 1128
1129 tp->snd_nxt = tp->iss; 1129 tp->snd_nxt = tp->iss;
1130 tp->t_ourmss = tcp_mss_to_advertise(synrt != NULL ? 1130 tp->t_ourmss = tcp_mss_to_advertise(synrt != NULL ?
1131 synrt->rt_ifp : NULL, af); 1131 synrt->rt_ifp : NULL, af);
1132 if ((tp->t_flags & TF_NOOPT) == 0) { 1132 if ((tp->t_flags & TF_NOOPT) == 0) {
1133 opt[0] = TCPOPT_MAXSEG; 1133 opt[0] = TCPOPT_MAXSEG;
1134 opt[1] = 4; 1134 opt[1] = 4;
1135 opt[2] = (tp->t_ourmss >> 8) & 0xff; 1135 opt[2] = (tp->t_ourmss >> 8) & 0xff;
1136 opt[3] = tp->t_ourmss & 0xff; 1136 opt[3] = tp->t_ourmss & 0xff;
1137 optlen = 4; 1137 optlen = 4;
1138 1138
1139 if ((tp->t_flags & TF_REQ_SCALE) && 1139 if ((tp->t_flags & TF_REQ_SCALE) &&
1140 ((flags & TH_ACK) == 0 || 1140 ((flags & TH_ACK) == 0 ||
1141 (tp->t_flags & TF_RCVD_SCALE))) { 1141 (tp->t_flags & TF_RCVD_SCALE))) {
1142 *((u_int32_t *) (opt + optlen)) = htonl( 1142 *((u_int32_t *) (opt + optlen)) = htonl(
1143 TCPOPT_NOP << 24 | 1143 TCPOPT_NOP << 24 |
1144 TCPOPT_WINDOW << 16 | 1144 TCPOPT_WINDOW << 16 |
1145 TCPOLEN_WINDOW << 8 | 1145 TCPOLEN_WINDOW << 8 |
1146 tp->request_r_scale); 1146 tp->request_r_scale);
1147 optlen += 4; 1147 optlen += 4;
1148 } 1148 }
1149 if (tcp_do_sack) { 1149 if (tcp_do_sack) {
1150 u_int8_t *cp = (u_int8_t *)(opt + optlen); 1150 u_int8_t *cp = (u_int8_t *)(opt + optlen);
1151 1151
1152 cp[0] = TCPOPT_SACK_PERMITTED; 1152 cp[0] = TCPOPT_SACK_PERMITTED;
1153 cp[1] = 2; 1153 cp[1] = 2;
1154 cp[2] = TCPOPT_NOP; 1154 cp[2] = TCPOPT_NOP;
1155 cp[3] = TCPOPT_NOP; 1155 cp[3] = TCPOPT_NOP;
1156 optlen += 4; 1156 optlen += 4;
1157 } 1157 }
1158 } 1158 }
1159 } 1159 }
1160 1160
1161 /* 1161 /*
1162 * Send a timestamp and echo-reply if this is a SYN and our side 1162 * Send a timestamp and echo-reply if this is a SYN and our side
1163 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side 1163 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
1164 * and our peer have sent timestamps in our SYN's. 1164 * and our peer have sent timestamps in our SYN's.
1165 */ 1165 */
1166 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 1166 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1167 (flags & TH_RST) == 0 && 1167 (flags & TH_RST) == 0 &&
1168 ((flags & (TH_SYN|TH_ACK)) == TH_SYN || 1168 ((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
1169 (tp->t_flags & TF_RCVD_TSTMP))) { 1169 (tp->t_flags & TF_RCVD_TSTMP))) {
1170 u_int32_t *lp = (u_int32_t *)(opt + optlen); 1170 u_int32_t *lp = (u_int32_t *)(opt + optlen);
1171 1171
1172 /* Form timestamp option as shown in appendix A of RFC 1323. */ 1172 /* Form timestamp option as shown in appendix A of RFC 1323. */
1173 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 1173 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
1174 *lp++ = htonl(TCP_TIMESTAMP(tp)); 1174 *lp++ = htonl(TCP_TIMESTAMP(tp));
1175 *lp = htonl(tp->ts_recent); 1175 *lp = htonl(tp->ts_recent);
1176 optlen += TCPOLEN_TSTAMP_APPA; 1176 optlen += TCPOLEN_TSTAMP_APPA;
1177 1177
1178 /* Set receive buffer autosizing timestamp. */ 1178 /* Set receive buffer autosizing timestamp. */
1179 if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) 1179 if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE))
1180 tp->rfbuf_ts = TCP_TIMESTAMP(tp); 1180 tp->rfbuf_ts = TCP_TIMESTAMP(tp);
1181 } 1181 }
1182 1182
1183 /* 1183 /*
1184 * Tack on the SACK block if it is necessary. 1184 * Tack on the SACK block if it is necessary.
1185 */ 1185 */
1186 if (sack_numblks) { 1186 if (sack_numblks) {
1187 int sack_len; 1187 int sack_len;
1188 u_char *bp = (u_char *)(opt + optlen); 1188 u_char *bp = (u_char *)(opt + optlen);
1189 u_int32_t *lp = (u_int32_t *)(bp + 4); 1189 u_int32_t *lp = (u_int32_t *)(bp + 4);
1190 struct ipqent *tiqe; 1190 struct ipqent *tiqe;
1191 1191
1192 sack_len = sack_numblks * 8 + 2; 1192 sack_len = sack_numblks * 8 + 2;
1193 bp[0] = TCPOPT_NOP; 1193 bp[0] = TCPOPT_NOP;
1194 bp[1] = TCPOPT_NOP; 1194 bp[1] = TCPOPT_NOP;
1195 bp[2] = TCPOPT_SACK; 1195 bp[2] = TCPOPT_SACK;
1196 bp[3] = sack_len; 1196 bp[3] = sack_len;
1197 if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) { 1197 if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
1198 sack_numblks--; 1198 sack_numblks--;
1199 *lp++ = htonl(tp->rcv_dsack_block.left); 1199 *lp++ = htonl(tp->rcv_dsack_block.left);
1200 *lp++ = htonl(tp->rcv_dsack_block.right); 1200 *lp++ = htonl(tp->rcv_dsack_block.right);
1201 tp->rcv_sack_flags &= ~TCPSACK_HAVED; 1201 tp->rcv_sack_flags &= ~TCPSACK_HAVED;
1202 } 1202 }
1203 for (tiqe = TAILQ_FIRST(&tp->timeq); 1203 for (tiqe = TAILQ_FIRST(&tp->timeq);
1204 sack_numblks > 0; tiqe = TAILQ_NEXT(tiqe, ipqe_timeq)) { 1204 sack_numblks > 0; tiqe = TAILQ_NEXT(tiqe, ipqe_timeq)) {
1205 KASSERT(tiqe != NULL); 1205 KASSERT(tiqe != NULL);
1206 sack_numblks--; 1206 sack_numblks--;
1207 *lp++ = htonl(tiqe->ipqe_seq); 1207 *lp++ = htonl(tiqe->ipqe_seq);
1208 *lp++ = htonl(tiqe->ipqe_seq + tiqe->ipqe_len + 1208 *lp++ = htonl(tiqe->ipqe_seq + tiqe->ipqe_len +
1209 ((tiqe->ipqe_flags & TH_FIN) != 0 ? 1 : 0)); 1209 ((tiqe->ipqe_flags & TH_FIN) != 0 ? 1 : 0));
1210 } 1210 }
1211 optlen += sack_len + 2; 1211 optlen += sack_len + 2;
1212 } 1212 }
1213 TCP_REASS_UNLOCK(tp); 1213 TCP_REASS_UNLOCK(tp);
1214 1214
1215#ifdef TCP_SIGNATURE 1215#ifdef TCP_SIGNATURE
1216 if (tp->t_flags & TF_SIGNATURE) { 1216 if (tp->t_flags & TF_SIGNATURE) {
1217 u_char *bp; 1217 u_char *bp;
1218 /* 1218 /*
1219 * Initialize TCP-MD5 option (RFC2385) 1219 * Initialize TCP-MD5 option (RFC2385)
1220 */ 1220 */
1221 bp = (u_char *)opt + optlen; 1221 bp = (u_char *)opt + optlen;
1222 *bp++ = TCPOPT_SIGNATURE; 1222 *bp++ = TCPOPT_SIGNATURE;
1223 *bp++ = TCPOLEN_SIGNATURE; 1223 *bp++ = TCPOLEN_SIGNATURE;
1224 sigoff = optlen + 2; 1224 sigoff = optlen + 2;
1225 bzero(bp, TCP_SIGLEN); 1225 bzero(bp, TCP_SIGLEN);
1226 bp += TCP_SIGLEN; 1226 bp += TCP_SIGLEN;
1227 optlen += TCPOLEN_SIGNATURE; 1227 optlen += TCPOLEN_SIGNATURE;
1228 /* 1228 /*
1229 * Terminate options list and maintain 32-bit alignment. 1229 * Terminate options list and maintain 32-bit alignment.
1230 */ 1230 */
1231 *bp++ = TCPOPT_NOP; 1231 *bp++ = TCPOPT_NOP;
1232 *bp++ = TCPOPT_EOL; 1232 *bp++ = TCPOPT_EOL;
1233 optlen += 2; 1233 optlen += 2;
1234 } 1234 }
1235#endif /* TCP_SIGNATURE */ 1235#endif /* TCP_SIGNATURE */
1236 1236
1237 hdrlen += optlen; 1237 hdrlen += optlen;
1238 1238
1239#ifdef DIAGNOSTIC 1239#ifdef DIAGNOSTIC
1240 if (!use_tso && len > txsegsize) 1240 if (!use_tso && len > txsegsize)
1241 panic("tcp data to be sent is larger than segment"); 1241 panic("tcp data to be sent is larger than segment");
1242 else if (use_tso && len > IP_MAXPACKET) 1242 else if (use_tso && len > IP_MAXPACKET)
1243 panic("tcp data to be sent is larger than max TSO size"); 1243 panic("tcp data to be sent is larger than max TSO size");
1244 if (max_linkhdr + hdrlen > MCLBYTES) 1244 if (max_linkhdr + hdrlen > MCLBYTES)
1245 panic("tcphdr too big"); 1245 panic("tcphdr too big");
1246#endif 1246#endif
1247 1247
1248 /* 1248 /*
1249 * Grab a header mbuf, attaching a copy of data to 1249 * Grab a header mbuf, attaching a copy of data to
1250 * be transmitted, and initialize the header from 1250 * be transmitted, and initialize the header from
1251 * the template for sends on this connection. 1251 * the template for sends on this connection.
1252 */ 1252 */
1253 if (len) { 1253 if (len) {
1254 error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m); 1254 error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m);
1255 if (error) 1255 if (error)
1256 goto out; 1256 goto out;
1257 /* 1257 /*
1258 * If we're sending everything we've got, set PUSH. 1258 * If we're sending everything we've got, set PUSH.
1259 * (This will keep happy those implementations which only 1259 * (This will keep happy those implementations which only
1260 * give data to the user when a buffer fills or 1260 * give data to the user when a buffer fills or
1261 * a PUSH comes in.) 1261 * a PUSH comes in.)
1262 */ 1262 */
1263 if (off + len == so->so_snd.sb_cc) 1263 if (off + len == so->so_snd.sb_cc)
1264 flags |= TH_PUSH; 1264 flags |= TH_PUSH;
1265 } else { 1265 } else {
1266 tcps = TCP_STAT_GETREF(); 1266 tcps = TCP_STAT_GETREF();
1267 if (tp->t_flags & TF_ACKNOW) 1267 if (tp->t_flags & TF_ACKNOW)
1268 tcps[TCP_STAT_SNDACKS]++; 1268 tcps[TCP_STAT_SNDACKS]++;
1269 else if (flags & (TH_SYN|TH_FIN|TH_RST)) 1269 else if (flags & (TH_SYN|TH_FIN|TH_RST))
1270 tcps[TCP_STAT_SNDCTRL]++; 1270 tcps[TCP_STAT_SNDCTRL]++;
1271 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 1271 else if (SEQ_GT(tp->snd_up, tp->snd_una))
1272 tcps[TCP_STAT_SNDURG]++; 1272 tcps[TCP_STAT_SNDURG]++;
1273 else 1273 else
1274 tcps[TCP_STAT_SNDWINUP]++; 1274 tcps[TCP_STAT_SNDWINUP]++;
1275 TCP_STAT_PUTREF(); 1275 TCP_STAT_PUTREF();
1276 1276
1277 MGETHDR(m, M_DONTWAIT, MT_HEADER); 1277 MGETHDR(m, M_DONTWAIT, MT_HEADER);
1278 if (m != NULL && max_linkhdr + hdrlen > MHLEN) { 1278 if (m != NULL && max_linkhdr + hdrlen > MHLEN) {
1279 MCLGET(m, M_DONTWAIT); 1279 MCLGET(m, M_DONTWAIT);
1280 if ((m->m_flags & M_EXT) == 0) { 1280 if ((m->m_flags & M_EXT) == 0) {
1281 m_freem(m); 1281 m_freem(m);
1282 m = NULL; 1282 m = NULL;
1283 } 1283 }
1284 } 1284 }
1285 if (m == NULL) { 1285 if (m == NULL) {
1286 error = ENOBUFS; 1286 error = ENOBUFS;
1287 goto out; 1287 goto out;
1288 } 1288 }
1289 MCLAIM(m, &tcp_tx_mowner); 1289 MCLAIM(m, &tcp_tx_mowner);
1290 m->m_data += max_linkhdr; 1290 m->m_data += max_linkhdr;
1291 m->m_len = hdrlen; 1291 m->m_len = hdrlen;
1292 } 1292 }
1293 m->m_pkthdr.rcvif = (struct ifnet *)0; 1293 m->m_pkthdr.rcvif = (struct ifnet *)0;
1294 switch (af) { 1294 switch (af) {
1295#ifdef INET 1295#ifdef INET
1296 case AF_INET: 1296 case AF_INET:
1297 ip = mtod(m, struct ip *); 1297 ip = mtod(m, struct ip *);
1298#ifdef INET6 1298#ifdef INET6
1299 ip6 = NULL; 1299 ip6 = NULL;
1300#endif 1300#endif
1301 th = (struct tcphdr *)(ip + 1); 1301 th = (struct tcphdr *)(ip + 1);
1302 break; 1302 break;
1303#endif 1303#endif
1304#ifdef INET6 1304#ifdef INET6
1305 case AF_INET6: 1305 case AF_INET6:
1306 ip = NULL; 1306 ip = NULL;
1307 ip6 = mtod(m, struct ip6_hdr *); 1307 ip6 = mtod(m, struct ip6_hdr *);
1308 th = (struct tcphdr *)(ip6 + 1); 1308 th = (struct tcphdr *)(ip6 + 1);
1309 break; 1309 break;
1310#endif 1310#endif
1311 default: /*pacify gcc*/ 1311 default: /*pacify gcc*/
1312 ip = NULL; 1312 ip = NULL;
1313#ifdef INET6 1313#ifdef INET6
1314 ip6 = NULL; 1314 ip6 = NULL;
1315#endif 1315#endif
1316 th = NULL; 1316 th = NULL;
1317 break; 1317 break;
1318 } 1318 }
1319 if (tp->t_template == 0) 1319 if (tp->t_template == 0)
1320 panic("tcp_output"); 1320 panic("tcp_output");
1321 if (tp->t_template->m_len < iphdrlen) 1321 if (tp->t_template->m_len < iphdrlen)
1322 panic("tcp_output"); 1322 panic("tcp_output");
1323 bcopy(mtod(tp->t_template, void *), mtod(m, void *), iphdrlen); 1323 bcopy(mtod(tp->t_template, void *), mtod(m, void *), iphdrlen);
1324 1324
1325 /* 1325 /*
1326 * If we are starting a connection, send ECN setup 1326 * If we are starting a connection, send ECN setup
1327 * SYN packet. If we are on a retransmit, we may 1327 * SYN packet. If we are on a retransmit, we may
1328 * resend those bits a number of times as per 1328 * resend those bits a number of times as per
1329 * RFC 3168. 1329 * RFC 3168.
1330 */ 1330 */
1331 if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) { 1331 if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) {
1332 if (tp->t_flags & TF_SYN_REXMT) { 1332 if (tp->t_flags & TF_SYN_REXMT) {
1333 if (tp->t_ecn_retries--) 1333 if (tp->t_ecn_retries--)
1334 flags |= TH_ECE|TH_CWR; 1334 flags |= TH_ECE|TH_CWR;
1335 } else { 1335 } else {
1336 flags |= TH_ECE|TH_CWR; 1336 flags |= TH_ECE|TH_CWR;
1337 tp->t_ecn_retries = tcp_ecn_maxretries; 1337 tp->t_ecn_retries = tcp_ecn_maxretries;
1338 } 1338 }
1339 } 1339 }
1340 1340
1341 if (TCP_ECN_ALLOWED(tp)) { 1341 if (TCP_ECN_ALLOWED(tp)) {
1342 /* 1342 /*
1343 * If the peer has ECN, mark data packets 1343 * If the peer has ECN, mark data packets
1344 * ECN capable. Ignore pure ack packets, retransmissions 1344 * ECN capable. Ignore pure ack packets, retransmissions
1345 * and window probes. 1345 * and window probes.
1346 */ 1346 */
1347 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 1347 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
1348 !(tp->t_force && len == 1)) { 1348 !(tp->t_force && len == 1)) {
1349 ecn_tos = IPTOS_ECN_ECT0; 1349 ecn_tos = IPTOS_ECN_ECT0;
1350 TCP_STATINC(TCP_STAT_ECN_ECT); 1350 TCP_STATINC(TCP_STAT_ECN_ECT);
1351 } 1351 }
1352 1352
1353 /* 1353 /*
1354 * Reply with proper ECN notifications. 1354 * Reply with proper ECN notifications.
1355 */ 1355 */
1356 if (tp->t_flags & TF_ECN_SND_CWR) { 1356 if (tp->t_flags & TF_ECN_SND_CWR) {
1357 flags |= TH_CWR; 1357 flags |= TH_CWR;
1358 tp->t_flags &= ~TF_ECN_SND_CWR; 1358 tp->t_flags &= ~TF_ECN_SND_CWR;
1359 }  1359 }
1360 if (tp->t_flags & TF_ECN_SND_ECE) { 1360 if (tp->t_flags & TF_ECN_SND_ECE) {
1361 flags |= TH_ECE; 1361 flags |= TH_ECE;
1362 } 1362 }
1363 } 1363 }
1364 1364
1365 1365
1366 /* 1366 /*
1367 * If we are doing retransmissions, then snd_nxt will 1367 * If we are doing retransmissions, then snd_nxt will
1368 * not reflect the first unsent octet. For ACK only 1368 * not reflect the first unsent octet. For ACK only
1369 * packets, we do not want the sequence number of the 1369 * packets, we do not want the sequence number of the
1370 * retransmitted packet, we want the sequence number 1370 * retransmitted packet, we want the sequence number
1371 * of the next unsent octet. So, if there is no data 1371 * of the next unsent octet. So, if there is no data
1372 * (and no SYN or FIN), use snd_max instead of snd_nxt 1372 * (and no SYN or FIN), use snd_max instead of snd_nxt
1373 * when filling in ti_seq. But if we are in persist 1373 * when filling in ti_seq. But if we are in persist
1374 * state, snd_max might reflect one byte beyond the 1374 * state, snd_max might reflect one byte beyond the
1375 * right edge of the window, so use snd_nxt in that 1375 * right edge of the window, so use snd_nxt in that
1376 * case, since we know we aren't doing a retransmission. 1376 * case, since we know we aren't doing a retransmission.
1377 * (retransmit and persist are mutually exclusive...) 1377 * (retransmit and persist are mutually exclusive...)
1378 */ 1378 */
1379 if (TCP_SACK_ENABLED(tp) && sack_rxmit) { 1379 if (TCP_SACK_ENABLED(tp) && sack_rxmit) {
1380 th->th_seq = htonl(p->rxmit); 1380 th->th_seq = htonl(p->rxmit);
1381 p->rxmit += len; 1381 p->rxmit += len;
1382 } else { 1382 } else {
1383 if (len || (flags & (TH_SYN|TH_FIN)) || 1383 if (len || (flags & (TH_SYN|TH_FIN)) ||
1384 TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) 1384 TCP_TIMER_ISARMED(tp, TCPT_PERSIST))
1385 th->th_seq = htonl(tp->snd_nxt); 1385 th->th_seq = htonl(tp->snd_nxt);
1386 else 1386 else
1387 th->th_seq = htonl(tp->snd_max); 1387 th->th_seq = htonl(tp->snd_max);
1388 } 1388 }
1389 th->th_ack = htonl(tp->rcv_nxt); 1389 th->th_ack = htonl(tp->rcv_nxt);
1390 if (optlen) { 1390 if (optlen) {
1391 bcopy((void *)opt, (void *)(th + 1), optlen); 1391 bcopy((void *)opt, (void *)(th + 1), optlen);
1392 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; 1392 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
1393 } 1393 }
1394 th->th_flags = flags; 1394 th->th_flags = flags;
1395 /* 1395 /*
1396 * Calculate receive window. Don't shrink window, 1396 * Calculate receive window. Don't shrink window,
1397 * but avoid silly window syndrome. 1397 * but avoid silly window syndrome.
1398 */ 1398 */
1399 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize) 1399 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize)
1400 win = 0; 1400 win = 0;
1401 if (win > (long)TCP_MAXWIN << tp->rcv_scale) 1401 if (win > (long)TCP_MAXWIN << tp->rcv_scale)
1402 win = (long)TCP_MAXWIN << tp->rcv_scale; 1402 win = (long)TCP_MAXWIN << tp->rcv_scale;
1403 if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt)) 1403 if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt))
1404 win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt); 1404 win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt);
1405 th->th_win = htons((u_int16_t) (win>>tp->rcv_scale)); 1405 th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
1406 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 1406 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
1407 u_int32_t urp = tp->snd_up - tp->snd_nxt; 1407 u_int32_t urp = tp->snd_up - tp->snd_nxt;
1408 if (urp > IP_MAXPACKET) 1408 if (urp > IP_MAXPACKET)
1409 urp = IP_MAXPACKET; 1409 urp = IP_MAXPACKET;
1410 th->th_urp = htons((u_int16_t)urp); 1410 th->th_urp = htons((u_int16_t)urp);
1411 th->th_flags |= TH_URG; 1411 th->th_flags |= TH_URG;
1412 } else 1412 } else
1413 /* 1413 /*
1414 * If no urgent pointer to send, then we pull 1414 * If no urgent pointer to send, then we pull
1415 * the urgent pointer to the left edge of the send window 1415 * the urgent pointer to the left edge of the send window
1416 * so that it doesn't drift into the send window on sequence 1416 * so that it doesn't drift into the send window on sequence
1417 * number wraparound. 1417 * number wraparound.
1418 */ 1418 */
1419 tp->snd_up = tp->snd_una; /* drag it along */ 1419 tp->snd_up = tp->snd_una; /* drag it along */
1420 1420
1421#ifdef TCP_SIGNATURE 1421#ifdef TCP_SIGNATURE
1422 if (sigoff && (tp->t_flags & TF_SIGNATURE)) { 1422 if (sigoff && (tp->t_flags & TF_SIGNATURE)) {
1423 struct secasvar *sav; 1423 struct secasvar *sav;
1424 u_int8_t *sigp; 1424 u_int8_t *sigp;
1425 1425
1426 sav = tcp_signature_getsav(m, th); 1426 sav = tcp_signature_getsav(m, th);
1427 1427
1428 if (sav == NULL) { 1428 if (sav == NULL) {
1429 if (m) 1429 if (m)
1430 m_freem(m); 1430 m_freem(m);
1431 return (EPERM); 1431 return (EPERM);
1432 } 1432 }
1433 1433
1434 m->m_pkthdr.len = hdrlen + len; 1434 m->m_pkthdr.len = hdrlen + len;
1435 sigp = (char *)th + sizeof(*th) + sigoff; 1435 sigp = (char *)th + sizeof(*th) + sigoff;
1436 tcp_signature(m, th, (char *)th - mtod(m, char *), sav, sigp); 1436 tcp_signature(m, th, (char *)th - mtod(m, char *), sav, sigp);
1437 1437
1438 key_sa_recordxfer(sav, m); 1438 key_sa_recordxfer(sav, m);
1439#ifdef FAST_IPSEC 1439#ifdef FAST_IPSEC
1440 KEY_FREESAV(&sav); 1440 KEY_FREESAV(&sav);
1441#else 1441#else
1442 key_freesav(sav); 1442 key_freesav(sav);
1443#endif 1443#endif
1444 } 1444 }
1445#endif 1445#endif
1446 1446
1447 /* 1447 /*
1448 * Set ourselves up to be checksummed just before the packet 1448 * Set ourselves up to be checksummed just before the packet
1449 * hits the wire. 1449 * hits the wire.
1450 */ 1450 */
1451 switch (af) { 1451 switch (af) {
1452#ifdef INET 1452#ifdef INET
1453 case AF_INET: 1453 case AF_INET:
1454 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 1454 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1455 if (use_tso) { 1455 if (use_tso) {
1456 m->m_pkthdr.segsz = txsegsize; 1456 m->m_pkthdr.segsz = txsegsize;
1457 m->m_pkthdr.csum_flags = M_CSUM_TSOv4; 1457 m->m_pkthdr.csum_flags = M_CSUM_TSOv4;
1458 } else { 1458 } else {
1459 m->m_pkthdr.csum_flags = M_CSUM_TCPv4; 1459 m->m_pkthdr.csum_flags = M_CSUM_TCPv4;
1460 if (len + optlen) { 1460 if (len + optlen) {
1461 /* Fixup the pseudo-header checksum. */ 1461 /* Fixup the pseudo-header checksum. */
1462 /* XXXJRT Not IP Jumbogram safe. */ 1462 /* XXXJRT Not IP Jumbogram safe. */
1463 th->th_sum = in_cksum_addword(th->th_sum, 1463 th->th_sum = in_cksum_addword(th->th_sum,
1464 htons((u_int16_t) (len + optlen))); 1464 htons((u_int16_t) (len + optlen)));
1465 } 1465 }
1466 } 1466 }
1467 break; 1467 break;
1468#endif 1468#endif
1469#ifdef INET6 1469#ifdef INET6
1470 case AF_INET6: 1470 case AF_INET6:
1471 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 1471 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1472 if (use_tso) { 1472 if (use_tso) {
1473 m->m_pkthdr.segsz = txsegsize; 1473 m->m_pkthdr.segsz = txsegsize;
1474 m->m_pkthdr.csum_flags = M_CSUM_TSOv6; 1474 m->m_pkthdr.csum_flags = M_CSUM_TSOv6;
1475 } else { 1475 } else {
1476 m->m_pkthdr.csum_flags = M_CSUM_TCPv6; 1476 m->m_pkthdr.csum_flags = M_CSUM_TCPv6;
1477 if (len + optlen) { 1477 if (len + optlen) {
1478 /* Fixup the pseudo-header checksum. */ 1478 /* Fixup the pseudo-header checksum. */
1479 /* XXXJRT: Not IPv6 Jumbogram safe. */ 1479 /* XXXJRT: Not IPv6 Jumbogram safe. */
1480 th->th_sum = in_cksum_addword(th->th_sum, 1480 th->th_sum = in_cksum_addword(th->th_sum,
1481 htons((u_int16_t) (len + optlen))); 1481 htons((u_int16_t) (len + optlen)));
1482 } 1482 }
1483 } 1483 }
1484 break; 1484 break;
1485#endif 1485#endif
1486 } 1486 }
1487 1487
1488 /* 1488 /*
1489 * In transmit state, time the transmission and arrange for 1489 * In transmit state, time the transmission and arrange for
1490 * the retransmit. In persist state, just set snd_max. 1490 * the retransmit. In persist state, just set snd_max.
1491 */ 1491 */
1492 if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { 1492 if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
1493 tcp_seq startseq = tp->snd_nxt; 1493 tcp_seq startseq = tp->snd_nxt;
1494 1494
1495 /* 1495 /*
1496 * Advance snd_nxt over sequence space of this segment. 1496 * Advance snd_nxt over sequence space of this segment.
1497 * There are no states in which we send both a SYN and a FIN, 1497 * There are no states in which we send both a SYN and a FIN,
1498 * so we collapse the tests for these flags. 1498 * so we collapse the tests for these flags.
1499 */ 1499 */
1500 if (flags & (TH_SYN|TH_FIN)) 1500 if (flags & (TH_SYN|TH_FIN))
1501 tp->snd_nxt++; 1501 tp->snd_nxt++;
1502 if (sack_rxmit) 1502 if (sack_rxmit)
1503 goto timer; 1503 goto timer;
1504 tp->snd_nxt += len; 1504 tp->snd_nxt += len;
1505 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 1505 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
1506 tp->snd_max = tp->snd_nxt; 1506 tp->snd_max = tp->snd_nxt;
1507 /* 1507 /*
1508 * Time this transmission if not a retransmission and 1508 * Time this transmission if not a retransmission and
1509 * not currently timing anything. 1509 * not currently timing anything.
1510 */ 1510 */
1511 if (tp->t_rtttime == 0) { 1511 if (tp->t_rtttime == 0) {
1512 tp->t_rtttime = tcp_now; 1512 tp->t_rtttime = tcp_now;
1513 tp->t_rtseq = startseq; 1513 tp->t_rtseq = startseq;
1514 TCP_STATINC(TCP_STAT_SEGSTIMED); 1514 TCP_STATINC(TCP_STAT_SEGSTIMED);
1515 } 1515 }
1516 } 1516 }
1517 1517
1518 /* 1518 /*
1519 * Set retransmit timer if not currently set, 1519 * Set retransmit timer if not currently set,
1520 * and not doing an ack or a keep-alive probe. 1520 * and not doing an ack or a keep-alive probe.
1521 * Initial value for retransmit timer is smoothed 1521 * Initial value for retransmit timer is smoothed
1522 * round-trip time + 2 * round-trip time variance. 1522 * round-trip time + 2 * round-trip time variance.
1523 * Initialize shift counter which is used for backoff 1523 * Initialize shift counter which is used for backoff
1524 * of retransmit time. 1524 * of retransmit time.
1525 */ 1525 */
1526timer: 1526timer:
1527 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 && 1527 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) {
1528 ((sack_rxmit && tp->snd_nxt != tp->snd_max) || 1528 if ((sack_rxmit && tp->snd_nxt != tp->snd_max)
1529 tp->snd_nxt != tp->snd_una)) { 1529 || tp->snd_nxt != tp->snd_una) {
1530 if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) { 1530 if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
1531 TCP_TIMER_DISARM(tp, TCPT_PERSIST); 1531 TCP_TIMER_DISARM(tp, TCPT_PERSIST);
 1532 tp->t_rxtshift = 0;
 1533 }
 1534 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
 1535 } else if (len == 0 && so->so_snd.sb_cc > 0
 1536 && TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
 1537 /*
 1538 * If we are sending a window probe and there's
 1539 * unacked data in the socket, make sure at
 1540 * least the persist timer is running.
 1541 */
1532 tp->t_rxtshift = 0; 1542 tp->t_rxtshift = 0;
 1543 tcp_setpersist(tp);
1533 } 1544 }
1534 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 
1535 } 1545 }
1536 } else 1546 } else
1537 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) 1547 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
1538 tp->snd_max = tp->snd_nxt + len; 1548 tp->snd_max = tp->snd_nxt + len;
1539 1549
1540#ifdef TCP_DEBUG 1550#ifdef TCP_DEBUG
1541 /* 1551 /*
1542 * Trace. 1552 * Trace.
1543 */ 1553 */
1544 if (so->so_options & SO_DEBUG) 1554 if (so->so_options & SO_DEBUG)
1545 tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0); 1555 tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0);
1546#endif 1556#endif
1547 1557
1548 /* 1558 /*
1549 * Fill in IP length and desired time to live and 1559 * Fill in IP length and desired time to live and
1550 * send to IP level. There should be a better way 1560 * send to IP level. There should be a better way
1551 * to handle ttl and tos; we could keep them in 1561 * to handle ttl and tos; we could keep them in
1552 * the template, but need a way to checksum without them. 1562 * the template, but need a way to checksum without them.
1553 */ 1563 */
1554 m->m_pkthdr.len = hdrlen + len; 1564 m->m_pkthdr.len = hdrlen + len;
1555 1565
1556 switch (af) { 1566 switch (af) {
1557#ifdef INET 1567#ifdef INET
1558 case AF_INET: 1568 case AF_INET:
1559 ip->ip_len = htons(m->m_pkthdr.len); 1569 ip->ip_len = htons(m->m_pkthdr.len);
1560 packetlen = m->m_pkthdr.len; 1570 packetlen = m->m_pkthdr.len;
1561 if (tp->t_inpcb) { 1571 if (tp->t_inpcb) {
1562 ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; 1572 ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl;
1563 ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos | ecn_tos; 1573 ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos | ecn_tos;
1564 } 1574 }
1565#ifdef INET6 1575#ifdef INET6
1566 else if (tp->t_in6pcb) { 1576 else if (tp->t_in6pcb) {
1567 ip->ip_ttl = in6_selecthlim(tp->t_in6pcb, NULL); /*XXX*/ 1577 ip->ip_ttl = in6_selecthlim(tp->t_in6pcb, NULL); /*XXX*/
1568 ip->ip_tos = ecn_tos; /*XXX*/ 1578 ip->ip_tos = ecn_tos; /*XXX*/
1569 } 1579 }
1570#endif 1580#endif
1571 break; 1581 break;
1572#endif 1582#endif
1573#ifdef INET6 1583#ifdef INET6
1574 case AF_INET6: 1584 case AF_INET6:
1575 packetlen = m->m_pkthdr.len; 1585 packetlen = m->m_pkthdr.len;
1576 ip6->ip6_nxt = IPPROTO_TCP; 1586 ip6->ip6_nxt = IPPROTO_TCP;
1577 if (tp->t_in6pcb) { 1587 if (tp->t_in6pcb) {
1578 /* 1588 /*
1579 * we separately set hoplimit for every segment, since 1589 * we separately set hoplimit for every segment, since
1580 * the user might want to change the value via 1590 * the user might want to change the value via
1581 * setsockopt. Also, desired default hop limit might 1591 * setsockopt. Also, desired default hop limit might
1582 * be changed via Neighbor Discovery. 1592 * be changed via Neighbor Discovery.
1583 */ 1593 */
1584 ip6->ip6_hlim = in6_selecthlim(tp->t_in6pcb, 1594 ip6->ip6_hlim = in6_selecthlim(tp->t_in6pcb,
1585 (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp 1595 (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp
1586 : NULL); 1596 : NULL);
1587 } 1597 }
1588 ip6->ip6_flow |= htonl(ecn_tos << 20); 1598 ip6->ip6_flow |= htonl(ecn_tos << 20);
1589 /* ip6->ip6_flow = ??? (from template) */ 1599 /* ip6->ip6_flow = ??? (from template) */
1590 /* ip6_plen will be filled in ip6_output(). */ 1600 /* ip6_plen will be filled in ip6_output(). */
1591 break; 1601 break;
1592#endif 1602#endif
1593 default: /*pacify gcc*/ 1603 default: /*pacify gcc*/
1594 packetlen = 0; 1604 packetlen = 0;
1595 break; 1605 break;
1596 } 1606 }
1597 1607
1598 switch (af) { 1608 switch (af) {
1599#ifdef INET 1609#ifdef INET
1600 case AF_INET: 1610 case AF_INET:
1601 { 1611 {
1602 struct mbuf *opts; 1612 struct mbuf *opts;
1603 1613
1604 if (tp->t_inpcb) 1614 if (tp->t_inpcb)
1605 opts = tp->t_inpcb->inp_options; 1615 opts = tp->t_inpcb->inp_options;
1606 else 1616 else
1607 opts = NULL; 1617 opts = NULL;
1608 error = ip_output(m, opts, ro, 1618 error = ip_output(m, opts, ro,
1609 (tp->t_mtudisc ? IP_MTUDISC : 0) | 1619 (tp->t_mtudisc ? IP_MTUDISC : 0) |
1610 (so->so_options & SO_DONTROUTE), 1620 (so->so_options & SO_DONTROUTE),
1611 (struct ip_moptions *)0, so); 1621 (struct ip_moptions *)0, so);
1612 break; 1622 break;
1613 } 1623 }
1614#endif 1624#endif
1615#ifdef INET6 1625#ifdef INET6
1616 case AF_INET6: 1626 case AF_INET6:
1617 { 1627 {
1618 struct ip6_pktopts *opts; 1628 struct ip6_pktopts *opts;
1619 1629
1620 if (tp->t_in6pcb) 1630 if (tp->t_in6pcb)
1621 opts = tp->t_in6pcb->in6p_outputopts; 1631 opts = tp->t_in6pcb->in6p_outputopts;
1622 else 1632 else
1623 opts = NULL; 1633 opts = NULL;
1624 error = ip6_output(m, opts, ro, so->so_options & SO_DONTROUTE, 1634 error = ip6_output(m, opts, ro, so->so_options & SO_DONTROUTE,
1625 NULL, so, NULL); 1635 NULL, so, NULL);
1626 break; 1636 break;
1627 } 1637 }
1628#endif 1638#endif
1629 default: 1639 default:
1630 error = EAFNOSUPPORT; 1640 error = EAFNOSUPPORT;
1631 break; 1641 break;
1632 } 1642 }
1633 if (error) { 1643 if (error) {
1634out: 1644out:
1635 if (error == ENOBUFS) { 1645 if (error == ENOBUFS) {
1636 TCP_STATINC(TCP_STAT_SELFQUENCH); 1646 TCP_STATINC(TCP_STAT_SELFQUENCH);
1637#ifdef INET 1647#ifdef INET
1638 if (tp->t_inpcb) 1648 if (tp->t_inpcb)
1639 tcp_quench(tp->t_inpcb, 0); 1649 tcp_quench(tp->t_inpcb, 0);
1640#endif 1650#endif
1641#ifdef INET6 1651#ifdef INET6
1642 if (tp->t_in6pcb) 1652 if (tp->t_in6pcb)
1643 tcp6_quench(tp->t_in6pcb, 0); 1653 tcp6_quench(tp->t_in6pcb, 0);
1644#endif 1654#endif
1645 error = 0; 1655 error = 0;
1646 } else if ((error == EHOSTUNREACH || error == ENETDOWN) && 1656 } else if ((error == EHOSTUNREACH || error == ENETDOWN) &&
1647 TCPS_HAVERCVDSYN(tp->t_state)) { 1657 TCPS_HAVERCVDSYN(tp->t_state)) {
1648 tp->t_softerror = error; 1658 tp->t_softerror = error;
1649 error = 0; 1659 error = 0;
1650 } 1660 }
1651 1661
1652 /* Back out the sequence number advance. */ 1662 /* Back out the sequence number advance. */
1653 if (sack_rxmit) 1663 if (sack_rxmit)
1654 p->rxmit -= len; 1664 p->rxmit -= len;
1655 1665
1656 /* Restart the delayed ACK timer, if necessary. */ 1666 /* Restart the delayed ACK timer, if necessary. */
1657 if (tp->t_flags & TF_DELACK) 1667 if (tp->t_flags & TF_DELACK)
1658 TCP_RESTART_DELACK(tp); 1668 TCP_RESTART_DELACK(tp);
1659 1669
1660 return (error); 1670 return (error);
1661 } 1671 }
1662 1672
1663 if (packetlen > tp->t_pmtud_mtu_sent) 1673 if (packetlen > tp->t_pmtud_mtu_sent)
1664 tp->t_pmtud_mtu_sent = packetlen; 1674 tp->t_pmtud_mtu_sent = packetlen;
1665  1675
1666 tcps = TCP_STAT_GETREF(); 1676 tcps = TCP_STAT_GETREF();
1667 tcps[TCP_STAT_SNDTOTAL]++; 1677 tcps[TCP_STAT_SNDTOTAL]++;
1668 if (tp->t_flags & TF_DELACK) 1678 if (tp->t_flags & TF_DELACK)
1669 tcps[TCP_STAT_DELACK]++; 1679 tcps[TCP_STAT_DELACK]++;
1670 TCP_STAT_PUTREF(); 1680 TCP_STAT_PUTREF();
1671 1681
1672 /* 1682 /*
1673 * Data sent (as far as we can tell). 1683 * Data sent (as far as we can tell).
1674 * If this advertises a larger window than any other segment, 1684 * If this advertises a larger window than any other segment,
1675 * then remember the size of the advertised window. 1685 * then remember the size of the advertised window.
1676 * Any pending ACK has now been sent. 1686 * Any pending ACK has now been sent.
1677 */ 1687 */
1678 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) 1688 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
1679 tp->rcv_adv = tp->rcv_nxt + win; 1689 tp->rcv_adv = tp->rcv_nxt + win;
1680 tp->last_ack_sent = tp->rcv_nxt; 1690 tp->last_ack_sent = tp->rcv_nxt;
1681 tp->t_flags &= ~TF_ACKNOW; 1691 tp->t_flags &= ~TF_ACKNOW;
1682 TCP_CLEAR_DELACK(tp); 1692 TCP_CLEAR_DELACK(tp);
1683#ifdef DIAGNOSTIC 1693#ifdef DIAGNOSTIC
1684 if (maxburst < 0) 1694 if (maxburst < 0)
1685 printf("tcp_output: maxburst exceeded by %d\n", -maxburst); 1695 printf("tcp_output: maxburst exceeded by %d\n", -maxburst);
1686#endif 1696#endif
1687 if (sendalot && (tp->t_congctl == &tcp_reno_ctl || --maxburst)) 1697 if (sendalot && (tp->t_congctl == &tcp_reno_ctl || --maxburst))
1688 goto again; 1698 goto again;
1689 return (0); 1699 return (0);
1690} 1700}
1691 1701
1692void 1702void
1693tcp_setpersist(struct tcpcb *tp) 1703tcp_setpersist(struct tcpcb *tp)
1694{ 1704{
1695 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2); 1705 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2);
1696 int nticks; 1706 int nticks;
1697 1707
1698 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT)) 1708 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
1699 panic("tcp_output REXMT"); 1709 panic("tcp_output REXMT");
1700 /* 1710 /*
1701 * Start/restart persistence timer. 1711 * Start/restart persistence timer.
1702 */ 1712 */
1703 if (t < tp->t_rttmin) 1713 if (t < tp->t_rttmin)
1704 t = tp->t_rttmin; 1714 t = tp->t_rttmin;
1705 TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift], 1715 TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift],
1706 TCPTV_PERSMIN, TCPTV_PERSMAX); 1716 TCPTV_PERSMIN, TCPTV_PERSMAX);
1707 TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks); 1717 TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks);
1708 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 1718 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1709 tp->t_rxtshift++; 1719 tp->t_rxtshift++;
1710} 1720}