Fri Jan 7 03:17:44 2011 UTC ()
Back out an inadvertent change.


(matt)
diff -r1.275.4.1.8.1 -r1.275.4.1.8.2 src/sys/netinet/ip_input.c

cvs diff -r1.275.4.1.8.1 -r1.275.4.1.8.2 src/sys/netinet/ip_input.c (switch to unified diff)

--- src/sys/netinet/ip_input.c 2011/01/07 03:16:14 1.275.4.1.8.1
+++ src/sys/netinet/ip_input.c 2011/01/07 03:17:44 1.275.4.1.8.2
@@ -1,1477 +1,1475 @@ @@ -1,1477 +1,1475 @@
1/* $NetBSD: ip_input.c,v 1.275.4.1.8.1 2011/01/07 03:16:14 matt Exp $ */ 1/* $NetBSD: ip_input.c,v 1.275.4.1.8.2 2011/01/07 03:17:44 matt Exp $ */
2 2
3/* 3/*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors 15 * 3. Neither the name of the project nor the names of its contributors
16 * may be used to endorse or promote products derived from this software 16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission. 17 * without specific prior written permission.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE. 29 * SUCH DAMAGE.
30 */ 30 */
31 31
32/*- 32/*-
33 * Copyright (c) 1998 The NetBSD Foundation, Inc. 33 * Copyright (c) 1998 The NetBSD Foundation, Inc.
34 * All rights reserved. 34 * All rights reserved.
35 * 35 *
36 * This code is derived from software contributed to The NetBSD Foundation 36 * This code is derived from software contributed to The NetBSD Foundation
37 * by Public Access Networks Corporation ("Panix"). It was developed under 37 * by Public Access Networks Corporation ("Panix"). It was developed under
38 * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon. 38 * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon.
39 * 39 *
40 * Redistribution and use in source and binary forms, with or without 40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions 41 * modification, are permitted provided that the following conditions
42 * are met: 42 * are met:
43 * 1. Redistributions of source code must retain the above copyright 43 * 1. Redistributions of source code must retain the above copyright
44 * notice, this list of conditions and the following disclaimer. 44 * notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright 45 * 2. Redistributions in binary form must reproduce the above copyright
46 * notice, this list of conditions and the following disclaimer in the 46 * notice, this list of conditions and the following disclaimer in the
47 * documentation and/or other materials provided with the distribution. 47 * documentation and/or other materials provided with the distribution.
48 * 48 *
49 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 49 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
50 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 50 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
51 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 51 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
52 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 52 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
53 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 53 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
54 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 54 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
55 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 55 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
56 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 56 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
57 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 57 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
58 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 58 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
59 * POSSIBILITY OF SUCH DAMAGE. 59 * POSSIBILITY OF SUCH DAMAGE.
60 */ 60 */
61 61
62/* 62/*
63 * Copyright (c) 1982, 1986, 1988, 1993 63 * Copyright (c) 1982, 1986, 1988, 1993
64 * The Regents of the University of California. All rights reserved. 64 * The Regents of the University of California. All rights reserved.
65 * 65 *
66 * Redistribution and use in source and binary forms, with or without 66 * Redistribution and use in source and binary forms, with or without
67 * modification, are permitted provided that the following conditions 67 * modification, are permitted provided that the following conditions
68 * are met: 68 * are met:
69 * 1. Redistributions of source code must retain the above copyright 69 * 1. Redistributions of source code must retain the above copyright
70 * notice, this list of conditions and the following disclaimer. 70 * notice, this list of conditions and the following disclaimer.
71 * 2. Redistributions in binary form must reproduce the above copyright 71 * 2. Redistributions in binary form must reproduce the above copyright
72 * notice, this list of conditions and the following disclaimer in the 72 * notice, this list of conditions and the following disclaimer in the
73 * documentation and/or other materials provided with the distribution. 73 * documentation and/or other materials provided with the distribution.
74 * 3. Neither the name of the University nor the names of its contributors 74 * 3. Neither the name of the University nor the names of its contributors
75 * may be used to endorse or promote products derived from this software 75 * may be used to endorse or promote products derived from this software
76 * without specific prior written permission. 76 * without specific prior written permission.
77 * 77 *
78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
81 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 81 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
88 * SUCH DAMAGE. 88 * SUCH DAMAGE.
89 * 89 *
90 * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 90 * @(#)ip_input.c 8.2 (Berkeley) 1/4/94
91 */ 91 */
92 92
93#include <sys/cdefs.h> 93#include <sys/cdefs.h>
94__KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.275.4.1.8.1 2011/01/07 03:16:14 matt Exp $"); 94__KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.275.4.1.8.2 2011/01/07 03:17:44 matt Exp $");
95 95
96#include "opt_inet.h" 96#include "opt_inet.h"
97#include "opt_gateway.h" 97#include "opt_gateway.h"
98#include "opt_pfil_hooks.h" 98#include "opt_pfil_hooks.h"
99#include "opt_ipsec.h" 99#include "opt_ipsec.h"
100#include "opt_mrouting.h" 100#include "opt_mrouting.h"
101#include "opt_mbuftrace.h" 101#include "opt_mbuftrace.h"
102#include "opt_inet_csum.h" 102#include "opt_inet_csum.h"
103 103
104#include <sys/param.h> 104#include <sys/param.h>
105#include <sys/systm.h> 105#include <sys/systm.h>
106#include <sys/malloc.h> 106#include <sys/malloc.h>
107#include <sys/mbuf.h> 107#include <sys/mbuf.h>
108#include <sys/domain.h> 108#include <sys/domain.h>
109#include <sys/protosw.h> 109#include <sys/protosw.h>
110#include <sys/socket.h> 110#include <sys/socket.h>
111#include <sys/socketvar.h> 111#include <sys/socketvar.h>
112#include <sys/errno.h> 112#include <sys/errno.h>
113#include <sys/time.h> 113#include <sys/time.h>
114#include <sys/kernel.h> 114#include <sys/kernel.h>
115#include <sys/pool.h> 115#include <sys/pool.h>
116#include <sys/sysctl.h> 116#include <sys/sysctl.h>
117#include <sys/kauth.h> 117#include <sys/kauth.h>
118 118
119#include <net/if.h> 119#include <net/if.h>
120#include <net/if_dl.h> 120#include <net/if_dl.h>
121#include <net/route.h> 121#include <net/route.h>
122#include <net/pfil.h> 122#include <net/pfil.h>
123 123
124#include <netinet/in.h> 124#include <netinet/in.h>
125#include <netinet/in_systm.h> 125#include <netinet/in_systm.h>
126#include <netinet/ip.h> 126#include <netinet/ip.h>
127#include <netinet/in_pcb.h> 127#include <netinet/in_pcb.h>
128#include <netinet/in_proto.h> 128#include <netinet/in_proto.h>
129#include <netinet/in_var.h> 129#include <netinet/in_var.h>
130#include <netinet/ip_var.h> 130#include <netinet/ip_var.h>
131#include <netinet/ip_private.h> 131#include <netinet/ip_private.h>
132#include <netinet/ip_icmp.h> 132#include <netinet/ip_icmp.h>
133/* just for gif_ttl */ 133/* just for gif_ttl */
134#include <netinet/in_gif.h> 134#include <netinet/in_gif.h>
135#include "gif.h" 135#include "gif.h"
136#include <net/if_gre.h> 136#include <net/if_gre.h>
137#include "gre.h" 137#include "gre.h"
138 138
139#ifdef MROUTING 139#ifdef MROUTING
140#include <netinet/ip_mroute.h> 140#include <netinet/ip_mroute.h>
141#endif 141#endif
142 142
143#ifdef IPSEC 143#ifdef IPSEC
144#include <netinet6/ipsec.h> 144#include <netinet6/ipsec.h>
145#include <netinet6/ipsec_private.h> 145#include <netinet6/ipsec_private.h>
146#include <netkey/key.h> 146#include <netkey/key.h>
147#endif 147#endif
148#ifdef FAST_IPSEC 148#ifdef FAST_IPSEC
149#include <netipsec/ipsec.h> 149#include <netipsec/ipsec.h>
150#include <netipsec/key.h> 150#include <netipsec/key.h>
151#endif /* FAST_IPSEC*/ 151#endif /* FAST_IPSEC*/
152 152
153#ifndef IPFORWARDING 153#ifndef IPFORWARDING
154#ifdef GATEWAY 154#ifdef GATEWAY
155#define IPFORWARDING 1 /* forward IP packets not for us */ 155#define IPFORWARDING 1 /* forward IP packets not for us */
156#else /* GATEWAY */ 156#else /* GATEWAY */
157#define IPFORWARDING 0 /* don't forward IP packets not for us */ 157#define IPFORWARDING 0 /* don't forward IP packets not for us */
158#endif /* GATEWAY */ 158#endif /* GATEWAY */
159#endif /* IPFORWARDING */ 159#endif /* IPFORWARDING */
160#ifndef IPSENDREDIRECTS 160#ifndef IPSENDREDIRECTS
161#define IPSENDREDIRECTS 1 161#define IPSENDREDIRECTS 1
162#endif 162#endif
163#ifndef IPFORWSRCRT 163#ifndef IPFORWSRCRT
164#define IPFORWSRCRT 1 /* forward source-routed packets */ 164#define IPFORWSRCRT 1 /* forward source-routed packets */
165#endif 165#endif
166#ifndef IPALLOWSRCRT 166#ifndef IPALLOWSRCRT
167#define IPALLOWSRCRT 1 /* allow source-routed packets */ 167#define IPALLOWSRCRT 1 /* allow source-routed packets */
168#endif 168#endif
169#ifndef IPMTUDISC 169#ifndef IPMTUDISC
170#define IPMTUDISC 1 170#define IPMTUDISC 1
171#endif 171#endif
172#ifndef IPMTUDISCTIMEOUT 172#ifndef IPMTUDISCTIMEOUT
173#define IPMTUDISCTIMEOUT (10 * 60) /* as per RFC 1191 */ 173#define IPMTUDISCTIMEOUT (10 * 60) /* as per RFC 1191 */
174#endif 174#endif
175 175
176/* 176/*
177 * Note: DIRECTED_BROADCAST is handled this way so that previous 177 * Note: DIRECTED_BROADCAST is handled this way so that previous
178 * configuration using this option will Just Work. 178 * configuration using this option will Just Work.
179 */ 179 */
180#ifndef IPDIRECTEDBCAST 180#ifndef IPDIRECTEDBCAST
181#ifdef DIRECTED_BROADCAST 181#ifdef DIRECTED_BROADCAST
182#define IPDIRECTEDBCAST 1 182#define IPDIRECTEDBCAST 1
183#else 183#else
184#define IPDIRECTEDBCAST 0 184#define IPDIRECTEDBCAST 0
185#endif /* DIRECTED_BROADCAST */ 185#endif /* DIRECTED_BROADCAST */
186#endif /* IPDIRECTEDBCAST */ 186#endif /* IPDIRECTEDBCAST */
187int ipforwarding = IPFORWARDING; 187int ipforwarding = IPFORWARDING;
188int ipsendredirects = IPSENDREDIRECTS; 188int ipsendredirects = IPSENDREDIRECTS;
189int ip_defttl = IPDEFTTL; 189int ip_defttl = IPDEFTTL;
190int ip_forwsrcrt = IPFORWSRCRT; 190int ip_forwsrcrt = IPFORWSRCRT;
191int ip_directedbcast = IPDIRECTEDBCAST; 191int ip_directedbcast = IPDIRECTEDBCAST;
192int ip_allowsrcrt = IPALLOWSRCRT; 192int ip_allowsrcrt = IPALLOWSRCRT;
193int ip_mtudisc = IPMTUDISC; 193int ip_mtudisc = IPMTUDISC;
194int ip_mtudisc_timeout = IPMTUDISCTIMEOUT; 194int ip_mtudisc_timeout = IPMTUDISCTIMEOUT;
195#ifdef DIAGNOSTIC 195#ifdef DIAGNOSTIC
196int ipprintfs = 0; 196int ipprintfs = 0;
197#endif 197#endif
198 198
199int ip_do_randomid = 0; 199int ip_do_randomid = 0;
200 200
201/* 201/*
202 * XXX - Setting ip_checkinterface mostly implements the receive side of 202 * XXX - Setting ip_checkinterface mostly implements the receive side of
203 * the Strong ES model described in RFC 1122, but since the routing table 203 * the Strong ES model described in RFC 1122, but since the routing table
204 * and transmit implementation do not implement the Strong ES model, 204 * and transmit implementation do not implement the Strong ES model,
205 * setting this to 1 results in an odd hybrid. 205 * setting this to 1 results in an odd hybrid.
206 * 206 *
207 * XXX - ip_checkinterface currently must be disabled if you use ipnat 207 * XXX - ip_checkinterface currently must be disabled if you use ipnat
208 * to translate the destination address to another local interface. 208 * to translate the destination address to another local interface.
209 * 209 *
210 * XXX - ip_checkinterface must be disabled if you add IP aliases 210 * XXX - ip_checkinterface must be disabled if you add IP aliases
211 * to the loopback interface instead of the interface where the 211 * to the loopback interface instead of the interface where the
212 * packets for those addresses are received. 212 * packets for those addresses are received.
213 */ 213 */
214int ip_checkinterface = 0; 214int ip_checkinterface = 0;
215 215
216 216
217struct rttimer_queue *ip_mtudisc_timeout_q = NULL; 217struct rttimer_queue *ip_mtudisc_timeout_q = NULL;
218 218
219int ipqmaxlen = IFQ_MAXLEN; 219int ipqmaxlen = IFQ_MAXLEN;
220u_long in_ifaddrhash; /* size of hash table - 1 */ 220u_long in_ifaddrhash; /* size of hash table - 1 */
221int in_ifaddrentries; /* total number of addrs */ 221int in_ifaddrentries; /* total number of addrs */
222struct in_ifaddrhead in_ifaddrhead; 222struct in_ifaddrhead in_ifaddrhead;
223struct in_ifaddrhashhead *in_ifaddrhashtbl; 223struct in_ifaddrhashhead *in_ifaddrhashtbl;
224u_long in_multihash; /* size of hash table - 1 */ 224u_long in_multihash; /* size of hash table - 1 */
225int in_multientries; /* total number of addrs */ 225int in_multientries; /* total number of addrs */
226struct in_multihashhead *in_multihashtbl; 226struct in_multihashhead *in_multihashtbl;
227struct ifqueue ipintrq; 227struct ifqueue ipintrq;
228uint16_t ip_id; 228uint16_t ip_id;
229 229
230percpu_t *ipstat_percpu; 230percpu_t *ipstat_percpu;
231 231
232#ifdef PFIL_HOOKS 232#ifdef PFIL_HOOKS
233struct pfil_head inet_pfil_hook; 233struct pfil_head inet_pfil_hook;
234#endif 234#endif
235 235
236/* 236/*
237 * Cached copy of nmbclusters. If nmbclusters is different, 237 * Cached copy of nmbclusters. If nmbclusters is different,
238 * recalculate IP parameters derived from nmbclusters. 238 * recalculate IP parameters derived from nmbclusters.
239 */ 239 */
240static int ip_nmbclusters; /* copy of nmbclusters */ 240static int ip_nmbclusters; /* copy of nmbclusters */
241static void ip_nmbclusters_changed(void); /* recalc limits */ 241static void ip_nmbclusters_changed(void); /* recalc limits */
242 242
243#define CHECK_NMBCLUSTER_PARAMS() \ 243#define CHECK_NMBCLUSTER_PARAMS() \
244do { \ 244do { \
245 if (__predict_false(ip_nmbclusters != nmbclusters)) \ 245 if (__predict_false(ip_nmbclusters != nmbclusters)) \
246 ip_nmbclusters_changed(); \ 246 ip_nmbclusters_changed(); \
247} while (/*CONSTCOND*/0) 247} while (/*CONSTCOND*/0)
248 248
249/* IP datagram reassembly queues (hashed) */ 249/* IP datagram reassembly queues (hashed) */
250#define IPREASS_NHASH_LOG2 6 250#define IPREASS_NHASH_LOG2 6
251#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) 251#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2)
252#define IPREASS_HMASK (IPREASS_NHASH - 1) 252#define IPREASS_HMASK (IPREASS_NHASH - 1)
253#define IPREASS_HASH(x,y) \ 253#define IPREASS_HASH(x,y) \
254 (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) 254 (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
255struct ipqhead ipq[IPREASS_NHASH]; 255struct ipqhead ipq[IPREASS_NHASH];
256int ipq_locked; 256int ipq_locked;
257static int ip_nfragpackets; /* packets in reass queue */ 257static int ip_nfragpackets; /* packets in reass queue */
258static int ip_nfrags; /* total fragments in reass queues */ 258static int ip_nfrags; /* total fragments in reass queues */
259 259
260int ip_maxfragpackets = 200; /* limit on packets. XXX sysctl */ 260int ip_maxfragpackets = 200; /* limit on packets. XXX sysctl */
261int ip_maxfrags; /* limit on fragments. XXX sysctl */ 261int ip_maxfrags; /* limit on fragments. XXX sysctl */
262 262
263 263
264/* 264/*
265 * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for 265 * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for
266 * IP reassembly queue buffer management. 266 * IP reassembly queue buffer management.
267 * 267 *
268 * We keep a count of total IP fragments (NB: not fragmented packets!) 268 * We keep a count of total IP fragments (NB: not fragmented packets!)
269 * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments. 269 * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments.
270 * If ip_nfrags exceeds the ip_maxfrags limit, we drop half the 270 * If ip_nfrags exceeds the ip_maxfrags limit, we drop half the
271 * total fragments in reassembly queues. This AIMD policy avoids 271 * total fragments in reassembly queues. This AIMD policy avoids
272 * repeatedly deleting single packets under heavy fragmentation load 272 * repeatedly deleting single packets under heavy fragmentation load
273 * (e.g., from lossy NFS peers). 273 * (e.g., from lossy NFS peers).
274 */ 274 */
275static u_int ip_reass_ttl_decr(u_int ticks); 275static u_int ip_reass_ttl_decr(u_int ticks);
276static void ip_reass_drophalf(void); 276static void ip_reass_drophalf(void);
277 277
278 278
279static inline int ipq_lock_try(void); 279static inline int ipq_lock_try(void);
280static inline void ipq_unlock(void); 280static inline void ipq_unlock(void);
281 281
282static inline int 282static inline int
283ipq_lock_try(void) 283ipq_lock_try(void)
284{ 284{
285 int s; 285 int s;
286 286
287 /* 287 /*
288 * Use splvm() -- we're blocking things that would cause 288 * Use splvm() -- we're blocking things that would cause
289 * mbuf allocation. 289 * mbuf allocation.
290 */ 290 */
291 s = splvm(); 291 s = splvm();
292 if (ipq_locked) { 292 if (ipq_locked) {
293 splx(s); 293 splx(s);
294 return (0); 294 return (0);
295 } 295 }
296 ipq_locked = 1; 296 ipq_locked = 1;
297 splx(s); 297 splx(s);
298 return (1); 298 return (1);
299} 299}
300 300
301static inline void 301static inline void
302ipq_unlock(void) 302ipq_unlock(void)
303{ 303{
304 int s; 304 int s;
305 305
306 s = splvm(); 306 s = splvm();
307 ipq_locked = 0; 307 ipq_locked = 0;
308 splx(s); 308 splx(s);
309} 309}
310 310
311#ifdef DIAGNOSTIC 311#ifdef DIAGNOSTIC
312#define IPQ_LOCK() \ 312#define IPQ_LOCK() \
313do { \ 313do { \
314 if (ipq_lock_try() == 0) { \ 314 if (ipq_lock_try() == 0) { \
315 printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \ 315 printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \
316 panic("ipq_lock"); \ 316 panic("ipq_lock"); \
317 } \ 317 } \
318} while (/*CONSTCOND*/ 0) 318} while (/*CONSTCOND*/ 0)
319#define IPQ_LOCK_CHECK() \ 319#define IPQ_LOCK_CHECK() \
320do { \ 320do { \
321 if (ipq_locked == 0) { \ 321 if (ipq_locked == 0) { \
322 printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \ 322 printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \
323 panic("ipq lock check"); \ 323 panic("ipq lock check"); \
324 } \ 324 } \
325} while (/*CONSTCOND*/ 0) 325} while (/*CONSTCOND*/ 0)
326#else 326#else
327#define IPQ_LOCK() (void) ipq_lock_try() 327#define IPQ_LOCK() (void) ipq_lock_try()
328#define IPQ_LOCK_CHECK() /* nothing */ 328#define IPQ_LOCK_CHECK() /* nothing */
329#endif 329#endif
330 330
331#define IPQ_UNLOCK() ipq_unlock() 331#define IPQ_UNLOCK() ipq_unlock()
332 332
333struct pool inmulti_pool; 333struct pool inmulti_pool;
334struct pool ipqent_pool; 334struct pool ipqent_pool;
335 335
336#ifdef INET_CSUM_COUNTERS 336#ifdef INET_CSUM_COUNTERS
337#include <sys/device.h> 337#include <sys/device.h>
338 338
339struct evcnt ip_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 339struct evcnt ip_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
340 NULL, "inet", "hwcsum bad"); 340 NULL, "inet", "hwcsum bad");
341struct evcnt ip_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 341struct evcnt ip_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
342 NULL, "inet", "hwcsum ok"); 342 NULL, "inet", "hwcsum ok");
343struct evcnt ip_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 343struct evcnt ip_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
344 NULL, "inet", "swcsum"); 344 NULL, "inet", "swcsum");
345 345
346#define INET_CSUM_COUNTER_INCR(ev) (ev)->ev_count++ 346#define INET_CSUM_COUNTER_INCR(ev) (ev)->ev_count++
347 347
348EVCNT_ATTACH_STATIC(ip_hwcsum_bad); 348EVCNT_ATTACH_STATIC(ip_hwcsum_bad);
349EVCNT_ATTACH_STATIC(ip_hwcsum_ok); 349EVCNT_ATTACH_STATIC(ip_hwcsum_ok);
350EVCNT_ATTACH_STATIC(ip_swcsum); 350EVCNT_ATTACH_STATIC(ip_swcsum);
351 351
352#else 352#else
353 353
354#define INET_CSUM_COUNTER_INCR(ev) /* nothing */ 354#define INET_CSUM_COUNTER_INCR(ev) /* nothing */
355 355
356#endif /* INET_CSUM_COUNTERS */ 356#endif /* INET_CSUM_COUNTERS */
357 357
358/* 358/*
359 * We need to save the IP options in case a protocol wants to respond 359 * We need to save the IP options in case a protocol wants to respond
360 * to an incoming packet over the same route if the packet got here 360 * to an incoming packet over the same route if the packet got here
361 * using IP source routing. This allows connection establishment and 361 * using IP source routing. This allows connection establishment and
362 * maintenance when the remote end is on a network that is not known 362 * maintenance when the remote end is on a network that is not known
363 * to us. 363 * to us.
364 */ 364 */
365int ip_nhops = 0; 365int ip_nhops = 0;
366static struct ip_srcrt { 366static struct ip_srcrt {
367 struct in_addr dst; /* final destination */ 367 struct in_addr dst; /* final destination */
368 char nop; /* one NOP to align */ 368 char nop; /* one NOP to align */
369 char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */ 369 char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */
370 struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)]; 370 struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)];
371} ip_srcrt; 371} ip_srcrt;
372 372
373static void save_rte(u_char *, struct in_addr); 373static void save_rte(u_char *, struct in_addr);
374 374
375#ifdef MBUFTRACE 375#ifdef MBUFTRACE
376struct mowner ip_rx_mowner = MOWNER_INIT("internet", "rx"); 376struct mowner ip_rx_mowner = MOWNER_INIT("internet", "rx");
377struct mowner ip_tx_mowner = MOWNER_INIT("internet", "tx"); 377struct mowner ip_tx_mowner = MOWNER_INIT("internet", "tx");
378#endif 378#endif
379 379
380/* 380/*
381 * Compute IP limits derived from the value of nmbclusters. 381 * Compute IP limits derived from the value of nmbclusters.
382 */ 382 */
383static void 383static void
384ip_nmbclusters_changed(void) 384ip_nmbclusters_changed(void)
385{ 385{
386 ip_maxfrags = nmbclusters / 4; 386 ip_maxfrags = nmbclusters / 4;
387 ip_nmbclusters = nmbclusters; 387 ip_nmbclusters = nmbclusters;
388} 388}
389 389
390/* 390/*
391 * IP initialization: fill in IP protocol switch table. 391 * IP initialization: fill in IP protocol switch table.
392 * All protocols not implemented in kernel go to raw IP protocol handler. 392 * All protocols not implemented in kernel go to raw IP protocol handler.
393 */ 393 */
394void 394void
395ip_init(void) 395ip_init(void)
396{ 396{
397 const struct protosw *pr; 397 const struct protosw *pr;
398 int i; 398 int i;
399 399
400 pool_init(&inmulti_pool, sizeof(struct in_multi), 0, 0, 0, "inmltpl", 400 pool_init(&inmulti_pool, sizeof(struct in_multi), 0, 0, 0, "inmltpl",
401 NULL, IPL_SOFTNET); 401 NULL, IPL_SOFTNET);
402 pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl", 402 pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl",
403 NULL, IPL_VM); 403 NULL, IPL_VM);
404 404
405 pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); 405 pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
406 if (pr == 0) 406 if (pr == 0)
407 panic("ip_init"); 407 panic("ip_init");
408 for (i = 0; i < IPPROTO_MAX; i++) 408 for (i = 0; i < IPPROTO_MAX; i++)
409 ip_protox[i] = pr - inetsw; 409 ip_protox[i] = pr - inetsw;
410 for (pr = inetdomain.dom_protosw; 410 for (pr = inetdomain.dom_protosw;
411 pr < inetdomain.dom_protoswNPROTOSW; pr++) 411 pr < inetdomain.dom_protoswNPROTOSW; pr++)
412 if (pr->pr_domain->dom_family == PF_INET && 412 if (pr->pr_domain->dom_family == PF_INET &&
413 pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) 413 pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
414 ip_protox[pr->pr_protocol] = pr - inetsw; 414 ip_protox[pr->pr_protocol] = pr - inetsw;
415 415
416 for (i = 0; i < IPREASS_NHASH; i++) 416 for (i = 0; i < IPREASS_NHASH; i++)
417 LIST_INIT(&ipq[i]); 417 LIST_INIT(&ipq[i]);
418 418
419 ip_initid(); 419 ip_initid();
420 ip_id = time_second & 0xfffff; 420 ip_id = time_second & 0xfffff;
421 421
422 ipintrq.ifq_maxlen = ipqmaxlen; 422 ipintrq.ifq_maxlen = ipqmaxlen;
423 ip_nmbclusters_changed(); 423 ip_nmbclusters_changed();
424 424
425 TAILQ_INIT(&in_ifaddrhead); 425 TAILQ_INIT(&in_ifaddrhead);
426 in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, true, 426 in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, true,
427 &in_ifaddrhash); 427 &in_ifaddrhash);
428 in_multihashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, true, 428 in_multihashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, true,
429 &in_multihash); 429 &in_multihash);
430 ip_mtudisc_timeout_q = rt_timer_queue_create(ip_mtudisc_timeout); 430 ip_mtudisc_timeout_q = rt_timer_queue_create(ip_mtudisc_timeout);
431#ifdef GATEWAY 431#ifdef GATEWAY
432 ipflow_init(ip_hashsize); 432 ipflow_init(ip_hashsize);
433#endif 433#endif
434 434
435#ifdef PFIL_HOOKS 435#ifdef PFIL_HOOKS
436 /* Register our Packet Filter hook. */ 436 /* Register our Packet Filter hook. */
437 inet_pfil_hook.ph_type = PFIL_TYPE_AF; 437 inet_pfil_hook.ph_type = PFIL_TYPE_AF;
438 inet_pfil_hook.ph_af = AF_INET; 438 inet_pfil_hook.ph_af = AF_INET;
439 i = pfil_head_register(&inet_pfil_hook); 439 i = pfil_head_register(&inet_pfil_hook);
440 if (i != 0) 440 if (i != 0)
441 printf("ip_init: WARNING: unable to register pfil hook, " 441 printf("ip_init: WARNING: unable to register pfil hook, "
442 "error %d\n", i); 442 "error %d\n", i);
443#endif /* PFIL_HOOKS */ 443#endif /* PFIL_HOOKS */
444 444
445#ifdef MBUFTRACE 445#ifdef MBUFTRACE
446 MOWNER_ATTACH(&ip_tx_mowner); 446 MOWNER_ATTACH(&ip_tx_mowner);
447 MOWNER_ATTACH(&ip_rx_mowner); 447 MOWNER_ATTACH(&ip_rx_mowner);
448#endif /* MBUFTRACE */ 448#endif /* MBUFTRACE */
449 449
450 ipstat_percpu = percpu_alloc(sizeof(uint64_t) * IP_NSTATS); 450 ipstat_percpu = percpu_alloc(sizeof(uint64_t) * IP_NSTATS);
451} 451}
452 452
453struct sockaddr_in ipaddr = { 453struct sockaddr_in ipaddr = {
454 .sin_len = sizeof(ipaddr), 454 .sin_len = sizeof(ipaddr),
455 .sin_family = AF_INET, 455 .sin_family = AF_INET,
456}; 456};
457struct route ipforward_rt; 457struct route ipforward_rt;
458 458
459/* 459/*
460 * IP software interrupt routine 460 * IP software interrupt routine
461 */ 461 */
462void 462void
463ipintr(void) 463ipintr(void)
464{ 464{
465 int s; 465 int s;
466 struct mbuf *m; 466 struct mbuf *m;
467 467
468 mutex_enter(softnet_lock); 468 mutex_enter(softnet_lock);
469 KERNEL_LOCK(1, NULL); 469 KERNEL_LOCK(1, NULL);
470 while (!IF_IS_EMPTY(&ipintrq)) { 470 while (!IF_IS_EMPTY(&ipintrq)) {
471 s = splnet(); 471 s = splnet();
472 IF_DEQUEUE(&ipintrq, m); 472 IF_DEQUEUE(&ipintrq, m);
473 splx(s); 473 splx(s);
474 if (m == NULL) 474 if (m == NULL)
475 break; 475 break;
476 KERNEL_UNLOCK_ONE(NULL); 
477 ip_input(m); 476 ip_input(m);
478 KERNEL_LOCK(1, NULL); 
479 } 477 }
480 KERNEL_UNLOCK_ONE(NULL); 478 KERNEL_UNLOCK_ONE(NULL);
481 mutex_exit(softnet_lock); 479 mutex_exit(softnet_lock);
482} 480}
483 481
484/* 482/*
485 * Ip input routine. Checksum and byte swap header. If fragmented 483 * Ip input routine. Checksum and byte swap header. If fragmented
486 * try to reassemble. Process options. Pass to next level. 484 * try to reassemble. Process options. Pass to next level.
487 */ 485 */
488void 486void
489ip_input(struct mbuf *m) 487ip_input(struct mbuf *m)
490{ 488{
491 struct ip *ip = NULL; 489 struct ip *ip = NULL;
492 struct ipq *fp; 490 struct ipq *fp;
493 struct in_ifaddr *ia; 491 struct in_ifaddr *ia;
494 struct ifaddr *ifa; 492 struct ifaddr *ifa;
495 struct ipqent *ipqe; 493 struct ipqent *ipqe;
496 int hlen = 0, mff, len; 494 int hlen = 0, mff, len;
497 int downmatch; 495 int downmatch;
498 int checkif; 496 int checkif;
499 int srcrt = 0; 497 int srcrt = 0;
500 int s; 498 int s;
501 u_int hash; 499 u_int hash;
502#ifdef FAST_IPSEC 500#ifdef FAST_IPSEC
503 struct m_tag *mtag; 501 struct m_tag *mtag;
504 struct tdb_ident *tdbi; 502 struct tdb_ident *tdbi;
505 struct secpolicy *sp; 503 struct secpolicy *sp;
506 int error; 504 int error;
507#endif /* FAST_IPSEC */ 505#endif /* FAST_IPSEC */
508 506
509 MCLAIM(m, &ip_rx_mowner); 507 MCLAIM(m, &ip_rx_mowner);
510#ifdef DIAGNOSTIC 508#ifdef DIAGNOSTIC
511 if ((m->m_flags & M_PKTHDR) == 0) 509 if ((m->m_flags & M_PKTHDR) == 0)
512 panic("ipintr no HDR"); 510 panic("ipintr no HDR");
513#endif 511#endif
514 512
515 /* 513 /*
516 * If no IP addresses have been set yet but the interfaces 514 * If no IP addresses have been set yet but the interfaces
517 * are receiving, can't do anything with incoming packets yet. 515 * are receiving, can't do anything with incoming packets yet.
518 */ 516 */
519 if (TAILQ_FIRST(&in_ifaddrhead) == 0) 517 if (TAILQ_FIRST(&in_ifaddrhead) == 0)
520 goto bad; 518 goto bad;
521 IP_STATINC(IP_STAT_TOTAL); 519 IP_STATINC(IP_STAT_TOTAL);
522 /* 520 /*
523 * If the IP header is not aligned, slurp it up into a new 521 * If the IP header is not aligned, slurp it up into a new
524 * mbuf with space for link headers, in the event we forward 522 * mbuf with space for link headers, in the event we forward
525 * it. Otherwise, if it is aligned, make sure the entire 523 * it. Otherwise, if it is aligned, make sure the entire
526 * base IP header is in the first mbuf of the chain. 524 * base IP header is in the first mbuf of the chain.
527 */ 525 */
528 if (IP_HDR_ALIGNED_P(mtod(m, void *)) == 0) { 526 if (IP_HDR_ALIGNED_P(mtod(m, void *)) == 0) {
529 if ((m = m_copyup(m, sizeof(struct ip), 527 if ((m = m_copyup(m, sizeof(struct ip),
530 (max_linkhdr + 3) & ~3)) == NULL) { 528 (max_linkhdr + 3) & ~3)) == NULL) {
531 /* XXXJRT new stat, please */ 529 /* XXXJRT new stat, please */
532 IP_STATINC(IP_STAT_TOOSMALL); 530 IP_STATINC(IP_STAT_TOOSMALL);
533 return; 531 return;
534 } 532 }
535 } else if (__predict_false(m->m_len < sizeof (struct ip))) { 533 } else if (__predict_false(m->m_len < sizeof (struct ip))) {
536 if ((m = m_pullup(m, sizeof (struct ip))) == NULL) { 534 if ((m = m_pullup(m, sizeof (struct ip))) == NULL) {
537 IP_STATINC(IP_STAT_TOOSMALL); 535 IP_STATINC(IP_STAT_TOOSMALL);
538 return; 536 return;
539 } 537 }
540 } 538 }
541 ip = mtod(m, struct ip *); 539 ip = mtod(m, struct ip *);
542 if (ip->ip_v != IPVERSION) { 540 if (ip->ip_v != IPVERSION) {
543 IP_STATINC(IP_STAT_BADVERS); 541 IP_STATINC(IP_STAT_BADVERS);
544 goto bad; 542 goto bad;
545 } 543 }
546 hlen = ip->ip_hl << 2; 544 hlen = ip->ip_hl << 2;
547 if (hlen < sizeof(struct ip)) { /* minimum header length */ 545 if (hlen < sizeof(struct ip)) { /* minimum header length */
548 IP_STATINC(IP_STAT_BADHLEN); 546 IP_STATINC(IP_STAT_BADHLEN);
549 goto bad; 547 goto bad;
550 } 548 }
551 if (hlen > m->m_len) { 549 if (hlen > m->m_len) {
552 if ((m = m_pullup(m, hlen)) == 0) { 550 if ((m = m_pullup(m, hlen)) == 0) {
553 IP_STATINC(IP_STAT_BADHLEN); 551 IP_STATINC(IP_STAT_BADHLEN);
554 return; 552 return;
555 } 553 }
556 ip = mtod(m, struct ip *); 554 ip = mtod(m, struct ip *);
557 } 555 }
558 556
559 /* 557 /*
560 * RFC1122: packets with a multicast source address are 558 * RFC1122: packets with a multicast source address are
561 * not allowed. 559 * not allowed.
562 */ 560 */
563 if (IN_MULTICAST(ip->ip_src.s_addr)) { 561 if (IN_MULTICAST(ip->ip_src.s_addr)) {
564 IP_STATINC(IP_STAT_BADADDR); 562 IP_STATINC(IP_STAT_BADADDR);
565 goto bad; 563 goto bad;
566 } 564 }
567 565
568 /* 127/8 must not appear on wire - RFC1122 */ 566 /* 127/8 must not appear on wire - RFC1122 */
569 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 567 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
570 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 568 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
571 if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { 569 if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) {
572 IP_STATINC(IP_STAT_BADADDR); 570 IP_STATINC(IP_STAT_BADADDR);
573 goto bad; 571 goto bad;
574 } 572 }
575 } 573 }
576 574
577 switch (m->m_pkthdr.csum_flags & 575 switch (m->m_pkthdr.csum_flags &
578 ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_IPv4) | 576 ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_IPv4) |
579 M_CSUM_IPv4_BAD)) { 577 M_CSUM_IPv4_BAD)) {
580 case M_CSUM_IPv4|M_CSUM_IPv4_BAD: 578 case M_CSUM_IPv4|M_CSUM_IPv4_BAD:
581 INET_CSUM_COUNTER_INCR(&ip_hwcsum_bad); 579 INET_CSUM_COUNTER_INCR(&ip_hwcsum_bad);
582 goto badcsum; 580 goto badcsum;
583 581
584 case M_CSUM_IPv4: 582 case M_CSUM_IPv4:
585 /* Checksum was okay. */ 583 /* Checksum was okay. */
586 INET_CSUM_COUNTER_INCR(&ip_hwcsum_ok); 584 INET_CSUM_COUNTER_INCR(&ip_hwcsum_ok);
587 break; 585 break;
588 586
589 default: 587 default:
590 /* 588 /*
591 * Must compute it ourselves. Maybe skip checksum on 589 * Must compute it ourselves. Maybe skip checksum on
592 * loopback interfaces. 590 * loopback interfaces.
593 */ 591 */
594 if (__predict_true(!(m->m_pkthdr.rcvif->if_flags & 592 if (__predict_true(!(m->m_pkthdr.rcvif->if_flags &
595 IFF_LOOPBACK) || ip_do_loopback_cksum)) { 593 IFF_LOOPBACK) || ip_do_loopback_cksum)) {
596 INET_CSUM_COUNTER_INCR(&ip_swcsum); 594 INET_CSUM_COUNTER_INCR(&ip_swcsum);
597 if (in_cksum(m, hlen) != 0) 595 if (in_cksum(m, hlen) != 0)
598 goto badcsum; 596 goto badcsum;
599 } 597 }
600 break; 598 break;
601 } 599 }
602 600
603 /* Retrieve the packet length. */ 601 /* Retrieve the packet length. */
604 len = ntohs(ip->ip_len); 602 len = ntohs(ip->ip_len);
605 603
606 /* 604 /*
607 * Check for additional length bogosity 605 * Check for additional length bogosity
608 */ 606 */
609 if (len < hlen) { 607 if (len < hlen) {
610 IP_STATINC(IP_STAT_BADLEN); 608 IP_STATINC(IP_STAT_BADLEN);
611 goto bad; 609 goto bad;
612 } 610 }
613 611
614 /* 612 /*
615 * Check that the amount of data in the buffers 613 * Check that the amount of data in the buffers
616 * is at least as much as the IP header would have us expect. 614 * is at least as much as the IP header would have us expect.
617 * Trim mbufs if longer than we expect. 615 * Trim mbufs if longer than we expect.
618 * Drop packet if shorter than we expect. 616 * Drop packet if shorter than we expect.
619 */ 617 */
620 if (m->m_pkthdr.len < len) { 618 if (m->m_pkthdr.len < len) {
621 IP_STATINC(IP_STAT_TOOSHORT); 619 IP_STATINC(IP_STAT_TOOSHORT);
622 goto bad; 620 goto bad;
623 } 621 }
624 if (m->m_pkthdr.len > len) { 622 if (m->m_pkthdr.len > len) {
625 if (m->m_len == m->m_pkthdr.len) { 623 if (m->m_len == m->m_pkthdr.len) {
626 m->m_len = len; 624 m->m_len = len;
627 m->m_pkthdr.len = len; 625 m->m_pkthdr.len = len;
628 } else 626 } else
629 m_adj(m, len - m->m_pkthdr.len); 627 m_adj(m, len - m->m_pkthdr.len);
630 } 628 }
631 629
632#if defined(IPSEC) 630#if defined(IPSEC)
633 /* ipflow (IP fast forwarding) is not compatible with IPsec. */ 631 /* ipflow (IP fast forwarding) is not compatible with IPsec. */
634 m->m_flags &= ~M_CANFASTFWD; 632 m->m_flags &= ~M_CANFASTFWD;
635#else 633#else
636 /* 634 /*
637 * Assume that we can create a fast-forward IP flow entry 635 * Assume that we can create a fast-forward IP flow entry
638 * based on this packet. 636 * based on this packet.
639 */ 637 */
640 m->m_flags |= M_CANFASTFWD; 638 m->m_flags |= M_CANFASTFWD;
641#endif 639#endif
642 640
643#ifdef PFIL_HOOKS 641#ifdef PFIL_HOOKS
644 /* 642 /*
645 * Run through list of hooks for input packets. If there are any 643 * Run through list of hooks for input packets. If there are any
646 * filters which require that additional packets in the flow are 644 * filters which require that additional packets in the flow are
647 * not fast-forwarded, they must clear the M_CANFASTFWD flag. 645 * not fast-forwarded, they must clear the M_CANFASTFWD flag.
648 * Note that filters must _never_ set this flag, as another filter 646 * Note that filters must _never_ set this flag, as another filter
649 * in the list may have previously cleared it. 647 * in the list may have previously cleared it.
650 */ 648 */
651 /* 649 /*
652 * let ipfilter look at packet on the wire, 650 * let ipfilter look at packet on the wire,
653 * not the decapsulated packet. 651 * not the decapsulated packet.
654 */ 652 */
655#ifdef IPSEC 653#ifdef IPSEC
656 if (!ipsec_getnhist(m)) 654 if (!ipsec_getnhist(m))
657#elif defined(FAST_IPSEC) 655#elif defined(FAST_IPSEC)
658 if (!ipsec_indone(m)) 656 if (!ipsec_indone(m))
659#else 657#else
660 if (1) 658 if (1)
661#endif 659#endif
662 { 660 {
663 struct in_addr odst; 661 struct in_addr odst;
664 662
665 odst = ip->ip_dst; 663 odst = ip->ip_dst;
666 if (pfil_run_hooks(&inet_pfil_hook, &m, m->m_pkthdr.rcvif, 664 if (pfil_run_hooks(&inet_pfil_hook, &m, m->m_pkthdr.rcvif,
667 PFIL_IN) != 0) 665 PFIL_IN) != 0)
668 return; 666 return;
669 if (m == NULL) 667 if (m == NULL)
670 return; 668 return;
671 ip = mtod(m, struct ip *); 669 ip = mtod(m, struct ip *);
672 hlen = ip->ip_hl << 2; 670 hlen = ip->ip_hl << 2;
673 /* 671 /*
674 * XXX The setting of "srcrt" here is to prevent ip_forward() 672 * XXX The setting of "srcrt" here is to prevent ip_forward()
675 * from generating ICMP redirects for packets that have 673 * from generating ICMP redirects for packets that have
676 * been redirected by a hook back out on to the same LAN that 674 * been redirected by a hook back out on to the same LAN that
677 * they came from and is not an indication that the packet 675 * they came from and is not an indication that the packet
678 * is being influenced by source routing options. This 676 * is being influenced by source routing options. This
679 * allows things like 677 * allows things like
680 * "rdr tlp0 0/0 port 80 -> 1.1.1.200 3128 tcp" 678 * "rdr tlp0 0/0 port 80 -> 1.1.1.200 3128 tcp"
681 * where tlp0 is both on the 1.1.1.0/24 network and is the 679 * where tlp0 is both on the 1.1.1.0/24 network and is the
682 * default route for hosts on 1.1.1.0/24. Of course this 680 * default route for hosts on 1.1.1.0/24. Of course this
683 * also requires a "map tlp0 ..." to complete the story. 681 * also requires a "map tlp0 ..." to complete the story.
684 * One might argue whether or not this kind of network config. 682 * One might argue whether or not this kind of network config.
685 * should be supported in this manner... 683 * should be supported in this manner...
686 */ 684 */
687 srcrt = (odst.s_addr != ip->ip_dst.s_addr); 685 srcrt = (odst.s_addr != ip->ip_dst.s_addr);
688 } 686 }
689#endif /* PFIL_HOOKS */ 687#endif /* PFIL_HOOKS */
690 688
691#ifdef ALTQ 689#ifdef ALTQ
692 /* XXX Temporary until ALTQ is changed to use a pfil hook */ 690 /* XXX Temporary until ALTQ is changed to use a pfil hook */
693 if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) { 691 if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) {
694 /* packet dropped by traffic conditioner */ 692 /* packet dropped by traffic conditioner */
695 return; 693 return;
696 } 694 }
697#endif 695#endif
698 696
699 /* 697 /*
700 * Process options and, if not destined for us, 698 * Process options and, if not destined for us,
701 * ship it on. ip_dooptions returns 1 when an 699 * ship it on. ip_dooptions returns 1 when an
702 * error was detected (causing an icmp message 700 * error was detected (causing an icmp message
703 * to be sent and the original packet to be freed). 701 * to be sent and the original packet to be freed).
704 */ 702 */
705 ip_nhops = 0; /* for source routed packets */ 703 ip_nhops = 0; /* for source routed packets */
706 if (hlen > sizeof (struct ip) && ip_dooptions(m)) 704 if (hlen > sizeof (struct ip) && ip_dooptions(m))
707 return; 705 return;
708 706
709 /* 707 /*
710 * Enable a consistency check between the destination address 708 * Enable a consistency check between the destination address
711 * and the arrival interface for a unicast packet (the RFC 1122 709 * and the arrival interface for a unicast packet (the RFC 1122
712 * strong ES model) if IP forwarding is disabled and the packet 710 * strong ES model) if IP forwarding is disabled and the packet
713 * is not locally generated. 711 * is not locally generated.
714 * 712 *
715 * XXX - Checking also should be disabled if the destination 713 * XXX - Checking also should be disabled if the destination
716 * address is ipnat'ed to a different interface. 714 * address is ipnat'ed to a different interface.
717 * 715 *
718 * XXX - Checking is incompatible with IP aliases added 716 * XXX - Checking is incompatible with IP aliases added
719 * to the loopback interface instead of the interface where 717 * to the loopback interface instead of the interface where
720 * the packets are received. 718 * the packets are received.
721 * 719 *
722 * XXX - We need to add a per ifaddr flag for this so that 720 * XXX - We need to add a per ifaddr flag for this so that
723 * we get finer grain control. 721 * we get finer grain control.
724 */ 722 */
725 checkif = ip_checkinterface && (ipforwarding == 0) && 723 checkif = ip_checkinterface && (ipforwarding == 0) &&
726 (m->m_pkthdr.rcvif != NULL) && 724 (m->m_pkthdr.rcvif != NULL) &&
727 ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0); 725 ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0);
728 726
729 /* 727 /*
730 * Check our list of addresses, to see if the packet is for us. 728 * Check our list of addresses, to see if the packet is for us.
731 * 729 *
732 * Traditional 4.4BSD did not consult IFF_UP at all. 730 * Traditional 4.4BSD did not consult IFF_UP at all.
733 * The behavior here is to treat addresses on !IFF_UP interface 731 * The behavior here is to treat addresses on !IFF_UP interface
734 * as not mine. 732 * as not mine.
735 */ 733 */
736 downmatch = 0; 734 downmatch = 0;
737 LIST_FOREACH(ia, &IN_IFADDR_HASH(ip->ip_dst.s_addr), ia_hash) { 735 LIST_FOREACH(ia, &IN_IFADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
738 if (in_hosteq(ia->ia_addr.sin_addr, ip->ip_dst)) { 736 if (in_hosteq(ia->ia_addr.sin_addr, ip->ip_dst)) {
739 if (checkif && ia->ia_ifp != m->m_pkthdr.rcvif) 737 if (checkif && ia->ia_ifp != m->m_pkthdr.rcvif)
740 continue; 738 continue;
741 if ((ia->ia_ifp->if_flags & IFF_UP) != 0) 739 if ((ia->ia_ifp->if_flags & IFF_UP) != 0)
742 break; 740 break;
743 else 741 else
744 downmatch++; 742 downmatch++;
745 } 743 }
746 } 744 }
747 if (ia != NULL) 745 if (ia != NULL)
748 goto ours; 746 goto ours;
749 if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) { 747 if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) {
750 IFADDR_FOREACH(ifa, m->m_pkthdr.rcvif) { 748 IFADDR_FOREACH(ifa, m->m_pkthdr.rcvif) {
751 if (ifa->ifa_addr->sa_family != AF_INET) 749 if (ifa->ifa_addr->sa_family != AF_INET)
752 continue; 750 continue;
753 ia = ifatoia(ifa); 751 ia = ifatoia(ifa);
754 if (in_hosteq(ip->ip_dst, ia->ia_broadaddr.sin_addr) || 752 if (in_hosteq(ip->ip_dst, ia->ia_broadaddr.sin_addr) ||
755 in_hosteq(ip->ip_dst, ia->ia_netbroadcast) || 753 in_hosteq(ip->ip_dst, ia->ia_netbroadcast) ||
756 /* 754 /*
757 * Look for all-0's host part (old broadcast addr), 755 * Look for all-0's host part (old broadcast addr),
758 * either for subnet or net. 756 * either for subnet or net.
759 */ 757 */
760 ip->ip_dst.s_addr == ia->ia_subnet || 758 ip->ip_dst.s_addr == ia->ia_subnet ||
761 ip->ip_dst.s_addr == ia->ia_net) 759 ip->ip_dst.s_addr == ia->ia_net)
762 goto ours; 760 goto ours;
763 /* 761 /*
764 * An interface with IP address zero accepts 762 * An interface with IP address zero accepts
765 * all packets that arrive on that interface. 763 * all packets that arrive on that interface.
766 */ 764 */
767 if (in_nullhost(ia->ia_addr.sin_addr)) 765 if (in_nullhost(ia->ia_addr.sin_addr))
768 goto ours; 766 goto ours;
769 } 767 }
770 } 768 }
771 if (IN_MULTICAST(ip->ip_dst.s_addr)) { 769 if (IN_MULTICAST(ip->ip_dst.s_addr)) {
772 struct in_multi *inm; 770 struct in_multi *inm;
773#ifdef MROUTING 771#ifdef MROUTING
774 extern struct socket *ip_mrouter; 772 extern struct socket *ip_mrouter;
775 773
776 if (ip_mrouter) { 774 if (ip_mrouter) {
777 /* 775 /*
778 * If we are acting as a multicast router, all 776 * If we are acting as a multicast router, all
779 * incoming multicast packets are passed to the 777 * incoming multicast packets are passed to the
780 * kernel-level multicast forwarding function. 778 * kernel-level multicast forwarding function.
781 * The packet is returned (relatively) intact; if 779 * The packet is returned (relatively) intact; if
782 * ip_mforward() returns a non-zero value, the packet 780 * ip_mforward() returns a non-zero value, the packet
783 * must be discarded, else it may be accepted below. 781 * must be discarded, else it may be accepted below.
784 * 782 *
785 * (The IP ident field is put in the same byte order 783 * (The IP ident field is put in the same byte order
786 * as expected when ip_mforward() is called from 784 * as expected when ip_mforward() is called from
787 * ip_output().) 785 * ip_output().)
788 */ 786 */
789 if (ip_mforward(m, m->m_pkthdr.rcvif) != 0) { 787 if (ip_mforward(m, m->m_pkthdr.rcvif) != 0) {
790 IP_STATINC(IP_STAT_CANTFORWARD); 788 IP_STATINC(IP_STAT_CANTFORWARD);
791 m_freem(m); 789 m_freem(m);
792 return; 790 return;
793 } 791 }
794 792
795 /* 793 /*
796 * The process-level routing demon needs to receive 794 * The process-level routing demon needs to receive
797 * all multicast IGMP packets, whether or not this 795 * all multicast IGMP packets, whether or not this
798 * host belongs to their destination groups. 796 * host belongs to their destination groups.
799 */ 797 */
800 if (ip->ip_p == IPPROTO_IGMP) 798 if (ip->ip_p == IPPROTO_IGMP)
801 goto ours; 799 goto ours;
802 IP_STATINC(IP_STAT_CANTFORWARD); 800 IP_STATINC(IP_STAT_CANTFORWARD);
803 } 801 }
804#endif 802#endif
805 /* 803 /*
806 * See if we belong to the destination multicast group on the 804 * See if we belong to the destination multicast group on the
807 * arrival interface. 805 * arrival interface.
808 */ 806 */
809 IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm); 807 IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm);
810 if (inm == NULL) { 808 if (inm == NULL) {
811 IP_STATINC(IP_STAT_CANTFORWARD); 809 IP_STATINC(IP_STAT_CANTFORWARD);
812 m_freem(m); 810 m_freem(m);
813 return; 811 return;
814 } 812 }
815 goto ours; 813 goto ours;
816 } 814 }
817 if (ip->ip_dst.s_addr == INADDR_BROADCAST || 815 if (ip->ip_dst.s_addr == INADDR_BROADCAST ||
818 in_nullhost(ip->ip_dst)) 816 in_nullhost(ip->ip_dst))
819 goto ours; 817 goto ours;
820 818
821 /* 819 /*
822 * Not for us; forward if possible and desirable. 820 * Not for us; forward if possible and desirable.
823 */ 821 */
824 if (ipforwarding == 0) { 822 if (ipforwarding == 0) {
825 IP_STATINC(IP_STAT_CANTFORWARD); 823 IP_STATINC(IP_STAT_CANTFORWARD);
826 m_freem(m); 824 m_freem(m);
827 } else { 825 } else {
828 /* 826 /*
829 * If ip_dst matched any of my address on !IFF_UP interface, 827 * If ip_dst matched any of my address on !IFF_UP interface,
830 * and there's no IFF_UP interface that matches ip_dst, 828 * and there's no IFF_UP interface that matches ip_dst,
831 * send icmp unreach. Forwarding it will result in in-kernel 829 * send icmp unreach. Forwarding it will result in in-kernel
832 * forwarding loop till TTL goes to 0. 830 * forwarding loop till TTL goes to 0.
833 */ 831 */
834 if (downmatch) { 832 if (downmatch) {
835 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); 833 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
836 IP_STATINC(IP_STAT_CANTFORWARD); 834 IP_STATINC(IP_STAT_CANTFORWARD);
837 return; 835 return;
838 } 836 }
839#ifdef IPSEC 837#ifdef IPSEC
840 if (ipsec4_in_reject(m, NULL)) { 838 if (ipsec4_in_reject(m, NULL)) {
841 IPSEC_STATINC(IPSEC_STAT_IN_POLVIO); 839 IPSEC_STATINC(IPSEC_STAT_IN_POLVIO);
842 goto bad; 840 goto bad;
843 } 841 }
844#endif 842#endif
845#ifdef FAST_IPSEC 843#ifdef FAST_IPSEC
846 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); 844 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
847 s = splsoftnet(); 845 s = splsoftnet();
848 if (mtag != NULL) { 846 if (mtag != NULL) {
849 tdbi = (struct tdb_ident *)(mtag + 1); 847 tdbi = (struct tdb_ident *)(mtag + 1);
850 sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND); 848 sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
851 } else { 849 } else {
852 sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, 850 sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
853 IP_FORWARDING, &error); 851 IP_FORWARDING, &error);
854 } 852 }
855 if (sp == NULL) { /* NB: can happen if error */ 853 if (sp == NULL) { /* NB: can happen if error */
856 splx(s); 854 splx(s);
857 /*XXX error stat???*/ 855 /*XXX error stat???*/
858 DPRINTF(("ip_input: no SP for forwarding\n")); /*XXX*/ 856 DPRINTF(("ip_input: no SP for forwarding\n")); /*XXX*/
859 goto bad; 857 goto bad;
860 } 858 }
861 859
862 /* 860 /*
863 * Check security policy against packet attributes. 861 * Check security policy against packet attributes.
864 */ 862 */
865 error = ipsec_in_reject(sp, m); 863 error = ipsec_in_reject(sp, m);
866 KEY_FREESP(&sp); 864 KEY_FREESP(&sp);
867 splx(s); 865 splx(s);
868 if (error) { 866 if (error) {
869 IP_STATINC(IP_STAT_CANTFORWARD); 867 IP_STATINC(IP_STAT_CANTFORWARD);
870 goto bad; 868 goto bad;
871 } 869 }
872 870
873 /* 871 /*
874 * Peek at the outbound SP for this packet to determine if 872 * Peek at the outbound SP for this packet to determine if
875 * it's a Fast Forward candidate. 873 * it's a Fast Forward candidate.
876 */ 874 */
877 mtag = m_tag_find(m, PACKET_TAG_IPSEC_PENDING_TDB, NULL); 875 mtag = m_tag_find(m, PACKET_TAG_IPSEC_PENDING_TDB, NULL);
878 if (mtag != NULL) 876 if (mtag != NULL)
879 m->m_flags &= ~M_CANFASTFWD; 877 m->m_flags &= ~M_CANFASTFWD;
880 else { 878 else {
881 s = splsoftnet(); 879 s = splsoftnet();
882 sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, 880 sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND,
883 (IP_FORWARDING | 881 (IP_FORWARDING |
884 (ip_directedbcast ? IP_ALLOWBROADCAST : 0)), 882 (ip_directedbcast ? IP_ALLOWBROADCAST : 0)),
885 &error, NULL); 883 &error, NULL);
886 if (sp != NULL) { 884 if (sp != NULL) {
887 m->m_flags &= ~M_CANFASTFWD; 885 m->m_flags &= ~M_CANFASTFWD;
888 KEY_FREESP(&sp); 886 KEY_FREESP(&sp);
889 } 887 }
890 splx(s); 888 splx(s);
891 } 889 }
892#endif /* FAST_IPSEC */ 890#endif /* FAST_IPSEC */
893 891
894 ip_forward(m, srcrt); 892 ip_forward(m, srcrt);
895 } 893 }
896 return; 894 return;
897 895
898ours: 896ours:
899 /* 897 /*
900 * If offset or IP_MF are set, must reassemble. 898 * If offset or IP_MF are set, must reassemble.
901 * Otherwise, nothing need be done. 899 * Otherwise, nothing need be done.
902 * (We could look in the reassembly queue to see 900 * (We could look in the reassembly queue to see
903 * if the packet was previously fragmented, 901 * if the packet was previously fragmented,
904 * but it's not worth the time; just let them time out.) 902 * but it's not worth the time; just let them time out.)
905 */ 903 */
906 if (ip->ip_off & ~htons(IP_DF|IP_RF)) { 904 if (ip->ip_off & ~htons(IP_DF|IP_RF)) {
907 uint16_t off; 905 uint16_t off;
908 /* 906 /*
909 * Prevent TCP blind data attacks by not allowing non-initial 907 * Prevent TCP blind data attacks by not allowing non-initial
910 * fragments to start at less than 68 bytes (minimal fragment 908 * fragments to start at less than 68 bytes (minimal fragment
911 * size) and making sure the first fragment is at least 68 909 * size) and making sure the first fragment is at least 68
912 * bytes. 910 * bytes.
913 */ 911 */
914 off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; 912 off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
915 if ((off > 0 ? off + hlen : len) < IP_MINFRAGSIZE - 1) { 913 if ((off > 0 ? off + hlen : len) < IP_MINFRAGSIZE - 1) {
916 IP_STATINC(IP_STAT_BADFRAGS); 914 IP_STATINC(IP_STAT_BADFRAGS);
917 goto bad; 915 goto bad;
918 } 916 }
919 /* 917 /*
920 * Look for queue of fragments 918 * Look for queue of fragments
921 * of this datagram. 919 * of this datagram.
922 */ 920 */
923 IPQ_LOCK(); 921 IPQ_LOCK();
924 hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); 922 hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
925 LIST_FOREACH(fp, &ipq[hash], ipq_q) { 923 LIST_FOREACH(fp, &ipq[hash], ipq_q) {
926 if (ip->ip_id == fp->ipq_id && 924 if (ip->ip_id == fp->ipq_id &&
927 in_hosteq(ip->ip_src, fp->ipq_src) && 925 in_hosteq(ip->ip_src, fp->ipq_src) &&
928 in_hosteq(ip->ip_dst, fp->ipq_dst) && 926 in_hosteq(ip->ip_dst, fp->ipq_dst) &&
929 ip->ip_p == fp->ipq_p) { 927 ip->ip_p == fp->ipq_p) {
930 /* 928 /*
931 * Make sure the TOS matches previous 929 * Make sure the TOS matches previous
932 * fragments. 930 * fragments.
933 */ 931 */
934 if (ip->ip_tos != fp->ipq_tos) { 932 if (ip->ip_tos != fp->ipq_tos) {
935 IP_STATINC(IP_STAT_BADFRAGS); 933 IP_STATINC(IP_STAT_BADFRAGS);
936 IPQ_UNLOCK(); 934 IPQ_UNLOCK();
937 goto bad; 935 goto bad;
938 } 936 }
939 goto found; 937 goto found;
940 } 938 }
941 } 939 }
942 fp = 0; 940 fp = 0;
943found: 941found:
944 942
945 /* 943 /*
946 * Adjust ip_len to not reflect header, 944 * Adjust ip_len to not reflect header,
947 * set ipqe_mff if more fragments are expected, 945 * set ipqe_mff if more fragments are expected,
948 * convert offset of this to bytes. 946 * convert offset of this to bytes.
949 */ 947 */
950 ip->ip_len = htons(ntohs(ip->ip_len) - hlen); 948 ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
951 mff = (ip->ip_off & htons(IP_MF)) != 0; 949 mff = (ip->ip_off & htons(IP_MF)) != 0;
952 if (mff) { 950 if (mff) {
953 /* 951 /*
954 * Make sure that fragments have a data length 952 * Make sure that fragments have a data length
955 * that's a non-zero multiple of 8 bytes. 953 * that's a non-zero multiple of 8 bytes.
956 */ 954 */
957 if (ntohs(ip->ip_len) == 0 || 955 if (ntohs(ip->ip_len) == 0 ||
958 (ntohs(ip->ip_len) & 0x7) != 0) { 956 (ntohs(ip->ip_len) & 0x7) != 0) {
959 IP_STATINC(IP_STAT_BADFRAGS); 957 IP_STATINC(IP_STAT_BADFRAGS);
960 IPQ_UNLOCK(); 958 IPQ_UNLOCK();
961 goto bad; 959 goto bad;
962 } 960 }
963 } 961 }
964 ip->ip_off = htons((ntohs(ip->ip_off) & IP_OFFMASK) << 3); 962 ip->ip_off = htons((ntohs(ip->ip_off) & IP_OFFMASK) << 3);
965 963
966 /* 964 /*
967 * If datagram marked as having more fragments 965 * If datagram marked as having more fragments
968 * or if this is not the first fragment, 966 * or if this is not the first fragment,
969 * attempt reassembly; if it succeeds, proceed. 967 * attempt reassembly; if it succeeds, proceed.
970 */ 968 */
971 if (mff || ip->ip_off != htons(0)) { 969 if (mff || ip->ip_off != htons(0)) {
972 IP_STATINC(IP_STAT_FRAGMENTS); 970 IP_STATINC(IP_STAT_FRAGMENTS);
973 s = splvm(); 971 s = splvm();
974 ipqe = pool_get(&ipqent_pool, PR_NOWAIT); 972 ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
975 splx(s); 973 splx(s);
976 if (ipqe == NULL) { 974 if (ipqe == NULL) {
977 IP_STATINC(IP_STAT_RCVMEMDROP); 975 IP_STATINC(IP_STAT_RCVMEMDROP);
978 IPQ_UNLOCK(); 976 IPQ_UNLOCK();
979 goto bad; 977 goto bad;
980 } 978 }
981 ipqe->ipqe_mff = mff; 979 ipqe->ipqe_mff = mff;
982 ipqe->ipqe_m = m; 980 ipqe->ipqe_m = m;
983 ipqe->ipqe_ip = ip; 981 ipqe->ipqe_ip = ip;
984 m = ip_reass(ipqe, fp, &ipq[hash]); 982 m = ip_reass(ipqe, fp, &ipq[hash]);
985 if (m == 0) { 983 if (m == 0) {
986 IPQ_UNLOCK(); 984 IPQ_UNLOCK();
987 return; 985 return;
988 } 986 }
989 IP_STATINC(IP_STAT_REASSEMBLED); 987 IP_STATINC(IP_STAT_REASSEMBLED);
990 ip = mtod(m, struct ip *); 988 ip = mtod(m, struct ip *);
991 hlen = ip->ip_hl << 2; 989 hlen = ip->ip_hl << 2;
992 ip->ip_len = htons(ntohs(ip->ip_len) + hlen); 990 ip->ip_len = htons(ntohs(ip->ip_len) + hlen);
993 } else 991 } else
994 if (fp) 992 if (fp)
995 ip_freef(fp); 993 ip_freef(fp);
996 IPQ_UNLOCK(); 994 IPQ_UNLOCK();
997 } 995 }
998 996
999#if defined(IPSEC) 997#if defined(IPSEC)
1000 /* 998 /*
1001 * enforce IPsec policy checking if we are seeing last header. 999 * enforce IPsec policy checking if we are seeing last header.
1002 * note that we do not visit this with protocols with pcb layer 1000 * note that we do not visit this with protocols with pcb layer
1003 * code - like udp/tcp/raw ip. 1001 * code - like udp/tcp/raw ip.
1004 */ 1002 */
1005 if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0 && 1003 if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0 &&
1006 ipsec4_in_reject(m, NULL)) { 1004 ipsec4_in_reject(m, NULL)) {
1007 IPSEC_STATINC(IPSEC_STAT_IN_POLVIO); 1005 IPSEC_STATINC(IPSEC_STAT_IN_POLVIO);
1008 goto bad; 1006 goto bad;
1009 } 1007 }
1010#endif 1008#endif
1011#ifdef FAST_IPSEC 1009#ifdef FAST_IPSEC
1012 /* 1010 /*
1013 * enforce IPsec policy checking if we are seeing last header. 1011 * enforce IPsec policy checking if we are seeing last header.
1014 * note that we do not visit this with protocols with pcb layer 1012 * note that we do not visit this with protocols with pcb layer
1015 * code - like udp/tcp/raw ip. 1013 * code - like udp/tcp/raw ip.
1016 */ 1014 */
1017 if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0) { 1015 if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0) {
1018 /* 1016 /*
1019 * Check if the packet has already had IPsec processing 1017 * Check if the packet has already had IPsec processing
1020 * done. If so, then just pass it along. This tag gets 1018 * done. If so, then just pass it along. This tag gets
1021 * set during AH, ESP, etc. input handling, before the 1019 * set during AH, ESP, etc. input handling, before the
1022 * packet is returned to the ip input queue for delivery. 1020 * packet is returned to the ip input queue for delivery.
1023 */ 1021 */
1024 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); 1022 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
1025 s = splsoftnet(); 1023 s = splsoftnet();
1026 if (mtag != NULL) { 1024 if (mtag != NULL) {
1027 tdbi = (struct tdb_ident *)(mtag + 1); 1025 tdbi = (struct tdb_ident *)(mtag + 1);
1028 sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND); 1026 sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
1029 } else { 1027 } else {
1030 sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, 1028 sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
1031 IP_FORWARDING, &error); 1029 IP_FORWARDING, &error);
1032 } 1030 }
1033 if (sp != NULL) { 1031 if (sp != NULL) {
1034 /* 1032 /*
1035 * Check security policy against packet attributes. 1033 * Check security policy against packet attributes.
1036 */ 1034 */
1037 error = ipsec_in_reject(sp, m); 1035 error = ipsec_in_reject(sp, m);
1038 KEY_FREESP(&sp); 1036 KEY_FREESP(&sp);
1039 } else { 1037 } else {
1040 /* XXX error stat??? */ 1038 /* XXX error stat??? */
1041 error = EINVAL; 1039 error = EINVAL;
1042DPRINTF(("ip_input: no SP, packet discarded\n"));/*XXX*/ 1040DPRINTF(("ip_input: no SP, packet discarded\n"));/*XXX*/
1043 } 1041 }
1044 splx(s); 1042 splx(s);
1045 if (error) 1043 if (error)
1046 goto bad; 1044 goto bad;
1047 } 1045 }
1048#endif /* FAST_IPSEC */ 1046#endif /* FAST_IPSEC */
1049 1047
1050 /* 1048 /*
1051 * Switch out to protocol's input routine. 1049 * Switch out to protocol's input routine.
1052 */ 1050 */
1053#if IFA_STATS 1051#if IFA_STATS
1054 if (ia && ip) 1052 if (ia && ip)
1055 ia->ia_ifa.ifa_data.ifad_inbytes += ntohs(ip->ip_len); 1053 ia->ia_ifa.ifa_data.ifad_inbytes += ntohs(ip->ip_len);
1056#endif 1054#endif
1057 IP_STATINC(IP_STAT_DELIVERED); 1055 IP_STATINC(IP_STAT_DELIVERED);
1058 { 1056 {
1059 int off = hlen, nh = ip->ip_p; 1057 int off = hlen, nh = ip->ip_p;
1060 1058
1061 (*inetsw[ip_protox[nh]].pr_input)(m, off, nh); 1059 (*inetsw[ip_protox[nh]].pr_input)(m, off, nh);
1062 return; 1060 return;
1063 } 1061 }
1064bad: 1062bad:
1065 m_freem(m); 1063 m_freem(m);
1066 return; 1064 return;
1067 1065
1068badcsum: 1066badcsum:
1069 IP_STATINC(IP_STAT_BADSUM); 1067 IP_STATINC(IP_STAT_BADSUM);
1070 m_freem(m); 1068 m_freem(m);
1071} 1069}
1072 1070
1073/* 1071/*
1074 * Take incoming datagram fragment and try to 1072 * Take incoming datagram fragment and try to
1075 * reassemble it into whole datagram. If a chain for 1073 * reassemble it into whole datagram. If a chain for
1076 * reassembly of this datagram already exists, then it 1074 * reassembly of this datagram already exists, then it
1077 * is given as fp; otherwise have to make a chain. 1075 * is given as fp; otherwise have to make a chain.
1078 */ 1076 */
1079struct mbuf * 1077struct mbuf *
1080ip_reass(struct ipqent *ipqe, struct ipq *fp, struct ipqhead *ipqhead) 1078ip_reass(struct ipqent *ipqe, struct ipq *fp, struct ipqhead *ipqhead)
1081{ 1079{
1082 struct mbuf *m = ipqe->ipqe_m; 1080 struct mbuf *m = ipqe->ipqe_m;
1083 struct ipqent *nq, *p, *q; 1081 struct ipqent *nq, *p, *q;
1084 struct ip *ip; 1082 struct ip *ip;
1085 struct mbuf *t; 1083 struct mbuf *t;
1086 int hlen = ipqe->ipqe_ip->ip_hl << 2; 1084 int hlen = ipqe->ipqe_ip->ip_hl << 2;
1087 int i, next, s; 1085 int i, next, s;
1088 1086
1089 IPQ_LOCK_CHECK(); 1087 IPQ_LOCK_CHECK();
1090 1088
1091 /* 1089 /*
1092 * Presence of header sizes in mbufs 1090 * Presence of header sizes in mbufs
1093 * would confuse code below. 1091 * would confuse code below.
1094 */ 1092 */
1095 m->m_data += hlen; 1093 m->m_data += hlen;
1096 m->m_len -= hlen; 1094 m->m_len -= hlen;
1097 1095
1098#ifdef notyet 1096#ifdef notyet
1099 /* make sure fragment limit is up-to-date */ 1097 /* make sure fragment limit is up-to-date */
1100 CHECK_NMBCLUSTER_PARAMS(); 1098 CHECK_NMBCLUSTER_PARAMS();
1101 1099
1102 /* If we have too many fragments, drop the older half. */ 1100 /* If we have too many fragments, drop the older half. */
1103 if (ip_nfrags >= ip_maxfrags) 1101 if (ip_nfrags >= ip_maxfrags)
1104 ip_reass_drophalf(void); 1102 ip_reass_drophalf(void);
1105#endif 1103#endif
1106 1104
1107 /* 1105 /*
1108 * We are about to add a fragment; increment frag count. 1106 * We are about to add a fragment; increment frag count.
1109 */ 1107 */
1110 ip_nfrags++; 1108 ip_nfrags++;
1111 1109
1112 /* 1110 /*
1113 * If first fragment to arrive, create a reassembly queue. 1111 * If first fragment to arrive, create a reassembly queue.
1114 */ 1112 */
1115 if (fp == 0) { 1113 if (fp == 0) {
1116 /* 1114 /*
1117 * Enforce upper bound on number of fragmented packets 1115 * Enforce upper bound on number of fragmented packets
1118 * for which we attempt reassembly; 1116 * for which we attempt reassembly;
1119 * If maxfrag is 0, never accept fragments. 1117 * If maxfrag is 0, never accept fragments.
1120 * If maxfrag is -1, accept all fragments without limitation. 1118 * If maxfrag is -1, accept all fragments without limitation.
1121 */ 1119 */
1122 if (ip_maxfragpackets < 0) 1120 if (ip_maxfragpackets < 0)
1123 ; 1121 ;
1124 else if (ip_nfragpackets >= ip_maxfragpackets) 1122 else if (ip_nfragpackets >= ip_maxfragpackets)
1125 goto dropfrag; 1123 goto dropfrag;
1126 ip_nfragpackets++; 1124 ip_nfragpackets++;
1127 MALLOC(fp, struct ipq *, sizeof (struct ipq), 1125 MALLOC(fp, struct ipq *, sizeof (struct ipq),
1128 M_FTABLE, M_NOWAIT); 1126 M_FTABLE, M_NOWAIT);
1129 if (fp == NULL) 1127 if (fp == NULL)
1130 goto dropfrag; 1128 goto dropfrag;
1131 LIST_INSERT_HEAD(ipqhead, fp, ipq_q); 1129 LIST_INSERT_HEAD(ipqhead, fp, ipq_q);
1132 fp->ipq_nfrags = 1; 1130 fp->ipq_nfrags = 1;
1133 fp->ipq_ttl = IPFRAGTTL; 1131 fp->ipq_ttl = IPFRAGTTL;
1134 fp->ipq_p = ipqe->ipqe_ip->ip_p; 1132 fp->ipq_p = ipqe->ipqe_ip->ip_p;
1135 fp->ipq_id = ipqe->ipqe_ip->ip_id; 1133 fp->ipq_id = ipqe->ipqe_ip->ip_id;
1136 fp->ipq_tos = ipqe->ipqe_ip->ip_tos; 1134 fp->ipq_tos = ipqe->ipqe_ip->ip_tos;
1137 TAILQ_INIT(&fp->ipq_fragq); 1135 TAILQ_INIT(&fp->ipq_fragq);
1138 fp->ipq_src = ipqe->ipqe_ip->ip_src; 1136 fp->ipq_src = ipqe->ipqe_ip->ip_src;
1139 fp->ipq_dst = ipqe->ipqe_ip->ip_dst; 1137 fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
1140 p = NULL; 1138 p = NULL;
1141 goto insert; 1139 goto insert;
1142 } else { 1140 } else {
1143 fp->ipq_nfrags++; 1141 fp->ipq_nfrags++;
1144 } 1142 }
1145 1143
1146 /* 1144 /*
1147 * Find a segment which begins after this one does. 1145 * Find a segment which begins after this one does.
1148 */ 1146 */
1149 for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; 1147 for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
1150 p = q, q = TAILQ_NEXT(q, ipqe_q)) 1148 p = q, q = TAILQ_NEXT(q, ipqe_q))
1151 if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off)) 1149 if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off))
1152 break; 1150 break;
1153 1151
1154 /* 1152 /*
1155 * If there is a preceding segment, it may provide some of 1153 * If there is a preceding segment, it may provide some of
1156 * our data already. If so, drop the data from the incoming 1154 * our data already. If so, drop the data from the incoming
1157 * segment. If it provides all of our data, drop us. 1155 * segment. If it provides all of our data, drop us.
1158 */ 1156 */
1159 if (p != NULL) { 1157 if (p != NULL) {
1160 i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) - 1158 i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) -
1161 ntohs(ipqe->ipqe_ip->ip_off); 1159 ntohs(ipqe->ipqe_ip->ip_off);
1162 if (i > 0) { 1160 if (i > 0) {
1163 if (i >= ntohs(ipqe->ipqe_ip->ip_len)) 1161 if (i >= ntohs(ipqe->ipqe_ip->ip_len))
1164 goto dropfrag; 1162 goto dropfrag;
1165 m_adj(ipqe->ipqe_m, i); 1163 m_adj(ipqe->ipqe_m, i);
1166 ipqe->ipqe_ip->ip_off = 1164 ipqe->ipqe_ip->ip_off =
1167 htons(ntohs(ipqe->ipqe_ip->ip_off) + i); 1165 htons(ntohs(ipqe->ipqe_ip->ip_off) + i);
1168 ipqe->ipqe_ip->ip_len = 1166 ipqe->ipqe_ip->ip_len =
1169 htons(ntohs(ipqe->ipqe_ip->ip_len) - i); 1167 htons(ntohs(ipqe->ipqe_ip->ip_len) - i);
1170 } 1168 }
1171 } 1169 }
1172 1170
1173 /* 1171 /*
1174 * While we overlap succeeding segments trim them or, 1172 * While we overlap succeeding segments trim them or,
1175 * if they are completely covered, dequeue them. 1173 * if they are completely covered, dequeue them.
1176 */ 1174 */
1177 for (; q != NULL && 1175 for (; q != NULL &&
1178 ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) > 1176 ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) >
1179 ntohs(q->ipqe_ip->ip_off); q = nq) { 1177 ntohs(q->ipqe_ip->ip_off); q = nq) {
1180 i = (ntohs(ipqe->ipqe_ip->ip_off) + 1178 i = (ntohs(ipqe->ipqe_ip->ip_off) +
1181 ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off); 1179 ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off);
1182 if (i < ntohs(q->ipqe_ip->ip_len)) { 1180 if (i < ntohs(q->ipqe_ip->ip_len)) {
1183 q->ipqe_ip->ip_len = 1181 q->ipqe_ip->ip_len =
1184 htons(ntohs(q->ipqe_ip->ip_len) - i); 1182 htons(ntohs(q->ipqe_ip->ip_len) - i);
1185 q->ipqe_ip->ip_off = 1183 q->ipqe_ip->ip_off =
1186 htons(ntohs(q->ipqe_ip->ip_off) + i); 1184 htons(ntohs(q->ipqe_ip->ip_off) + i);
1187 m_adj(q->ipqe_m, i); 1185 m_adj(q->ipqe_m, i);
1188 break; 1186 break;
1189 } 1187 }
1190 nq = TAILQ_NEXT(q, ipqe_q); 1188 nq = TAILQ_NEXT(q, ipqe_q);
1191 m_freem(q->ipqe_m); 1189 m_freem(q->ipqe_m);
1192 TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q); 1190 TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
1193 s = splvm(); 1191 s = splvm();
1194 pool_put(&ipqent_pool, q); 1192 pool_put(&ipqent_pool, q);
1195 splx(s); 1193 splx(s);
1196 fp->ipq_nfrags--; 1194 fp->ipq_nfrags--;
1197 ip_nfrags--; 1195 ip_nfrags--;
1198 } 1196 }
1199 1197
1200insert: 1198insert:
1201 /* 1199 /*
1202 * Stick new segment in its place; 1200 * Stick new segment in its place;
1203 * check for complete reassembly. 1201 * check for complete reassembly.
1204 */ 1202 */
1205 if (p == NULL) { 1203 if (p == NULL) {
1206 TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q); 1204 TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
1207 } else { 1205 } else {
1208 TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q); 1206 TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q);
1209 } 1207 }
1210 next = 0; 1208 next = 0;
1211 for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; 1209 for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
1212 p = q, q = TAILQ_NEXT(q, ipqe_q)) { 1210 p = q, q = TAILQ_NEXT(q, ipqe_q)) {
1213 if (ntohs(q->ipqe_ip->ip_off) != next) 1211 if (ntohs(q->ipqe_ip->ip_off) != next)
1214 return (0); 1212 return (0);
1215 next += ntohs(q->ipqe_ip->ip_len); 1213 next += ntohs(q->ipqe_ip->ip_len);
1216 } 1214 }
1217 if (p->ipqe_mff) 1215 if (p->ipqe_mff)
1218 return (0); 1216 return (0);
1219 1217
1220 /* 1218 /*
1221 * Reassembly is complete. Check for a bogus message size and 1219 * Reassembly is complete. Check for a bogus message size and
1222 * concatenate fragments. 1220 * concatenate fragments.
1223 */ 1221 */
1224 q = TAILQ_FIRST(&fp->ipq_fragq); 1222 q = TAILQ_FIRST(&fp->ipq_fragq);
1225 ip = q->ipqe_ip; 1223 ip = q->ipqe_ip;
1226 if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) { 1224 if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
1227 IP_STATINC(IP_STAT_TOOLONG); 1225 IP_STATINC(IP_STAT_TOOLONG);
1228 ip_freef(fp); 1226 ip_freef(fp);
1229 return (0); 1227 return (0);
1230 } 1228 }
1231 m = q->ipqe_m; 1229 m = q->ipqe_m;
1232 t = m->m_next; 1230 t = m->m_next;
1233 m->m_next = 0; 1231 m->m_next = 0;
1234 m_cat(m, t); 1232 m_cat(m, t);
1235 nq = TAILQ_NEXT(q, ipqe_q); 1233 nq = TAILQ_NEXT(q, ipqe_q);
1236 s = splvm(); 1234 s = splvm();
1237 pool_put(&ipqent_pool, q); 1235 pool_put(&ipqent_pool, q);
1238 splx(s); 1236 splx(s);
1239 for (q = nq; q != NULL; q = nq) { 1237 for (q = nq; q != NULL; q = nq) {
1240 t = q->ipqe_m; 1238 t = q->ipqe_m;
1241 nq = TAILQ_NEXT(q, ipqe_q); 1239 nq = TAILQ_NEXT(q, ipqe_q);
1242 s = splvm(); 1240 s = splvm();
1243 pool_put(&ipqent_pool, q); 1241 pool_put(&ipqent_pool, q);
1244 splx(s); 1242 splx(s);
1245 m_cat(m, t); 1243 m_cat(m, t);
1246 } 1244 }
1247 ip_nfrags -= fp->ipq_nfrags; 1245 ip_nfrags -= fp->ipq_nfrags;
1248 1246
1249 /* 1247 /*
1250 * Create header for new ip packet by 1248 * Create header for new ip packet by
1251 * modifying header of first packet; 1249 * modifying header of first packet;
1252 * dequeue and discard fragment reassembly header. 1250 * dequeue and discard fragment reassembly header.
1253 * Make header visible. 1251 * Make header visible.
1254 */ 1252 */
1255 ip->ip_len = htons(next); 1253 ip->ip_len = htons(next);
1256 ip->ip_src = fp->ipq_src; 1254 ip->ip_src = fp->ipq_src;
1257 ip->ip_dst = fp->ipq_dst; 1255 ip->ip_dst = fp->ipq_dst;
1258 LIST_REMOVE(fp, ipq_q); 1256 LIST_REMOVE(fp, ipq_q);
1259 FREE(fp, M_FTABLE); 1257 FREE(fp, M_FTABLE);
1260 ip_nfragpackets--; 1258 ip_nfragpackets--;
1261 m->m_len += (ip->ip_hl << 2); 1259 m->m_len += (ip->ip_hl << 2);
1262 m->m_data -= (ip->ip_hl << 2); 1260 m->m_data -= (ip->ip_hl << 2);
1263 /* some debugging cruft by sklower, below, will go away soon */ 1261 /* some debugging cruft by sklower, below, will go away soon */
1264 if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */ 1262 if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
1265 int plen = 0; 1263 int plen = 0;
1266 for (t = m; t; t = t->m_next) 1264 for (t = m; t; t = t->m_next)
1267 plen += t->m_len; 1265 plen += t->m_len;
1268 m->m_pkthdr.len = plen; 1266 m->m_pkthdr.len = plen;
1269 m->m_pkthdr.csum_flags = 0; 1267 m->m_pkthdr.csum_flags = 0;
1270 } 1268 }
1271 return (m); 1269 return (m);
1272 1270
1273dropfrag: 1271dropfrag:
1274 if (fp != 0) 1272 if (fp != 0)
1275 fp->ipq_nfrags--; 1273 fp->ipq_nfrags--;
1276 ip_nfrags--; 1274 ip_nfrags--;
1277 IP_STATINC(IP_STAT_FRAGDROPPED); 1275 IP_STATINC(IP_STAT_FRAGDROPPED);
1278 m_freem(m); 1276 m_freem(m);
1279 s = splvm(); 1277 s = splvm();
1280 pool_put(&ipqent_pool, ipqe); 1278 pool_put(&ipqent_pool, ipqe);
1281 splx(s); 1279 splx(s);
1282 return (0); 1280 return (0);
1283} 1281}
1284 1282
1285/* 1283/*
1286 * Free a fragment reassembly header and all 1284 * Free a fragment reassembly header and all
1287 * associated datagrams. 1285 * associated datagrams.
1288 */ 1286 */
1289void 1287void
1290ip_freef(struct ipq *fp) 1288ip_freef(struct ipq *fp)
1291{ 1289{
1292 struct ipqent *q, *p; 1290 struct ipqent *q, *p;
1293 u_int nfrags = 0; 1291 u_int nfrags = 0;
1294 int s; 1292 int s;
1295 1293
1296 IPQ_LOCK_CHECK(); 1294 IPQ_LOCK_CHECK();
1297 1295
1298 for (q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; q = p) { 1296 for (q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; q = p) {
1299 p = TAILQ_NEXT(q, ipqe_q); 1297 p = TAILQ_NEXT(q, ipqe_q);
1300 m_freem(q->ipqe_m); 1298 m_freem(q->ipqe_m);
1301 nfrags++; 1299 nfrags++;
1302 TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q); 1300 TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
1303 s = splvm(); 1301 s = splvm();
1304 pool_put(&ipqent_pool, q); 1302 pool_put(&ipqent_pool, q);
1305 splx(s); 1303 splx(s);
1306 } 1304 }
1307 1305
1308 if (nfrags != fp->ipq_nfrags) 1306 if (nfrags != fp->ipq_nfrags)
1309 printf("ip_freef: nfrags %d != %d\n", fp->ipq_nfrags, nfrags); 1307 printf("ip_freef: nfrags %d != %d\n", fp->ipq_nfrags, nfrags);
1310 ip_nfrags -= nfrags; 1308 ip_nfrags -= nfrags;
1311 LIST_REMOVE(fp, ipq_q); 1309 LIST_REMOVE(fp, ipq_q);
1312 FREE(fp, M_FTABLE); 1310 FREE(fp, M_FTABLE);
1313 ip_nfragpackets--; 1311 ip_nfragpackets--;
1314} 1312}
1315 1313
1316/* 1314/*
1317 * IP reassembly TTL machinery for multiplicative drop. 1315 * IP reassembly TTL machinery for multiplicative drop.
1318 */ 1316 */
1319static u_int fragttl_histo[(IPFRAGTTL+1)]; 1317static u_int fragttl_histo[(IPFRAGTTL+1)];
1320 1318
1321 1319
1322/* 1320/*
1323 * Decrement TTL of all reasembly queue entries by `ticks'. 1321 * Decrement TTL of all reasembly queue entries by `ticks'.
1324 * Count number of distinct fragments (as opposed to partial, fragmented 1322 * Count number of distinct fragments (as opposed to partial, fragmented
1325 * datagrams) in the reassembly queue. While we traverse the entire 1323 * datagrams) in the reassembly queue. While we traverse the entire
1326 * reassembly queue, compute and return the median TTL over all fragments. 1324 * reassembly queue, compute and return the median TTL over all fragments.
1327 */ 1325 */
1328static u_int 1326static u_int
1329ip_reass_ttl_decr(u_int ticks) 1327ip_reass_ttl_decr(u_int ticks)
1330{ 1328{
1331 u_int nfrags, median, dropfraction, keepfraction; 1329 u_int nfrags, median, dropfraction, keepfraction;
1332 struct ipq *fp, *nfp; 1330 struct ipq *fp, *nfp;
1333 int i; 1331 int i;
1334 1332
1335 nfrags = 0; 1333 nfrags = 0;
1336 memset(fragttl_histo, 0, sizeof fragttl_histo); 1334 memset(fragttl_histo, 0, sizeof fragttl_histo);
1337 1335
1338 for (i = 0; i < IPREASS_NHASH; i++) { 1336 for (i = 0; i < IPREASS_NHASH; i++) {
1339 for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) { 1337 for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
1340 fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ? 1338 fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ?
1341 0 : fp->ipq_ttl - ticks); 1339 0 : fp->ipq_ttl - ticks);
1342 nfp = LIST_NEXT(fp, ipq_q); 1340 nfp = LIST_NEXT(fp, ipq_q);
1343 if (fp->ipq_ttl == 0) { 1341 if (fp->ipq_ttl == 0) {
1344 IP_STATINC(IP_STAT_FRAGTIMEOUT); 1342 IP_STATINC(IP_STAT_FRAGTIMEOUT);
1345 ip_freef(fp); 1343 ip_freef(fp);
1346 } else { 1344 } else {
1347 nfrags += fp->ipq_nfrags; 1345 nfrags += fp->ipq_nfrags;
1348 fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags; 1346 fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags;
1349 } 1347 }
1350 } 1348 }
1351 } 1349 }
1352 1350
1353 KASSERT(ip_nfrags == nfrags); 1351 KASSERT(ip_nfrags == nfrags);
1354 1352
1355 /* Find median (or other drop fraction) in histogram. */ 1353 /* Find median (or other drop fraction) in histogram. */
1356 dropfraction = (ip_nfrags / 2); 1354 dropfraction = (ip_nfrags / 2);
1357 keepfraction = ip_nfrags - dropfraction; 1355 keepfraction = ip_nfrags - dropfraction;
1358 for (i = IPFRAGTTL, median = 0; i >= 0; i--) { 1356 for (i = IPFRAGTTL, median = 0; i >= 0; i--) {
1359 median += fragttl_histo[i]; 1357 median += fragttl_histo[i];
1360 if (median >= keepfraction) 1358 if (median >= keepfraction)
1361 break; 1359 break;
1362 } 1360 }
1363 1361
1364 /* Return TTL of median (or other fraction). */ 1362 /* Return TTL of median (or other fraction). */
1365 return (u_int)i; 1363 return (u_int)i;
1366} 1364}
1367 1365
1368void 1366void
1369ip_reass_drophalf(void) 1367ip_reass_drophalf(void)
1370{ 1368{
1371 1369
1372 u_int median_ticks; 1370 u_int median_ticks;
1373 /* 1371 /*
1374 * Compute median TTL of all fragments, and count frags 1372 * Compute median TTL of all fragments, and count frags
1375 * with that TTL or lower (roughly half of all fragments). 1373 * with that TTL or lower (roughly half of all fragments).
1376 */ 1374 */
1377 median_ticks = ip_reass_ttl_decr(0); 1375 median_ticks = ip_reass_ttl_decr(0);
1378 1376
1379 /* Drop half. */ 1377 /* Drop half. */
1380 median_ticks = ip_reass_ttl_decr(median_ticks); 1378 median_ticks = ip_reass_ttl_decr(median_ticks);
1381 1379
1382} 1380}
1383 1381
1384/* 1382/*
1385 * IP timer processing; 1383 * IP timer processing;
1386 * if a timer expires on a reassembly 1384 * if a timer expires on a reassembly
1387 * queue, discard it. 1385 * queue, discard it.
1388 */ 1386 */
1389void 1387void
1390ip_slowtimo(void) 1388ip_slowtimo(void)
1391{ 1389{
1392 static u_int dropscanidx = 0; 1390 static u_int dropscanidx = 0;
1393 u_int i; 1391 u_int i;
1394 u_int median_ttl; 1392 u_int median_ttl;
1395 1393
1396 mutex_enter(softnet_lock); 1394 mutex_enter(softnet_lock);
1397 KERNEL_LOCK(1, NULL); 1395 KERNEL_LOCK(1, NULL);
1398 1396
1399 IPQ_LOCK(); 1397 IPQ_LOCK();
1400 1398
1401 /* Age TTL of all fragments by 1 tick .*/ 1399 /* Age TTL of all fragments by 1 tick .*/
1402 median_ttl = ip_reass_ttl_decr(1); 1400 median_ttl = ip_reass_ttl_decr(1);
1403 1401
1404 /* make sure fragment limit is up-to-date */ 1402 /* make sure fragment limit is up-to-date */
1405 CHECK_NMBCLUSTER_PARAMS(); 1403 CHECK_NMBCLUSTER_PARAMS();
1406 1404
1407 /* If we have too many fragments, drop the older half. */ 1405 /* If we have too many fragments, drop the older half. */
1408 if (ip_nfrags > ip_maxfrags) 1406 if (ip_nfrags > ip_maxfrags)
1409 ip_reass_ttl_decr(median_ttl); 1407 ip_reass_ttl_decr(median_ttl);
1410 1408
1411 /* 1409 /*
1412 * If we are over the maximum number of fragmented packets 1410 * If we are over the maximum number of fragmented packets
1413 * (due to the limit being lowered), drain off 1411 * (due to the limit being lowered), drain off
1414 * enough to get down to the new limit. Start draining 1412 * enough to get down to the new limit. Start draining
1415 * from the reassembly hashqueue most recently drained. 1413 * from the reassembly hashqueue most recently drained.
1416 */ 1414 */
1417 if (ip_maxfragpackets < 0) 1415 if (ip_maxfragpackets < 0)
1418 ; 1416 ;
1419 else { 1417 else {
1420 int wrapped = 0; 1418 int wrapped = 0;
1421 1419
1422 i = dropscanidx; 1420 i = dropscanidx;
1423 while (ip_nfragpackets > ip_maxfragpackets && wrapped == 0) { 1421 while (ip_nfragpackets > ip_maxfragpackets && wrapped == 0) {
1424 while (LIST_FIRST(&ipq[i]) != NULL) 1422 while (LIST_FIRST(&ipq[i]) != NULL)
1425 ip_freef(LIST_FIRST(&ipq[i])); 1423 ip_freef(LIST_FIRST(&ipq[i]));
1426 if (++i >= IPREASS_NHASH) { 1424 if (++i >= IPREASS_NHASH) {
1427 i = 0; 1425 i = 0;
1428 } 1426 }
1429 /* 1427 /*
1430 * Dont scan forever even if fragment counters are 1428 * Dont scan forever even if fragment counters are
1431 * wrong: stop after scanning entire reassembly queue. 1429 * wrong: stop after scanning entire reassembly queue.
1432 */ 1430 */
1433 if (i == dropscanidx) 1431 if (i == dropscanidx)
1434 wrapped = 1; 1432 wrapped = 1;
1435 } 1433 }
1436 dropscanidx = i; 1434 dropscanidx = i;
1437 } 1435 }
1438 IPQ_UNLOCK(); 1436 IPQ_UNLOCK();
1439 1437
1440 KERNEL_UNLOCK_ONE(NULL); 1438 KERNEL_UNLOCK_ONE(NULL);
1441 mutex_exit(softnet_lock); 1439 mutex_exit(softnet_lock);
1442} 1440}
1443 1441
1444/* 1442/*
1445 * Drain off all datagram fragments. Don't acquire softnet_lock as 1443 * Drain off all datagram fragments. Don't acquire softnet_lock as
1446 * can be called from hardware interrupt context. 1444 * can be called from hardware interrupt context.
1447 */ 1445 */
1448void 1446void
1449ip_drain(void) 1447ip_drain(void)
1450{ 1448{
1451 1449
1452 KERNEL_LOCK(1, NULL); 1450 KERNEL_LOCK(1, NULL);
1453 1451
1454 /* 1452 /*
1455 * We may be called from a device's interrupt context. If 1453 * We may be called from a device's interrupt context. If
1456 * the ipq is already busy, just bail out now. 1454 * the ipq is already busy, just bail out now.
1457 */ 1455 */
1458 if (ipq_lock_try() != 0) { 1456 if (ipq_lock_try() != 0) {
1459 /* 1457 /*
1460 * Drop half the total fragments now. If more mbufs are 1458 * Drop half the total fragments now. If more mbufs are
1461 * needed, we will be called again soon. 1459 * needed, we will be called again soon.
1462 */ 1460 */
1463 ip_reass_drophalf(); 1461 ip_reass_drophalf();
1464 IPQ_UNLOCK(); 1462 IPQ_UNLOCK();
1465 } 1463 }
1466 1464
1467 KERNEL_UNLOCK_ONE(NULL); 1465 KERNEL_UNLOCK_ONE(NULL);
1468} 1466}
1469 1467
1470/* 1468/*
1471 * Do option processing on a datagram, 1469 * Do option processing on a datagram,
1472 * possibly discarding it if bad options are encountered, 1470 * possibly discarding it if bad options are encountered,
1473 * or forwarding it if source-routed. 1471 * or forwarding it if source-routed.
1474 * Returns 1 if packet has been forwarded/freed, 1472 * Returns 1 if packet has been forwarded/freed,
1475 * 0 if the packet should be processed further. 1473 * 0 if the packet should be processed further.
1476 */ 1474 */
1477int 1475int