Wed Jun 6 09:46:46 2018 UTC ()
Separate receive socket errors from general socket errors.


(roy)
diff -r1.263 -r1.264 src/sys/kern/uipc_socket.c
diff -r1.129 -r1.130 src/sys/kern/uipc_socket2.c
diff -r1.155 -r1.156 src/sys/sys/socketvar.h

cvs diff -r1.263 -r1.264 src/sys/kern/uipc_socket.c (switch to unified diff)

--- src/sys/kern/uipc_socket.c 2018/04/26 19:50:09 1.263
+++ src/sys/kern/uipc_socket.c 2018/06/06 09:46:46 1.264
@@ -1,2539 +1,2545 @@ @@ -1,2539 +1,2545 @@
1/* $NetBSD: uipc_socket.c,v 1.263 2018/04/26 19:50:09 maxv Exp $ */ 1/* $NetBSD: uipc_socket.c,v 1.264 2018/06/06 09:46:46 roy Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2002, 2007, 2008, 2009 The NetBSD Foundation, Inc. 4 * Copyright (c) 2002, 2007, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran. 8 * by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright 15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the 16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution. 17 * documentation and/or other materials provided with the distribution.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32/* 32/*
33 * Copyright (c) 2004 The FreeBSD Foundation 33 * Copyright (c) 2004 The FreeBSD Foundation
34 * Copyright (c) 2004 Robert Watson 34 * Copyright (c) 2004 Robert Watson
35 * Copyright (c) 1982, 1986, 1988, 1990, 1993 35 * Copyright (c) 1982, 1986, 1988, 1990, 1993
36 * The Regents of the University of California. All rights reserved. 36 * The Regents of the University of California. All rights reserved.
37 * 37 *
38 * Redistribution and use in source and binary forms, with or without 38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions 39 * modification, are permitted provided that the following conditions
40 * are met: 40 * are met:
41 * 1. Redistributions of source code must retain the above copyright 41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer. 42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright 43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the 44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution. 45 * documentation and/or other materials provided with the distribution.
46 * 3. Neither the name of the University nor the names of its contributors 46 * 3. Neither the name of the University nor the names of its contributors
47 * may be used to endorse or promote products derived from this software 47 * may be used to endorse or promote products derived from this software
48 * without specific prior written permission. 48 * without specific prior written permission.
49 * 49 *
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * SUCH DAMAGE. 60 * SUCH DAMAGE.
61 * 61 *
62 * @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95 62 * @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95
63 */ 63 */
64 64
65/* 65/*
66 * Socket operation routines. 66 * Socket operation routines.
67 * 67 *
68 * These routines are called by the routines in sys_socket.c or from a 68 * These routines are called by the routines in sys_socket.c or from a
69 * system process, and implement the semantics of socket operations by 69 * system process, and implement the semantics of socket operations by
70 * switching out to the protocol specific routines. 70 * switching out to the protocol specific routines.
71 */ 71 */
72 72
73#include <sys/cdefs.h> 73#include <sys/cdefs.h>
74__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.263 2018/04/26 19:50:09 maxv Exp $"); 74__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.264 2018/06/06 09:46:46 roy Exp $");
75 75
76#ifdef _KERNEL_OPT 76#ifdef _KERNEL_OPT
77#include "opt_compat_netbsd.h" 77#include "opt_compat_netbsd.h"
78#include "opt_sock_counters.h" 78#include "opt_sock_counters.h"
79#include "opt_sosend_loan.h" 79#include "opt_sosend_loan.h"
80#include "opt_mbuftrace.h" 80#include "opt_mbuftrace.h"
81#include "opt_somaxkva.h" 81#include "opt_somaxkva.h"
82#include "opt_multiprocessor.h" /* XXX */ 82#include "opt_multiprocessor.h" /* XXX */
83#include "opt_sctp.h" 83#include "opt_sctp.h"
84#endif 84#endif
85 85
86#include <sys/param.h> 86#include <sys/param.h>
87#include <sys/systm.h> 87#include <sys/systm.h>
88#include <sys/proc.h> 88#include <sys/proc.h>
89#include <sys/file.h> 89#include <sys/file.h>
90#include <sys/filedesc.h> 90#include <sys/filedesc.h>
91#include <sys/kmem.h> 91#include <sys/kmem.h>
92#include <sys/mbuf.h> 92#include <sys/mbuf.h>
93#include <sys/domain.h> 93#include <sys/domain.h>
94#include <sys/kernel.h> 94#include <sys/kernel.h>
95#include <sys/protosw.h> 95#include <sys/protosw.h>
96#include <sys/socket.h> 96#include <sys/socket.h>
97#include <sys/socketvar.h> 97#include <sys/socketvar.h>
98#include <sys/signalvar.h> 98#include <sys/signalvar.h>
99#include <sys/resourcevar.h> 99#include <sys/resourcevar.h>
100#include <sys/uidinfo.h> 100#include <sys/uidinfo.h>
101#include <sys/event.h> 101#include <sys/event.h>
102#include <sys/poll.h> 102#include <sys/poll.h>
103#include <sys/kauth.h> 103#include <sys/kauth.h>
104#include <sys/mutex.h> 104#include <sys/mutex.h>
105#include <sys/condvar.h> 105#include <sys/condvar.h>
106#include <sys/kthread.h> 106#include <sys/kthread.h>
107 107
108#ifdef COMPAT_50 108#ifdef COMPAT_50
109#include <compat/sys/time.h> 109#include <compat/sys/time.h>
110#include <compat/sys/socket.h> 110#include <compat/sys/socket.h>
111#endif 111#endif
112 112
113#include <uvm/uvm_extern.h> 113#include <uvm/uvm_extern.h>
114#include <uvm/uvm_loan.h> 114#include <uvm/uvm_loan.h>
115#include <uvm/uvm_page.h> 115#include <uvm/uvm_page.h>
116 116
117MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 117MALLOC_DEFINE(M_SONAME, "soname", "socket name");
118 118
119extern const struct fileops socketops; 119extern const struct fileops socketops;
120 120
121extern int somaxconn; /* patchable (XXX sysctl) */ 121extern int somaxconn; /* patchable (XXX sysctl) */
122int somaxconn = SOMAXCONN; 122int somaxconn = SOMAXCONN;
123kmutex_t *softnet_lock; 123kmutex_t *softnet_lock;
124 124
125#ifdef SOSEND_COUNTERS 125#ifdef SOSEND_COUNTERS
126#include <sys/device.h> 126#include <sys/device.h>
127 127
128static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 128static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
129 NULL, "sosend", "loan big"); 129 NULL, "sosend", "loan big");
130static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 130static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
131 NULL, "sosend", "copy big"); 131 NULL, "sosend", "copy big");
132static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 132static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
133 NULL, "sosend", "copy small"); 133 NULL, "sosend", "copy small");
134static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 134static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
135 NULL, "sosend", "kva limit"); 135 NULL, "sosend", "kva limit");
136 136
137#define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++ 137#define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++
138 138
139EVCNT_ATTACH_STATIC(sosend_loan_big); 139EVCNT_ATTACH_STATIC(sosend_loan_big);
140EVCNT_ATTACH_STATIC(sosend_copy_big); 140EVCNT_ATTACH_STATIC(sosend_copy_big);
141EVCNT_ATTACH_STATIC(sosend_copy_small); 141EVCNT_ATTACH_STATIC(sosend_copy_small);
142EVCNT_ATTACH_STATIC(sosend_kvalimit); 142EVCNT_ATTACH_STATIC(sosend_kvalimit);
143#else 143#else
144 144
145#define SOSEND_COUNTER_INCR(ev) /* nothing */ 145#define SOSEND_COUNTER_INCR(ev) /* nothing */
146 146
147#endif /* SOSEND_COUNTERS */ 147#endif /* SOSEND_COUNTERS */
148 148
149#if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR) 149#if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR)
150int sock_loan_thresh = -1; 150int sock_loan_thresh = -1;
151#else 151#else
152int sock_loan_thresh = 4096; 152int sock_loan_thresh = 4096;
153#endif 153#endif
154 154
155static kmutex_t so_pendfree_lock; 155static kmutex_t so_pendfree_lock;
156static struct mbuf *so_pendfree = NULL; 156static struct mbuf *so_pendfree = NULL;
157 157
158#ifndef SOMAXKVA 158#ifndef SOMAXKVA
159#define SOMAXKVA (16 * 1024 * 1024) 159#define SOMAXKVA (16 * 1024 * 1024)
160#endif 160#endif
161int somaxkva = SOMAXKVA; 161int somaxkva = SOMAXKVA;
162static int socurkva; 162static int socurkva;
163static kcondvar_t socurkva_cv; 163static kcondvar_t socurkva_cv;
164 164
165static kauth_listener_t socket_listener; 165static kauth_listener_t socket_listener;
166 166
167#define SOCK_LOAN_CHUNK 65536 167#define SOCK_LOAN_CHUNK 65536
168 168
169static void sopendfree_thread(void *); 169static void sopendfree_thread(void *);
170static kcondvar_t pendfree_thread_cv; 170static kcondvar_t pendfree_thread_cv;
171static lwp_t *sopendfree_lwp; 171static lwp_t *sopendfree_lwp;
172 172
173static void sysctl_kern_socket_setup(void); 173static void sysctl_kern_socket_setup(void);
174static struct sysctllog *socket_sysctllog; 174static struct sysctllog *socket_sysctllog;
175 175
176static vsize_t 176static vsize_t
177sokvareserve(struct socket *so, vsize_t len) 177sokvareserve(struct socket *so, vsize_t len)
178{ 178{
179 int error; 179 int error;
180 180
181 mutex_enter(&so_pendfree_lock); 181 mutex_enter(&so_pendfree_lock);
182 while (socurkva + len > somaxkva) { 182 while (socurkva + len > somaxkva) {
183 SOSEND_COUNTER_INCR(&sosend_kvalimit); 183 SOSEND_COUNTER_INCR(&sosend_kvalimit);
184 error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock); 184 error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock);
185 if (error) { 185 if (error) {
186 len = 0; 186 len = 0;
187 break; 187 break;
188 } 188 }
189 } 189 }
190 socurkva += len; 190 socurkva += len;
191 mutex_exit(&so_pendfree_lock); 191 mutex_exit(&so_pendfree_lock);
192 return len; 192 return len;
193} 193}
194 194
195static void 195static void
196sokvaunreserve(vsize_t len) 196sokvaunreserve(vsize_t len)
197{ 197{
198 198
199 mutex_enter(&so_pendfree_lock); 199 mutex_enter(&so_pendfree_lock);
200 socurkva -= len; 200 socurkva -= len;
201 cv_broadcast(&socurkva_cv); 201 cv_broadcast(&socurkva_cv);
202 mutex_exit(&so_pendfree_lock); 202 mutex_exit(&so_pendfree_lock);
203} 203}
204 204
205/* 205/*
206 * sokvaalloc: allocate kva for loan. 206 * sokvaalloc: allocate kva for loan.
207 */ 207 */
208 208
209vaddr_t 209vaddr_t
210sokvaalloc(vaddr_t sva, vsize_t len, struct socket *so) 210sokvaalloc(vaddr_t sva, vsize_t len, struct socket *so)
211{ 211{
212 vaddr_t lva; 212 vaddr_t lva;
213 213
214 /* 214 /*
215 * reserve kva. 215 * reserve kva.
216 */ 216 */
217 217
218 if (sokvareserve(so, len) == 0) 218 if (sokvareserve(so, len) == 0)
219 return 0; 219 return 0;
220 220
221 /* 221 /*
222 * allocate kva. 222 * allocate kva.
223 */ 223 */
224 224
225 lva = uvm_km_alloc(kernel_map, len, atop(sva) & uvmexp.colormask, 225 lva = uvm_km_alloc(kernel_map, len, atop(sva) & uvmexp.colormask,
226 UVM_KMF_COLORMATCH | UVM_KMF_VAONLY | UVM_KMF_WAITVA); 226 UVM_KMF_COLORMATCH | UVM_KMF_VAONLY | UVM_KMF_WAITVA);
227 if (lva == 0) { 227 if (lva == 0) {
228 sokvaunreserve(len); 228 sokvaunreserve(len);
229 return (0); 229 return (0);
230 } 230 }
231 231
232 return lva; 232 return lva;
233} 233}
234 234
235/* 235/*
236 * sokvafree: free kva for loan. 236 * sokvafree: free kva for loan.
237 */ 237 */
238 238
239void 239void
240sokvafree(vaddr_t sva, vsize_t len) 240sokvafree(vaddr_t sva, vsize_t len)
241{ 241{
242 242
243 /* 243 /*
244 * free kva. 244 * free kva.
245 */ 245 */
246 246
247 uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY); 247 uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY);
248 248
249 /* 249 /*
250 * unreserve kva. 250 * unreserve kva.
251 */ 251 */
252 252
253 sokvaunreserve(len); 253 sokvaunreserve(len);
254} 254}
255 255
256static void 256static void
257sodoloanfree(struct vm_page **pgs, void *buf, size_t size) 257sodoloanfree(struct vm_page **pgs, void *buf, size_t size)
258{ 258{
259 vaddr_t sva, eva; 259 vaddr_t sva, eva;
260 vsize_t len; 260 vsize_t len;
261 int npgs; 261 int npgs;
262 262
263 KASSERT(pgs != NULL); 263 KASSERT(pgs != NULL);
264 264
265 eva = round_page((vaddr_t) buf + size); 265 eva = round_page((vaddr_t) buf + size);
266 sva = trunc_page((vaddr_t) buf); 266 sva = trunc_page((vaddr_t) buf);
267 len = eva - sva; 267 len = eva - sva;
268 npgs = len >> PAGE_SHIFT; 268 npgs = len >> PAGE_SHIFT;
269 269
270 pmap_kremove(sva, len); 270 pmap_kremove(sva, len);
271 pmap_update(pmap_kernel()); 271 pmap_update(pmap_kernel());
272 uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE); 272 uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
273 sokvafree(sva, len); 273 sokvafree(sva, len);
274} 274}
275 275
276/* 276/*
277 * sopendfree_thread: free mbufs on "pendfree" list. 277 * sopendfree_thread: free mbufs on "pendfree" list.
278 * unlock and relock so_pendfree_lock when freeing mbufs. 278 * unlock and relock so_pendfree_lock when freeing mbufs.
279 */ 279 */
280 280
281static void 281static void
282sopendfree_thread(void *v) 282sopendfree_thread(void *v)
283{ 283{
284 struct mbuf *m, *next; 284 struct mbuf *m, *next;
285 size_t rv; 285 size_t rv;
286 286
287 mutex_enter(&so_pendfree_lock); 287 mutex_enter(&so_pendfree_lock);
288 288
289 for (;;) { 289 for (;;) {
290 rv = 0; 290 rv = 0;
291 while (so_pendfree != NULL) { 291 while (so_pendfree != NULL) {
292 m = so_pendfree; 292 m = so_pendfree;
293 so_pendfree = NULL; 293 so_pendfree = NULL;
294 mutex_exit(&so_pendfree_lock); 294 mutex_exit(&so_pendfree_lock);
295 295
296 for (; m != NULL; m = next) { 296 for (; m != NULL; m = next) {
297 next = m->m_next; 297 next = m->m_next;
298 KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 298 KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) ==
299 0); 299 0);
300 KASSERT(m->m_ext.ext_refcnt == 0); 300 KASSERT(m->m_ext.ext_refcnt == 0);
301 301
302 rv += m->m_ext.ext_size; 302 rv += m->m_ext.ext_size;
303 sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf, 303 sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf,
304 m->m_ext.ext_size); 304 m->m_ext.ext_size);
305 pool_cache_put(mb_cache, m); 305 pool_cache_put(mb_cache, m);
306 } 306 }
307 307
308 mutex_enter(&so_pendfree_lock); 308 mutex_enter(&so_pendfree_lock);
309 } 309 }
310 if (rv) 310 if (rv)
311 cv_broadcast(&socurkva_cv); 311 cv_broadcast(&socurkva_cv);
312 cv_wait(&pendfree_thread_cv, &so_pendfree_lock); 312 cv_wait(&pendfree_thread_cv, &so_pendfree_lock);
313 } 313 }
314 panic("sopendfree_thread"); 314 panic("sopendfree_thread");
315 /* NOTREACHED */ 315 /* NOTREACHED */
316} 316}
317 317
318void 318void
319soloanfree(struct mbuf *m, void *buf, size_t size, void *arg) 319soloanfree(struct mbuf *m, void *buf, size_t size, void *arg)
320{ 320{
321 321
322 KASSERT(m != NULL); 322 KASSERT(m != NULL);
323 323
324 /* 324 /*
325 * postpone freeing mbuf. 325 * postpone freeing mbuf.
326 * 326 *
327 * we can't do it in interrupt context 327 * we can't do it in interrupt context
328 * because we need to put kva back to kernel_map. 328 * because we need to put kva back to kernel_map.
329 */ 329 */
330 330
331 mutex_enter(&so_pendfree_lock); 331 mutex_enter(&so_pendfree_lock);
332 m->m_next = so_pendfree; 332 m->m_next = so_pendfree;
333 so_pendfree = m; 333 so_pendfree = m;
334 cv_signal(&pendfree_thread_cv); 334 cv_signal(&pendfree_thread_cv);
335 mutex_exit(&so_pendfree_lock); 335 mutex_exit(&so_pendfree_lock);
336} 336}
337 337
338static long 338static long
339sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space) 339sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
340{ 340{
341 struct iovec *iov = uio->uio_iov; 341 struct iovec *iov = uio->uio_iov;
342 vaddr_t sva, eva; 342 vaddr_t sva, eva;
343 vsize_t len; 343 vsize_t len;
344 vaddr_t lva; 344 vaddr_t lva;
345 int npgs, error; 345 int npgs, error;
346 vaddr_t va; 346 vaddr_t va;
347 int i; 347 int i;
348 348
349 if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) 349 if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace))
350 return (0); 350 return (0);
351 351
352 if (iov->iov_len < (size_t) space) 352 if (iov->iov_len < (size_t) space)
353 space = iov->iov_len; 353 space = iov->iov_len;
354 if (space > SOCK_LOAN_CHUNK) 354 if (space > SOCK_LOAN_CHUNK)
355 space = SOCK_LOAN_CHUNK; 355 space = SOCK_LOAN_CHUNK;
356 356
357 eva = round_page((vaddr_t) iov->iov_base + space); 357 eva = round_page((vaddr_t) iov->iov_base + space);
358 sva = trunc_page((vaddr_t) iov->iov_base); 358 sva = trunc_page((vaddr_t) iov->iov_base);
359 len = eva - sva; 359 len = eva - sva;
360 npgs = len >> PAGE_SHIFT; 360 npgs = len >> PAGE_SHIFT;
361 361
362 KASSERT(npgs <= M_EXT_MAXPAGES); 362 KASSERT(npgs <= M_EXT_MAXPAGES);
363 363
364 lva = sokvaalloc(sva, len, so); 364 lva = sokvaalloc(sva, len, so);
365 if (lva == 0) 365 if (lva == 0)
366 return 0; 366 return 0;
367 367
368 error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len, 368 error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len,
369 m->m_ext.ext_pgs, UVM_LOAN_TOPAGE); 369 m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
370 if (error) { 370 if (error) {
371 sokvafree(lva, len); 371 sokvafree(lva, len);
372 return (0); 372 return (0);
373 } 373 }
374 374
375 for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE) 375 for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
376 pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]), 376 pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
377 VM_PROT_READ, 0); 377 VM_PROT_READ, 0);
378 pmap_update(pmap_kernel()); 378 pmap_update(pmap_kernel());
379 379
380 lva += (vaddr_t) iov->iov_base & PAGE_MASK; 380 lva += (vaddr_t) iov->iov_base & PAGE_MASK;
381 381
382 MEXTADD(m, (void *) lva, space, M_MBUF, soloanfree, so); 382 MEXTADD(m, (void *) lva, space, M_MBUF, soloanfree, so);
383 m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP; 383 m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;
384 384
385 uio->uio_resid -= space; 385 uio->uio_resid -= space;
386 /* uio_offset not updated, not set/used for write(2) */ 386 /* uio_offset not updated, not set/used for write(2) */
387 uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + space; 387 uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + space;
388 uio->uio_iov->iov_len -= space; 388 uio->uio_iov->iov_len -= space;
389 if (uio->uio_iov->iov_len == 0) { 389 if (uio->uio_iov->iov_len == 0) {
390 uio->uio_iov++; 390 uio->uio_iov++;
391 uio->uio_iovcnt--; 391 uio->uio_iovcnt--;
392 } 392 }
393 393
394 return (space); 394 return (space);
395} 395}
396 396
397struct mbuf * 397struct mbuf *
398getsombuf(struct socket *so, int type) 398getsombuf(struct socket *so, int type)
399{ 399{
400 struct mbuf *m; 400 struct mbuf *m;
401 401
402 m = m_get(M_WAIT, type); 402 m = m_get(M_WAIT, type);
403 MCLAIM(m, so->so_mowner); 403 MCLAIM(m, so->so_mowner);
404 return m; 404 return m;
405} 405}
406 406
407static int 407static int
408socket_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, 408socket_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
409 void *arg0, void *arg1, void *arg2, void *arg3) 409 void *arg0, void *arg1, void *arg2, void *arg3)
410{ 410{
411 int result; 411 int result;
412 enum kauth_network_req req; 412 enum kauth_network_req req;
413 413
414 result = KAUTH_RESULT_DEFER; 414 result = KAUTH_RESULT_DEFER;
415 req = (enum kauth_network_req)arg0; 415 req = (enum kauth_network_req)arg0;
416 416
417 if ((action != KAUTH_NETWORK_SOCKET) && 417 if ((action != KAUTH_NETWORK_SOCKET) &&
418 (action != KAUTH_NETWORK_BIND)) 418 (action != KAUTH_NETWORK_BIND))
419 return result; 419 return result;
420 420
421 switch (req) { 421 switch (req) {
422 case KAUTH_REQ_NETWORK_BIND_PORT: 422 case KAUTH_REQ_NETWORK_BIND_PORT:
423 result = KAUTH_RESULT_ALLOW; 423 result = KAUTH_RESULT_ALLOW;
424 break; 424 break;
425 425
426 case KAUTH_REQ_NETWORK_SOCKET_DROP: { 426 case KAUTH_REQ_NETWORK_SOCKET_DROP: {
427 /* Normal users can only drop their own connections. */ 427 /* Normal users can only drop their own connections. */
428 struct socket *so = (struct socket *)arg1; 428 struct socket *so = (struct socket *)arg1;
429 429
430 if (so->so_cred && proc_uidmatch(cred, so->so_cred) == 0) 430 if (so->so_cred && proc_uidmatch(cred, so->so_cred) == 0)
431 result = KAUTH_RESULT_ALLOW; 431 result = KAUTH_RESULT_ALLOW;
432 432
433 break; 433 break;
434 } 434 }
435 435
436 case KAUTH_REQ_NETWORK_SOCKET_OPEN: 436 case KAUTH_REQ_NETWORK_SOCKET_OPEN:
437 /* We allow "raw" routing/bluetooth sockets to anyone. */ 437 /* We allow "raw" routing/bluetooth sockets to anyone. */
438 switch ((u_long)arg1) { 438 switch ((u_long)arg1) {
439 case PF_ROUTE: 439 case PF_ROUTE:
440 case PF_OROUTE: 440 case PF_OROUTE:
441 case PF_BLUETOOTH: 441 case PF_BLUETOOTH:
442 case PF_CAN: 442 case PF_CAN:
443 result = KAUTH_RESULT_ALLOW; 443 result = KAUTH_RESULT_ALLOW;
444 break; 444 break;
445 default: 445 default:
446 /* Privileged, let secmodel handle this. */ 446 /* Privileged, let secmodel handle this. */
447 if ((u_long)arg2 == SOCK_RAW) 447 if ((u_long)arg2 == SOCK_RAW)
448 break; 448 break;
449 result = KAUTH_RESULT_ALLOW; 449 result = KAUTH_RESULT_ALLOW;
450 break; 450 break;
451 } 451 }
452 break; 452 break;
453 453
454 case KAUTH_REQ_NETWORK_SOCKET_CANSEE: 454 case KAUTH_REQ_NETWORK_SOCKET_CANSEE:
455 result = KAUTH_RESULT_ALLOW; 455 result = KAUTH_RESULT_ALLOW;
456 456
457 break; 457 break;
458 458
459 default: 459 default:
460 break; 460 break;
461 } 461 }
462 462
463 return result; 463 return result;
464} 464}
465 465
466void 466void
467soinit(void) 467soinit(void)
468{ 468{
469 469
470 sysctl_kern_socket_setup(); 470 sysctl_kern_socket_setup();
471 471
472 mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM); 472 mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM);
473 softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); 473 softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
474 cv_init(&socurkva_cv, "sokva"); 474 cv_init(&socurkva_cv, "sokva");
475 cv_init(&pendfree_thread_cv, "sopendfr"); 475 cv_init(&pendfree_thread_cv, "sopendfr");
476 soinit2(); 476 soinit2();
477 477
478 /* Set the initial adjusted socket buffer size. */ 478 /* Set the initial adjusted socket buffer size. */
479 if (sb_max_set(sb_max)) 479 if (sb_max_set(sb_max))
480 panic("bad initial sb_max value: %lu", sb_max); 480 panic("bad initial sb_max value: %lu", sb_max);
481 481
482 socket_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK, 482 socket_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
483 socket_listener_cb, NULL); 483 socket_listener_cb, NULL);
484} 484}
485 485
486void 486void
487soinit1(void) 487soinit1(void)
488{ 488{
489 int error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, 489 int error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
490 sopendfree_thread, NULL, &sopendfree_lwp, "sopendfree"); 490 sopendfree_thread, NULL, &sopendfree_lwp, "sopendfree");
491 if (error) 491 if (error)
492 panic("soinit1 %d", error); 492 panic("soinit1 %d", error);
493} 493}
494 494
495/* 495/*
496 * socreate: create a new socket of the specified type and the protocol. 496 * socreate: create a new socket of the specified type and the protocol.
497 * 497 *
498 * => Caller may specify another socket for lock sharing (must not be held). 498 * => Caller may specify another socket for lock sharing (must not be held).
499 * => Returns the new socket without lock held. 499 * => Returns the new socket without lock held.
500 */ 500 */
501int 501int
502socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l, 502socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l,
503 struct socket *lockso) 503 struct socket *lockso)
504{ 504{
505 const struct protosw *prp; 505 const struct protosw *prp;
506 struct socket *so; 506 struct socket *so;
507 uid_t uid; 507 uid_t uid;
508 int error; 508 int error;
509 kmutex_t *lock; 509 kmutex_t *lock;
510 510
511 error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET, 511 error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET,
512 KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type), 512 KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type),
513 KAUTH_ARG(proto)); 513 KAUTH_ARG(proto));
514 if (error != 0) 514 if (error != 0)
515 return error; 515 return error;
516 516
517 if (proto) 517 if (proto)
518 prp = pffindproto(dom, proto, type); 518 prp = pffindproto(dom, proto, type);
519 else 519 else
520 prp = pffindtype(dom, type); 520 prp = pffindtype(dom, type);
521 if (prp == NULL) { 521 if (prp == NULL) {
522 /* no support for domain */ 522 /* no support for domain */
523 if (pffinddomain(dom) == 0) 523 if (pffinddomain(dom) == 0)
524 return EAFNOSUPPORT; 524 return EAFNOSUPPORT;
525 /* no support for socket type */ 525 /* no support for socket type */
526 if (proto == 0 && type != 0) 526 if (proto == 0 && type != 0)
527 return EPROTOTYPE; 527 return EPROTOTYPE;
528 return EPROTONOSUPPORT; 528 return EPROTONOSUPPORT;
529 } 529 }
530 if (prp->pr_usrreqs == NULL) 530 if (prp->pr_usrreqs == NULL)
531 return EPROTONOSUPPORT; 531 return EPROTONOSUPPORT;
532 if (prp->pr_type != type) 532 if (prp->pr_type != type)
533 return EPROTOTYPE; 533 return EPROTOTYPE;
534 534
535 so = soget(true); 535 so = soget(true);
536 so->so_type = type; 536 so->so_type = type;
537 so->so_proto = prp; 537 so->so_proto = prp;
538 so->so_send = sosend; 538 so->so_send = sosend;
539 so->so_receive = soreceive; 539 so->so_receive = soreceive;
540#ifdef MBUFTRACE 540#ifdef MBUFTRACE
541 so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner; 541 so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner;
542 so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner; 542 so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner;
543 so->so_mowner = &prp->pr_domain->dom_mowner; 543 so->so_mowner = &prp->pr_domain->dom_mowner;
544#endif 544#endif
545 uid = kauth_cred_geteuid(l->l_cred); 545 uid = kauth_cred_geteuid(l->l_cred);
546 so->so_uidinfo = uid_find(uid); 546 so->so_uidinfo = uid_find(uid);
547 so->so_cpid = l->l_proc->p_pid; 547 so->so_cpid = l->l_proc->p_pid;
548 548
549 /* 549 /*
550 * Lock assigned and taken during PCB attach, unless we share 550 * Lock assigned and taken during PCB attach, unless we share
551 * the lock with another socket, e.g. socketpair(2) case. 551 * the lock with another socket, e.g. socketpair(2) case.
552 */ 552 */
553 if (lockso) { 553 if (lockso) {
554 lock = lockso->so_lock; 554 lock = lockso->so_lock;
555 so->so_lock = lock; 555 so->so_lock = lock;
556 mutex_obj_hold(lock); 556 mutex_obj_hold(lock);
557 mutex_enter(lock); 557 mutex_enter(lock);
558 } 558 }
559 559
560 /* Attach the PCB (returns with the socket lock held). */ 560 /* Attach the PCB (returns with the socket lock held). */
561 error = (*prp->pr_usrreqs->pr_attach)(so, proto); 561 error = (*prp->pr_usrreqs->pr_attach)(so, proto);
562 KASSERT(solocked(so)); 562 KASSERT(solocked(so));
563 563
564 if (error) { 564 if (error) {
565 KASSERT(so->so_pcb == NULL); 565 KASSERT(so->so_pcb == NULL);
566 so->so_state |= SS_NOFDREF; 566 so->so_state |= SS_NOFDREF;
567 sofree(so); 567 sofree(so);
568 return error; 568 return error;
569 } 569 }
570 so->so_cred = kauth_cred_dup(l->l_cred); 570 so->so_cred = kauth_cred_dup(l->l_cred);
571 sounlock(so); 571 sounlock(so);
572 572
573 *aso = so; 573 *aso = so;
574 return 0; 574 return 0;
575} 575}
576 576
577/* 577/*
578 * fsocreate: create a socket and a file descriptor associated with it. 578 * fsocreate: create a socket and a file descriptor associated with it.
579 * 579 *
580 * => On success, write file descriptor to fdout and return zero. 580 * => On success, write file descriptor to fdout and return zero.
581 * => On failure, return non-zero; *fdout will be undefined. 581 * => On failure, return non-zero; *fdout will be undefined.
582 */ 582 */
583int 583int
584fsocreate(int domain, struct socket **sop, int type, int proto, int *fdout) 584fsocreate(int domain, struct socket **sop, int type, int proto, int *fdout)
585{ 585{
586 lwp_t *l = curlwp; 586 lwp_t *l = curlwp;
587 int error, fd, flags; 587 int error, fd, flags;
588 struct socket *so; 588 struct socket *so;
589 struct file *fp; 589 struct file *fp;
590 590
591 if ((error = fd_allocfile(&fp, &fd)) != 0) { 591 if ((error = fd_allocfile(&fp, &fd)) != 0) {
592 return error; 592 return error;
593 } 593 }
594 flags = type & SOCK_FLAGS_MASK; 594 flags = type & SOCK_FLAGS_MASK;
595 fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0); 595 fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0);
596 fp->f_flag = FREAD|FWRITE|((flags & SOCK_NONBLOCK) ? FNONBLOCK : 0)| 596 fp->f_flag = FREAD|FWRITE|((flags & SOCK_NONBLOCK) ? FNONBLOCK : 0)|
597 ((flags & SOCK_NOSIGPIPE) ? FNOSIGPIPE : 0); 597 ((flags & SOCK_NOSIGPIPE) ? FNOSIGPIPE : 0);
598 fp->f_type = DTYPE_SOCKET; 598 fp->f_type = DTYPE_SOCKET;
599 fp->f_ops = &socketops; 599 fp->f_ops = &socketops;
600 600
601 type &= ~SOCK_FLAGS_MASK; 601 type &= ~SOCK_FLAGS_MASK;
602 error = socreate(domain, &so, type, proto, l, NULL); 602 error = socreate(domain, &so, type, proto, l, NULL);
603 if (error) { 603 if (error) {
604 fd_abort(curproc, fp, fd); 604 fd_abort(curproc, fp, fd);
605 return error; 605 return error;
606 } 606 }
607 if (flags & SOCK_NONBLOCK) { 607 if (flags & SOCK_NONBLOCK) {
608 so->so_state |= SS_NBIO; 608 so->so_state |= SS_NBIO;
609 } 609 }
610 fp->f_socket = so; 610 fp->f_socket = so;
611 fd_affix(curproc, fp, fd); 611 fd_affix(curproc, fp, fd);
612 612
613 if (sop != NULL) { 613 if (sop != NULL) {
614 *sop = so; 614 *sop = so;
615 } 615 }
616 *fdout = fd; 616 *fdout = fd;
617 return error; 617 return error;
618} 618}
619 619
620int 620int
621sofamily(const struct socket *so) 621sofamily(const struct socket *so)
622{ 622{
623 const struct protosw *pr; 623 const struct protosw *pr;
624 const struct domain *dom; 624 const struct domain *dom;
625 625
626 if ((pr = so->so_proto) == NULL) 626 if ((pr = so->so_proto) == NULL)
627 return AF_UNSPEC; 627 return AF_UNSPEC;
628 if ((dom = pr->pr_domain) == NULL) 628 if ((dom = pr->pr_domain) == NULL)
629 return AF_UNSPEC; 629 return AF_UNSPEC;
630 return dom->dom_family; 630 return dom->dom_family;
631} 631}
632 632
633int 633int
634sobind(struct socket *so, struct sockaddr *nam, struct lwp *l) 634sobind(struct socket *so, struct sockaddr *nam, struct lwp *l)
635{ 635{
636 int error; 636 int error;
637 637
638 solock(so); 638 solock(so);
639 if (nam->sa_family != so->so_proto->pr_domain->dom_family) { 639 if (nam->sa_family != so->so_proto->pr_domain->dom_family) {
640 sounlock(so); 640 sounlock(so);
641 return EAFNOSUPPORT; 641 return EAFNOSUPPORT;
642 } 642 }
643 error = (*so->so_proto->pr_usrreqs->pr_bind)(so, nam, l); 643 error = (*so->so_proto->pr_usrreqs->pr_bind)(so, nam, l);
644 sounlock(so); 644 sounlock(so);
645 return error; 645 return error;
646} 646}
647 647
648int 648int
649solisten(struct socket *so, int backlog, struct lwp *l) 649solisten(struct socket *so, int backlog, struct lwp *l)
650{ 650{
651 int error; 651 int error;
652 short oldopt, oldqlimit; 652 short oldopt, oldqlimit;
653 653
654 solock(so); 654 solock(so);
655 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 655 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
656 SS_ISDISCONNECTING)) != 0) { 656 SS_ISDISCONNECTING)) != 0) {
657 sounlock(so); 657 sounlock(so);
658 return EINVAL; 658 return EINVAL;
659 } 659 }
660 oldopt = so->so_options; 660 oldopt = so->so_options;
661 oldqlimit = so->so_qlimit; 661 oldqlimit = so->so_qlimit;
662 if (TAILQ_EMPTY(&so->so_q)) 662 if (TAILQ_EMPTY(&so->so_q))
663 so->so_options |= SO_ACCEPTCONN; 663 so->so_options |= SO_ACCEPTCONN;
664 if (backlog < 0) 664 if (backlog < 0)
665 backlog = 0; 665 backlog = 0;
666 so->so_qlimit = min(backlog, somaxconn); 666 so->so_qlimit = min(backlog, somaxconn);
667 667
668 error = (*so->so_proto->pr_usrreqs->pr_listen)(so, l); 668 error = (*so->so_proto->pr_usrreqs->pr_listen)(so, l);
669 if (error != 0) { 669 if (error != 0) {
670 so->so_options = oldopt; 670 so->so_options = oldopt;
671 so->so_qlimit = oldqlimit; 671 so->so_qlimit = oldqlimit;
672 sounlock(so); 672 sounlock(so);
673 return error; 673 return error;
674 } 674 }
675 sounlock(so); 675 sounlock(so);
676 return 0; 676 return 0;
677} 677}
678 678
679void 679void
680sofree(struct socket *so) 680sofree(struct socket *so)
681{ 681{
682 u_int refs; 682 u_int refs;
683 683
684 KASSERT(solocked(so)); 684 KASSERT(solocked(so));
685 685
686 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { 686 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
687 sounlock(so); 687 sounlock(so);
688 return; 688 return;
689 } 689 }
690 if (so->so_head) { 690 if (so->so_head) {
691 /* 691 /*
692 * We must not decommission a socket that's on the accept(2) 692 * We must not decommission a socket that's on the accept(2)
693 * queue. If we do, then accept(2) may hang after select(2) 693 * queue. If we do, then accept(2) may hang after select(2)
694 * indicated that the listening socket was ready. 694 * indicated that the listening socket was ready.
695 */ 695 */
696 if (!soqremque(so, 0)) { 696 if (!soqremque(so, 0)) {
697 sounlock(so); 697 sounlock(so);
698 return; 698 return;
699 } 699 }
700 } 700 }
701 if (so->so_rcv.sb_hiwat) 701 if (so->so_rcv.sb_hiwat)
702 (void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0, 702 (void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0,
703 RLIM_INFINITY); 703 RLIM_INFINITY);
704 if (so->so_snd.sb_hiwat) 704 if (so->so_snd.sb_hiwat)
705 (void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0, 705 (void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0,
706 RLIM_INFINITY); 706 RLIM_INFINITY);
707 sbrelease(&so->so_snd, so); 707 sbrelease(&so->so_snd, so);
708 KASSERT(!cv_has_waiters(&so->so_cv)); 708 KASSERT(!cv_has_waiters(&so->so_cv));
709 KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv)); 709 KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
710 KASSERT(!cv_has_waiters(&so->so_snd.sb_cv)); 710 KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
711 sorflush(so); 711 sorflush(so);
712 refs = so->so_aborting; /* XXX */ 712 refs = so->so_aborting; /* XXX */
713 /* Remove acccept filter if one is present. */ 713 /* Remove acccept filter if one is present. */
714 if (so->so_accf != NULL) 714 if (so->so_accf != NULL)
715 (void)accept_filt_clear(so); 715 (void)accept_filt_clear(so);
716 sounlock(so); 716 sounlock(so);
717 if (refs == 0) /* XXX */ 717 if (refs == 0) /* XXX */
718 soput(so); 718 soput(so);
719} 719}
720 720
721/* 721/*
722 * soclose: close a socket on last file table reference removal. 722 * soclose: close a socket on last file table reference removal.
723 * Initiate disconnect if connected. Free socket when disconnect complete. 723 * Initiate disconnect if connected. Free socket when disconnect complete.
724 */ 724 */
725int 725int
726soclose(struct socket *so) 726soclose(struct socket *so)
727{ 727{
728 struct socket *so2; 728 struct socket *so2;
729 int error = 0; 729 int error = 0;
730 730
731 solock(so); 731 solock(so);
732 if (so->so_options & SO_ACCEPTCONN) { 732 if (so->so_options & SO_ACCEPTCONN) {
733 for (;;) { 733 for (;;) {
734 if ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) { 734 if ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
735 KASSERT(solocked2(so, so2)); 735 KASSERT(solocked2(so, so2));
736 (void) soqremque(so2, 0); 736 (void) soqremque(so2, 0);
737 /* soabort drops the lock. */ 737 /* soabort drops the lock. */
738 (void) soabort(so2); 738 (void) soabort(so2);
739 solock(so); 739 solock(so);
740 continue; 740 continue;
741 } 741 }
742 if ((so2 = TAILQ_FIRST(&so->so_q)) != 0) { 742 if ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
743 KASSERT(solocked2(so, so2)); 743 KASSERT(solocked2(so, so2));
744 (void) soqremque(so2, 1); 744 (void) soqremque(so2, 1);
745 /* soabort drops the lock. */ 745 /* soabort drops the lock. */
746 (void) soabort(so2); 746 (void) soabort(so2);
747 solock(so); 747 solock(so);
748 continue; 748 continue;
749 } 749 }
750 break; 750 break;
751 } 751 }
752 } 752 }
753 if (so->so_pcb == NULL) 753 if (so->so_pcb == NULL)
754 goto discard; 754 goto discard;
755 if (so->so_state & SS_ISCONNECTED) { 755 if (so->so_state & SS_ISCONNECTED) {
756 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 756 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
757 error = sodisconnect(so); 757 error = sodisconnect(so);
758 if (error) 758 if (error)
759 goto drop; 759 goto drop;
760 } 760 }
761 if (so->so_options & SO_LINGER) { 761 if (so->so_options & SO_LINGER) {
762 if ((so->so_state & (SS_ISDISCONNECTING|SS_NBIO)) == 762 if ((so->so_state & (SS_ISDISCONNECTING|SS_NBIO)) ==
763 (SS_ISDISCONNECTING|SS_NBIO)) 763 (SS_ISDISCONNECTING|SS_NBIO))
764 goto drop; 764 goto drop;
765 while (so->so_state & SS_ISCONNECTED) { 765 while (so->so_state & SS_ISCONNECTED) {
766 error = sowait(so, true, so->so_linger * hz); 766 error = sowait(so, true, so->so_linger * hz);
767 if (error) 767 if (error)
768 break; 768 break;
769 } 769 }
770 } 770 }
771 } 771 }
772 drop: 772 drop:
773 if (so->so_pcb) { 773 if (so->so_pcb) {
774 KASSERT(solocked(so)); 774 KASSERT(solocked(so));
775 (*so->so_proto->pr_usrreqs->pr_detach)(so); 775 (*so->so_proto->pr_usrreqs->pr_detach)(so);
776 } 776 }
777 discard: 777 discard:
778 KASSERT((so->so_state & SS_NOFDREF) == 0); 778 KASSERT((so->so_state & SS_NOFDREF) == 0);
779 kauth_cred_free(so->so_cred); 779 kauth_cred_free(so->so_cred);
780 so->so_state |= SS_NOFDREF; 780 so->so_state |= SS_NOFDREF;
781 sofree(so); 781 sofree(so);
782 return error; 782 return error;
783} 783}
784 784
785/* 785/*
786 * Must be called with the socket locked.. Will return with it unlocked. 786 * Must be called with the socket locked.. Will return with it unlocked.
787 */ 787 */
788int 788int
789soabort(struct socket *so) 789soabort(struct socket *so)
790{ 790{
791 u_int refs; 791 u_int refs;
792 int error; 792 int error;
793 793
794 KASSERT(solocked(so)); 794 KASSERT(solocked(so));
795 KASSERT(so->so_head == NULL); 795 KASSERT(so->so_head == NULL);
796 796
797 so->so_aborting++; /* XXX */ 797 so->so_aborting++; /* XXX */
798 error = (*so->so_proto->pr_usrreqs->pr_abort)(so); 798 error = (*so->so_proto->pr_usrreqs->pr_abort)(so);
799 refs = --so->so_aborting; /* XXX */ 799 refs = --so->so_aborting; /* XXX */
800 if (error || (refs == 0)) { 800 if (error || (refs == 0)) {
801 sofree(so); 801 sofree(so);
802 } else { 802 } else {
803 sounlock(so); 803 sounlock(so);
804 } 804 }
805 return error; 805 return error;
806} 806}
807 807
808int 808int
809soaccept(struct socket *so, struct sockaddr *nam) 809soaccept(struct socket *so, struct sockaddr *nam)
810{ 810{
811 int error; 811 int error;
812 812
813 KASSERT(solocked(so)); 813 KASSERT(solocked(so));
814 KASSERT((so->so_state & SS_NOFDREF) != 0); 814 KASSERT((so->so_state & SS_NOFDREF) != 0);
815 815
816 so->so_state &= ~SS_NOFDREF; 816 so->so_state &= ~SS_NOFDREF;
817 if ((so->so_state & SS_ISDISCONNECTED) == 0 || 817 if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
818 (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) 818 (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
819 error = (*so->so_proto->pr_usrreqs->pr_accept)(so, nam); 819 error = (*so->so_proto->pr_usrreqs->pr_accept)(so, nam);
820 else 820 else
821 error = ECONNABORTED; 821 error = ECONNABORTED;
822 822
823 return error; 823 return error;
824} 824}
825 825
826int 826int
827soconnect(struct socket *so, struct sockaddr *nam, struct lwp *l) 827soconnect(struct socket *so, struct sockaddr *nam, struct lwp *l)
828{ 828{
829 int error; 829 int error;
830 830
831 KASSERT(solocked(so)); 831 KASSERT(solocked(so));
832 832
833 if (so->so_options & SO_ACCEPTCONN) 833 if (so->so_options & SO_ACCEPTCONN)
834 return EOPNOTSUPP; 834 return EOPNOTSUPP;
835 /* 835 /*
836 * If protocol is connection-based, can only connect once. 836 * If protocol is connection-based, can only connect once.
837 * Otherwise, if connected, try to disconnect first. 837 * Otherwise, if connected, try to disconnect first.
838 * This allows user to disconnect by connecting to, e.g., 838 * This allows user to disconnect by connecting to, e.g.,
839 * a null address. 839 * a null address.
840 */ 840 */
841 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 841 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
842 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 842 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
843 (error = sodisconnect(so)))) { 843 (error = sodisconnect(so)))) {
844 error = EISCONN; 844 error = EISCONN;
845 } else { 845 } else {
846 if (nam->sa_family != so->so_proto->pr_domain->dom_family) { 846 if (nam->sa_family != so->so_proto->pr_domain->dom_family) {
847 return EAFNOSUPPORT; 847 return EAFNOSUPPORT;
848 } 848 }
849 error = (*so->so_proto->pr_usrreqs->pr_connect)(so, nam, l); 849 error = (*so->so_proto->pr_usrreqs->pr_connect)(so, nam, l);
850 } 850 }
851 851
852 return error; 852 return error;
853} 853}
854 854
855int 855int
856soconnect2(struct socket *so1, struct socket *so2) 856soconnect2(struct socket *so1, struct socket *so2)
857{ 857{
858 KASSERT(solocked2(so1, so2)); 858 KASSERT(solocked2(so1, so2));
859 859
860 return (*so1->so_proto->pr_usrreqs->pr_connect2)(so1, so2); 860 return (*so1->so_proto->pr_usrreqs->pr_connect2)(so1, so2);
861} 861}
862 862
863int 863int
864sodisconnect(struct socket *so) 864sodisconnect(struct socket *so)
865{ 865{
866 int error; 866 int error;
867 867
868 KASSERT(solocked(so)); 868 KASSERT(solocked(so));
869 869
870 if ((so->so_state & SS_ISCONNECTED) == 0) { 870 if ((so->so_state & SS_ISCONNECTED) == 0) {
871 error = ENOTCONN; 871 error = ENOTCONN;
872 } else if (so->so_state & SS_ISDISCONNECTING) { 872 } else if (so->so_state & SS_ISDISCONNECTING) {
873 error = EALREADY; 873 error = EALREADY;
874 } else { 874 } else {
875 error = (*so->so_proto->pr_usrreqs->pr_disconnect)(so); 875 error = (*so->so_proto->pr_usrreqs->pr_disconnect)(so);
876 } 876 }
877 return (error); 877 return (error);
878} 878}
879 879
880#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 880#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
881/* 881/*
882 * Send on a socket. 882 * Send on a socket.
883 * If send must go all at once and message is larger than 883 * If send must go all at once and message is larger than
884 * send buffering, then hard error. 884 * send buffering, then hard error.
885 * Lock against other senders. 885 * Lock against other senders.
886 * If must go all at once and not enough room now, then 886 * If must go all at once and not enough room now, then
887 * inform user that this would block and do nothing. 887 * inform user that this would block and do nothing.
888 * Otherwise, if nonblocking, send as much as possible. 888 * Otherwise, if nonblocking, send as much as possible.
889 * The data to be sent is described by "uio" if nonzero, 889 * The data to be sent is described by "uio" if nonzero,
890 * otherwise by the mbuf chain "top" (which must be null 890 * otherwise by the mbuf chain "top" (which must be null
891 * if uio is not). Data provided in mbuf chain must be small 891 * if uio is not). Data provided in mbuf chain must be small
892 * enough to send all at once. 892 * enough to send all at once.
893 * 893 *
894 * Returns nonzero on error, timeout or signal; callers 894 * Returns nonzero on error, timeout or signal; callers
895 * must check for short counts if EINTR/ERESTART are returned. 895 * must check for short counts if EINTR/ERESTART are returned.
896 * Data and control buffers are freed on return. 896 * Data and control buffers are freed on return.
897 */ 897 */
898int 898int
899sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 899sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
900 struct mbuf *top, struct mbuf *control, int flags, struct lwp *l) 900 struct mbuf *top, struct mbuf *control, int flags, struct lwp *l)
901{ 901{
902 struct mbuf **mp, *m; 902 struct mbuf **mp, *m;
903 long space, len, resid, clen, mlen; 903 long space, len, resid, clen, mlen;
904 int error, s, dontroute, atomic; 904 int error, s, dontroute, atomic;
905 short wakeup_state = 0; 905 short wakeup_state = 0;
906 906
907 clen = 0; 907 clen = 0;
908 908
909 /* 909 /*
910 * solock() provides atomicity of access. splsoftnet() prevents 910 * solock() provides atomicity of access. splsoftnet() prevents
911 * protocol processing soft interrupts from interrupting us and 911 * protocol processing soft interrupts from interrupting us and
912 * blocking (expensive). 912 * blocking (expensive).
913 */ 913 */
914 s = splsoftnet(); 914 s = splsoftnet();
915 solock(so); 915 solock(so);
916 atomic = sosendallatonce(so) || top; 916 atomic = sosendallatonce(so) || top;
917 if (uio) 917 if (uio)
918 resid = uio->uio_resid; 918 resid = uio->uio_resid;
919 else 919 else
920 resid = top->m_pkthdr.len; 920 resid = top->m_pkthdr.len;
921 /* 921 /*
922 * In theory resid should be unsigned. 922 * In theory resid should be unsigned.
923 * However, space must be signed, as it might be less than 0 923 * However, space must be signed, as it might be less than 0
924 * if we over-committed, and we must use a signed comparison 924 * if we over-committed, and we must use a signed comparison
925 * of space and resid. On the other hand, a negative resid 925 * of space and resid. On the other hand, a negative resid
926 * causes us to loop sending 0-length segments to the protocol. 926 * causes us to loop sending 0-length segments to the protocol.
927 */ 927 */
928 if (resid < 0) { 928 if (resid < 0) {
929 error = EINVAL; 929 error = EINVAL;
930 goto out; 930 goto out;
931 } 931 }
932 dontroute = 932 dontroute =
933 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 933 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
934 (so->so_proto->pr_flags & PR_ATOMIC); 934 (so->so_proto->pr_flags & PR_ATOMIC);
935 l->l_ru.ru_msgsnd++; 935 l->l_ru.ru_msgsnd++;
936 if (control) 936 if (control)
937 clen = control->m_len; 937 clen = control->m_len;
938 restart: 938 restart:
939 if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0) 939 if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
940 goto out; 940 goto out;
941 do { 941 do {
942 if (so->so_state & SS_CANTSENDMORE) { 942 if (so->so_state & SS_CANTSENDMORE) {
943 error = EPIPE; 943 error = EPIPE;
944 goto release; 944 goto release;
945 } 945 }
946 if (so->so_error) { 946 if (so->so_error) {
947 error = so->so_error; 947 error = so->so_error;
948 so->so_error = 0; 948 so->so_error = 0;
949 goto release; 949 goto release;
950 } 950 }
951 if ((so->so_state & SS_ISCONNECTED) == 0) { 951 if ((so->so_state & SS_ISCONNECTED) == 0) {
952 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 952 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
953 if (resid || clen == 0) { 953 if (resid || clen == 0) {
954 error = ENOTCONN; 954 error = ENOTCONN;
955 goto release; 955 goto release;
956 } 956 }
957 } else if (addr == NULL) { 957 } else if (addr == NULL) {
958 error = EDESTADDRREQ; 958 error = EDESTADDRREQ;
959 goto release; 959 goto release;
960 } 960 }
961 } 961 }
962 space = sbspace(&so->so_snd); 962 space = sbspace(&so->so_snd);
963 if (flags & MSG_OOB) 963 if (flags & MSG_OOB)
964 space += 1024; 964 space += 1024;
965 if ((atomic && resid > so->so_snd.sb_hiwat) || 965 if ((atomic && resid > so->so_snd.sb_hiwat) ||
966 clen > so->so_snd.sb_hiwat) { 966 clen > so->so_snd.sb_hiwat) {
967 error = EMSGSIZE; 967 error = EMSGSIZE;
968 goto release; 968 goto release;
969 } 969 }
970 if (space < resid + clen && 970 if (space < resid + clen &&
971 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 971 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
972 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) { 972 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
973 error = EWOULDBLOCK; 973 error = EWOULDBLOCK;
974 goto release; 974 goto release;
975 } 975 }
976 sbunlock(&so->so_snd); 976 sbunlock(&so->so_snd);
977 if (wakeup_state & SS_RESTARTSYS) { 977 if (wakeup_state & SS_RESTARTSYS) {
978 error = ERESTART; 978 error = ERESTART;
979 goto out; 979 goto out;
980 } 980 }
981 error = sbwait(&so->so_snd); 981 error = sbwait(&so->so_snd);
982 if (error) 982 if (error)
983 goto out; 983 goto out;
984 wakeup_state = so->so_state; 984 wakeup_state = so->so_state;
985 goto restart; 985 goto restart;
986 } 986 }
987 wakeup_state = 0; 987 wakeup_state = 0;
988 mp = &top; 988 mp = &top;
989 space -= clen; 989 space -= clen;
990 do { 990 do {
991 if (uio == NULL) { 991 if (uio == NULL) {
992 /* 992 /*
993 * Data is prepackaged in "top". 993 * Data is prepackaged in "top".
994 */ 994 */
995 resid = 0; 995 resid = 0;
996 if (flags & MSG_EOR) 996 if (flags & MSG_EOR)
997 top->m_flags |= M_EOR; 997 top->m_flags |= M_EOR;
998 } else do { 998 } else do {
999 sounlock(so); 999 sounlock(so);
1000 splx(s); 1000 splx(s);
1001 if (top == NULL) { 1001 if (top == NULL) {
1002 m = m_gethdr(M_WAIT, MT_DATA); 1002 m = m_gethdr(M_WAIT, MT_DATA);
1003 mlen = MHLEN; 1003 mlen = MHLEN;
1004 m->m_pkthdr.len = 0; 1004 m->m_pkthdr.len = 0;
1005 m_reset_rcvif(m); 1005 m_reset_rcvif(m);
1006 } else { 1006 } else {
1007 m = m_get(M_WAIT, MT_DATA); 1007 m = m_get(M_WAIT, MT_DATA);
1008 mlen = MLEN; 1008 mlen = MLEN;
1009 } 1009 }
1010 MCLAIM(m, so->so_snd.sb_mowner); 1010 MCLAIM(m, so->so_snd.sb_mowner);
1011 if (sock_loan_thresh >= 0 && 1011 if (sock_loan_thresh >= 0 &&
1012 uio->uio_iov->iov_len >= sock_loan_thresh && 1012 uio->uio_iov->iov_len >= sock_loan_thresh &&
1013 space >= sock_loan_thresh && 1013 space >= sock_loan_thresh &&
1014 (len = sosend_loan(so, uio, m, 1014 (len = sosend_loan(so, uio, m,
1015 space)) != 0) { 1015 space)) != 0) {
1016 SOSEND_COUNTER_INCR(&sosend_loan_big); 1016 SOSEND_COUNTER_INCR(&sosend_loan_big);
1017 space -= len; 1017 space -= len;
1018 goto have_data; 1018 goto have_data;
1019 } 1019 }
1020 if (resid >= MINCLSIZE && space >= MCLBYTES) { 1020 if (resid >= MINCLSIZE && space >= MCLBYTES) {
1021 SOSEND_COUNTER_INCR(&sosend_copy_big); 1021 SOSEND_COUNTER_INCR(&sosend_copy_big);
1022 m_clget(m, M_DONTWAIT); 1022 m_clget(m, M_DONTWAIT);
1023 if ((m->m_flags & M_EXT) == 0) 1023 if ((m->m_flags & M_EXT) == 0)
1024 goto nopages; 1024 goto nopages;
1025 mlen = MCLBYTES; 1025 mlen = MCLBYTES;
1026 if (atomic && top == 0) { 1026 if (atomic && top == 0) {
1027 len = lmin(MCLBYTES - max_hdr, 1027 len = lmin(MCLBYTES - max_hdr,
1028 resid); 1028 resid);
1029 m->m_data += max_hdr; 1029 m->m_data += max_hdr;
1030 } else 1030 } else
1031 len = lmin(MCLBYTES, resid); 1031 len = lmin(MCLBYTES, resid);
1032 space -= len; 1032 space -= len;
1033 } else { 1033 } else {
1034 nopages: 1034 nopages:
1035 SOSEND_COUNTER_INCR(&sosend_copy_small); 1035 SOSEND_COUNTER_INCR(&sosend_copy_small);
1036 len = lmin(lmin(mlen, resid), space); 1036 len = lmin(lmin(mlen, resid), space);
1037 space -= len; 1037 space -= len;
1038 /* 1038 /*
1039 * For datagram protocols, leave room 1039 * For datagram protocols, leave room
1040 * for protocol headers in first mbuf. 1040 * for protocol headers in first mbuf.
1041 */ 1041 */
1042 if (atomic && top == 0 && len < mlen) 1042 if (atomic && top == 0 && len < mlen)
1043 MH_ALIGN(m, len); 1043 MH_ALIGN(m, len);
1044 } 1044 }
1045 error = uiomove(mtod(m, void *), (int)len, uio); 1045 error = uiomove(mtod(m, void *), (int)len, uio);
1046 have_data: 1046 have_data:
1047 resid = uio->uio_resid; 1047 resid = uio->uio_resid;
1048 m->m_len = len; 1048 m->m_len = len;
1049 *mp = m; 1049 *mp = m;
1050 top->m_pkthdr.len += len; 1050 top->m_pkthdr.len += len;
1051 s = splsoftnet(); 1051 s = splsoftnet();
1052 solock(so); 1052 solock(so);
1053 if (error != 0) 1053 if (error != 0)
1054 goto release; 1054 goto release;
1055 mp = &m->m_next; 1055 mp = &m->m_next;
1056 if (resid <= 0) { 1056 if (resid <= 0) {
1057 if (flags & MSG_EOR) 1057 if (flags & MSG_EOR)
1058 top->m_flags |= M_EOR; 1058 top->m_flags |= M_EOR;
1059 break; 1059 break;
1060 } 1060 }
1061 } while (space > 0 && atomic); 1061 } while (space > 0 && atomic);
1062 1062
1063 if (so->so_state & SS_CANTSENDMORE) { 1063 if (so->so_state & SS_CANTSENDMORE) {
1064 error = EPIPE; 1064 error = EPIPE;
1065 goto release; 1065 goto release;
1066 } 1066 }
1067 if (dontroute) 1067 if (dontroute)
1068 so->so_options |= SO_DONTROUTE; 1068 so->so_options |= SO_DONTROUTE;
1069 if (resid > 0) 1069 if (resid > 0)
1070 so->so_state |= SS_MORETOCOME; 1070 so->so_state |= SS_MORETOCOME;
1071 if (flags & MSG_OOB) { 1071 if (flags & MSG_OOB) {
1072 error = (*so->so_proto->pr_usrreqs->pr_sendoob)( 1072 error = (*so->so_proto->pr_usrreqs->pr_sendoob)(
1073 so, top, control); 1073 so, top, control);
1074 } else { 1074 } else {
1075 error = (*so->so_proto->pr_usrreqs->pr_send)(so, 1075 error = (*so->so_proto->pr_usrreqs->pr_send)(so,
1076 top, addr, control, l); 1076 top, addr, control, l);
1077 } 1077 }
1078 if (dontroute) 1078 if (dontroute)
1079 so->so_options &= ~SO_DONTROUTE; 1079 so->so_options &= ~SO_DONTROUTE;
1080 if (resid > 0) 1080 if (resid > 0)
1081 so->so_state &= ~SS_MORETOCOME; 1081 so->so_state &= ~SS_MORETOCOME;
1082 clen = 0; 1082 clen = 0;
1083 control = NULL; 1083 control = NULL;
1084 top = NULL; 1084 top = NULL;
1085 mp = &top; 1085 mp = &top;
1086 if (error != 0) 1086 if (error != 0)
1087 goto release; 1087 goto release;
1088 } while (resid && space > 0); 1088 } while (resid && space > 0);
1089 } while (resid); 1089 } while (resid);
1090 1090
1091 release: 1091 release:
1092 sbunlock(&so->so_snd); 1092 sbunlock(&so->so_snd);
1093 out: 1093 out:
1094 sounlock(so); 1094 sounlock(so);
1095 splx(s); 1095 splx(s);
1096 if (top) 1096 if (top)
1097 m_freem(top); 1097 m_freem(top);
1098 if (control) 1098 if (control)
1099 m_freem(control); 1099 m_freem(control);
1100 return (error); 1100 return (error);
1101} 1101}
1102 1102
1103/* 1103/*
1104 * Following replacement or removal of the first mbuf on the first 1104 * Following replacement or removal of the first mbuf on the first
1105 * mbuf chain of a socket buffer, push necessary state changes back 1105 * mbuf chain of a socket buffer, push necessary state changes back
1106 * into the socket buffer so that other consumers see the values 1106 * into the socket buffer so that other consumers see the values
1107 * consistently. 'nextrecord' is the callers locally stored value of 1107 * consistently. 'nextrecord' is the callers locally stored value of
1108 * the original value of sb->sb_mb->m_nextpkt which must be restored 1108 * the original value of sb->sb_mb->m_nextpkt which must be restored
1109 * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL. 1109 * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL.
1110 */ 1110 */
1111static void 1111static void
1112sbsync(struct sockbuf *sb, struct mbuf *nextrecord) 1112sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
1113{ 1113{
1114 1114
1115 KASSERT(solocked(sb->sb_so)); 1115 KASSERT(solocked(sb->sb_so));
1116 1116
1117 /* 1117 /*
1118 * First, update for the new value of nextrecord. If necessary, 1118 * First, update for the new value of nextrecord. If necessary,
1119 * make it the first record. 1119 * make it the first record.
1120 */ 1120 */
1121 if (sb->sb_mb != NULL) 1121 if (sb->sb_mb != NULL)
1122 sb->sb_mb->m_nextpkt = nextrecord; 1122 sb->sb_mb->m_nextpkt = nextrecord;
1123 else 1123 else
1124 sb->sb_mb = nextrecord; 1124 sb->sb_mb = nextrecord;
1125 1125
1126 /* 1126 /*
1127 * Now update any dependent socket buffer fields to reflect 1127 * Now update any dependent socket buffer fields to reflect
1128 * the new state. This is an inline of SB_EMPTY_FIXUP, with 1128 * the new state. This is an inline of SB_EMPTY_FIXUP, with
1129 * the addition of a second clause that takes care of the 1129 * the addition of a second clause that takes care of the
1130 * case where sb_mb has been updated, but remains the last 1130 * case where sb_mb has been updated, but remains the last
1131 * record. 1131 * record.
1132 */ 1132 */
1133 if (sb->sb_mb == NULL) { 1133 if (sb->sb_mb == NULL) {
1134 sb->sb_mbtail = NULL; 1134 sb->sb_mbtail = NULL;
1135 sb->sb_lastrecord = NULL; 1135 sb->sb_lastrecord = NULL;
1136 } else if (sb->sb_mb->m_nextpkt == NULL) 1136 } else if (sb->sb_mb->m_nextpkt == NULL)
1137 sb->sb_lastrecord = sb->sb_mb; 1137 sb->sb_lastrecord = sb->sb_mb;
1138} 1138}
1139 1139
1140/* 1140/*
1141 * Implement receive operations on a socket. 1141 * Implement receive operations on a socket.
1142 * We depend on the way that records are added to the sockbuf 1142 * We depend on the way that records are added to the sockbuf
1143 * by sbappend*. In particular, each record (mbufs linked through m_next) 1143 * by sbappend*. In particular, each record (mbufs linked through m_next)
1144 * must begin with an address if the protocol so specifies, 1144 * must begin with an address if the protocol so specifies,
1145 * followed by an optional mbuf or mbufs containing ancillary data, 1145 * followed by an optional mbuf or mbufs containing ancillary data,
1146 * and then zero or more mbufs of data. 1146 * and then zero or more mbufs of data.
1147 * In order to avoid blocking network interrupts for the entire time here, 1147 * In order to avoid blocking network interrupts for the entire time here,
1148 * we splx() while doing the actual copy to user space. 1148 * we splx() while doing the actual copy to user space.
1149 * Although the sockbuf is locked, new data may still be appended, 1149 * Although the sockbuf is locked, new data may still be appended,
1150 * and thus we must maintain consistency of the sockbuf during that time. 1150 * and thus we must maintain consistency of the sockbuf during that time.
1151 * 1151 *
1152 * The caller may receive the data as a single mbuf chain by supplying 1152 * The caller may receive the data as a single mbuf chain by supplying
1153 * an mbuf **mp0 for use in returning the chain. The uio is then used 1153 * an mbuf **mp0 for use in returning the chain. The uio is then used
1154 * only for the count in uio_resid. 1154 * only for the count in uio_resid.
1155 */ 1155 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
	struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct lwp *l = curlwp;
	struct mbuf *m, **mp, *mt;
	size_t len, offset, moff, orig_resid;
	int atomic, flags, error, s, type;
	const struct protosw *pr;
	struct mbuf *nextrecord;
	int mbuf_removed = 0;	/* set once part of the record is consumed */
	const struct domain *dom;
	short wakeup_state = 0;

	pr = so->so_proto;
	atomic = pr->pr_flags & PR_ATOMIC;
	dom = pr->pr_domain;
	mp = mp0;
	type = 0;
	orig_resid = uio->uio_resid;

	if (paddr != NULL)
		*paddr = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if (flags & MSG_OOB) {
		/*
		 * Out-of-band data is fetched directly from the protocol;
		 * it never sits in the receive buffer.
		 */
		m = m_get(M_WAIT, MT_DATA);
		solock(so);
		error = (*pr->pr_usrreqs->pr_recvoob)(so, m, flags & MSG_PEEK);
		sounlock(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, void *),
			    MIN(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid > 0 && error == 0 && m);
 bad:
		if (m != NULL)
			m_freem(m);
		return error;
	}
	if (mp != NULL)
		*mp = NULL;

	/*
	 * solock() provides atomicity of access.  splsoftnet() prevents
	 * protocol processing soft interrupts from interrupting us and
	 * blocking (expensive).
	 */
	s = splsoftnet();
	solock(so);
 restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) {
		sounlock(so);
		splx(s);
		return error;
	}

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL ||
	    ((flags & MSG_DONTWAIT) == 0 &&
	     so->so_rcv.sb_cc < uio->uio_resid &&
	     (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	      ((flags & MSG_WAITALL) &&
	       uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	     m->m_nextpkt == NULL && !atomic)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error || so->so_rerror) {
			if (m != NULL)
				goto dontblock;
			/*
			 * A general socket error (so_error) takes
			 * precedence over a receive-path error
			 * (so_rerror); whichever is reported is
			 * consumed (cleared).
			 */
			if (so->so_error) {
				error = so->so_error;
				so->so_error = 0;
			} else {
				error = so->so_rerror;
				so->so_rerror = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			/* Drain any queued data before reporting EOF. */
			if (m != NULL)
				goto dontblock;
			else
				goto release;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv);
		if (wakeup_state & SS_RESTARTSYS)
			error = ERESTART;
		else
			error = sbwait(&so->so_rcv);
		if (error != 0) {
			sounlock(so);
			splx(s);
			return error;
		}
		wakeup_state = so->so_state;
		goto restart;
	}
 dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket lock, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	if (l != NULL)
		l->l_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		/* Address-bearing protocols lead each record with MT_SONAME. */
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_DONTWAIT);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			mbuf_removed = 1;
			if (paddr != NULL) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				m = so->so_rcv.sb_mb = m_free(m);
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	if (pr->pr_flags & PR_ADDR_OPT) {
		/*
		 * For SCTP we may be getting a
		 * whole message OR a partial delivery.
		 */
		if (m->m_type == MT_SONAME) {
			orig_resid = 0;
			if (flags & MSG_PEEK) {
				if (paddr)
					*paddr = m_copym(m, 0, m->m_len, M_DONTWAIT);
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				if (paddr) {
					*paddr = m;
					so->so_rcv.sb_mb = m->m_next;
					m->m_next = 0;
					m = so->so_rcv.sb_mb;
				} else {
					m = so->so_rcv.sb_mb = m_free(m);
				}
			}
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copym(m, 0, m->m_len, M_DONTWAIT);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sbsync(&so->so_rcv, nextrecord);
		for (; cm != NULL; cm = cmn) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			type = mtod(cm, struct cmsghdr *)->cmsg_type;
			if (controlp != NULL) {
				if (dom->dom_externalize != NULL &&
				    type == SCM_RIGHTS) {
					/* Drop locks: externalize may block. */
					sounlock(so);
					splx(s);
					error = (*dom->dom_externalize)(cm, l,
					    (flags & MSG_CMSG_CLOEXEC) ?
					    O_CLOEXEC : 0);
					s = splsoftnet();
					solock(so);
				}
				*controlp = cm;
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (dom->dom_dispose != NULL &&
				    type == SCM_RIGHTS) {
					sounlock(so);
					(*dom->dom_dispose)(cm);
					solock(so);
				}
				m_freem(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (__predict_true(m != NULL)) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m != NULL && uio->uio_resid > 0 && error == 0) {
		/* Never mix OOB and normal data in one pass. */
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		wakeup_state = 0;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			sounlock(so);
			splx(s);
			error = uiomove(mtod(m, char *) + moff, len, uio);
			s = splsoftnet();
			solock(so);
			if (error != 0) {
				/*
				 * If any part of the record has been removed
				 * (such as the MT_SONAME mbuf, which will
				 * happen when PR_ADDR, and thus also
				 * PR_ATOMIC, is set), then drop the entire
				 * record to maintain the atomicity of the
				 * receive operation.
				 *
				 * This avoids a later panic("receive 1a")
				 * when compiled with DIAGNOSTIC.
				 */
				if (m && mbuf_removed && atomic)
					(void) sbdroprecord(&so->so_rcv);

				goto release;
			}
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* Consumed the whole mbuf. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
#ifdef SCTP
			if (m->m_flags & M_NOTIFICATION)
				flags |= MSG_NOTIFICATION;
#endif /* SCTP */
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					m = so->so_rcv.sb_mb = m_free(m);
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else if (flags & MSG_PEEK)
			moff += len;
		else {
			/* Partial mbuf consumed: trim it in place. */
			if (mp != NULL) {
				mt = m_copym(m, 0, len, M_NOWAIT);
				if (__predict_false(mt == NULL)) {
					sounlock(so);
					mt = m_copym(m, 0, len, M_WAIT);
					solock(so);
				}
				*mp = mt;
			}
			m->m_data += len;
			m->m_len -= len;
			so->so_rcv.sb_cc -= len;
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			/* Pending (receive) errors also end the wait. */
			if (so->so_error || so->so_rerror ||
			    so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * If we are peeking and the socket receive buffer is
			 * full, stop since we can't get more data to peek at.
			 */
			if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
				break;
			/*
			 * If we've drained the socket buffer, tell the
			 * protocol in case it needs to do something to
			 * get it filled again.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
				(*pr->pr_usrreqs->pr_rcvd)(so, flags, l);
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			if (wakeup_state & SS_RESTARTSYS)
				error = ERESTART;
			else
				error = sbwait(&so->so_rcv);
			if (error != 0) {
				/*
				 * Deliberate: per the comment above, a
				 * signal/timeout here yields a short
				 * count, not an error.
				 */
				sbunlock(&so->so_rcv);
				sounlock(so);
				splx(s);
				return 0;
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
			wakeup_state = so->so_state;
		}
	}

	if (m && atomic) {
		/* Atomic protocol and data left over: truncate the record. */
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pr_rcvd)(so, flags, l);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		/* Nothing was transferred and nothing ended the record: retry. */
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp != NULL)
		*flagsp |= flags;
 release:
	sbunlock(&so->so_rcv);
	sounlock(so);
	splx(s);
	return error;
}
1638 1644
1639int 1645int
1640soshutdown(struct socket *so, int how) 1646soshutdown(struct socket *so, int how)
1641{ 1647{
1642 const struct protosw *pr; 1648 const struct protosw *pr;
1643 int error; 1649 int error;
1644 1650
1645 KASSERT(solocked(so)); 1651 KASSERT(solocked(so));
1646 1652
1647 pr = so->so_proto; 1653 pr = so->so_proto;
1648 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 1654 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1649 return (EINVAL); 1655 return (EINVAL);
1650 1656
1651 if (how == SHUT_RD || how == SHUT_RDWR) { 1657 if (how == SHUT_RD || how == SHUT_RDWR) {
1652 sorflush(so); 1658 sorflush(so);
1653 error = 0; 1659 error = 0;
1654 } 1660 }
1655 if (how == SHUT_WR || how == SHUT_RDWR) 1661 if (how == SHUT_WR || how == SHUT_RDWR)
1656 error = (*pr->pr_usrreqs->pr_shutdown)(so); 1662 error = (*pr->pr_usrreqs->pr_shutdown)(so);
1657 1663
1658 return error; 1664 return error;
1659} 1665}
1660 1666
/*
 * Force every thread sleeping on this socket to wake and restart.
 */
void
sorestart(struct socket *so)
{
	/*
	 * An application has called close() on an fd on which another
	 * of its threads has called a socket system call.
	 * Mark this and wake everyone up, and code that would block again
	 * instead returns ERESTART.
	 * On system call re-entry the fd is validated and EBADF returned.
	 * Any other fd will block again on the 2nd syscall.
	 */
	solock(so);
	so->so_state |= SS_RESTARTSYS;
	/* Wake sleepers on the socket itself and on both sockbufs. */
	cv_broadcast(&so->so_cv);
	cv_broadcast(&so->so_snd.sb_cv);
	cv_broadcast(&so->so_rcv.sb_cv);
	sounlock(so);
}
1679 1685
/*
 * Flush and disable the receive side of a socket: mark it unable to
 * receive, discard all queued data, and dispose of any in-flight
 * access rights (file descriptors).  Called with the socket locked.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb, asb;
	const struct protosw *pr;

	KASSERT(solocked(so));

	sb = &so->so_rcv;
	pr = so->so_proto;
	socantrcvmore(so);
	/* SB_NOINTR: the lock acquisition below must not be interruptible. */
	sb->sb_flags |= SB_NOINTR;
	(void )sblock(sb, M_WAITOK);
	sbunlock(sb);
	/* Snapshot the sockbuf so it can be released after zeroing. */
	asb = *sb;
	/*
	 * Clear most of the sockbuf structure, but leave some of the
	 * fields valid.
	 */
	memset(&sb->sb_startzero, 0,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) {
		/*
		 * Socket lock is dropped around dom_dispose —
		 * presumably because it may block; confirm before
		 * relying on this elsewhere.
		 */
		sounlock(so);
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
		solock(so);
	}
	sbrelease(&asb, so);
}
1708 1714
1709/* 1715/*
1710 * internal set SOL_SOCKET options 1716 * internal set SOL_SOCKET options
1711 */ 1717 */
1712static int 1718static int
1713sosetopt1(struct socket *so, const struct sockopt *sopt) 1719sosetopt1(struct socket *so, const struct sockopt *sopt)
1714{ 1720{
1715 int error = EINVAL, opt; 1721 int error = EINVAL, opt;
1716 int optval = 0; /* XXX: gcc */ 1722 int optval = 0; /* XXX: gcc */
1717 struct linger l; 1723 struct linger l;
1718 struct timeval tv; 1724 struct timeval tv;
1719 1725
1720 switch ((opt = sopt->sopt_name)) { 1726 switch ((opt = sopt->sopt_name)) {
1721 1727
1722 case SO_ACCEPTFILTER: 1728 case SO_ACCEPTFILTER:
1723 error = accept_filt_setopt(so, sopt); 1729 error = accept_filt_setopt(so, sopt);
1724 KASSERT(solocked(so)); 1730 KASSERT(solocked(so));
1725 break; 1731 break;
1726 1732
1727 case SO_LINGER: 1733 case SO_LINGER:
1728 error = sockopt_get(sopt, &l, sizeof(l)); 1734 error = sockopt_get(sopt, &l, sizeof(l));
1729 solock(so); 1735 solock(so);
1730 if (error) 1736 if (error)
1731 break; 1737 break;
1732 if (l.l_linger < 0 || l.l_linger > USHRT_MAX || 1738 if (l.l_linger < 0 || l.l_linger > USHRT_MAX ||
1733 l.l_linger > (INT_MAX / hz)) { 1739 l.l_linger > (INT_MAX / hz)) {
1734 error = EDOM; 1740 error = EDOM;
1735 break; 1741 break;
1736 } 1742 }
1737 so->so_linger = l.l_linger; 1743 so->so_linger = l.l_linger;
1738 if (l.l_onoff) 1744 if (l.l_onoff)
1739 so->so_options |= SO_LINGER; 1745 so->so_options |= SO_LINGER;
1740 else 1746 else
1741 so->so_options &= ~SO_LINGER; 1747 so->so_options &= ~SO_LINGER;
1742 break; 1748 break;
1743 1749
1744 case SO_DEBUG: 1750 case SO_DEBUG:
1745 case SO_KEEPALIVE: 1751 case SO_KEEPALIVE:
1746 case SO_DONTROUTE: 1752 case SO_DONTROUTE:
1747 case SO_USELOOPBACK: 1753 case SO_USELOOPBACK:
1748 case SO_BROADCAST: 1754 case SO_BROADCAST:
1749 case SO_REUSEADDR: 1755 case SO_REUSEADDR:
1750 case SO_REUSEPORT: 1756 case SO_REUSEPORT:
1751 case SO_OOBINLINE: 1757 case SO_OOBINLINE:
1752 case SO_TIMESTAMP: 1758 case SO_TIMESTAMP:
1753 case SO_NOSIGPIPE: 1759 case SO_NOSIGPIPE:
1754#ifdef SO_OTIMESTAMP 1760#ifdef SO_OTIMESTAMP
1755 case SO_OTIMESTAMP: 1761 case SO_OTIMESTAMP:
1756#endif 1762#endif
1757 error = sockopt_getint(sopt, &optval); 1763 error = sockopt_getint(sopt, &optval);
1758 solock(so); 1764 solock(so);
1759 if (error) 1765 if (error)
1760 break; 1766 break;
1761 if (optval) 1767 if (optval)
1762 so->so_options |= opt; 1768 so->so_options |= opt;
1763 else 1769 else
1764 so->so_options &= ~opt; 1770 so->so_options &= ~opt;
1765 break; 1771 break;
1766 1772
1767 case SO_SNDBUF: 1773 case SO_SNDBUF:
1768 case SO_RCVBUF: 1774 case SO_RCVBUF:
1769 case SO_SNDLOWAT: 1775 case SO_SNDLOWAT:
1770 case SO_RCVLOWAT: 1776 case SO_RCVLOWAT:
1771 error = sockopt_getint(sopt, &optval); 1777 error = sockopt_getint(sopt, &optval);
1772 solock(so); 1778 solock(so);
1773 if (error) 1779 if (error)
1774 break; 1780 break;
1775 1781
1776 /* 1782 /*
1777 * Values < 1 make no sense for any of these 1783 * Values < 1 make no sense for any of these
1778 * options, so disallow them. 1784 * options, so disallow them.
1779 */ 1785 */
1780 if (optval < 1) { 1786 if (optval < 1) {
1781 error = EINVAL; 1787 error = EINVAL;
1782 break; 1788 break;
1783 } 1789 }
1784 1790
1785 switch (opt) { 1791 switch (opt) {
1786 case SO_SNDBUF: 1792 case SO_SNDBUF:
1787 if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) { 1793 if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) {
1788 error = ENOBUFS; 1794 error = ENOBUFS;
1789 break; 1795 break;
1790 } 1796 }
1791 so->so_snd.sb_flags &= ~SB_AUTOSIZE; 1797 so->so_snd.sb_flags &= ~SB_AUTOSIZE;
1792 break; 1798 break;
1793 1799
1794 case SO_RCVBUF: 1800 case SO_RCVBUF:
1795 if (sbreserve(&so->so_rcv, (u_long)optval, so) == 0) { 1801 if (sbreserve(&so->so_rcv, (u_long)optval, so) == 0) {
1796 error = ENOBUFS; 1802 error = ENOBUFS;
1797 break; 1803 break;
1798 } 1804 }
1799 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 1805 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
1800 break; 1806 break;
1801 1807
1802 /* 1808 /*
1803 * Make sure the low-water is never greater than 1809 * Make sure the low-water is never greater than
1804 * the high-water. 1810 * the high-water.
1805 */ 1811 */
1806 case SO_SNDLOWAT: 1812 case SO_SNDLOWAT:
1807 if (optval > so->so_snd.sb_hiwat) 1813 if (optval > so->so_snd.sb_hiwat)
1808 optval = so->so_snd.sb_hiwat; 1814 optval = so->so_snd.sb_hiwat;
1809 1815
1810 so->so_snd.sb_lowat = optval; 1816 so->so_snd.sb_lowat = optval;
1811 break; 1817 break;
1812 1818
1813 case SO_RCVLOWAT: 1819 case SO_RCVLOWAT:
1814 if (optval > so->so_rcv.sb_hiwat) 1820 if (optval > so->so_rcv.sb_hiwat)
1815 optval = so->so_rcv.sb_hiwat; 1821 optval = so->so_rcv.sb_hiwat;
1816 1822
1817 so->so_rcv.sb_lowat = optval; 1823 so->so_rcv.sb_lowat = optval;
1818 break; 1824 break;
1819 } 1825 }
1820 break; 1826 break;
1821 1827
1822#ifdef COMPAT_50 1828#ifdef COMPAT_50
1823 case SO_OSNDTIMEO: 1829 case SO_OSNDTIMEO:
1824 case SO_ORCVTIMEO: { 1830 case SO_ORCVTIMEO: {
1825 struct timeval50 otv; 1831 struct timeval50 otv;
1826 error = sockopt_get(sopt, &otv, sizeof(otv)); 1832 error = sockopt_get(sopt, &otv, sizeof(otv));
1827 if (error) { 1833 if (error) {
1828 solock(so); 1834 solock(so);
1829 break; 1835 break;
1830 } 1836 }
1831 timeval50_to_timeval(&otv, &tv); 1837 timeval50_to_timeval(&otv, &tv);
1832 opt = opt == SO_OSNDTIMEO ? SO_SNDTIMEO : SO_RCVTIMEO; 1838 opt = opt == SO_OSNDTIMEO ? SO_SNDTIMEO : SO_RCVTIMEO;
1833 error = 0; 1839 error = 0;
1834 /*FALLTHROUGH*/ 1840 /*FALLTHROUGH*/
1835 } 1841 }
1836#endif /* COMPAT_50 */ 1842#endif /* COMPAT_50 */
1837 1843
1838 case SO_SNDTIMEO: 1844 case SO_SNDTIMEO:
1839 case SO_RCVTIMEO: 1845 case SO_RCVTIMEO:
1840 if (error) 1846 if (error)
1841 error = sockopt_get(sopt, &tv, sizeof(tv)); 1847 error = sockopt_get(sopt, &tv, sizeof(tv));
1842 solock(so); 1848 solock(so);
1843 if (error) 1849 if (error)
1844 break; 1850 break;
1845 1851
1846 if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) { 1852 if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) {
1847 error = EDOM; 1853 error = EDOM;
1848 break; 1854 break;
1849 } 1855 }
1850 1856
1851 optval = tv.tv_sec * hz + tv.tv_usec / tick; 1857 optval = tv.tv_sec * hz + tv.tv_usec / tick;
1852 if (optval == 0 && tv.tv_usec != 0) 1858 if (optval == 0 && tv.tv_usec != 0)
1853 optval = 1; 1859 optval = 1;
1854 1860
1855 switch (opt) { 1861 switch (opt) {
1856 case SO_SNDTIMEO: 1862 case SO_SNDTIMEO:
1857 so->so_snd.sb_timeo = optval; 1863 so->so_snd.sb_timeo = optval;
1858 break; 1864 break;
1859 case SO_RCVTIMEO: 1865 case SO_RCVTIMEO:
1860 so->so_rcv.sb_timeo = optval; 1866 so->so_rcv.sb_timeo = optval;
1861 break; 1867 break;
1862 } 1868 }
1863 break; 1869 break;
1864 1870
1865 default: 1871 default:
1866 solock(so); 1872 solock(so);
1867 error = ENOPROTOOPT; 1873 error = ENOPROTOOPT;
1868 break; 1874 break;
1869 } 1875 }
1870 KASSERT(solocked(so)); 1876 KASSERT(solocked(so));
1871 return error; 1877 return error;
1872} 1878}
1873 1879
1874int 1880int
1875sosetopt(struct socket *so, struct sockopt *sopt) 1881sosetopt(struct socket *so, struct sockopt *sopt)
1876{ 1882{
1877 int error, prerr; 1883 int error, prerr;
1878 1884
1879 if (sopt->sopt_level == SOL_SOCKET) { 1885 if (sopt->sopt_level == SOL_SOCKET) {
1880 error = sosetopt1(so, sopt); 1886 error = sosetopt1(so, sopt);
1881 KASSERT(solocked(so)); 1887 KASSERT(solocked(so));
1882 } else { 1888 } else {
1883 error = ENOPROTOOPT; 1889 error = ENOPROTOOPT;
1884 solock(so); 1890 solock(so);
1885 } 1891 }
1886 1892
1887 if ((error == 0 || error == ENOPROTOOPT) && 1893 if ((error == 0 || error == ENOPROTOOPT) &&
1888 so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) { 1894 so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) {
1889 /* give the protocol stack a shot */ 1895 /* give the protocol stack a shot */
1890 prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, sopt); 1896 prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, sopt);
1891 if (prerr == 0) 1897 if (prerr == 0)
1892 error = 0; 1898 error = 0;
1893 else if (prerr != ENOPROTOOPT) 1899 else if (prerr != ENOPROTOOPT)
1894 error = prerr; 1900 error = prerr;
1895 } 1901 }
1896 sounlock(so); 1902 sounlock(so);
1897 return error; 1903 return error;
1898} 1904}
1899 1905
1900/* 1906/*
1901 * so_setsockopt() is a wrapper providing a sockopt structure for sosetopt() 1907 * so_setsockopt() is a wrapper providing a sockopt structure for sosetopt()
1902 */ 1908 */
1903int 1909int
1904so_setsockopt(struct lwp *l, struct socket *so, int level, int name, 1910so_setsockopt(struct lwp *l, struct socket *so, int level, int name,
1905 const void *val, size_t valsize) 1911 const void *val, size_t valsize)
1906{ 1912{
1907 struct sockopt sopt; 1913 struct sockopt sopt;
1908 int error; 1914 int error;
1909 1915
1910 KASSERT(valsize == 0 || val != NULL); 1916 KASSERT(valsize == 0 || val != NULL);
1911 1917
1912 sockopt_init(&sopt, level, name, valsize); 1918 sockopt_init(&sopt, level, name, valsize);
1913 sockopt_set(&sopt, val, valsize); 1919 sockopt_set(&sopt, val, valsize);
1914 1920
1915 error = sosetopt(so, &sopt); 1921 error = sosetopt(so, &sopt);
1916 1922
1917 sockopt_destroy(&sopt); 1923 sockopt_destroy(&sopt);
1918 1924
1919 return error; 1925 return error;
1920} 1926}
1921 1927
1922/* 1928/*
1923 * internal get SOL_SOCKET options 1929 * internal get SOL_SOCKET options
1924 */ 1930 */
1925static int 1931static int
1926sogetopt1(struct socket *so, struct sockopt *sopt) 1932sogetopt1(struct socket *so, struct sockopt *sopt)
1927{ 1933{
1928 int error, optval, opt; 1934 int error, optval, opt;
1929 struct linger l; 1935 struct linger l;
1930 struct timeval tv; 1936 struct timeval tv;
1931 1937
1932 switch ((opt = sopt->sopt_name)) { 1938 switch ((opt = sopt->sopt_name)) {
1933 1939
1934 case SO_ACCEPTFILTER: 1940 case SO_ACCEPTFILTER:
1935 error = accept_filt_getopt(so, sopt); 1941 error = accept_filt_getopt(so, sopt);
1936 break; 1942 break;
1937 1943
1938 case SO_LINGER: 1944 case SO_LINGER:
1939 l.l_onoff = (so->so_options & SO_LINGER) ? 1 : 0; 1945 l.l_onoff = (so->so_options & SO_LINGER) ? 1 : 0;
1940 l.l_linger = so->so_linger; 1946 l.l_linger = so->so_linger;
1941 1947
1942 error = sockopt_set(sopt, &l, sizeof(l)); 1948 error = sockopt_set(sopt, &l, sizeof(l));
1943 break; 1949 break;
1944 1950
1945 case SO_USELOOPBACK: 1951 case SO_USELOOPBACK:
1946 case SO_DONTROUTE: 1952 case SO_DONTROUTE:
1947 case SO_DEBUG: 1953 case SO_DEBUG:
1948 case SO_KEEPALIVE: 1954 case SO_KEEPALIVE:
1949 case SO_REUSEADDR: 1955 case SO_REUSEADDR:
1950 case SO_REUSEPORT: 1956 case SO_REUSEPORT:
1951 case SO_BROADCAST: 1957 case SO_BROADCAST:
1952 case SO_OOBINLINE: 1958 case SO_OOBINLINE:
1953 case SO_TIMESTAMP: 1959 case SO_TIMESTAMP:
1954 case SO_NOSIGPIPE: 1960 case SO_NOSIGPIPE:
1955#ifdef SO_OTIMESTAMP 1961#ifdef SO_OTIMESTAMP
1956 case SO_OTIMESTAMP: 1962 case SO_OTIMESTAMP:
1957#endif 1963#endif
1958 case SO_ACCEPTCONN: 1964 case SO_ACCEPTCONN:
1959 error = sockopt_setint(sopt, (so->so_options & opt) ? 1 : 0); 1965 error = sockopt_setint(sopt, (so->so_options & opt) ? 1 : 0);
1960 break; 1966 break;
1961 1967
1962 case SO_TYPE: 1968 case SO_TYPE:
1963 error = sockopt_setint(sopt, so->so_type); 1969 error = sockopt_setint(sopt, so->so_type);
1964 break; 1970 break;
1965 1971
1966 case SO_ERROR: 1972 case SO_ERROR:
1967 error = sockopt_setint(sopt, so->so_error); 1973 error = sockopt_setint(sopt, so->so_error);
1968 so->so_error = 0; 1974 so->so_error = 0;
1969 break; 1975 break;
1970 1976
1971 case SO_SNDBUF: 1977 case SO_SNDBUF:
1972 error = sockopt_setint(sopt, so->so_snd.sb_hiwat); 1978 error = sockopt_setint(sopt, so->so_snd.sb_hiwat);
1973 break; 1979 break;
1974 1980
1975 case SO_RCVBUF: 1981 case SO_RCVBUF:
1976 error = sockopt_setint(sopt, so->so_rcv.sb_hiwat); 1982 error = sockopt_setint(sopt, so->so_rcv.sb_hiwat);
1977 break; 1983 break;
1978 1984
1979 case SO_SNDLOWAT: 1985 case SO_SNDLOWAT:
1980 error = sockopt_setint(sopt, so->so_snd.sb_lowat); 1986 error = sockopt_setint(sopt, so->so_snd.sb_lowat);
1981 break; 1987 break;
1982 1988
1983 case SO_RCVLOWAT: 1989 case SO_RCVLOWAT:
1984 error = sockopt_setint(sopt, so->so_rcv.sb_lowat); 1990 error = sockopt_setint(sopt, so->so_rcv.sb_lowat);
1985 break; 1991 break;
1986 1992
1987#ifdef COMPAT_50 1993#ifdef COMPAT_50
1988 case SO_OSNDTIMEO: 1994 case SO_OSNDTIMEO:
1989 case SO_ORCVTIMEO: { 1995 case SO_ORCVTIMEO: {
1990 struct timeval50 otv; 1996 struct timeval50 otv;
1991 1997
1992 optval = (opt == SO_OSNDTIMEO ? 1998 optval = (opt == SO_OSNDTIMEO ?
1993 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 1999 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1994 2000
1995 otv.tv_sec = optval / hz; 2001 otv.tv_sec = optval / hz;
1996 otv.tv_usec = (optval % hz) * tick; 2002 otv.tv_usec = (optval % hz) * tick;
1997 2003
1998 error = sockopt_set(sopt, &otv, sizeof(otv)); 2004 error = sockopt_set(sopt, &otv, sizeof(otv));
1999 break; 2005 break;
2000 } 2006 }
2001#endif /* COMPAT_50 */ 2007#endif /* COMPAT_50 */
2002 2008
2003 case SO_SNDTIMEO: 2009 case SO_SNDTIMEO:
2004 case SO_RCVTIMEO: 2010 case SO_RCVTIMEO:
2005 optval = (opt == SO_SNDTIMEO ? 2011 optval = (opt == SO_SNDTIMEO ?
2006 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 2012 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2007 2013
2008 tv.tv_sec = optval / hz; 2014 tv.tv_sec = optval / hz;
2009 tv.tv_usec = (optval % hz) * tick; 2015 tv.tv_usec = (optval % hz) * tick;
2010 2016
2011 error = sockopt_set(sopt, &tv, sizeof(tv)); 2017 error = sockopt_set(sopt, &tv, sizeof(tv));
2012 break; 2018 break;
2013 2019
2014 case SO_OVERFLOWED: 2020 case SO_OVERFLOWED:
2015 error = sockopt_setint(sopt, so->so_rcv.sb_overflowed); 2021 error = sockopt_setint(sopt, so->so_rcv.sb_overflowed);
2016 break; 2022 break;
2017 2023
2018 default: 2024 default:
2019 error = ENOPROTOOPT; 2025 error = ENOPROTOOPT;
2020 break; 2026 break;
2021 } 2027 }
2022 2028
2023 return (error); 2029 return (error);
2024} 2030}
2025 2031
2026int 2032int
2027sogetopt(struct socket *so, struct sockopt *sopt) 2033sogetopt(struct socket *so, struct sockopt *sopt)
2028{ 2034{
2029 int error; 2035 int error;
2030 2036
2031 solock(so); 2037 solock(so);
2032 if (sopt->sopt_level != SOL_SOCKET) { 2038 if (sopt->sopt_level != SOL_SOCKET) {
2033 if (so->so_proto && so->so_proto->pr_ctloutput) { 2039 if (so->so_proto && so->so_proto->pr_ctloutput) {
2034 error = ((*so->so_proto->pr_ctloutput) 2040 error = ((*so->so_proto->pr_ctloutput)
2035 (PRCO_GETOPT, so, sopt)); 2041 (PRCO_GETOPT, so, sopt));
2036 } else 2042 } else
2037 error = (ENOPROTOOPT); 2043 error = (ENOPROTOOPT);
2038 } else { 2044 } else {
2039 error = sogetopt1(so, sopt); 2045 error = sogetopt1(so, sopt);
2040 } 2046 }
2041 sounlock(so); 2047 sounlock(so);
2042 return (error); 2048 return (error);
2043} 2049}
2044 2050
2045/* 2051/*
2046 * alloc sockopt data buffer buffer 2052 * alloc sockopt data buffer buffer
2047 * - will be released at destroy 2053 * - will be released at destroy
2048 */ 2054 */
2049static int 2055static int
2050sockopt_alloc(struct sockopt *sopt, size_t len, km_flag_t kmflag) 2056sockopt_alloc(struct sockopt *sopt, size_t len, km_flag_t kmflag)
2051{ 2057{
2052 2058
2053 KASSERT(sopt->sopt_size == 0); 2059 KASSERT(sopt->sopt_size == 0);
2054 2060
2055 if (len > sizeof(sopt->sopt_buf)) { 2061 if (len > sizeof(sopt->sopt_buf)) {
2056 sopt->sopt_data = kmem_zalloc(len, kmflag); 2062 sopt->sopt_data = kmem_zalloc(len, kmflag);
2057 if (sopt->sopt_data == NULL) 2063 if (sopt->sopt_data == NULL)
2058 return ENOMEM; 2064 return ENOMEM;
2059 } else 2065 } else
2060 sopt->sopt_data = sopt->sopt_buf; 2066 sopt->sopt_data = sopt->sopt_buf;
2061 2067
2062 sopt->sopt_size = len; 2068 sopt->sopt_size = len;
2063 return 0; 2069 return 0;
2064} 2070}
2065 2071
2066/* 2072/*
2067 * initialise sockopt storage 2073 * initialise sockopt storage
2068 * - MAY sleep during allocation 2074 * - MAY sleep during allocation
2069 */ 2075 */
2070void 2076void
2071sockopt_init(struct sockopt *sopt, int level, int name, size_t size) 2077sockopt_init(struct sockopt *sopt, int level, int name, size_t size)
2072{ 2078{
2073 2079
2074 memset(sopt, 0, sizeof(*sopt)); 2080 memset(sopt, 0, sizeof(*sopt));
2075 2081
2076 sopt->sopt_level = level; 2082 sopt->sopt_level = level;
2077 sopt->sopt_name = name; 2083 sopt->sopt_name = name;
2078 (void)sockopt_alloc(sopt, size, KM_SLEEP); 2084 (void)sockopt_alloc(sopt, size, KM_SLEEP);
2079} 2085}
2080 2086
2081/* 2087/*
2082 * destroy sockopt storage 2088 * destroy sockopt storage
2083 * - will release any held memory references 2089 * - will release any held memory references
2084 */ 2090 */
2085void 2091void
2086sockopt_destroy(struct sockopt *sopt) 2092sockopt_destroy(struct sockopt *sopt)
2087{ 2093{
2088 2094
2089 if (sopt->sopt_data != sopt->sopt_buf) 2095 if (sopt->sopt_data != sopt->sopt_buf)
2090 kmem_free(sopt->sopt_data, sopt->sopt_size); 2096 kmem_free(sopt->sopt_data, sopt->sopt_size);
2091 2097
2092 memset(sopt, 0, sizeof(*sopt)); 2098 memset(sopt, 0, sizeof(*sopt));
2093} 2099}
2094 2100
2095/* 2101/*
2096 * set sockopt value 2102 * set sockopt value
2097 * - value is copied into sockopt 2103 * - value is copied into sockopt
2098 * - memory is allocated when necessary, will not sleep 2104 * - memory is allocated when necessary, will not sleep
2099 */ 2105 */
2100int 2106int
2101sockopt_set(struct sockopt *sopt, const void *buf, size_t len) 2107sockopt_set(struct sockopt *sopt, const void *buf, size_t len)
2102{ 2108{
2103 int error; 2109 int error;
2104 2110
2105 if (sopt->sopt_size == 0) { 2111 if (sopt->sopt_size == 0) {
2106 error = sockopt_alloc(sopt, len, KM_NOSLEEP); 2112 error = sockopt_alloc(sopt, len, KM_NOSLEEP);
2107 if (error) 2113 if (error)
2108 return error; 2114 return error;
2109 } 2115 }
2110 2116
2111 if (sopt->sopt_size < len) 2117 if (sopt->sopt_size < len)
2112 return EINVAL; 2118 return EINVAL;
2113  2119
2114 memcpy(sopt->sopt_data, buf, len); 2120 memcpy(sopt->sopt_data, buf, len);
2115 sopt->sopt_retsize = len; 2121 sopt->sopt_retsize = len;
2116 2122
2117 return 0; 2123 return 0;
2118} 2124}
2119 2125
/*
 * common case of set sockopt integer value
 */
int
sockopt_setint(struct sockopt *sopt, int val)
{

	return sockopt_set(sopt, &val, sizeof(int));
}
2129 2135
2130/* 2136/*
2131 * get sockopt value 2137 * get sockopt value
2132 * - correct size must be given 2138 * - correct size must be given
2133 */ 2139 */
2134int 2140int
2135sockopt_get(const struct sockopt *sopt, void *buf, size_t len) 2141sockopt_get(const struct sockopt *sopt, void *buf, size_t len)
2136{ 2142{
2137 2143
2138 if (sopt->sopt_size != len) 2144 if (sopt->sopt_size != len)
2139 return EINVAL; 2145 return EINVAL;
2140 2146
2141 memcpy(buf, sopt->sopt_data, len); 2147 memcpy(buf, sopt->sopt_data, len);
2142 return 0; 2148 return 0;
2143} 2149}
2144 2150
/*
 * common case of get sockopt integer value
 */
int
sockopt_getint(const struct sockopt *sopt, int *valp)
{

	return sockopt_get(sopt, valp, sizeof(int));
}
2154 2160
2155/* 2161/*
2156 * set sockopt value from mbuf 2162 * set sockopt value from mbuf
2157 * - ONLY for legacy code 2163 * - ONLY for legacy code
2158 * - mbuf is released by sockopt 2164 * - mbuf is released by sockopt
2159 * - will not sleep 2165 * - will not sleep
2160 */ 2166 */
2161int 2167int
2162sockopt_setmbuf(struct sockopt *sopt, struct mbuf *m) 2168sockopt_setmbuf(struct sockopt *sopt, struct mbuf *m)
2163{ 2169{
2164 size_t len; 2170 size_t len;
2165 int error; 2171 int error;
2166 2172
2167 len = m_length(m); 2173 len = m_length(m);
2168 2174
2169 if (sopt->sopt_size == 0) { 2175 if (sopt->sopt_size == 0) {
2170 error = sockopt_alloc(sopt, len, KM_NOSLEEP); 2176 error = sockopt_alloc(sopt, len, KM_NOSLEEP);
2171 if (error) 2177 if (error)
2172 return error; 2178 return error;
2173 } 2179 }
2174 2180
2175 if (sopt->sopt_size < len) 2181 if (sopt->sopt_size < len)
2176 return EINVAL; 2182 return EINVAL;
2177  2183
2178 m_copydata(m, 0, len, sopt->sopt_data); 2184 m_copydata(m, 0, len, sopt->sopt_data);
2179 m_freem(m); 2185 m_freem(m);
2180 sopt->sopt_retsize = len; 2186 sopt->sopt_retsize = len;
2181 2187
2182 return 0; 2188 return 0;
2183} 2189}
2184 2190
2185/* 2191/*
2186 * get sockopt value into mbuf 2192 * get sockopt value into mbuf
2187 * - ONLY for legacy code 2193 * - ONLY for legacy code
2188 * - mbuf to be released by the caller 2194 * - mbuf to be released by the caller
2189 * - will not sleep 2195 * - will not sleep
2190 */ 2196 */
2191struct mbuf * 2197struct mbuf *
2192sockopt_getmbuf(const struct sockopt *sopt) 2198sockopt_getmbuf(const struct sockopt *sopt)
2193{ 2199{
2194 struct mbuf *m; 2200 struct mbuf *m;
2195 2201
2196 if (sopt->sopt_size > MCLBYTES) 2202 if (sopt->sopt_size > MCLBYTES)
2197 return NULL; 2203 return NULL;
2198 2204
2199 m = m_get(M_DONTWAIT, MT_SOOPTS); 2205 m = m_get(M_DONTWAIT, MT_SOOPTS);
2200 if (m == NULL) 2206 if (m == NULL)
2201 return NULL; 2207 return NULL;
2202 2208
2203 if (sopt->sopt_size > MLEN) { 2209 if (sopt->sopt_size > MLEN) {
2204 MCLGET(m, M_DONTWAIT); 2210 MCLGET(m, M_DONTWAIT);
2205 if ((m->m_flags & M_EXT) == 0) { 2211 if ((m->m_flags & M_EXT) == 0) {
2206 m_free(m); 2212 m_free(m);
2207 return NULL; 2213 return NULL;
2208 } 2214 }
2209 } 2215 }
2210 2216
2211 memcpy(mtod(m, void *), sopt->sopt_data, sopt->sopt_size); 2217 memcpy(mtod(m, void *), sopt->sopt_data, sopt->sopt_size);
2212 m->m_len = sopt->sopt_size; 2218 m->m_len = sopt->sopt_size;
2213 2219
2214 return m; 2220 return m;
2215} 2221}
2216 2222
2217void 2223void
2218sohasoutofband(struct socket *so) 2224sohasoutofband(struct socket *so)
2219{ 2225{
2220 2226
2221 fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so); 2227 fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
2222 selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, NOTE_SUBMIT); 2228 selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, NOTE_SUBMIT);
2223} 2229}
2224 2230
2225static void 2231static void
2226filt_sordetach(struct knote *kn) 2232filt_sordetach(struct knote *kn)
2227{ 2233{
2228 struct socket *so; 2234 struct socket *so;
2229 2235
2230 so = ((file_t *)kn->kn_obj)->f_socket; 2236 so = ((file_t *)kn->kn_obj)->f_socket;
2231 solock(so); 2237 solock(so);
2232 SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext); 2238 SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext);
2233 if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist)) 2239 if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist))
2234 so->so_rcv.sb_flags &= ~SB_KNOTE; 2240 so->so_rcv.sb_flags &= ~SB_KNOTE;
2235 sounlock(so); 2241 sounlock(so);
2236} 2242}
2237 2243
2238/*ARGSUSED*/ 2244/*ARGSUSED*/
2239static int 2245static int
2240filt_soread(struct knote *kn, long hint) 2246filt_soread(struct knote *kn, long hint)
2241{ 2247{
2242 struct socket *so; 2248 struct socket *so;
2243 int rv; 2249 int rv;
2244 2250
2245 so = ((file_t *)kn->kn_obj)->f_socket; 2251 so = ((file_t *)kn->kn_obj)->f_socket;
2246 if (hint != NOTE_SUBMIT) 2252 if (hint != NOTE_SUBMIT)
2247 solock(so); 2253 solock(so);
2248 kn->kn_data = so->so_rcv.sb_cc; 2254 kn->kn_data = so->so_rcv.sb_cc;
2249 if (so->so_state & SS_CANTRCVMORE) { 2255 if (so->so_state & SS_CANTRCVMORE) {
2250 kn->kn_flags |= EV_EOF; 2256 kn->kn_flags |= EV_EOF;
2251 kn->kn_fflags = so->so_error; 2257 kn->kn_fflags = so->so_error;
2252 rv = 1; 2258 rv = 1;
2253 } else if (so->so_error) 2259 } else if (so->so_error || so->so_rerror)
2254 rv = 1; 2260 rv = 1;
2255 else if (kn->kn_sfflags & NOTE_LOWAT) 2261 else if (kn->kn_sfflags & NOTE_LOWAT)
2256 rv = (kn->kn_data >= kn->kn_sdata); 2262 rv = (kn->kn_data >= kn->kn_sdata);
2257 else 2263 else
2258 rv = (kn->kn_data >= so->so_rcv.sb_lowat); 2264 rv = (kn->kn_data >= so->so_rcv.sb_lowat);
2259 if (hint != NOTE_SUBMIT) 2265 if (hint != NOTE_SUBMIT)
2260 sounlock(so); 2266 sounlock(so);
2261 return rv; 2267 return rv;
2262} 2268}
2263 2269
2264static void 2270static void
2265filt_sowdetach(struct knote *kn) 2271filt_sowdetach(struct knote *kn)
2266{ 2272{
2267 struct socket *so; 2273 struct socket *so;
2268 2274
2269 so = ((file_t *)kn->kn_obj)->f_socket; 2275 so = ((file_t *)kn->kn_obj)->f_socket;
2270 solock(so); 2276 solock(so);
2271 SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext); 2277 SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext);
2272 if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist)) 2278 if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist))
2273 so->so_snd.sb_flags &= ~SB_KNOTE; 2279 so->so_snd.sb_flags &= ~SB_KNOTE;
2274 sounlock(so); 2280 sounlock(so);
2275} 2281}
2276 2282
2277/*ARGSUSED*/ 2283/*ARGSUSED*/
2278static int 2284static int
2279filt_sowrite(struct knote *kn, long hint) 2285filt_sowrite(struct knote *kn, long hint)
2280{ 2286{
2281 struct socket *so; 2287 struct socket *so;
2282 int rv; 2288 int rv;
2283 2289
2284 so = ((file_t *)kn->kn_obj)->f_socket; 2290 so = ((file_t *)kn->kn_obj)->f_socket;
2285 if (hint != NOTE_SUBMIT) 2291 if (hint != NOTE_SUBMIT)
2286 solock(so); 2292 solock(so);
2287 kn->kn_data = sbspace(&so->so_snd); 2293 kn->kn_data = sbspace(&so->so_snd);
2288 if (so->so_state & SS_CANTSENDMORE) { 2294 if (so->so_state & SS_CANTSENDMORE) {
2289 kn->kn_flags |= EV_EOF; 2295 kn->kn_flags |= EV_EOF;
2290 kn->kn_fflags = so->so_error; 2296 kn->kn_fflags = so->so_error;
2291 rv = 1; 2297 rv = 1;
2292 } else if (so->so_error) 2298 } else if (so->so_error)
2293 rv = 1; 2299 rv = 1;
2294 else if (((so->so_state & SS_ISCONNECTED) == 0) && 2300 else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2295 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 2301 (so->so_proto->pr_flags & PR_CONNREQUIRED))
2296 rv = 0; 2302 rv = 0;
2297 else if (kn->kn_sfflags & NOTE_LOWAT) 2303 else if (kn->kn_sfflags & NOTE_LOWAT)
2298 rv = (kn->kn_data >= kn->kn_sdata); 2304 rv = (kn->kn_data >= kn->kn_sdata);
2299 else 2305 else
2300 rv = (kn->kn_data >= so->so_snd.sb_lowat); 2306 rv = (kn->kn_data >= so->so_snd.sb_lowat);
2301 if (hint != NOTE_SUBMIT) 2307 if (hint != NOTE_SUBMIT)
2302 sounlock(so); 2308 sounlock(so);
2303 return rv; 2309 return rv;
2304} 2310}
2305 2311
2306/*ARGSUSED*/ 2312/*ARGSUSED*/
2307static int 2313static int
2308filt_solisten(struct knote *kn, long hint) 2314filt_solisten(struct knote *kn, long hint)
2309{ 2315{
2310 struct socket *so; 2316 struct socket *so;
2311 int rv; 2317 int rv;
2312 2318
2313 so = ((file_t *)kn->kn_obj)->f_socket; 2319 so = ((file_t *)kn->kn_obj)->f_socket;
2314 2320
2315 /* 2321 /*
2316 * Set kn_data to number of incoming connections, not 2322 * Set kn_data to number of incoming connections, not
2317 * counting partial (incomplete) connections. 2323 * counting partial (incomplete) connections.
2318 */ 2324 */
2319 if (hint != NOTE_SUBMIT) 2325 if (hint != NOTE_SUBMIT)
2320 solock(so); 2326 solock(so);
2321 kn->kn_data = so->so_qlen; 2327 kn->kn_data = so->so_qlen;
2322 rv = (kn->kn_data > 0); 2328 rv = (kn->kn_data > 0);
2323 if (hint != NOTE_SUBMIT) 2329 if (hint != NOTE_SUBMIT)
2324 sounlock(so); 2330 sounlock(so);
2325 return rv; 2331 return rv;
2326} 2332}
2327 2333
2328static const struct filterops solisten_filtops = { 2334static const struct filterops solisten_filtops = {
2329 .f_isfd = 1, 2335 .f_isfd = 1,
2330 .f_attach = NULL, 2336 .f_attach = NULL,
2331 .f_detach = filt_sordetach, 2337 .f_detach = filt_sordetach,
2332 .f_event = filt_solisten, 2338 .f_event = filt_solisten,
2333}; 2339};
2334 2340
2335static const struct filterops soread_filtops = { 2341static const struct filterops soread_filtops = {
2336 .f_isfd = 1, 2342 .f_isfd = 1,
2337 .f_attach = NULL, 2343 .f_attach = NULL,
2338 .f_detach = filt_sordetach, 2344 .f_detach = filt_sordetach,
2339 .f_event = filt_soread, 2345 .f_event = filt_soread,
2340}; 2346};
2341 2347
2342static const struct filterops sowrite_filtops = { 2348static const struct filterops sowrite_filtops = {
2343 .f_isfd = 1, 2349 .f_isfd = 1,
2344 .f_attach = NULL, 2350 .f_attach = NULL,
2345 .f_detach = filt_sowdetach, 2351 .f_detach = filt_sowdetach,
2346 .f_event = filt_sowrite, 2352 .f_event = filt_sowrite,
2347}; 2353};
2348 2354
2349int 2355int
2350soo_kqfilter(struct file *fp, struct knote *kn) 2356soo_kqfilter(struct file *fp, struct knote *kn)
2351{ 2357{
2352 struct socket *so; 2358 struct socket *so;
2353 struct sockbuf *sb; 2359 struct sockbuf *sb;
2354 2360
2355 so = ((file_t *)kn->kn_obj)->f_socket; 2361 so = ((file_t *)kn->kn_obj)->f_socket;
2356 solock(so); 2362 solock(so);
2357 switch (kn->kn_filter) { 2363 switch (kn->kn_filter) {
2358 case EVFILT_READ: 2364 case EVFILT_READ:
2359 if (so->so_options & SO_ACCEPTCONN) 2365 if (so->so_options & SO_ACCEPTCONN)
2360 kn->kn_fop = &solisten_filtops; 2366 kn->kn_fop = &solisten_filtops;
2361 else 2367 else
2362 kn->kn_fop = &soread_filtops; 2368 kn->kn_fop = &soread_filtops;
2363 sb = &so->so_rcv; 2369 sb = &so->so_rcv;
2364 break; 2370 break;
2365 case EVFILT_WRITE: 2371 case EVFILT_WRITE:
2366 kn->kn_fop = &sowrite_filtops; 2372 kn->kn_fop = &sowrite_filtops;
2367 sb = &so->so_snd; 2373 sb = &so->so_snd;
2368 break; 2374 break;
2369 default: 2375 default:
2370 sounlock(so); 2376 sounlock(so);
2371 return (EINVAL); 2377 return (EINVAL);
2372 } 2378 }
2373 SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext); 2379 SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext);
2374 sb->sb_flags |= SB_KNOTE; 2380 sb->sb_flags |= SB_KNOTE;
2375 sounlock(so); 2381 sounlock(so);
2376 return (0); 2382 return (0);
2377} 2383}
2378 2384
2379static int 2385static int
2380sodopoll(struct socket *so, int events) 2386sodopoll(struct socket *so, int events)
2381{ 2387{
2382 int revents; 2388 int revents;
2383 2389
2384 revents = 0; 2390 revents = 0;
2385 2391
2386 if (events & (POLLIN | POLLRDNORM)) 2392 if (events & (POLLIN | POLLRDNORM))
2387 if (soreadable(so)) 2393 if (soreadable(so))
2388 revents |= events & (POLLIN | POLLRDNORM); 2394 revents |= events & (POLLIN | POLLRDNORM);
2389 2395
2390 if (events & (POLLOUT | POLLWRNORM)) 2396 if (events & (POLLOUT | POLLWRNORM))
2391 if (sowritable(so)) 2397 if (sowritable(so))
2392 revents |= events & (POLLOUT | POLLWRNORM); 2398 revents |= events & (POLLOUT | POLLWRNORM);
2393 2399
2394 if (events & (POLLPRI | POLLRDBAND)) 2400 if (events & (POLLPRI | POLLRDBAND))
2395 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) 2401 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
2396 revents |= events & (POLLPRI | POLLRDBAND); 2402 revents |= events & (POLLPRI | POLLRDBAND);
2397 2403
2398 return revents; 2404 return revents;
2399} 2405}
2400 2406
2401int 2407int
2402sopoll(struct socket *so, int events) 2408sopoll(struct socket *so, int events)
2403{ 2409{
2404 int revents = 0; 2410 int revents = 0;
2405 2411
2406#ifndef DIAGNOSTIC 2412#ifndef DIAGNOSTIC
2407 /* 2413 /*
2408 * Do a quick, unlocked check in expectation that the socket 2414 * Do a quick, unlocked check in expectation that the socket
2409 * will be ready for I/O. Don't do this check if DIAGNOSTIC, 2415 * will be ready for I/O. Don't do this check if DIAGNOSTIC,
2410 * as the solocked() assertions will fail. 2416 * as the solocked() assertions will fail.
2411 */ 2417 */
2412 if ((revents = sodopoll(so, events)) != 0) 2418 if ((revents = sodopoll(so, events)) != 0)
2413 return revents; 2419 return revents;
2414#endif 2420#endif
2415 2421
2416 solock(so); 2422 solock(so);
2417 if ((revents = sodopoll(so, events)) == 0) { 2423 if ((revents = sodopoll(so, events)) == 0) {
2418 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { 2424 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2419 selrecord(curlwp, &so->so_rcv.sb_sel); 2425 selrecord(curlwp, &so->so_rcv.sb_sel);
2420 so->so_rcv.sb_flags |= SB_NOTIFY; 2426 so->so_rcv.sb_flags |= SB_NOTIFY;
2421 } 2427 }
2422 2428
2423 if (events & (POLLOUT | POLLWRNORM)) { 2429 if (events & (POLLOUT | POLLWRNORM)) {
2424 selrecord(curlwp, &so->so_snd.sb_sel); 2430 selrecord(curlwp, &so->so_snd.sb_sel);
2425 so->so_snd.sb_flags |= SB_NOTIFY; 2431 so->so_snd.sb_flags |= SB_NOTIFY;
2426 } 2432 }
2427 } 2433 }
2428 sounlock(so); 2434 sounlock(so);
2429 2435
2430 return revents; 2436 return revents;
2431} 2437}
2432 2438
2433struct mbuf ** 2439struct mbuf **
2434sbsavetimestamp(int opt, struct mbuf **mp) 2440sbsavetimestamp(int opt, struct mbuf **mp)
2435{ 2441{
2436 struct timeval tv; 2442 struct timeval tv;
2437 microtime(&tv); 2443 microtime(&tv);
2438 2444
2439#ifdef SO_OTIMESTAMP 2445#ifdef SO_OTIMESTAMP
2440 if (opt & SO_OTIMESTAMP) { 2446 if (opt & SO_OTIMESTAMP) {
2441 struct timeval50 tv50; 2447 struct timeval50 tv50;
2442 2448
2443 timeval_to_timeval50(&tv, &tv50); 2449 timeval_to_timeval50(&tv, &tv50);
2444 *mp = sbcreatecontrol(&tv50, sizeof(tv50), 2450 *mp = sbcreatecontrol(&tv50, sizeof(tv50),
2445 SCM_OTIMESTAMP, SOL_SOCKET); 2451 SCM_OTIMESTAMP, SOL_SOCKET);
2446 if (*mp) 2452 if (*mp)
2447 mp = &(*mp)->m_next; 2453 mp = &(*mp)->m_next;
2448 } else 2454 } else
2449#endif 2455#endif
2450 2456
2451 if (opt & SO_TIMESTAMP) { 2457 if (opt & SO_TIMESTAMP) {
2452 *mp = sbcreatecontrol(&tv, sizeof(tv), 2458 *mp = sbcreatecontrol(&tv, sizeof(tv),
2453 SCM_TIMESTAMP, SOL_SOCKET); 2459 SCM_TIMESTAMP, SOL_SOCKET);
2454 if (*mp) 2460 if (*mp)
2455 mp = &(*mp)->m_next; 2461 mp = &(*mp)->m_next;
2456 } 2462 }
2457 return mp; 2463 return mp;
2458} 2464}
2459 2465
2460 2466
2461#include <sys/sysctl.h> 2467#include <sys/sysctl.h>
2462 2468
2463static int sysctl_kern_somaxkva(SYSCTLFN_PROTO); 2469static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);
2464static int sysctl_kern_sbmax(SYSCTLFN_PROTO); 2470static int sysctl_kern_sbmax(SYSCTLFN_PROTO);
2465 2471
2466/* 2472/*
2467 * sysctl helper routine for kern.somaxkva. ensures that the given 2473 * sysctl helper routine for kern.somaxkva. ensures that the given
2468 * value is not too small. 2474 * value is not too small.
2469 * (XXX should we maybe make sure it's not too large as well?) 2475 * (XXX should we maybe make sure it's not too large as well?)
2470 */ 2476 */
2471static int 2477static int
2472sysctl_kern_somaxkva(SYSCTLFN_ARGS) 2478sysctl_kern_somaxkva(SYSCTLFN_ARGS)
2473{ 2479{
2474 int error, new_somaxkva; 2480 int error, new_somaxkva;
2475 struct sysctlnode node; 2481 struct sysctlnode node;
2476 2482
2477 new_somaxkva = somaxkva; 2483 new_somaxkva = somaxkva;
2478 node = *rnode; 2484 node = *rnode;
2479 node.sysctl_data = &new_somaxkva; 2485 node.sysctl_data = &new_somaxkva;
2480 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 2486 error = sysctl_lookup(SYSCTLFN_CALL(&node));
2481 if (error || newp == NULL) 2487 if (error || newp == NULL)
2482 return (error); 2488 return (error);
2483 2489
2484 if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */ 2490 if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */
2485 return (EINVAL); 2491 return (EINVAL);
2486 2492
2487 mutex_enter(&so_pendfree_lock); 2493 mutex_enter(&so_pendfree_lock);
2488 somaxkva = new_somaxkva; 2494 somaxkva = new_somaxkva;
2489 cv_broadcast(&socurkva_cv); 2495 cv_broadcast(&socurkva_cv);
2490 mutex_exit(&so_pendfree_lock); 2496 mutex_exit(&so_pendfree_lock);
2491 2497
2492 return (error); 2498 return (error);
2493} 2499}
2494 2500
2495/* 2501/*
2496 * sysctl helper routine for kern.sbmax. Basically just ensures that 2502 * sysctl helper routine for kern.sbmax. Basically just ensures that
2497 * any new value is not too small. 2503 * any new value is not too small.
2498 */ 2504 */
2499static int 2505static int
2500sysctl_kern_sbmax(SYSCTLFN_ARGS) 2506sysctl_kern_sbmax(SYSCTLFN_ARGS)
2501{ 2507{
2502 int error, new_sbmax; 2508 int error, new_sbmax;
2503 struct sysctlnode node; 2509 struct sysctlnode node;
2504 2510
2505 new_sbmax = sb_max; 2511 new_sbmax = sb_max;
2506 node = *rnode; 2512 node = *rnode;
2507 node.sysctl_data = &new_sbmax; 2513 node.sysctl_data = &new_sbmax;
2508 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 2514 error = sysctl_lookup(SYSCTLFN_CALL(&node));
2509 if (error || newp == NULL) 2515 if (error || newp == NULL)
2510 return (error); 2516 return (error);
2511 2517
2512 KERNEL_LOCK(1, NULL); 2518 KERNEL_LOCK(1, NULL);
2513 error = sb_max_set(new_sbmax); 2519 error = sb_max_set(new_sbmax);
2514 KERNEL_UNLOCK_ONE(NULL); 2520 KERNEL_UNLOCK_ONE(NULL);
2515 2521
2516 return (error); 2522 return (error);
2517} 2523}
2518 2524
2519static void 2525static void
2520sysctl_kern_socket_setup(void) 2526sysctl_kern_socket_setup(void)
2521{ 2527{
2522 2528
2523 KASSERT(socket_sysctllog == NULL); 2529 KASSERT(socket_sysctllog == NULL);
2524 2530
2525 sysctl_createv(&socket_sysctllog, 0, NULL, NULL, 2531 sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
2526 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2532 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2527 CTLTYPE_INT, "somaxkva", 2533 CTLTYPE_INT, "somaxkva",
2528 SYSCTL_DESCR("Maximum amount of kernel memory to be " 2534 SYSCTL_DESCR("Maximum amount of kernel memory to be "
2529 "used for socket buffers"), 2535 "used for socket buffers"),
2530 sysctl_kern_somaxkva, 0, NULL, 0, 2536 sysctl_kern_somaxkva, 0, NULL, 0,
2531 CTL_KERN, KERN_SOMAXKVA, CTL_EOL); 2537 CTL_KERN, KERN_SOMAXKVA, CTL_EOL);
2532 2538
2533 sysctl_createv(&socket_sysctllog, 0, NULL, NULL, 2539 sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
2534 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2540 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2535 CTLTYPE_INT, "sbmax", 2541 CTLTYPE_INT, "sbmax",
2536 SYSCTL_DESCR("Maximum socket buffer size"), 2542 SYSCTL_DESCR("Maximum socket buffer size"),
2537 sysctl_kern_sbmax, 0, NULL, 0, 2543 sysctl_kern_sbmax, 0, NULL, 0,
2538 CTL_KERN, KERN_SBMAX, CTL_EOL); 2544 CTL_KERN, KERN_SBMAX, CTL_EOL);
2539} 2545}

cvs diff -r1.129 -r1.130 src/sys/kern/uipc_socket2.c (switch to unified diff)

--- src/sys/kern/uipc_socket2.c 2018/04/29 07:13:10 1.129
+++ src/sys/kern/uipc_socket2.c 2018/06/06 09:46:46 1.130
@@ -1,1506 +1,1506 @@ @@ -1,1506 +1,1506 @@
1/* $NetBSD: uipc_socket2.c,v 1.129 2018/04/29 07:13:10 maxv Exp $ */ 1/* $NetBSD: uipc_socket2.c,v 1.130 2018/06/06 09:46:46 roy Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2008 The NetBSD Foundation, Inc. 4 * Copyright (c) 2008 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29/* 29/*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993 30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved. 31 * The Regents of the University of California. All rights reserved.
32 * 32 *
33 * Redistribution and use in source and binary forms, with or without 33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions 34 * modification, are permitted provided that the following conditions
35 * are met: 35 * are met:
36 * 1. Redistributions of source code must retain the above copyright 36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer. 37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright 38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the 39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution. 40 * documentation and/or other materials provided with the distribution.
41 * 3. Neither the name of the University nor the names of its contributors 41 * 3. Neither the name of the University nor the names of its contributors
42 * may be used to endorse or promote products derived from this software 42 * may be used to endorse or promote products derived from this software
43 * without specific prior written permission. 43 * without specific prior written permission.
44 * 44 *
45 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 45 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 48 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * SUCH DAMAGE. 55 * SUCH DAMAGE.
56 * 56 *
57 * @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95 57 * @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95
58 */ 58 */
59 59
60#include <sys/cdefs.h> 60#include <sys/cdefs.h>
61__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.129 2018/04/29 07:13:10 maxv Exp $"); 61__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.130 2018/06/06 09:46:46 roy Exp $");
62 62
63#ifdef _KERNEL_OPT 63#ifdef _KERNEL_OPT
64#include "opt_mbuftrace.h" 64#include "opt_mbuftrace.h"
65#include "opt_sb_max.h" 65#include "opt_sb_max.h"
66#endif 66#endif
67 67
68#include <sys/param.h> 68#include <sys/param.h>
69#include <sys/systm.h> 69#include <sys/systm.h>
70#include <sys/proc.h> 70#include <sys/proc.h>
71#include <sys/file.h> 71#include <sys/file.h>
72#include <sys/buf.h> 72#include <sys/buf.h>
73#include <sys/mbuf.h> 73#include <sys/mbuf.h>
74#include <sys/protosw.h> 74#include <sys/protosw.h>
75#include <sys/domain.h> 75#include <sys/domain.h>
76#include <sys/poll.h> 76#include <sys/poll.h>
77#include <sys/socket.h> 77#include <sys/socket.h>
78#include <sys/socketvar.h> 78#include <sys/socketvar.h>
79#include <sys/signalvar.h> 79#include <sys/signalvar.h>
80#include <sys/kauth.h> 80#include <sys/kauth.h>
81#include <sys/pool.h> 81#include <sys/pool.h>
82#include <sys/uidinfo.h> 82#include <sys/uidinfo.h>
83 83
84/* 84/*
85 * Primitive routines for operating on sockets and socket buffers. 85 * Primitive routines for operating on sockets and socket buffers.
86 * 86 *
87 * Connection life-cycle: 87 * Connection life-cycle:
88 * 88 *
89 * Normal sequence from the active (originating) side: 89 * Normal sequence from the active (originating) side:
90 * 90 *
91 * - soisconnecting() is called during processing of connect() call, 91 * - soisconnecting() is called during processing of connect() call,
92 * - resulting in an eventual call to soisconnected() if/when the 92 * - resulting in an eventual call to soisconnected() if/when the
93 * connection is established. 93 * connection is established.
94 * 94 *
95 * When the connection is torn down during processing of disconnect(): 95 * When the connection is torn down during processing of disconnect():
96 * 96 *
97 * - soisdisconnecting() is called and, 97 * - soisdisconnecting() is called and,
98 * - soisdisconnected() is called when the connection to the peer 98 * - soisdisconnected() is called when the connection to the peer
99 * is totally severed. 99 * is totally severed.
100 * 100 *
101 * The semantics of these routines are such that connectionless protocols 101 * The semantics of these routines are such that connectionless protocols
102 * can call soisconnected() and soisdisconnected() only, bypassing the 102 * can call soisconnected() and soisdisconnected() only, bypassing the
103 * in-progress calls when setting up a ``connection'' takes no time. 103 * in-progress calls when setting up a ``connection'' takes no time.
104 * 104 *
105 * From the passive side, a socket is created with two queues of sockets: 105 * From the passive side, a socket is created with two queues of sockets:
106 * 106 *
107 * - so_q0 (0) for partial connections (i.e. connections in progress) 107 * - so_q0 (0) for partial connections (i.e. connections in progress)
108 * - so_q (1) for connections already made and awaiting user acceptance. 108 * - so_q (1) for connections already made and awaiting user acceptance.
109 * 109 *
110 * As a protocol is preparing incoming connections, it creates a socket 110 * As a protocol is preparing incoming connections, it creates a socket
111 * structure queued on so_q0 by calling sonewconn(). When the connection 111 * structure queued on so_q0 by calling sonewconn(). When the connection
112 * is established, soisconnected() is called, and transfers the 112 * is established, soisconnected() is called, and transfers the
113 * socket structure to so_q, making it available to accept(). 113 * socket structure to so_q, making it available to accept().
114 * 114 *
115 * If a socket is closed with sockets on either so_q0 or so_q, these 115 * If a socket is closed with sockets on either so_q0 or so_q, these
116 * sockets are dropped. 116 * sockets are dropped.
117 * 117 *
118 * Locking rules and assumptions: 118 * Locking rules and assumptions:
119 * 119 *
120 * o socket::so_lock can change on the fly. The low level routines used 120 * o socket::so_lock can change on the fly. The low level routines used
121 * to lock sockets are aware of this. When so_lock is acquired, the 121 * to lock sockets are aware of this. When so_lock is acquired, the
122 * routine locking must check to see if so_lock still points to the 122 * routine locking must check to see if so_lock still points to the
123 * lock that was acquired. If so_lock has changed in the meantime, the 123 * lock that was acquired. If so_lock has changed in the meantime, the
124 * now irrelevant lock that was acquired must be dropped and the lock 124 * now irrelevant lock that was acquired must be dropped and the lock
125 * operation retried. Although not proven here, this is completely safe 125 * operation retried. Although not proven here, this is completely safe
126 * on a multiprocessor system, even with relaxed memory ordering, given 126 * on a multiprocessor system, even with relaxed memory ordering, given
127 * the next two rules: 127 * the next two rules:
128 * 128 *
129 * o In order to mutate so_lock, the lock pointed to by the current value 129 * o In order to mutate so_lock, the lock pointed to by the current value
130 * of so_lock must be held: i.e., the socket must be held locked by the 130 * of so_lock must be held: i.e., the socket must be held locked by the
131 * changing thread. The thread must issue membar_exit() to prevent 131 * changing thread. The thread must issue membar_exit() to prevent
132 * memory accesses being reordered, and can set so_lock to the desired 132 * memory accesses being reordered, and can set so_lock to the desired
133 * value. If the lock pointed to by the new value of so_lock is not 133 * value. If the lock pointed to by the new value of so_lock is not
134 * held by the changing thread, the socket must then be considered 134 * held by the changing thread, the socket must then be considered
135 * unlocked. 135 * unlocked.
136 * 136 *
137 * o If so_lock is mutated, and the previous lock referred to by so_lock 137 * o If so_lock is mutated, and the previous lock referred to by so_lock
138 * could still be visible to other threads in the system (e.g. via file 138 * could still be visible to other threads in the system (e.g. via file
139 * descriptor or protocol-internal reference), then the old lock must 139 * descriptor or protocol-internal reference), then the old lock must
140 * remain valid until the socket and/or protocol control block has been 140 * remain valid until the socket and/or protocol control block has been
141 * torn down. 141 * torn down.
142 * 142 *
143 * o If a socket has a non-NULL so_head value (i.e. is in the process of 143 * o If a socket has a non-NULL so_head value (i.e. is in the process of
144 * connecting), then locking the socket must also lock the socket pointed 144 * connecting), then locking the socket must also lock the socket pointed
145 * to by so_head: their lock pointers must match. 145 * to by so_head: their lock pointers must match.
146 * 146 *
147 * o If a socket has connections in progress (so_q, so_q0 not empty) then 147 * o If a socket has connections in progress (so_q, so_q0 not empty) then
148 * locking the socket must also lock the sockets attached to both queues. 148 * locking the socket must also lock the sockets attached to both queues.
149 * Again, their lock pointers must match. 149 * Again, their lock pointers must match.
150 * 150 *
151 * o Beyond the initial lock assignment in socreate(), assigning locks to 151 * o Beyond the initial lock assignment in socreate(), assigning locks to
152 * sockets is the responsibility of the individual protocols / protocol 152 * sockets is the responsibility of the individual protocols / protocol
153 * domains. 153 * domains.
154 */ 154 */
155 155
156static pool_cache_t socket_cache; 156static pool_cache_t socket_cache;
157u_long sb_max = SB_MAX;/* maximum socket buffer size */ 157u_long sb_max = SB_MAX;/* maximum socket buffer size */
158static u_long sb_max_adj; /* adjusted sb_max */ 158static u_long sb_max_adj; /* adjusted sb_max */
159 159
160void 160void
161soisconnecting(struct socket *so) 161soisconnecting(struct socket *so)
162{ 162{
163 163
164 KASSERT(solocked(so)); 164 KASSERT(solocked(so));
165 165
166 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 166 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
167 so->so_state |= SS_ISCONNECTING; 167 so->so_state |= SS_ISCONNECTING;
168} 168}
169 169
170void 170void
171soisconnected(struct socket *so) 171soisconnected(struct socket *so)
172{ 172{
173 struct socket *head; 173 struct socket *head;
174 174
175 head = so->so_head; 175 head = so->so_head;
176 176
177 KASSERT(solocked(so)); 177 KASSERT(solocked(so));
178 KASSERT(head == NULL || solocked2(so, head)); 178 KASSERT(head == NULL || solocked2(so, head));
179 179
180 so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING); 180 so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING);
181 so->so_state |= SS_ISCONNECTED; 181 so->so_state |= SS_ISCONNECTED;
182 if (head && so->so_onq == &head->so_q0) { 182 if (head && so->so_onq == &head->so_q0) {
183 if ((so->so_options & SO_ACCEPTFILTER) == 0) { 183 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
184 /* 184 /*
185 * Re-enqueue and wake up any waiters, e.g. 185 * Re-enqueue and wake up any waiters, e.g.
186 * processes blocking on accept(). 186 * processes blocking on accept().
187 */ 187 */
188 soqremque(so, 0); 188 soqremque(so, 0);
189 soqinsque(head, so, 1); 189 soqinsque(head, so, 1);
190 sorwakeup(head); 190 sorwakeup(head);
191 cv_broadcast(&head->so_cv); 191 cv_broadcast(&head->so_cv);
192 } else { 192 } else {
193 so->so_upcall = 193 so->so_upcall =
194 head->so_accf->so_accept_filter->accf_callback; 194 head->so_accf->so_accept_filter->accf_callback;
195 so->so_upcallarg = head->so_accf->so_accept_filter_arg; 195 so->so_upcallarg = head->so_accf->so_accept_filter_arg;
196 so->so_rcv.sb_flags |= SB_UPCALL; 196 so->so_rcv.sb_flags |= SB_UPCALL;
197 so->so_options &= ~SO_ACCEPTFILTER; 197 so->so_options &= ~SO_ACCEPTFILTER;
198 (*so->so_upcall)(so, so->so_upcallarg, 198 (*so->so_upcall)(so, so->so_upcallarg,
199 POLLIN|POLLRDNORM, M_DONTWAIT); 199 POLLIN|POLLRDNORM, M_DONTWAIT);
200 } 200 }
201 } else { 201 } else {
202 cv_broadcast(&so->so_cv); 202 cv_broadcast(&so->so_cv);
203 sorwakeup(so); 203 sorwakeup(so);
204 sowwakeup(so); 204 sowwakeup(so);
205 } 205 }
206} 206}
207 207
208void 208void
209soisdisconnecting(struct socket *so) 209soisdisconnecting(struct socket *so)
210{ 210{
211 211
212 KASSERT(solocked(so)); 212 KASSERT(solocked(so));
213 213
214 so->so_state &= ~SS_ISCONNECTING; 214 so->so_state &= ~SS_ISCONNECTING;
215 so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); 215 so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
216 cv_broadcast(&so->so_cv); 216 cv_broadcast(&so->so_cv);
217 sowwakeup(so); 217 sowwakeup(so);
218 sorwakeup(so); 218 sorwakeup(so);
219} 219}
220 220
221void 221void
222soisdisconnected(struct socket *so) 222soisdisconnected(struct socket *so)
223{ 223{
224 224
225 KASSERT(solocked(so)); 225 KASSERT(solocked(so));
226 226
227 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 227 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
228 so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); 228 so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
229 cv_broadcast(&so->so_cv); 229 cv_broadcast(&so->so_cv);
230 sowwakeup(so); 230 sowwakeup(so);
231 sorwakeup(so); 231 sorwakeup(so);
232} 232}
233 233
234void 234void
235soinit2(void) 235soinit2(void)
236{ 236{
237 237
238 socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0, 238 socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0,
239 "socket", NULL, IPL_SOFTNET, NULL, NULL, NULL); 239 "socket", NULL, IPL_SOFTNET, NULL, NULL, NULL);
240} 240}
241 241
242/* 242/*
243 * sonewconn: accept a new connection. 243 * sonewconn: accept a new connection.
244 * 244 *
245 * When an attempt at a new connection is noted on a socket which accepts 245 * When an attempt at a new connection is noted on a socket which accepts
246 * connections, sonewconn(9) is called. If the connection is possible 246 * connections, sonewconn(9) is called. If the connection is possible
247 * (subject to space constraints, etc) then we allocate a new structure, 247 * (subject to space constraints, etc) then we allocate a new structure,
248 * properly linked into the data structure of the original socket. 248 * properly linked into the data structure of the original socket.
249 * 249 *
250 * => If 'soready' is true, then socket will become ready for accept() i.e. 250 * => If 'soready' is true, then socket will become ready for accept() i.e.
251 * inserted into the so_q queue, SS_ISCONNECTED set and waiters awoken. 251 * inserted into the so_q queue, SS_ISCONNECTED set and waiters awoken.
252 * => May be called from soft-interrupt context. 252 * => May be called from soft-interrupt context.
253 * => Listening socket should be locked. 253 * => Listening socket should be locked.
254 * => Returns the new socket locked. 254 * => Returns the new socket locked.
255 */ 255 */
256struct socket * 256struct socket *
257sonewconn(struct socket *head, bool soready) 257sonewconn(struct socket *head, bool soready)
258{ 258{
259 struct socket *so; 259 struct socket *so;
260 int soqueue, error; 260 int soqueue, error;
261 261
262 KASSERT(solocked(head)); 262 KASSERT(solocked(head));
263 263
264 if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2) { 264 if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2) {
265 /* 265 /*
266 * Listen queue overflow. If there is an accept filter 266 * Listen queue overflow. If there is an accept filter
267 * active, pass through the oldest cxn it's handling. 267 * active, pass through the oldest cxn it's handling.
268 */ 268 */
269 if (head->so_accf == NULL) { 269 if (head->so_accf == NULL) {
270 return NULL; 270 return NULL;
271 } else { 271 } else {
272 struct socket *so2, *next; 272 struct socket *so2, *next;
273 273
274 /* Pass the oldest connection waiting in the 274 /* Pass the oldest connection waiting in the
275 accept filter */ 275 accept filter */
276 for (so2 = TAILQ_FIRST(&head->so_q0); 276 for (so2 = TAILQ_FIRST(&head->so_q0);
277 so2 != NULL; so2 = next) { 277 so2 != NULL; so2 = next) {
278 next = TAILQ_NEXT(so2, so_qe); 278 next = TAILQ_NEXT(so2, so_qe);
279 if (so2->so_upcall == NULL) { 279 if (so2->so_upcall == NULL) {
280 continue; 280 continue;
281 } 281 }
282 so2->so_upcall = NULL; 282 so2->so_upcall = NULL;
283 so2->so_upcallarg = NULL; 283 so2->so_upcallarg = NULL;
284 so2->so_options &= ~SO_ACCEPTFILTER; 284 so2->so_options &= ~SO_ACCEPTFILTER;
285 so2->so_rcv.sb_flags &= ~SB_UPCALL; 285 so2->so_rcv.sb_flags &= ~SB_UPCALL;
286 soisconnected(so2); 286 soisconnected(so2);
287 break; 287 break;
288 } 288 }
289 289
290 /* If nothing was nudged out of the acept filter, bail 290 /* If nothing was nudged out of the acept filter, bail
291 * out; otherwise proceed allocating the socket. */ 291 * out; otherwise proceed allocating the socket. */
292 if (so2 == NULL) { 292 if (so2 == NULL) {
293 return NULL; 293 return NULL;
294 } 294 }
295 } 295 }
296 } 296 }
297 if ((head->so_options & SO_ACCEPTFILTER) != 0) { 297 if ((head->so_options & SO_ACCEPTFILTER) != 0) {
298 soready = false; 298 soready = false;
299 } 299 }
300 soqueue = soready ? 1 : 0; 300 soqueue = soready ? 1 : 0;
301 301
302 if ((so = soget(false)) == NULL) { 302 if ((so = soget(false)) == NULL) {
303 return NULL; 303 return NULL;
304 } 304 }
305 so->so_type = head->so_type; 305 so->so_type = head->so_type;
306 so->so_options = head->so_options & ~SO_ACCEPTCONN; 306 so->so_options = head->so_options & ~SO_ACCEPTCONN;
307 so->so_linger = head->so_linger; 307 so->so_linger = head->so_linger;
308 so->so_state = head->so_state | SS_NOFDREF; 308 so->so_state = head->so_state | SS_NOFDREF;
309 so->so_proto = head->so_proto; 309 so->so_proto = head->so_proto;
310 so->so_timeo = head->so_timeo; 310 so->so_timeo = head->so_timeo;
311 so->so_pgid = head->so_pgid; 311 so->so_pgid = head->so_pgid;
312 so->so_send = head->so_send; 312 so->so_send = head->so_send;
313 so->so_receive = head->so_receive; 313 so->so_receive = head->so_receive;
314 so->so_uidinfo = head->so_uidinfo; 314 so->so_uidinfo = head->so_uidinfo;
315 so->so_cpid = head->so_cpid; 315 so->so_cpid = head->so_cpid;
316 316
317 /* 317 /*
318 * Share the lock with the listening-socket, it may get unshared 318 * Share the lock with the listening-socket, it may get unshared
319 * once the connection is complete. 319 * once the connection is complete.
320 */ 320 */
321 mutex_obj_hold(head->so_lock); 321 mutex_obj_hold(head->so_lock);
322 so->so_lock = head->so_lock; 322 so->so_lock = head->so_lock;
323 323
324 /* 324 /*
325 * Reserve the space for socket buffers. 325 * Reserve the space for socket buffers.
326 */ 326 */
327#ifdef MBUFTRACE 327#ifdef MBUFTRACE
328 so->so_mowner = head->so_mowner; 328 so->so_mowner = head->so_mowner;
329 so->so_rcv.sb_mowner = head->so_rcv.sb_mowner; 329 so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
330 so->so_snd.sb_mowner = head->so_snd.sb_mowner; 330 so->so_snd.sb_mowner = head->so_snd.sb_mowner;
331#endif 331#endif
332 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { 332 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
333 goto out; 333 goto out;
334 } 334 }
335 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 335 so->so_snd.sb_lowat = head->so_snd.sb_lowat;
336 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 336 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
337 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; 337 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
338 so->so_snd.sb_timeo = head->so_snd.sb_timeo; 338 so->so_snd.sb_timeo = head->so_snd.sb_timeo;
339 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & (SB_AUTOSIZE | SB_ASYNC); 339 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & (SB_AUTOSIZE | SB_ASYNC);
340 so->so_snd.sb_flags |= head->so_snd.sb_flags & (SB_AUTOSIZE | SB_ASYNC); 340 so->so_snd.sb_flags |= head->so_snd.sb_flags & (SB_AUTOSIZE | SB_ASYNC);
341 341
342 /* 342 /*
343 * Finally, perform the protocol attach. Note: a new socket 343 * Finally, perform the protocol attach. Note: a new socket
344 * lock may be assigned at this point (if so, it will be held). 344 * lock may be assigned at this point (if so, it will be held).
345 */ 345 */
346 error = (*so->so_proto->pr_usrreqs->pr_attach)(so, 0); 346 error = (*so->so_proto->pr_usrreqs->pr_attach)(so, 0);
347 if (error) { 347 if (error) {
348out: 348out:
349 KASSERT(solocked(so)); 349 KASSERT(solocked(so));
350 KASSERT(so->so_accf == NULL); 350 KASSERT(so->so_accf == NULL);
351 soput(so); 351 soput(so);
352 352
353 /* Note: the listening socket shall stay locked. */ 353 /* Note: the listening socket shall stay locked. */
354 KASSERT(solocked(head)); 354 KASSERT(solocked(head));
355 return NULL; 355 return NULL;
356 } 356 }
357 KASSERT(solocked2(head, so)); 357 KASSERT(solocked2(head, so));
358 358
359 /* 359 /*
360 * Insert into the queue. If ready, update the connection status 360 * Insert into the queue. If ready, update the connection status
361 * and wake up any waiters, e.g. processes blocking on accept(). 361 * and wake up any waiters, e.g. processes blocking on accept().
362 */ 362 */
363 soqinsque(head, so, soqueue); 363 soqinsque(head, so, soqueue);
364 if (soready) { 364 if (soready) {
365 so->so_state |= SS_ISCONNECTED; 365 so->so_state |= SS_ISCONNECTED;
366 sorwakeup(head); 366 sorwakeup(head);
367 cv_broadcast(&head->so_cv); 367 cv_broadcast(&head->so_cv);
368 } 368 }
369 return so; 369 return so;
370} 370}
371 371
372struct socket * 372struct socket *
373soget(bool waitok) 373soget(bool waitok)
374{ 374{
375 struct socket *so; 375 struct socket *so;
376 376
377 so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT)); 377 so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
378 if (__predict_false(so == NULL)) 378 if (__predict_false(so == NULL))
379 return (NULL); 379 return (NULL);
380 memset(so, 0, sizeof(*so)); 380 memset(so, 0, sizeof(*so));
381 TAILQ_INIT(&so->so_q0); 381 TAILQ_INIT(&so->so_q0);
382 TAILQ_INIT(&so->so_q); 382 TAILQ_INIT(&so->so_q);
383 cv_init(&so->so_cv, "socket"); 383 cv_init(&so->so_cv, "socket");
384 cv_init(&so->so_rcv.sb_cv, "netio"); 384 cv_init(&so->so_rcv.sb_cv, "netio");
385 cv_init(&so->so_snd.sb_cv, "netio"); 385 cv_init(&so->so_snd.sb_cv, "netio");
386 selinit(&so->so_rcv.sb_sel); 386 selinit(&so->so_rcv.sb_sel);
387 selinit(&so->so_snd.sb_sel); 387 selinit(&so->so_snd.sb_sel);
388 so->so_rcv.sb_so = so; 388 so->so_rcv.sb_so = so;
389 so->so_snd.sb_so = so; 389 so->so_snd.sb_so = so;
390 return so; 390 return so;
391} 391}
392 392
393void 393void
394soput(struct socket *so) 394soput(struct socket *so)
395{ 395{
396 396
397 KASSERT(!cv_has_waiters(&so->so_cv)); 397 KASSERT(!cv_has_waiters(&so->so_cv));
398 KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv)); 398 KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
399 KASSERT(!cv_has_waiters(&so->so_snd.sb_cv)); 399 KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
400 seldestroy(&so->so_rcv.sb_sel); 400 seldestroy(&so->so_rcv.sb_sel);
401 seldestroy(&so->so_snd.sb_sel); 401 seldestroy(&so->so_snd.sb_sel);
402 mutex_obj_free(so->so_lock); 402 mutex_obj_free(so->so_lock);
403 cv_destroy(&so->so_cv); 403 cv_destroy(&so->so_cv);
404 cv_destroy(&so->so_rcv.sb_cv); 404 cv_destroy(&so->so_rcv.sb_cv);
405 cv_destroy(&so->so_snd.sb_cv); 405 cv_destroy(&so->so_snd.sb_cv);
406 pool_cache_put(socket_cache, so); 406 pool_cache_put(socket_cache, so);
407} 407}
408 408
409/* 409/*
410 * soqinsque: insert socket of a new connection into the specified 410 * soqinsque: insert socket of a new connection into the specified
411 * accept queue of the listening socket (head). 411 * accept queue of the listening socket (head).
412 * 412 *
413 * q = 0: queue of partial connections 413 * q = 0: queue of partial connections
414 * q = 1: queue of incoming connections 414 * q = 1: queue of incoming connections
415 */ 415 */
416void 416void
417soqinsque(struct socket *head, struct socket *so, int q) 417soqinsque(struct socket *head, struct socket *so, int q)
418{ 418{
419 KASSERT(q == 0 || q == 1); 419 KASSERT(q == 0 || q == 1);
420 KASSERT(solocked2(head, so)); 420 KASSERT(solocked2(head, so));
421 KASSERT(so->so_onq == NULL); 421 KASSERT(so->so_onq == NULL);
422 KASSERT(so->so_head == NULL); 422 KASSERT(so->so_head == NULL);
423 423
424 so->so_head = head; 424 so->so_head = head;
425 if (q == 0) { 425 if (q == 0) {
426 head->so_q0len++; 426 head->so_q0len++;
427 so->so_onq = &head->so_q0; 427 so->so_onq = &head->so_q0;
428 } else { 428 } else {
429 head->so_qlen++; 429 head->so_qlen++;
430 so->so_onq = &head->so_q; 430 so->so_onq = &head->so_q;
431 } 431 }
432 TAILQ_INSERT_TAIL(so->so_onq, so, so_qe); 432 TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
433} 433}
434 434
435/* 435/*
436 * soqremque: remove socket from the specified queue. 436 * soqremque: remove socket from the specified queue.
437 * 437 *
438 * => Returns true if socket was removed from the specified queue. 438 * => Returns true if socket was removed from the specified queue.
439 * => False if socket was not removed (because it was in other queue). 439 * => False if socket was not removed (because it was in other queue).
440 */ 440 */
441bool 441bool
442soqremque(struct socket *so, int q) 442soqremque(struct socket *so, int q)
443{ 443{
444 struct socket *head = so->so_head; 444 struct socket *head = so->so_head;
445 445
446 KASSERT(q == 0 || q == 1); 446 KASSERT(q == 0 || q == 1);
447 KASSERT(solocked(so)); 447 KASSERT(solocked(so));
448 KASSERT(so->so_onq != NULL); 448 KASSERT(so->so_onq != NULL);
449 KASSERT(head != NULL); 449 KASSERT(head != NULL);
450 450
451 if (q == 0) { 451 if (q == 0) {
452 if (so->so_onq != &head->so_q0) 452 if (so->so_onq != &head->so_q0)
453 return false; 453 return false;
454 head->so_q0len--; 454 head->so_q0len--;
455 } else { 455 } else {
456 if (so->so_onq != &head->so_q) 456 if (so->so_onq != &head->so_q)
457 return false; 457 return false;
458 head->so_qlen--; 458 head->so_qlen--;
459 } 459 }
460 KASSERT(solocked2(so, head)); 460 KASSERT(solocked2(so, head));
461 TAILQ_REMOVE(so->so_onq, so, so_qe); 461 TAILQ_REMOVE(so->so_onq, so, so_qe);
462 so->so_onq = NULL; 462 so->so_onq = NULL;
463 so->so_head = NULL; 463 so->so_head = NULL;
464 return true; 464 return true;
465} 465}
466 466
/*
 * socantsendmore: indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case pr_shutdown()).
 */
void
socantsendmore(struct socket *so)
{
	KASSERT(solocked(so));

	so->so_state |= SS_CANTSENDMORE;
	/* Wake any writers so they observe the new state. */
	sowwakeup(so);
}
481 481
/*
 * socantrcvmore(): indicates that no more data will be received and
 * will normally be applied to the socket by a protocol when it detects
 * that the peer will send no more data.  Data queued for reading in
 * the socket may yet be read.
 */
void
socantrcvmore(struct socket *so)
{
	KASSERT(solocked(so));

	so->so_state |= SS_CANTRCVMORE;
	/* Wake any readers so they can drain what remains and see EOF. */
	sorwakeup(so);
}
496 496
/*
 * soroverflow(): indicates that data was attempted to be sent
 * but the receiving buffer overflowed.
 */
void
soroverflow(struct socket *so)
{
	KASSERT(solocked(so));

	so->so_rcv.sb_overflowed++;
	/*
	 * Record the drop on the receive-specific error field so a
	 * general socket error in so_error is not clobbered; readers
	 * will be handed ENOBUFS.
	 */
	so->so_rerror = ENOBUFS;
	sorwakeup(so);
}
510 510
/*
 * Wait for data to arrive at/drain from a socket buffer.
 *
 * => Called with the socket lock held; sleeps on the buffer's
 *    condvar until notified, the timeout sb_timeo expires, or
 *    (unless SB_NOINTR) a signal arrives.
 * => Returns 0 on wakeup, or the error from cv_timedwait*().
 */
int
sbwait(struct sockbuf *sb)
{
	struct socket *so;
	kmutex_t *lock;
	int error;

	so = sb->sb_so;

	KASSERT(solocked(so));

	/* Ask producers/consumers to notify us on buffer state change. */
	sb->sb_flags |= SB_NOTIFY;
	lock = so->so_lock;
	if ((sb->sb_flags & SB_NOINTR) != 0)
		error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo);
	else
		error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo);
	/*
	 * The socket's lock pointer may have been changed while we
	 * slept (see solockreset()); chase the new lock before return.
	 */
	if (__predict_false(lock != so->so_lock))
		solockretry(so, lock);
	return error;
}
535 535
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 *
 * => code is POLL_IN for the receive buffer, otherwise treated as
 *    POLL_OUT; it selects the poll(2) event band reported.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb, int code)
{
	int band;

	KASSERT(solocked(so));
	KASSERT(sb->sb_so == so);

	if (code == POLL_IN)
		band = POLLIN|POLLRDNORM;
	else
		band = POLLOUT|POLLWRNORM;
	/* Notification is being delivered now; clear the request flag. */
	sb->sb_flags &= ~SB_NOTIFY;
	selnotify(&sb->sb_sel, band, NOTE_SUBMIT);
	cv_broadcast(&sb->sb_cv);
	if (sb->sb_flags & SB_ASYNC)
		fownsignal(so->so_pgid, SIGIO, code, band, so);
	if (sb->sb_flags & SB_UPCALL)
		(*so->so_upcall)(so, so->so_upcallarg, band, M_DONTWAIT);
}
561 561
/*
 * Reset a socket's lock pointer.  Wake all threads waiting on the
 * socket's condition variables so that they can restart their waits
 * using the new lock.  The existing lock must be held.
 */
void
solockreset(struct socket *so, kmutex_t *lock)
{

	KASSERT(solocked(so));

	so->so_lock = lock;
	/* Broadcast on every condvar so sleepers re-acquire via sbwait(). */
	cv_broadcast(&so->so_snd.sb_cv);
	cv_broadcast(&so->so_rcv.sb_cv);
	cv_broadcast(&so->so_cv);
}
578 578
579/* 579/*
580 * Socket buffer (struct sockbuf) utility routines. 580 * Socket buffer (struct sockbuf) utility routines.
581 * 581 *
582 * Each socket contains two socket buffers: one for sending data and 582 * Each socket contains two socket buffers: one for sending data and
583 * one for receiving data. Each buffer contains a queue of mbufs, 583 * one for receiving data. Each buffer contains a queue of mbufs,
584 * information about the number of mbufs and amount of data in the 584 * information about the number of mbufs and amount of data in the
585 * queue, and other fields allowing poll() statements and notification 585 * queue, and other fields allowing poll() statements and notification
586 * on data availability to be implemented. 586 * on data availability to be implemented.
587 * 587 *
588 * Data stored in a socket buffer is maintained as a list of records. 588 * Data stored in a socket buffer is maintained as a list of records.
589 * Each record is a list of mbufs chained together with the m_next 589 * Each record is a list of mbufs chained together with the m_next
590 * field. Records are chained together with the m_nextpkt field. The upper 590 * field. Records are chained together with the m_nextpkt field. The upper
591 * level routine soreceive() expects the following conventions to be 591 * level routine soreceive() expects the following conventions to be
592 * observed when placing information in the receive buffer: 592 * observed when placing information in the receive buffer:
593 * 593 *
594 * 1. If the protocol requires each message be preceded by the sender's 594 * 1. If the protocol requires each message be preceded by the sender's
595 * name, then a record containing that name must be present before 595 * name, then a record containing that name must be present before
596 * any associated data (mbuf's must be of type MT_SONAME). 596 * any associated data (mbuf's must be of type MT_SONAME).
597 * 2. If the protocol supports the exchange of ``access rights'' (really 597 * 2. If the protocol supports the exchange of ``access rights'' (really
598 * just additional data associated with the message), and there are 598 * just additional data associated with the message), and there are
599 * ``rights'' to be received, then a record containing this data 599 * ``rights'' to be received, then a record containing this data
600 * should be present (mbuf's must be of type MT_CONTROL). 600 * should be present (mbuf's must be of type MT_CONTROL).
601 * 3. If a name or rights record exists, then it must be followed by 601 * 3. If a name or rights record exists, then it must be followed by
602 * a data record, perhaps of zero length. 602 * a data record, perhaps of zero length.
603 * 603 *
604 * Before using a new socket structure it is first necessary to reserve 604 * Before using a new socket structure it is first necessary to reserve
605 * buffer space to the socket, by calling sbreserve(). This should commit 605 * buffer space to the socket, by calling sbreserve(). This should commit
606 * some of the available buffer space in the system buffer pool for the 606 * some of the available buffer space in the system buffer pool for the
607 * socket (currently, it does nothing but enforce limits). The space 607 * socket (currently, it does nothing but enforce limits). The space
608 * should be released by calling sbrelease() when the socket is destroyed. 608 * should be released by calling sbrelease() when the socket is destroyed.
609 */ 609 */
610 610
611int 611int
612sb_max_set(u_long new_sbmax) 612sb_max_set(u_long new_sbmax)
613{ 613{
614 int s; 614 int s;
615 615
616 if (new_sbmax < (16 * 1024)) 616 if (new_sbmax < (16 * 1024))
617 return (EINVAL); 617 return (EINVAL);
618 618
619 s = splsoftnet(); 619 s = splsoftnet();
620 sb_max = new_sbmax; 620 sb_max = new_sbmax;
621 sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES); 621 sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
622 splx(s); 622 splx(s);
623 623
624 return (0); 624 return (0);
625} 625}
626 626
627int 627int
628soreserve(struct socket *so, u_long sndcc, u_long rcvcc) 628soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
629{ 629{
630 KASSERT(so->so_pcb == NULL || solocked(so)); 630 KASSERT(so->so_pcb == NULL || solocked(so));
631 631
632 /* 632 /*
633 * there's at least one application (a configure script of screen) 633 * there's at least one application (a configure script of screen)
634 * which expects a fifo is writable even if it has "some" bytes 634 * which expects a fifo is writable even if it has "some" bytes
635 * in its buffer. 635 * in its buffer.
636 * so we want to make sure (hiwat - lowat) >= (some bytes). 636 * so we want to make sure (hiwat - lowat) >= (some bytes).
637 * 637 *
638 * PIPE_BUF here is an arbitrary value chosen as (some bytes) above. 638 * PIPE_BUF here is an arbitrary value chosen as (some bytes) above.
639 * we expect it's large enough for such applications. 639 * we expect it's large enough for such applications.
640 */ 640 */
641 u_long lowat = MAX(sock_loan_thresh, MCLBYTES); 641 u_long lowat = MAX(sock_loan_thresh, MCLBYTES);
642 u_long hiwat = lowat + PIPE_BUF; 642 u_long hiwat = lowat + PIPE_BUF;
643 643
644 if (sndcc < hiwat) 644 if (sndcc < hiwat)
645 sndcc = hiwat; 645 sndcc = hiwat;
646 if (sbreserve(&so->so_snd, sndcc, so) == 0) 646 if (sbreserve(&so->so_snd, sndcc, so) == 0)
647 goto bad; 647 goto bad;
648 if (sbreserve(&so->so_rcv, rcvcc, so) == 0) 648 if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
649 goto bad2; 649 goto bad2;
650 if (so->so_rcv.sb_lowat == 0) 650 if (so->so_rcv.sb_lowat == 0)
651 so->so_rcv.sb_lowat = 1; 651 so->so_rcv.sb_lowat = 1;
652 if (so->so_snd.sb_lowat == 0) 652 if (so->so_snd.sb_lowat == 0)
653 so->so_snd.sb_lowat = lowat; 653 so->so_snd.sb_lowat = lowat;
654 if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) 654 if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
655 so->so_snd.sb_lowat = so->so_snd.sb_hiwat; 655 so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
656 return (0); 656 return (0);
657 bad2: 657 bad2:
658 sbrelease(&so->so_snd, so); 658 sbrelease(&so->so_snd, so);
659 bad: 659 bad:
660 return (ENOBUFS); 660 return (ENOBUFS);
661} 661}
662 662
/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 *
 * => Returns 1 on success; 0 if cc is zero, exceeds sb_max_adj, or
 *    would exceed the owner's RLIMIT_SBSIZE resource limit.
 */
int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
{
	struct lwp *l = curlwp; /* XXX */
	rlim_t maxcc;
	struct uidinfo *uidinfo;

	KASSERT(so->so_pcb == NULL || solocked(so));
	KASSERT(sb->sb_so == so);
	KASSERT(sb_max_adj != 0);

	if (cc == 0 || cc > sb_max_adj)
		return (0);

	maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;

	uidinfo = so->so_uidinfo;
	/* Charge the new high-water mark against the owner's limit. */
	if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
		return 0;
	sb->sb_mbmax = min(cc * 2, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}
692 692
/*
 * Free mbufs held by a socket, and reserved mbuf space.  We do not
 * assert that the socket is held locked here: see sorflush().
 */
void
sbrelease(struct sockbuf *sb, struct socket *so)
{

	KASSERT(sb->sb_so == so);

	sbflush(sb);
	/* Return the reserved space to the owner's RLIMIT_SBSIZE pool. */
	(void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY);
	sb->sb_mbmax = 0;
}
707 707
708/* 708/*
709 * Routines to add and remove 709 * Routines to add and remove
710 * data from an mbuf queue. 710 * data from an mbuf queue.
711 * 711 *
712 * The routines sbappend() or sbappendrecord() are normally called to 712 * The routines sbappend() or sbappendrecord() are normally called to
713 * append new mbufs to a socket buffer, after checking that adequate 713 * append new mbufs to a socket buffer, after checking that adequate
714 * space is available, comparing the function sbspace() with the amount 714 * space is available, comparing the function sbspace() with the amount
715 * of data to be added. sbappendrecord() differs from sbappend() in 715 * of data to be added. sbappendrecord() differs from sbappend() in
716 * that data supplied is treated as the beginning of a new record. 716 * that data supplied is treated as the beginning of a new record.
717 * To place a sender's address, optional access rights, and data in a 717 * To place a sender's address, optional access rights, and data in a
718 * socket receive buffer, sbappendaddr() should be used. To place 718 * socket receive buffer, sbappendaddr() should be used. To place
719 * access rights and data in a socket receive buffer, sbappendrights() 719 * access rights and data in a socket receive buffer, sbappendrights()
720 * should be used. In either case, the new data begins a new record. 720 * should be used. In either case, the new data begins a new record.
721 * Note that unlike sbappend() and sbappendrecord(), these routines check 721 * Note that unlike sbappend() and sbappendrecord(), these routines check
722 * for the caller that there will be enough space to store the data. 722 * for the caller that there will be enough space to store the data.
723 * Each fails if there is not enough space, or if it cannot find mbufs 723 * Each fails if there is not enough space, or if it cannot find mbufs
724 * to store additional information in. 724 * to store additional information in.
725 * 725 *
726 * Reliable protocols may use the socket send buffer to hold data 726 * Reliable protocols may use the socket send buffer to hold data
727 * awaiting acknowledgement. Data is normally copied from a socket 727 * awaiting acknowledgement. Data is normally copied from a socket
728 * send buffer in a protocol with m_copym for output to a peer, 728 * send buffer in a protocol with m_copym for output to a peer,
729 * and then removing the data from the socket buffer with sbdrop() 729 * and then removing the data from the socket buffer with sbdrop()
730 * or sbdroprecord() when the data is acknowledged by the peer. 730 * or sbdroprecord() when the data is acknowledged by the peer.
731 */ 731 */
732 732
#ifdef SOCKBUF_DEBUG
/*
 * sblastrecordchk: debug-only consistency check that sb_lastrecord
 * really points at the final record on the buffer's packet chain.
 * Dumps the chain and panics (naming the caller) on mismatch.
 */
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	KASSERT(solocked(sb->sb_so));

	/* Walk the record chain (m_nextpkt links) to the last record. */
	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}
753 753
/*
 * sblastmbufchk: debug-only consistency check that sb_mbtail points
 * at the final mbuf of the final record.  Dumps the full mbuf tree
 * and panics (naming the caller) on mismatch.
 */
void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	KASSERT(solocked(sb->sb_so));

	/* Walk to the last record, then to its last mbuf. */
	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */
782 782
/*
 * Link a chain of records onto a socket buffer.
 * m0 is the first record of the chain, mlast the last; the chain is
 * appended after the current sb_lastrecord (or becomes the whole
 * buffer if it was empty) and sb_lastrecord is updated to mlast.
 */
#define SBLINKRECORDCHAIN(sb, m0, mlast)				\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (mlast);					\
} while (/*CONSTCOND*/0)


/* Link a single record: the chain's head and tail are the same mbuf. */
#define SBLINKRECORD(sb, m0)						\
	SBLINKRECORDCHAIN(sb, m0, m0)
798 798
/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	KASSERT(solocked(sb->sb_so));

	if (m == NULL)
		return;

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				/* Record already ended: start a new one. */
				sbappendrecord(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	/* n is the tail mbuf of the last record (or NULL if empty). */
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}
843 843
/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{

	KASSERT(solocked(sb->sb_so));
	KDASSERT(m->m_nextpkt == NULL);
	/* Stream invariant: at most one record in the buffer. */
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	/* Append directly at the known tail; no EOR scan needed. */
	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}
868 868
#ifdef SOCKBUF_DEBUG
/*
 * sbcheck: debug-only audit of the buffer's byte count (sb_cc) and
 * mbuf storage accounting (sb_mbcnt) against the actual chain
 * contents; panics on any discrepancy.
 */
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf *m, *m2;
	u_long len, mbcnt;

	KASSERT(solocked(sb->sb_so));

	len = 0;
	mbcnt = 0;
	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (m2 = m; m2 != NULL; m2 = m2->m_next) {
			len += m2->m_len;
			mbcnt += MSIZE;
			if (m2->m_flags & M_EXT)
				mbcnt += m2->m_ext.ext_size;
			/* Only record heads may carry m_nextpkt links. */
			if (m2->m_nextpkt != NULL)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif
897 897
/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	KASSERT(solocked(sb->sb_so));

	if (m0 == NULL)
		return;

#ifdef MBUFTRACE
	m_claimm(m0, sb->sb_mowner);
#endif
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = 0;
	/*
	 * If m0 carried M_EOR but has a continuation, migrate the
	 * flag to the rest of the chain so the record still ends
	 * on its true last mbuf after compression.
	 */
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}
931 931
/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m, **mp;

	KASSERT(solocked(sb->sb_so));

	if (m0 == NULL)
		return;

	SBLASTRECORDCHK(sb, "sbinsertoob 1");

	/*
	 * Find the insertion point: skip past existing OOB records,
	 * and past control mbufs at the head of a record before
	 * deciding whether that record is OOB data.
	 */
	for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
	    again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;	/* WANT next train */

		case MT_CONTROL:
			if ((m = m->m_next) != NULL)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	if (*mp == NULL) {
		/* m0 is actually the new tail */
		sb->sb_lastrecord = m0;
	}
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	/* Migrate a premature M_EOR to the continuation chain. */
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbinsertoob 2");
}
982 982
983/* 983/*
984 * Append address and data, and optionally, control (ancillary) data 984 * Append address and data, and optionally, control (ancillary) data
985 * to the receive queue of a socket. If present, 985 * to the receive queue of a socket. If present,
986 * m0 must include a packet header with total length. 986 * m0 must include a packet header with total length.
987 * Returns 0 if no space in sockbuf or insufficient mbufs. 987 * Returns 0 if no space in sockbuf or insufficient mbufs.
988 */ 988 */
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
    struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space, len;

	KASSERT(solocked(sb->sb_so));

	/* Total bytes required: address + data (if any) + control (if any). */
	space = asa->sa_len;

	if (m0 != NULL) {
		if ((m0->m_flags & M_PKTHDR) == 0)
			panic("sbappendaddr");
		space += m0->m_pkthdr.len;
#ifdef MBUFTRACE
		m_claimm(m0, sb->sb_mowner);
#endif
	}
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		MCLAIM(n, sb->sb_mowner);
		if (n->m_next == NULL)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	/* Allocate an MT_SONAME mbuf to hold a copy of the address. */
	m = m_get(M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	MCLAIM(m, sb->sb_mowner);
	/*
	 * XXX avoid 'comparison always true' warning which isn't easily
	 * avoided.
	 */
	len = asa->sa_len;
	if (len > MLEN) {
		/* Address too large for an ordinary mbuf; attach ext storage. */
		MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (0);
		}
	}
	m->m_len = asa->sa_len;
	memcpy(mtod(m, void *), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;		/* no control: record is addr + data */
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	/* Account every mbuf of the new record in the sockbuf counters. */
	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");
	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}
1054 1054
1055/* 1055/*
1056 * Helper for sbappendchainaddr: prepend a struct sockaddr* to 1056 * Helper for sbappendchainaddr: prepend a struct sockaddr* to
1057 * an mbuf chain. 1057 * an mbuf chain.
1058 */ 1058 */
static inline struct mbuf *
m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
    const struct sockaddr *asa)
{
	struct mbuf *m;
	const int salen = asa->sa_len;

	KASSERT(solocked(sb->sb_so));

	/* only the first in each chain need be a pkthdr */
	m = m_gethdr(M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return NULL;
	MCLAIM(m, sb->sb_mowner);
#ifdef notyet
	if (salen > MHLEN) {
		MEXTMALLOC(m, salen, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
#else
	/* Until the above is enabled, the address must fit in one mbuf. */
	KASSERT(salen <= MHLEN);
#endif
	/* Copy the address in and link the data chain behind it. */
	m->m_len = salen;
	memcpy(mtod(m, void *), asa, salen);
	m->m_next = m0;
	/* New head carries the pkthdr; include the prepended address. */
	m->m_pkthdr.len = salen + m0->m_pkthdr.len;

	return m;
}
1091 1091
1092int 1092int
1093sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa, 1093sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
1094 struct mbuf *m0, int sbprio) 1094 struct mbuf *m0, int sbprio)
1095{ 1095{
1096 struct mbuf *m, *n, *n0, *nlast; 1096 struct mbuf *m, *n, *n0, *nlast;
1097 int error; 1097 int error;
1098 1098
1099 KASSERT(solocked(sb->sb_so)); 1099 KASSERT(solocked(sb->sb_so));
1100 1100
1101 /* 1101 /*
1102 * XXX sbprio reserved for encoding priority of this* request: 1102 * XXX sbprio reserved for encoding priority of this* request:
1103 * SB_PRIO_NONE --> honour normal sb limits 1103 * SB_PRIO_NONE --> honour normal sb limits
1104 * SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space, 1104 * SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
1105 * take whole chain. Intended for large requests 1105 * take whole chain. Intended for large requests
1106 * that should be delivered atomically (all, or none). 1106 * that should be delivered atomically (all, or none).
1107 * SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow 1107 * SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow
1108 * over normal socket limits, for messages indicating 1108 * over normal socket limits, for messages indicating
1109 * buffer overflow in earlier normal/lower-priority messages 1109 * buffer overflow in earlier normal/lower-priority messages
1110 * SB_PRIO_BESTEFFORT --> ignore limits entirely. 1110 * SB_PRIO_BESTEFFORT --> ignore limits entirely.
1111 * Intended for kernel-generated messages only. 1111 * Intended for kernel-generated messages only.
1112 * Up to generator to avoid total mbuf resource exhaustion. 1112 * Up to generator to avoid total mbuf resource exhaustion.
1113 */ 1113 */
1114 (void)sbprio; 1114 (void)sbprio;
1115 1115
1116 if (m0 && (m0->m_flags & M_PKTHDR) == 0) 1116 if (m0 && (m0->m_flags & M_PKTHDR) == 0)
1117 panic("sbappendaddrchain"); 1117 panic("sbappendaddrchain");
1118 1118
1119#ifdef notyet 1119#ifdef notyet
1120 space = sbspace(sb); 1120 space = sbspace(sb);
1121 1121
1122 /* 1122 /*
1123 * Enforce SB_PRIO_* limits as described above. 1123 * Enforce SB_PRIO_* limits as described above.
1124 */ 1124 */
1125#endif 1125#endif
1126 1126
1127 n0 = NULL; 1127 n0 = NULL;
1128 nlast = NULL; 1128 nlast = NULL;
1129 for (m = m0; m; m = m->m_nextpkt) { 1129 for (m = m0; m; m = m->m_nextpkt) {
1130 struct mbuf *np; 1130 struct mbuf *np;
1131 1131
1132#ifdef MBUFTRACE 1132#ifdef MBUFTRACE
1133 m_claimm(m, sb->sb_mowner); 1133 m_claimm(m, sb->sb_mowner);
1134#endif 1134#endif
1135 1135
1136 /* Prepend sockaddr to this record (m) of input chain m0 */ 1136 /* Prepend sockaddr to this record (m) of input chain m0 */
1137 n = m_prepend_sockaddr(sb, m, asa); 1137 n = m_prepend_sockaddr(sb, m, asa);
1138 if (n == NULL) { 1138 if (n == NULL) {
1139 error = ENOBUFS; 1139 error = ENOBUFS;
1140 goto bad; 1140 goto bad;
1141 } 1141 }
1142 1142
1143 /* Append record (asa+m) to end of new chain n0 */ 1143 /* Append record (asa+m) to end of new chain n0 */
1144 if (n0 == NULL) { 1144 if (n0 == NULL) {
1145 n0 = n; 1145 n0 = n;
1146 } else { 1146 } else {
1147 nlast->m_nextpkt = n; 1147 nlast->m_nextpkt = n;
1148 } 1148 }
1149 /* Keep track of last record on new chain */ 1149 /* Keep track of last record on new chain */
1150 nlast = n; 1150 nlast = n;
1151 1151
1152 for (np = n; np; np = np->m_next) 1152 for (np = n; np; np = np->m_next)
1153 sballoc(sb, np); 1153 sballoc(sb, np);
1154 } 1154 }
1155 1155
1156 SBLASTRECORDCHK(sb, "sbappendaddrchain 1"); 1156 SBLASTRECORDCHK(sb, "sbappendaddrchain 1");
1157 1157
1158 /* Drop the entire chain of (asa+m) records onto the socket */ 1158 /* Drop the entire chain of (asa+m) records onto the socket */
1159 SBLINKRECORDCHAIN(sb, n0, nlast); 1159 SBLINKRECORDCHAIN(sb, n0, nlast);
1160 1160
1161 SBLASTRECORDCHK(sb, "sbappendaddrchain 2"); 1161 SBLASTRECORDCHK(sb, "sbappendaddrchain 2");
1162 1162
1163 for (m = nlast; m->m_next; m = m->m_next) 1163 for (m = nlast; m->m_next; m = m->m_next)
1164 ; 1164 ;
1165 sb->sb_mbtail = m; 1165 sb->sb_mbtail = m;
1166 SBLASTMBUFCHK(sb, "sbappendaddrchain"); 1166 SBLASTMBUFCHK(sb, "sbappendaddrchain");
1167 1167
1168 return (1); 1168 return (1);
1169 1169
1170bad: 1170bad:
1171 /* 1171 /*
1172 * On error, free the prepended addreseses. For consistency 1172 * On error, free the prepended addreseses. For consistency
1173 * with sbappendaddr(), leave it to our caller to free 1173 * with sbappendaddr(), leave it to our caller to free
1174 * the input record chain passed to us as m0. 1174 * the input record chain passed to us as m0.
1175 */ 1175 */
1176 while ((n = n0) != NULL) { 1176 while ((n = n0) != NULL) {
1177 struct mbuf *np; 1177 struct mbuf *np;
1178 1178
1179 /* Undo the sballoc() of this record */ 1179 /* Undo the sballoc() of this record */
1180 for (np = n; np; np = np->m_next) 1180 for (np = n; np; np = np->m_next)
1181 sbfree(sb, np); 1181 sbfree(sb, np);
1182 1182
1183 n0 = n->m_nextpkt; /* iterate at next prepended address */ 1183 n0 = n->m_nextpkt; /* iterate at next prepended address */
1184 np = m_free(n); /* free prepended address (not data) */ 1184 np = m_free(n); /* free prepended address (not data) */
1185 } 1185 }
1186 return error; 1186 return error;
1187} 1187}
1188 1188
1189 1189
/*
 * Append control (ancillary) data, optionally followed by data m0,
 * to sb as one new record.  control must not be NULL.  Returns 0 if
 * the record does not fit in the sockbuf, 1 on success.
 */
int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *mlast, *n;
	int space;

	KASSERT(solocked(sb->sb_so));

	space = 0;
	if (control == NULL)
		panic("sbappendcontrol");
	/* Sum the control chain; loop exits with m at its last mbuf. */
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		MCLAIM(m, sb->sb_mowner);
		if (m->m_next == NULL)
			break;
	}
	n = m;		/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next) {
		MCLAIM(m, sb->sb_mowner);
		space += m->m_len;
	}
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;		/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	/* Account every mbuf of the combined record. */
	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");
	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}
1230 1230
1231/* 1231/*
1232 * Compress mbuf chain m into the socket 1232 * Compress mbuf chain m into the socket
1233 * buffer sb following mbuf n. If n 1233 * buffer sb following mbuf n. If n
1234 * is null, the buffer is presumed empty. 1234 * is null, the buffer is presumed empty.
1235 */ 1235 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int eor;		/* pending M_EOR mark carried forward */
	struct mbuf *o;

	KASSERT(solocked(sb->sb_so));

	eor = 0;
	while (m) {
		eor |= m->m_flags & M_EOR;
		/*
		 * Drop an empty mbuf, unless it carries a record-end mark
		 * that cannot be moved to a same-type neighbour (the next
		 * mbuf in the chain, or failing that the tail n).
		 */
		if (m->m_len == 0 &&
		    (eor == 0 ||
		    (((o = m->m_next) || (o = n)) &&
		    o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		/*
		 * Absorb a small same-type mbuf into the trailing space
		 * of the previous mbuf n instead of linking it.
		 */
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* M_TRAILINGSPACE() checks buffer writeability */
		    m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, char *) + n->m_len, mtod(m, void *),
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		/* Otherwise link m in as the new tail of the buffer. */
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	/* Re-apply a pending record-end mark to the final mbuf. */
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	SBLASTMBUFCHK(sb, __func__);
}
1287 1287
1288/* 1288/*
1289 * Free all mbufs in a sockbuf. 1289 * Free all mbufs in a sockbuf.
1290 * Check that all resources are reclaimed. 1290 * Check that all resources are reclaimed.
1291 */ 1291 */
void
sbflush(struct sockbuf *sb)
{

	KASSERT(solocked(sb->sb_so));
	KASSERT((sb->sb_flags & SB_LOCK) == 0);

	/* Drop all buffered data until no mbufs remain accounted. */
	while (sb->sb_mbcnt)
		sbdrop(sb, (int)sb->sb_cc);

	/* The buffer must now be completely empty and consistent. */
	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}
1307 1307
1308/* 1308/*
1309 * Drop data from (the front of) a sockbuf. 1309 * Drop data from (the front of) a sockbuf.
1310 */ 1310 */
void
sbdrop(struct sockbuf *sb, int len)
{
	struct mbuf *m, *next;

	KASSERT(solocked(sb->sb_so));

	/* next tracks the record following the one being consumed. */
	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
	while (len > 0) {
		if (m == NULL) {
			/* Current record exhausted; advance to the next. */
			if (next == NULL)
				panic("sbdrop(%p,%d): cc=%lu",
				    sb, len, sb->sb_cc);
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			/* Partial drop: trim the front of this mbuf. */
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		m = m_free(m);
	}
	/* Also discard any now-empty mbufs left at the front. */
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		m = m_free(m);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}
1359 1359
1360/* 1360/*
1361 * Drop a record off the front of a sockbuf 1361 * Drop a record off the front of a sockbuf
1362 * and move the next record to the front. 1362 * and move the next record to the front.
1363 */ 1363 */
1364void 1364void
1365sbdroprecord(struct sockbuf *sb) 1365sbdroprecord(struct sockbuf *sb)
1366{ 1366{
1367 struct mbuf *m, *mn; 1367 struct mbuf *m, *mn;
1368 1368
1369 KASSERT(solocked(sb->sb_so)); 1369 KASSERT(solocked(sb->sb_so));
1370 1370
1371 m = sb->sb_mb; 1371 m = sb->sb_mb;
1372 if (m) { 1372 if (m) {
1373 sb->sb_mb = m->m_nextpkt; 1373 sb->sb_mb = m->m_nextpkt;
1374 do { 1374 do {
1375 sbfree(sb, m); 1375 sbfree(sb, m);
1376 mn = m_free(m); 1376 mn = m_free(m);
1377 } while ((m = mn) != NULL); 1377 } while ((m = mn) != NULL);
1378 } 1378 }
1379 SB_EMPTY_FIXUP(sb); 1379 SB_EMPTY_FIXUP(sb);
1380} 1380}
1381 1381
1382/* 1382/*
1383 * Create a "control" mbuf containing the specified data 1383 * Create a "control" mbuf containing the specified data
1384 * with the specified type for presentation on a socket buffer. 1384 * with the specified type for presentation on a socket buffer.
1385 */ 1385 */
struct mbuf *
sbcreatecontrol1(void **p, int size, int type, int level, int flags)
{
	struct cmsghdr *cp;
	struct mbuf *m;
	int space = CMSG_SPACE(size);

	/*
	 * A non-blocking caller can be given at most one cluster, so
	 * refuse oversized requests up front.
	 */
	if ((flags & M_DONTWAIT) && space > MCLBYTES) {
		printf("%s: message too large %d\n", __func__, space);
		return NULL;
	}

	if ((m = m_get(flags, MT_CONTROL)) == NULL)
		return NULL;
	if (space > MLEN) {
		if (space > MCLBYTES)
			/* Only blocking callers reach here (checked above). */
			MEXTMALLOC(m, space, M_WAITOK);
		else
			MCLGET(m, flags);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	/* Fill in the cmsg header; *p is the payload area for the caller. */
	cp = mtod(m, struct cmsghdr *);
	*p = CMSG_DATA(cp);
	m->m_len = space;
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return m;
}
1418 1418
1419struct mbuf * 1419struct mbuf *
1420sbcreatecontrol(void *p, int size, int type, int level) 1420sbcreatecontrol(void *p, int size, int type, int level)
1421{ 1421{
1422 struct mbuf *m; 1422 struct mbuf *m;
1423 void *v; 1423 void *v;
1424 1424
1425 m = sbcreatecontrol1(&v, size, type, level, M_DONTWAIT); 1425 m = sbcreatecontrol1(&v, size, type, level, M_DONTWAIT);
1426 if (m == NULL) 1426 if (m == NULL)
1427 return NULL; 1427 return NULL;
1428 memcpy(v, p, size); 1428 memcpy(v, p, size);
1429 return m; 1429 return m;
1430} 1430}
1431 1431
1432void 1432void
1433solockretry(struct socket *so, kmutex_t *lock) 1433solockretry(struct socket *so, kmutex_t *lock)
1434{ 1434{
1435 1435
1436 while (lock != so->so_lock) { 1436 while (lock != so->so_lock) {
1437 mutex_exit(lock); 1437 mutex_exit(lock);
1438 lock = so->so_lock; 1438 lock = so->so_lock;
1439 mutex_enter(lock); 1439 mutex_enter(lock);
1440 } 1440 }
1441} 1441}
1442 1442
1443bool 1443bool
1444solocked(const struct socket *so) 1444solocked(const struct socket *so)
1445{ 1445{
1446 1446
1447 return mutex_owned(so->so_lock); 1447 return mutex_owned(so->so_lock);
1448} 1448}
1449 1449
1450bool 1450bool
1451solocked2(const struct socket *so1, const struct socket *so2) 1451solocked2(const struct socket *so1, const struct socket *so2)
1452{ 1452{
1453 const kmutex_t *lock; 1453 const kmutex_t *lock;
1454 1454
1455 lock = so1->so_lock; 1455 lock = so1->so_lock;
1456 if (lock != so2->so_lock) 1456 if (lock != so2->so_lock)
1457 return false; 1457 return false;
1458 return mutex_owned(lock); 1458 return mutex_owned(lock);
1459} 1459}
1460 1460
1461/* 1461/*
1462 * sosetlock: assign a default lock to a new socket. 1462 * sosetlock: assign a default lock to a new socket.
1463 */ 1463 */
1464void 1464void
1465sosetlock(struct socket *so) 1465sosetlock(struct socket *so)
1466{ 1466{
1467 if (so->so_lock == NULL) { 1467 if (so->so_lock == NULL) {
1468 kmutex_t *lock = softnet_lock; 1468 kmutex_t *lock = softnet_lock;
1469 1469
1470 so->so_lock = lock; 1470 so->so_lock = lock;
1471 mutex_obj_hold(lock); 1471 mutex_obj_hold(lock);
1472 mutex_enter(lock); 1472 mutex_enter(lock);
1473 } 1473 }
1474 KASSERT(solocked(so)); 1474 KASSERT(solocked(so));
1475} 1475}
1476 1476
1477/* 1477/*
1478 * Set lock on sockbuf sb; sleep if lock is already held. 1478 * Set lock on sockbuf sb; sleep if lock is already held.
1479 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible. 1479 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
1480 * Returns error without lock if sleep is interrupted. 1480 * Returns error without lock if sleep is interrupted.
1481 */ 1481 */
1482int 1482int
1483sblock(struct sockbuf *sb, int wf) 1483sblock(struct sockbuf *sb, int wf)
1484{ 1484{
1485 struct socket *so; 1485 struct socket *so;
1486 kmutex_t *lock; 1486 kmutex_t *lock;
1487 int error; 1487 int error;
1488 1488
1489 KASSERT(solocked(sb->sb_so)); 1489 KASSERT(solocked(sb->sb_so));
1490 1490
1491 for (;;) { 1491 for (;;) {
1492 if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) { 1492 if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) {
1493 sb->sb_flags |= SB_LOCK; 1493 sb->sb_flags |= SB_LOCK;
1494 return 0; 1494 return 0;
1495 } 1495 }
1496 if (wf != M_WAITOK) 1496 if (wf != M_WAITOK)
1497 return EWOULDBLOCK; 1497 return EWOULDBLOCK;
1498 so = sb->sb_so; 1498 so = sb->sb_so;
1499 lock = so->so_lock; 1499 lock = so->so_lock;
1500 if ((sb->sb_flags & SB_NOINTR) != 0) { 1500 if ((sb->sb_flags & SB_NOINTR) != 0) {
1501 cv_wait(&so->so_cv, lock); 1501 cv_wait(&so->so_cv, lock);
1502 error = 0; 1502 error = 0;
1503 } else 1503 } else
1504 error = cv_wait_sig(&so->so_cv, lock); 1504 error = cv_wait_sig(&so->so_cv, lock);
1505 if (__predict_false(lock != so->so_lock)) 1505 if (__predict_false(lock != so->so_lock))
1506 solockretry(so, lock); 1506 solockretry(so, lock);

cvs diff -r1.155 -r1.156 src/sys/sys/socketvar.h (switch to unified diff)

--- src/sys/sys/socketvar.h 2018/05/04 08:35:07 1.155
+++ src/sys/sys/socketvar.h 2018/06/06 09:46:46 1.156
@@ -1,580 +1,581 @@ @@ -1,580 +1,581 @@
1/* $NetBSD: socketvar.h,v 1.155 2018/05/04 08:35:07 christos Exp $ */ 1/* $NetBSD: socketvar.h,v 1.156 2018/06/06 09:46:46 roy Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. 4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran. 8 * by Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright 15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the 16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution. 17 * documentation and/or other materials provided with the distribution.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32/*- 32/*-
33 * Copyright (c) 1982, 1986, 1990, 1993 33 * Copyright (c) 1982, 1986, 1990, 1993
34 * The Regents of the University of California. All rights reserved. 34 * The Regents of the University of California. All rights reserved.
35 * 35 *
36 * Redistribution and use in source and binary forms, with or without 36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions 37 * modification, are permitted provided that the following conditions
38 * are met: 38 * are met:
39 * 1. Redistributions of source code must retain the above copyright 39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer. 40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright 41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the 42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution. 43 * documentation and/or other materials provided with the distribution.
44 * 3. Neither the name of the University nor the names of its contributors 44 * 3. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software 45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission. 46 * without specific prior written permission.
47 * 47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE. 58 * SUCH DAMAGE.
59 * 59 *
60 * @(#)socketvar.h 8.3 (Berkeley) 2/19/95 60 * @(#)socketvar.h 8.3 (Berkeley) 2/19/95
61 */ 61 */
62 62
63#ifndef _SYS_SOCKETVAR_H_ 63#ifndef _SYS_SOCKETVAR_H_
64#define _SYS_SOCKETVAR_H_ 64#define _SYS_SOCKETVAR_H_
65 65
66#include <sys/select.h> 66#include <sys/select.h>
67#include <sys/selinfo.h> /* for struct selinfo */ 67#include <sys/selinfo.h> /* for struct selinfo */
68#include <sys/queue.h> 68#include <sys/queue.h>
69#include <sys/mutex.h> 69#include <sys/mutex.h>
70#include <sys/condvar.h> 70#include <sys/condvar.h>
71 71
72#if !defined(_KERNEL) 72#if !defined(_KERNEL)
73struct uio; 73struct uio;
74struct lwp; 74struct lwp;
75struct uidinfo; 75struct uidinfo;
76#else 76#else
77#include <sys/uidinfo.h> 77#include <sys/uidinfo.h>
78#endif 78#endif
79 79
80TAILQ_HEAD(soqhead, socket); 80TAILQ_HEAD(soqhead, socket);
81 81
82/* 82/*
83 * Variables for socket buffering. 83 * Variables for socket buffering.
84 */ 84 */
85struct sockbuf { 85struct sockbuf {
86 struct selinfo sb_sel; /* process selecting read/write */ 86 struct selinfo sb_sel; /* process selecting read/write */
87 struct mowner *sb_mowner; /* who owns data for this sockbuf */ 87 struct mowner *sb_mowner; /* who owns data for this sockbuf */
88 struct socket *sb_so; /* back pointer to socket */ 88 struct socket *sb_so; /* back pointer to socket */
89 kcondvar_t sb_cv; /* notifier */ 89 kcondvar_t sb_cv; /* notifier */
90 /* When re-zeroing this struct, we zero from sb_startzero to the end */ 90 /* When re-zeroing this struct, we zero from sb_startzero to the end */
91#define sb_startzero sb_cc 91#define sb_startzero sb_cc
92 u_long sb_cc; /* actual chars in buffer */ 92 u_long sb_cc; /* actual chars in buffer */
93 u_long sb_hiwat; /* max actual char count */ 93 u_long sb_hiwat; /* max actual char count */
94 u_long sb_mbcnt; /* chars of mbufs used */ 94 u_long sb_mbcnt; /* chars of mbufs used */
95 u_long sb_mbmax; /* max chars of mbufs to use */ 95 u_long sb_mbmax; /* max chars of mbufs to use */
96 u_long sb_lowat; /* low water mark */ 96 u_long sb_lowat; /* low water mark */
97 struct mbuf *sb_mb; /* the mbuf chain */ 97 struct mbuf *sb_mb; /* the mbuf chain */
98 struct mbuf *sb_mbtail; /* the last mbuf in the chain */ 98 struct mbuf *sb_mbtail; /* the last mbuf in the chain */
99 struct mbuf *sb_lastrecord; /* first mbuf of last record in 99 struct mbuf *sb_lastrecord; /* first mbuf of last record in
100 socket buffer */ 100 socket buffer */
101 int sb_flags; /* flags, see below */ 101 int sb_flags; /* flags, see below */
102 int sb_timeo; /* timeout for read/write */ 102 int sb_timeo; /* timeout for read/write */
103 u_long sb_overflowed; /* # of drops due to full buffer */ 103 u_long sb_overflowed; /* # of drops due to full buffer */
104}; 104};
105 105
106#ifndef SB_MAX 106#ifndef SB_MAX
107#define SB_MAX (256*1024) /* default for max chars in sockbuf */ 107#define SB_MAX (256*1024) /* default for max chars in sockbuf */
108#endif 108#endif
109 109
110#define SB_LOCK 0x01 /* lock on data queue */ 110#define SB_LOCK 0x01 /* lock on data queue */
111#define SB_NOTIFY 0x04 /* someone is waiting for data/space */ 111#define SB_NOTIFY 0x04 /* someone is waiting for data/space */
112#define SB_ASYNC 0x10 /* ASYNC I/O, need signals */ 112#define SB_ASYNC 0x10 /* ASYNC I/O, need signals */
113#define SB_UPCALL 0x20 /* someone wants an upcall */ 113#define SB_UPCALL 0x20 /* someone wants an upcall */
114#define SB_NOINTR 0x40 /* operations not interruptible */ 114#define SB_NOINTR 0x40 /* operations not interruptible */
115#define SB_KNOTE 0x100 /* kernel note attached */ 115#define SB_KNOTE 0x100 /* kernel note attached */
116#define SB_AUTOSIZE 0x800 /* automatically size socket buffer */ 116#define SB_AUTOSIZE 0x800 /* automatically size socket buffer */
117 117
118/* 118/*
119 * Kernel structure per socket. 119 * Kernel structure per socket.
120 * Contains send and receive buffer queues, 120 * Contains send and receive buffer queues,
121 * handle on protocol and pointer to protocol 121 * handle on protocol and pointer to protocol
122 * private data and error information. 122 * private data and error information.
123 */ 123 */
124struct so_accf { 124struct so_accf {
125 struct accept_filter *so_accept_filter; 125 struct accept_filter *so_accept_filter;
126 void *so_accept_filter_arg; /* saved filter args */ 126 void *so_accept_filter_arg; /* saved filter args */
127 char *so_accept_filter_str; /* saved user args */ 127 char *so_accept_filter_str; /* saved user args */
128}; 128};
129 129
130struct sockaddr; 130struct sockaddr;
131 131
132struct socket { 132struct socket {
133 kmutex_t * volatile so_lock; /* pointer to lock on structure */ 133 kmutex_t * volatile so_lock; /* pointer to lock on structure */
134 kcondvar_t so_cv; /* notifier */ 134 kcondvar_t so_cv; /* notifier */
135 short so_type; /* generic type, see socket.h */ 135 short so_type; /* generic type, see socket.h */
136 short so_options; /* from socket call, see socket.h */ 136 short so_options; /* from socket call, see socket.h */
137 u_short so_linger; /* time to linger while closing */ 137 u_short so_linger; /* time to linger while closing */
138 short so_state; /* internal state flags SS_*, below */ 138 short so_state; /* internal state flags SS_*, below */
139 int so_unused; /* used to be so_nbio */ 139 int so_unused; /* used to be so_nbio */
140 void *so_pcb; /* protocol control block */ 140 void *so_pcb; /* protocol control block */
141 const struct protosw *so_proto; /* protocol handle */ 141 const struct protosw *so_proto; /* protocol handle */
142/* 142/*
143 * Variables for connection queueing. 143 * Variables for connection queueing.
144 * Socket where accepts occur is so_head in all subsidiary sockets. 144 * Socket where accepts occur is so_head in all subsidiary sockets.
145 * If so_head is 0, socket is not related to an accept. 145 * If so_head is 0, socket is not related to an accept.
146 * For head socket so_q0 queues partially completed connections, 146 * For head socket so_q0 queues partially completed connections,
147 * while so_q is a queue of connections ready to be accepted. 147 * while so_q is a queue of connections ready to be accepted.
148 * If a connection is aborted and it has so_head set, then 148 * If a connection is aborted and it has so_head set, then
149 * it has to be pulled out of either so_q0 or so_q. 149 * it has to be pulled out of either so_q0 or so_q.
150 * We allow connections to queue up based on current queue lengths 150 * We allow connections to queue up based on current queue lengths
151 * and limit on number of queued connections for this socket. 151 * and limit on number of queued connections for this socket.
152 */ 152 */
153 struct socket *so_head; /* back pointer to accept socket */ 153 struct socket *so_head; /* back pointer to accept socket */
154 struct soqhead *so_onq; /* queue (q or q0) that we're on */ 154 struct soqhead *so_onq; /* queue (q or q0) that we're on */
155 struct soqhead so_q0; /* queue of partial connections */ 155 struct soqhead so_q0; /* queue of partial connections */
156 struct soqhead so_q; /* queue of incoming connections */ 156 struct soqhead so_q; /* queue of incoming connections */
157 TAILQ_ENTRY(socket) so_qe; /* our queue entry (q or q0) */ 157 TAILQ_ENTRY(socket) so_qe; /* our queue entry (q or q0) */
158 short so_q0len; /* partials on so_q0 */ 158 short so_q0len; /* partials on so_q0 */
159 short so_qlen; /* number of connections on so_q */ 159 short so_qlen; /* number of connections on so_q */
160 short so_qlimit; /* max number queued connections */ 160 short so_qlimit; /* max number queued connections */
161 short so_timeo; /* connection timeout */ 161 short so_timeo; /* connection timeout */
162 u_short so_error; /* error affecting connection */ 162 u_short so_error; /* error affecting connection */
 163 u_short so_rerror; /* error affecting receiving */
163 u_short so_aborting; /* references from soabort() */ 164 u_short so_aborting; /* references from soabort() */
164 pid_t so_pgid; /* pgid for signals */ 165 pid_t so_pgid; /* pgid for signals */
165 u_long so_oobmark; /* chars to oob mark */ 166 u_long so_oobmark; /* chars to oob mark */
166 struct sockbuf so_snd; /* send buffer */ 167 struct sockbuf so_snd; /* send buffer */
167 struct sockbuf so_rcv; /* receive buffer */ 168 struct sockbuf so_rcv; /* receive buffer */
168 169
169 void *so_internal; /* Space for svr4 stream data */ 170 void *so_internal; /* Space for svr4 stream data */
170 void (*so_upcall) (struct socket *, void *, int, int); 171 void (*so_upcall) (struct socket *, void *, int, int);
171 void * so_upcallarg; /* Arg for above */ 172 void * so_upcallarg; /* Arg for above */
172 int (*so_send) (struct socket *, struct sockaddr *, 173 int (*so_send) (struct socket *, struct sockaddr *,
173 struct uio *, struct mbuf *, 174 struct uio *, struct mbuf *,
174 struct mbuf *, int, struct lwp *); 175 struct mbuf *, int, struct lwp *);
175 int (*so_receive) (struct socket *, 176 int (*so_receive) (struct socket *,
176 struct mbuf **, 177 struct mbuf **,
177 struct uio *, struct mbuf **, 178 struct uio *, struct mbuf **,
178 struct mbuf **, int *); 179 struct mbuf **, int *);
179 struct mowner *so_mowner; /* who owns mbufs for this socket */ 180 struct mowner *so_mowner; /* who owns mbufs for this socket */
180 struct uidinfo *so_uidinfo; /* who opened the socket */ 181 struct uidinfo *so_uidinfo; /* who opened the socket */
181 gid_t so_egid; /* creator effective gid */ 182 gid_t so_egid; /* creator effective gid */
182 pid_t so_cpid; /* creator pid */ 183 pid_t so_cpid; /* creator pid */
183 struct so_accf *so_accf; 184 struct so_accf *so_accf;
184 kauth_cred_t so_cred; /* socket credentials */ 185 kauth_cred_t so_cred; /* socket credentials */
185}; 186};
186 187
187/* 188/*
188 * Socket state bits. 189 * Socket state bits.
189 */ 190 */
190#define SS_NOFDREF 0x001 /* no file table ref any more */ 191#define SS_NOFDREF 0x001 /* no file table ref any more */
191#define SS_ISCONNECTED 0x002 /* socket connected to a peer */ 192#define SS_ISCONNECTED 0x002 /* socket connected to a peer */
192#define SS_ISCONNECTING 0x004 /* in process of connecting to peer */ 193#define SS_ISCONNECTING 0x004 /* in process of connecting to peer */
193#define SS_ISDISCONNECTING 0x008 /* in process of disconnecting */ 194#define SS_ISDISCONNECTING 0x008 /* in process of disconnecting */
194#define SS_CANTSENDMORE 0x010 /* can't send more data to peer */ 195#define SS_CANTSENDMORE 0x010 /* can't send more data to peer */
195#define SS_CANTRCVMORE 0x020 /* can't receive more data from peer */ 196#define SS_CANTRCVMORE 0x020 /* can't receive more data from peer */
196#define SS_RCVATMARK 0x040 /* at mark on input */ 197#define SS_RCVATMARK 0x040 /* at mark on input */
197#define SS_ISABORTING 0x080 /* aborting fd references - close() */ 198#define SS_ISABORTING 0x080 /* aborting fd references - close() */
198#define SS_RESTARTSYS 0x100 /* restart blocked system calls */ 199#define SS_RESTARTSYS 0x100 /* restart blocked system calls */
199#define SS_ISDISCONNECTED 0x800 /* socket disconnected from peer */ 200#define SS_ISDISCONNECTED 0x800 /* socket disconnected from peer */
200 201
201#define SS_ASYNC 0x100 /* async i/o notify */ 202#define SS_ASYNC 0x100 /* async i/o notify */
202#define SS_MORETOCOME 0x400 /* 203#define SS_MORETOCOME 0x400 /*
203 * hint from sosend to lower layer; 204 * hint from sosend to lower layer;
204 * more data coming 205 * more data coming
205 */ 206 */
206#define SS_ISAPIPE 0x1000 /* socket is implementing a pipe */ 207#define SS_ISAPIPE 0x1000 /* socket is implementing a pipe */
207#define SS_NBIO 0x2000 /* socket is in non blocking I/O */ 208#define SS_NBIO 0x2000 /* socket is in non blocking I/O */
208 209
209#ifdef _KERNEL 210#ifdef _KERNEL
210 211
211struct accept_filter { 212struct accept_filter {
212 char accf_name[16]; 213 char accf_name[16];
213 void (*accf_callback) 214 void (*accf_callback)
214 (struct socket *, void *, int, int); 215 (struct socket *, void *, int, int);
215 void * (*accf_create) 216 void * (*accf_create)
216 (struct socket *, char *); 217 (struct socket *, char *);
217 void (*accf_destroy) 218 void (*accf_destroy)
218 (struct socket *); 219 (struct socket *);
219 LIST_ENTRY(accept_filter) accf_next; 220 LIST_ENTRY(accept_filter) accf_next;
220 u_int accf_refcnt; 221 u_int accf_refcnt;
221}; 222};
222 223
223struct sockopt { 224struct sockopt {
224 int sopt_level; /* option level */ 225 int sopt_level; /* option level */
225 int sopt_name; /* option name */ 226 int sopt_name; /* option name */
226 size_t sopt_size; /* data length */ 227 size_t sopt_size; /* data length */
227 size_t sopt_retsize; /* returned data length */ 228 size_t sopt_retsize; /* returned data length */
228 void * sopt_data; /* data pointer */ 229 void * sopt_data; /* data pointer */
229 uint8_t sopt_buf[sizeof(int)]; /* internal storage */ 230 uint8_t sopt_buf[sizeof(int)]; /* internal storage */
230}; 231};
231 232
232#define SB_EMPTY_FIXUP(sb) \ 233#define SB_EMPTY_FIXUP(sb) \
233do { \ 234do { \
234 KASSERT(solocked((sb)->sb_so)); \ 235 KASSERT(solocked((sb)->sb_so)); \
235 if ((sb)->sb_mb == NULL) { \ 236 if ((sb)->sb_mb == NULL) { \
236 (sb)->sb_mbtail = NULL; \ 237 (sb)->sb_mbtail = NULL; \
237 (sb)->sb_lastrecord = NULL; \ 238 (sb)->sb_lastrecord = NULL; \
238 } \ 239 } \
239} while (/*CONSTCOND*/0) 240} while (/*CONSTCOND*/0)
240 241
241extern u_long sb_max; 242extern u_long sb_max;
242extern int somaxkva; 243extern int somaxkva;
243extern int sock_loan_thresh; 244extern int sock_loan_thresh;
244extern kmutex_t *softnet_lock; 245extern kmutex_t *softnet_lock;
245 246
246struct mbuf; 247struct mbuf;
247struct lwp; 248struct lwp;
248struct msghdr; 249struct msghdr;
249struct stat; 250struct stat;
250struct knote; 251struct knote;
251struct sockaddr_big; 252struct sockaddr_big;
252enum uio_seg; 253enum uio_seg;
253 254
254struct mbuf *getsombuf(struct socket *, int); 255struct mbuf *getsombuf(struct socket *, int);
255 256
256/* 0x400 is SO_OTIMESTAMP */ 257/* 0x400 is SO_OTIMESTAMP */
257#define SOOPT_TIMESTAMP(o) ((o) & (SO_TIMESTAMP | 0x400)) 258#define SOOPT_TIMESTAMP(o) ((o) & (SO_TIMESTAMP | 0x400))
258 259
259/* 260/*
260 * File operations on sockets. 261 * File operations on sockets.
261 */ 262 */
262int soo_read(file_t *, off_t *, struct uio *, kauth_cred_t, int); 263int soo_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
263int soo_write(file_t *, off_t *, struct uio *, kauth_cred_t, int); 264int soo_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
264int soo_fcntl(file_t *, u_int cmd, void *); 265int soo_fcntl(file_t *, u_int cmd, void *);
265int soo_ioctl(file_t *, u_long cmd, void *); 266int soo_ioctl(file_t *, u_long cmd, void *);
266int soo_poll(file_t *, int); 267int soo_poll(file_t *, int);
267int soo_kqfilter(file_t *, struct knote *); 268int soo_kqfilter(file_t *, struct knote *);
268int soo_close(file_t *); 269int soo_close(file_t *);
269int soo_stat(file_t *, struct stat *); 270int soo_stat(file_t *, struct stat *);
270void soo_restart(file_t *); 271void soo_restart(file_t *);
271void sbappend(struct sockbuf *, struct mbuf *); 272void sbappend(struct sockbuf *, struct mbuf *);
272void sbappendstream(struct sockbuf *, struct mbuf *); 273void sbappendstream(struct sockbuf *, struct mbuf *);
273int sbappendaddr(struct sockbuf *, const struct sockaddr *, struct mbuf *, 274int sbappendaddr(struct sockbuf *, const struct sockaddr *, struct mbuf *,
274 struct mbuf *); 275 struct mbuf *);
275int sbappendaddrchain(struct sockbuf *, const struct sockaddr *, 276int sbappendaddrchain(struct sockbuf *, const struct sockaddr *,
276 struct mbuf *, int); 277 struct mbuf *, int);
277int sbappendcontrol(struct sockbuf *, struct mbuf *, struct mbuf *); 278int sbappendcontrol(struct sockbuf *, struct mbuf *, struct mbuf *);
278void sbappendrecord(struct sockbuf *, struct mbuf *); 279void sbappendrecord(struct sockbuf *, struct mbuf *);
279void sbcheck(struct sockbuf *); 280void sbcheck(struct sockbuf *);
280void sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *); 281void sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *);
281struct mbuf * 282struct mbuf *
282 sbcreatecontrol(void *, int, int, int); 283 sbcreatecontrol(void *, int, int, int);
283struct mbuf * 284struct mbuf *
284 sbcreatecontrol1(void **, int, int, int, int); 285 sbcreatecontrol1(void **, int, int, int, int);
285struct mbuf ** 286struct mbuf **
286 sbsavetimestamp(int, struct mbuf **); 287 sbsavetimestamp(int, struct mbuf **);
287void sbdrop(struct sockbuf *, int); 288void sbdrop(struct sockbuf *, int);
288void sbdroprecord(struct sockbuf *); 289void sbdroprecord(struct sockbuf *);
289void sbflush(struct sockbuf *); 290void sbflush(struct sockbuf *);
290void sbinsertoob(struct sockbuf *, struct mbuf *); 291void sbinsertoob(struct sockbuf *, struct mbuf *);
291void sbrelease(struct sockbuf *, struct socket *); 292void sbrelease(struct sockbuf *, struct socket *);
292int sbreserve(struct sockbuf *, u_long, struct socket *); 293int sbreserve(struct sockbuf *, u_long, struct socket *);
293int sbwait(struct sockbuf *); 294int sbwait(struct sockbuf *);
294int sb_max_set(u_long); 295int sb_max_set(u_long);
295void soinit(void); 296void soinit(void);
296void soinit1(void); 297void soinit1(void);
297void soinit2(void); 298void soinit2(void);
298int soabort(struct socket *); 299int soabort(struct socket *);
299int soaccept(struct socket *, struct sockaddr *); 300int soaccept(struct socket *, struct sockaddr *);
300int sofamily(const struct socket *); 301int sofamily(const struct socket *);
301int sobind(struct socket *, struct sockaddr *, struct lwp *); 302int sobind(struct socket *, struct sockaddr *, struct lwp *);
302void socantrcvmore(struct socket *); 303void socantrcvmore(struct socket *);
303void socantsendmore(struct socket *); 304void socantsendmore(struct socket *);
304void soroverflow(struct socket *); 305void soroverflow(struct socket *);
305int soclose(struct socket *); 306int soclose(struct socket *);
306int soconnect(struct socket *, struct sockaddr *, struct lwp *); 307int soconnect(struct socket *, struct sockaddr *, struct lwp *);
307int soconnect2(struct socket *, struct socket *); 308int soconnect2(struct socket *, struct socket *);
308int socreate(int, struct socket **, int, int, struct lwp *, 309int socreate(int, struct socket **, int, int, struct lwp *,
309 struct socket *); 310 struct socket *);
310int fsocreate(int, struct socket **, int, int, int *); 311int fsocreate(int, struct socket **, int, int, int *);
311int sodisconnect(struct socket *); 312int sodisconnect(struct socket *);
312void sofree(struct socket *); 313void sofree(struct socket *);
313int sogetopt(struct socket *, struct sockopt *); 314int sogetopt(struct socket *, struct sockopt *);
314void sohasoutofband(struct socket *); 315void sohasoutofband(struct socket *);
315void soisconnected(struct socket *); 316void soisconnected(struct socket *);
316void soisconnecting(struct socket *); 317void soisconnecting(struct socket *);
317void soisdisconnected(struct socket *); 318void soisdisconnected(struct socket *);
318void soisdisconnecting(struct socket *); 319void soisdisconnecting(struct socket *);
319int solisten(struct socket *, int, struct lwp *); 320int solisten(struct socket *, int, struct lwp *);
320struct socket * 321struct socket *
321 sonewconn(struct socket *, bool); 322 sonewconn(struct socket *, bool);
322void soqinsque(struct socket *, struct socket *, int); 323void soqinsque(struct socket *, struct socket *, int);
323bool soqremque(struct socket *, int); 324bool soqremque(struct socket *, int);
324int soreceive(struct socket *, struct mbuf **, struct uio *, 325int soreceive(struct socket *, struct mbuf **, struct uio *,
325 struct mbuf **, struct mbuf **, int *); 326 struct mbuf **, struct mbuf **, int *);
326int soreserve(struct socket *, u_long, u_long); 327int soreserve(struct socket *, u_long, u_long);
327void sorflush(struct socket *); 328void sorflush(struct socket *);
328int sosend(struct socket *, struct sockaddr *, struct uio *, 329int sosend(struct socket *, struct sockaddr *, struct uio *,
329 struct mbuf *, struct mbuf *, int, struct lwp *); 330 struct mbuf *, struct mbuf *, int, struct lwp *);
330int sosetopt(struct socket *, struct sockopt *); 331int sosetopt(struct socket *, struct sockopt *);
331int so_setsockopt(struct lwp *, struct socket *, int, int, const void *, size_t); 332int so_setsockopt(struct lwp *, struct socket *, int, int, const void *, size_t);
332int soshutdown(struct socket *, int); 333int soshutdown(struct socket *, int);
333void sorestart(struct socket *); 334void sorestart(struct socket *);
334void sowakeup(struct socket *, struct sockbuf *, int); 335void sowakeup(struct socket *, struct sockbuf *, int);
335int sockargs(struct mbuf **, const void *, size_t, enum uio_seg, int); 336int sockargs(struct mbuf **, const void *, size_t, enum uio_seg, int);
336int sopoll(struct socket *, int); 337int sopoll(struct socket *, int);
337struct socket *soget(bool); 338struct socket *soget(bool);
338void soput(struct socket *); 339void soput(struct socket *);
339bool solocked(const struct socket *); 340bool solocked(const struct socket *);
340bool solocked2(const struct socket *, const struct socket *); 341bool solocked2(const struct socket *, const struct socket *);
341int sblock(struct sockbuf *, int); 342int sblock(struct sockbuf *, int);
342void sbunlock(struct sockbuf *); 343void sbunlock(struct sockbuf *);
343int sowait(struct socket *, bool, int); 344int sowait(struct socket *, bool, int);
344void solockretry(struct socket *, kmutex_t *); 345void solockretry(struct socket *, kmutex_t *);
345void sosetlock(struct socket *); 346void sosetlock(struct socket *);
346void solockreset(struct socket *, kmutex_t *); 347void solockreset(struct socket *, kmutex_t *);
347 348
348void sockopt_init(struct sockopt *, int, int, size_t); 349void sockopt_init(struct sockopt *, int, int, size_t);
349void sockopt_destroy(struct sockopt *); 350void sockopt_destroy(struct sockopt *);
350int sockopt_set(struct sockopt *, const void *, size_t); 351int sockopt_set(struct sockopt *, const void *, size_t);
351int sockopt_setint(struct sockopt *, int); 352int sockopt_setint(struct sockopt *, int);
352int sockopt_get(const struct sockopt *, void *, size_t); 353int sockopt_get(const struct sockopt *, void *, size_t);
353int sockopt_getint(const struct sockopt *, int *); 354int sockopt_getint(const struct sockopt *, int *);
354int sockopt_setmbuf(struct sockopt *, struct mbuf *); 355int sockopt_setmbuf(struct sockopt *, struct mbuf *);
355struct mbuf *sockopt_getmbuf(const struct sockopt *); 356struct mbuf *sockopt_getmbuf(const struct sockopt *);
356 357
357int copyout_sockname(struct sockaddr *, unsigned int *, int, struct mbuf *); 358int copyout_sockname(struct sockaddr *, unsigned int *, int, struct mbuf *);
358int copyout_sockname_sb(struct sockaddr *, unsigned int *, 359int copyout_sockname_sb(struct sockaddr *, unsigned int *,
359 int , struct sockaddr_big *); 360 int , struct sockaddr_big *);
360int copyout_msg_control(struct lwp *, struct msghdr *, struct mbuf *); 361int copyout_msg_control(struct lwp *, struct msghdr *, struct mbuf *);
361void free_control_mbuf(struct lwp *, struct mbuf *, struct mbuf *); 362void free_control_mbuf(struct lwp *, struct mbuf *, struct mbuf *);
362 363
363int do_sys_getpeername(int, struct sockaddr *); 364int do_sys_getpeername(int, struct sockaddr *);
364int do_sys_getsockname(int, struct sockaddr *); 365int do_sys_getsockname(int, struct sockaddr *);
365 366
366int do_sys_sendmsg(struct lwp *, int, struct msghdr *, int, register_t *); 367int do_sys_sendmsg(struct lwp *, int, struct msghdr *, int, register_t *);
367int do_sys_sendmsg_so(struct lwp *, int, struct socket *, file_t *, 368int do_sys_sendmsg_so(struct lwp *, int, struct socket *, file_t *,
368 struct msghdr *, int, register_t *); 369 struct msghdr *, int, register_t *);
369 370
370int do_sys_recvmsg(struct lwp *, int, struct msghdr *, 371int do_sys_recvmsg(struct lwp *, int, struct msghdr *,
371 struct mbuf **, struct mbuf **, register_t *); 372 struct mbuf **, struct mbuf **, register_t *);
372int do_sys_recvmsg_so(struct lwp *, int, struct socket *, 373int do_sys_recvmsg_so(struct lwp *, int, struct socket *,
373 struct msghdr *mp, struct mbuf **, struct mbuf **, register_t *); 374 struct msghdr *mp, struct mbuf **, struct mbuf **, register_t *);
374 375
375int do_sys_bind(struct lwp *, int, struct sockaddr *); 376int do_sys_bind(struct lwp *, int, struct sockaddr *);
376int do_sys_connect(struct lwp *, int, struct sockaddr *); 377int do_sys_connect(struct lwp *, int, struct sockaddr *);
377int do_sys_accept(struct lwp *, int, struct sockaddr *, register_t *, 378int do_sys_accept(struct lwp *, int, struct sockaddr *, register_t *,
378 const sigset_t *, int, int); 379 const sigset_t *, int, int);
379 380
380/* 381/*
381 * Inline functions for sockets and socket buffering. 382 * Inline functions for sockets and socket buffering.
382 */ 383 */
383 384
384#include <sys/protosw.h> 385#include <sys/protosw.h>
385#include <sys/mbuf.h> 386#include <sys/mbuf.h>
386 387
387/* 388/*
388 * Do we need to notify the other side when I/O is possible? 389 * Do we need to notify the other side when I/O is possible?
389 */ 390 */
390static __inline int 391static __inline int
391sb_notify(struct sockbuf *sb) 392sb_notify(struct sockbuf *sb)
392{ 393{
393 394
394 KASSERT(solocked(sb->sb_so)); 395 KASSERT(solocked(sb->sb_so));
395 396
396 return sb->sb_flags & (SB_NOTIFY | SB_ASYNC | SB_UPCALL | SB_KNOTE); 397 return sb->sb_flags & (SB_NOTIFY | SB_ASYNC | SB_UPCALL | SB_KNOTE);
397} 398}
398 399
399/* 400/*
400 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)? 401 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
401 * Since the fields are unsigned, detect overflow and return 0. 402 * Since the fields are unsigned, detect overflow and return 0.
402 */ 403 */
403static __inline u_long 404static __inline u_long
404sbspace(const struct sockbuf *sb) 405sbspace(const struct sockbuf *sb)
405{ 406{
406 407
407 KASSERT(solocked(sb->sb_so)); 408 KASSERT(solocked(sb->sb_so));
408 if (sb->sb_hiwat <= sb->sb_cc || sb->sb_mbmax <= sb->sb_mbcnt) 409 if (sb->sb_hiwat <= sb->sb_cc || sb->sb_mbmax <= sb->sb_mbcnt)
409 return 0; 410 return 0;
410 return lmin(sb->sb_hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt); 411 return lmin(sb->sb_hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt);
411} 412}
412 413
413/* do we have to send all at once on a socket? */ 414/* do we have to send all at once on a socket? */
414static __inline int 415static __inline int
415sosendallatonce(const struct socket *so) 416sosendallatonce(const struct socket *so)
416{ 417{
417 418
418 return so->so_proto->pr_flags & PR_ATOMIC; 419 return so->so_proto->pr_flags & PR_ATOMIC;
419} 420}
420 421
421/* can we read something from so? */ 422/* can we read something from so? */
422static __inline int 423static __inline int
423soreadable(const struct socket *so) 424soreadable(const struct socket *so)
424{ 425{
425 426
426 KASSERT(solocked(so)); 427 KASSERT(solocked(so));
427 428
428 return so->so_rcv.sb_cc >= so->so_rcv.sb_lowat || 429 return so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
429 (so->so_state & SS_CANTRCVMORE) != 0 || 430 (so->so_state & SS_CANTRCVMORE) != 0 ||
430 so->so_qlen != 0 || so->so_error != 0; 431 so->so_qlen != 0 || so->so_error != 0;
431} 432}
432 433
433/* can we write something to so? */ 434/* can we write something to so? */
434static __inline int 435static __inline int
435sowritable(const struct socket *so) 436sowritable(const struct socket *so)
436{ 437{
437 438
438 KASSERT(solocked(so)); 439 KASSERT(solocked(so));
439 440
440 return (sbspace(&so->so_snd) >= so->so_snd.sb_lowat && 441 return (sbspace(&so->so_snd) >= so->so_snd.sb_lowat &&
441 ((so->so_state & SS_ISCONNECTED) != 0 || 442 ((so->so_state & SS_ISCONNECTED) != 0 ||
442 (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) || 443 (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) ||
443 (so->so_state & SS_CANTSENDMORE) != 0 || 444 (so->so_state & SS_CANTSENDMORE) != 0 ||
444 so->so_error != 0; 445 so->so_error != 0;
445} 446}
446 447
447/* adjust counters in sb reflecting allocation of m */ 448/* adjust counters in sb reflecting allocation of m */
448static __inline void 449static __inline void
449sballoc(struct sockbuf *sb, struct mbuf *m) 450sballoc(struct sockbuf *sb, struct mbuf *m)
450{ 451{
451 452
452 KASSERT(solocked(sb->sb_so)); 453 KASSERT(solocked(sb->sb_so));
453 454
454 sb->sb_cc += m->m_len; 455 sb->sb_cc += m->m_len;
455 sb->sb_mbcnt += MSIZE; 456 sb->sb_mbcnt += MSIZE;
456 if (m->m_flags & M_EXT) 457 if (m->m_flags & M_EXT)
457 sb->sb_mbcnt += m->m_ext.ext_size; 458 sb->sb_mbcnt += m->m_ext.ext_size;
458} 459}
459 460
460/* adjust counters in sb reflecting freeing of m */ 461/* adjust counters in sb reflecting freeing of m */
461static __inline void 462static __inline void
462sbfree(struct sockbuf *sb, struct mbuf *m) 463sbfree(struct sockbuf *sb, struct mbuf *m)
463{ 464{
464 465
465 KASSERT(solocked(sb->sb_so)); 466 KASSERT(solocked(sb->sb_so));
466 467
467 sb->sb_cc -= m->m_len; 468 sb->sb_cc -= m->m_len;
468 sb->sb_mbcnt -= MSIZE; 469 sb->sb_mbcnt -= MSIZE;
469 if (m->m_flags & M_EXT) 470 if (m->m_flags & M_EXT)
470 sb->sb_mbcnt -= m->m_ext.ext_size; 471 sb->sb_mbcnt -= m->m_ext.ext_size;
471} 472}
472 473
473static __inline void 474static __inline void
474sorwakeup(struct socket *so) 475sorwakeup(struct socket *so)
475{ 476{
476 477
477 KASSERT(solocked(so)); 478 KASSERT(solocked(so));
478 479
479 if (sb_notify(&so->so_rcv)) 480 if (sb_notify(&so->so_rcv))
480 sowakeup(so, &so->so_rcv, POLL_IN); 481 sowakeup(so, &so->so_rcv, POLL_IN);
481} 482}
482 483
483static __inline void 484static __inline void
484sowwakeup(struct socket *so) 485sowwakeup(struct socket *so)
485{ 486{
486 487
487 KASSERT(solocked(so)); 488 KASSERT(solocked(so));
488 489
489 if (sb_notify(&so->so_snd)) 490 if (sb_notify(&so->so_snd))
490 sowakeup(so, &so->so_snd, POLL_OUT); 491 sowakeup(so, &so->so_snd, POLL_OUT);
491} 492}
492 493
493static __inline void 494static __inline void
494solock(struct socket *so) 495solock(struct socket *so)
495{ 496{
496 kmutex_t *lock; 497 kmutex_t *lock;
497 498
498 lock = so->so_lock; 499 lock = so->so_lock;
499 mutex_enter(lock); 500 mutex_enter(lock);
500 if (__predict_false(lock != so->so_lock)) 501 if (__predict_false(lock != so->so_lock))
501 solockretry(so, lock); 502 solockretry(so, lock);
502} 503}
503  504
504static __inline void 505static __inline void
505sounlock(struct socket *so) 506sounlock(struct socket *so)
506{ 507{
507 508
508 mutex_exit(so->so_lock); 509 mutex_exit(so->so_lock);
509} 510}
510 511
511#ifdef SOCKBUF_DEBUG 512#ifdef SOCKBUF_DEBUG
512/* 513/*
513 * SBLASTRECORDCHK: check sb->sb_lastrecord is maintained correctly. 514 * SBLASTRECORDCHK: check sb->sb_lastrecord is maintained correctly.
514 * SBLASTMBUFCHK: check sb->sb_mbtail is maintained correctly. 515 * SBLASTMBUFCHK: check sb->sb_mbtail is maintained correctly.
515 * 516 *
516 * => panic if the socket buffer is inconsistent. 517 * => panic if the socket buffer is inconsistent.
517 * => 'where' is used for a panic message. 518 * => 'where' is used for a panic message.
518 */ 519 */
519void sblastrecordchk(struct sockbuf *, const char *); 520void sblastrecordchk(struct sockbuf *, const char *);
520#define SBLASTRECORDCHK(sb, where) sblastrecordchk((sb), (where)) 521#define SBLASTRECORDCHK(sb, where) sblastrecordchk((sb), (where))
521 522
522void sblastmbufchk(struct sockbuf *, const char *); 523void sblastmbufchk(struct sockbuf *, const char *);
523#define SBLASTMBUFCHK(sb, where) sblastmbufchk((sb), (where)) 524#define SBLASTMBUFCHK(sb, where) sblastmbufchk((sb), (where))
524#define SBCHECK(sb) sbcheck(sb) 525#define SBCHECK(sb) sbcheck(sb)
525#else 526#else
526#define SBLASTRECORDCHK(sb, where) /* nothing */ 527#define SBLASTRECORDCHK(sb, where) /* nothing */
527#define SBLASTMBUFCHK(sb, where) /* nothing */ 528#define SBLASTMBUFCHK(sb, where) /* nothing */
528#define SBCHECK(sb) /* nothing */ 529#define SBCHECK(sb) /* nothing */
529#endif /* SOCKBUF_DEBUG */ 530#endif /* SOCKBUF_DEBUG */
530 531
531/* sosend loan */ 532/* sosend loan */
532vaddr_t sokvaalloc(vaddr_t, vsize_t, struct socket *); 533vaddr_t sokvaalloc(vaddr_t, vsize_t, struct socket *);
533void sokvafree(vaddr_t, vsize_t); 534void sokvafree(vaddr_t, vsize_t);
534void soloanfree(struct mbuf *, void *, size_t, void *); 535void soloanfree(struct mbuf *, void *, size_t, void *);
535 536
536/* 537/*
537 * Values for socket-buffer-append priority argument to sbappendaddrchain(). 538 * Values for socket-buffer-append priority argument to sbappendaddrchain().
538 * The following flags are reserved for future implementation: 539 * The following flags are reserved for future implementation:
539 * 540 *
540 * SB_PRIO_NONE: honour normal socket-buffer limits. 541 * SB_PRIO_NONE: honour normal socket-buffer limits.
541 * 542 *
542 * SB_PRIO_ONESHOT_OVERFLOW: if the socket has any space, 543 * SB_PRIO_ONESHOT_OVERFLOW: if the socket has any space,
543 * deliver the entire chain. Intended for large requests 544 * deliver the entire chain. Intended for large requests
544 * that should be delivered in their entirety, or not at all. 545 * that should be delivered in their entirety, or not at all.
545 * 546 *
 * SB_PRIO_OVERDRAFT: allow a small (2*MLEN) overflow, over and
 * above normal socket limits. Intended for messages indicating
 * buffer overflow in earlier normal/lower-priority messages.
549 * 550 *
 * SB_PRIO_BESTEFFORT: Ignore limits entirely. Intended only for
 * kernel-generated messages to specially-marked sockets which
 * require "reliable" delivery, and where the source socket/protocol
 * message generator enforces some hard limit (but possibly well
 * above kern.sbmax). It is entirely up to the in-kernel source to
 * avoid complete mbuf exhaustion or DoS scenarios.
556 */ 557 */
557#define SB_PRIO_NONE 0 558#define SB_PRIO_NONE 0
558#define SB_PRIO_ONESHOT_OVERFLOW 1 559#define SB_PRIO_ONESHOT_OVERFLOW 1
559#define SB_PRIO_OVERDRAFT 2 560#define SB_PRIO_OVERDRAFT 2
560#define SB_PRIO_BESTEFFORT 3 561#define SB_PRIO_BESTEFFORT 3
561 562
562/* 563/*
563 * Accept filter functions (duh). 564 * Accept filter functions (duh).
564 */ 565 */
565int accept_filt_getopt(struct socket *, struct sockopt *); 566int accept_filt_getopt(struct socket *, struct sockopt *);
566int accept_filt_setopt(struct socket *, const struct sockopt *); 567int accept_filt_setopt(struct socket *, const struct sockopt *);
567int accept_filt_clear(struct socket *); 568int accept_filt_clear(struct socket *);
568int accept_filt_add(struct accept_filter *); 569int accept_filt_add(struct accept_filter *);
569int accept_filt_del(struct accept_filter *); 570int accept_filt_del(struct accept_filter *);
570struct accept_filter *accept_filt_get(char *); 571struct accept_filter *accept_filt_get(char *);
571#ifdef ACCEPT_FILTER_MOD 572#ifdef ACCEPT_FILTER_MOD
572#ifdef SYSCTL_DECL 573#ifdef SYSCTL_DECL
573SYSCTL_DECL(_net_inet_accf); 574SYSCTL_DECL(_net_inet_accf);
574#endif 575#endif
575void accept_filter_init(void); 576void accept_filter_init(void);
576#endif 577#endif
577 578
578#endif /* _KERNEL */ 579#endif /* _KERNEL */
579 580
580#endif /* !_SYS_SOCKETVAR_H_ */ 581#endif /* !_SYS_SOCKETVAR_H_ */