Wed Mar 18 05:33:23 2009 UTC ()
Pull up following revision(s) (requested by mrg in ticket #577):
	sys/kern/kern_descrip.c: revision 1.188
	sys/kern/uipc_usrreq.c: revision 1.121
	sys/sys/fcntl.h: revision 1.35
	sys/sys/file.h: revision 1.66
	sys/sys/param.h: patch
	sys/sys/un.h: revision 1.45
completely rework the way that orphaned sockets that are being fdpassed
via SCM_RIGHTS messages are dealt with:
1. unp_gc: make this a kthread.
2. unp_detach: do not call unp_gc directly. instead, wake up unp_gc kthread.
3. unp_scan: do not close files here. instead, put them on a global list
   for unp_gc to close, along with a per-file "deferred close count". if
   file is already enqueued for close, just increment deferred close count.
   this eliminates the recursive calls.
4. unp_gc: scan files on global deferred close list. close each file N
   times, as specified by deferred close count in file. continue processing
   list until it becomes empty (closing may cause additional files to be
   queued for close).
5. unp_gc: add additional bit to mark files we are scanning. set during
   initial scan of global file list that currently clears FMARK/FDEFER.
   during later scans, never examine / garbage collect descriptors that
   we have not marked during the earlier scan. do not proceed with this
   initial scan until all deferred closes have been processed. be careful
   with locking to ensure no races are introduced between deferred close
   and file scan.
6. unp_gc: use dummy file_t to mark position in list when scanning. allows
   us to drop filelist_lock. in turn allows us to eliminate kmem_alloc()
   and safely close files, etc.
7. prohibit transfer of descriptors within SCM_RIGHTS messages if
   (num_files_in_transit > maxfiles / unp_rights_ratio)
8. fd_allocfile: ensure recycled files don't get scanned.
this is 97% work done by andrew doran, with a couple of minor bug fixes
and a lot of testing by yours truly.


(snj)
diff -r1.182.6.3 -r1.182.6.4 src/sys/kern/kern_descrip.c
diff -r1.119.4.1 -r1.119.4.2 src/sys/kern/uipc_usrreq.c
diff -r1.34 -r1.34.64.1 src/sys/sys/fcntl.h
diff -r1.65 -r1.65.6.1 src/sys/sys/file.h
diff -r1.330.4.3 -r1.330.4.4 src/sys/sys/param.h
diff -r1.44 -r1.44.4.1 src/sys/sys/un.h

cvs diff -r1.182.6.3 -r1.182.6.4 src/sys/kern/kern_descrip.c (expand / switch to unified diff)

--- src/sys/kern/kern_descrip.c 2009/03/15 20:23:26 1.182.6.3
+++ src/sys/kern/kern_descrip.c 2009/03/18 05:33:23 1.182.6.4
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: kern_descrip.c,v 1.182.6.3 2009/03/15 20:23:26 snj Exp $ */ 1/* $NetBSD: kern_descrip.c,v 1.182.6.4 2009/03/18 05:33:23 snj Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2008 The NetBSD Foundation, Inc. 4 * Copyright (c) 2008 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -57,27 +57,27 @@ @@ -57,27 +57,27 @@
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * SUCH DAMAGE. 60 * SUCH DAMAGE.
61 * 61 *
62 * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95 62 * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
63 */ 63 */
64 64
65/* 65/*
66 * File descriptor management. 66 * File descriptor management.
67 */ 67 */
68 68
69#include <sys/cdefs.h> 69#include <sys/cdefs.h>
70__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.182.6.3 2009/03/15 20:23:26 snj Exp $"); 70__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.182.6.4 2009/03/18 05:33:23 snj Exp $");
71 71
72#include <sys/param.h> 72#include <sys/param.h>
73#include <sys/systm.h> 73#include <sys/systm.h>
74#include <sys/filedesc.h> 74#include <sys/filedesc.h>
75#include <sys/kernel.h> 75#include <sys/kernel.h>
76#include <sys/vnode.h> 76#include <sys/vnode.h>
77#include <sys/proc.h> 77#include <sys/proc.h>
78#include <sys/file.h> 78#include <sys/file.h>
79#include <sys/namei.h> 79#include <sys/namei.h>
80#include <sys/socket.h> 80#include <sys/socket.h>
81#include <sys/socketvar.h> 81#include <sys/socketvar.h>
82#include <sys/stat.h> 82#include <sys/stat.h>
83#include <sys/ioctl.h> 83#include <sys/ioctl.h>
@@ -994,35 +994,46 @@ fd_allocfile(file_t **resultfp, int *res @@ -994,35 +994,46 @@ fd_allocfile(file_t **resultfp, int *res
994 int error; 994 int error;
995 995
996 p = curproc; 996 p = curproc;
997 997
998 while ((error = fd_alloc(p, 0, resultfd)) != 0) { 998 while ((error = fd_alloc(p, 0, resultfd)) != 0) {
999 if (error != ENOSPC) { 999 if (error != ENOSPC) {
1000 return error; 1000 return error;
1001 } 1001 }
1002 fd_tryexpand(p); 1002 fd_tryexpand(p);
1003 } 1003 }
1004 1004
1005 fp = pool_cache_get(file_cache, PR_WAITOK); 1005 fp = pool_cache_get(file_cache, PR_WAITOK);
1006 KASSERT(fp->f_count == 0); 1006 KASSERT(fp->f_count == 0);
 1007 KASSERT(fp->f_msgcount == 0);
 1008 KASSERT(fp->f_unpcount == 0);
1007 fp->f_cred = kauth_cred_get(); 1009 fp->f_cred = kauth_cred_get();
1008 kauth_cred_hold(fp->f_cred); 1010 kauth_cred_hold(fp->f_cred);
1009 1011
1010 if (__predict_false(atomic_inc_uint_nv(&nfiles) >= maxfiles)) { 1012 if (__predict_false(atomic_inc_uint_nv(&nfiles) >= maxfiles)) {
1011 fd_abort(p, fp, *resultfd); 1013 fd_abort(p, fp, *resultfd);
1012 tablefull("file", "increase kern.maxfiles or MAXFILES"); 1014 tablefull("file", "increase kern.maxfiles or MAXFILES");
1013 return ENFILE; 1015 return ENFILE;
1014 } 1016 }
1015 1017
 1018 /*
 1019 * Don't allow recycled files to be scanned.
 1020 */
 1021 if ((fp->f_flag & FSCAN) != 0) {
 1022 mutex_enter(&fp->f_lock);
 1023 atomic_and_uint(&fp->f_flag, ~FSCAN);
 1024 mutex_exit(&fp->f_lock);
 1025 }
 1026
1016 fp->f_advice = 0; 1027 fp->f_advice = 0;
1017 fp->f_msgcount = 0; 1028 fp->f_msgcount = 0;
1018 fp->f_offset = 0; 1029 fp->f_offset = 0;
1019 *resultfp = fp; 1030 *resultfp = fp;
1020 1031
1021 return 0; 1032 return 0;
1022} 1033}
1023 1034
1024/* 1035/*
1025 * Successful creation of a new descriptor: make visible to the process. 1036 * Successful creation of a new descriptor: make visible to the process.
1026 */ 1037 */
1027void 1038void
1028fd_affix(proc_t *p, file_t *fp, unsigned fd) 1039fd_affix(proc_t *p, file_t *fp, unsigned fd)

cvs diff -r1.119.4.1 -r1.119.4.2 src/sys/kern/uipc_usrreq.c (expand / switch to unified diff)

--- src/sys/kern/uipc_usrreq.c 2009/02/16 03:31:13 1.119.4.1
+++ src/sys/kern/uipc_usrreq.c 2009/03/18 05:33:23 1.119.4.2
@@ -1,22 +1,22 @@ @@ -1,22 +1,22 @@
1/* $NetBSD: uipc_usrreq.c,v 1.119.4.1 2009/02/16 03:31:13 snj Exp $ */ 1/* $NetBSD: uipc_usrreq.c,v 1.119.4.2 2009/03/18 05:33:23 snj Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 1998, 2000, 2004, 2008 The NetBSD Foundation, Inc. 4 * Copyright (c) 1998, 2000, 2004, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center. 9 * NASA Ames Research Center, and by Andrew Doran.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions 12 * modification, are permitted provided that the following conditions
13 * are met: 13 * are met:
14 * 1. Redistributions of source code must retain the above copyright 14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer. 15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright 16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the 17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution. 18 * documentation and/or other materials provided with the distribution.
19 * 19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
@@ -86,47 +86,49 @@ @@ -86,47 +86,49 @@
86 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 86 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
87 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 87 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
88 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 88 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
89 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 89 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
90 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 90 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
91 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 91 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
92 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 92 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
93 * SUCH DAMAGE. 93 * SUCH DAMAGE.
94 * 94 *
95 * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95 95 * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95
96 */ 96 */
97 97
98#include <sys/cdefs.h> 98#include <sys/cdefs.h>
99__KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.119.4.1 2009/02/16 03:31:13 snj Exp $"); 99__KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.119.4.2 2009/03/18 05:33:23 snj Exp $");
100 100
101#include <sys/param.h> 101#include <sys/param.h>
102#include <sys/systm.h> 102#include <sys/systm.h>
103#include <sys/proc.h> 103#include <sys/proc.h>
104#include <sys/filedesc.h> 104#include <sys/filedesc.h>
105#include <sys/domain.h> 105#include <sys/domain.h>
106#include <sys/protosw.h> 106#include <sys/protosw.h>
107#include <sys/socket.h> 107#include <sys/socket.h>
108#include <sys/socketvar.h> 108#include <sys/socketvar.h>
109#include <sys/unpcb.h> 109#include <sys/unpcb.h>
110#include <sys/un.h> 110#include <sys/un.h>
111#include <sys/namei.h> 111#include <sys/namei.h>
112#include <sys/vnode.h> 112#include <sys/vnode.h>
113#include <sys/file.h> 113#include <sys/file.h>
114#include <sys/stat.h> 114#include <sys/stat.h>
115#include <sys/mbuf.h> 115#include <sys/mbuf.h>
116#include <sys/kauth.h> 116#include <sys/kauth.h>
117#include <sys/kmem.h> 117#include <sys/kmem.h>
118#include <sys/atomic.h> 118#include <sys/atomic.h>
119#include <sys/uidinfo.h> 119#include <sys/uidinfo.h>
 120#include <sys/kernel.h>
 121#include <sys/kthread.h>
120 122
121/* 123/*
122 * Unix communications domain. 124 * Unix communications domain.
123 * 125 *
124 * TODO: 126 * TODO:
125 * SEQPACKET, RDM 127 * SEQPACKET, RDM
126 * rethink name space problems 128 * rethink name space problems
127 * need a proper out-of-band 129 * need a proper out-of-band
128 * 130 *
129 * Notes on locking: 131 * Notes on locking:
130 * 132 *
131 * The generic rules noted in uipc_socket2.c apply. In addition: 133 * The generic rules noted in uipc_socket2.c apply. In addition:
132 * 134 *
@@ -159,36 +161,54 @@ __KERNEL_RCSID(0, "$NetBSD: uipc_usrreq. @@ -159,36 +161,54 @@ __KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.
159 * independent lock because of visibility / garbage collection issues: 161 * independent lock because of visibility / garbage collection issues:
160 * if a socket has been associated with a lock at any point, that lock 162 * if a socket has been associated with a lock at any point, that lock
161 * must remain valid until the socket is no longer visible in the system. 163 * must remain valid until the socket is no longer visible in the system.
162 * The lock must not be freed or otherwise destroyed until any sockets 164 * The lock must not be freed or otherwise destroyed until any sockets
163 * that had referenced it have also been destroyed. 165 * that had referenced it have also been destroyed.
164 */ 166 */
165const struct sockaddr_un sun_noname = { 167const struct sockaddr_un sun_noname = {
166 .sun_len = sizeof(sun_noname), 168 .sun_len = sizeof(sun_noname),
167 .sun_family = AF_LOCAL, 169 .sun_family = AF_LOCAL,
168}; 170};
169ino_t unp_ino; /* prototype for fake inode numbers */ 171ino_t unp_ino; /* prototype for fake inode numbers */
170 172
171struct mbuf *unp_addsockcred(struct lwp *, struct mbuf *); 173struct mbuf *unp_addsockcred(struct lwp *, struct mbuf *);
 174static void unp_mark(file_t *);
 175static void unp_scan(struct mbuf *, void (*)(file_t *), int);
 176static void unp_discard_now(file_t *);
 177static void unp_discard_later(file_t *);
 178static void unp_thread(void *);
 179static void unp_thread_kick(void);
172static kmutex_t *uipc_lock; 180static kmutex_t *uipc_lock;
173 181
 182static kcondvar_t unp_thread_cv;
 183static lwp_t *unp_thread_lwp;
 184static SLIST_HEAD(,file) unp_thread_discard;
 185static int unp_defer;
 186
174/* 187/*
175 * Initialize Unix protocols. 188 * Initialize Unix protocols.
176 */ 189 */
177void 190void
178uipc_init(void) 191uipc_init(void)
179{ 192{
 193 int error;
180 194
181 uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); 195 uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
 196 cv_init(&unp_thread_cv, "unpgc");
 197
 198 error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, unp_thread,
 199 NULL, &unp_thread_lwp, "unpgc");
 200 if (error != 0)
 201 panic("uipc_init %d", error);
182} 202}
183 203
184/* 204/*
185 * A connection succeeded: disassociate both endpoints from the head's 205 * A connection succeeded: disassociate both endpoints from the head's
186 * lock, and make them share their own lock. There is a race here: for 206 * lock, and make them share their own lock. There is a race here: for
187 * a very brief time one endpoint will be locked by a different lock 207 * a very brief time one endpoint will be locked by a different lock
188 * than the other end. However, since the current thread holds the old 208 * than the other end. However, since the current thread holds the old
189 * lock (the listening socket's lock, the head) access can still only be 209 * lock (the listening socket's lock, the head) access can still only be
190 * made to one side of the connection. 210 * made to one side of the connection.
191 */ 211 */
192static void 212static void
193unp_setpeerlocks(struct socket *so, struct socket *so2) 213unp_setpeerlocks(struct socket *so, struct socket *so2)
194{ 214{
@@ -280,31 +300,29 @@ unp_output(struct mbuf *m, struct mbuf * @@ -280,31 +300,29 @@ unp_output(struct mbuf *m, struct mbuf *
280 so2 = unp->unp_conn->unp_socket; 300 so2 = unp->unp_conn->unp_socket;
281 301
282 KASSERT(solocked(so2)); 302 KASSERT(solocked(so2));
283 303
284 if (unp->unp_addr) 304 if (unp->unp_addr)
285 sun = unp->unp_addr; 305 sun = unp->unp_addr;
286 else 306 else
287 sun = &sun_noname; 307 sun = &sun_noname;
288 if (unp->unp_conn->unp_flags & UNP_WANTCRED) 308 if (unp->unp_conn->unp_flags & UNP_WANTCRED)
289 control = unp_addsockcred(l, control); 309 control = unp_addsockcred(l, control);
290 if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m, 310 if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m,
291 control) == 0) { 311 control) == 0) {
292 so2->so_rcv.sb_overflowed++; 312 so2->so_rcv.sb_overflowed++;
293 sounlock(so2); 
294 unp_dispose(control); 313 unp_dispose(control);
295 m_freem(control); 314 m_freem(control);
296 m_freem(m); 315 m_freem(m);
297 solock(so2); 
298 return (ENOBUFS); 316 return (ENOBUFS);
299 } else { 317 } else {
300 sorwakeup(so2); 318 sorwakeup(so2);
301 return (0); 319 return (0);
302 } 320 }
303} 321}
304 322
305void 323void
306unp_setaddr(struct socket *so, struct mbuf *nam, bool peeraddr) 324unp_setaddr(struct socket *so, struct mbuf *nam, bool peeraddr)
307{ 325{
308 const struct sockaddr_un *sun; 326 const struct sockaddr_un *sun;
309 struct unpcb *unp; 327 struct unpcb *unp;
310 bool ext; 328 bool ext;
@@ -508,31 +526,29 @@ uipc_usrreq(struct socket *so, int req,  @@ -508,31 +526,29 @@ uipc_usrreq(struct socket *so, int req,
508 * dropped until we have sent 526 * dropped until we have sent
509 * the message and disconnected. 527 * the message and disconnected.
510 * This is necessary to prevent 528 * This is necessary to prevent
511 * intervening control ops, like 529 * intervening control ops, like
512 * another connection. 530 * another connection.
513 */ 531 */
514 error = unp_connect(so, nam, l); 532 error = unp_connect(so, nam, l);
515 } 533 }
516 } else { 534 } else {
517 if ((so->so_state & SS_ISCONNECTED) == 0) 535 if ((so->so_state & SS_ISCONNECTED) == 0)
518 error = ENOTCONN; 536 error = ENOTCONN;
519 } 537 }
520 if (error) { 538 if (error) {
521 sounlock(so); 
522 unp_dispose(control); 539 unp_dispose(control);
523 m_freem(control); 540 m_freem(control);
524 m_freem(m); 541 m_freem(m);
525 solock(so); 
526 break; 542 break;
527 } 543 }
528 KASSERT(p != NULL); 544 KASSERT(p != NULL);
529 error = unp_output(m, control, unp, l); 545 error = unp_output(m, control, unp, l);
530 if (nam) 546 if (nam)
531 unp_disconnect(unp); 547 unp_disconnect(unp);
532 break; 548 break;
533 } 549 }
534 550
535 case SOCK_STREAM: 551 case SOCK_STREAM:
536#define rcv (&so2->so_rcv) 552#define rcv (&so2->so_rcv)
537#define snd (&so->so_snd) 553#define snd (&so->so_snd)
538 if (unp->unp_conn == NULL) { 554 if (unp->unp_conn == NULL) {
@@ -561,30 +577,28 @@ uipc_usrreq(struct socket *so, int req,  @@ -561,30 +577,28 @@ uipc_usrreq(struct socket *so, int req,
561 sbappend(rcv, m); 577 sbappend(rcv, m);
562 snd->sb_mbmax -= 578 snd->sb_mbmax -=
563 rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt; 579 rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
564 unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt; 580 unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
565 newhiwat = snd->sb_hiwat - 581 newhiwat = snd->sb_hiwat -
566 (rcv->sb_cc - unp->unp_conn->unp_cc); 582 (rcv->sb_cc - unp->unp_conn->unp_cc);
567 (void)chgsbsize(so->so_uidinfo, 583 (void)chgsbsize(so->so_uidinfo,
568 &snd->sb_hiwat, newhiwat, RLIM_INFINITY); 584 &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
569 unp->unp_conn->unp_cc = rcv->sb_cc; 585 unp->unp_conn->unp_cc = rcv->sb_cc;
570 sorwakeup(so2); 586 sorwakeup(so2);
571#undef snd 587#undef snd
572#undef rcv 588#undef rcv
573 if (control != NULL) { 589 if (control != NULL) {
574 sounlock(so); 
575 unp_dispose(control); 590 unp_dispose(control);
576 m_freem(control); 591 m_freem(control);
577 solock(so); 
578 } 592 }
579 break; 593 break;
580 594
581 default: 595 default:
582 panic("uipc 4"); 596 panic("uipc 4");
583 } 597 }
584 break; 598 break;
585 599
586 case PRU_ABORT: 600 case PRU_ABORT:
587 (void)unp_drop(unp, ECONNABORTED); 601 (void)unp_drop(unp, ECONNABORTED);
588 602
589 KASSERT(so->so_head == NULL); 603 KASSERT(so->so_head == NULL);
590#ifdef DIAGNOSTIC 604#ifdef DIAGNOSTIC
@@ -714,27 +728,28 @@ uipc_ctloutput(int op, struct socket *so @@ -714,27 +728,28 @@ uipc_ctloutput(int op, struct socket *so
714 * Both send and receive buffers are allocated PIPSIZ bytes of buffering 728 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
715 * for stream sockets, although the total for sender and receiver is 729 * for stream sockets, although the total for sender and receiver is
716 * actually only PIPSIZ. 730 * actually only PIPSIZ.
717 * Datagram sockets really use the sendspace as the maximum datagram size, 731 * Datagram sockets really use the sendspace as the maximum datagram size,
718 * and don't really want to reserve the sendspace. Their recvspace should 732 * and don't really want to reserve the sendspace. Their recvspace should
719 * be large enough for at least one max-size datagram plus address. 733 * be large enough for at least one max-size datagram plus address.
720 */ 734 */
721#define PIPSIZ 4096 735#define PIPSIZ 4096
722u_long unpst_sendspace = PIPSIZ; 736u_long unpst_sendspace = PIPSIZ;
723u_long unpst_recvspace = PIPSIZ; 737u_long unpst_recvspace = PIPSIZ;
724u_long unpdg_sendspace = 2*1024; /* really max datagram size */ 738u_long unpdg_sendspace = 2*1024; /* really max datagram size */
725u_long unpdg_recvspace = 4*1024; 739u_long unpdg_recvspace = 4*1024;
726 740
727u_int unp_rights; /* file descriptors in flight */ 741u_int unp_rights; /* files in flight */
 742u_int unp_rights_ratio = 2; /* limit, fraction of maxfiles */
728 743
729int 744int
730unp_attach(struct socket *so) 745unp_attach(struct socket *so)
731{ 746{
732 struct unpcb *unp; 747 struct unpcb *unp;
733 int error; 748 int error;
734 749
735 switch (so->so_type) { 750 switch (so->so_type) {
736 case SOCK_STREAM: 751 case SOCK_STREAM:
737 if (so->so_lock == NULL) { 752 if (so->so_lock == NULL) {
738 /*  753 /*
739 * XXX Assuming that no socket locks are held, 754 * XXX Assuming that no socket locks are held,
740 * as this call may sleep. 755 * as this call may sleep.
@@ -798,37 +813,34 @@ unp_detach(struct unpcb *unp) @@ -798,37 +813,34 @@ unp_detach(struct unpcb *unp)
798 if (unp->unp_conn) 813 if (unp->unp_conn)
799 unp_disconnect(unp); 814 unp_disconnect(unp);
800 while (unp->unp_refs) { 815 while (unp->unp_refs) {
801 KASSERT(solocked2(so, unp->unp_refs->unp_socket)); 816 KASSERT(solocked2(so, unp->unp_refs->unp_socket));
802 if (unp_drop(unp->unp_refs, ECONNRESET)) { 817 if (unp_drop(unp->unp_refs, ECONNRESET)) {
803 solock(so); 818 solock(so);
804 goto retry; 819 goto retry;
805 } 820 }
806 } 821 }
807 soisdisconnected(so); 822 soisdisconnected(so);
808 so->so_pcb = NULL; 823 so->so_pcb = NULL;
809 if (unp_rights) { 824 if (unp_rights) {
810 /* 825 /*
811 * Normally the receive buffer is flushed later, 826 * Normally the receive buffer is flushed later, in sofree,
812 * in sofree, but if our receive buffer holds references 827 * but if our receive buffer holds references to files that
813 * to descriptors that are now garbage, we will dispose 828 * are now garbage, we will enqueue those file references to
814 * of those descriptor references after the garbage collector 829 * the garbage collector and kick it into action.
815 * gets them (resulting in a "panic: closef: count < 0"). 
816 */ 830 */
817 sorflush(so); 831 sorflush(so);
818 unp_free(unp); 832 unp_free(unp);
819 sounlock(so); 833 unp_thread_kick();
820 unp_gc(); 
821 solock(so); 
822 } else 834 } else
823 unp_free(unp); 835 unp_free(unp);
824} 836}
825 837
826int 838int
827unp_bind(struct socket *so, struct mbuf *nam, struct lwp *l) 839unp_bind(struct socket *so, struct mbuf *nam, struct lwp *l)
828{ 840{
829 struct sockaddr_un *sun; 841 struct sockaddr_un *sun;
830 struct unpcb *unp; 842 struct unpcb *unp;
831 vnode_t *vp; 843 vnode_t *vp;
832 struct vattr vattr; 844 struct vattr vattr;
833 size_t addrlen; 845 size_t addrlen;
834 int error; 846 int error;
@@ -1155,66 +1167,62 @@ unp_externalize(struct mbuf *rights, str @@ -1155,66 +1167,62 @@ unp_externalize(struct mbuf *rights, str
1155 struct proc *p = l->l_proc; 1167 struct proc *p = l->l_proc;
1156 int i, *fdp; 1168 int i, *fdp;
1157 file_t **rp; 1169 file_t **rp;
1158 file_t *fp; 1170 file_t *fp;
1159 int nfds, error = 0; 1171 int nfds, error = 0;
1160 1172
1161 nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / 1173 nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
1162 sizeof(file_t *); 1174 sizeof(file_t *);
1163 rp = (file_t **)CMSG_DATA(cm); 1175 rp = (file_t **)CMSG_DATA(cm);
1164 1176
1165 fdp = malloc(nfds * sizeof(int), M_TEMP, M_WAITOK); 1177 fdp = malloc(nfds * sizeof(int), M_TEMP, M_WAITOK);
1166 rw_enter(&p->p_cwdi->cwdi_lock, RW_READER); 1178 rw_enter(&p->p_cwdi->cwdi_lock, RW_READER);
1167 1179
1168 /* Make sure the recipient should be able to see the descriptors.. */ 1180 /* Make sure the recipient should be able to see the files.. */
1169 if (p->p_cwdi->cwdi_rdir != NULL) { 1181 if (p->p_cwdi->cwdi_rdir != NULL) {
1170 rp = (file_t **)CMSG_DATA(cm); 1182 rp = (file_t **)CMSG_DATA(cm);
1171 for (i = 0; i < nfds; i++) { 1183 for (i = 0; i < nfds; i++) {
1172 fp = *rp++; 1184 fp = *rp++;
1173 /* 1185 /*
1174 * If we are in a chroot'ed directory, and 1186 * If we are in a chroot'ed directory, and
1175 * someone wants to pass us a directory, make 1187 * someone wants to pass us a directory, make
1176 * sure it's inside the subtree we're allowed 1188 * sure it's inside the subtree we're allowed
1177 * to access. 1189 * to access.
1178 */ 1190 */
1179 if (fp->f_type == DTYPE_VNODE) { 1191 if (fp->f_type == DTYPE_VNODE) {
1180 vnode_t *vp = (vnode_t *)fp->f_data; 1192 vnode_t *vp = (vnode_t *)fp->f_data;
1181 if ((vp->v_type == VDIR) && 1193 if ((vp->v_type == VDIR) &&
1182 !vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) { 1194 !vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) {
1183 error = EPERM; 1195 error = EPERM;
1184 break; 1196 break;
1185 } 1197 }
1186 } 1198 }
1187 } 1199 }
1188 } 1200 }
1189 1201
1190 restart: 1202 restart:
1191 rp = (file_t **)CMSG_DATA(cm); 1203 rp = (file_t **)CMSG_DATA(cm);
1192 if (error != 0) { 1204 if (error != 0) {
1193 for (i = 0; i < nfds; i++) { 1205 for (i = 0; i < nfds; i++) {
1194 fp = *rp; 1206 fp = *rp;
1195 /* 
1196 * zero the pointer before calling unp_discard, 
1197 * since it may end up in unp_gc().. 
1198 */ 
1199 *rp++ = 0; 1207 *rp++ = 0;
1200 unp_discard(fp); 1208 unp_discard_now(fp);
1201 } 1209 }
1202 goto out; 1210 goto out;
1203 } 1211 }
1204 1212
1205 /* 1213 /*
1206 * First loop -- allocate file descriptor table slots for the 1214 * First loop -- allocate file descriptor table slots for the
1207 * new descriptors. 1215 * new files.
1208 */ 1216 */
1209 for (i = 0; i < nfds; i++) { 1217 for (i = 0; i < nfds; i++) {
1210 fp = *rp++; 1218 fp = *rp++;
1211 if ((error = fd_alloc(p, 0, &fdp[i])) != 0) { 1219 if ((error = fd_alloc(p, 0, &fdp[i])) != 0) {
1212 /* 1220 /*
1213 * Back out what we've done so far. 1221 * Back out what we've done so far.
1214 */ 1222 */
1215 for (--i; i >= 0; i--) { 1223 for (--i; i >= 0; i--) {
1216 fd_abort(p, NULL, fdp[i]); 1224 fd_abort(p, NULL, fdp[i]);
1217 } 1225 }
1218 if (error == ENOSPC) { 1226 if (error == ENOSPC) {
1219 fd_tryexpand(p); 1227 fd_tryexpand(p);
1220 error = 0; 1228 error = 0;
@@ -1222,27 +1230,27 @@ unp_externalize(struct mbuf *rights, str @@ -1222,27 +1230,27 @@ unp_externalize(struct mbuf *rights, str
1222 /* 1230 /*
1223 * This is the error that has historically 1231 * This is the error that has historically
1224 * been returned, and some callers may 1232 * been returned, and some callers may
1225 * expect it. 1233 * expect it.
1226 */ 1234 */
1227 error = EMSGSIZE; 1235 error = EMSGSIZE;
1228 } 1236 }
1229 goto restart; 1237 goto restart;
1230 } 1238 }
1231 } 1239 }
1232 1240
1233 /* 1241 /*
1234 * Now that adding them has succeeded, update all of the 1242 * Now that adding them has succeeded, update all of the
1235 * descriptor passing state. 1243 * file passing state and affix the descriptors.
1236 */ 1244 */
1237 rp = (file_t **)CMSG_DATA(cm); 1245 rp = (file_t **)CMSG_DATA(cm);
1238 for (i = 0; i < nfds; i++) { 1246 for (i = 0; i < nfds; i++) {
1239 fp = *rp++; 1247 fp = *rp++;
1240 atomic_dec_uint(&unp_rights); 1248 atomic_dec_uint(&unp_rights);
1241 fd_affix(p, fp, fdp[i]); 1249 fd_affix(p, fp, fdp[i]);
1242 mutex_enter(&fp->f_lock); 1250 mutex_enter(&fp->f_lock);
1243 fp->f_msgcount--; 1251 fp->f_msgcount--;
1244 mutex_exit(&fp->f_lock); 1252 mutex_exit(&fp->f_lock);
1245 /* 1253 /*
1246 * Note that fd_affix() adds a reference to the file. 1254 * Note that fd_affix() adds a reference to the file.
1247 * The file may already have been closed by another 1255 * The file may already have been closed by another
1248 * LWP in the process, so we must drop the reference 1256 * LWP in the process, so we must drop the reference
@@ -1257,52 +1265,61 @@ unp_externalize(struct mbuf *rights, str @@ -1257,52 +1265,61 @@ unp_externalize(struct mbuf *rights, str
1257 */ 1265 */
1258 memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int)); 1266 memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int));
1259 cm->cmsg_len = CMSG_LEN(nfds * sizeof(int)); 1267 cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
1260 rights->m_len = CMSG_SPACE(nfds * sizeof(int)); 1268 rights->m_len = CMSG_SPACE(nfds * sizeof(int));
1261 out: 1269 out:
1262 rw_exit(&p->p_cwdi->cwdi_lock); 1270 rw_exit(&p->p_cwdi->cwdi_lock);
1263 free(fdp, M_TEMP); 1271 free(fdp, M_TEMP);
1264 return (error); 1272 return (error);
1265} 1273}
1266 1274
1267int 1275int
1268unp_internalize(struct mbuf **controlp) 1276unp_internalize(struct mbuf **controlp)
1269{ 1277{
1270 struct filedesc *fdescp = curlwp->l_fd; 1278 filedesc_t *fdescp = curlwp->l_fd;
1271 struct mbuf *control = *controlp; 1279 struct mbuf *control = *controlp;
1272 struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *); 1280 struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *);
1273 file_t **rp, **files; 1281 file_t **rp, **files;
1274 file_t *fp; 1282 file_t *fp;
1275 int i, fd, *fdp; 1283 int i, fd, *fdp;
1276 int nfds, error; 1284 int nfds, error;
 1285 u_int maxmsg;
1277 1286
1278 error = 0; 1287 error = 0;
1279 newcm = NULL; 1288 newcm = NULL;
1280 1289
1281 /* Sanity check the control message header. */ 1290 /* Sanity check the control message header. */
1282 if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET || 1291 if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
1283 cm->cmsg_len > control->m_len || 1292 cm->cmsg_len > control->m_len ||
1284 cm->cmsg_len < CMSG_ALIGN(sizeof(*cm))) 1293 cm->cmsg_len < CMSG_ALIGN(sizeof(*cm)))
1285 return (EINVAL); 1294 return (EINVAL);
1286 1295
1287 /* 1296 /*
1288 * Verify that the file descriptors are valid, and acquire 1297 * Verify that the file descriptors are valid, and acquire
1289 * a reference to each. 1298 * a reference to each.
1290 */ 1299 */
1291 nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int); 1300 nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int);
1292 fdp = (int *)CMSG_DATA(cm); 1301 fdp = (int *)CMSG_DATA(cm);
 1302 maxmsg = maxfiles / unp_rights_ratio;
1293 for (i = 0; i < nfds; i++) { 1303 for (i = 0; i < nfds; i++) {
1294 fd = *fdp++; 1304 fd = *fdp++;
 1305 if (atomic_inc_uint_nv(&unp_rights) > maxmsg) {
 1306 atomic_dec_uint(&unp_rights);
 1307 nfds = i;
 1308 error = EAGAIN;
 1309 goto out;
 1310 }
1295 if ((fp = fd_getfile(fd)) == NULL) { 1311 if ((fp = fd_getfile(fd)) == NULL) {
 1312 atomic_dec_uint(&unp_rights);
1296 nfds = i; 1313 nfds = i;
1297 error = EBADF; 1314 error = EBADF;
1298 goto out; 1315 goto out;
1299 } 1316 }
1300 } 1317 }
1301 1318
1302 /* Allocate new space and copy header into it. */ 1319 /* Allocate new space and copy header into it. */
1303 newcm = malloc(CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, M_WAITOK); 1320 newcm = malloc(CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, M_WAITOK);
1304 if (newcm == NULL) { 1321 if (newcm == NULL) {
1305 error = E2BIG; 1322 error = E2BIG;
1306 goto out; 1323 goto out;
1307 } 1324 }
1308 memcpy(newcm, cm, sizeof(struct cmsghdr)); 1325 memcpy(newcm, cm, sizeof(struct cmsghdr));
@@ -1314,34 +1331,36 @@ unp_internalize(struct mbuf **controlp) @@ -1314,34 +1331,36 @@ unp_internalize(struct mbuf **controlp)
1314 * int won't get overwritten until we're done. No need to lock, as we have 1331 * int won't get overwritten until we're done. No need to lock, as we have
1315 * already validated the descriptors with fd_getfile(). 1332 * already validated the descriptors with fd_getfile().
1316 */ 1333 */
1317 fdp = (int *)CMSG_DATA(cm) + nfds; 1334 fdp = (int *)CMSG_DATA(cm) + nfds;
1318 rp = files + nfds; 1335 rp = files + nfds;
1319 for (i = 0; i < nfds; i++) { 1336 for (i = 0; i < nfds; i++) {
1320 fp = fdescp->fd_ofiles[*--fdp]->ff_file; 1337 fp = fdescp->fd_ofiles[*--fdp]->ff_file;
1321 KASSERT(fp != NULL); 1338 KASSERT(fp != NULL);
1322 mutex_enter(&fp->f_lock); 1339 mutex_enter(&fp->f_lock);
1323 *--rp = fp; 1340 *--rp = fp;
1324 fp->f_count++; 1341 fp->f_count++;
1325 fp->f_msgcount++; 1342 fp->f_msgcount++;
1326 mutex_exit(&fp->f_lock); 1343 mutex_exit(&fp->f_lock);
1327 atomic_inc_uint(&unp_rights); 
1328 } 1344 }
1329 1345
1330 out: 1346 out:
1331 /* Release descriptor references. */ 1347 /* Release descriptor references. */
1332 fdp = (int *)CMSG_DATA(cm); 1348 fdp = (int *)CMSG_DATA(cm);
1333 for (i = 0; i < nfds; i++) { 1349 for (i = 0; i < nfds; i++) {
1334 fd_putfile(*fdp++); 1350 fd_putfile(*fdp++);
 1351 if (error != 0) {
 1352 atomic_dec_uint(&unp_rights);
 1353 }
1335 } 1354 }
1336 1355
1337 if (error == 0) { 1356 if (error == 0) {
1338 if (control->m_flags & M_EXT) { 1357 if (control->m_flags & M_EXT) {
1339 m_freem(control); 1358 m_freem(control);
1340 *controlp = control = m_get(M_WAIT, MT_CONTROL); 1359 *controlp = control = m_get(M_WAIT, MT_CONTROL);
1341 } 1360 }
1342 MEXTADD(control, newcm, CMSG_SPACE(nfds * sizeof(file_t *)), 1361 MEXTADD(control, newcm, CMSG_SPACE(nfds * sizeof(file_t *)),
1343 M_MBUF, NULL, NULL); 1362 M_MBUF, NULL, NULL);
1344 cm = newcm; 1363 cm = newcm;
1345 /* 1364 /*
1346 * Adjust message & mbuf to note amount of space 1365 * Adjust message & mbuf to note amount of space
1347 * actually used. 1366 * actually used.
@@ -1394,297 +1413,340 @@ unp_addsockcred(struct lwp *l, struct mb @@ -1394,297 +1413,340 @@ unp_addsockcred(struct lwp *l, struct mb
1394 /* 1413 /*
1395 * If a control message already exists, append us to the end. 1414 * If a control message already exists, append us to the end.
1396 */ 1415 */
1397 if (control != NULL) { 1416 if (control != NULL) {
1398 for (n = control; n->m_next != NULL; n = n->m_next) 1417 for (n = control; n->m_next != NULL; n = n->m_next)
1399 ; 1418 ;
1400 n->m_next = m; 1419 n->m_next = m;
1401 } else 1420 } else
1402 control = m; 1421 control = m;
1403 1422
1404 return (control); 1423 return (control);
1405} 1424}
1406 1425
1407int unp_defer, unp_gcing; 
1408extern struct domain unixdomain; 
1409 
1410/* 1426/*
1411 * Comment added long after the fact explaining what's going on here. 1427 * Do a mark-sweep GC of files in the system, to free up any which are
1412 * Do a mark-sweep GC of file descriptors on the system, to free up 1428 * caught in flight to an about-to-be-closed socket. Additionally,
1413 * any which are caught in flight to an about-to-be-closed socket. 1429 * process deferred file closures.
1414 * 
1415 * Traditional mark-sweep gc's start at the "root", and mark 
1416 * everything reachable from the root (which, in our case would be the 
1417 * process table). The mark bits are cleared during the sweep. 
1418 * 
1419 * XXX For some inexplicable reason (perhaps because the file 
1420 * descriptor tables used to live in the u area which could be swapped 
1421 * out and thus hard to reach), we do multiple scans over the set of 
1422 * descriptors, using use *two* mark bits per object (DEFER and MARK). 
1423 * Whenever we find a descriptor which references other descriptors, 
1424 * the ones it references are marked with both bits, and we iterate 
1425 * over the whole file table until there are no more DEFER bits set. 
1426 * We also make an extra pass *before* the GC to clear the mark bits, 
1427 * which could have been cleared at almost no cost during the previous 
1428 * sweep. 
1429 */ 1430 */
1430void 1431static void
1431unp_gc(void) 1432unp_gc(file_t *dp)
1432{ 1433{
1433 file_t *fp, *nextfp; 1434 extern struct domain unixdomain;
 1435 file_t *fp, *np;
1434 struct socket *so, *so1; 1436 struct socket *so, *so1;
1435 file_t **extra_ref, **fpp; 1437 u_int i, old, new;
1436 int nunref, nslots, i; 1438 bool didwork;
1437 1439
1438 if (atomic_swap_uint(&unp_gcing, 1) == 1) 1440 KASSERT(curlwp == unp_thread_lwp);
1439 return; 1441 KASSERT(mutex_owned(&filelist_lock));
1440 1442
1441 restart: 1443 /*
1442 nslots = nfiles * 2; 1444 * First, process deferred file closures.
1443 extra_ref = kmem_alloc(nslots * sizeof(file_t *), KM_SLEEP); 1445 */
 1446 while (!SLIST_EMPTY(&unp_thread_discard)) {
 1447 fp = SLIST_FIRST(&unp_thread_discard);
 1448 KASSERT(fp->f_unpcount > 0);
 1449 KASSERT(fp->f_count > 0);
 1450 KASSERT(fp->f_msgcount > 0);
 1451 KASSERT(fp->f_count >= fp->f_unpcount);
 1452 KASSERT(fp->f_count >= fp->f_msgcount);
 1453 KASSERT(fp->f_msgcount >= fp->f_unpcount);
 1454 SLIST_REMOVE_HEAD(&unp_thread_discard, f_unplist);
 1455 i = fp->f_unpcount;
 1456 fp->f_unpcount = 0;
 1457 mutex_exit(&filelist_lock);
 1458 for (; i != 0; i--) {
 1459 unp_discard_now(fp);
 1460 }
 1461 mutex_enter(&filelist_lock);
 1462 }
1444 1463
1445 mutex_enter(&filelist_lock); 1464 /*
 1465 * Clear mark bits. Ensure that we don't consider new files
 1466 * entering the file table during this loop (they will not have
 1467 * FSCAN set).
 1468 */
1446 unp_defer = 0; 1469 unp_defer = 0;
1447 
1448 /* Clear mark bits */ 
1449 LIST_FOREACH(fp, &filehead, f_list) { 1470 LIST_FOREACH(fp, &filehead, f_list) {
1450 atomic_and_uint(&fp->f_flag, ~(FMARK|FDEFER)); 1471 for (old = fp->f_flag;; old = new) {
 1472 new = atomic_cas_uint(&fp->f_flag, old,
 1473 (old | FSCAN) & ~(FMARK|FDEFER));
 1474 if (__predict_true(old == new)) {
 1475 break;
 1476 }
 1477 }
1451 } 1478 }
1452 1479
1453 /* 1480 /*
1454 * Iterate over the set of descriptors, marking ones believed 1481 * Iterate over the set of sockets, marking ones believed (based on
1455 * (based on refcount) to be referenced from a process, and 1482 * refcount) to be referenced from a process, and marking for rescan
1456 * marking for rescan descriptors which are queued on a socket. 1483 * sockets which are queued on a socket. Rescan continues descending
 1484 * and searching for sockets referenced by sockets (FDEFER), until
 1485 * there are no more socket->socket references to be discovered.
1457 */ 1486 */
1458 do { 1487 do {
1459 LIST_FOREACH(fp, &filehead, f_list) { 1488 didwork = false;
 1489 for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
 1490 KASSERT(mutex_owned(&filelist_lock));
 1491 np = LIST_NEXT(fp, f_list);
1460 mutex_enter(&fp->f_lock); 1492 mutex_enter(&fp->f_lock);
1461 if (fp->f_flag & FDEFER) { 1493 if ((fp->f_flag & FDEFER) != 0) {
1462 atomic_and_uint(&fp->f_flag, ~FDEFER); 1494 atomic_and_uint(&fp->f_flag, ~FDEFER);
1463 unp_defer--; 1495 unp_defer--;
1464 KASSERT(fp->f_count != 0); 1496 KASSERT(fp->f_count != 0);
1465 } else { 1497 } else {
1466 if (fp->f_count == 0 || 1498 if (fp->f_count == 0 ||
1467 (fp->f_flag & FMARK) || 1499 (fp->f_flag & FMARK) != 0 ||
1468 fp->f_count == fp->f_msgcount) { 1500 fp->f_count == fp->f_msgcount ||
 1501 fp->f_unpcount != 0) {
1469 mutex_exit(&fp->f_lock); 1502 mutex_exit(&fp->f_lock);
1470 continue; 1503 continue;
1471 } 1504 }
1472 } 1505 }
1473 atomic_or_uint(&fp->f_flag, FMARK); 1506 atomic_or_uint(&fp->f_flag, FMARK);
1474 1507
1475 if (fp->f_type != DTYPE_SOCKET || 1508 if (fp->f_type != DTYPE_SOCKET ||
1476 (so = fp->f_data) == NULL || 1509 (so = fp->f_data) == NULL ||
1477 so->so_proto->pr_domain != &unixdomain || 1510 so->so_proto->pr_domain != &unixdomain ||
1478 (so->so_proto->pr_flags&PR_RIGHTS) == 0) { 1511 (so->so_proto->pr_flags & PR_RIGHTS) == 0) {
1479 mutex_exit(&fp->f_lock); 1512 mutex_exit(&fp->f_lock);
1480 continue; 1513 continue;
1481 } 1514 }
1482#ifdef notdef 1515
1483 if (so->so_rcv.sb_flags & SB_LOCK) { 1516 /* Gain file ref, mark our position, and unlock. */
1484 mutex_exit(&fp->f_lock); 1517 didwork = true;
1485 mutex_exit(&filelist_lock); 1518 LIST_INSERT_AFTER(fp, dp, f_list);
1486 kmem_free(extra_ref, nslots * sizeof(file_t *)); 1519 fp->f_count++;
1487 /* 
1488 * This is problematical; it's not clear 
1489 * we need to wait for the sockbuf to be 
1490 * unlocked (on a uniprocessor, at least), 
1491 * and it's also not clear what to do 
1492 * if sbwait returns an error due to receipt 
1493 * of a signal. If sbwait does return 
1494 * an error, we'll go into an infinite 
1495 * loop. Delete all of this for now. 
1496 */ 
1497 (void) sbwait(&so->so_rcv); 
1498 goto restart; 
1499 } 
1500#endif 
1501 mutex_exit(&fp->f_lock); 1520 mutex_exit(&fp->f_lock);
 1521 mutex_exit(&filelist_lock);
1502 1522
1503 /* 1523 /*
1504 * XXX Locking a socket with filelist_lock held 1524 * Mark files referenced from sockets queued on the
1505 * is ugly. filelist_lock can be taken by the 1525 * accept queue as well.
1506 * pagedaemon when reclaiming items from file_cache. 
1507 * Socket activity could delay the pagedaemon. 
1508 */ 1526 */
1509 solock(so); 1527 solock(so);
1510 unp_scan(so->so_rcv.sb_mb, unp_mark, 0); 1528 unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
1511 /* 1529 if ((so->so_options & SO_ACCEPTCONN) != 0) {
1512 * Mark descriptors referenced from sockets queued 
1513 * on the accept queue as well. 
1514 */ 
1515 if (so->so_options & SO_ACCEPTCONN) { 
1516 TAILQ_FOREACH(so1, &so->so_q0, so_qe) { 1530 TAILQ_FOREACH(so1, &so->so_q0, so_qe) {
1517 unp_scan(so1->so_rcv.sb_mb, unp_mark, 0); 1531 unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
1518 } 1532 }
1519 TAILQ_FOREACH(so1, &so->so_q, so_qe) { 1533 TAILQ_FOREACH(so1, &so->so_q, so_qe) {
1520 unp_scan(so1->so_rcv.sb_mb, unp_mark, 0); 1534 unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
1521 } 1535 }
1522 } 1536 }
1523 sounlock(so); 1537 sounlock(so);
 1538
 1539 /* Re-lock and restart from where we left off. */
 1540 closef(fp);
 1541 mutex_enter(&filelist_lock);
 1542 np = LIST_NEXT(dp, f_list);
 1543 LIST_REMOVE(dp, f_list);
1524 } 1544 }
1525 } while (unp_defer); 1545 /*
 1546 * Bail early if we did nothing in the loop above. Could
 1547 * happen because of concurrent activity causing unp_defer
 1548 * to get out of sync.
 1549 */
 1550 } while (unp_defer != 0 && didwork);
1526 1551
1527 /* 1552 /*
1528 * Sweep pass. Find unmarked descriptors, and free them. 1553 * Sweep pass.
1529 * 
1530 * We grab an extra reference to each of the file table entries 
1531 * that are not otherwise accessible and then free the rights 
1532 * that are stored in messages on them. 
1533 * 
1534 * The bug in the original code is a little tricky, so I'll describe 
1535 * what's wrong with it here. 
1536 * 
1537 * It is incorrect to simply unp_discard each entry for f_msgcount 
1538 * times -- consider the case of sockets A and B that contain 
1539 * references to each other. On a last close of some other socket, 
1540 * we trigger a gc since the number of outstanding rights (unp_rights) 
1541 * is non-zero. If during the sweep phase the gc code un_discards, 
1542 * we end up doing a (full) closef on the descriptor. A closef on A 
1543 * results in the following chain. Closef calls soo_close, which 
1544 * calls soclose. Soclose calls first (through the switch 
1545 * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply 
1546 * returns because the previous instance had set unp_gcing, and 
1547 * we return all the way back to soclose, which marks the socket 
1548 * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush 
1549 * to free up the rights that are queued in messages on the socket A, 
1550 * i.e., the reference on B. The sorflush calls via the dom_dispose 
1551 * switch unp_dispose, which unp_scans with unp_discard. This second 
1552 * instance of unp_discard just calls closef on B. 
1553 * 1554 *
1554 * Well, a similar chain occurs on B, resulting in a sorflush on B, 1555 * We grab an extra reference to each of the files that are
1555 * which results in another closef on A. Unfortunately, A is already 1556 * not otherwise accessible and then free the rights that are
1556 * being closed, and the descriptor has already been marked with 1557 * stored in messages on them.
1557 * SS_NOFDREF, and soclose panics at this point. 
1558 * 
1559 * Here, we first take an extra reference to each inaccessible 
1560 * descriptor. Then, if the inaccessible descriptor is a 
1561 * socket, we call sorflush in case it is a Unix domain 
1562 * socket. After we destroy all the rights carried in 
1563 * messages, we do a last closef to get rid of our extra 
1564 * reference. This is the last close, and the unp_detach etc 
1565 * will shut down the socket. 
1566 * 
1567 * 91/09/19, bsy@cs.cmu.edu 
1568 */ 1558 */
1569 if (nslots < nfiles) { 1559 for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
1570 mutex_exit(&filelist_lock); 1560 KASSERT(mutex_owned(&filelist_lock));
1571 kmem_free(extra_ref, nslots * sizeof(file_t *)); 1561 np = LIST_NEXT(fp, f_list);
1572 goto restart; 
1573 } 
1574 for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != 0; 
1575 fp = nextfp) { 
1576 nextfp = LIST_NEXT(fp, f_list); 
1577 mutex_enter(&fp->f_lock); 1562 mutex_enter(&fp->f_lock);
1578 if (fp->f_count != 0 && 1563
1579 fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) { 1564 /*
1580 *fpp++ = fp; 1565 * Ignore non-sockets.
1581 nunref++; 1566 * Ignore dead sockets, or sockets with pending close.
1582 fp->f_count++; 1567 * Ignore sockets obviously referenced elsewhere.
 1568 * Ignore sockets marked as referenced by our scan.
 1569 * Ignore new sockets that did not exist during the scan.
 1570 */
 1571 if (fp->f_type != DTYPE_SOCKET ||
 1572 fp->f_count == 0 || fp->f_unpcount != 0 ||
 1573 fp->f_count != fp->f_msgcount ||
 1574 (fp->f_flag & (FMARK | FSCAN)) != FSCAN) {
 1575 mutex_exit(&fp->f_lock);
 1576 continue;
1583 } 1577 }
 1578
 1579 /* Gain file ref, mark our position, and unlock. */
 1580 LIST_INSERT_AFTER(fp, dp, f_list);
 1581 fp->f_count++;
1584 mutex_exit(&fp->f_lock); 1582 mutex_exit(&fp->f_lock);
 1583 mutex_exit(&filelist_lock);
 1584
 1585 /*
 1586 * Flush all data from the socket's receive buffer.
 1587 * This will cause files referenced only by the
 1588 * socket to be queued for close.
 1589 */
 1590 so = fp->f_data;
 1591 solock(so);
 1592 sorflush(so);
 1593 sounlock(so);
 1594
 1595 /* Re-lock and restart from where we left off. */
 1596 closef(fp);
 1597 mutex_enter(&filelist_lock);
 1598 np = LIST_NEXT(dp, f_list);
 1599 LIST_REMOVE(dp, f_list);
1585 } 1600 }
1586 mutex_exit(&filelist_lock); 1601}
1587 1602
1588 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { 1603/*
1589 fp = *fpp; 1604 * Garbage collector thread. While SCM_RIGHTS messages are in transit,
1590 if (fp->f_type == DTYPE_SOCKET) { 1605 * wake once per second to garbage collect. Run continually while we
1591 so = fp->f_data; 1606 * have deferred closes to process.
1592 solock(so); 1607 */
1593 sorflush(fp->f_data); 1608static void
1594 sounlock(so); 1609unp_thread(void *cookie)
 1610{
 1611 file_t *dp;
 1612
 1613 /* Allocate a dummy file for our scans. */
 1614 if ((dp = fgetdummy()) == NULL) {
 1615 panic("unp_thread");
 1616 }
 1617
 1618 mutex_enter(&filelist_lock);
 1619 for (;;) {
 1620 KASSERT(mutex_owned(&filelist_lock));
 1621 if (SLIST_EMPTY(&unp_thread_discard)) {
 1622 if (unp_rights != 0) {
 1623 (void)cv_timedwait(&unp_thread_cv,
 1624 &filelist_lock, hz);
 1625 } else {
 1626 cv_wait(&unp_thread_cv, &filelist_lock);
 1627 }
1595 } 1628 }
 1629 unp_gc(dp);
1596 } 1630 }
1597 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { 1631 /* NOTREACHED */
1598 closef(*fpp); 1632}
 1633
 1634/*
 1635 * Kick the garbage collector into action if there is something for
 1636 * it to process.
 1637 */
 1638static void
 1639unp_thread_kick(void)
 1640{
 1641
 1642 if (!SLIST_EMPTY(&unp_thread_discard) || unp_rights != 0) {
 1643 mutex_enter(&filelist_lock);
 1644 cv_signal(&unp_thread_cv);
 1645 mutex_exit(&filelist_lock);
1599 } 1646 }
1600 kmem_free(extra_ref, nslots * sizeof(file_t *)); 
1601 atomic_swap_uint(&unp_gcing, 0); 
1602} 1647}
1603 1648
1604void 1649void
1605unp_dispose(struct mbuf *m) 1650unp_dispose(struct mbuf *m)
1606{ 1651{
1607 1652
1608 if (m) 1653 if (m)
1609 unp_scan(m, unp_discard, 1); 1654 unp_scan(m, unp_discard_later, 1);
1610} 1655}
1611 1656
1612void 1657void
1613unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard) 1658unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard)
1614{ 1659{
1615 struct mbuf *m; 1660 struct mbuf *m;
1616 file_t **rp; 1661 file_t **rp, *fp;
1617 struct cmsghdr *cm; 1662 struct cmsghdr *cm;
1618 int i; 1663 int i, qfds;
1619 int qfds; 
1620 1664
1621 while (m0) { 1665 while (m0) {
1622 for (m = m0; m; m = m->m_next) { 1666 for (m = m0; m; m = m->m_next) {
1623 if (m->m_type == MT_CONTROL && 1667 if (m->m_type != MT_CONTROL ||
1624 m->m_len >= sizeof(*cm)) { 1668 m->m_len < sizeof(*cm)) {
1625 cm = mtod(m, struct cmsghdr *); 1669 continue;
1626 if (cm->cmsg_level != SOL_SOCKET || 1670 }
1627 cm->cmsg_type != SCM_RIGHTS) 1671 cm = mtod(m, struct cmsghdr *);
1628 continue; 1672 if (cm->cmsg_level != SOL_SOCKET ||
1629 qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) 1673 cm->cmsg_type != SCM_RIGHTS)
1630 / sizeof(file_t *); 1674 continue;
1631 rp = (file_t **)CMSG_DATA(cm); 1675 qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm)))
1632 for (i = 0; i < qfds; i++) { 1676 / sizeof(file_t *);
1633 file_t *fp = *rp; 1677 rp = (file_t **)CMSG_DATA(cm);
1634 if (discard) 1678 for (i = 0; i < qfds; i++) {
1635 *rp = 0; 1679 fp = *rp;
1636 (*op)(fp); 1680 if (discard) {
1637 rp++; 1681 *rp = 0;
1638 } 1682 }
1639 break; /* XXX, but saves time */ 1683 (*op)(fp);
 1684 rp++;
1640 } 1685 }
1641 } 1686 }
1642 m0 = m0->m_nextpkt; 1687 m0 = m0->m_nextpkt;
1643 } 1688 }
1644} 1689}
1645 1690
1646void 1691void
1647unp_mark(file_t *fp) 1692unp_mark(file_t *fp)
1648{ 1693{
1649 1694
1650 if (fp == NULL) 1695 if (fp == NULL)
1651 return; 1696 return;
1652 1697
1653 /* If we're already deferred, don't screw up the defer count */ 1698 /* If we're already deferred, don't screw up the defer count */
1654 mutex_enter(&fp->f_lock); 1699 mutex_enter(&fp->f_lock);
1655 if (fp->f_flag & (FMARK | FDEFER)) { 1700 if (fp->f_flag & (FMARK | FDEFER)) {
1656 mutex_exit(&fp->f_lock); 1701 mutex_exit(&fp->f_lock);
1657 return; 1702 return;
1658 } 1703 }
1659 1704
1660 /* 1705 /*
1661 * Minimize the number of deferrals... Sockets are the only 1706 * Minimize the number of deferrals... Sockets are the only type of
1662 * type of descriptor which can hold references to another 1707 * file which can hold references to another file, so just mark
1663 * descriptor, so just mark other descriptors, and defer 1708 * other files, and defer unmarked sockets for the next pass.
1664 * unmarked sockets for the next pass. 
1665 */ 1709 */
1666 if (fp->f_type == DTYPE_SOCKET) { 1710 if (fp->f_type == DTYPE_SOCKET) {
1667 unp_defer++; 1711 unp_defer++;
1668 KASSERT(fp->f_count != 0); 1712 KASSERT(fp->f_count != 0);
1669 atomic_or_uint(&fp->f_flag, FDEFER); 1713 atomic_or_uint(&fp->f_flag, FDEFER);
1670 } else { 1714 } else {
1671 atomic_or_uint(&fp->f_flag, FMARK); 1715 atomic_or_uint(&fp->f_flag, FMARK);
1672 } 1716 }
1673 mutex_exit(&fp->f_lock); 1717 mutex_exit(&fp->f_lock);
1674 return; 
1675} 1718}
1676 1719
1677void 1720static void
1678unp_discard(file_t *fp) 1721unp_discard_now(file_t *fp)
1679{ 1722{
1680 1723
1681 if (fp == NULL) 1724 if (fp == NULL)
1682 return; 1725 return;
1683 1726
1684 mutex_enter(&fp->f_lock); 
1685 KASSERT(fp->f_count > 0); 1727 KASSERT(fp->f_count > 0);
 1728 KASSERT(fp->f_msgcount > 0);
 1729
 1730 mutex_enter(&fp->f_lock);
1686 fp->f_msgcount--; 1731 fp->f_msgcount--;
1687 mutex_exit(&fp->f_lock); 1732 mutex_exit(&fp->f_lock);
1688 atomic_dec_uint(&unp_rights); 1733 atomic_dec_uint(&unp_rights);
1689 (void)closef(fp); 1734 (void)closef(fp);
1690} 1735}
 1736
 1737static void
 1738unp_discard_later(file_t *fp)
 1739{
 1740
 1741 if (fp == NULL)
 1742 return;
 1743
 1744 KASSERT(fp->f_count > 0);
 1745 KASSERT(fp->f_msgcount > 0);
 1746
 1747 mutex_enter(&filelist_lock);
 1748 if (fp->f_unpcount++ == 0) {
 1749 SLIST_INSERT_HEAD(&unp_thread_discard, fp, f_unplist);
 1750 }
 1751 mutex_exit(&filelist_lock);
 1752}

cvs diff -r1.34 -r1.34.64.1 src/sys/sys/fcntl.h (expand / switch to unified diff)

--- src/sys/sys/fcntl.h 2006/10/05 14:48:33 1.34
+++ src/sys/sys/fcntl.h 2009/03/18 05:33:23 1.34.64.1
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: fcntl.h,v 1.34 2006/10/05 14:48:33 chs Exp $ */ 1/* $NetBSD: fcntl.h,v 1.34.64.1 2009/03/18 05:33:23 snj Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 1983, 1990, 1993 4 * Copyright (c) 1983, 1990, 1993
5 * The Regents of the University of California. All rights reserved. 5 * The Regents of the University of California. All rights reserved.
6 * (c) UNIX System Laboratories, Inc. 6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed 7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph 8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc. 10 * the permission of UNIX System Laboratories, Inc.
11 * 11 *
12 * Redistribution and use in source and binary forms, with or without 12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions 13 * modification, are permitted provided that the following conditions
14 * are met: 14 * are met:
@@ -115,26 +115,27 @@ @@ -115,26 +115,27 @@
115#ifdef _KERNEL 115#ifdef _KERNEL
116/* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */ 116/* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */
117#define FFLAGS(oflags) ((oflags) + 1) 117#define FFLAGS(oflags) ((oflags) + 1)
118#define OFLAGS(fflags) ((fflags) - 1) 118#define OFLAGS(fflags) ((fflags) - 1)
119 119
120/* all bits settable during open(2) */ 120/* all bits settable during open(2) */
121#define O_MASK (O_ACCMODE|O_NONBLOCK|O_APPEND|O_SHLOCK|O_EXLOCK|\ 121#define O_MASK (O_ACCMODE|O_NONBLOCK|O_APPEND|O_SHLOCK|O_EXLOCK|\
122 O_ASYNC|O_SYNC|O_CREAT|O_TRUNC|O_EXCL|O_DSYNC|\ 122 O_ASYNC|O_SYNC|O_CREAT|O_TRUNC|O_EXCL|O_DSYNC|\
123 O_RSYNC|O_NOCTTY|O_ALT_IO|O_NOFOLLOW|O_DIRECT) 123 O_RSYNC|O_NOCTTY|O_ALT_IO|O_NOFOLLOW|O_DIRECT)
124 124
125#define FMARK 0x00001000 /* mark during gc() */ 125#define FMARK 0x00001000 /* mark during gc() */
126#define FDEFER 0x00002000 /* defer for next gc pass */ 126#define FDEFER 0x00002000 /* defer for next gc pass */
127#define FHASLOCK 0x00004000 /* descriptor holds advisory lock */ 127#define FHASLOCK 0x00004000 /* descriptor holds advisory lock */
 128#define FSCAN 0x00100000 /* scan during gc passes */
128#define FKIOCTL 0x80000000 /* kernel originated ioctl */ 129#define FKIOCTL 0x80000000 /* kernel originated ioctl */
129/* bits settable by fcntl(F_SETFL, ...) */ 130/* bits settable by fcntl(F_SETFL, ...) */
130#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FDSYNC|FRSYNC|FALTIO|\ 131#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FDSYNC|FRSYNC|FALTIO|\
131 FDIRECT) 132 FDIRECT)
132/* bits to save after open(2) */ 133/* bits to save after open(2) */
133#define FMASK (FREAD|FWRITE|FCNTLFLAGS) 134#define FMASK (FREAD|FWRITE|FCNTLFLAGS)
134#endif /* _KERNEL */ 135#endif /* _KERNEL */
135 136
136/* 137/*
137 * The O_* flags used to have only F* names, which were used in the kernel 138 * The O_* flags used to have only F* names, which were used in the kernel
138 * and by fcntl. We retain the F* names for the kernel f_flags field 139 * and by fcntl. We retain the F* names for the kernel f_flags field
139 * and for backward compatibility for fcntl. 140 * and for backward compatibility for fcntl.
140 */ 141 */

cvs diff -r1.65 -r1.65.6.1 src/sys/sys/file.h (expand / switch to unified diff)

--- src/sys/sys/file.h 2008/06/24 10:26:27 1.65
+++ src/sys/sys/file.h 2009/03/18 05:33:23 1.65.6.1
@@ -1,14 +1,43 @@ @@ -1,14 +1,43 @@
1/* $NetBSD: file.h,v 1.65 2008/06/24 10:26:27 gmcgarry Exp $ */ 1/* $NetBSD: file.h,v 1.65.6.1 2009/03/18 05:33:23 snj Exp $ */
 2
 3/*-
 4 * Copyright (c) 2009 The NetBSD Foundation, Inc.
 5 * All rights reserved.
 6 *
 7 * This code is derived from software contributed to The NetBSD Foundation
 8 * by Andrew Doran.
 9 *
 10 * Redistribution and use in source and binary forms, with or without
 11 * modification, are permitted provided that the following conditions
 12 * are met:
 13 * 1. Redistributions of source code must retain the above copyright
 14 * notice, this list of conditions and the following disclaimer.
 15 * 2. Redistributions in binary form must reproduce the above copyright
 16 * notice, this list of conditions and the following disclaimer in the
 17 * documentation and/or other materials provided with the distribution.
 18 *
 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 29 * POSSIBILITY OF SUCH DAMAGE.
 30 */
2 31
3/* 32/*
4 * Copyright (c) 1982, 1986, 1989, 1993 33 * Copyright (c) 1982, 1986, 1989, 1993
5 * The Regents of the University of California. All rights reserved. 34 * The Regents of the University of California. All rights reserved.
6 * 35 *
7 * Redistribution and use in source and binary forms, with or without 36 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 37 * modification, are permitted provided that the following conditions
9 * are met: 38 * are met:
10 * 1. Redistributions of source code must retain the above copyright 39 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 40 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 41 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 42 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 43 * documentation and/or other materials provided with the distribution.
@@ -43,60 +72,65 @@ @@ -43,60 +72,65 @@
43#include <sys/mutex.h> 72#include <sys/mutex.h>
44#include <sys/condvar.h> 73#include <sys/condvar.h>
45 74
46struct proc; 75struct proc;
47struct lwp; 76struct lwp;
48struct uio; 77struct uio;
49struct iovec; 78struct iovec;
50struct stat; 79struct stat;
51struct knote; 80struct knote;
52 81
53/* 82/*
54 * Kernel file descriptor. One entry for each open kernel vnode and 83 * Kernel file descriptor. One entry for each open kernel vnode and
55 * socket. 84 * socket.
 85 *
 86 * This structure is exported via the KERN_FILE and KERN_FILE2 sysctl
 87 * calls. Only add members to the end, do not delete them.
56 */ 88 */
57struct file { 89struct file {
58 off_t f_offset; /* first, is 64-bit */ 90 off_t f_offset; /* first, is 64-bit */
59 kauth_cred_t f_cred; /* creds associated with descriptor */ 91 kauth_cred_t f_cred; /* creds associated with descriptor */
60 const struct fileops { 92 const struct fileops {
61 int (*fo_read) (struct file *, off_t *, struct uio *, 93 int (*fo_read) (struct file *, off_t *, struct uio *,
62 kauth_cred_t, int); 94 kauth_cred_t, int);
63 int (*fo_write) (struct file *, off_t *, struct uio *, 95 int (*fo_write) (struct file *, off_t *, struct uio *,
64 kauth_cred_t, int); 96 kauth_cred_t, int);
65 int (*fo_ioctl) (struct file *, u_long, void *); 97 int (*fo_ioctl) (struct file *, u_long, void *);
66 int (*fo_fcntl) (struct file *, u_int, void *); 98 int (*fo_fcntl) (struct file *, u_int, void *);
67 int (*fo_poll) (struct file *, int); 99 int (*fo_poll) (struct file *, int);
68 int (*fo_stat) (struct file *, struct stat *); 100 int (*fo_stat) (struct file *, struct stat *);
69 int (*fo_close) (struct file *); 101 int (*fo_close) (struct file *);
70 int (*fo_kqfilter) (struct file *, struct knote *); 102 int (*fo_kqfilter) (struct file *, struct knote *);
71 } *f_ops; 103 } *f_ops;
72 void *f_data; /* descriptor data, e.g. vnode/socket */ 104 void *f_data; /* descriptor data, e.g. vnode/socket */
73 LIST_ENTRY(file) f_list; /* list of active files */ 105 LIST_ENTRY(file) f_list; /* list of active files */
74 kmutex_t f_lock; /* lock on structure */ 106 kmutex_t f_lock; /* lock on structure */
75 int f_flag; /* see fcntl.h */ 107 int f_flag; /* see fcntl.h */
76 u_int f_iflags; /* internal flags; FIF_* */ 108 u_int f_unused1; /* unused; was internal flags; FIF_* */
77#define DTYPE_VNODE 1 /* file */ 109#define DTYPE_VNODE 1 /* file */
78#define DTYPE_SOCKET 2 /* communications endpoint */ 110#define DTYPE_SOCKET 2 /* communications endpoint */
79#define DTYPE_PIPE 3 /* pipe */ 111#define DTYPE_PIPE 3 /* pipe */
80#define DTYPE_KQUEUE 4 /* event queue */ 112#define DTYPE_KQUEUE 4 /* event queue */
81#define DTYPE_MISC 5 /* misc file descriptor type */ 113#define DTYPE_MISC 5 /* misc file descriptor type */
82#define DTYPE_CRYPTO 6 /* crypto */ 114#define DTYPE_CRYPTO 6 /* crypto */
83#define DTYPE_MQUEUE 7 /* message queue */ 115#define DTYPE_MQUEUE 7 /* message queue */
84#define DTYPE_NAMES \ 116#define DTYPE_NAMES \
85 "0", "file", "socket", "pipe", "kqueue", "misc", "crypto", "mqueue" 117 "0", "file", "socket", "pipe", "kqueue", "misc", "crypto", "mqueue"
86 u_int f_type; /* descriptor type */ 118 u_int f_type; /* descriptor type */
87 u_int f_advice; /* access pattern hint; UVM_ADV_* */ 119 u_int f_advice; /* access pattern hint; UVM_ADV_* */
88 u_int f_count; /* reference count */ 120 u_int f_count; /* reference count */
89 u_int f_msgcount; /* references from message queue */ 121 u_int f_msgcount; /* references from message queue */
 122 u_int f_unpcount; /* deferred close: see uipc_usrreq.c */
 123 SLIST_ENTRY(file) f_unplist; /* deferred close: see uipc_usrreq.c */
90}; 124};
91 125
92#define FILE_LOCK(fp) mutex_enter(&(fp)->f_lock) 126#define FILE_LOCK(fp) mutex_enter(&(fp)->f_lock)
93#define FILE_UNLOCK(fp) mutex_exit(&(fp)->f_lock) 127#define FILE_UNLOCK(fp) mutex_exit(&(fp)->f_lock)
94 128
95/* 129/*
96 * Flags for fo_read and fo_write and do_fileread/write/v 130 * Flags for fo_read and fo_write and do_fileread/write/v
97 */ 131 */
98#define FOF_UPDATE_OFFSET 0x0001 /* update the file offset */ 132#define FOF_UPDATE_OFFSET 0x0001 /* update the file offset */
99#define FOF_IOV_SYSSPACE 0x0100 /* iov structure in kernel memory */ 133#define FOF_IOV_SYSSPACE 0x0100 /* iov structure in kernel memory */
100 134
101LIST_HEAD(filelist, file); 135LIST_HEAD(filelist, file);
102extern struct filelist filehead; /* head of list of open files */ 136extern struct filelist filehead; /* head of list of open files */

cvs diff -r1.330.4.3 -r1.330.4.4 src/sys/sys/param.h (expand / switch to unified diff)

--- src/sys/sys/param.h 2009/02/09 00:22:09 1.330.4.3
+++ src/sys/sys/param.h 2009/03/18 05:33:23 1.330.4.4
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: param.h,v 1.330.4.3 2009/02/09 00:22:09 snj Exp $ */ 1/* $NetBSD: param.h,v 1.330.4.4 2009/03/18 05:33:23 snj Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 1982, 1986, 1989, 1993 4 * Copyright (c) 1982, 1986, 1989, 1993
5 * The Regents of the University of California. All rights reserved. 5 * The Regents of the University of California. All rights reserved.
6 * (c) UNIX System Laboratories, Inc. 6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed 7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph 8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc. 10 * the permission of UNIX System Laboratories, Inc.
11 * 11 *
12 * Redistribution and use in source and binary forms, with or without 12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions 13 * modification, are permitted provided that the following conditions
14 * are met: 14 * are met:
@@ -53,27 +53,27 @@ @@ -53,27 +53,27 @@
53 * m = minor version; a minor number of 99 indicates current. 53 * m = minor version; a minor number of 99 indicates current.
54 * r = 0 (*) 54 * r = 0 (*)
55 * p = patchlevel 55 * p = patchlevel
56 * 56 *
57 * When new releases are made, src/gnu/usr.bin/groff/tmac/mdoc.local 57 * When new releases are made, src/gnu/usr.bin/groff/tmac/mdoc.local
58 * needs to be updated and the changes sent back to the groff maintainers. 58 * needs to be updated and the changes sent back to the groff maintainers.
59 * 59 *
60 * (*) Up to 2.0I "release" used to be "",A-Z,Z[A-Z] but numeric 60 * (*) Up to 2.0I "release" used to be "",A-Z,Z[A-Z] but numeric
61 * e.g. NetBSD-1.2D = 102040000 ('D' == 4) 61 * e.g. NetBSD-1.2D = 102040000 ('D' == 4)
62 * NetBSD-2.0H (200080000) was changed on 20041001 to: 62 * NetBSD-2.0H (200080000) was changed on 20041001 to:
63 * 2.99.9 (299000900) 63 * 2.99.9 (299000900)
64 */ 64 */
65 65
66#define __NetBSD_Version__ 500000000 /* NetBSD 5.0_RC2 */ 66#define __NetBSD_Version__ 500000001 /* NetBSD 5.0_RC2 */
67 67
68#define __NetBSD_Prereq__(M,m,p) (((((M) * 100000000) + \ 68#define __NetBSD_Prereq__(M,m,p) (((((M) * 100000000) + \
69 (m) * 1000000) + (p) * 100) <= __NetBSD_Version__) 69 (m) * 1000000) + (p) * 100) <= __NetBSD_Version__)
70 70
71/* 71/*
72 * Historical NetBSD #define 72 * Historical NetBSD #define
73 * 73 *
74 * NetBSD 1.4 was the last release for which this value was incremented. 74 * NetBSD 1.4 was the last release for which this value was incremented.
75 * The value is now permanently fixed at 199905. It will never be 75 * The value is now permanently fixed at 199905. It will never be
76 * changed again. 76 * changed again.
77 * 77 *
78 * New code must use __NetBSD_Version__ instead, and should not even 78 * New code must use __NetBSD_Version__ instead, and should not even
79 * count on NetBSD being defined. 79 * count on NetBSD being defined.

cvs diff -r1.44 -r1.44.4.1 src/sys/sys/un.h (expand / switch to unified diff)

--- src/sys/sys/un.h 2008/08/06 15:01:24 1.44
+++ src/sys/sys/un.h 2009/03/18 05:33:23 1.44.4.1
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: un.h,v 1.44 2008/08/06 15:01:24 plunky Exp $ */ 1/* $NetBSD: un.h,v 1.44.4.1 2009/03/18 05:33:23 snj Exp $ */
2 2
3/* 3/*
4 * Copyright (c) 1982, 1986, 1993 4 * Copyright (c) 1982, 1986, 1993
5 * The Regents of the University of California. All rights reserved. 5 * The Regents of the University of California. All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -81,29 +81,26 @@ int uipc_ctloutput(int, struct socket *, @@ -81,29 +81,26 @@ int uipc_ctloutput(int, struct socket *,
81void uipc_init (void); 81void uipc_init (void);
82kmutex_t *uipc_dgramlock (void); 82kmutex_t *uipc_dgramlock (void);
83kmutex_t *uipc_streamlock (void); 83kmutex_t *uipc_streamlock (void);
84kmutex_t *uipc_rawlock (void); 84kmutex_t *uipc_rawlock (void);
85 85
86int unp_attach (struct socket *); 86int unp_attach (struct socket *);
87int unp_bind (struct socket *, struct mbuf *, struct lwp *); 87int unp_bind (struct socket *, struct mbuf *, struct lwp *);
88int unp_connect (struct socket *, struct mbuf *, struct lwp *); 88int unp_connect (struct socket *, struct mbuf *, struct lwp *);
89int unp_connect2 (struct socket *, struct socket *, int); 89int unp_connect2 (struct socket *, struct socket *, int);
90void unp_detach (struct unpcb *); 90void unp_detach (struct unpcb *);
91void unp_discard (struct file *); 91void unp_discard (struct file *);
92void unp_disconnect (struct unpcb *); 92void unp_disconnect (struct unpcb *);
93bool unp_drop (struct unpcb *, int); 93bool unp_drop (struct unpcb *, int);
94void unp_gc (void); 
95void unp_mark (struct file *); 
96void unp_scan (struct mbuf *, void (*)(struct file *), int); 
97void unp_shutdown (struct unpcb *); 94void unp_shutdown (struct unpcb *);
98int unp_externalize (struct mbuf *, struct lwp *); 95int unp_externalize (struct mbuf *, struct lwp *);
99int unp_internalize (struct mbuf **); 96int unp_internalize (struct mbuf **);
100void unp_dispose (struct mbuf *); 97void unp_dispose (struct mbuf *);
101int unp_output (struct mbuf *, struct mbuf *, struct unpcb *, 98int unp_output (struct mbuf *, struct mbuf *, struct unpcb *,
102 struct lwp *); 99 struct lwp *);
103void unp_setaddr (struct socket *, struct mbuf *, bool); 100void unp_setaddr (struct socket *, struct mbuf *, bool);
104#else /* !_KERNEL */ 101#else /* !_KERNEL */
105 102
106/* actual length of an initialized sockaddr_un */ 103/* actual length of an initialized sockaddr_un */
107#if defined(_NETBSD_SOURCE) 104#if defined(_NETBSD_SOURCE)
108#define SUN_LEN(su) \ 105#define SUN_LEN(su) \
109 (sizeof(*(su)) - sizeof((su)->sun_path) + strlen((su)->sun_path)) 106 (sizeof(*(su)) - sizeof((su)->sun_path) + strlen((su)->sun_path))