Wed Mar 18 05:33:23 2009 UTC ()
Pull up following revision(s) (requested by mrg in ticket #577):
	sys/kern/kern_descrip.c: revision 1.188
	sys/kern/uipc_usrreq.c: revision 1.121
	sys/sys/fcntl.h: revision 1.35
	sys/sys/file.h: revision 1.66
	sys/sys/param.h: patch
	sys/sys/un.h: revision 1.45
completely rework the way that orphaned sockets that are being fdpassed
via SCM_RIGHTS messages are dealt with:
1. unp_gc: make this a kthread.
2. unp_detach: do not call unp_gc directly. instead, wake up unp_gc kthread.
3. unp_scan: do not close files here. instead, put them on a global list
   for unp_gc to close, along with a per-file "deferred close count". if
   file is already enqueued for close, just increment deferred close count.
   this eliminates the recursive calls.
3. unp_gc: scan files on global deferred close list. close each file N
   times, as specified by deferred close count in file. continue processing
   list until it becomes empty (closing may cause additional files to be
   queued for close).
4. unp_gc: add additional bit to mark files we are scanning. set during
   initial scan of global file list that currently clears FMARK/FDEFER.
   during later scans, never examine / garbage collect descriptors that
   we have not marked during the earlier scan. do not proceed with this
   initial scan until all deferred closes have been processed. be careful
   with locking to ensure no races are introduced between deferred close
   and file scan.
5. unp_gc: use dummy file_t to mark position in list when scanning. allow
   us to drop filelist_lock. in turn allows us to eliminate kmem_alloc()
   and safely close files, etc.
6. prohibit transfer of descriptors within SCM_RIGHTS messages if
   (num_files_in_transit > maxfiles / unp_rights_ratio)
7. fd_allocfile: ensure recycled files don't get scanned.
this is 97% work done by andrew doran, with a couple of minor bug fixes
and a lot of testing by yours truly.


(snj)
diff -r1.182.6.3 -r1.182.6.4 src/sys/kern/kern_descrip.c
diff -r1.119.4.1 -r1.119.4.2 src/sys/kern/uipc_usrreq.c
diff -r1.34 -r1.34.64.1 src/sys/sys/fcntl.h
diff -r1.65 -r1.65.6.1 src/sys/sys/file.h
diff -r1.330.4.3 -r1.330.4.4 src/sys/sys/param.h
diff -r1.44 -r1.44.4.1 src/sys/sys/un.h

cvs diff -r1.182.6.3 -r1.182.6.4 src/sys/kern/kern_descrip.c (expand / switch to context diff)
--- src/sys/kern/kern_descrip.c 2009/03/15 20:23:26 1.182.6.3
+++ src/sys/kern/kern_descrip.c 2009/03/18 05:33:23 1.182.6.4
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_descrip.c,v 1.182.6.3 2009/03/15 20:23:26 snj Exp $	*/
+/*	$NetBSD: kern_descrip.c,v 1.182.6.4 2009/03/18 05:33:23 snj Exp $	*/
 
 /*-
  * Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -67,7 +67,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.182.6.3 2009/03/15 20:23:26 snj Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.182.6.4 2009/03/18 05:33:23 snj Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -1004,6 +1004,8 @@
 
 	fp = pool_cache_get(file_cache, PR_WAITOK);
 	KASSERT(fp->f_count == 0);
+	KASSERT(fp->f_msgcount == 0);
+	KASSERT(fp->f_unpcount == 0);
 	fp->f_cred = kauth_cred_get();
 	kauth_cred_hold(fp->f_cred);
 
@@ -1011,6 +1013,15 @@
 		fd_abort(p, fp, *resultfd);
 		tablefull("file", "increase kern.maxfiles or MAXFILES");
 		return ENFILE;
+	}
+
+	/*
+	 * Don't allow recycled files to be scanned.
+	 */
+	if ((fp->f_flag & FSCAN) != 0) {
+		mutex_enter(&fp->f_lock);
+		atomic_and_uint(&fp->f_flag, ~FSCAN);
+		mutex_exit(&fp->f_lock);
 	}
 
 	fp->f_advice = 0;

cvs diff -r1.119.4.1 -r1.119.4.2 src/sys/kern/uipc_usrreq.c (expand / switch to context diff)
--- src/sys/kern/uipc_usrreq.c 2009/02/16 03:31:13 1.119.4.1
+++ src/sys/kern/uipc_usrreq.c 2009/03/18 05:33:23 1.119.4.2
@@ -1,12 +1,12 @@
-/*	$NetBSD: uipc_usrreq.c,v 1.119.4.1 2009/02/16 03:31:13 snj Exp $	*/
+/*	$NetBSD: uipc_usrreq.c,v 1.119.4.2 2009/03/18 05:33:23 snj Exp $	*/
 
 /*-
- * Copyright (c) 1998, 2000, 2004, 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 1998, 2000, 2004, 2008, 2009 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
- * NASA Ames Research Center.
+ * NASA Ames Research Center, and by Andrew Doran.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -96,7 +96,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.119.4.1 2009/02/16 03:31:13 snj Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.119.4.2 2009/03/18 05:33:23 snj Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -117,6 +117,8 @@
 #include <sys/kmem.h>
 #include <sys/atomic.h>
 #include <sys/uidinfo.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
 
 /*
  * Unix communications domain.
@@ -169,16 +171,34 @@
 ino_t	unp_ino;			/* prototype for fake inode numbers */
 
 struct mbuf *unp_addsockcred(struct lwp *, struct mbuf *);
+static void unp_mark(file_t *);
+static void unp_scan(struct mbuf *, void (*)(file_t *), int);
+static void unp_discard_now(file_t *);
+static void unp_discard_later(file_t *);
+static void unp_thread(void *);
+static void unp_thread_kick(void);
 static kmutex_t *uipc_lock;
 
+static kcondvar_t unp_thread_cv;
+static lwp_t *unp_thread_lwp;
+static SLIST_HEAD(,file) unp_thread_discard;
+static int unp_defer;
+
 /*
  * Initialize Unix protocols.
  */
 void
 uipc_init(void)
 {
+	int error;
 
 	uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
+	cv_init(&unp_thread_cv, "unpgc");
+
+	error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, unp_thread,
+	    NULL, &unp_thread_lwp, "unpgc");
+	if (error != 0)
+		panic("uipc_init %d", error);
 }
 
 /*
@@ -290,11 +310,9 @@
 	if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m,
 	    control) == 0) {
 		so2->so_rcv.sb_overflowed++;
-	    	sounlock(so2);
 		unp_dispose(control);
 		m_freem(control);
 		m_freem(m);
-	    	solock(so2);
 		return (ENOBUFS);
 	} else {
 		sorwakeup(so2);
@@ -518,11 +536,9 @@
 					error = ENOTCONN;
 			}
 			if (error) {
-				sounlock(so);
 				unp_dispose(control);
 				m_freem(control);
 				m_freem(m);
-				solock(so);
 				break;
 			}
 			KASSERT(p != NULL);
@@ -571,10 +587,8 @@
 #undef snd
 #undef rcv
 			if (control != NULL) {
-				sounlock(so);
 				unp_dispose(control);
 				m_freem(control);
-				solock(so);
 			}
 			break;
 
@@ -724,7 +738,8 @@
 u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
 u_long	unpdg_recvspace = 4*1024;
 
-u_int	unp_rights;			/* file descriptors in flight */
+u_int	unp_rights;			/* files in flight */
+u_int	unp_rights_ratio = 2;		/* limit, fraction of maxfiles */
 
 int
 unp_attach(struct socket *so)
@@ -808,17 +823,14 @@
 	so->so_pcb = NULL;
 	if (unp_rights) {
 		/*
-		 * Normally the receive buffer is flushed later,
-		 * in sofree, but if our receive buffer holds references
-		 * to descriptors that are now garbage, we will dispose
-		 * of those descriptor references after the garbage collector
-		 * gets them (resulting in a "panic: closef: count < 0").
+		 * Normally the receive buffer is flushed later, in sofree,
+		 * but if our receive buffer holds references to files that
+		 * are now garbage, we will enqueue those file references to
+		 * the garbage collector and kick it into action.
 		 */
 		sorflush(so);
 		unp_free(unp);
-		sounlock(so);
-		unp_gc();
-		solock(so);
+		unp_thread_kick();
 	} else
 		unp_free(unp);
 }
@@ -1165,7 +1177,7 @@
 	fdp = malloc(nfds * sizeof(int), M_TEMP, M_WAITOK);
 	rw_enter(&p->p_cwdi->cwdi_lock, RW_READER);
 
-	/* Make sure the recipient should be able to see the descriptors.. */
+	/* Make sure the recipient should be able to see the files.. */
 	if (p->p_cwdi->cwdi_rdir != NULL) {
 		rp = (file_t **)CMSG_DATA(cm);
 		for (i = 0; i < nfds; i++) {
@@ -1192,19 +1204,15 @@
 	if (error != 0) {
 		for (i = 0; i < nfds; i++) {
 			fp = *rp;
-			/*
-			 * zero the pointer before calling unp_discard,
-			 * since it may end up in unp_gc()..
-			 */
 			*rp++ = 0;
-			unp_discard(fp);
+			unp_discard_now(fp);
 		}
 		goto out;
 	}
 
 	/*
 	 * First loop -- allocate file descriptor table slots for the
-	 * new descriptors.
+	 * new files.
 	 */
 	for (i = 0; i < nfds; i++) {
 		fp = *rp++;
@@ -1232,7 +1240,7 @@
 
 	/*
 	 * Now that adding them has succeeded, update all of the
-	 * descriptor passing state.
+	 * file passing state and affix the descriptors.
 	 */
 	rp = (file_t **)CMSG_DATA(cm);
 	for (i = 0; i < nfds; i++) {
@@ -1267,13 +1275,14 @@
 int
 unp_internalize(struct mbuf **controlp)
 {
-	struct filedesc *fdescp = curlwp->l_fd;
+	filedesc_t *fdescp = curlwp->l_fd;
 	struct mbuf *control = *controlp;
 	struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *);
 	file_t **rp, **files;
 	file_t *fp;
 	int i, fd, *fdp;
 	int nfds, error;
+	u_int maxmsg;
 
 	error = 0;
 	newcm = NULL;
@@ -1290,9 +1299,17 @@
 	 */
 	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int);
 	fdp = (int *)CMSG_DATA(cm);
+	maxmsg = maxfiles / unp_rights_ratio;
 	for (i = 0; i < nfds; i++) {
 		fd = *fdp++;
+		if (atomic_inc_uint_nv(&unp_rights) > maxmsg) {
+			atomic_dec_uint(&unp_rights);
+			nfds = i;
+			error = EAGAIN;
+			goto out;
+		}
 		if ((fp = fd_getfile(fd)) == NULL) {
+			atomic_dec_uint(&unp_rights);
 			nfds = i;
 			error = EBADF;
 			goto out;
@@ -1324,7 +1341,6 @@
 		fp->f_count++;
 		fp->f_msgcount++;
 		mutex_exit(&fp->f_lock);
-		atomic_inc_uint(&unp_rights);
 	}
 
  out:
@@ -1332,6 +1348,9 @@
 	fdp = (int *)CMSG_DATA(cm);
 	for (i = 0; i < nfds; i++) {
 		fd_putfile(*fdp++);
+		if (error != 0) {
+			atomic_dec_uint(&unp_rights);
+		}
 	}
 
 	if (error == 0) {
@@ -1404,68 +1423,82 @@
 	return (control);
 }
 
-int	unp_defer, unp_gcing;
-extern	struct domain unixdomain;
-
 /*
- * Comment added long after the fact explaining what's going on here.
- * Do a mark-sweep GC of file descriptors on the system, to free up
- * any which are caught in flight to an about-to-be-closed socket.
- *
- * Traditional mark-sweep gc's start at the "root", and mark
- * everything reachable from the root (which, in our case would be the
- * process table).  The mark bits are cleared during the sweep.
- *
- * XXX For some inexplicable reason (perhaps because the file
- * descriptor tables used to live in the u area which could be swapped
- * out and thus hard to reach), we do multiple scans over the set of
- * descriptors, using use *two* mark bits per object (DEFER and MARK).
- * Whenever we find a descriptor which references other descriptors,
- * the ones it references are marked with both bits, and we iterate
- * over the whole file table until there are no more DEFER bits set.
- * We also make an extra pass *before* the GC to clear the mark bits,
- * which could have been cleared at almost no cost during the previous
- * sweep.
+ * Do a mark-sweep GC of files in the system, to free up any which are
+ * caught in flight to an about-to-be-closed socket.  Additionally,
+ * process deferred file closures.
  */
-void
-unp_gc(void)
+static void
+unp_gc(file_t *dp)
 {
-	file_t *fp, *nextfp;
+	extern	struct domain unixdomain;
+	file_t *fp, *np;
 	struct socket *so, *so1;
-	file_t **extra_ref, **fpp;
-	int nunref, nslots, i;
+	u_int i, old, new;
+	bool didwork;
 
-	if (atomic_swap_uint(&unp_gcing, 1) == 1)
-		return;
+	KASSERT(curlwp == unp_thread_lwp);
+	KASSERT(mutex_owned(&filelist_lock));
 
- restart:
- 	nslots = nfiles * 2;
- 	extra_ref = kmem_alloc(nslots * sizeof(file_t *), KM_SLEEP);
+	/*
+	 * First, process deferred file closures.
+	 */
+	while (!SLIST_EMPTY(&unp_thread_discard)) {
+		fp = SLIST_FIRST(&unp_thread_discard);
+		KASSERT(fp->f_unpcount > 0);
+		KASSERT(fp->f_count > 0);
+		KASSERT(fp->f_msgcount > 0);
+		KASSERT(fp->f_count >= fp->f_unpcount);
+		KASSERT(fp->f_count >= fp->f_msgcount);
+		KASSERT(fp->f_msgcount >= fp->f_unpcount);
+		SLIST_REMOVE_HEAD(&unp_thread_discard, f_unplist);
+		i = fp->f_unpcount;
+		fp->f_unpcount = 0;
+		mutex_exit(&filelist_lock);
+		for (; i != 0; i--) {
+			unp_discard_now(fp);
+		}
+		mutex_enter(&filelist_lock);
+	}
 
-	mutex_enter(&filelist_lock);
+	/*
+	 * Clear mark bits.  Ensure that we don't consider new files
+	 * entering the file table during this loop (they will not have
+	 * FSCAN set).
+	 */
 	unp_defer = 0;
-
-	/* Clear mark bits */
 	LIST_FOREACH(fp, &filehead, f_list) {
-		atomic_and_uint(&fp->f_flag, ~(FMARK|FDEFER));
+		for (old = fp->f_flag;; old = new) {
+			new = atomic_cas_uint(&fp->f_flag, old,
+			    (old | FSCAN) & ~(FMARK|FDEFER));
+			if (__predict_true(old == new)) {
+				break;
+			}
+		}
 	}
 
 	/*
-	 * Iterate over the set of descriptors, marking ones believed
-	 * (based on refcount) to be referenced from a process, and
-	 * marking for rescan descriptors which are queued on a socket.
+	 * Iterate over the set of sockets, marking ones believed (based on
+	 * refcount) to be referenced from a process, and marking for rescan
+	 * sockets which are queued on a socket.  Recan continues descending
+	 * and searching for sockets referenced by sockets (FDEFER), until
+	 * there are no more socket->socket references to be discovered.
 	 */
 	do {
-		LIST_FOREACH(fp, &filehead, f_list) {
+		didwork = false;
+		for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
+			KASSERT(mutex_owned(&filelist_lock));
+			np = LIST_NEXT(fp, f_list);
 			mutex_enter(&fp->f_lock);
-			if (fp->f_flag & FDEFER) {
+			if ((fp->f_flag & FDEFER) != 0) {
 				atomic_and_uint(&fp->f_flag, ~FDEFER);
 				unp_defer--;
 				KASSERT(fp->f_count != 0);
 			} else {
 				if (fp->f_count == 0 ||
-				    (fp->f_flag & FMARK) ||
-				    fp->f_count == fp->f_msgcount) {
+				    (fp->f_flag & FMARK) != 0 ||
+				    fp->f_count == fp->f_msgcount ||
+				    fp->f_unpcount != 0) {
 					mutex_exit(&fp->f_lock);
 					continue;
 				}
@@ -1475,44 +1508,25 @@
 			if (fp->f_type != DTYPE_SOCKET ||
 			    (so = fp->f_data) == NULL ||
 			    so->so_proto->pr_domain != &unixdomain ||
-			    (so->so_proto->pr_flags&PR_RIGHTS) == 0) {
+			    (so->so_proto->pr_flags & PR_RIGHTS) == 0) {
 				mutex_exit(&fp->f_lock);
 				continue;
 			}
-#ifdef notdef
-			if (so->so_rcv.sb_flags & SB_LOCK) {
-				mutex_exit(&fp->f_lock);
-				mutex_exit(&filelist_lock);
-				kmem_free(extra_ref, nslots * sizeof(file_t *));
-				/*
-				 * This is problematical; it's not clear
-				 * we need to wait for the sockbuf to be
-				 * unlocked (on a uniprocessor, at least),
-				 * and it's also not clear what to do
-				 * if sbwait returns an error due to receipt
-				 * of a signal.  If sbwait does return
-				 * an error, we'll go into an infinite
-				 * loop.  Delete all of this for now.
-				 */
-				(void) sbwait(&so->so_rcv);
-				goto restart;
-			}
-#endif
+
+			/* Gain file ref, mark our position, and unlock. */
+			didwork = true;
+			LIST_INSERT_AFTER(fp, dp, f_list);
+			fp->f_count++;
 			mutex_exit(&fp->f_lock);
+			mutex_exit(&filelist_lock);
 
 			/*
-			 * XXX Locking a socket with filelist_lock held
-			 * is ugly.  filelist_lock can be taken by the
-			 * pagedaemon when reclaiming items from file_cache.
-			 * Socket activity could delay the pagedaemon.
+			 * Mark files referenced from sockets queued on the
+			 * accept queue as well.
 			 */
 			solock(so);
 			unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
-			/*
-			 * Mark descriptors referenced from sockets queued
-			 * on the accept queue as well.
-			 */
-			if (so->so_options & SO_ACCEPTCONN) {
+			if ((so->so_options & SO_ACCEPTCONN) != 0) {
 				TAILQ_FOREACH(so1, &so->so_q0, so_qe) {
 					unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
 				}
@@ -1521,84 +1535,115 @@
 				}
 			}
 			sounlock(so);
+
+			/* Re-lock and restart from where we left off. */
+			closef(fp);
+			mutex_enter(&filelist_lock);
+			np = LIST_NEXT(dp, f_list);
+			LIST_REMOVE(dp, f_list);
 		}
-	} while (unp_defer);
+		/*
+		 * Bail early if we did nothing in the loop above.  Could
+		 * happen because of concurrent activity causing unp_defer
+		 * to get out of sync.
+		 */
+	} while (unp_defer != 0 && didwork);
 
 	/*
-	 * Sweep pass.  Find unmarked descriptors, and free them.
+	 * Sweep pass.
 	 *
-	 * We grab an extra reference to each of the file table entries
-	 * that are not otherwise accessible and then free the rights
-	 * that are stored in messages on them.
-	 *
-	 * The bug in the original code is a little tricky, so I'll describe
-	 * what's wrong with it here.
-	 *
-	 * It is incorrect to simply unp_discard each entry for f_msgcount
-	 * times -- consider the case of sockets A and B that contain
-	 * references to each other.  On a last close of some other socket,
-	 * we trigger a gc since the number of outstanding rights (unp_rights)
-	 * is non-zero.  If during the sweep phase the gc code un_discards,
-	 * we end up doing a (full) closef on the descriptor.  A closef on A
-	 * results in the following chain.  Closef calls soo_close, which
-	 * calls soclose.   Soclose calls first (through the switch
-	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
-	 * returns because the previous instance had set unp_gcing, and
-	 * we return all the way back to soclose, which marks the socket
-	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
-	 * to free up the rights that are queued in messages on the socket A,
-	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
-	 * switch unp_dispose, which unp_scans with unp_discard.  This second
-	 * instance of unp_discard just calls closef on B.
-	 *
-	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
-	 * which results in another closef on A.  Unfortunately, A is already
-	 * being closed, and the descriptor has already been marked with
-	 * SS_NOFDREF, and soclose panics at this point.
-	 *
-	 * Here, we first take an extra reference to each inaccessible
-	 * descriptor.  Then, if the inaccessible descriptor is a
-	 * socket, we call sorflush in case it is a Unix domain
-	 * socket.  After we destroy all the rights carried in
-	 * messages, we do a last closef to get rid of our extra
-	 * reference.  This is the last close, and the unp_detach etc
-	 * will shut down the socket.
-	 *
-	 * 91/09/19, bsy@cs.cmu.edu
+	 * We grab an extra reference to each of the files that are
+	 * not otherwise accessible and then free the rights that are
+	 * stored in messages on them.
 	 */
-	if (nslots < nfiles) {
-		mutex_exit(&filelist_lock);
-		kmem_free(extra_ref, nslots * sizeof(file_t *));
-		goto restart;
-	}
-	for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != 0;
-	    fp = nextfp) {
-		nextfp = LIST_NEXT(fp, f_list);
+	for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
+		KASSERT(mutex_owned(&filelist_lock));
+		np = LIST_NEXT(fp, f_list);
 		mutex_enter(&fp->f_lock);
-		if (fp->f_count != 0 &&
-		    fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) {
-			*fpp++ = fp;
-			nunref++;
-			fp->f_count++;
+
+		/*
+		 * Ignore non-sockets.
+		 * Ignore dead sockets, or sockets with pending close.
+		 * Ignore sockets obviously referenced elsewhere. 
+		 * Ignore sockets marked as referenced by our scan.
+		 * Ignore new sockets that did not exist during the scan.
+		 */
+		if (fp->f_type != DTYPE_SOCKET ||
+		    fp->f_count == 0 || fp->f_unpcount != 0 ||
+		    fp->f_count != fp->f_msgcount ||
+		    (fp->f_flag & (FMARK | FSCAN)) != FSCAN) {
+			mutex_exit(&fp->f_lock);
+			continue;
 		}
+
+		/* Gain file ref, mark our position, and unlock. */
+		LIST_INSERT_AFTER(fp, dp, f_list);
+		fp->f_count++;
 		mutex_exit(&fp->f_lock);
+		mutex_exit(&filelist_lock);
+
+		/*
+		 * Flush all data from the socket's receive buffer.
+		 * This will cause files referenced only by the
+		 * socket to be queued for close.
+		 */
+		so = fp->f_data;
+		solock(so);
+		sorflush(so);
+		sounlock(so);
+
+		/* Re-lock and restart from where we left off. */
+		closef(fp);
+		mutex_enter(&filelist_lock);
+		np = LIST_NEXT(dp, f_list);
+		LIST_REMOVE(dp, f_list);
 	}
-	mutex_exit(&filelist_lock);
+}
 
-	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
-		fp = *fpp;
-		if (fp->f_type == DTYPE_SOCKET) {
-			so = fp->f_data;
-			solock(so);
-			sorflush(fp->f_data);
-			sounlock(so);
+/*
+ * Garbage collector thread.  While SCM_RIGHTS messages are in transit,
+ * wake once per second to garbage collect.  Run continually while we
+ * have deferred closes to process.
+ */
+static void
+unp_thread(void *cookie)
+{
+	file_t *dp;
+
+	/* Allocate a dummy file for our scans. */
+	if ((dp = fgetdummy()) == NULL) {
+		panic("unp_thread");
+	}
+
+	mutex_enter(&filelist_lock);
+	for (;;) {
+		KASSERT(mutex_owned(&filelist_lock));
+		if (SLIST_EMPTY(&unp_thread_discard)) {
+			if (unp_rights != 0) {
+				(void)cv_timedwait(&unp_thread_cv,
+				    &filelist_lock, hz);
+			} else {
+				cv_wait(&unp_thread_cv, &filelist_lock);
+			}
 		}
+		unp_gc(dp);
 	}
-	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
-		closef(*fpp);
+	/* NOTREACHED */
+}
+
+/*
+ * Kick the garbage collector into action if there is something for
+ * it to process.
+ */
+static void
+unp_thread_kick(void)
+{
+
+	if (!SLIST_EMPTY(&unp_thread_discard) || unp_rights != 0) {
+		mutex_enter(&filelist_lock);
+		cv_signal(&unp_thread_cv);
+		mutex_exit(&filelist_lock);
 	}
-	kmem_free(extra_ref, nslots * sizeof(file_t *));
-	atomic_swap_uint(&unp_gcing, 0);
 }
 
 void
@@ -1606,37 +1651,37 @@
 {
 
 	if (m)
-		unp_scan(m, unp_discard, 1);
+		unp_scan(m, unp_discard_later, 1);
 }
 
 void
 unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard)
 {
 	struct mbuf *m;
-	file_t **rp;
+	file_t **rp, *fp;
 	struct cmsghdr *cm;
-	int i;
-	int qfds;
+	int i, qfds;
 
 	while (m0) {
 		for (m = m0; m; m = m->m_next) {
-			if (m->m_type == MT_CONTROL &&
-			    m->m_len >= sizeof(*cm)) {
-				cm = mtod(m, struct cmsghdr *);
-				if (cm->cmsg_level != SOL_SOCKET ||
-				    cm->cmsg_type != SCM_RIGHTS)
-					continue;
-				qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm)))
-				    / sizeof(file_t *);
-				rp = (file_t **)CMSG_DATA(cm);
-				for (i = 0; i < qfds; i++) {
-					file_t *fp = *rp;
-					if (discard)
-						*rp = 0;
-					(*op)(fp);
-					rp++;
+			if (m->m_type != MT_CONTROL ||
+			    m->m_len < sizeof(*cm)) {
+			    	continue;
+			}
+			cm = mtod(m, struct cmsghdr *);
+			if (cm->cmsg_level != SOL_SOCKET ||
+			    cm->cmsg_type != SCM_RIGHTS)
+				continue;
+			qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm)))
+			    / sizeof(file_t *);
+			rp = (file_t **)CMSG_DATA(cm);
+			for (i = 0; i < qfds; i++) {
+				fp = *rp;
+				if (discard) {
+					*rp = 0;
 				}
-				break;		/* XXX, but saves time */
+				(*op)(fp);
+				rp++;
 			}
 		}
 		m0 = m0->m_nextpkt;
@@ -1658,10 +1703,9 @@
 	}
 
 	/*
-	 * Minimize the number of deferrals...  Sockets are the only
-	 * type of descriptor which can hold references to another
-	 * descriptor, so just mark other descriptors, and defer
-	 * unmarked sockets for the next pass.
+	 * Minimize the number of deferrals...  Sockets are the only type of
+	 * file which can hold references to another file, so just mark
+	 * other files, and defer unmarked sockets for the next pass.
 	 */
 	if (fp->f_type == DTYPE_SOCKET) {
 		unp_defer++;
@@ -1671,20 +1715,38 @@
 		atomic_or_uint(&fp->f_flag, FMARK);
 	}
 	mutex_exit(&fp->f_lock);
-	return;
 }
 
-void
-unp_discard(file_t *fp)
+static void
+unp_discard_now(file_t *fp)
 {
 
 	if (fp == NULL)
 		return;
 
-	mutex_enter(&fp->f_lock);
 	KASSERT(fp->f_count > 0);
+	KASSERT(fp->f_msgcount > 0);
+
+	mutex_enter(&fp->f_lock);
 	fp->f_msgcount--;
 	mutex_exit(&fp->f_lock);
 	atomic_dec_uint(&unp_rights);
 	(void)closef(fp);
+}
+
+static void
+unp_discard_later(file_t *fp)
+{
+
+	if (fp == NULL)
+		return;
+
+	KASSERT(fp->f_count > 0);
+	KASSERT(fp->f_msgcount > 0);
+
+	mutex_enter(&filelist_lock);
+	if (fp->f_unpcount++ == 0) {
+		SLIST_INSERT_HEAD(&unp_thread_discard, fp, f_unplist);
+	}
+	mutex_exit(&filelist_lock);
 }

cvs diff -r1.34 -r1.34.64.1 src/sys/sys/fcntl.h (expand / switch to context diff)
--- src/sys/sys/fcntl.h 2006/10/05 14:48:33 1.34
+++ src/sys/sys/fcntl.h 2009/03/18 05:33:23 1.34.64.1
@@ -1,4 +1,4 @@
-/*	$NetBSD: fcntl.h,v 1.34 2006/10/05 14:48:33 chs Exp $	*/
+/*	$NetBSD: fcntl.h,v 1.34.64.1 2009/03/18 05:33:23 snj Exp $	*/
 
 /*-
  * Copyright (c) 1983, 1990, 1993
@@ -125,6 +125,7 @@
 #define	FMARK		0x00001000	/* mark during gc() */
 #define	FDEFER		0x00002000	/* defer for next gc pass */
 #define	FHASLOCK	0x00004000	/* descriptor holds advisory lock */
+#define	FSCAN		0x00100000	/* scan during gc passes */
 #define	FKIOCTL		0x80000000	/* kernel originated ioctl */
 /* bits settable by fcntl(F_SETFL, ...) */
 #define	FCNTLFLAGS	(FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FDSYNC|FRSYNC|FALTIO|\

cvs diff -r1.65 -r1.65.6.1 src/sys/sys/file.h (expand / switch to context diff)
--- src/sys/sys/file.h 2008/06/24 10:26:27 1.65
+++ src/sys/sys/file.h 2009/03/18 05:33:23 1.65.6.1
@@ -1,5 +1,34 @@
-/*	$NetBSD: file.h,v 1.65 2008/06/24 10:26:27 gmcgarry Exp $	*/
+/*	$NetBSD: file.h,v 1.65.6.1 2009/03/18 05:33:23 snj Exp $	*/
 
+/*-
+ * Copyright (c) 2009 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
 /*
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -53,6 +82,9 @@
 /*
  * Kernel file descriptor.  One entry for each open kernel vnode and
  * socket.
+ *
+ * This structure is exported via the KERN_FILE and KERN_FILE2 sysctl
+ * calls.  Only add members to the end, do not delete them.
  */
 struct file {
 	off_t		f_offset;	/* first, is 64-bit */
@@ -73,7 +105,7 @@
 	LIST_ENTRY(file) f_list;	/* list of active files */
 	kmutex_t	f_lock;		/* lock on structure */
 	int		f_flag;		/* see fcntl.h */
-	u_int		f_iflags;	/* internal flags; FIF_* */
+	u_int		f_unused1;	/* unused; was internal flags; FIF_* */
 #define	DTYPE_VNODE	1		/* file */
 #define	DTYPE_SOCKET	2		/* communications endpoint */
 #define	DTYPE_PIPE	3		/* pipe */
@@ -87,6 +119,8 @@
 	u_int		f_advice;	/* access pattern hint; UVM_ADV_* */
 	u_int		f_count;	/* reference count */
 	u_int		f_msgcount;	/* references from message queue */
+	u_int		f_unpcount;	/* deferred close: see uipc_usrreq.c */
+	SLIST_ENTRY(file) f_unplist;	/* deferred close: see uipc_usrreq.c */
 };
 
 #define FILE_LOCK(fp)	mutex_enter(&(fp)->f_lock)

cvs diff -r1.330.4.3 -r1.330.4.4 src/sys/sys/param.h (expand / switch to context diff)
--- src/sys/sys/param.h 2009/02/09 00:22:09 1.330.4.3
+++ src/sys/sys/param.h 2009/03/18 05:33:23 1.330.4.4
@@ -1,4 +1,4 @@
-/*	$NetBSD: param.h,v 1.330.4.3 2009/02/09 00:22:09 snj Exp $	*/
+/*	$NetBSD: param.h,v 1.330.4.4 2009/03/18 05:33:23 snj Exp $	*/
 
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
@@ -63,7 +63,7 @@
  *	2.99.9		(299000900)
  */
 
-#define	__NetBSD_Version__	500000000	/* NetBSD 5.0_RC2 */
+#define	__NetBSD_Version__	500000001	/* NetBSD 5.0_RC2 */
 
 #define __NetBSD_Prereq__(M,m,p) (((((M) * 100000000) + \
     (m) * 1000000) + (p) * 100) <= __NetBSD_Version__)

cvs diff -r1.44 -r1.44.4.1 src/sys/sys/un.h (expand / switch to context diff)
--- src/sys/sys/un.h 2008/08/06 15:01:24 1.44
+++ src/sys/sys/un.h 2009/03/18 05:33:23 1.44.4.1
@@ -1,4 +1,4 @@
-/*	$NetBSD: un.h,v 1.44 2008/08/06 15:01:24 plunky Exp $	*/
+/*	$NetBSD: un.h,v 1.44.4.1 2009/03/18 05:33:23 snj Exp $	*/
 
 /*
  * Copyright (c) 1982, 1986, 1993
@@ -91,9 +91,6 @@
 void	unp_discard (struct file *);
 void	unp_disconnect (struct unpcb *);
 bool	unp_drop (struct unpcb *, int);
-void	unp_gc (void);
-void	unp_mark (struct file *);
-void	unp_scan (struct mbuf *, void (*)(struct file *), int);
 void	unp_shutdown (struct unpcb *);
 int 	unp_externalize (struct mbuf *, struct lwp *);
 int	unp_internalize (struct mbuf **);