Wed Mar 18 05:33:23 2009 UTC ()
Pull up following revision(s) (requested by mrg in ticket #577):
	sys/kern/kern_descrip.c: revision 1.188
	sys/kern/uipc_usrreq.c: revision 1.121
	sys/sys/fcntl.h: revision 1.35
	sys/sys/file.h: revision 1.66
	sys/sys/param.h: patch
	sys/sys/un.h: revision 1.45
completely rework the way that orphaned sockets that are being fdpassed
via SCM_RIGHTS messages are dealt with:
1. unp_gc: make this a kthread.
2. unp_detach: do not call unp_gc directly. instead, wake up unp_gc kthread.
3. unp_scan: do not close files here. instead, put them on a global list
   for unp_gc to close, along with a per-file "deferred close count". if
   file is already enqueued for close, just increment deferred close count.
   this eliminates the recursive calls.
4. unp_gc: scan files on global deferred close list. close each file N
   times, as specified by deferred close count in file. continue processing
   list until it becomes empty (closing may cause additional files to be
   queued for close).
5. unp_gc: add additional bit to mark files we are scanning. set during
   initial scan of global file list that currently clears FMARK/FDEFER.
   during later scans, never examine / garbage collect descriptors that
   we have not marked during the earlier scan. do not proceed with this
   initial scan until all deferred closes have been processed. be careful
   with locking to ensure no races are introduced between deferred close
   and file scan.
6. unp_gc: use dummy file_t to mark position in list when scanning. this
   allows us to drop filelist_lock, which in turn allows us to eliminate
   kmem_alloc() and safely close files, etc.
7. prohibit transfer of descriptors within SCM_RIGHTS messages if
   (num_files_in_transit > maxfiles / unp_rights_ratio)
8. fd_allocfile: ensure recycled files don't get scanned.
this is 97% work done by andrew doran, with a couple of minor bug fixes
and a lot of testing by yours truly.


(snj)
diff -r1.182.6.3 -r1.182.6.4 src/sys/kern/kern_descrip.c
diff -r1.119.4.1 -r1.119.4.2 src/sys/kern/uipc_usrreq.c
diff -r1.34 -r1.34.64.1 src/sys/sys/fcntl.h
diff -r1.65 -r1.65.6.1 src/sys/sys/file.h
diff -r1.330.4.3 -r1.330.4.4 src/sys/sys/param.h
diff -r1.44 -r1.44.4.1 src/sys/sys/un.h

cvs diff -r1.182.6.3 -r1.182.6.4 src/sys/kern/kern_descrip.c (switch to unified diff)

--- src/sys/kern/kern_descrip.c 2009/03/15 20:23:26 1.182.6.3
+++ src/sys/kern/kern_descrip.c 2009/03/18 05:33:23 1.182.6.4
@@ -1,1930 +1,1941 @@ @@ -1,1930 +1,1941 @@
1/* $NetBSD: kern_descrip.c,v 1.182.6.3 2009/03/15 20:23:26 snj Exp $ */ 1/* $NetBSD: kern_descrip.c,v 1.182.6.4 2009/03/18 05:33:23 snj Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2008 The NetBSD Foundation, Inc. 4 * Copyright (c) 2008 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29/* 29/*
30 * Copyright (c) 1982, 1986, 1989, 1991, 1993 30 * Copyright (c) 1982, 1986, 1989, 1991, 1993
31 * The Regents of the University of California. All rights reserved. 31 * The Regents of the University of California. All rights reserved.
32 * (c) UNIX System Laboratories, Inc. 32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed 33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph 34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc. 36 * the permission of UNIX System Laboratories, Inc.
37 * 37 *
38 * Redistribution and use in source and binary forms, with or without 38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions 39 * modification, are permitted provided that the following conditions
40 * are met: 40 * are met:
41 * 1. Redistributions of source code must retain the above copyright 41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer. 42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright 43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the 44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution. 45 * documentation and/or other materials provided with the distribution.
46 * 3. Neither the name of the University nor the names of its contributors 46 * 3. Neither the name of the University nor the names of its contributors
47 * may be used to endorse or promote products derived from this software 47 * may be used to endorse or promote products derived from this software
48 * without specific prior written permission. 48 * without specific prior written permission.
49 * 49 *
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * SUCH DAMAGE. 60 * SUCH DAMAGE.
61 * 61 *
62 * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95 62 * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
63 */ 63 */
64 64
65/* 65/*
66 * File descriptor management. 66 * File descriptor management.
67 */ 67 */
68 68
69#include <sys/cdefs.h> 69#include <sys/cdefs.h>
70__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.182.6.3 2009/03/15 20:23:26 snj Exp $"); 70__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.182.6.4 2009/03/18 05:33:23 snj Exp $");
71 71
72#include <sys/param.h> 72#include <sys/param.h>
73#include <sys/systm.h> 73#include <sys/systm.h>
74#include <sys/filedesc.h> 74#include <sys/filedesc.h>
75#include <sys/kernel.h> 75#include <sys/kernel.h>
76#include <sys/vnode.h> 76#include <sys/vnode.h>
77#include <sys/proc.h> 77#include <sys/proc.h>
78#include <sys/file.h> 78#include <sys/file.h>
79#include <sys/namei.h> 79#include <sys/namei.h>
80#include <sys/socket.h> 80#include <sys/socket.h>
81#include <sys/socketvar.h> 81#include <sys/socketvar.h>
82#include <sys/stat.h> 82#include <sys/stat.h>
83#include <sys/ioctl.h> 83#include <sys/ioctl.h>
84#include <sys/fcntl.h> 84#include <sys/fcntl.h>
85#include <sys/pool.h> 85#include <sys/pool.h>
86#include <sys/syslog.h> 86#include <sys/syslog.h>
87#include <sys/unistd.h> 87#include <sys/unistd.h>
88#include <sys/resourcevar.h> 88#include <sys/resourcevar.h>
89#include <sys/conf.h> 89#include <sys/conf.h>
90#include <sys/event.h> 90#include <sys/event.h>
91#include <sys/kauth.h> 91#include <sys/kauth.h>
92#include <sys/atomic.h> 92#include <sys/atomic.h>
93#include <sys/mount.h> 93#include <sys/mount.h>
94#include <sys/syscallargs.h> 94#include <sys/syscallargs.h>
95#include <sys/cpu.h> 95#include <sys/cpu.h>
96 96
97static int cwdi_ctor(void *, void *, int); 97static int cwdi_ctor(void *, void *, int);
98static void cwdi_dtor(void *, void *); 98static void cwdi_dtor(void *, void *);
99static int file_ctor(void *, void *, int); 99static int file_ctor(void *, void *, int);
100static void file_dtor(void *, void *); 100static void file_dtor(void *, void *);
101static int fdfile_ctor(void *, void *, int); 101static int fdfile_ctor(void *, void *, int);
102static void fdfile_dtor(void *, void *); 102static void fdfile_dtor(void *, void *);
103static int filedesc_ctor(void *, void *, int); 103static int filedesc_ctor(void *, void *, int);
104static void filedesc_dtor(void *, void *); 104static void filedesc_dtor(void *, void *);
105static int filedescopen(dev_t, int, int, lwp_t *); 105static int filedescopen(dev_t, int, int, lwp_t *);
106 106
107kmutex_t filelist_lock; /* lock on filehead */ 107kmutex_t filelist_lock; /* lock on filehead */
108struct filelist filehead; /* head of list of open files */ 108struct filelist filehead; /* head of list of open files */
109u_int nfiles; /* actual number of open files */ 109u_int nfiles; /* actual number of open files */
110 110
111static pool_cache_t cwdi_cache; 111static pool_cache_t cwdi_cache;
112static pool_cache_t filedesc_cache; 112static pool_cache_t filedesc_cache;
113static pool_cache_t file_cache; 113static pool_cache_t file_cache;
114static pool_cache_t fdfile_cache; 114static pool_cache_t fdfile_cache;
115 115
116const struct cdevsw filedesc_cdevsw = { 116const struct cdevsw filedesc_cdevsw = {
117 filedescopen, noclose, noread, nowrite, noioctl, 117 filedescopen, noclose, noread, nowrite, noioctl,
118 nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE, 118 nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE,
119}; 119};
120 120
121/* For ease of reading. */ 121/* For ease of reading. */
122__strong_alias(fd_putvnode,fd_putfile) 122__strong_alias(fd_putvnode,fd_putfile)
123__strong_alias(fd_putsock,fd_putfile) 123__strong_alias(fd_putsock,fd_putfile)
124 124
125/* 125/*
126 * Initialize the descriptor system. 126 * Initialize the descriptor system.
127 */ 127 */
128void 128void
129fd_sys_init(void) 129fd_sys_init(void)
130{ 130{
131 131
132 mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE); 132 mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);
133 133
134 file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0, 134 file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
135 0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL); 135 0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
136 KASSERT(file_cache != NULL); 136 KASSERT(file_cache != NULL);
137 137
138 fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0, 138 fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
139 PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor, 139 PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
140 NULL); 140 NULL);
141 KASSERT(fdfile_cache != NULL); 141 KASSERT(fdfile_cache != NULL);
142 142
143 cwdi_cache = pool_cache_init(sizeof(struct cwdinfo), coherency_unit, 143 cwdi_cache = pool_cache_init(sizeof(struct cwdinfo), coherency_unit,
144 0, 0, "cwdi", NULL, IPL_NONE, cwdi_ctor, cwdi_dtor, NULL); 144 0, 0, "cwdi", NULL, IPL_NONE, cwdi_ctor, cwdi_dtor, NULL);
145 KASSERT(cwdi_cache != NULL); 145 KASSERT(cwdi_cache != NULL);
146 146
147 filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit, 147 filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
148 0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor, 148 0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
149 NULL); 149 NULL);
150 KASSERT(filedesc_cache != NULL); 150 KASSERT(filedesc_cache != NULL);
151} 151}
152 152
153static int 153static int
154fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits) 154fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
155{ 155{
156 int i, off, maxoff; 156 int i, off, maxoff;
157 uint32_t sub; 157 uint32_t sub;
158 158
159 KASSERT(mutex_owned(&fdp->fd_lock)); 159 KASSERT(mutex_owned(&fdp->fd_lock));
160 160
161 if (want > bits) 161 if (want > bits)
162 return -1; 162 return -1;
163 163
164 off = want >> NDENTRYSHIFT; 164 off = want >> NDENTRYSHIFT;
165 i = want & NDENTRYMASK; 165 i = want & NDENTRYMASK;
166 if (i) { 166 if (i) {
167 sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i)); 167 sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
168 if (sub != ~0) 168 if (sub != ~0)
169 goto found; 169 goto found;
170 off++; 170 off++;
171 } 171 }
172 172
173 maxoff = NDLOSLOTS(bits); 173 maxoff = NDLOSLOTS(bits);
174 while (off < maxoff) { 174 while (off < maxoff) {
175 if ((sub = bitmap[off]) != ~0) 175 if ((sub = bitmap[off]) != ~0)
176 goto found; 176 goto found;
177 off++; 177 off++;
178 } 178 }
179 179
180 return (-1); 180 return (-1);
181 181
182 found: 182 found:
183 return (off << NDENTRYSHIFT) + ffs(~sub) - 1; 183 return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
184} 184}
185 185
186static int 186static int
187fd_last_set(filedesc_t *fd, int last) 187fd_last_set(filedesc_t *fd, int last)
188{ 188{
189 int off, i; 189 int off, i;
190 fdfile_t **ofiles = fd->fd_ofiles; 190 fdfile_t **ofiles = fd->fd_ofiles;
191 uint32_t *bitmap = fd->fd_lomap; 191 uint32_t *bitmap = fd->fd_lomap;
192 192
193 KASSERT(mutex_owned(&fd->fd_lock)); 193 KASSERT(mutex_owned(&fd->fd_lock));
194 194
195 off = (last - 1) >> NDENTRYSHIFT; 195 off = (last - 1) >> NDENTRYSHIFT;
196 196
197 while (off >= 0 && !bitmap[off]) 197 while (off >= 0 && !bitmap[off])
198 off--; 198 off--;
199 199
200 if (off < 0) 200 if (off < 0)
201 return (-1); 201 return (-1);
202 202
203 i = ((off + 1) << NDENTRYSHIFT) - 1; 203 i = ((off + 1) << NDENTRYSHIFT) - 1;
204 if (i >= last) 204 if (i >= last)
205 i = last - 1; 205 i = last - 1;
206 206
207 /* XXX should use bitmap */ 207 /* XXX should use bitmap */
208 /* XXXAD does not work for fd_copy() */ 208 /* XXXAD does not work for fd_copy() */
209 while (i > 0 && (ofiles[i] == NULL || !ofiles[i]->ff_allocated)) 209 while (i > 0 && (ofiles[i] == NULL || !ofiles[i]->ff_allocated))
210 i--; 210 i--;
211 211
212 return (i); 212 return (i);
213} 213}
214 214
215void 215void
216fd_used(filedesc_t *fdp, unsigned fd) 216fd_used(filedesc_t *fdp, unsigned fd)
217{ 217{
218 u_int off = fd >> NDENTRYSHIFT; 218 u_int off = fd >> NDENTRYSHIFT;
219 fdfile_t *ff; 219 fdfile_t *ff;
220 220
221 ff = fdp->fd_ofiles[fd]; 221 ff = fdp->fd_ofiles[fd];
222 222
223 KASSERT(mutex_owned(&fdp->fd_lock)); 223 KASSERT(mutex_owned(&fdp->fd_lock));
224 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0); 224 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0);
225 KASSERT(ff != NULL); 225 KASSERT(ff != NULL);
226 KASSERT(ff->ff_file == NULL); 226 KASSERT(ff->ff_file == NULL);
227 KASSERT(!ff->ff_allocated); 227 KASSERT(!ff->ff_allocated);
228 228
229 ff->ff_allocated = 1; 229 ff->ff_allocated = 1;
230 fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK); 230 fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
231 if (fdp->fd_lomap[off] == ~0) { 231 if (fdp->fd_lomap[off] == ~0) {
232 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] & 232 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
233 (1 << (off & NDENTRYMASK))) == 0); 233 (1 << (off & NDENTRYMASK))) == 0);
234 fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK); 234 fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
235 } 235 }
236 236
237 if ((int)fd > fdp->fd_lastfile) { 237 if ((int)fd > fdp->fd_lastfile) {
238 fdp->fd_lastfile = fd; 238 fdp->fd_lastfile = fd;
239 } 239 }
240 240
241 if (fd >= NDFDFILE) { 241 if (fd >= NDFDFILE) {
242 fdp->fd_nused++; 242 fdp->fd_nused++;
243 } else { 243 } else {
244 KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]); 244 KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
245 } 245 }
246} 246}
247 247
248void 248void
249fd_unused(filedesc_t *fdp, unsigned fd) 249fd_unused(filedesc_t *fdp, unsigned fd)
250{ 250{
251 u_int off = fd >> NDENTRYSHIFT; 251 u_int off = fd >> NDENTRYSHIFT;
252 fdfile_t *ff; 252 fdfile_t *ff;
253 253
254 ff = fdp->fd_ofiles[fd]; 254 ff = fdp->fd_ofiles[fd];
255 255
256 /* 256 /*
257 * Don't assert the lock is held here, as we may be copying 257 * Don't assert the lock is held here, as we may be copying
258 * the table during exec() and it is not needed there. 258 * the table during exec() and it is not needed there.
259 * procfs and sysctl are locked out by proc::p_reflock. 259 * procfs and sysctl are locked out by proc::p_reflock.
260 * 260 *
261 * KASSERT(mutex_owned(&fdp->fd_lock)); 261 * KASSERT(mutex_owned(&fdp->fd_lock));
262 */ 262 */
263 KASSERT(ff != NULL); 263 KASSERT(ff != NULL);
264 KASSERT(ff->ff_file == NULL); 264 KASSERT(ff->ff_file == NULL);
265 KASSERT(ff->ff_allocated); 265 KASSERT(ff->ff_allocated);
266 266
267 if (fd < fdp->fd_freefile) { 267 if (fd < fdp->fd_freefile) {
268 fdp->fd_freefile = fd; 268 fdp->fd_freefile = fd;
269 } 269 }
270 270
271 if (fdp->fd_lomap[off] == ~0) { 271 if (fdp->fd_lomap[off] == ~0) {
272 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] & 272 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
273 (1 << (off & NDENTRYMASK))) != 0); 273 (1 << (off & NDENTRYMASK))) != 0);
274 fdp->fd_himap[off >> NDENTRYSHIFT] &= 274 fdp->fd_himap[off >> NDENTRYSHIFT] &=
275 ~(1 << (off & NDENTRYMASK)); 275 ~(1 << (off & NDENTRYMASK));
276 } 276 }
277 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0); 277 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
278 fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK)); 278 fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
279 ff->ff_allocated = 0; 279 ff->ff_allocated = 0;
280 280
281 KASSERT(fd <= fdp->fd_lastfile); 281 KASSERT(fd <= fdp->fd_lastfile);
282 if (fd == fdp->fd_lastfile) { 282 if (fd == fdp->fd_lastfile) {
283 fdp->fd_lastfile = fd_last_set(fdp, fd); 283 fdp->fd_lastfile = fd_last_set(fdp, fd);
284 } 284 }
285 285
286 if (fd >= NDFDFILE) { 286 if (fd >= NDFDFILE) {
287 KASSERT(fdp->fd_nused > 0); 287 KASSERT(fdp->fd_nused > 0);
288 fdp->fd_nused--; 288 fdp->fd_nused--;
289 } else { 289 } else {
290 KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]); 290 KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
291 } 291 }
292} 292}
293 293
294/* 294/*
295 * Custom version of fd_unused() for fd_copy(), where the descriptor 295 * Custom version of fd_unused() for fd_copy(), where the descriptor
296 * table is not yet fully initialized. 296 * table is not yet fully initialized.
297 */ 297 */
298static inline void 298static inline void
299fd_zap(filedesc_t *fdp, unsigned fd) 299fd_zap(filedesc_t *fdp, unsigned fd)
300{ 300{
301 u_int off = fd >> NDENTRYSHIFT; 301 u_int off = fd >> NDENTRYSHIFT;
302 302
303 if (fd < fdp->fd_freefile) { 303 if (fd < fdp->fd_freefile) {
304 fdp->fd_freefile = fd; 304 fdp->fd_freefile = fd;
305 } 305 }
306 306
307 if (fdp->fd_lomap[off] == ~0) { 307 if (fdp->fd_lomap[off] == ~0) {
308 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] & 308 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
309 (1 << (off & NDENTRYMASK))) != 0); 309 (1 << (off & NDENTRYMASK))) != 0);
310 fdp->fd_himap[off >> NDENTRYSHIFT] &= 310 fdp->fd_himap[off >> NDENTRYSHIFT] &=
311 ~(1 << (off & NDENTRYMASK)); 311 ~(1 << (off & NDENTRYMASK));
312 } 312 }
313 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0); 313 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
314 fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK)); 314 fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
315} 315}
316 316
317bool 317bool
318fd_isused(filedesc_t *fdp, unsigned fd) 318fd_isused(filedesc_t *fdp, unsigned fd)
319{ 319{
320 u_int off = fd >> NDENTRYSHIFT; 320 u_int off = fd >> NDENTRYSHIFT;
321 321
322 KASSERT(fd < fdp->fd_nfiles); 322 KASSERT(fd < fdp->fd_nfiles);
323 323
324 return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0; 324 return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0;
325} 325}
326 326
327/* 327/*
328 * Look up the file structure corresponding to a file descriptor 328 * Look up the file structure corresponding to a file descriptor
329 * and return the file, holding a reference on the descriptor. 329 * and return the file, holding a reference on the descriptor.
330 */ 330 */
331inline file_t * 331inline file_t *
332fd_getfile(unsigned fd) 332fd_getfile(unsigned fd)
333{ 333{
334 filedesc_t *fdp; 334 filedesc_t *fdp;
335 fdfile_t *ff; 335 fdfile_t *ff;
336 file_t *fp; 336 file_t *fp;
337 337
338 fdp = curlwp->l_fd; 338 fdp = curlwp->l_fd;
339 339
340 /* 340 /*
341 * Look up the fdfile structure representing this descriptor. 341 * Look up the fdfile structure representing this descriptor.
342 * Ensure that we see fd_nfiles before fd_ofiles since we 342 * Ensure that we see fd_nfiles before fd_ofiles since we
343 * are doing this unlocked. See fd_tryexpand(). 343 * are doing this unlocked. See fd_tryexpand().
344 */ 344 */
345 if (__predict_false(fd >= fdp->fd_nfiles)) { 345 if (__predict_false(fd >= fdp->fd_nfiles)) {
346 return NULL; 346 return NULL;
347 } 347 }
348 membar_consumer(); 348 membar_consumer();
349 ff = fdp->fd_ofiles[fd]; 349 ff = fdp->fd_ofiles[fd];
350 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]); 350 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
351 if (__predict_false(ff == NULL)) { 351 if (__predict_false(ff == NULL)) {
352 return NULL; 352 return NULL;
353 } 353 }
354 354
355 /* 355 /*
356 * Now get a reference to the descriptor. Issue a memory 356 * Now get a reference to the descriptor. Issue a memory
357 * barrier to ensure that we acquire the file pointer _after_ 357 * barrier to ensure that we acquire the file pointer _after_
358 * adding a reference. If no memory barrier, we could fetch 358 * adding a reference. If no memory barrier, we could fetch
359 * a stale pointer. 359 * a stale pointer.
360 */ 360 */
361 atomic_inc_uint(&ff->ff_refcnt); 361 atomic_inc_uint(&ff->ff_refcnt);
362#ifndef __HAVE_ATOMIC_AS_MEMBAR 362#ifndef __HAVE_ATOMIC_AS_MEMBAR
363 membar_enter(); 363 membar_enter();
364#endif 364#endif
365 365
366 /* 366 /*
367 * If the file is not open or is being closed then put the 367 * If the file is not open or is being closed then put the
368 * reference back. 368 * reference back.
369 */ 369 */
370 fp = ff->ff_file; 370 fp = ff->ff_file;
371 if (__predict_true(fp != NULL)) { 371 if (__predict_true(fp != NULL)) {
372 return fp; 372 return fp;
373 } 373 }
374 fd_putfile(fd); 374 fd_putfile(fd);
375 return NULL; 375 return NULL;
376} 376}
377 377
378/* 378/*
379 * Release a reference to a file descriptor acquired with fd_getfile(). 379 * Release a reference to a file descriptor acquired with fd_getfile().
380 */ 380 */
381void 381void
382fd_putfile(unsigned fd) 382fd_putfile(unsigned fd)
383{ 383{
384 filedesc_t *fdp; 384 filedesc_t *fdp;
385 fdfile_t *ff; 385 fdfile_t *ff;
386 u_int u, v; 386 u_int u, v;
387 387
388 fdp = curlwp->l_fd; 388 fdp = curlwp->l_fd;
389 ff = fdp->fd_ofiles[fd]; 389 ff = fdp->fd_ofiles[fd];
390 390
391 KASSERT(fd < fdp->fd_nfiles); 391 KASSERT(fd < fdp->fd_nfiles);
392 KASSERT(ff != NULL); 392 KASSERT(ff != NULL);
393 KASSERT((ff->ff_refcnt & FR_MASK) > 0); 393 KASSERT((ff->ff_refcnt & FR_MASK) > 0);
394 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]); 394 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
395 395
396 /* 396 /*
397 * Ensure that any use of the file is complete and globally 397 * Ensure that any use of the file is complete and globally
398 * visible before dropping the final reference. If no membar, 398 * visible before dropping the final reference. If no membar,
399 * the current CPU could still access memory associated with 399 * the current CPU could still access memory associated with
400 * the file after it has been freed or recycled by another 400 * the file after it has been freed or recycled by another
401 * CPU. 401 * CPU.
402 */ 402 */
403#ifndef __HAVE_ATOMIC_AS_MEMBAR 403#ifndef __HAVE_ATOMIC_AS_MEMBAR
404 membar_exit(); 404 membar_exit();
405#endif 405#endif
406 406
407 /* 407 /*
408 * Be optimistic and start out with the assumption that no other 408 * Be optimistic and start out with the assumption that no other
409 * threads are trying to close the descriptor. If the CAS fails, 409 * threads are trying to close the descriptor. If the CAS fails,
410 * we lost a race and/or it's being closed. 410 * we lost a race and/or it's being closed.
411 */ 411 */
412 for (u = ff->ff_refcnt & FR_MASK;; u = v) { 412 for (u = ff->ff_refcnt & FR_MASK;; u = v) {
413 v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1); 413 v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
414 if (__predict_true(u == v)) { 414 if (__predict_true(u == v)) {
415 return; 415 return;
416 } 416 }
417 if (__predict_false((v & FR_CLOSING) != 0)) { 417 if (__predict_false((v & FR_CLOSING) != 0)) {
418 break; 418 break;
419 } 419 }
420 } 420 }
421 421
422 /* Another thread is waiting to close the file: join it. */ 422 /* Another thread is waiting to close the file: join it. */
423 (void)fd_close(fd); 423 (void)fd_close(fd);
424} 424}
425 425
426/* 426/*
427 * Convenience wrapper around fd_getfile() that returns reference 427 * Convenience wrapper around fd_getfile() that returns reference
428 * to a vnode. 428 * to a vnode.
429 */ 429 */
430int 430int
431fd_getvnode(unsigned fd, file_t **fpp) 431fd_getvnode(unsigned fd, file_t **fpp)
432{ 432{
433 vnode_t *vp; 433 vnode_t *vp;
434 file_t *fp; 434 file_t *fp;
435 435
436 fp = fd_getfile(fd); 436 fp = fd_getfile(fd);
437 if (__predict_false(fp == NULL)) { 437 if (__predict_false(fp == NULL)) {
438 return EBADF; 438 return EBADF;
439 } 439 }
440 if (__predict_false(fp->f_type != DTYPE_VNODE)) { 440 if (__predict_false(fp->f_type != DTYPE_VNODE)) {
441 fd_putfile(fd); 441 fd_putfile(fd);
442 return EINVAL; 442 return EINVAL;
443 } 443 }
444 vp = fp->f_data; 444 vp = fp->f_data;
445 if (__predict_false(vp->v_type == VBAD)) { 445 if (__predict_false(vp->v_type == VBAD)) {
446 /* XXX Is this case really necessary? */ 446 /* XXX Is this case really necessary? */
447 fd_putfile(fd); 447 fd_putfile(fd);
448 return EBADF; 448 return EBADF;
449 } 449 }
450 *fpp = fp; 450 *fpp = fp;
451 return 0; 451 return 0;
452} 452}
453 453
454/* 454/*
455 * Convenience wrapper around fd_getfile() that returns reference 455 * Convenience wrapper around fd_getfile() that returns reference
456 * to a socket. 456 * to a socket.
457 */ 457 */
458int 458int
459fd_getsock(unsigned fd, struct socket **sop) 459fd_getsock(unsigned fd, struct socket **sop)
460{ 460{
461 file_t *fp; 461 file_t *fp;
462 462
463 fp = fd_getfile(fd); 463 fp = fd_getfile(fd);
464 if (__predict_false(fp == NULL)) { 464 if (__predict_false(fp == NULL)) {
465 return EBADF; 465 return EBADF;
466 } 466 }
467 if (__predict_false(fp->f_type != DTYPE_SOCKET)) { 467 if (__predict_false(fp->f_type != DTYPE_SOCKET)) {
468 fd_putfile(fd); 468 fd_putfile(fd);
469 return ENOTSOCK; 469 return ENOTSOCK;
470 } 470 }
471 *sop = fp->f_data; 471 *sop = fp->f_data;
472 return 0; 472 return 0;
473} 473}
474 474
475/* 475/*
476 * Look up the file structure corresponding to a file descriptor 476 * Look up the file structure corresponding to a file descriptor
477 * and return it with a reference held on the file, not the 477 * and return it with a reference held on the file, not the
478 * descriptor. 478 * descriptor.
479 * 479 *
480 * This is heavyweight and only used when accessing descriptors 480 * This is heavyweight and only used when accessing descriptors
481 * from a foreign process. The caller must ensure that `p' does 481 * from a foreign process. The caller must ensure that `p' does
482 * not exit or fork across this call. 482 * not exit or fork across this call.
483 * 483 *
484 * To release the file (not descriptor) reference, use closef(). 484 * To release the file (not descriptor) reference, use closef().
485 */ 485 */
486file_t * 486file_t *
487fd_getfile2(proc_t *p, unsigned fd) 487fd_getfile2(proc_t *p, unsigned fd)
488{ 488{
489 filedesc_t *fdp; 489 filedesc_t *fdp;
490 fdfile_t *ff; 490 fdfile_t *ff;
491 file_t *fp; 491 file_t *fp;
492 492
493 fdp = p->p_fd; 493 fdp = p->p_fd;
494 mutex_enter(&fdp->fd_lock); 494 mutex_enter(&fdp->fd_lock);
495 if (fd > fdp->fd_nfiles) { 495 if (fd > fdp->fd_nfiles) {
496 mutex_exit(&fdp->fd_lock); 496 mutex_exit(&fdp->fd_lock);
497 return NULL; 497 return NULL;
498 } 498 }
499 if ((ff = fdp->fd_ofiles[fd]) == NULL) { 499 if ((ff = fdp->fd_ofiles[fd]) == NULL) {
500 mutex_exit(&fdp->fd_lock); 500 mutex_exit(&fdp->fd_lock);
501 return NULL; 501 return NULL;
502 } 502 }
503 mutex_enter(&ff->ff_lock); 503 mutex_enter(&ff->ff_lock);
504 if ((fp = ff->ff_file) == NULL) { 504 if ((fp = ff->ff_file) == NULL) {
505 mutex_exit(&ff->ff_lock); 505 mutex_exit(&ff->ff_lock);
506 mutex_exit(&fdp->fd_lock); 506 mutex_exit(&fdp->fd_lock);
507 return NULL; 507 return NULL;
508 } 508 }
509 mutex_enter(&fp->f_lock); 509 mutex_enter(&fp->f_lock);
510 fp->f_count++; 510 fp->f_count++;
511 mutex_exit(&fp->f_lock); 511 mutex_exit(&fp->f_lock);
512 mutex_exit(&ff->ff_lock); 512 mutex_exit(&ff->ff_lock);
513 mutex_exit(&fdp->fd_lock); 513 mutex_exit(&fdp->fd_lock);
514 514
515 return fp; 515 return fp;
516} 516}
517 517
/*
 * Internal form of close.  Must be called with a reference to the
 * descriptor, and will drop the reference.  When all descriptor
 * references are dropped, releases the descriptor slot and a single
 * reference to the file structure.
 *
 * Operates on the descriptor table of the calling LWP (curlwp->l_fd).
 *
 * Returns EBADF if the slot has no file attached because another
 * close of the same descriptor is already in progress; otherwise
 * returns the result of closef() on the file.
 */
int
fd_close(unsigned fd)
{
	struct flock lf;
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	proc_t *p;
	lwp_t *l;

	l = curlwp;
	p = l->l_proc;
	fdp = l->l_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	mutex_enter(&ff->ff_lock);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	if (ff->ff_file == NULL) {
		/*
		 * Another user of the file is already closing, and is
		 * waiting for other users of the file to drain.  Release
		 * our reference, and wake up the closer.
		 */
		atomic_dec_uint(&ff->ff_refcnt);
		cv_broadcast(&ff->ff_closing);
		mutex_exit(&ff->ff_lock);

		/*
		 * An application error, so pretend that the descriptor
		 * was already closed.  We can't safely wait for it to
		 * be closed without potentially deadlocking.
		 */
		return (EBADF);
	}
	KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);

	/*
	 * There may be multiple users of this file within the process.
	 * Notify existing and new users that the file is closing.  This
	 * will prevent them from adding additional uses to this file
	 * while we are closing it.
	 */
	fp = ff->ff_file;
	ff->ff_file = NULL;
	ff->ff_exclose = false;

	/*
	 * We expect the caller to hold a descriptor reference - drop it.
	 * The reference count may increase beyond zero at this point due
	 * to an erroneous descriptor reference by an application, but
	 * fd_getfile() will notice that the file is being closed and drop
	 * the reference again.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	/* Publish the ff_file = NULL store before the count is dropped. */
	membar_producer();
#endif
	if (__predict_false(atomic_dec_uint_nv(&ff->ff_refcnt) != 0)) {
		/*
		 * Wait for other references to drain.  This is typically
		 * an application error - the descriptor is being closed
		 * while still in use.
		 */
		atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
		/*
		 * Remove any knotes attached to the file.  A knote
		 * attached to the descriptor can hold references on it.
		 * ff_lock is released around the knote_fdclose() call.
		 */
		if (!SLIST_EMPTY(&ff->ff_knlist)) {
			mutex_exit(&ff->ff_lock);
			knote_fdclose(fd);
			mutex_enter(&ff->ff_lock);
		}
		/*
		 * We need to see the count drop to zero at least once,
		 * in order to ensure that all pre-existing references
		 * have been drained.  New references past this point are
		 * of no interest.
		 */
		while ((ff->ff_refcnt & FR_MASK) != 0) {
			cv_wait(&ff->ff_closing, &ff->ff_lock);
		}
		atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
	} else {
		/* If no references, there must be no knotes. */
		KASSERT(SLIST_EMPTY(&ff->ff_knlist));
	}
	mutex_exit(&ff->ff_lock);

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 */
	if ((p->p_flag & PK_ADVLOCK) != 0 && fp->f_type == DTYPE_VNODE) {
		/* l_start = 0 with l_len = 0: unlock request over the whole file. */
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX);
	}


	/* Free descriptor slot. */
	mutex_enter(&fdp->fd_lock);
	fd_unused(fdp, fd);
	mutex_exit(&fdp->fd_lock);

	/* Now drop reference to the file itself. */
	return closef(fp);
}
640 640
641/* 641/*
642 * Duplicate a file descriptor. 642 * Duplicate a file descriptor.
643 */ 643 */
644int 644int
645fd_dup(file_t *fp, int minfd, int *newp, bool exclose) 645fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
646{ 646{
647 proc_t *p; 647 proc_t *p;
648 int error; 648 int error;
649 649
650 p = curproc; 650 p = curproc;
651 651
652 while ((error = fd_alloc(p, minfd, newp)) != 0) { 652 while ((error = fd_alloc(p, minfd, newp)) != 0) {
653 if (error != ENOSPC) { 653 if (error != ENOSPC) {
654 return error; 654 return error;
655 } 655 }
656 fd_tryexpand(p); 656 fd_tryexpand(p);
657 } 657 }
658 658
659 curlwp->l_fd->fd_ofiles[*newp]->ff_exclose = exclose; 659 curlwp->l_fd->fd_ofiles[*newp]->ff_exclose = exclose;
660 fd_affix(p, fp, *newp); 660 fd_affix(p, fp, *newp);
661 return 0; 661 return 0;
662} 662}
663 663
664/* 664/*
665 * dup2 operation. 665 * dup2 operation.
666 */ 666 */
667int 667int
668fd_dup2(file_t *fp, unsigned new) 668fd_dup2(file_t *fp, unsigned new)
669{ 669{
670 filedesc_t *fdp; 670 filedesc_t *fdp;
671 fdfile_t *ff; 671 fdfile_t *ff;
672 672
673 fdp = curlwp->l_fd; 673 fdp = curlwp->l_fd;
674 674
675 /* 675 /*
676 * Ensure there are enough slots in the descriptor table, 676 * Ensure there are enough slots in the descriptor table,
677 * and allocate an fdfile_t up front in case we need it. 677 * and allocate an fdfile_t up front in case we need it.
678 */ 678 */
679 while (new >= fdp->fd_nfiles) { 679 while (new >= fdp->fd_nfiles) {
680 fd_tryexpand(curproc); 680 fd_tryexpand(curproc);
681 } 681 }
682 ff = pool_cache_get(fdfile_cache, PR_WAITOK); 682 ff = pool_cache_get(fdfile_cache, PR_WAITOK);
683 683
684 /* 684 /*
685 * If there is already a file open, close it. If the file is 685 * If there is already a file open, close it. If the file is
686 * half open, wait for it to be constructed before closing it. 686 * half open, wait for it to be constructed before closing it.
687 * XXX Potential for deadlock here? 687 * XXX Potential for deadlock here?
688 */ 688 */
689 mutex_enter(&fdp->fd_lock); 689 mutex_enter(&fdp->fd_lock);
690 while (fd_isused(fdp, new)) { 690 while (fd_isused(fdp, new)) {
691 mutex_exit(&fdp->fd_lock); 691 mutex_exit(&fdp->fd_lock);
692 if (fd_getfile(new) != NULL) { 692 if (fd_getfile(new) != NULL) {
693 (void)fd_close(new); 693 (void)fd_close(new);
694 } else { 694 } else {
695 /* XXX Crummy, but unlikely to happen. */ 695 /* XXX Crummy, but unlikely to happen. */
696 kpause("dup2", false, 1, NULL); 696 kpause("dup2", false, 1, NULL);
697 } 697 }
698 mutex_enter(&fdp->fd_lock); 698 mutex_enter(&fdp->fd_lock);
699 } 699 }
700 if (fdp->fd_ofiles[new] == NULL) { 700 if (fdp->fd_ofiles[new] == NULL) {
701 KASSERT(new >= NDFDFILE); 701 KASSERT(new >= NDFDFILE);
702 fdp->fd_ofiles[new] = ff; 702 fdp->fd_ofiles[new] = ff;
703 ff = NULL; 703 ff = NULL;
704 }  704 }
705 fd_used(fdp, new); 705 fd_used(fdp, new);
706 mutex_exit(&fdp->fd_lock); 706 mutex_exit(&fdp->fd_lock);
707 707
708 /* Slot is now allocated. Insert copy of the file. */ 708 /* Slot is now allocated. Insert copy of the file. */
709 fd_affix(curproc, fp, new); 709 fd_affix(curproc, fp, new);
710 if (ff != NULL) { 710 if (ff != NULL) {
711 pool_cache_put(fdfile_cache, ff); 711 pool_cache_put(fdfile_cache, ff);
712 } 712 }
713 return 0; 713 return 0;
714} 714}
715 715
716/* 716/*
717 * Drop reference to a file structure. 717 * Drop reference to a file structure.
718 */ 718 */
719int 719int
720closef(file_t *fp) 720closef(file_t *fp)
721{ 721{
722 struct flock lf; 722 struct flock lf;
723 int error; 723 int error;
724 724
725 /* 725 /*
726 * Drop reference. If referenced elsewhere it's still open 726 * Drop reference. If referenced elsewhere it's still open
727 * and we have nothing more to do. 727 * and we have nothing more to do.
728 */ 728 */
729 mutex_enter(&fp->f_lock); 729 mutex_enter(&fp->f_lock);
730 KASSERT(fp->f_count > 0); 730 KASSERT(fp->f_count > 0);
731 if (--fp->f_count > 0) { 731 if (--fp->f_count > 0) {
732 mutex_exit(&fp->f_lock); 732 mutex_exit(&fp->f_lock);
733 return 0; 733 return 0;
734 } 734 }
735 KASSERT(fp->f_count == 0); 735 KASSERT(fp->f_count == 0);
736 mutex_exit(&fp->f_lock); 736 mutex_exit(&fp->f_lock);
737 737
738 /* We held the last reference - release locks, close and free. */ 738 /* We held the last reference - release locks, close and free. */
739 if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) { 739 if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
740 lf.l_whence = SEEK_SET; 740 lf.l_whence = SEEK_SET;
741 lf.l_start = 0; 741 lf.l_start = 0;
742 lf.l_len = 0; 742 lf.l_len = 0;
743 lf.l_type = F_UNLCK; 743 lf.l_type = F_UNLCK;
744 (void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK); 744 (void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK);
745 } 745 }
746 if (fp->f_ops != NULL) { 746 if (fp->f_ops != NULL) {
747 error = (*fp->f_ops->fo_close)(fp); 747 error = (*fp->f_ops->fo_close)(fp);
748 } else { 748 } else {
749 error = 0; 749 error = 0;
750 } 750 }
751 ffree(fp); 751 ffree(fp);
752 752
753 return error; 753 return error;
754} 754}
755 755
/*
 * Allocate a file descriptor for the process.
 *
 * p       process to allocate in; asserted to be curproc or proc0.
 * want    descriptor number at which the search starts (the search
 *         actually starts at the higher of want and fd_freefile).
 * result  receives the allocated descriptor number on success.
 *
 * On success the slot is marked used, has an fdfile_t installed and
 * has no file attached yet; 0 is returned.
 *
 * On failure returns EMFILE if fd_nfiles has reached the limit
 * (the smaller of RLIMIT_NOFILE and maxfiles), or ENOSPC if the
 * current table is simply full and the caller should expand it
 * and retry.
 */
int
fd_alloc(proc_t *p, int want, int *result)
{
	filedesc_t *fdp;
	int i, lim, last, error;
	u_int off, new;
	fdfile_t *ff;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	/*
	 * Preallocate an fdfile_t before taking fd_lock; it is handed
	 * back to the cache if the chosen slot already has one or if
	 * no slot is found.
	 */
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);
	KASSERT(ff->ff_refcnt == 0);
	KASSERT(ff->ff_file == NULL);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.
	 */
	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
	last = min(fdp->fd_nfiles, lim);
	for (;;) {
		if ((i = want) < fdp->fd_freefile)
			i = fdp->fd_freefile;
		/* First pick a block from the high-level map ... */
		off = i >> NDENTRYSHIFT;
		new = fd_next_zero(fdp, fdp->fd_himap, off,
		    (last + NDENTRIES - 1) >> NDENTRYSHIFT);
		/*
		 * `new' is u_int, so this compares against (u_int)-1.
		 * NOTE(review): presumably fd_next_zero() returns -1 when
		 * no clear bit is found - confirm against its definition.
		 */
		if (new == -1)
			break;
		/* ... then a free bit within that block's low-level map. */
		i = fd_next_zero(fdp, &fdp->fd_lomap[new],
		    new > off ? 0 : i & NDENTRYMASK, NDENTRIES);
		if (i == -1) {
			/*
			 * Free file descriptor in this block was
			 * below want, try again with higher want.
			 */
			want = (new + 1) << NDENTRYSHIFT;
			continue;
		}
		/* Convert the bit index within the block to a descriptor. */
		i += (new << NDENTRYSHIFT);
		if (i >= last) {
			break;
		}
		if (fdp->fd_ofiles[i] == NULL) {
			KASSERT(i >= NDFDFILE);
			fdp->fd_ofiles[i] = ff;
		} else {
			/* Slot already has an fdfile_t; ours is not needed. */
			pool_cache_put(fdfile_cache, ff);
		}
		KASSERT(fdp->fd_ofiles[i]->ff_file == NULL);
		fd_used(fdp, i);
		if (want <= fdp->fd_freefile) {
			fdp->fd_freefile = i;
		}
		*result = i;
		mutex_exit(&fdp->fd_lock);
		KASSERT(i >= NDFDFILE ||
		    fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
		return 0;
	}

	/* No space in current array.  Let the caller expand and retry. */
	error = (fdp->fd_nfiles >= lim) ? EMFILE : ENOSPC;
	mutex_exit(&fdp->fd_lock);
	pool_cache_put(fdfile_cache, ff);
	return error;
}
828 828
829/* 829/*
830 * Allocate memory for the open files array. 830 * Allocate memory for the open files array.
831 */ 831 */
832static fdfile_t ** 832static fdfile_t **
833fd_ofile_alloc(int n) 833fd_ofile_alloc(int n)
834{ 834{
835 uintptr_t *ptr, sz; 835 uintptr_t *ptr, sz;
836 836
837 KASSERT(n > NDFILE); 837 KASSERT(n > NDFILE);
838 838
839 sz = (n + 2) * sizeof(uintptr_t); 839 sz = (n + 2) * sizeof(uintptr_t);
840 ptr = kmem_alloc((size_t)sz, KM_SLEEP); 840 ptr = kmem_alloc((size_t)sz, KM_SLEEP);
841 ptr[1] = sz; 841 ptr[1] = sz;
842 842
843 return (fdfile_t **)(ptr + 2); 843 return (fdfile_t **)(ptr + 2);
844} 844}
845 845
846/* 846/*
847 * Free an open files array. 847 * Free an open files array.
848 */ 848 */
849static void 849static void
850fd_ofile_free(int n, fdfile_t **of) 850fd_ofile_free(int n, fdfile_t **of)
851{ 851{
852 uintptr_t *ptr, sz; 852 uintptr_t *ptr, sz;
853 853
854 KASSERT(n > NDFILE); 854 KASSERT(n > NDFILE);
855 855
856 sz = (n + 2) * sizeof(uintptr_t); 856 sz = (n + 2) * sizeof(uintptr_t);
857 ptr = (uintptr_t *)of - 2; 857 ptr = (uintptr_t *)of - 2;
858 KASSERT(ptr[1] == sz); 858 KASSERT(ptr[1] == sz);
859 kmem_free(ptr, sz); 859 kmem_free(ptr, sz);
860} 860}
861 861
862/* 862/*
863 * Allocate descriptor bitmap. 863 * Allocate descriptor bitmap.
864 */ 864 */
865static void 865static void
866fd_map_alloc(int n, uint32_t **lo, uint32_t **hi) 866fd_map_alloc(int n, uint32_t **lo, uint32_t **hi)
867{ 867{
868 uint8_t *ptr; 868 uint8_t *ptr;
869 size_t szlo, szhi; 869 size_t szlo, szhi;
870 870
871 KASSERT(n > NDENTRIES); 871 KASSERT(n > NDENTRIES);
872 872
873 szlo = NDLOSLOTS(n) * sizeof(uint32_t); 873 szlo = NDLOSLOTS(n) * sizeof(uint32_t);
874 szhi = NDHISLOTS(n) * sizeof(uint32_t); 874 szhi = NDHISLOTS(n) * sizeof(uint32_t);
875 ptr = kmem_alloc(szlo + szhi, KM_SLEEP); 875 ptr = kmem_alloc(szlo + szhi, KM_SLEEP);
876 *lo = (uint32_t *)ptr; 876 *lo = (uint32_t *)ptr;
877 *hi = (uint32_t *)(ptr + szlo); 877 *hi = (uint32_t *)(ptr + szlo);
878} 878}
879 879
880/* 880/*
881 * Free descriptor bitmap. 881 * Free descriptor bitmap.
882 */ 882 */
883static void 883static void
884fd_map_free(int n, uint32_t *lo, uint32_t *hi) 884fd_map_free(int n, uint32_t *lo, uint32_t *hi)
885{ 885{
886 size_t szlo, szhi; 886 size_t szlo, szhi;
887 887
888 KASSERT(n > NDENTRIES); 888 KASSERT(n > NDENTRIES);
889 889
890 szlo = NDLOSLOTS(n) * sizeof(uint32_t); 890 szlo = NDLOSLOTS(n) * sizeof(uint32_t);
891 szhi = NDHISLOTS(n) * sizeof(uint32_t); 891 szhi = NDHISLOTS(n) * sizeof(uint32_t);
892 KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo)); 892 KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo));
893 kmem_free(lo, szlo + szhi); 893 kmem_free(lo, szlo + szhi);
894} 894}
895 895
/*
 * Expand a process' descriptor table.
 *
 * The new table has NDEXTENT entries if the old one was smaller than
 * that, otherwise twice the old size.  New arrays are allocated
 * before fd_lock is taken; if fd_nfiles changed in the meantime the
 * new arrays are released and the function returns without expanding,
 * so callers must re-check and retry.
 *
 * `p' is asserted to be curproc or proc0.
 */
void
fd_tryexpand(proc_t *p)
{
	filedesc_t *fdp;
	int i, numfiles, oldnfiles;
	fdfile_t **newofile;
	uint32_t *newhimap, *newlomap;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	newhimap = NULL;
	newlomap = NULL;
	/* Sampled without fd_lock; re-validated once the lock is held. */
	oldnfiles = fdp->fd_nfiles;

	if (oldnfiles < NDEXTENT)
		numfiles = NDEXTENT;
	else
		numfiles = 2 * oldnfiles;

	newofile = fd_ofile_alloc(numfiles);
	/* New bitmaps are only needed when the high map must grow. */
	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		fd_map_alloc(numfiles, &newlomap, &newhimap);
	}

	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	if (fdp->fd_nfiles != oldnfiles) {
		/* fdp changed; caller must retry */
		mutex_exit(&fdp->fd_lock);
		fd_ofile_free(numfiles, newofile);
		if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
			fd_map_free(numfiles, newlomap, newhimap);
		}
		return;
	}

	/* Copy the existing ofile array and zero the new portion. */
	i = sizeof(fdfile_t *) * fdp->fd_nfiles;
	memcpy(newofile, fdp->fd_ofiles, i);
	memset((uint8_t *)newofile + i, 0, numfiles * sizeof(fdfile_t *) - i);

	/*
	 * Link old ofiles array into list to be discarded.  We defer
	 * freeing until process exit if the descriptor table is visible
	 * to other threads.
	 */
	if (oldnfiles > NDFILE) {
		if ((fdp->fd_refcnt | p->p_nlwps) > 1) {
			/*
			 * The first header word in front of the array
			 * (index -2) serves as the list link.
			 */
			fdp->fd_ofiles[-2] = (void *)fdp->fd_discard;
			fdp->fd_discard = fdp->fd_ofiles - 2;
		} else {
			fd_ofile_free(oldnfiles, fdp->fd_ofiles);
		}
	}

	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		/* Copy the old high map and zero the added tail. */
		i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newhimap, fdp->fd_himap, i);
		memset((uint8_t *)newhimap + i, 0,
		    NDHISLOTS(numfiles) * sizeof(uint32_t) - i);

		/* Likewise for the low map. */
		i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newlomap, fdp->fd_lomap, i);
		memset((uint8_t *)newlomap + i, 0,
		    NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);

		/*
		 * Old maps are only freed when they are larger than the
		 * NDFILE-sized ones.
		 * NOTE(review): presumably the NDFILE-sized maps are
		 * embedded in the filedesc and were never allocated -
		 * confirm against the structure definition.
		 */
		if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
			fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap);
		}
		fdp->fd_himap = newhimap;
		fdp->fd_lomap = newlomap;
	}

	/*
	 * All other modifications must become globally visible before
	 * the change to fd_nfiles.  See fd_getfile().
	 */
	fdp->fd_ofiles = newofile;
	membar_producer();
	fdp->fd_nfiles = numfiles;
	mutex_exit(&fdp->fd_lock);

	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
}
984 984
985/* 985/*
986 * Create a new open file structure and allocate a file descriptor 986 * Create a new open file structure and allocate a file descriptor
987 * for the current process. 987 * for the current process.
988 */ 988 */
989int 989int
990fd_allocfile(file_t **resultfp, int *resultfd) 990fd_allocfile(file_t **resultfp, int *resultfd)
991{ 991{
992 file_t *fp; 992 file_t *fp;
993 proc_t *p; 993 proc_t *p;
994 int error; 994 int error;
995 995
996 p = curproc; 996 p = curproc;
997 997
998 while ((error = fd_alloc(p, 0, resultfd)) != 0) { 998 while ((error = fd_alloc(p, 0, resultfd)) != 0) {
999 if (error != ENOSPC) { 999 if (error != ENOSPC) {
1000 return error; 1000 return error;
1001 } 1001 }
1002 fd_tryexpand(p); 1002 fd_tryexpand(p);
1003 } 1003 }
1004 1004
1005 fp = pool_cache_get(file_cache, PR_WAITOK); 1005 fp = pool_cache_get(file_cache, PR_WAITOK);
1006 KASSERT(fp->f_count == 0); 1006 KASSERT(fp->f_count == 0);
 1007 KASSERT(fp->f_msgcount == 0);
 1008 KASSERT(fp->f_unpcount == 0);
1007 fp->f_cred = kauth_cred_get(); 1009 fp->f_cred = kauth_cred_get();
1008 kauth_cred_hold(fp->f_cred); 1010 kauth_cred_hold(fp->f_cred);
1009 1011
1010 if (__predict_false(atomic_inc_uint_nv(&nfiles) >= maxfiles)) { 1012 if (__predict_false(atomic_inc_uint_nv(&nfiles) >= maxfiles)) {
1011 fd_abort(p, fp, *resultfd); 1013 fd_abort(p, fp, *resultfd);
1012 tablefull("file", "increase kern.maxfiles or MAXFILES"); 1014 tablefull("file", "increase kern.maxfiles or MAXFILES");
1013 return ENFILE; 1015 return ENFILE;
1014 } 1016 }
1015 1017
 1018 /*
 1019 * Don't allow recycled files to be scanned.
 1020 */
 1021 if ((fp->f_flag & FSCAN) != 0) {
 1022 mutex_enter(&fp->f_lock);
 1023 atomic_and_uint(&fp->f_flag, ~FSCAN);
 1024 mutex_exit(&fp->f_lock);
 1025 }
 1026
1016 fp->f_advice = 0; 1027 fp->f_advice = 0;
1017 fp->f_msgcount = 0; 1028 fp->f_msgcount = 0;
1018 fp->f_offset = 0; 1029 fp->f_offset = 0;
1019 *resultfp = fp; 1030 *resultfp = fp;
1020 1031
1021 return 0; 1032 return 0;
1022} 1033}
1023 1034
1024/* 1035/*
1025 * Successful creation of a new descriptor: make visible to the process. 1036 * Successful creation of a new descriptor: make visible to the process.
1026 */ 1037 */
1027void 1038void
1028fd_affix(proc_t *p, file_t *fp, unsigned fd) 1039fd_affix(proc_t *p, file_t *fp, unsigned fd)
1029{ 1040{
1030 fdfile_t *ff; 1041 fdfile_t *ff;
1031 filedesc_t *fdp; 1042 filedesc_t *fdp;
1032 1043
1033 KASSERT(p == curproc || p == &proc0); 1044 KASSERT(p == curproc || p == &proc0);
1034 1045
1035 /* Add a reference to the file structure. */ 1046 /* Add a reference to the file structure. */
1036 mutex_enter(&fp->f_lock); 1047 mutex_enter(&fp->f_lock);
1037 fp->f_count++; 1048 fp->f_count++;
1038 mutex_exit(&fp->f_lock); 1049 mutex_exit(&fp->f_lock);
1039 1050
1040 /* 1051 /*
1041 * Insert the new file into the descriptor slot. 1052 * Insert the new file into the descriptor slot.
1042 * 1053 *
1043 * The memory barriers provided by lock activity in this routine 1054 * The memory barriers provided by lock activity in this routine
1044 * ensure that any updates to the file structure become globally 1055 * ensure that any updates to the file structure become globally
1045 * visible before the file becomes visible to other LWPs in the 1056 * visible before the file becomes visible to other LWPs in the
1046 * current process. 1057 * current process.
1047 */ 1058 */
1048 fdp = p->p_fd; 1059 fdp = p->p_fd;
1049 ff = fdp->fd_ofiles[fd]; 1060 ff = fdp->fd_ofiles[fd];
1050 1061
1051 KASSERT(ff != NULL); 1062 KASSERT(ff != NULL);
1052 KASSERT(ff->ff_file == NULL); 1063 KASSERT(ff->ff_file == NULL);
1053 KASSERT(ff->ff_allocated); 1064 KASSERT(ff->ff_allocated);
1054 KASSERT(fd_isused(fdp, fd)); 1065 KASSERT(fd_isused(fdp, fd));
1055 KASSERT(fd >= NDFDFILE || 1066 KASSERT(fd >= NDFDFILE ||
1056 fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]); 1067 fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);
1057 1068
1058 /* No need to lock in order to make file initially visible. */ 1069 /* No need to lock in order to make file initially visible. */
1059 ff->ff_file = fp; 1070 ff->ff_file = fp;
1060} 1071}
1061 1072
1062/* 1073/*
1063 * Abort creation of a new descriptor: free descriptor slot and file. 1074 * Abort creation of a new descriptor: free descriptor slot and file.
1064 */ 1075 */
1065void 1076void
1066fd_abort(proc_t *p, file_t *fp, unsigned fd) 1077fd_abort(proc_t *p, file_t *fp, unsigned fd)
1067{ 1078{
1068 filedesc_t *fdp; 1079 filedesc_t *fdp;
1069 fdfile_t *ff; 1080 fdfile_t *ff;
1070 1081
1071 KASSERT(p == curproc || p == &proc0); 1082 KASSERT(p == curproc || p == &proc0);
1072 1083
1073 fdp = p->p_fd; 1084 fdp = p->p_fd;
1074 ff = fdp->fd_ofiles[fd]; 1085 ff = fdp->fd_ofiles[fd];
1075 1086
1076 KASSERT(fd >= NDFDFILE || 1087 KASSERT(fd >= NDFDFILE ||
1077 fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]); 1088 fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);
1078 1089
1079 mutex_enter(&fdp->fd_lock); 1090 mutex_enter(&fdp->fd_lock);
1080 KASSERT(fd_isused(fdp, fd)); 1091 KASSERT(fd_isused(fdp, fd));
1081 fd_unused(fdp, fd); 1092 fd_unused(fdp, fd);
1082 mutex_exit(&fdp->fd_lock); 1093 mutex_exit(&fdp->fd_lock);
1083 1094
1084 if (fp != NULL) { 1095 if (fp != NULL) {
1085 ffree(fp); 1096 ffree(fp);
1086 } 1097 }
1087} 1098}
1088 1099
1089/* 1100/*
1090 * Free a file descriptor. 1101 * Free a file descriptor.
1091 */ 1102 */
1092void 1103void
1093ffree(file_t *fp) 1104ffree(file_t *fp)
1094{ 1105{
1095 1106
1096 KASSERT(fp->f_count == 0); 1107 KASSERT(fp->f_count == 0);
1097 1108
1098 atomic_dec_uint(&nfiles); 1109 atomic_dec_uint(&nfiles);
1099 kauth_cred_free(fp->f_cred); 1110 kauth_cred_free(fp->f_cred);
1100 pool_cache_put(file_cache, fp); 1111 pool_cache_put(file_cache, fp);
1101} 1112}
1102 1113
1103/* 1114/*
1104 * Create an initial cwdinfo structure, using the same current and root 1115 * Create an initial cwdinfo structure, using the same current and root
1105 * directories as curproc. 1116 * directories as curproc.
1106 */ 1117 */
1107struct cwdinfo * 1118struct cwdinfo *
1108cwdinit(void) 1119cwdinit(void)
1109{ 1120{
1110 struct cwdinfo *cwdi; 1121 struct cwdinfo *cwdi;
1111 struct cwdinfo *copy; 1122 struct cwdinfo *copy;
1112 1123
1113 cwdi = pool_cache_get(cwdi_cache, PR_WAITOK); 1124 cwdi = pool_cache_get(cwdi_cache, PR_WAITOK);
1114 copy = curproc->p_cwdi; 1125 copy = curproc->p_cwdi;
1115 1126
1116 rw_enter(&copy->cwdi_lock, RW_READER); 1127 rw_enter(&copy->cwdi_lock, RW_READER);
1117 cwdi->cwdi_cdir = copy->cwdi_cdir; 1128 cwdi->cwdi_cdir = copy->cwdi_cdir;
1118 if (cwdi->cwdi_cdir) 1129 if (cwdi->cwdi_cdir)
1119 VREF(cwdi->cwdi_cdir); 1130 VREF(cwdi->cwdi_cdir);
1120 cwdi->cwdi_rdir = copy->cwdi_rdir; 1131 cwdi->cwdi_rdir = copy->cwdi_rdir;
1121 if (cwdi->cwdi_rdir) 1132 if (cwdi->cwdi_rdir)
1122 VREF(cwdi->cwdi_rdir); 1133 VREF(cwdi->cwdi_rdir);
1123 cwdi->cwdi_edir = copy->cwdi_edir; 1134 cwdi->cwdi_edir = copy->cwdi_edir;
1124 if (cwdi->cwdi_edir) 1135 if (cwdi->cwdi_edir)
1125 VREF(cwdi->cwdi_edir); 1136 VREF(cwdi->cwdi_edir);
1126 cwdi->cwdi_cmask = copy->cwdi_cmask; 1137 cwdi->cwdi_cmask = copy->cwdi_cmask;
1127 cwdi->cwdi_refcnt = 1; 1138 cwdi->cwdi_refcnt = 1;
1128 rw_exit(&copy->cwdi_lock); 1139 rw_exit(&copy->cwdi_lock);
1129 1140
1130 return (cwdi); 1141 return (cwdi);
1131} 1142}
1132 1143
1133static int 1144static int
1134cwdi_ctor(void *arg, void *obj, int flags) 1145cwdi_ctor(void *arg, void *obj, int flags)
1135{ 1146{
1136 struct cwdinfo *cwdi = obj; 1147 struct cwdinfo *cwdi = obj;
1137 1148
1138 rw_init(&cwdi->cwdi_lock); 1149 rw_init(&cwdi->cwdi_lock);
1139 1150
1140 return 0; 1151 return 0;
1141} 1152}
1142 1153
1143static void 1154static void
1144cwdi_dtor(void *arg, void *obj) 1155cwdi_dtor(void *arg, void *obj)
1145{ 1156{
1146 struct cwdinfo *cwdi = obj; 1157 struct cwdinfo *cwdi = obj;
1147 1158
1148 rw_destroy(&cwdi->cwdi_lock); 1159 rw_destroy(&cwdi->cwdi_lock);
1149} 1160}
1150 1161
1151static int 1162static int
1152file_ctor(void *arg, void *obj, int flags) 1163file_ctor(void *arg, void *obj, int flags)
1153{ 1164{
1154 file_t *fp = obj; 1165 file_t *fp = obj;
1155 1166
1156 memset(fp, 0, sizeof(*fp)); 1167 memset(fp, 0, sizeof(*fp));
1157 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE); 1168 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1158 1169
1159 mutex_enter(&filelist_lock); 1170 mutex_enter(&filelist_lock);
1160 LIST_INSERT_HEAD(&filehead, fp, f_list); 1171 LIST_INSERT_HEAD(&filehead, fp, f_list);
1161 mutex_exit(&filelist_lock); 1172 mutex_exit(&filelist_lock);
1162 1173
1163 return 0; 1174 return 0;
1164} 1175}
1165 1176
1166static void 1177static void
1167file_dtor(void *arg, void *obj) 1178file_dtor(void *arg, void *obj)
1168{ 1179{
1169 file_t *fp = obj; 1180 file_t *fp = obj;
1170 1181
1171 mutex_enter(&filelist_lock); 1182 mutex_enter(&filelist_lock);
1172 LIST_REMOVE(fp, f_list); 1183 LIST_REMOVE(fp, f_list);
1173 mutex_exit(&filelist_lock); 1184 mutex_exit(&filelist_lock);
1174 1185
1175 mutex_destroy(&fp->f_lock); 1186 mutex_destroy(&fp->f_lock);
1176} 1187}
1177 1188
1178static int 1189static int
1179fdfile_ctor(void *arg, void *obj, int flags) 1190fdfile_ctor(void *arg, void *obj, int flags)
1180{ 1191{
1181 fdfile_t *ff = obj; 1192 fdfile_t *ff = obj;
1182 1193
1183 memset(ff, 0, sizeof(*ff)); 1194 memset(ff, 0, sizeof(*ff));
1184 mutex_init(&ff->ff_lock, MUTEX_DEFAULT, IPL_NONE); 1195 mutex_init(&ff->ff_lock, MUTEX_DEFAULT, IPL_NONE);
1185 cv_init(&ff->ff_closing, "fdclose"); 1196 cv_init(&ff->ff_closing, "fdclose");
1186 1197
1187 return 0; 1198 return 0;
1188} 1199}
1189 1200
1190static void 1201static void
1191fdfile_dtor(void *arg, void *obj) 1202fdfile_dtor(void *arg, void *obj)
1192{ 1203{
1193 fdfile_t *ff = obj; 1204 fdfile_t *ff = obj;
1194 1205
1195 mutex_destroy(&ff->ff_lock); 1206 mutex_destroy(&ff->ff_lock);
1196 cv_destroy(&ff->ff_closing); 1207 cv_destroy(&ff->ff_closing);
1197} 1208}
1198 1209
1199file_t * 1210file_t *
1200fgetdummy(void) 1211fgetdummy(void)
1201{ 1212{
1202 file_t *fp; 1213 file_t *fp;
1203 1214
1204 fp = kmem_alloc(sizeof(*fp), KM_SLEEP); 1215 fp = kmem_alloc(sizeof(*fp), KM_SLEEP);
1205 if (fp != NULL) { 1216 if (fp != NULL) {
1206 memset(fp, 0, sizeof(*fp)); 1217 memset(fp, 0, sizeof(*fp));
1207 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE); 1218 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1208 } 1219 }
1209 return fp; 1220 return fp;
1210} 1221}
1211 1222
1212void 1223void
1213fputdummy(file_t *fp) 1224fputdummy(file_t *fp)
1214{ 1225{
1215 1226
1216 mutex_destroy(&fp->f_lock); 1227 mutex_destroy(&fp->f_lock);
1217 kmem_free(fp, sizeof(*fp)); 1228 kmem_free(fp, sizeof(*fp));
1218} 1229}
1219 1230
1220/* 1231/*
1221 * Make p2 share p1's cwdinfo. 1232 * Make p2 share p1's cwdinfo.
1222 */ 1233 */
1223void 1234void
1224cwdshare(struct proc *p2) 1235cwdshare(struct proc *p2)
1225{ 1236{
1226 struct cwdinfo *cwdi; 1237 struct cwdinfo *cwdi;
1227 1238
1228 cwdi = curproc->p_cwdi; 1239 cwdi = curproc->p_cwdi;
1229 1240
1230 atomic_inc_uint(&cwdi->cwdi_refcnt); 1241 atomic_inc_uint(&cwdi->cwdi_refcnt);
1231 p2->p_cwdi = cwdi; 1242 p2->p_cwdi = cwdi;
1232} 1243}
1233 1244
1234/* 1245/*
1235 * Release a cwdinfo structure. 1246 * Release a cwdinfo structure.
1236 */ 1247 */
1237void 1248void
1238cwdfree(struct cwdinfo *cwdi) 1249cwdfree(struct cwdinfo *cwdi)
1239{ 1250{
1240 1251
1241 if (atomic_dec_uint_nv(&cwdi->cwdi_refcnt) > 0) 1252 if (atomic_dec_uint_nv(&cwdi->cwdi_refcnt) > 0)
1242 return; 1253 return;
1243 1254
1244 vrele(cwdi->cwdi_cdir); 1255 vrele(cwdi->cwdi_cdir);
1245 if (cwdi->cwdi_rdir) 1256 if (cwdi->cwdi_rdir)
1246 vrele(cwdi->cwdi_rdir); 1257 vrele(cwdi->cwdi_rdir);
1247 if (cwdi->cwdi_edir) 1258 if (cwdi->cwdi_edir)
1248 vrele(cwdi->cwdi_edir); 1259 vrele(cwdi->cwdi_edir);
1249 pool_cache_put(cwdi_cache, cwdi); 1260 pool_cache_put(cwdi_cache, cwdi);
1250} 1261}
1251 1262
1252/* 1263/*
1253 * Create an initial filedesc structure. 1264 * Create an initial filedesc structure.
1254 */ 1265 */
1255filedesc_t * 1266filedesc_t *
1256fd_init(filedesc_t *fdp) 1267fd_init(filedesc_t *fdp)
1257{ 1268{
1258 unsigned fd; 1269 unsigned fd;
1259 1270
1260 if (fdp == NULL) { 1271 if (fdp == NULL) {
1261 fdp = pool_cache_get(filedesc_cache, PR_WAITOK); 1272 fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1262 } else { 1273 } else {
1263 filedesc_ctor(NULL, fdp, PR_WAITOK); 1274 filedesc_ctor(NULL, fdp, PR_WAITOK);
1264 } 1275 }
1265 1276
1266 fdp->fd_refcnt = 1; 1277 fdp->fd_refcnt = 1;
1267 fdp->fd_ofiles = fdp->fd_dfiles; 1278 fdp->fd_ofiles = fdp->fd_dfiles;
1268 fdp->fd_nfiles = NDFILE; 1279 fdp->fd_nfiles = NDFILE;
1269 fdp->fd_himap = fdp->fd_dhimap; 1280 fdp->fd_himap = fdp->fd_dhimap;
1270 fdp->fd_lomap = fdp->fd_dlomap; 1281 fdp->fd_lomap = fdp->fd_dlomap;
1271 KASSERT(fdp->fd_lastfile == -1); 1282 KASSERT(fdp->fd_lastfile == -1);
1272 KASSERT(fdp->fd_lastkqfile == -1); 1283 KASSERT(fdp->fd_lastkqfile == -1);
1273 KASSERT(fdp->fd_knhash == NULL); 1284 KASSERT(fdp->fd_knhash == NULL);
1274 1285
1275 memset(&fdp->fd_startzero, 0, sizeof(*fdp) - 1286 memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
1276 offsetof(filedesc_t, fd_startzero)); 1287 offsetof(filedesc_t, fd_startzero));
1277 for (fd = 0; fd < NDFDFILE; fd++) { 1288 for (fd = 0; fd < NDFDFILE; fd++) {
1278 fdp->fd_ofiles[fd] = (fdfile_t *)fdp->fd_dfdfile[fd]; 1289 fdp->fd_ofiles[fd] = (fdfile_t *)fdp->fd_dfdfile[fd];
1279 } 1290 }
1280 1291
1281 return fdp; 1292 return fdp;
1282} 1293}
1283 1294
1284/* 1295/*
1285 * Initialize a file descriptor table. 1296 * Initialize a file descriptor table.
1286 */ 1297 */
1287static int 1298static int
1288filedesc_ctor(void *arg, void *obj, int flag) 1299filedesc_ctor(void *arg, void *obj, int flag)
1289{ 1300{
1290 filedesc_t *fdp = obj; 1301 filedesc_t *fdp = obj;
1291 int i; 1302 int i;
1292 1303
1293 memset(fdp, 0, sizeof(*fdp)); 1304 memset(fdp, 0, sizeof(*fdp));
1294 mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE); 1305 mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
1295 fdp->fd_lastfile = -1; 1306 fdp->fd_lastfile = -1;
1296 fdp->fd_lastkqfile = -1; 1307 fdp->fd_lastkqfile = -1;
1297 1308
1298 CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t)); 1309 CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
1299 for (i = 0; i < NDFDFILE; i++) { 1310 for (i = 0; i < NDFDFILE; i++) {
1300 fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK); 1311 fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK);
1301 } 1312 }
1302 1313
1303 return 0; 1314 return 0;
1304} 1315}
1305 1316
1306static void 1317static void
1307filedesc_dtor(void *arg, void *obj) 1318filedesc_dtor(void *arg, void *obj)
1308{ 1319{
1309 filedesc_t *fdp = obj; 1320 filedesc_t *fdp = obj;
1310 int i; 1321 int i;
1311 1322
1312 for (i = 0; i < NDFDFILE; i++) { 1323 for (i = 0; i < NDFDFILE; i++) {
1313 fdfile_dtor(NULL, fdp->fd_dfdfile[i]); 1324 fdfile_dtor(NULL, fdp->fd_dfdfile[i]);
1314 } 1325 }
1315 1326
1316 mutex_destroy(&fdp->fd_lock); 1327 mutex_destroy(&fdp->fd_lock);
1317} 1328}
1318 1329
1319/* 1330/*
1320 * Make p2 share p1's filedesc structure. 1331 * Make p2 share p1's filedesc structure.
1321 */ 1332 */
1322void 1333void
1323fd_share(struct proc *p2) 1334fd_share(struct proc *p2)
1324{ 1335{
1325 filedesc_t *fdp; 1336 filedesc_t *fdp;
1326 1337
1327 fdp = curlwp->l_fd; 1338 fdp = curlwp->l_fd;
1328 p2->p_fd = fdp; 1339 p2->p_fd = fdp;
1329 atomic_inc_uint(&fdp->fd_refcnt); 1340 atomic_inc_uint(&fdp->fd_refcnt);
1330} 1341}
1331 1342
1332/* 1343/*
1333 * Copy a filedesc structure. 1344 * Copy a filedesc structure.
1334 */ 1345 */
1335filedesc_t * 1346filedesc_t *
1336fd_copy(void) 1347fd_copy(void)
1337{ 1348{
1338 filedesc_t *newfdp, *fdp; 1349 filedesc_t *newfdp, *fdp;
1339 fdfile_t *ff, *fflist, **ffp, **nffp, *ff2; 1350 fdfile_t *ff, *fflist, **ffp, **nffp, *ff2;
1340 int i, nused, numfiles, lastfile, j, newlast; 1351 int i, nused, numfiles, lastfile, j, newlast;
1341 file_t *fp; 1352 file_t *fp;
1342 1353
1343 fdp = curproc->p_fd; 1354 fdp = curproc->p_fd;
1344 newfdp = pool_cache_get(filedesc_cache, PR_WAITOK); 1355 newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1345 newfdp->fd_refcnt = 1; 1356 newfdp->fd_refcnt = 1;
1346 1357
1347 KASSERT(newfdp->fd_knhash == NULL); 1358 KASSERT(newfdp->fd_knhash == NULL);
1348 KASSERT(newfdp->fd_knhashmask == 0); 1359 KASSERT(newfdp->fd_knhashmask == 0);
1349 KASSERT(newfdp->fd_discard == NULL); 1360 KASSERT(newfdp->fd_discard == NULL);
1350 1361
1351 for (;;) { 1362 for (;;) {
1352 numfiles = fdp->fd_nfiles; 1363 numfiles = fdp->fd_nfiles;
1353 lastfile = fdp->fd_lastfile; 1364 lastfile = fdp->fd_lastfile;
1354 1365
1355 /* 1366 /*
1356 * If the number of open files fits in the internal arrays 1367 * If the number of open files fits in the internal arrays
1357 * of the open file structure, use them, otherwise allocate 1368 * of the open file structure, use them, otherwise allocate
1358 * additional memory for the number of descriptors currently 1369 * additional memory for the number of descriptors currently
1359 * in use. 1370 * in use.
1360 */ 1371 */
1361 if (lastfile < NDFILE) { 1372 if (lastfile < NDFILE) {
1362 i = NDFILE; 1373 i = NDFILE;
1363 newfdp->fd_ofiles = newfdp->fd_dfiles; 1374 newfdp->fd_ofiles = newfdp->fd_dfiles;
1364 } else { 1375 } else {
1365 /* 1376 /*
1366 * Compute the smallest multiple of NDEXTENT needed 1377 * Compute the smallest multiple of NDEXTENT needed
1367 * for the file descriptors currently in use, 1378 * for the file descriptors currently in use,
1368 * allowing the table to shrink. 1379 * allowing the table to shrink.
1369 */ 1380 */
1370 i = numfiles; 1381 i = numfiles;
1371 while (i >= 2 * NDEXTENT && i > lastfile * 2) { 1382 while (i >= 2 * NDEXTENT && i > lastfile * 2) {
1372 i /= 2; 1383 i /= 2;
1373 } 1384 }
1374 newfdp->fd_ofiles = fd_ofile_alloc(i); 1385 newfdp->fd_ofiles = fd_ofile_alloc(i);
1375 KASSERT(i > NDFILE); 1386 KASSERT(i > NDFILE);
1376 } 1387 }
1377 if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) { 1388 if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
1378 newfdp->fd_himap = newfdp->fd_dhimap; 1389 newfdp->fd_himap = newfdp->fd_dhimap;
1379 newfdp->fd_lomap = newfdp->fd_dlomap; 1390 newfdp->fd_lomap = newfdp->fd_dlomap;
1380 } else { 1391 } else {
1381 fd_map_alloc(i, &newfdp->fd_lomap, 1392 fd_map_alloc(i, &newfdp->fd_lomap,
1382 &newfdp->fd_himap); 1393 &newfdp->fd_himap);
1383 } 1394 }
1384 1395
1385 /* 1396 /*
1386 * Allocate and string together fdfile structures. 1397 * Allocate and string together fdfile structures.
1387 * We abuse fdfile_t::ff_file here, but it will be 1398 * We abuse fdfile_t::ff_file here, but it will be
1388 * cleared before this routine returns. 1399 * cleared before this routine returns.
1389 */ 1400 */
1390 nused = fdp->fd_nused; 1401 nused = fdp->fd_nused;
1391 fflist = NULL; 1402 fflist = NULL;
1392 for (j = nused; j != 0; j--) { 1403 for (j = nused; j != 0; j--) {
1393 ff = pool_cache_get(fdfile_cache, PR_WAITOK); 1404 ff = pool_cache_get(fdfile_cache, PR_WAITOK);
1394 ff->ff_file = (void *)fflist; 1405 ff->ff_file = (void *)fflist;
1395 fflist = ff; 1406 fflist = ff;
1396 } 1407 }
1397 1408
1398 mutex_enter(&fdp->fd_lock); 1409 mutex_enter(&fdp->fd_lock);
1399 if (numfiles == fdp->fd_nfiles && nused == fdp->fd_nused && 1410 if (numfiles == fdp->fd_nfiles && nused == fdp->fd_nused &&
1400 lastfile == fdp->fd_lastfile) { 1411 lastfile == fdp->fd_lastfile) {
1401 break; 1412 break;
1402 } 1413 }
1403 mutex_exit(&fdp->fd_lock); 1414 mutex_exit(&fdp->fd_lock);
1404 if (i > NDFILE) { 1415 if (i > NDFILE) {
1405 fd_ofile_free(i, newfdp->fd_ofiles); 1416 fd_ofile_free(i, newfdp->fd_ofiles);
1406 } 1417 }
1407 if (NDHISLOTS(i) > NDHISLOTS(NDFILE)) { 1418 if (NDHISLOTS(i) > NDHISLOTS(NDFILE)) {
1408 fd_map_free(i, newfdp->fd_lomap, newfdp->fd_himap); 1419 fd_map_free(i, newfdp->fd_lomap, newfdp->fd_himap);
1409 } 1420 }
1410 while (fflist != NULL) { 1421 while (fflist != NULL) {
1411 ff = fflist; 1422 ff = fflist;
1412 fflist = (void *)ff->ff_file; 1423 fflist = (void *)ff->ff_file;
1413 ff->ff_file = NULL; 1424 ff->ff_file = NULL;
1414 pool_cache_put(fdfile_cache, ff); 1425 pool_cache_put(fdfile_cache, ff);
1415 } 1426 }
1416 } 1427 }
1417 1428
1418 newfdp->fd_nfiles = i; 1429 newfdp->fd_nfiles = i;
1419 newfdp->fd_freefile = fdp->fd_freefile; 1430 newfdp->fd_freefile = fdp->fd_freefile;
1420 newfdp->fd_exclose = fdp->fd_exclose; 1431 newfdp->fd_exclose = fdp->fd_exclose;
1421 1432
1422 /* 1433 /*
1423 * Clear the entries that will not be copied over. 1434 * Clear the entries that will not be copied over.
1424 * Avoid calling memset with 0 size. 1435 * Avoid calling memset with 0 size.
1425 */ 1436 */
1426 if (lastfile < (i-1)) { 1437 if (lastfile < (i-1)) {
1427 memset(newfdp->fd_ofiles + lastfile + 1, 0, 1438 memset(newfdp->fd_ofiles + lastfile + 1, 0,
1428 (i - lastfile - 1) * sizeof(file_t **)); 1439 (i - lastfile - 1) * sizeof(file_t **));
1429 } 1440 }
1430 if (i < NDENTRIES * NDENTRIES) { 1441 if (i < NDENTRIES * NDENTRIES) {
1431 i = NDENTRIES * NDENTRIES; /* size of inlined bitmaps */ 1442 i = NDENTRIES * NDENTRIES; /* size of inlined bitmaps */
1432 } 1443 }
1433 memcpy(newfdp->fd_himap, fdp->fd_himap, NDHISLOTS(i)*sizeof(uint32_t)); 1444 memcpy(newfdp->fd_himap, fdp->fd_himap, NDHISLOTS(i)*sizeof(uint32_t));
1434 memcpy(newfdp->fd_lomap, fdp->fd_lomap, NDLOSLOTS(i)*sizeof(uint32_t)); 1445 memcpy(newfdp->fd_lomap, fdp->fd_lomap, NDLOSLOTS(i)*sizeof(uint32_t));
1435 1446
1436 ffp = fdp->fd_ofiles; 1447 ffp = fdp->fd_ofiles;
1437 nffp = newfdp->fd_ofiles; 1448 nffp = newfdp->fd_ofiles;
1438 j = imax(lastfile, (NDFDFILE - 1)); 1449 j = imax(lastfile, (NDFDFILE - 1));
1439 newlast = -1; 1450 newlast = -1;
1440 KASSERT(j < fdp->fd_nfiles); 1451 KASSERT(j < fdp->fd_nfiles);
1441 for (i = 0; i <= j; i++, ffp++, *nffp++ = ff2) { 1452 for (i = 0; i <= j; i++, ffp++, *nffp++ = ff2) {
1442 ff = *ffp; 1453 ff = *ffp;
1443 /* Install built-in fdfiles even if unused here. */ 1454 /* Install built-in fdfiles even if unused here. */
1444 if (i < NDFDFILE) { 1455 if (i < NDFDFILE) {
1445 ff2 = (fdfile_t *)newfdp->fd_dfdfile[i]; 1456 ff2 = (fdfile_t *)newfdp->fd_dfdfile[i];
1446 } else { 1457 } else {
1447 ff2 = NULL; 1458 ff2 = NULL;
1448 } 1459 }
1449 /* Determine if descriptor is active in parent. */ 1460 /* Determine if descriptor is active in parent. */
1450 if (ff == NULL || !fd_isused(fdp, i)) { 1461 if (ff == NULL || !fd_isused(fdp, i)) {
1451 KASSERT(ff != NULL || i >= NDFDFILE); 1462 KASSERT(ff != NULL || i >= NDFDFILE);
1452 continue; 1463 continue;
1453 } 1464 }
1454 mutex_enter(&ff->ff_lock); 1465 mutex_enter(&ff->ff_lock);
1455 fp = ff->ff_file; 1466 fp = ff->ff_file;
1456 if (fp == NULL) { 1467 if (fp == NULL) {
1457 /* Descriptor is half-open: free slot. */ 1468 /* Descriptor is half-open: free slot. */
1458 fd_zap(newfdp, i); 1469 fd_zap(newfdp, i);
1459 mutex_exit(&ff->ff_lock); 1470 mutex_exit(&ff->ff_lock);
1460 continue; 1471 continue;
1461 } 1472 }
1462 if (fp->f_type == DTYPE_KQUEUE) { 1473 if (fp->f_type == DTYPE_KQUEUE) {
1463 /* kqueue descriptors cannot be copied. */ 1474 /* kqueue descriptors cannot be copied. */
1464 fd_zap(newfdp, i); 1475 fd_zap(newfdp, i);
1465 mutex_exit(&ff->ff_lock); 1476 mutex_exit(&ff->ff_lock);
1466 continue; 1477 continue;
1467 } 1478 }
1468 /* It's active: add a reference to the file. */ 1479 /* It's active: add a reference to the file. */
1469 mutex_enter(&fp->f_lock); 1480 mutex_enter(&fp->f_lock);
1470 fp->f_count++; 1481 fp->f_count++;
1471 mutex_exit(&fp->f_lock); 1482 mutex_exit(&fp->f_lock);
1472 /* Consume one fdfile_t to represent it. */ 1483 /* Consume one fdfile_t to represent it. */
1473 if (i >= NDFDFILE) { 1484 if (i >= NDFDFILE) {
1474 ff2 = fflist; 1485 ff2 = fflist;
1475 fflist = (void *)ff2->ff_file; 1486 fflist = (void *)ff2->ff_file;
1476 } 1487 }
1477 ff2->ff_file = fp; 1488 ff2->ff_file = fp;
1478 ff2->ff_exclose = ff->ff_exclose; 1489 ff2->ff_exclose = ff->ff_exclose;
1479 ff2->ff_allocated = true; 1490 ff2->ff_allocated = true;
1480 mutex_exit(&ff->ff_lock); 1491 mutex_exit(&ff->ff_lock);
1481 if (i > newlast) { 1492 if (i > newlast) {
1482 newlast = i; 1493 newlast = i;
1483 } 1494 }
1484 } 1495 }
1485 mutex_exit(&fdp->fd_lock); 1496 mutex_exit(&fdp->fd_lock);
1486 1497
1487 /* Discard unused fdfile_t structures. */ 1498 /* Discard unused fdfile_t structures. */
1488 while (__predict_false(fflist != NULL)) { 1499 while (__predict_false(fflist != NULL)) {
1489 ff = fflist; 1500 ff = fflist;
1490 fflist = (void *)ff->ff_file; 1501 fflist = (void *)ff->ff_file;
1491 ff->ff_file = NULL; 1502 ff->ff_file = NULL;
1492 pool_cache_put(fdfile_cache, ff); 1503 pool_cache_put(fdfile_cache, ff);
1493 nused--; 1504 nused--;
1494 } 1505 }
1495 KASSERT(nused >= 0); 1506 KASSERT(nused >= 0);
1496 KASSERT(newfdp->fd_ofiles[0] == (fdfile_t *)newfdp->fd_dfdfile[0]); 1507 KASSERT(newfdp->fd_ofiles[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
1497 1508
1498 newfdp->fd_nused = nused; 1509 newfdp->fd_nused = nused;
1499 newfdp->fd_lastfile = newlast; 1510 newfdp->fd_lastfile = newlast;
1500 1511
1501 return (newfdp); 1512 return (newfdp);
1502} 1513}
1503 1514
1504/* 1515/*
1505 * Release a filedesc structure. 1516 * Release a filedesc structure.
1506 */ 1517 */
1507void 1518void
1508fd_free(void) 1519fd_free(void)
1509{ 1520{
1510 filedesc_t *fdp; 1521 filedesc_t *fdp;
1511 fdfile_t *ff; 1522 fdfile_t *ff;
1512 file_t *fp; 1523 file_t *fp;
1513 int fd, lastfd; 1524 int fd, lastfd;
1514 void **discard; 1525 void **discard;
1515 1526
1516 fdp = curlwp->l_fd; 1527 fdp = curlwp->l_fd;
1517 1528
1518 KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]); 1529 KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
1519 1530
1520 if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0) 1531 if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
1521 return; 1532 return;
1522 1533
1523 /* 1534 /*
1524 * Close any files that the process holds open. 1535 * Close any files that the process holds open.
1525 */ 1536 */
1526 for (fd = 0, lastfd = fdp->fd_nfiles - 1; fd <= lastfd; fd++) { 1537 for (fd = 0, lastfd = fdp->fd_nfiles - 1; fd <= lastfd; fd++) {
1527 ff = fdp->fd_ofiles[fd]; 1538 ff = fdp->fd_ofiles[fd];
1528 KASSERT(fd >= NDFDFILE || 1539 KASSERT(fd >= NDFDFILE ||
1529 ff == (fdfile_t *)fdp->fd_dfdfile[fd]); 1540 ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1530 if ((ff = fdp->fd_ofiles[fd]) == NULL) 1541 if ((ff = fdp->fd_ofiles[fd]) == NULL)
1531 continue; 1542 continue;
1532 if ((fp = ff->ff_file) != NULL) { 1543 if ((fp = ff->ff_file) != NULL) {
1533 /* 1544 /*
1534 * Must use fd_close() here as kqueue holds 1545 * Must use fd_close() here as kqueue holds
1535 * long term references to descriptors. 1546 * long term references to descriptors.
1536 */ 1547 */
1537 ff->ff_refcnt++; 1548 ff->ff_refcnt++;
1538 fd_close(fd); 1549 fd_close(fd);
1539 } 1550 }
1540 KASSERT(ff->ff_refcnt == 0); 1551 KASSERT(ff->ff_refcnt == 0);
1541 KASSERT(ff->ff_file == NULL); 1552 KASSERT(ff->ff_file == NULL);
1542 KASSERT(!ff->ff_exclose); 1553 KASSERT(!ff->ff_exclose);
1543 KASSERT(!ff->ff_allocated); 1554 KASSERT(!ff->ff_allocated);
1544 if (fd >= NDFDFILE) { 1555 if (fd >= NDFDFILE) {
1545 pool_cache_put(fdfile_cache, ff); 1556 pool_cache_put(fdfile_cache, ff);
1546 } 1557 }
1547 } 1558 }
1548 1559
1549 /* 1560 /*
1550 * Clean out the descriptor table for the next user and return 1561 * Clean out the descriptor table for the next user and return
1551 * to the cache. 1562 * to the cache.
1552 */ 1563 */
1553 while ((discard = fdp->fd_discard) != NULL) { 1564 while ((discard = fdp->fd_discard) != NULL) {
1554 fdp->fd_discard = discard[0]; 1565 fdp->fd_discard = discard[0];
1555 kmem_free(discard, (uintptr_t)discard[1]); 1566 kmem_free(discard, (uintptr_t)discard[1]);
1556 } 1567 }
1557 if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) { 1568 if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) {
1558 KASSERT(fdp->fd_himap != fdp->fd_dhimap); 1569 KASSERT(fdp->fd_himap != fdp->fd_dhimap);
1559 KASSERT(fdp->fd_lomap != fdp->fd_dlomap); 1570 KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
1560 fd_map_free(fdp->fd_nfiles, fdp->fd_lomap, fdp->fd_himap); 1571 fd_map_free(fdp->fd_nfiles, fdp->fd_lomap, fdp->fd_himap);
1561 } 1572 }
1562 if (fdp->fd_nfiles > NDFILE) { 1573 if (fdp->fd_nfiles > NDFILE) {
1563 KASSERT(fdp->fd_ofiles != fdp->fd_dfiles); 1574 KASSERT(fdp->fd_ofiles != fdp->fd_dfiles);
1564 fd_ofile_free(fdp->fd_nfiles, fdp->fd_ofiles); 1575 fd_ofile_free(fdp->fd_nfiles, fdp->fd_ofiles);
1565 } 1576 }
1566 if (fdp->fd_knhash != NULL) { 1577 if (fdp->fd_knhash != NULL) {
1567 hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask); 1578 hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
1568 fdp->fd_knhash = NULL; 1579 fdp->fd_knhash = NULL;
1569 fdp->fd_knhashmask = 0; 1580 fdp->fd_knhashmask = 0;
1570 } else { 1581 } else {
1571 KASSERT(fdp->fd_knhashmask == 0); 1582 KASSERT(fdp->fd_knhashmask == 0);
1572 } 1583 }
1573 fdp->fd_lastkqfile = -1; 1584 fdp->fd_lastkqfile = -1;
1574 pool_cache_put(filedesc_cache, fdp); 1585 pool_cache_put(filedesc_cache, fdp);
1575} 1586}
1576 1587
1577/* 1588/*
1578 * File Descriptor pseudo-device driver (/dev/fd/). 1589 * File Descriptor pseudo-device driver (/dev/fd/).
1579 * 1590 *
1580 * Opening minor device N dup()s the file (if any) connected to file 1591 * Opening minor device N dup()s the file (if any) connected to file
1581 * descriptor N belonging to the calling process. Note that this driver 1592 * descriptor N belonging to the calling process. Note that this driver
1582 * consists of only the ``open()'' routine, because all subsequent 1593 * consists of only the ``open()'' routine, because all subsequent
1583 * references to this file will be direct to the other driver. 1594 * references to this file will be direct to the other driver.
1584 */ 1595 */
1585static int 1596static int
1586filedescopen(dev_t dev, int mode, int type, lwp_t *l) 1597filedescopen(dev_t dev, int mode, int type, lwp_t *l)
1587{ 1598{
1588 1599
1589 /* 1600 /*
1590 * XXX Kludge: set dupfd to contain the value of the 1601 * XXX Kludge: set dupfd to contain the value of the
1591 * the file descriptor being sought for duplication. The error 1602 * the file descriptor being sought for duplication. The error
1592 * return ensures that the vnode for this device will be released 1603 * return ensures that the vnode for this device will be released
1593 * by vn_open. Open will detect this special error and take the 1604 * by vn_open. Open will detect this special error and take the
1594 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN 1605 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1595 * will simply report the error. 1606 * will simply report the error.
1596 */ 1607 */
1597 l->l_dupfd = minor(dev); /* XXX */ 1608 l->l_dupfd = minor(dev); /* XXX */
1598 return EDUPFD; 1609 return EDUPFD;
1599} 1610}
1600 1611
1601/* 1612/*
1602 * Duplicate the specified descriptor to a free descriptor. 1613 * Duplicate the specified descriptor to a free descriptor.
1603 */ 1614 */
1604int 1615int
1605fd_dupopen(int old, int *new, int mode, int error) 1616fd_dupopen(int old, int *new, int mode, int error)
1606{ 1617{
1607 filedesc_t *fdp; 1618 filedesc_t *fdp;
1608 fdfile_t *ff; 1619 fdfile_t *ff;
1609 file_t *fp; 1620 file_t *fp;
1610 1621
1611 if ((fp = fd_getfile(old)) == NULL) { 1622 if ((fp = fd_getfile(old)) == NULL) {
1612 return EBADF; 1623 return EBADF;
1613 } 1624 }
1614 fdp = curlwp->l_fd; 1625 fdp = curlwp->l_fd;
1615 ff = fdp->fd_ofiles[old]; 1626 ff = fdp->fd_ofiles[old];
1616 1627
1617 /* 1628 /*
1618 * There are two cases of interest here. 1629 * There are two cases of interest here.
1619 * 1630 *
1620 * For EDUPFD simply dup (dfd) to file descriptor 1631 * For EDUPFD simply dup (dfd) to file descriptor
1621 * (indx) and return. 1632 * (indx) and return.
1622 * 1633 *
1623 * For EMOVEFD steal away the file structure from (dfd) and 1634 * For EMOVEFD steal away the file structure from (dfd) and
1624 * store it in (indx). (dfd) is effectively closed by 1635 * store it in (indx). (dfd) is effectively closed by
1625 * this operation. 1636 * this operation.
1626 * 1637 *
1627 * Any other error code is just returned. 1638 * Any other error code is just returned.
1628 */ 1639 */
1629 switch (error) { 1640 switch (error) {
1630 case EDUPFD: 1641 case EDUPFD:
1631 /* 1642 /*
1632 * Check that the mode the file is being opened for is a 1643 * Check that the mode the file is being opened for is a
1633 * subset of the mode of the existing descriptor. 1644 * subset of the mode of the existing descriptor.
1634 */ 1645 */
1635 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { 1646 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
1636 error = EACCES; 1647 error = EACCES;
1637 break; 1648 break;
1638 } 1649 }
1639 1650
1640 /* Copy it. */ 1651 /* Copy it. */
1641 error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose); 1652 error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1642 break; 1653 break;
1643 1654
1644 case EMOVEFD: 1655 case EMOVEFD:
1645 /* Copy it. */ 1656 /* Copy it. */
1646 error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose); 1657 error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1647 if (error != 0) { 1658 if (error != 0) {
1648 break; 1659 break;
1649 } 1660 }
1650 1661
1651 /* Steal away the file pointer from 'old'. */ 1662 /* Steal away the file pointer from 'old'. */
1652 (void)fd_close(old); 1663 (void)fd_close(old);
1653 return 0; 1664 return 0;
1654 } 1665 }
1655 1666
1656 fd_putfile(old); 1667 fd_putfile(old);
1657 return error; 1668 return error;
1658} 1669}
1659 1670
1660/* 1671/*
1661 * Close open files on exec. 1672 * Close open files on exec.
1662 */ 1673 */
1663void 1674void
1664fd_closeexec(void) 1675fd_closeexec(void)
1665{ 1676{
1666 struct cwdinfo *cwdi; 1677 struct cwdinfo *cwdi;
1667 proc_t *p; 1678 proc_t *p;
1668 filedesc_t *fdp; 1679 filedesc_t *fdp;
1669 fdfile_t *ff; 1680 fdfile_t *ff;
1670 lwp_t *l; 1681 lwp_t *l;
1671 int fd; 1682 int fd;
1672 1683
1673 l = curlwp; 1684 l = curlwp;
1674 p = l->l_proc; 1685 p = l->l_proc;
1675 fdp = p->p_fd; 1686 fdp = p->p_fd;
1676 cwdi = p->p_cwdi; 1687 cwdi = p->p_cwdi;
1677 1688
1678 if (cwdi->cwdi_refcnt > 1) { 1689 if (cwdi->cwdi_refcnt > 1) {
1679 cwdi = cwdinit(); 1690 cwdi = cwdinit();
1680 cwdfree(p->p_cwdi); 1691 cwdfree(p->p_cwdi);
1681 p->p_cwdi = cwdi; 1692 p->p_cwdi = cwdi;
1682 } 1693 }
1683 if (p->p_cwdi->cwdi_edir) { 1694 if (p->p_cwdi->cwdi_edir) {
1684 vrele(p->p_cwdi->cwdi_edir); 1695 vrele(p->p_cwdi->cwdi_edir);
1685 } 1696 }
1686 1697
1687 if (fdp->fd_refcnt > 1) { 1698 if (fdp->fd_refcnt > 1) {
1688 fdp = fd_copy(); 1699 fdp = fd_copy();
1689 fd_free(); 1700 fd_free();
1690 p->p_fd = fdp; 1701 p->p_fd = fdp;
1691 l->l_fd = fdp; 1702 l->l_fd = fdp;
1692 } 1703 }
1693 if (!fdp->fd_exclose) { 1704 if (!fdp->fd_exclose) {
1694 return; 1705 return;
1695 } 1706 }
1696 fdp->fd_exclose = false; 1707 fdp->fd_exclose = false;
1697 1708
1698 for (fd = 0; fd <= fdp->fd_lastfile; fd++) { 1709 for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
1699 if ((ff = fdp->fd_ofiles[fd]) == NULL) { 1710 if ((ff = fdp->fd_ofiles[fd]) == NULL) {
1700 KASSERT(fd >= NDFDFILE); 1711 KASSERT(fd >= NDFDFILE);
1701 continue; 1712 continue;
1702 } 1713 }
1703 KASSERT(fd >= NDFDFILE || 1714 KASSERT(fd >= NDFDFILE ||
1704 ff == (fdfile_t *)fdp->fd_dfdfile[fd]); 1715 ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1705 if (ff->ff_file == NULL) 1716 if (ff->ff_file == NULL)
1706 continue; 1717 continue;
1707 if (ff->ff_exclose) { 1718 if (ff->ff_exclose) {
1708 /* 1719 /*
1709 * We need a reference to close the file. 1720 * We need a reference to close the file.
1710 * No other threads can see the fdfile_t at 1721 * No other threads can see the fdfile_t at
1711 * this point, so don't bother locking. 1722 * this point, so don't bother locking.
1712 */ 1723 */
1713 KASSERT((ff->ff_refcnt & FR_CLOSING) == 0); 1724 KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
1714 ff->ff_refcnt++; 1725 ff->ff_refcnt++;
1715 fd_close(fd); 1726 fd_close(fd);
1716 } 1727 }
1717 } 1728 }
1718} 1729}
1719 1730
1720/* 1731/*
1721 * It is unsafe for set[ug]id processes to be started with file 1732 * It is unsafe for set[ug]id processes to be started with file
1722 * descriptors 0..2 closed, as these descriptors are given implicit 1733 * descriptors 0..2 closed, as these descriptors are given implicit
1723 * significance in the Standard C library. fdcheckstd() will create a 1734 * significance in the Standard C library. fdcheckstd() will create a
1724 * descriptor referencing /dev/null for each of stdin, stdout, and 1735 * descriptor referencing /dev/null for each of stdin, stdout, and
1725 * stderr that is not already open. 1736 * stderr that is not already open.
1726 */ 1737 */
1727#define CHECK_UPTO 3 1738#define CHECK_UPTO 3
1728int 1739int
1729fd_checkstd(void) 1740fd_checkstd(void)
1730{ 1741{
1731 struct proc *p; 1742 struct proc *p;
1732 struct nameidata nd; 1743 struct nameidata nd;
1733 filedesc_t *fdp; 1744 filedesc_t *fdp;
1734 file_t *fp; 1745 file_t *fp;
1735 struct proc *pp; 1746 struct proc *pp;
1736 int fd, i, error, flags = FREAD|FWRITE; 1747 int fd, i, error, flags = FREAD|FWRITE;
1737 char closed[CHECK_UPTO * 3 + 1], which[3 + 1]; 1748 char closed[CHECK_UPTO * 3 + 1], which[3 + 1];
1738 1749
1739 p = curproc; 1750 p = curproc;
1740 closed[0] = '\0'; 1751 closed[0] = '\0';
1741 if ((fdp = p->p_fd) == NULL) 1752 if ((fdp = p->p_fd) == NULL)
1742 return (0); 1753 return (0);
1743 for (i = 0; i < CHECK_UPTO; i++) { 1754 for (i = 0; i < CHECK_UPTO; i++) {
1744 KASSERT(i >= NDFDFILE || 1755 KASSERT(i >= NDFDFILE ||
1745 fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]); 1756 fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
1746 if (fdp->fd_ofiles[i]->ff_file != NULL) 1757 if (fdp->fd_ofiles[i]->ff_file != NULL)
1747 continue; 1758 continue;
1748 snprintf(which, sizeof(which), ",%d", i); 1759 snprintf(which, sizeof(which), ",%d", i);
1749 strlcat(closed, which, sizeof(closed)); 1760 strlcat(closed, which, sizeof(closed));
1750 if ((error = fd_allocfile(&fp, &fd)) != 0) 1761 if ((error = fd_allocfile(&fp, &fd)) != 0)
1751 return (error); 1762 return (error);
1752 KASSERT(fd < CHECK_UPTO); 1763 KASSERT(fd < CHECK_UPTO);
1753 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null"); 1764 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null");
1754 if ((error = vn_open(&nd, flags, 0)) != 0) { 1765 if ((error = vn_open(&nd, flags, 0)) != 0) {
1755 fd_abort(p, fp, fd); 1766 fd_abort(p, fp, fd);
1756 return (error); 1767 return (error);
1757 } 1768 }
1758 fp->f_data = nd.ni_vp; 1769 fp->f_data = nd.ni_vp;
1759 fp->f_flag = flags; 1770 fp->f_flag = flags;
1760 fp->f_ops = &vnops; 1771 fp->f_ops = &vnops;
1761 fp->f_type = DTYPE_VNODE; 1772 fp->f_type = DTYPE_VNODE;
1762 VOP_UNLOCK(nd.ni_vp, 0); 1773 VOP_UNLOCK(nd.ni_vp, 0);
1763 fd_affix(p, fp, fd); 1774 fd_affix(p, fp, fd);
1764 } 1775 }
1765 if (closed[0] != '\0') { 1776 if (closed[0] != '\0') {
1766 mutex_enter(proc_lock); 1777 mutex_enter(proc_lock);
1767 pp = p->p_pptr; 1778 pp = p->p_pptr;
1768 mutex_enter(pp->p_lock); 1779 mutex_enter(pp->p_lock);
1769 log(LOG_WARNING, "set{u,g}id pid %d (%s) " 1780 log(LOG_WARNING, "set{u,g}id pid %d (%s) "
1770 "was invoked by uid %d ppid %d (%s) " 1781 "was invoked by uid %d ppid %d (%s) "
1771 "with fd %s closed\n", 1782 "with fd %s closed\n",
1772 p->p_pid, p->p_comm, kauth_cred_geteuid(pp->p_cred), 1783 p->p_pid, p->p_comm, kauth_cred_geteuid(pp->p_cred),
1773 pp->p_pid, pp->p_comm, &closed[1]); 1784 pp->p_pid, pp->p_comm, &closed[1]);
1774 mutex_exit(pp->p_lock); 1785 mutex_exit(pp->p_lock);
1775 mutex_exit(proc_lock); 1786 mutex_exit(proc_lock);
1776 } 1787 }
1777 return (0); 1788 return (0);
1778} 1789}
1779#undef CHECK_UPTO 1790#undef CHECK_UPTO
1780 1791
1781/* 1792/*
1782 * Sets descriptor owner. If the owner is a process, 'pgid' 1793 * Sets descriptor owner. If the owner is a process, 'pgid'
1783 * is set to positive value, process ID. If the owner is process group, 1794 * is set to positive value, process ID. If the owner is process group,
1784 * 'pgid' is set to -pg_id. 1795 * 'pgid' is set to -pg_id.
1785 */ 1796 */
1786int 1797int
1787fsetown(pid_t *pgid, u_long cmd, const void *data) 1798fsetown(pid_t *pgid, u_long cmd, const void *data)
1788{ 1799{
1789 int id = *(const int *)data; 1800 int id = *(const int *)data;
1790 int error; 1801 int error;
1791 1802
1792 switch (cmd) { 1803 switch (cmd) {
1793 case TIOCSPGRP: 1804 case TIOCSPGRP:
1794 if (id < 0) 1805 if (id < 0)
1795 return (EINVAL); 1806 return (EINVAL);
1796 id = -id; 1807 id = -id;
1797 break; 1808 break;
1798 default: 1809 default:
1799 break; 1810 break;
1800 } 1811 }
1801 1812
1802 if (id > 0 && !pfind(id)) 1813 if (id > 0 && !pfind(id))
1803 return (ESRCH); 1814 return (ESRCH);
1804 else if (id < 0 && (error = pgid_in_session(curproc, -id))) 1815 else if (id < 0 && (error = pgid_in_session(curproc, -id)))
1805 return (error); 1816 return (error);
1806 1817
1807 *pgid = id; 1818 *pgid = id;
1808 return (0); 1819 return (0);
1809} 1820}
1810 1821
1811/* 1822/*
1812 * Return descriptor owner information. If the value is positive, 1823 * Return descriptor owner information. If the value is positive,
1813 * it's process ID. If it's negative, it's process group ID and 1824 * it's process ID. If it's negative, it's process group ID and
1814 * needs the sign removed before use. 1825 * needs the sign removed before use.
1815 */ 1826 */
1816int 1827int
1817fgetown(pid_t pgid, u_long cmd, void *data) 1828fgetown(pid_t pgid, u_long cmd, void *data)
1818{ 1829{
1819 1830
1820 switch (cmd) { 1831 switch (cmd) {
1821 case TIOCGPGRP: 1832 case TIOCGPGRP:
1822 *(int *)data = -pgid; 1833 *(int *)data = -pgid;
1823 break; 1834 break;
1824 default: 1835 default:
1825 *(int *)data = pgid; 1836 *(int *)data = pgid;
1826 break; 1837 break;
1827 } 1838 }
1828 return (0); 1839 return (0);
1829} 1840}
1830 1841
1831/* 1842/*
1832 * Send signal to descriptor owner, either process or process group. 1843 * Send signal to descriptor owner, either process or process group.
1833 */ 1844 */
1834void 1845void
1835fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata) 1846fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
1836{ 1847{
1837 struct proc *p1; 1848 struct proc *p1;
1838 struct pgrp *pgrp; 1849 struct pgrp *pgrp;
1839 ksiginfo_t ksi; 1850 ksiginfo_t ksi;
1840 1851
1841 KASSERT(!cpu_intr_p()); 1852 KASSERT(!cpu_intr_p());
1842 1853
1843 KSI_INIT(&ksi); 1854 KSI_INIT(&ksi);
1844 ksi.ksi_signo = signo; 1855 ksi.ksi_signo = signo;
1845 ksi.ksi_code = code; 1856 ksi.ksi_code = code;
1846 ksi.ksi_band = band; 1857 ksi.ksi_band = band;
1847 1858
1848 mutex_enter(proc_lock); 1859 mutex_enter(proc_lock);
1849 if (pgid > 0 && (p1 = p_find(pgid, PFIND_LOCKED))) 1860 if (pgid > 0 && (p1 = p_find(pgid, PFIND_LOCKED)))
1850 kpsignal(p1, &ksi, fdescdata); 1861 kpsignal(p1, &ksi, fdescdata);
1851 else if (pgid < 0 && (pgrp = pg_find(-pgid, PFIND_LOCKED))) 1862 else if (pgid < 0 && (pgrp = pg_find(-pgid, PFIND_LOCKED)))
1852 kpgsignal(pgrp, &ksi, fdescdata, 0); 1863 kpgsignal(pgrp, &ksi, fdescdata, 0);
1853 mutex_exit(proc_lock); 1864 mutex_exit(proc_lock);
1854} 1865}
1855 1866
1856int 1867int
1857fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops, 1868fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
1858 void *data) 1869 void *data)
1859{ 1870{
1860 1871
1861 fp->f_flag = flag; 1872 fp->f_flag = flag;
1862 fp->f_type = DTYPE_MISC; 1873 fp->f_type = DTYPE_MISC;
1863 fp->f_ops = fops; 1874 fp->f_ops = fops;
1864 fp->f_data = data; 1875 fp->f_data = data;
1865 curlwp->l_dupfd = fd; 1876 curlwp->l_dupfd = fd;
1866 fd_affix(curproc, fp, fd); 1877 fd_affix(curproc, fp, fd);
1867 1878
1868 return EMOVEFD; 1879 return EMOVEFD;
1869} 1880}
1870 1881
1871int 1882int
1872fnullop_fcntl(file_t *fp, u_int cmd, void *data) 1883fnullop_fcntl(file_t *fp, u_int cmd, void *data)
1873{ 1884{
1874 1885
1875 if (cmd == F_SETFL) 1886 if (cmd == F_SETFL)
1876 return 0; 1887 return 0;
1877 1888
1878 return EOPNOTSUPP; 1889 return EOPNOTSUPP;
1879} 1890}
1880 1891
1881int 1892int
1882fnullop_poll(file_t *fp, int which) 1893fnullop_poll(file_t *fp, int which)
1883{ 1894{
1884 1895
1885 return 0; 1896 return 0;
1886} 1897}
1887 1898
1888int 1899int
1889fnullop_kqfilter(file_t *fp, struct knote *kn) 1900fnullop_kqfilter(file_t *fp, struct knote *kn)
1890{ 1901{
1891 1902
1892 return 0; 1903 return 0;
1893} 1904}
1894 1905
1895int 1906int
1896fbadop_read(file_t *fp, off_t *offset, struct uio *uio, 1907fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
1897 kauth_cred_t cred, int flags) 1908 kauth_cred_t cred, int flags)
1898{ 1909{
1899 1910
1900 return EOPNOTSUPP; 1911 return EOPNOTSUPP;
1901} 1912}
1902 1913
1903int 1914int
1904fbadop_write(file_t *fp, off_t *offset, struct uio *uio, 1915fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
1905 kauth_cred_t cred, int flags) 1916 kauth_cred_t cred, int flags)
1906{ 1917{
1907 1918
1908 return EOPNOTSUPP; 1919 return EOPNOTSUPP;
1909} 1920}
1910 1921
1911int 1922int
1912fbadop_ioctl(file_t *fp, u_long com, void *data) 1923fbadop_ioctl(file_t *fp, u_long com, void *data)
1913{ 1924{
1914 1925
1915 return EOPNOTSUPP; 1926 return EOPNOTSUPP;
1916} 1927}
1917 1928
1918int 1929int
1919fbadop_stat(file_t *fp, struct stat *sb) 1930fbadop_stat(file_t *fp, struct stat *sb)
1920{ 1931{
1921 1932
1922 return EOPNOTSUPP; 1933 return EOPNOTSUPP;
1923} 1934}
1924 1935
1925int 1936int
1926fbadop_close(file_t *fp) 1937fbadop_close(file_t *fp)
1927{ 1938{
1928 1939
1929 return EOPNOTSUPP; 1940 return EOPNOTSUPP;
1930} 1941}

cvs diff -r1.119.4.1 -r1.119.4.2 src/sys/kern/uipc_usrreq.c (switch to unified diff)

--- src/sys/kern/uipc_usrreq.c 2009/02/16 03:31:13 1.119.4.1
+++ src/sys/kern/uipc_usrreq.c 2009/03/18 05:33:23 1.119.4.2
@@ -1,1690 +1,1752 @@ @@ -1,1690 +1,1752 @@
1/* $NetBSD: uipc_usrreq.c,v 1.119.4.1 2009/02/16 03:31:13 snj Exp $ */ 1/* $NetBSD: uipc_usrreq.c,v 1.119.4.2 2009/03/18 05:33:23 snj Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 1998, 2000, 2004, 2008 The NetBSD Foundation, Inc. 4 * Copyright (c) 1998, 2000, 2004, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center. 9 * NASA Ames Research Center, and by Andrew Doran.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions 12 * modification, are permitted provided that the following conditions
13 * are met: 13 * are met:
14 * 1. Redistributions of source code must retain the above copyright 14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer. 15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright 16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the 17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution. 18 * documentation and/or other materials provided with the distribution.
19 * 19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE. 30 * POSSIBILITY OF SUCH DAMAGE.
31 */ 31 */
32 32
33/* 33/*
34 * Copyright (c) 1982, 1986, 1989, 1991, 1993 34 * Copyright (c) 1982, 1986, 1989, 1991, 1993
35 * The Regents of the University of California. All rights reserved. 35 * The Regents of the University of California. All rights reserved.
36 * 36 *
37 * Redistribution and use in source and binary forms, with or without 37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions 38 * modification, are permitted provided that the following conditions
39 * are met: 39 * are met:
40 * 1. Redistributions of source code must retain the above copyright 40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer. 41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright 42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the 43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution. 44 * documentation and/or other materials provided with the distribution.
45 * 3. Neither the name of the University nor the names of its contributors 45 * 3. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software 46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission. 47 * without specific prior written permission.
48 * 48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE. 59 * SUCH DAMAGE.
60 * 60 *
61 * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95 61 * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95
62 */ 62 */
63 63
64/* 64/*
65 * Copyright (c) 1997 Christopher G. Demetriou. All rights reserved. 65 * Copyright (c) 1997 Christopher G. Demetriou. All rights reserved.
66 * 66 *
67 * Redistribution and use in source and binary forms, with or without 67 * Redistribution and use in source and binary forms, with or without
68 * modification, are permitted provided that the following conditions 68 * modification, are permitted provided that the following conditions
69 * are met: 69 * are met:
70 * 1. Redistributions of source code must retain the above copyright 70 * 1. Redistributions of source code must retain the above copyright
71 * notice, this list of conditions and the following disclaimer. 71 * notice, this list of conditions and the following disclaimer.
72 * 2. Redistributions in binary form must reproduce the above copyright 72 * 2. Redistributions in binary form must reproduce the above copyright
73 * notice, this list of conditions and the following disclaimer in the 73 * notice, this list of conditions and the following disclaimer in the
74 * documentation and/or other materials provided with the distribution. 74 * documentation and/or other materials provided with the distribution.
75 * 3. All advertising materials mentioning features or use of this software 75 * 3. All advertising materials mentioning features or use of this software
76 * must display the following acknowledgement: 76 * must display the following acknowledgement:
77 * This product includes software developed by the University of 77 * This product includes software developed by the University of
78 * California, Berkeley and its contributors. 78 * California, Berkeley and its contributors.
79 * 4. Neither the name of the University nor the names of its contributors 79 * 4. Neither the name of the University nor the names of its contributors
80 * may be used to endorse or promote products derived from this software 80 * may be used to endorse or promote products derived from this software
81 * without specific prior written permission. 81 * without specific prior written permission.
82 * 82 *
83 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 83 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
84 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 84 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
85 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 85 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
86 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 86 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
87 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 87 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
88 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 88 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
89 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 89 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
90 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 90 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
91 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 91 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
92 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 92 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
93 * SUCH DAMAGE. 93 * SUCH DAMAGE.
94 * 94 *
95 * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95 95 * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95
96 */ 96 */
97 97
98#include <sys/cdefs.h> 98#include <sys/cdefs.h>
99__KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.119.4.1 2009/02/16 03:31:13 snj Exp $"); 99__KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.119.4.2 2009/03/18 05:33:23 snj Exp $");
100 100
101#include <sys/param.h> 101#include <sys/param.h>
102#include <sys/systm.h> 102#include <sys/systm.h>
103#include <sys/proc.h> 103#include <sys/proc.h>
104#include <sys/filedesc.h> 104#include <sys/filedesc.h>
105#include <sys/domain.h> 105#include <sys/domain.h>
106#include <sys/protosw.h> 106#include <sys/protosw.h>
107#include <sys/socket.h> 107#include <sys/socket.h>
108#include <sys/socketvar.h> 108#include <sys/socketvar.h>
109#include <sys/unpcb.h> 109#include <sys/unpcb.h>
110#include <sys/un.h> 110#include <sys/un.h>
111#include <sys/namei.h> 111#include <sys/namei.h>
112#include <sys/vnode.h> 112#include <sys/vnode.h>
113#include <sys/file.h> 113#include <sys/file.h>
114#include <sys/stat.h> 114#include <sys/stat.h>
115#include <sys/mbuf.h> 115#include <sys/mbuf.h>
116#include <sys/kauth.h> 116#include <sys/kauth.h>
117#include <sys/kmem.h> 117#include <sys/kmem.h>
118#include <sys/atomic.h> 118#include <sys/atomic.h>
119#include <sys/uidinfo.h> 119#include <sys/uidinfo.h>
 120#include <sys/kernel.h>
 121#include <sys/kthread.h>
120 122
121/* 123/*
122 * Unix communications domain. 124 * Unix communications domain.
123 * 125 *
124 * TODO: 126 * TODO:
125 * SEQPACKET, RDM 127 * SEQPACKET, RDM
126 * rethink name space problems 128 * rethink name space problems
127 * need a proper out-of-band 129 * need a proper out-of-band
128 * 130 *
129 * Notes on locking: 131 * Notes on locking:
130 * 132 *
131 * The generic rules noted in uipc_socket2.c apply. In addition: 133 * The generic rules noted in uipc_socket2.c apply. In addition:
132 * 134 *
133 * o We have a global lock, uipc_lock. 135 * o We have a global lock, uipc_lock.
134 * 136 *
135 * o All datagram sockets are locked by uipc_lock. 137 * o All datagram sockets are locked by uipc_lock.
136 * 138 *
137 * o For stream socketpairs, the two endpoints are created sharing the same 139 * o For stream socketpairs, the two endpoints are created sharing the same
138 * independent lock. Sockets presented to PRU_CONNECT2 must already have 140 * independent lock. Sockets presented to PRU_CONNECT2 must already have
139 * matching locks. 141 * matching locks.
140 * 142 *
141 * o Stream sockets created via socket() start life with their own 143 * o Stream sockets created via socket() start life with their own
142 * independent lock. 144 * independent lock.
143 *  145 *
144 * o Stream connections to a named endpoint are slightly more complicated. 146 * o Stream connections to a named endpoint are slightly more complicated.
145 * Sockets that have called listen() have their lock pointer mutated to 147 * Sockets that have called listen() have their lock pointer mutated to
146 * the global uipc_lock. When establishing a connection, the connecting 148 * the global uipc_lock. When establishing a connection, the connecting
147 * socket also has its lock mutated to uipc_lock, which matches the head 149 * socket also has its lock mutated to uipc_lock, which matches the head
148 * (listening socket). We create a new socket for accept() to return, and 150 * (listening socket). We create a new socket for accept() to return, and
149 * that also shares the head's lock. Until the connection is completely 151 * that also shares the head's lock. Until the connection is completely
150 * done on both ends, all three sockets are locked by uipc_lock. Once the 152 * done on both ends, all three sockets are locked by uipc_lock. Once the
151 * connection is complete, the association with the head's lock is broken. 153 * connection is complete, the association with the head's lock is broken.
152 * The connecting socket and the socket returned from accept() have their 154 * The connecting socket and the socket returned from accept() have their
153 * lock pointers mutated away from uipc_lock, and back to the connecting 155 * lock pointers mutated away from uipc_lock, and back to the connecting
154 * socket's original, independent lock. The head continues to be locked 156 * socket's original, independent lock. The head continues to be locked
155 * by uipc_lock. 157 * by uipc_lock.
156 * 158 *
157 * o If uipc_lock is determined to be a significant source of contention, 159 * o If uipc_lock is determined to be a significant source of contention,
158 * it could easily be hashed out. It is difficult to simply make it an 160 * it could easily be hashed out. It is difficult to simply make it an
159 * independent lock because of visibility / garbage collection issues: 161 * independent lock because of visibility / garbage collection issues:
160 * if a socket has been associated with a lock at any point, that lock 162 * if a socket has been associated with a lock at any point, that lock
161 * must remain valid until the socket is no longer visible in the system. 163 * must remain valid until the socket is no longer visible in the system.
162 * The lock must not be freed or otherwise destroyed until any sockets 164 * The lock must not be freed or otherwise destroyed until any sockets
163 * that had referenced it have also been destroyed. 165 * that had referenced it have also been destroyed.
164 */ 166 */
165const struct sockaddr_un sun_noname = { 167const struct sockaddr_un sun_noname = {
166 .sun_len = sizeof(sun_noname), 168 .sun_len = sizeof(sun_noname),
167 .sun_family = AF_LOCAL, 169 .sun_family = AF_LOCAL,
168}; 170};
169ino_t unp_ino; /* prototype for fake inode numbers */ 171ino_t unp_ino; /* prototype for fake inode numbers */
170 172
171struct mbuf *unp_addsockcred(struct lwp *, struct mbuf *); 173struct mbuf *unp_addsockcred(struct lwp *, struct mbuf *);
 174static void unp_mark(file_t *);
 175static void unp_scan(struct mbuf *, void (*)(file_t *), int);
 176static void unp_discard_now(file_t *);
 177static void unp_discard_later(file_t *);
 178static void unp_thread(void *);
 179static void unp_thread_kick(void);
172static kmutex_t *uipc_lock; 180static kmutex_t *uipc_lock;
173 181
 182static kcondvar_t unp_thread_cv;
 183static lwp_t *unp_thread_lwp;
 184static SLIST_HEAD(,file) unp_thread_discard;
 185static int unp_defer;
 186
174/* 187/*
175 * Initialize Unix protocols. 188 * Initialize Unix protocols.
176 */ 189 */
177void 190void
178uipc_init(void) 191uipc_init(void)
179{ 192{
 193 int error;
180 194
181 uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); 195 uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
 196 cv_init(&unp_thread_cv, "unpgc");
 197
 198 error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, unp_thread,
 199 NULL, &unp_thread_lwp, "unpgc");
 200 if (error != 0)
 201 panic("uipc_init %d", error);
182} 202}
183 203
184/* 204/*
185 * A connection succeeded: disassociate both endpoints from the head's 205 * A connection succeeded: disassociate both endpoints from the head's
186 * lock, and make them share their own lock. There is a race here: for 206 * lock, and make them share their own lock. There is a race here: for
187 * a very brief time one endpoint will be locked by a different lock 207 * a very brief time one endpoint will be locked by a different lock
188 * than the other end. However, since the current thread holds the old 208 * than the other end. However, since the current thread holds the old
189 * lock (the listening socket's lock, the head) access can still only be 209 * lock (the listening socket's lock, the head) access can still only be
190 * made to one side of the connection. 210 * made to one side of the connection.
191 */ 211 */
192static void 212static void
193unp_setpeerlocks(struct socket *so, struct socket *so2) 213unp_setpeerlocks(struct socket *so, struct socket *so2)
194{ 214{
195 struct unpcb *unp; 215 struct unpcb *unp;
196 kmutex_t *lock; 216 kmutex_t *lock;
197 217
198 KASSERT(solocked2(so, so2)); 218 KASSERT(solocked2(so, so2));
199 219
200 /* 220 /*
201 * Bail out if either end of the socket is not yet fully 221 * Bail out if either end of the socket is not yet fully
202 * connected or accepted. We only break the lock association 222 * connected or accepted. We only break the lock association
203 * with the head when the pair of sockets stand completely 223 * with the head when the pair of sockets stand completely
204 * on their own. 224 * on their own.
205 */ 225 */
206 if (so->so_head != NULL || so2->so_head != NULL) 226 if (so->so_head != NULL || so2->so_head != NULL)
207 return; 227 return;
208 228
209 /* 229 /*
210 * Drop references to old lock. A third reference (from the 230 * Drop references to old lock. A third reference (from the
211 * queue head) must be held as we still hold its lock. Bonus: 231 * queue head) must be held as we still hold its lock. Bonus:
212 * we don't need to worry about garbage collecting the lock. 232 * we don't need to worry about garbage collecting the lock.
213 */ 233 */
214 lock = so->so_lock; 234 lock = so->so_lock;
215 KASSERT(lock == uipc_lock); 235 KASSERT(lock == uipc_lock);
216 mutex_obj_free(lock); 236 mutex_obj_free(lock);
217 mutex_obj_free(lock); 237 mutex_obj_free(lock);
218 238
219 /* 239 /*
220 * Grab stream lock from the initiator and share between the two 240 * Grab stream lock from the initiator and share between the two
221 * endpoints. Issue memory barrier to ensure all modifications 241 * endpoints. Issue memory barrier to ensure all modifications
222 * become globally visible before the lock change. so2 is 242 * become globally visible before the lock change. so2 is
223 * assumed not to have a stream lock, because it was created 243 * assumed not to have a stream lock, because it was created
224 * purely for the server side to accept this connection and 244 * purely for the server side to accept this connection and
225 * started out life using the domain-wide lock. 245 * started out life using the domain-wide lock.
226 */ 246 */
227 unp = sotounpcb(so); 247 unp = sotounpcb(so);
228 KASSERT(unp->unp_streamlock != NULL); 248 KASSERT(unp->unp_streamlock != NULL);
229 KASSERT(sotounpcb(so2)->unp_streamlock == NULL); 249 KASSERT(sotounpcb(so2)->unp_streamlock == NULL);
230 lock = unp->unp_streamlock; 250 lock = unp->unp_streamlock;
231 unp->unp_streamlock = NULL; 251 unp->unp_streamlock = NULL;
232 mutex_obj_hold(lock); 252 mutex_obj_hold(lock);
233 membar_exit(); 253 membar_exit();
234 solockreset(so, lock); 254 solockreset(so, lock);
235 solockreset(so2, lock); 255 solockreset(so2, lock);
236} 256}
237 257
238/* 258/*
239 * Reset a socket's lock back to the domain-wide lock. 259 * Reset a socket's lock back to the domain-wide lock.
240 */ 260 */
241static void 261static void
242unp_resetlock(struct socket *so) 262unp_resetlock(struct socket *so)
243{ 263{
244 kmutex_t *olock, *nlock; 264 kmutex_t *olock, *nlock;
245 struct unpcb *unp; 265 struct unpcb *unp;
246 266
247 KASSERT(solocked(so)); 267 KASSERT(solocked(so));
248 268
249 olock = so->so_lock; 269 olock = so->so_lock;
250 nlock = uipc_lock; 270 nlock = uipc_lock;
251 if (olock == nlock) 271 if (olock == nlock)
252 return; 272 return;
253 unp = sotounpcb(so); 273 unp = sotounpcb(so);
254 KASSERT(unp->unp_streamlock == NULL); 274 KASSERT(unp->unp_streamlock == NULL);
255 unp->unp_streamlock = olock; 275 unp->unp_streamlock = olock;
256 mutex_obj_hold(nlock); 276 mutex_obj_hold(nlock);
257 mutex_enter(nlock); 277 mutex_enter(nlock);
258 solockreset(so, nlock); 278 solockreset(so, nlock);
259 mutex_exit(olock); 279 mutex_exit(olock);
260} 280}
261 281
262static void 282static void
263unp_free(struct unpcb *unp) 283unp_free(struct unpcb *unp)
264{ 284{
265 285
266 if (unp->unp_addr) 286 if (unp->unp_addr)
267 free(unp->unp_addr, M_SONAME); 287 free(unp->unp_addr, M_SONAME);
268 if (unp->unp_streamlock != NULL) 288 if (unp->unp_streamlock != NULL)
269 mutex_obj_free(unp->unp_streamlock); 289 mutex_obj_free(unp->unp_streamlock);
270 free(unp, M_PCB); 290 free(unp, M_PCB);
271} 291}
272 292
273int 293int
274unp_output(struct mbuf *m, struct mbuf *control, struct unpcb *unp, 294unp_output(struct mbuf *m, struct mbuf *control, struct unpcb *unp,
275 struct lwp *l) 295 struct lwp *l)
276{ 296{
277 struct socket *so2; 297 struct socket *so2;
278 const struct sockaddr_un *sun; 298 const struct sockaddr_un *sun;
279 299
280 so2 = unp->unp_conn->unp_socket; 300 so2 = unp->unp_conn->unp_socket;
281 301
282 KASSERT(solocked(so2)); 302 KASSERT(solocked(so2));
283 303
284 if (unp->unp_addr) 304 if (unp->unp_addr)
285 sun = unp->unp_addr; 305 sun = unp->unp_addr;
286 else 306 else
287 sun = &sun_noname; 307 sun = &sun_noname;
288 if (unp->unp_conn->unp_flags & UNP_WANTCRED) 308 if (unp->unp_conn->unp_flags & UNP_WANTCRED)
289 control = unp_addsockcred(l, control); 309 control = unp_addsockcred(l, control);
290 if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m, 310 if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m,
291 control) == 0) { 311 control) == 0) {
292 so2->so_rcv.sb_overflowed++; 312 so2->so_rcv.sb_overflowed++;
293 sounlock(so2); 
294 unp_dispose(control); 313 unp_dispose(control);
295 m_freem(control); 314 m_freem(control);
296 m_freem(m); 315 m_freem(m);
297 solock(so2); 
298 return (ENOBUFS); 316 return (ENOBUFS);
299 } else { 317 } else {
300 sorwakeup(so2); 318 sorwakeup(so2);
301 return (0); 319 return (0);
302 } 320 }
303} 321}
304 322
305void 323void
306unp_setaddr(struct socket *so, struct mbuf *nam, bool peeraddr) 324unp_setaddr(struct socket *so, struct mbuf *nam, bool peeraddr)
307{ 325{
308 const struct sockaddr_un *sun; 326 const struct sockaddr_un *sun;
309 struct unpcb *unp; 327 struct unpcb *unp;
310 bool ext; 328 bool ext;
311 329
312 unp = sotounpcb(so); 330 unp = sotounpcb(so);
313 ext = false; 331 ext = false;
314 332
315 for (;;) { 333 for (;;) {
316 sun = NULL; 334 sun = NULL;
317 if (peeraddr) { 335 if (peeraddr) {
318 if (unp->unp_conn && unp->unp_conn->unp_addr) 336 if (unp->unp_conn && unp->unp_conn->unp_addr)
319 sun = unp->unp_conn->unp_addr; 337 sun = unp->unp_conn->unp_addr;
320 } else { 338 } else {
321 if (unp->unp_addr) 339 if (unp->unp_addr)
322 sun = unp->unp_addr; 340 sun = unp->unp_addr;
323 } 341 }
324 if (sun == NULL) 342 if (sun == NULL)
325 sun = &sun_noname; 343 sun = &sun_noname;
326 nam->m_len = sun->sun_len; 344 nam->m_len = sun->sun_len;
327 if (nam->m_len > MLEN && !ext) { 345 if (nam->m_len > MLEN && !ext) {
328 sounlock(so); 346 sounlock(so);
329 MEXTMALLOC(nam, MAXPATHLEN * 2, M_WAITOK); 347 MEXTMALLOC(nam, MAXPATHLEN * 2, M_WAITOK);
330 solock(so); 348 solock(so);
331 ext = true; 349 ext = true;
332 } else { 350 } else {
333 KASSERT(nam->m_len <= MAXPATHLEN * 2); 351 KASSERT(nam->m_len <= MAXPATHLEN * 2);
334 memcpy(mtod(nam, void *), sun, (size_t)nam->m_len); 352 memcpy(mtod(nam, void *), sun, (size_t)nam->m_len);
335 break; 353 break;
336 } 354 }
337 } 355 }
338} 356}
339 357
340/*ARGSUSED*/ 358/*ARGSUSED*/
341int 359int
342uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam, 360uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
343 struct mbuf *control, struct lwp *l) 361 struct mbuf *control, struct lwp *l)
344{ 362{
345 struct unpcb *unp = sotounpcb(so); 363 struct unpcb *unp = sotounpcb(so);
346 struct socket *so2; 364 struct socket *so2;
347 struct proc *p; 365 struct proc *p;
348 u_int newhiwat; 366 u_int newhiwat;
349 int error = 0; 367 int error = 0;
350 368
351 if (req == PRU_CONTROL) 369 if (req == PRU_CONTROL)
352 return (EOPNOTSUPP); 370 return (EOPNOTSUPP);
353 371
354#ifdef DIAGNOSTIC 372#ifdef DIAGNOSTIC
355 if (req != PRU_SEND && req != PRU_SENDOOB && control) 373 if (req != PRU_SEND && req != PRU_SENDOOB && control)
356 panic("uipc_usrreq: unexpected control mbuf"); 374 panic("uipc_usrreq: unexpected control mbuf");
357#endif 375#endif
358 p = l ? l->l_proc : NULL; 376 p = l ? l->l_proc : NULL;
359 if (req != PRU_ATTACH) { 377 if (req != PRU_ATTACH) {
360 if (unp == 0) { 378 if (unp == 0) {
361 error = EINVAL; 379 error = EINVAL;
362 goto release; 380 goto release;
363 } 381 }
364 KASSERT(solocked(so)); 382 KASSERT(solocked(so));
365 } 383 }
366 384
367 switch (req) { 385 switch (req) {
368 386
369 case PRU_ATTACH: 387 case PRU_ATTACH:
370 if (unp != 0) { 388 if (unp != 0) {
371 error = EISCONN; 389 error = EISCONN;
372 break; 390 break;
373 } 391 }
374 error = unp_attach(so); 392 error = unp_attach(so);
375 break; 393 break;
376 394
377 case PRU_DETACH: 395 case PRU_DETACH:
378 unp_detach(unp); 396 unp_detach(unp);
379 break; 397 break;
380 398
381 case PRU_BIND: 399 case PRU_BIND:
382 KASSERT(l != NULL); 400 KASSERT(l != NULL);
383 error = unp_bind(so, nam, l); 401 error = unp_bind(so, nam, l);
384 break; 402 break;
385 403
386 case PRU_LISTEN: 404 case PRU_LISTEN:
387 /* 405 /*
388 * If the socket can accept a connection, it must be 406 * If the socket can accept a connection, it must be
389 * locked by uipc_lock. 407 * locked by uipc_lock.
390 */ 408 */
391 unp_resetlock(so); 409 unp_resetlock(so);
392 if (unp->unp_vnode == 0) 410 if (unp->unp_vnode == 0)
393 error = EINVAL; 411 error = EINVAL;
394 break; 412 break;
395 413
396 case PRU_CONNECT: 414 case PRU_CONNECT:
397 KASSERT(l != NULL); 415 KASSERT(l != NULL);
398 error = unp_connect(so, nam, l); 416 error = unp_connect(so, nam, l);
399 break; 417 break;
400 418
401 case PRU_CONNECT2: 419 case PRU_CONNECT2:
402 error = unp_connect2(so, (struct socket *)nam, PRU_CONNECT2); 420 error = unp_connect2(so, (struct socket *)nam, PRU_CONNECT2);
403 break; 421 break;
404 422
405 case PRU_DISCONNECT: 423 case PRU_DISCONNECT:
406 unp_disconnect(unp); 424 unp_disconnect(unp);
407 break; 425 break;
408 426
409 case PRU_ACCEPT: 427 case PRU_ACCEPT:
410 KASSERT(so->so_lock == uipc_lock); 428 KASSERT(so->so_lock == uipc_lock);
411 /* 429 /*
412 * Mark the initiating STREAM socket as connected *ONLY* 430 * Mark the initiating STREAM socket as connected *ONLY*
413 * after it's been accepted. This prevents a client from 431 * after it's been accepted. This prevents a client from
414 * overrunning a server and receiving ECONNREFUSED. 432 * overrunning a server and receiving ECONNREFUSED.
415 */ 433 */
416 if (unp->unp_conn == NULL) 434 if (unp->unp_conn == NULL)
417 break; 435 break;
418 so2 = unp->unp_conn->unp_socket; 436 so2 = unp->unp_conn->unp_socket;
419 if (so2->so_state & SS_ISCONNECTING) { 437 if (so2->so_state & SS_ISCONNECTING) {
420 KASSERT(solocked2(so, so->so_head)); 438 KASSERT(solocked2(so, so->so_head));
421 KASSERT(solocked2(so2, so->so_head)); 439 KASSERT(solocked2(so2, so->so_head));
422 soisconnected(so2); 440 soisconnected(so2);
423 } 441 }
424 /* 442 /*
425 * If the connection is fully established, break the 443 * If the connection is fully established, break the
426 * association with uipc_lock and give the connected 444 * association with uipc_lock and give the connected
427 * pair a seperate lock to share. 445 * pair a seperate lock to share.
428 */ 446 */
429 unp_setpeerlocks(so2, so); 447 unp_setpeerlocks(so2, so);
430 /* 448 /*
431 * Only now return peer's address, as we may need to 449 * Only now return peer's address, as we may need to
432 * block in order to allocate memory. 450 * block in order to allocate memory.
433 * 451 *
434 * XXX Minor race: connection can be broken while 452 * XXX Minor race: connection can be broken while
435 * lock is dropped in unp_setaddr(). We will return 453 * lock is dropped in unp_setaddr(). We will return
436 * error == 0 and sun_noname as the peer address. 454 * error == 0 and sun_noname as the peer address.
437 */ 455 */
438 unp_setaddr(so, nam, true); 456 unp_setaddr(so, nam, true);
439 break; 457 break;
440 458
441 case PRU_SHUTDOWN: 459 case PRU_SHUTDOWN:
442 socantsendmore(so); 460 socantsendmore(so);
443 unp_shutdown(unp); 461 unp_shutdown(unp);
444 break; 462 break;
445 463
446 case PRU_RCVD: 464 case PRU_RCVD:
447 switch (so->so_type) { 465 switch (so->so_type) {
448 466
449 case SOCK_DGRAM: 467 case SOCK_DGRAM:
450 panic("uipc 1"); 468 panic("uipc 1");
451 /*NOTREACHED*/ 469 /*NOTREACHED*/
452 470
453 case SOCK_STREAM: 471 case SOCK_STREAM:
454#define rcv (&so->so_rcv) 472#define rcv (&so->so_rcv)
455#define snd (&so2->so_snd) 473#define snd (&so2->so_snd)
456 if (unp->unp_conn == 0) 474 if (unp->unp_conn == 0)
457 break; 475 break;
458 so2 = unp->unp_conn->unp_socket; 476 so2 = unp->unp_conn->unp_socket;
459 KASSERT(solocked2(so, so2)); 477 KASSERT(solocked2(so, so2));
460 /* 478 /*
461 * Adjust backpressure on sender 479 * Adjust backpressure on sender
462 * and wakeup any waiting to write. 480 * and wakeup any waiting to write.
463 */ 481 */
464 snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt; 482 snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
465 unp->unp_mbcnt = rcv->sb_mbcnt; 483 unp->unp_mbcnt = rcv->sb_mbcnt;
466 newhiwat = snd->sb_hiwat + unp->unp_cc - rcv->sb_cc; 484 newhiwat = snd->sb_hiwat + unp->unp_cc - rcv->sb_cc;
467 (void)chgsbsize(so2->so_uidinfo, 485 (void)chgsbsize(so2->so_uidinfo,
468 &snd->sb_hiwat, newhiwat, RLIM_INFINITY); 486 &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
469 unp->unp_cc = rcv->sb_cc; 487 unp->unp_cc = rcv->sb_cc;
470 sowwakeup(so2); 488 sowwakeup(so2);
471#undef snd 489#undef snd
472#undef rcv 490#undef rcv
473 break; 491 break;
474 492
475 default: 493 default:
476 panic("uipc 2"); 494 panic("uipc 2");
477 } 495 }
478 break; 496 break;
479 497
480 case PRU_SEND: 498 case PRU_SEND:
481 /* 499 /*
482 * Note: unp_internalize() rejects any control message 500 * Note: unp_internalize() rejects any control message
483 * other than SCM_RIGHTS, and only allows one. This 501 * other than SCM_RIGHTS, and only allows one. This
484 * has the side-effect of preventing a caller from 502 * has the side-effect of preventing a caller from
485 * forging SCM_CREDS. 503 * forging SCM_CREDS.
486 */ 504 */
487 if (control) { 505 if (control) {
488 sounlock(so); 506 sounlock(so);
489 error = unp_internalize(&control); 507 error = unp_internalize(&control);
490 solock(so); 508 solock(so);
491 if (error != 0) { 509 if (error != 0) {
492 m_freem(control); 510 m_freem(control);
493 m_freem(m); 511 m_freem(m);
494 break; 512 break;
495 } 513 }
496 } 514 }
497 switch (so->so_type) { 515 switch (so->so_type) {
498 516
499 case SOCK_DGRAM: { 517 case SOCK_DGRAM: {
500 KASSERT(so->so_lock == uipc_lock); 518 KASSERT(so->so_lock == uipc_lock);
501 if (nam) { 519 if (nam) {
502 if ((so->so_state & SS_ISCONNECTED) != 0) 520 if ((so->so_state & SS_ISCONNECTED) != 0)
503 error = EISCONN; 521 error = EISCONN;
504 else { 522 else {
505 /* 523 /*
506 * Note: once connected, the 524 * Note: once connected, the
507 * socket's lock must not be 525 * socket's lock must not be
508 * dropped until we have sent 526 * dropped until we have sent
509 * the message and disconnected. 527 * the message and disconnected.
510 * This is necessary to prevent 528 * This is necessary to prevent
511 * intervening control ops, like 529 * intervening control ops, like
512 * another connection. 530 * another connection.
513 */ 531 */
514 error = unp_connect(so, nam, l); 532 error = unp_connect(so, nam, l);
515 } 533 }
516 } else { 534 } else {
517 if ((so->so_state & SS_ISCONNECTED) == 0) 535 if ((so->so_state & SS_ISCONNECTED) == 0)
518 error = ENOTCONN; 536 error = ENOTCONN;
519 } 537 }
520 if (error) { 538 if (error) {
521 sounlock(so); 
522 unp_dispose(control); 539 unp_dispose(control);
523 m_freem(control); 540 m_freem(control);
524 m_freem(m); 541 m_freem(m);
525 solock(so); 
526 break; 542 break;
527 } 543 }
528 KASSERT(p != NULL); 544 KASSERT(p != NULL);
529 error = unp_output(m, control, unp, l); 545 error = unp_output(m, control, unp, l);
530 if (nam) 546 if (nam)
531 unp_disconnect(unp); 547 unp_disconnect(unp);
532 break; 548 break;
533 } 549 }
534 550
535 case SOCK_STREAM: 551 case SOCK_STREAM:
536#define rcv (&so2->so_rcv) 552#define rcv (&so2->so_rcv)
537#define snd (&so->so_snd) 553#define snd (&so->so_snd)
538 if (unp->unp_conn == NULL) { 554 if (unp->unp_conn == NULL) {
539 error = ENOTCONN; 555 error = ENOTCONN;
540 break; 556 break;
541 } 557 }
542 so2 = unp->unp_conn->unp_socket; 558 so2 = unp->unp_conn->unp_socket;
543 KASSERT(solocked2(so, so2)); 559 KASSERT(solocked2(so, so2));
544 if (unp->unp_conn->unp_flags & UNP_WANTCRED) { 560 if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
545 /* 561 /*
546 * Credentials are passed only once on 562 * Credentials are passed only once on
547 * SOCK_STREAM. 563 * SOCK_STREAM.
548 */ 564 */
549 unp->unp_conn->unp_flags &= ~UNP_WANTCRED; 565 unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
550 control = unp_addsockcred(l, control); 566 control = unp_addsockcred(l, control);
551 } 567 }
552 /* 568 /*
553 * Send to paired receive port, and then reduce 569 * Send to paired receive port, and then reduce
554 * send buffer hiwater marks to maintain backpressure. 570 * send buffer hiwater marks to maintain backpressure.
555 * Wake up readers. 571 * Wake up readers.
556 */ 572 */
557 if (control) { 573 if (control) {
558 if (sbappendcontrol(rcv, m, control) != 0) 574 if (sbappendcontrol(rcv, m, control) != 0)
559 control = NULL; 575 control = NULL;
560 } else 576 } else
561 sbappend(rcv, m); 577 sbappend(rcv, m);
562 snd->sb_mbmax -= 578 snd->sb_mbmax -=
563 rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt; 579 rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
564 unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt; 580 unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
565 newhiwat = snd->sb_hiwat - 581 newhiwat = snd->sb_hiwat -
566 (rcv->sb_cc - unp->unp_conn->unp_cc); 582 (rcv->sb_cc - unp->unp_conn->unp_cc);
567 (void)chgsbsize(so->so_uidinfo, 583 (void)chgsbsize(so->so_uidinfo,
568 &snd->sb_hiwat, newhiwat, RLIM_INFINITY); 584 &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
569 unp->unp_conn->unp_cc = rcv->sb_cc; 585 unp->unp_conn->unp_cc = rcv->sb_cc;
570 sorwakeup(so2); 586 sorwakeup(so2);
571#undef snd 587#undef snd
572#undef rcv 588#undef rcv
573 if (control != NULL) { 589 if (control != NULL) {
574 sounlock(so); 
575 unp_dispose(control); 590 unp_dispose(control);
576 m_freem(control); 591 m_freem(control);
577 solock(so); 
578 } 592 }
579 break; 593 break;
580 594
581 default: 595 default:
582 panic("uipc 4"); 596 panic("uipc 4");
583 } 597 }
584 break; 598 break;
585 599
586 case PRU_ABORT: 600 case PRU_ABORT:
587 (void)unp_drop(unp, ECONNABORTED); 601 (void)unp_drop(unp, ECONNABORTED);
588 602
589 KASSERT(so->so_head == NULL); 603 KASSERT(so->so_head == NULL);
590#ifdef DIAGNOSTIC 604#ifdef DIAGNOSTIC
591 if (so->so_pcb == 0) 605 if (so->so_pcb == 0)
592 panic("uipc 5: drop killed pcb"); 606 panic("uipc 5: drop killed pcb");
593#endif 607#endif
594 unp_detach(unp); 608 unp_detach(unp);
595 break; 609 break;
596 610
597 case PRU_SENSE: 611 case PRU_SENSE:
598 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; 612 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
599 if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) { 613 if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) {
600 so2 = unp->unp_conn->unp_socket; 614 so2 = unp->unp_conn->unp_socket;
601 KASSERT(solocked2(so, so2)); 615 KASSERT(solocked2(so, so2));
602 ((struct stat *) m)->st_blksize += so2->so_rcv.sb_cc; 616 ((struct stat *) m)->st_blksize += so2->so_rcv.sb_cc;
603 } 617 }
604 ((struct stat *) m)->st_dev = NODEV; 618 ((struct stat *) m)->st_dev = NODEV;
605 if (unp->unp_ino == 0) 619 if (unp->unp_ino == 0)
606 unp->unp_ino = unp_ino++; 620 unp->unp_ino = unp_ino++;
607 ((struct stat *) m)->st_atimespec = 621 ((struct stat *) m)->st_atimespec =
608 ((struct stat *) m)->st_mtimespec = 622 ((struct stat *) m)->st_mtimespec =
609 ((struct stat *) m)->st_ctimespec = unp->unp_ctime; 623 ((struct stat *) m)->st_ctimespec = unp->unp_ctime;
610 ((struct stat *) m)->st_ino = unp->unp_ino; 624 ((struct stat *) m)->st_ino = unp->unp_ino;
611 return (0); 625 return (0);
612 626
613 case PRU_RCVOOB: 627 case PRU_RCVOOB:
614 error = EOPNOTSUPP; 628 error = EOPNOTSUPP;
615 break; 629 break;
616 630
617 case PRU_SENDOOB: 631 case PRU_SENDOOB:
618 m_freem(control); 632 m_freem(control);
619 m_freem(m); 633 m_freem(m);
620 error = EOPNOTSUPP; 634 error = EOPNOTSUPP;
621 break; 635 break;
622 636
623 case PRU_SOCKADDR: 637 case PRU_SOCKADDR:
624 unp_setaddr(so, nam, false); 638 unp_setaddr(so, nam, false);
625 break; 639 break;
626 640
627 case PRU_PEERADDR: 641 case PRU_PEERADDR:
628 unp_setaddr(so, nam, true); 642 unp_setaddr(so, nam, true);
629 break; 643 break;
630 644
631 default: 645 default:
632 panic("piusrreq"); 646 panic("piusrreq");
633 } 647 }
634 648
635release: 649release:
636 return (error); 650 return (error);
637} 651}
638 652
639/* 653/*
640 * Unix domain socket option processing. 654 * Unix domain socket option processing.
641 */ 655 */
642int 656int
643uipc_ctloutput(int op, struct socket *so, struct sockopt *sopt) 657uipc_ctloutput(int op, struct socket *so, struct sockopt *sopt)
644{ 658{
645 struct unpcb *unp = sotounpcb(so); 659 struct unpcb *unp = sotounpcb(so);
646 int optval = 0, error = 0; 660 int optval = 0, error = 0;
647 661
648 KASSERT(solocked(so)); 662 KASSERT(solocked(so));
649 663
650 if (sopt->sopt_level != 0) { 664 if (sopt->sopt_level != 0) {
651 error = ENOPROTOOPT; 665 error = ENOPROTOOPT;
652 } else switch (op) { 666 } else switch (op) {
653 667
654 case PRCO_SETOPT: 668 case PRCO_SETOPT:
655 switch (sopt->sopt_name) { 669 switch (sopt->sopt_name) {
656 case LOCAL_CREDS: 670 case LOCAL_CREDS:
657 case LOCAL_CONNWAIT: 671 case LOCAL_CONNWAIT:
658 error = sockopt_getint(sopt, &optval); 672 error = sockopt_getint(sopt, &optval);
659 if (error) 673 if (error)
660 break; 674 break;
661 switch (sopt->sopt_name) { 675 switch (sopt->sopt_name) {
662#define OPTSET(bit) \ 676#define OPTSET(bit) \
663 if (optval) \ 677 if (optval) \
664 unp->unp_flags |= (bit); \ 678 unp->unp_flags |= (bit); \
665 else \ 679 else \
666 unp->unp_flags &= ~(bit); 680 unp->unp_flags &= ~(bit);
667 681
668 case LOCAL_CREDS: 682 case LOCAL_CREDS:
669 OPTSET(UNP_WANTCRED); 683 OPTSET(UNP_WANTCRED);
670 break; 684 break;
671 case LOCAL_CONNWAIT: 685 case LOCAL_CONNWAIT:
672 OPTSET(UNP_CONNWAIT); 686 OPTSET(UNP_CONNWAIT);
673 break; 687 break;
674 } 688 }
675 break; 689 break;
676#undef OPTSET 690#undef OPTSET
677 691
678 default: 692 default:
679 error = ENOPROTOOPT; 693 error = ENOPROTOOPT;
680 break; 694 break;
681 } 695 }
682 break; 696 break;
683 697
684 case PRCO_GETOPT: 698 case PRCO_GETOPT:
685 sounlock(so); 699 sounlock(so);
686 switch (sopt->sopt_name) { 700 switch (sopt->sopt_name) {
687 case LOCAL_PEEREID: 701 case LOCAL_PEEREID:
688 if (unp->unp_flags & UNP_EIDSVALID) { 702 if (unp->unp_flags & UNP_EIDSVALID) {
689 error = sockopt_set(sopt, 703 error = sockopt_set(sopt,
690 &unp->unp_connid, sizeof(unp->unp_connid)); 704 &unp->unp_connid, sizeof(unp->unp_connid));
691 } else { 705 } else {
692 error = EINVAL; 706 error = EINVAL;
693 } 707 }
694 break; 708 break;
695 case LOCAL_CREDS: 709 case LOCAL_CREDS:
696#define OPTBIT(bit) (unp->unp_flags & (bit) ? 1 : 0) 710#define OPTBIT(bit) (unp->unp_flags & (bit) ? 1 : 0)
697 711
698 optval = OPTBIT(UNP_WANTCRED); 712 optval = OPTBIT(UNP_WANTCRED);
699 error = sockopt_setint(sopt, optval); 713 error = sockopt_setint(sopt, optval);
700 break; 714 break;
701#undef OPTBIT 715#undef OPTBIT
702 716
703 default: 717 default:
704 error = ENOPROTOOPT; 718 error = ENOPROTOOPT;
705 break; 719 break;
706 } 720 }
707 solock(so); 721 solock(so);
708 break; 722 break;
709 } 723 }
710 return (error); 724 return (error);
711} 725}
712 726
713/* 727/*
714 * Both send and receive buffers are allocated PIPSIZ bytes of buffering 728 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
715 * for stream sockets, although the total for sender and receiver is 729 * for stream sockets, although the total for sender and receiver is
716 * actually only PIPSIZ. 730 * actually only PIPSIZ.
717 * Datagram sockets really use the sendspace as the maximum datagram size, 731 * Datagram sockets really use the sendspace as the maximum datagram size,
718 * and don't really want to reserve the sendspace. Their recvspace should 732 * and don't really want to reserve the sendspace. Their recvspace should
719 * be large enough for at least one max-size datagram plus address. 733 * be large enough for at least one max-size datagram plus address.
720 */ 734 */
721#define PIPSIZ 4096 735#define PIPSIZ 4096
722u_long unpst_sendspace = PIPSIZ; 736u_long unpst_sendspace = PIPSIZ;
723u_long unpst_recvspace = PIPSIZ; 737u_long unpst_recvspace = PIPSIZ;
724u_long unpdg_sendspace = 2*1024; /* really max datagram size */ 738u_long unpdg_sendspace = 2*1024; /* really max datagram size */
725u_long unpdg_recvspace = 4*1024; 739u_long unpdg_recvspace = 4*1024;
726 740
727u_int unp_rights; /* file descriptors in flight */ 741u_int unp_rights; /* files in flight */
 742u_int unp_rights_ratio = 2; /* limit, fraction of maxfiles */
728 743
729int 744int
730unp_attach(struct socket *so) 745unp_attach(struct socket *so)
731{ 746{
732 struct unpcb *unp; 747 struct unpcb *unp;
733 int error; 748 int error;
734 749
735 switch (so->so_type) { 750 switch (so->so_type) {
736 case SOCK_STREAM: 751 case SOCK_STREAM:
737 if (so->so_lock == NULL) { 752 if (so->so_lock == NULL) {
738 /*  753 /*
739 * XXX Assuming that no socket locks are held, 754 * XXX Assuming that no socket locks are held,
740 * as this call may sleep. 755 * as this call may sleep.
741 */ 756 */
742 so->so_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); 757 so->so_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
743 solock(so); 758 solock(so);
744 } 759 }
745 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 760 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
746 error = soreserve(so, unpst_sendspace, unpst_recvspace); 761 error = soreserve(so, unpst_sendspace, unpst_recvspace);
747 if (error != 0) 762 if (error != 0)
748 return (error); 763 return (error);
749 } 764 }
750 break; 765 break;
751 766
752 case SOCK_DGRAM: 767 case SOCK_DGRAM:
753 if (so->so_lock == NULL) { 768 if (so->so_lock == NULL) {
754 mutex_obj_hold(uipc_lock); 769 mutex_obj_hold(uipc_lock);
755 so->so_lock = uipc_lock; 770 so->so_lock = uipc_lock;
756 solock(so); 771 solock(so);
757 } 772 }
758 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 773 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
759 error = soreserve(so, unpdg_sendspace, unpdg_recvspace); 774 error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
760 if (error != 0) 775 if (error != 0)
761 return (error); 776 return (error);
762 } 777 }
763 break; 778 break;
764 779
765 default: 780 default:
766 panic("unp_attach"); 781 panic("unp_attach");
767 } 782 }
768 KASSERT(solocked(so)); 783 KASSERT(solocked(so));
769 unp = malloc(sizeof(*unp), M_PCB, M_NOWAIT); 784 unp = malloc(sizeof(*unp), M_PCB, M_NOWAIT);
770 if (unp == NULL) 785 if (unp == NULL)
771 return (ENOBUFS); 786 return (ENOBUFS);
772 memset((void *)unp, 0, sizeof(*unp)); 787 memset((void *)unp, 0, sizeof(*unp));
773 unp->unp_socket = so; 788 unp->unp_socket = so;
774 so->so_pcb = unp; 789 so->so_pcb = unp;
775 nanotime(&unp->unp_ctime); 790 nanotime(&unp->unp_ctime);
776 return (0); 791 return (0);
777} 792}
778 793
779void 794void
780unp_detach(struct unpcb *unp) 795unp_detach(struct unpcb *unp)
781{ 796{
782 struct socket *so; 797 struct socket *so;
783 vnode_t *vp; 798 vnode_t *vp;
784 799
785 so = unp->unp_socket; 800 so = unp->unp_socket;
786 801
787 retry: 802 retry:
788 if ((vp = unp->unp_vnode) != NULL) { 803 if ((vp = unp->unp_vnode) != NULL) {
789 sounlock(so); 804 sounlock(so);
790 /* Acquire v_interlock to protect against unp_connect(). */ 805 /* Acquire v_interlock to protect against unp_connect(). */
791 /* XXXAD racy */ 806 /* XXXAD racy */
792 mutex_enter(&vp->v_interlock); 807 mutex_enter(&vp->v_interlock);
793 vp->v_socket = NULL; 808 vp->v_socket = NULL;
794 vrelel(vp, 0); 809 vrelel(vp, 0);
795 solock(so); 810 solock(so);
796 unp->unp_vnode = NULL; 811 unp->unp_vnode = NULL;
797 } 812 }
798 if (unp->unp_conn) 813 if (unp->unp_conn)
799 unp_disconnect(unp); 814 unp_disconnect(unp);
800 while (unp->unp_refs) { 815 while (unp->unp_refs) {
801 KASSERT(solocked2(so, unp->unp_refs->unp_socket)); 816 KASSERT(solocked2(so, unp->unp_refs->unp_socket));
802 if (unp_drop(unp->unp_refs, ECONNRESET)) { 817 if (unp_drop(unp->unp_refs, ECONNRESET)) {
803 solock(so); 818 solock(so);
804 goto retry; 819 goto retry;
805 } 820 }
806 } 821 }
807 soisdisconnected(so); 822 soisdisconnected(so);
808 so->so_pcb = NULL; 823 so->so_pcb = NULL;
809 if (unp_rights) { 824 if (unp_rights) {
810 /* 825 /*
811 * Normally the receive buffer is flushed later, 826 * Normally the receive buffer is flushed later, in sofree,
812 * in sofree, but if our receive buffer holds references 827 * but if our receive buffer holds references to files that
813 * to descriptors that are now garbage, we will dispose 828 * are now garbage, we will enqueue those file references to
814 * of those descriptor references after the garbage collector 829 * the garbage collector and kick it into action.
815 * gets them (resulting in a "panic: closef: count < 0"). 
816 */ 830 */
817 sorflush(so); 831 sorflush(so);
818 unp_free(unp); 832 unp_free(unp);
819 sounlock(so); 833 unp_thread_kick();
820 unp_gc(); 
821 solock(so); 
822 } else 834 } else
823 unp_free(unp); 835 unp_free(unp);
824} 836}
825 837
826int 838int
827unp_bind(struct socket *so, struct mbuf *nam, struct lwp *l) 839unp_bind(struct socket *so, struct mbuf *nam, struct lwp *l)
828{ 840{
829 struct sockaddr_un *sun; 841 struct sockaddr_un *sun;
830 struct unpcb *unp; 842 struct unpcb *unp;
831 vnode_t *vp; 843 vnode_t *vp;
832 struct vattr vattr; 844 struct vattr vattr;
833 size_t addrlen; 845 size_t addrlen;
834 int error; 846 int error;
835 struct nameidata nd; 847 struct nameidata nd;
836 proc_t *p; 848 proc_t *p;
837 849
838 unp = sotounpcb(so); 850 unp = sotounpcb(so);
839 if (unp->unp_vnode != NULL) 851 if (unp->unp_vnode != NULL)
840 return (EINVAL); 852 return (EINVAL);
841 if ((unp->unp_flags & UNP_BUSY) != 0) { 853 if ((unp->unp_flags & UNP_BUSY) != 0) {
842 /* 854 /*
843 * EALREADY may not be strictly accurate, but since this 855 * EALREADY may not be strictly accurate, but since this
844 * is a major application error it's hardly a big deal. 856 * is a major application error it's hardly a big deal.
845 */ 857 */
846 return (EALREADY); 858 return (EALREADY);
847 } 859 }
848 unp->unp_flags |= UNP_BUSY; 860 unp->unp_flags |= UNP_BUSY;
849 sounlock(so); 861 sounlock(so);
850 862
851 /* 863 /*
852 * Allocate the new sockaddr. We have to allocate one 864 * Allocate the new sockaddr. We have to allocate one
853 * extra byte so that we can ensure that the pathname 865 * extra byte so that we can ensure that the pathname
854 * is nul-terminated. 866 * is nul-terminated.
855 */ 867 */
856 p = l->l_proc; 868 p = l->l_proc;
857 addrlen = nam->m_len + 1; 869 addrlen = nam->m_len + 1;
858 sun = malloc(addrlen, M_SONAME, M_WAITOK); 870 sun = malloc(addrlen, M_SONAME, M_WAITOK);
859 m_copydata(nam, 0, nam->m_len, (void *)sun); 871 m_copydata(nam, 0, nam->m_len, (void *)sun);
860 *(((char *)sun) + nam->m_len) = '\0'; 872 *(((char *)sun) + nam->m_len) = '\0';
861 873
862 NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT | TRYEMULROOT, UIO_SYSSPACE, 874 NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT | TRYEMULROOT, UIO_SYSSPACE,
863 sun->sun_path); 875 sun->sun_path);
864 876
865/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ 877/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
866 if ((error = namei(&nd)) != 0) 878 if ((error = namei(&nd)) != 0)
867 goto bad; 879 goto bad;
868 vp = nd.ni_vp; 880 vp = nd.ni_vp;
869 if (vp != NULL) { 881 if (vp != NULL) {
870 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); 882 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
871 if (nd.ni_dvp == vp) 883 if (nd.ni_dvp == vp)
872 vrele(nd.ni_dvp); 884 vrele(nd.ni_dvp);
873 else 885 else
874 vput(nd.ni_dvp); 886 vput(nd.ni_dvp);
875 vrele(vp); 887 vrele(vp);
876 error = EADDRINUSE; 888 error = EADDRINUSE;
877 goto bad; 889 goto bad;
878 } 890 }
879 VATTR_NULL(&vattr); 891 VATTR_NULL(&vattr);
880 vattr.va_type = VSOCK; 892 vattr.va_type = VSOCK;
881 vattr.va_mode = ACCESSPERMS & ~(p->p_cwdi->cwdi_cmask); 893 vattr.va_mode = ACCESSPERMS & ~(p->p_cwdi->cwdi_cmask);
882 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); 894 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
883 if (error) 895 if (error)
884 goto bad; 896 goto bad;
885 vp = nd.ni_vp; 897 vp = nd.ni_vp;
886 solock(so); 898 solock(so);
887 vp->v_socket = unp->unp_socket; 899 vp->v_socket = unp->unp_socket;
888 unp->unp_vnode = vp; 900 unp->unp_vnode = vp;
889 unp->unp_addrlen = addrlen; 901 unp->unp_addrlen = addrlen;
890 unp->unp_addr = sun; 902 unp->unp_addr = sun;
891 unp->unp_connid.unp_pid = p->p_pid; 903 unp->unp_connid.unp_pid = p->p_pid;
892 unp->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred); 904 unp->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred);
893 unp->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred); 905 unp->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred);
894 unp->unp_flags |= UNP_EIDSBIND; 906 unp->unp_flags |= UNP_EIDSBIND;
895 VOP_UNLOCK(vp, 0); 907 VOP_UNLOCK(vp, 0);
896 unp->unp_flags &= ~UNP_BUSY; 908 unp->unp_flags &= ~UNP_BUSY;
897 return (0); 909 return (0);
898 910
899 bad: 911 bad:
900 free(sun, M_SONAME); 912 free(sun, M_SONAME);
901 solock(so); 913 solock(so);
902 unp->unp_flags &= ~UNP_BUSY; 914 unp->unp_flags &= ~UNP_BUSY;
903 return (error); 915 return (error);
904} 916}
905 917
906int 918int
907unp_connect(struct socket *so, struct mbuf *nam, struct lwp *l) 919unp_connect(struct socket *so, struct mbuf *nam, struct lwp *l)
908{ 920{
909 struct sockaddr_un *sun; 921 struct sockaddr_un *sun;
910 vnode_t *vp; 922 vnode_t *vp;
911 struct socket *so2, *so3; 923 struct socket *so2, *so3;
912 struct unpcb *unp, *unp2, *unp3; 924 struct unpcb *unp, *unp2, *unp3;
913 size_t addrlen; 925 size_t addrlen;
914 int error; 926 int error;
915 struct nameidata nd; 927 struct nameidata nd;
916 928
917 unp = sotounpcb(so); 929 unp = sotounpcb(so);
918 if ((unp->unp_flags & UNP_BUSY) != 0) { 930 if ((unp->unp_flags & UNP_BUSY) != 0) {
919 /* 931 /*
920 * EALREADY may not be strictly accurate, but since this 932 * EALREADY may not be strictly accurate, but since this
921 * is a major application error it's hardly a big deal. 933 * is a major application error it's hardly a big deal.
922 */ 934 */
923 return (EALREADY); 935 return (EALREADY);
924 } 936 }
925 unp->unp_flags |= UNP_BUSY; 937 unp->unp_flags |= UNP_BUSY;
926 sounlock(so); 938 sounlock(so);
927 939
928 /* 940 /*
929 * Allocate a temporary sockaddr. We have to allocate one extra 941 * Allocate a temporary sockaddr. We have to allocate one extra
930 * byte so that we can ensure that the pathname is nul-terminated. 942 * byte so that we can ensure that the pathname is nul-terminated.
931 * When we establish the connection, we copy the other PCB's 943 * When we establish the connection, we copy the other PCB's
932 * sockaddr to our own. 944 * sockaddr to our own.
933 */ 945 */
934 addrlen = nam->m_len + 1; 946 addrlen = nam->m_len + 1;
935 sun = malloc(addrlen, M_SONAME, M_WAITOK); 947 sun = malloc(addrlen, M_SONAME, M_WAITOK);
936 m_copydata(nam, 0, nam->m_len, (void *)sun); 948 m_copydata(nam, 0, nam->m_len, (void *)sun);
937 *(((char *)sun) + nam->m_len) = '\0'; 949 *(((char *)sun) + nam->m_len) = '\0';
938 950
939 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_SYSSPACE, 951 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_SYSSPACE,
940 sun->sun_path); 952 sun->sun_path);
941 953
942 if ((error = namei(&nd)) != 0) 954 if ((error = namei(&nd)) != 0)
943 goto bad2; 955 goto bad2;
944 vp = nd.ni_vp; 956 vp = nd.ni_vp;
945 if (vp->v_type != VSOCK) { 957 if (vp->v_type != VSOCK) {
946 error = ENOTSOCK; 958 error = ENOTSOCK;
947 goto bad; 959 goto bad;
948 } 960 }
949 if ((error = VOP_ACCESS(vp, VWRITE, l->l_cred)) != 0) 961 if ((error = VOP_ACCESS(vp, VWRITE, l->l_cred)) != 0)
950 goto bad; 962 goto bad;
951 /* Acquire v_interlock to protect against unp_detach(). */ 963 /* Acquire v_interlock to protect against unp_detach(). */
952 mutex_enter(&vp->v_interlock); 964 mutex_enter(&vp->v_interlock);
953 so2 = vp->v_socket; 965 so2 = vp->v_socket;
954 if (so2 == NULL) { 966 if (so2 == NULL) {
955 mutex_exit(&vp->v_interlock); 967 mutex_exit(&vp->v_interlock);
956 error = ECONNREFUSED; 968 error = ECONNREFUSED;
957 goto bad; 969 goto bad;
958 } 970 }
959 if (so->so_type != so2->so_type) { 971 if (so->so_type != so2->so_type) {
960 mutex_exit(&vp->v_interlock); 972 mutex_exit(&vp->v_interlock);
961 error = EPROTOTYPE; 973 error = EPROTOTYPE;
962 goto bad; 974 goto bad;
963 } 975 }
964 solock(so); 976 solock(so);
965 unp_resetlock(so); 977 unp_resetlock(so);
966 mutex_exit(&vp->v_interlock); 978 mutex_exit(&vp->v_interlock);
967 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { 979 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
968 /* 980 /*
969 * This may seem somewhat fragile but is OK: if we can 981 * This may seem somewhat fragile but is OK: if we can
970 * see SO_ACCEPTCONN set on the endpoint, then it must 982 * see SO_ACCEPTCONN set on the endpoint, then it must
971 * be locked by the domain-wide uipc_lock. 983 * be locked by the domain-wide uipc_lock.
972 */ 984 */
973 KASSERT((so->so_options & SO_ACCEPTCONN) == 0 || 985 KASSERT((so->so_options & SO_ACCEPTCONN) == 0 ||
974 so2->so_lock == uipc_lock); 986 so2->so_lock == uipc_lock);
975 if ((so2->so_options & SO_ACCEPTCONN) == 0 || 987 if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
976 (so3 = sonewconn(so2, 0)) == 0) { 988 (so3 = sonewconn(so2, 0)) == 0) {
977 error = ECONNREFUSED; 989 error = ECONNREFUSED;
978 sounlock(so); 990 sounlock(so);
979 goto bad; 991 goto bad;
980 } 992 }
981 unp2 = sotounpcb(so2); 993 unp2 = sotounpcb(so2);
982 unp3 = sotounpcb(so3); 994 unp3 = sotounpcb(so3);
983 if (unp2->unp_addr) { 995 if (unp2->unp_addr) {
984 unp3->unp_addr = malloc(unp2->unp_addrlen, 996 unp3->unp_addr = malloc(unp2->unp_addrlen,
985 M_SONAME, M_WAITOK); 997 M_SONAME, M_WAITOK);
986 memcpy(unp3->unp_addr, unp2->unp_addr, 998 memcpy(unp3->unp_addr, unp2->unp_addr,
987 unp2->unp_addrlen); 999 unp2->unp_addrlen);
988 unp3->unp_addrlen = unp2->unp_addrlen; 1000 unp3->unp_addrlen = unp2->unp_addrlen;
989 } 1001 }
990 unp3->unp_flags = unp2->unp_flags; 1002 unp3->unp_flags = unp2->unp_flags;
991 unp3->unp_connid.unp_pid = l->l_proc->p_pid; 1003 unp3->unp_connid.unp_pid = l->l_proc->p_pid;
992 unp3->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred); 1004 unp3->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred);
993 unp3->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred); 1005 unp3->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred);
994 unp3->unp_flags |= UNP_EIDSVALID; 1006 unp3->unp_flags |= UNP_EIDSVALID;
995 if (unp2->unp_flags & UNP_EIDSBIND) { 1007 if (unp2->unp_flags & UNP_EIDSBIND) {
996 unp->unp_connid = unp2->unp_connid; 1008 unp->unp_connid = unp2->unp_connid;
997 unp->unp_flags |= UNP_EIDSVALID; 1009 unp->unp_flags |= UNP_EIDSVALID;
998 } 1010 }
999 so2 = so3; 1011 so2 = so3;
1000 } 1012 }
1001 error = unp_connect2(so, so2, PRU_CONNECT); 1013 error = unp_connect2(so, so2, PRU_CONNECT);
1002 sounlock(so); 1014 sounlock(so);
1003 bad: 1015 bad:
1004 vput(vp); 1016 vput(vp);
1005 bad2: 1017 bad2:
1006 free(sun, M_SONAME); 1018 free(sun, M_SONAME);
1007 solock(so); 1019 solock(so);
1008 unp->unp_flags &= ~UNP_BUSY; 1020 unp->unp_flags &= ~UNP_BUSY;
1009 return (error); 1021 return (error);
1010} 1022}
1011 1023
1012int 1024int
1013unp_connect2(struct socket *so, struct socket *so2, int req) 1025unp_connect2(struct socket *so, struct socket *so2, int req)
1014{ 1026{
1015 struct unpcb *unp = sotounpcb(so); 1027 struct unpcb *unp = sotounpcb(so);
1016 struct unpcb *unp2; 1028 struct unpcb *unp2;
1017 1029
1018 if (so2->so_type != so->so_type) 1030 if (so2->so_type != so->so_type)
1019 return (EPROTOTYPE); 1031 return (EPROTOTYPE);
1020 1032
1021 /* 1033 /*
1022 * All three sockets involved must be locked by same lock: 1034 * All three sockets involved must be locked by same lock:
1023 * 1035 *
1024 * local endpoint (so) 1036 * local endpoint (so)
1025 * remote endpoint (so2) 1037 * remote endpoint (so2)
1026 * queue head (so->so_head, only if PR_CONNREQUIRED) 1038 * queue head (so->so_head, only if PR_CONNREQUIRED)
1027 */ 1039 */
1028 KASSERT(solocked2(so, so2)); 1040 KASSERT(solocked2(so, so2));
1029 if (so->so_head != NULL) { 1041 if (so->so_head != NULL) {
1030 KASSERT(so->so_lock == uipc_lock); 1042 KASSERT(so->so_lock == uipc_lock);
1031 KASSERT(solocked2(so, so->so_head)); 1043 KASSERT(solocked2(so, so->so_head));
1032 } 1044 }
1033 1045
1034 unp2 = sotounpcb(so2); 1046 unp2 = sotounpcb(so2);
1035 unp->unp_conn = unp2; 1047 unp->unp_conn = unp2;
1036 switch (so->so_type) { 1048 switch (so->so_type) {
1037 1049
1038 case SOCK_DGRAM: 1050 case SOCK_DGRAM:
1039 unp->unp_nextref = unp2->unp_refs; 1051 unp->unp_nextref = unp2->unp_refs;
1040 unp2->unp_refs = unp; 1052 unp2->unp_refs = unp;
1041 soisconnected(so); 1053 soisconnected(so);
1042 break; 1054 break;
1043 1055
1044 case SOCK_STREAM: 1056 case SOCK_STREAM:
1045 unp2->unp_conn = unp; 1057 unp2->unp_conn = unp;
1046 if (req == PRU_CONNECT && 1058 if (req == PRU_CONNECT &&
1047 ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)) 1059 ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
1048 soisconnecting(so); 1060 soisconnecting(so);
1049 else 1061 else
1050 soisconnected(so); 1062 soisconnected(so);
1051 soisconnected(so2); 1063 soisconnected(so2);
1052 /* 1064 /*
1053 * If the connection is fully established, break the 1065 * If the connection is fully established, break the
1054 * association with uipc_lock and give the connected 1066 * association with uipc_lock and give the connected
1055 * pair a seperate lock to share. For CONNECT2, we 1067 * pair a seperate lock to share. For CONNECT2, we
1056 * require that the locks already match (the sockets 1068 * require that the locks already match (the sockets
1057 * are created that way). 1069 * are created that way).
1058 */ 1070 */
1059 if (req == PRU_CONNECT) 1071 if (req == PRU_CONNECT)
1060 unp_setpeerlocks(so, so2); 1072 unp_setpeerlocks(so, so2);
1061 break; 1073 break;
1062 1074
1063 default: 1075 default:
1064 panic("unp_connect2"); 1076 panic("unp_connect2");
1065 } 1077 }
1066 return (0); 1078 return (0);
1067} 1079}
1068 1080
1069void 1081void
1070unp_disconnect(struct unpcb *unp) 1082unp_disconnect(struct unpcb *unp)
1071{ 1083{
1072 struct unpcb *unp2 = unp->unp_conn; 1084 struct unpcb *unp2 = unp->unp_conn;
1073 struct socket *so; 1085 struct socket *so;
1074 1086
1075 if (unp2 == 0) 1087 if (unp2 == 0)
1076 return; 1088 return;
1077 unp->unp_conn = 0; 1089 unp->unp_conn = 0;
1078 so = unp->unp_socket; 1090 so = unp->unp_socket;
1079 switch (so->so_type) { 1091 switch (so->so_type) {
1080 case SOCK_DGRAM: 1092 case SOCK_DGRAM:
1081 if (unp2->unp_refs == unp) 1093 if (unp2->unp_refs == unp)
1082 unp2->unp_refs = unp->unp_nextref; 1094 unp2->unp_refs = unp->unp_nextref;
1083 else { 1095 else {
1084 unp2 = unp2->unp_refs; 1096 unp2 = unp2->unp_refs;
1085 for (;;) { 1097 for (;;) {
1086 KASSERT(solocked2(so, unp2->unp_socket)); 1098 KASSERT(solocked2(so, unp2->unp_socket));
1087 if (unp2 == 0) 1099 if (unp2 == 0)
1088 panic("unp_disconnect"); 1100 panic("unp_disconnect");
1089 if (unp2->unp_nextref == unp) 1101 if (unp2->unp_nextref == unp)
1090 break; 1102 break;
1091 unp2 = unp2->unp_nextref; 1103 unp2 = unp2->unp_nextref;
1092 } 1104 }
1093 unp2->unp_nextref = unp->unp_nextref; 1105 unp2->unp_nextref = unp->unp_nextref;
1094 } 1106 }
1095 unp->unp_nextref = 0; 1107 unp->unp_nextref = 0;
1096 so->so_state &= ~SS_ISCONNECTED; 1108 so->so_state &= ~SS_ISCONNECTED;
1097 break; 1109 break;
1098 1110
1099 case SOCK_STREAM: 1111 case SOCK_STREAM:
1100 KASSERT(solocked2(so, unp2->unp_socket)); 1112 KASSERT(solocked2(so, unp2->unp_socket));
1101 soisdisconnected(so); 1113 soisdisconnected(so);
1102 unp2->unp_conn = 0; 1114 unp2->unp_conn = 0;
1103 soisdisconnected(unp2->unp_socket); 1115 soisdisconnected(unp2->unp_socket);
1104 break; 1116 break;
1105 } 1117 }
1106} 1118}
1107 1119
1108#ifdef notdef 1120#ifdef notdef
1109unp_abort(struct unpcb *unp) 1121unp_abort(struct unpcb *unp)
1110{ 1122{
1111 unp_detach(unp); 1123 unp_detach(unp);
1112} 1124}
1113#endif 1125#endif
1114 1126
1115void 1127void
1116unp_shutdown(struct unpcb *unp) 1128unp_shutdown(struct unpcb *unp)
1117{ 1129{
1118 struct socket *so; 1130 struct socket *so;
1119 1131
1120 if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && 1132 if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
1121 (so = unp->unp_conn->unp_socket)) 1133 (so = unp->unp_conn->unp_socket))
1122 socantrcvmore(so); 1134 socantrcvmore(so);
1123} 1135}
1124 1136
1125bool 1137bool
1126unp_drop(struct unpcb *unp, int errno) 1138unp_drop(struct unpcb *unp, int errno)
1127{ 1139{
1128 struct socket *so = unp->unp_socket; 1140 struct socket *so = unp->unp_socket;
1129 1141
1130 KASSERT(solocked(so)); 1142 KASSERT(solocked(so));
1131 1143
1132 so->so_error = errno; 1144 so->so_error = errno;
1133 unp_disconnect(unp); 1145 unp_disconnect(unp);
1134 if (so->so_head) { 1146 if (so->so_head) {
1135 so->so_pcb = NULL; 1147 so->so_pcb = NULL;
1136 /* sofree() drops the socket lock */ 1148 /* sofree() drops the socket lock */
1137 sofree(so); 1149 sofree(so);
1138 unp_free(unp); 1150 unp_free(unp);
1139 return true; 1151 return true;
1140 } 1152 }
1141 return false; 1153 return false;
1142} 1154}
1143 1155
1144#ifdef notdef 1156#ifdef notdef
1145unp_drain(void) 1157unp_drain(void)
1146{ 1158{
1147 1159
1148} 1160}
1149#endif 1161#endif
1150 1162
1151int 1163int
1152unp_externalize(struct mbuf *rights, struct lwp *l) 1164unp_externalize(struct mbuf *rights, struct lwp *l)
1153{ 1165{
1154 struct cmsghdr *cm = mtod(rights, struct cmsghdr *); 1166 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
1155 struct proc *p = l->l_proc; 1167 struct proc *p = l->l_proc;
1156 int i, *fdp; 1168 int i, *fdp;
1157 file_t **rp; 1169 file_t **rp;
1158 file_t *fp; 1170 file_t *fp;
1159 int nfds, error = 0; 1171 int nfds, error = 0;
1160 1172
1161 nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / 1173 nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
1162 sizeof(file_t *); 1174 sizeof(file_t *);
1163 rp = (file_t **)CMSG_DATA(cm); 1175 rp = (file_t **)CMSG_DATA(cm);
1164 1176
1165 fdp = malloc(nfds * sizeof(int), M_TEMP, M_WAITOK); 1177 fdp = malloc(nfds * sizeof(int), M_TEMP, M_WAITOK);
1166 rw_enter(&p->p_cwdi->cwdi_lock, RW_READER); 1178 rw_enter(&p->p_cwdi->cwdi_lock, RW_READER);
1167 1179
1168 /* Make sure the recipient should be able to see the descriptors.. */ 1180 /* Make sure the recipient should be able to see the files.. */
1169 if (p->p_cwdi->cwdi_rdir != NULL) { 1181 if (p->p_cwdi->cwdi_rdir != NULL) {
1170 rp = (file_t **)CMSG_DATA(cm); 1182 rp = (file_t **)CMSG_DATA(cm);
1171 for (i = 0; i < nfds; i++) { 1183 for (i = 0; i < nfds; i++) {
1172 fp = *rp++; 1184 fp = *rp++;
1173 /* 1185 /*
1174 * If we are in a chroot'ed directory, and 1186 * If we are in a chroot'ed directory, and
1175 * someone wants to pass us a directory, make 1187 * someone wants to pass us a directory, make
1176 * sure it's inside the subtree we're allowed 1188 * sure it's inside the subtree we're allowed
1177 * to access. 1189 * to access.
1178 */ 1190 */
1179 if (fp->f_type == DTYPE_VNODE) { 1191 if (fp->f_type == DTYPE_VNODE) {
1180 vnode_t *vp = (vnode_t *)fp->f_data; 1192 vnode_t *vp = (vnode_t *)fp->f_data;
1181 if ((vp->v_type == VDIR) && 1193 if ((vp->v_type == VDIR) &&
1182 !vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) { 1194 !vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) {
1183 error = EPERM; 1195 error = EPERM;
1184 break; 1196 break;
1185 } 1197 }
1186 } 1198 }
1187 } 1199 }
1188 } 1200 }
1189 1201
1190 restart: 1202 restart:
1191 rp = (file_t **)CMSG_DATA(cm); 1203 rp = (file_t **)CMSG_DATA(cm);
1192 if (error != 0) { 1204 if (error != 0) {
1193 for (i = 0; i < nfds; i++) { 1205 for (i = 0; i < nfds; i++) {
1194 fp = *rp; 1206 fp = *rp;
1195 /* 
1196 * zero the pointer before calling unp_discard, 
1197 * since it may end up in unp_gc().. 
1198 */ 
1199 *rp++ = 0; 1207 *rp++ = 0;
1200 unp_discard(fp); 1208 unp_discard_now(fp);
1201 } 1209 }
1202 goto out; 1210 goto out;
1203 } 1211 }
1204 1212
1205 /* 1213 /*
1206 * First loop -- allocate file descriptor table slots for the 1214 * First loop -- allocate file descriptor table slots for the
1207 * new descriptors. 1215 * new files.
1208 */ 1216 */
1209 for (i = 0; i < nfds; i++) { 1217 for (i = 0; i < nfds; i++) {
1210 fp = *rp++; 1218 fp = *rp++;
1211 if ((error = fd_alloc(p, 0, &fdp[i])) != 0) { 1219 if ((error = fd_alloc(p, 0, &fdp[i])) != 0) {
1212 /* 1220 /*
1213 * Back out what we've done so far. 1221 * Back out what we've done so far.
1214 */ 1222 */
1215 for (--i; i >= 0; i--) { 1223 for (--i; i >= 0; i--) {
1216 fd_abort(p, NULL, fdp[i]); 1224 fd_abort(p, NULL, fdp[i]);
1217 } 1225 }
1218 if (error == ENOSPC) { 1226 if (error == ENOSPC) {
1219 fd_tryexpand(p); 1227 fd_tryexpand(p);
1220 error = 0; 1228 error = 0;
1221 } else { 1229 } else {
1222 /* 1230 /*
1223 * This is the error that has historically 1231 * This is the error that has historically
1224 * been returned, and some callers may 1232 * been returned, and some callers may
1225 * expect it. 1233 * expect it.
1226 */ 1234 */
1227 error = EMSGSIZE; 1235 error = EMSGSIZE;
1228 } 1236 }
1229 goto restart; 1237 goto restart;
1230 } 1238 }
1231 } 1239 }
1232 1240
1233 /* 1241 /*
1234 * Now that adding them has succeeded, update all of the 1242 * Now that adding them has succeeded, update all of the
1235 * descriptor passing state. 1243 * file passing state and affix the descriptors.
1236 */ 1244 */
1237 rp = (file_t **)CMSG_DATA(cm); 1245 rp = (file_t **)CMSG_DATA(cm);
1238 for (i = 0; i < nfds; i++) { 1246 for (i = 0; i < nfds; i++) {
1239 fp = *rp++; 1247 fp = *rp++;
1240 atomic_dec_uint(&unp_rights); 1248 atomic_dec_uint(&unp_rights);
1241 fd_affix(p, fp, fdp[i]); 1249 fd_affix(p, fp, fdp[i]);
1242 mutex_enter(&fp->f_lock); 1250 mutex_enter(&fp->f_lock);
1243 fp->f_msgcount--; 1251 fp->f_msgcount--;
1244 mutex_exit(&fp->f_lock); 1252 mutex_exit(&fp->f_lock);
1245 /* 1253 /*
1246 * Note that fd_affix() adds a reference to the file. 1254 * Note that fd_affix() adds a reference to the file.
1247 * The file may already have been closed by another 1255 * The file may already have been closed by another
1248 * LWP in the process, so we must drop the reference 1256 * LWP in the process, so we must drop the reference
1249 * added by unp_internalize() with closef(). 1257 * added by unp_internalize() with closef().
1250 */ 1258 */
1251 closef(fp); 1259 closef(fp);
1252 } 1260 }
1253 1261
1254 /* 1262 /*
1255 * Copy temporary array to message and adjust length, in case of 1263 * Copy temporary array to message and adjust length, in case of
1256 * transition from large file_t pointers to ints. 1264 * transition from large file_t pointers to ints.
1257 */ 1265 */
1258 memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int)); 1266 memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int));
1259 cm->cmsg_len = CMSG_LEN(nfds * sizeof(int)); 1267 cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
1260 rights->m_len = CMSG_SPACE(nfds * sizeof(int)); 1268 rights->m_len = CMSG_SPACE(nfds * sizeof(int));
1261 out: 1269 out:
1262 rw_exit(&p->p_cwdi->cwdi_lock); 1270 rw_exit(&p->p_cwdi->cwdi_lock);
1263 free(fdp, M_TEMP); 1271 free(fdp, M_TEMP);
1264 return (error); 1272 return (error);
1265} 1273}
1266 1274
1267int 1275int
1268unp_internalize(struct mbuf **controlp) 1276unp_internalize(struct mbuf **controlp)
1269{ 1277{
1270 struct filedesc *fdescp = curlwp->l_fd; 1278 filedesc_t *fdescp = curlwp->l_fd;
1271 struct mbuf *control = *controlp; 1279 struct mbuf *control = *controlp;
1272 struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *); 1280 struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *);
1273 file_t **rp, **files; 1281 file_t **rp, **files;
1274 file_t *fp; 1282 file_t *fp;
1275 int i, fd, *fdp; 1283 int i, fd, *fdp;
1276 int nfds, error; 1284 int nfds, error;
 1285 u_int maxmsg;
1277 1286
1278 error = 0; 1287 error = 0;
1279 newcm = NULL; 1288 newcm = NULL;
1280 1289
1281 /* Sanity check the control message header. */ 1290 /* Sanity check the control message header. */
1282 if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET || 1291 if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
1283 cm->cmsg_len > control->m_len || 1292 cm->cmsg_len > control->m_len ||
1284 cm->cmsg_len < CMSG_ALIGN(sizeof(*cm))) 1293 cm->cmsg_len < CMSG_ALIGN(sizeof(*cm)))
1285 return (EINVAL); 1294 return (EINVAL);
1286 1295
1287 /* 1296 /*
1288 * Verify that the file descriptors are valid, and acquire 1297 * Verify that the file descriptors are valid, and acquire
1289 * a reference to each. 1298 * a reference to each.
1290 */ 1299 */
1291 nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int); 1300 nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int);
1292 fdp = (int *)CMSG_DATA(cm); 1301 fdp = (int *)CMSG_DATA(cm);
 1302 maxmsg = maxfiles / unp_rights_ratio;
1293 for (i = 0; i < nfds; i++) { 1303 for (i = 0; i < nfds; i++) {
1294 fd = *fdp++; 1304 fd = *fdp++;
 1305 if (atomic_inc_uint_nv(&unp_rights) > maxmsg) {
 1306 atomic_dec_uint(&unp_rights);
 1307 nfds = i;
 1308 error = EAGAIN;
 1309 goto out;
 1310 }
1295 if ((fp = fd_getfile(fd)) == NULL) { 1311 if ((fp = fd_getfile(fd)) == NULL) {
 1312 atomic_dec_uint(&unp_rights);
1296 nfds = i; 1313 nfds = i;
1297 error = EBADF; 1314 error = EBADF;
1298 goto out; 1315 goto out;
1299 } 1316 }
1300 } 1317 }
1301 1318
1302 /* Allocate new space and copy header into it. */ 1319 /* Allocate new space and copy header into it. */
1303 newcm = malloc(CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, M_WAITOK); 1320 newcm = malloc(CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, M_WAITOK);
1304 if (newcm == NULL) { 1321 if (newcm == NULL) {
1305 error = E2BIG; 1322 error = E2BIG;
1306 goto out; 1323 goto out;
1307 } 1324 }
1308 memcpy(newcm, cm, sizeof(struct cmsghdr)); 1325 memcpy(newcm, cm, sizeof(struct cmsghdr));
1309 files = (file_t **)CMSG_DATA(newcm); 1326 files = (file_t **)CMSG_DATA(newcm);
1310 1327
1311 /* 1328 /*
1312 * Transform the file descriptors into file_t pointers, in 1329 * Transform the file descriptors into file_t pointers, in
1313 * reverse order so that if pointers are bigger than ints, the 1330 * reverse order so that if pointers are bigger than ints, the
1314 * int won't get until we're done. No need to lock, as we have 1331 * int won't get until we're done. No need to lock, as we have
1315 * already validated the descriptors with fd_getfile(). 1332 * already validated the descriptors with fd_getfile().
1316 */ 1333 */
1317 fdp = (int *)CMSG_DATA(cm) + nfds; 1334 fdp = (int *)CMSG_DATA(cm) + nfds;
1318 rp = files + nfds; 1335 rp = files + nfds;
1319 for (i = 0; i < nfds; i++) { 1336 for (i = 0; i < nfds; i++) {
1320 fp = fdescp->fd_ofiles[*--fdp]->ff_file; 1337 fp = fdescp->fd_ofiles[*--fdp]->ff_file;
1321 KASSERT(fp != NULL); 1338 KASSERT(fp != NULL);
1322 mutex_enter(&fp->f_lock); 1339 mutex_enter(&fp->f_lock);
1323 *--rp = fp; 1340 *--rp = fp;
1324 fp->f_count++; 1341 fp->f_count++;
1325 fp->f_msgcount++; 1342 fp->f_msgcount++;
1326 mutex_exit(&fp->f_lock); 1343 mutex_exit(&fp->f_lock);
1327 atomic_inc_uint(&unp_rights); 
1328 } 1344 }
1329 1345
1330 out: 1346 out:
1331 /* Release descriptor references. */ 1347 /* Release descriptor references. */
1332 fdp = (int *)CMSG_DATA(cm); 1348 fdp = (int *)CMSG_DATA(cm);
1333 for (i = 0; i < nfds; i++) { 1349 for (i = 0; i < nfds; i++) {
1334 fd_putfile(*fdp++); 1350 fd_putfile(*fdp++);
 1351 if (error != 0) {
 1352 atomic_dec_uint(&unp_rights);
 1353 }
1335 } 1354 }
1336 1355
1337 if (error == 0) { 1356 if (error == 0) {
1338 if (control->m_flags & M_EXT) { 1357 if (control->m_flags & M_EXT) {
1339 m_freem(control); 1358 m_freem(control);
1340 *controlp = control = m_get(M_WAIT, MT_CONTROL); 1359 *controlp = control = m_get(M_WAIT, MT_CONTROL);
1341 } 1360 }
1342 MEXTADD(control, newcm, CMSG_SPACE(nfds * sizeof(file_t *)), 1361 MEXTADD(control, newcm, CMSG_SPACE(nfds * sizeof(file_t *)),
1343 M_MBUF, NULL, NULL); 1362 M_MBUF, NULL, NULL);
1344 cm = newcm; 1363 cm = newcm;
1345 /* 1364 /*
1346 * Adjust message & mbuf to note amount of space 1365 * Adjust message & mbuf to note amount of space
1347 * actually used. 1366 * actually used.
1348 */ 1367 */
1349 cm->cmsg_len = CMSG_LEN(nfds * sizeof(file_t *)); 1368 cm->cmsg_len = CMSG_LEN(nfds * sizeof(file_t *));
1350 control->m_len = CMSG_SPACE(nfds * sizeof(file_t *)); 1369 control->m_len = CMSG_SPACE(nfds * sizeof(file_t *));
1351 } 1370 }
1352 1371
1353 return error; 1372 return error;
1354} 1373}
1355 1374
1356struct mbuf * 1375struct mbuf *
1357unp_addsockcred(struct lwp *l, struct mbuf *control) 1376unp_addsockcred(struct lwp *l, struct mbuf *control)
1358{ 1377{
1359 struct cmsghdr *cmp; 1378 struct cmsghdr *cmp;
1360 struct sockcred *sc; 1379 struct sockcred *sc;
1361 struct mbuf *m, *n; 1380 struct mbuf *m, *n;
1362 int len, space, i; 1381 int len, space, i;
1363 1382
1364 len = CMSG_LEN(SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred))); 1383 len = CMSG_LEN(SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred)));
1365 space = CMSG_SPACE(SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred))); 1384 space = CMSG_SPACE(SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred)));
1366 1385
1367 m = m_get(M_WAIT, MT_CONTROL); 1386 m = m_get(M_WAIT, MT_CONTROL);
1368 if (space > MLEN) { 1387 if (space > MLEN) {
1369 if (space > MCLBYTES) 1388 if (space > MCLBYTES)
1370 MEXTMALLOC(m, space, M_WAITOK); 1389 MEXTMALLOC(m, space, M_WAITOK);
1371 else 1390 else
1372 m_clget(m, M_WAIT); 1391 m_clget(m, M_WAIT);
1373 if ((m->m_flags & M_EXT) == 0) { 1392 if ((m->m_flags & M_EXT) == 0) {
1374 m_free(m); 1393 m_free(m);
1375 return (control); 1394 return (control);
1376 } 1395 }
1377 } 1396 }
1378 1397
1379 m->m_len = space; 1398 m->m_len = space;
1380 m->m_next = NULL; 1399 m->m_next = NULL;
1381 cmp = mtod(m, struct cmsghdr *); 1400 cmp = mtod(m, struct cmsghdr *);
1382 sc = (struct sockcred *)CMSG_DATA(cmp); 1401 sc = (struct sockcred *)CMSG_DATA(cmp);
1383 cmp->cmsg_len = len; 1402 cmp->cmsg_len = len;
1384 cmp->cmsg_level = SOL_SOCKET; 1403 cmp->cmsg_level = SOL_SOCKET;
1385 cmp->cmsg_type = SCM_CREDS; 1404 cmp->cmsg_type = SCM_CREDS;
1386 sc->sc_uid = kauth_cred_getuid(l->l_cred); 1405 sc->sc_uid = kauth_cred_getuid(l->l_cred);
1387 sc->sc_euid = kauth_cred_geteuid(l->l_cred); 1406 sc->sc_euid = kauth_cred_geteuid(l->l_cred);
1388 sc->sc_gid = kauth_cred_getgid(l->l_cred); 1407 sc->sc_gid = kauth_cred_getgid(l->l_cred);
1389 sc->sc_egid = kauth_cred_getegid(l->l_cred); 1408 sc->sc_egid = kauth_cred_getegid(l->l_cred);
1390 sc->sc_ngroups = kauth_cred_ngroups(l->l_cred); 1409 sc->sc_ngroups = kauth_cred_ngroups(l->l_cred);
1391 for (i = 0; i < sc->sc_ngroups; i++) 1410 for (i = 0; i < sc->sc_ngroups; i++)
1392 sc->sc_groups[i] = kauth_cred_group(l->l_cred, i); 1411 sc->sc_groups[i] = kauth_cred_group(l->l_cred, i);
1393 1412
1394 /* 1413 /*
1395 * If a control message already exists, append us to the end. 1414 * If a control message already exists, append us to the end.
1396 */ 1415 */
1397 if (control != NULL) { 1416 if (control != NULL) {
1398 for (n = control; n->m_next != NULL; n = n->m_next) 1417 for (n = control; n->m_next != NULL; n = n->m_next)
1399 ; 1418 ;
1400 n->m_next = m; 1419 n->m_next = m;
1401 } else 1420 } else
1402 control = m; 1421 control = m;
1403 1422
1404 return (control); 1423 return (control);
1405} 1424}
1406 1425
1407int unp_defer, unp_gcing; 
1408extern struct domain unixdomain; 
1409 
1410/* 1426/*
1411 * Comment added long after the fact explaining what's going on here. 1427 * Do a mark-sweep GC of files in the system, to free up any which are
1412 * Do a mark-sweep GC of file descriptors on the system, to free up 1428 * caught in flight to an about-to-be-closed socket. Additionally,
1413 * any which are caught in flight to an about-to-be-closed socket. 1429 * process deferred file closures.
1414 * 
1415 * Traditional mark-sweep gc's start at the "root", and mark 
1416 * everything reachable from the root (which, in our case would be the 
1417 * process table). The mark bits are cleared during the sweep. 
1418 * 
1419 * XXX For some inexplicable reason (perhaps because the file 
1420 * descriptor tables used to live in the u area which could be swapped 
1421 * out and thus hard to reach), we do multiple scans over the set of 
1422 * descriptors, using use *two* mark bits per object (DEFER and MARK). 
1423 * Whenever we find a descriptor which references other descriptors, 
1424 * the ones it references are marked with both bits, and we iterate 
1425 * over the whole file table until there are no more DEFER bits set. 
1426 * We also make an extra pass *before* the GC to clear the mark bits, 
1427 * which could have been cleared at almost no cost during the previous 
1428 * sweep. 
1429 */ 1430 */
1430void 1431static void
1431unp_gc(void) 1432unp_gc(file_t *dp)
1432{ 1433{
1433 file_t *fp, *nextfp; 1434 extern struct domain unixdomain;
 1435 file_t *fp, *np;
1434 struct socket *so, *so1; 1436 struct socket *so, *so1;
1435 file_t **extra_ref, **fpp; 1437 u_int i, old, new;
1436 int nunref, nslots, i; 1438 bool didwork;
1437 1439
1438 if (atomic_swap_uint(&unp_gcing, 1) == 1) 1440 KASSERT(curlwp == unp_thread_lwp);
1439 return; 1441 KASSERT(mutex_owned(&filelist_lock));
1440 1442
1441 restart: 1443 /*
1442 nslots = nfiles * 2; 1444 * First, process deferred file closures.
1443 extra_ref = kmem_alloc(nslots * sizeof(file_t *), KM_SLEEP); 1445 */
 1446 while (!SLIST_EMPTY(&unp_thread_discard)) {
 1447 fp = SLIST_FIRST(&unp_thread_discard);
 1448 KASSERT(fp->f_unpcount > 0);
 1449 KASSERT(fp->f_count > 0);
 1450 KASSERT(fp->f_msgcount > 0);
 1451 KASSERT(fp->f_count >= fp->f_unpcount);
 1452 KASSERT(fp->f_count >= fp->f_msgcount);
 1453 KASSERT(fp->f_msgcount >= fp->f_unpcount);
 1454 SLIST_REMOVE_HEAD(&unp_thread_discard, f_unplist);
 1455 i = fp->f_unpcount;
 1456 fp->f_unpcount = 0;
 1457 mutex_exit(&filelist_lock);
 1458 for (; i != 0; i--) {
 1459 unp_discard_now(fp);
 1460 }
 1461 mutex_enter(&filelist_lock);
 1462 }
1444 1463
1445 mutex_enter(&filelist_lock); 1464 /*
 1465 * Clear mark bits. Ensure that we don't consider new files
 1466 * entering the file table during this loop (they will not have
 1467 * FSCAN set).
 1468 */
1446 unp_defer = 0; 1469 unp_defer = 0;
1447 
1448 /* Clear mark bits */ 
1449 LIST_FOREACH(fp, &filehead, f_list) { 1470 LIST_FOREACH(fp, &filehead, f_list) {
1450 atomic_and_uint(&fp->f_flag, ~(FMARK|FDEFER)); 1471 for (old = fp->f_flag;; old = new) {
 1472 new = atomic_cas_uint(&fp->f_flag, old,
 1473 (old | FSCAN) & ~(FMARK|FDEFER));
 1474 if (__predict_true(old == new)) {
 1475 break;
 1476 }
 1477 }
1451 } 1478 }
1452 1479
1453 /* 1480 /*
1454 * Iterate over the set of descriptors, marking ones believed 1481 * Iterate over the set of sockets, marking ones believed (based on
1455 * (based on refcount) to be referenced from a process, and 1482 * refcount) to be referenced from a process, and marking for rescan
1456 * marking for rescan descriptors which are queued on a socket. 1483 * sockets which are queued on a socket. Recan continues descending
 1484 * and searching for sockets referenced by sockets (FDEFER), until
 1485 * there are no more socket->socket references to be discovered.
1457 */ 1486 */
1458 do { 1487 do {
1459 LIST_FOREACH(fp, &filehead, f_list) { 1488 didwork = false;
 1489 for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
 1490 KASSERT(mutex_owned(&filelist_lock));
 1491 np = LIST_NEXT(fp, f_list);
1460 mutex_enter(&fp->f_lock); 1492 mutex_enter(&fp->f_lock);
1461 if (fp->f_flag & FDEFER) { 1493 if ((fp->f_flag & FDEFER) != 0) {
1462 atomic_and_uint(&fp->f_flag, ~FDEFER); 1494 atomic_and_uint(&fp->f_flag, ~FDEFER);
1463 unp_defer--; 1495 unp_defer--;
1464 KASSERT(fp->f_count != 0); 1496 KASSERT(fp->f_count != 0);
1465 } else { 1497 } else {
1466 if (fp->f_count == 0 || 1498 if (fp->f_count == 0 ||
1467 (fp->f_flag & FMARK) || 1499 (fp->f_flag & FMARK) != 0 ||
1468 fp->f_count == fp->f_msgcount) { 1500 fp->f_count == fp->f_msgcount ||
 1501 fp->f_unpcount != 0) {
1469 mutex_exit(&fp->f_lock); 1502 mutex_exit(&fp->f_lock);
1470 continue; 1503 continue;
1471 } 1504 }
1472 } 1505 }
1473 atomic_or_uint(&fp->f_flag, FMARK); 1506 atomic_or_uint(&fp->f_flag, FMARK);
1474 1507
1475 if (fp->f_type != DTYPE_SOCKET || 1508 if (fp->f_type != DTYPE_SOCKET ||
1476 (so = fp->f_data) == NULL || 1509 (so = fp->f_data) == NULL ||
1477 so->so_proto->pr_domain != &unixdomain || 1510 so->so_proto->pr_domain != &unixdomain ||
1478 (so->so_proto->pr_flags&PR_RIGHTS) == 0) { 1511 (so->so_proto->pr_flags & PR_RIGHTS) == 0) {
1479 mutex_exit(&fp->f_lock); 1512 mutex_exit(&fp->f_lock);
1480 continue; 1513 continue;
1481 } 1514 }
1482#ifdef notdef 1515
1483 if (so->so_rcv.sb_flags & SB_LOCK) { 1516 /* Gain file ref, mark our position, and unlock. */
1484 mutex_exit(&fp->f_lock); 1517 didwork = true;
1485 mutex_exit(&filelist_lock); 1518 LIST_INSERT_AFTER(fp, dp, f_list);
1486 kmem_free(extra_ref, nslots * sizeof(file_t *)); 1519 fp->f_count++;
1487 /* 
1488 * This is problematical; it's not clear 
1489 * we need to wait for the sockbuf to be 
1490 * unlocked (on a uniprocessor, at least), 
1491 * and it's also not clear what to do 
1492 * if sbwait returns an error due to receipt 
1493 * of a signal. If sbwait does return 
1494 * an error, we'll go into an infinite 
1495 * loop. Delete all of this for now. 
1496 */ 
1497 (void) sbwait(&so->so_rcv); 
1498 goto restart; 
1499 } 
1500#endif 
1501 mutex_exit(&fp->f_lock); 1520 mutex_exit(&fp->f_lock);
 1521 mutex_exit(&filelist_lock);
1502 1522
1503 /* 1523 /*
1504 * XXX Locking a socket with filelist_lock held 1524 * Mark files referenced from sockets queued on the
1505 * is ugly. filelist_lock can be taken by the 1525 * accept queue as well.
1506 * pagedaemon when reclaiming items from file_cache. 
1507 * Socket activity could delay the pagedaemon. 
1508 */ 1526 */
1509 solock(so); 1527 solock(so);
1510 unp_scan(so->so_rcv.sb_mb, unp_mark, 0); 1528 unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
1511 /* 1529 if ((so->so_options & SO_ACCEPTCONN) != 0) {
1512 * Mark descriptors referenced from sockets queued 
1513 * on the accept queue as well. 
1514 */ 
1515 if (so->so_options & SO_ACCEPTCONN) { 
1516 TAILQ_FOREACH(so1, &so->so_q0, so_qe) { 1530 TAILQ_FOREACH(so1, &so->so_q0, so_qe) {
1517 unp_scan(so1->so_rcv.sb_mb, unp_mark, 0); 1531 unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
1518 } 1532 }
1519 TAILQ_FOREACH(so1, &so->so_q, so_qe) { 1533 TAILQ_FOREACH(so1, &so->so_q, so_qe) {
1520 unp_scan(so1->so_rcv.sb_mb, unp_mark, 0); 1534 unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
1521 } 1535 }
1522 } 1536 }
1523 sounlock(so); 1537 sounlock(so);
 1538
 1539 /* Re-lock and restart from where we left off. */
 1540 closef(fp);
 1541 mutex_enter(&filelist_lock);
 1542 np = LIST_NEXT(dp, f_list);
 1543 LIST_REMOVE(dp, f_list);
1524 } 1544 }
1525 } while (unp_defer); 1545 /*
 1546 * Bail early if we did nothing in the loop above. Could
 1547 * happen because of concurrent activity causing unp_defer
 1548 * to get out of sync.
 1549 */
 1550 } while (unp_defer != 0 && didwork);
1526 1551
1527 /* 1552 /*
1528 * Sweep pass. Find unmarked descriptors, and free them. 1553 * Sweep pass.
1529 * 
1530 * We grab an extra reference to each of the file table entries 
1531 * that are not otherwise accessible and then free the rights 
1532 * that are stored in messages on them. 
1533 * 
1534 * The bug in the original code is a little tricky, so I'll describe 
1535 * what's wrong with it here. 
1536 * 
1537 * It is incorrect to simply unp_discard each entry for f_msgcount 
1538 * times -- consider the case of sockets A and B that contain 
1539 * references to each other. On a last close of some other socket, 
1540 * we trigger a gc since the number of outstanding rights (unp_rights) 
1541 * is non-zero. If during the sweep phase the gc code un_discards, 
1542 * we end up doing a (full) closef on the descriptor. A closef on A 
1543 * results in the following chain. Closef calls soo_close, which 
1544 * calls soclose. Soclose calls first (through the switch 
1545 * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply 
1546 * returns because the previous instance had set unp_gcing, and 
1547 * we return all the way back to soclose, which marks the socket 
1548 * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush 
1549 * to free up the rights that are queued in messages on the socket A, 
1550 * i.e., the reference on B. The sorflush calls via the dom_dispose 
1551 * switch unp_dispose, which unp_scans with unp_discard. This second 
1552 * instance of unp_discard just calls closef on B. 
1553 * 1554 *
1554 * Well, a similar chain occurs on B, resulting in a sorflush on B, 1555 * We grab an extra reference to each of the files that are
1555 * which results in another closef on A. Unfortunately, A is already 1556 * not otherwise accessible and then free the rights that are
1556 * being closed, and the descriptor has already been marked with 1557 * stored in messages on them.
1557 * SS_NOFDREF, and soclose panics at this point. 
1558 * 
1559 * Here, we first take an extra reference to each inaccessible 
1560 * descriptor. Then, if the inaccessible descriptor is a 
1561 * socket, we call sorflush in case it is a Unix domain 
1562 * socket. After we destroy all the rights carried in 
1563 * messages, we do a last closef to get rid of our extra 
1564 * reference. This is the last close, and the unp_detach etc 
1565 * will shut down the socket. 
1566 * 
1567 * 91/09/19, bsy@cs.cmu.edu 
1568 */ 1558 */
1569 if (nslots < nfiles) { 1559 for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
1570 mutex_exit(&filelist_lock); 1560 KASSERT(mutex_owned(&filelist_lock));
1571 kmem_free(extra_ref, nslots * sizeof(file_t *)); 1561 np = LIST_NEXT(fp, f_list);
1572 goto restart; 
1573 } 
1574 for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != 0; 
1575 fp = nextfp) { 
1576 nextfp = LIST_NEXT(fp, f_list); 
1577 mutex_enter(&fp->f_lock); 1562 mutex_enter(&fp->f_lock);
1578 if (fp->f_count != 0 && 1563
1579 fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) { 1564 /*
1580 *fpp++ = fp; 1565 * Ignore non-sockets.
1581 nunref++; 1566 * Ignore dead sockets, or sockets with pending close.
1582 fp->f_count++; 1567 * Ignore sockets obviously referenced elsewhere.
 1568 * Ignore sockets marked as referenced by our scan.
 1569 * Ignore new sockets that did not exist during the scan.
 1570 */
 1571 if (fp->f_type != DTYPE_SOCKET ||
 1572 fp->f_count == 0 || fp->f_unpcount != 0 ||
 1573 fp->f_count != fp->f_msgcount ||
 1574 (fp->f_flag & (FMARK | FSCAN)) != FSCAN) {
 1575 mutex_exit(&fp->f_lock);
 1576 continue;
1583 } 1577 }
 1578
 1579 /* Gain file ref, mark our position, and unlock. */
 1580 LIST_INSERT_AFTER(fp, dp, f_list);
 1581 fp->f_count++;
1584 mutex_exit(&fp->f_lock); 1582 mutex_exit(&fp->f_lock);
 1583 mutex_exit(&filelist_lock);
 1584
 1585 /*
 1586 * Flush all data from the socket's receive buffer.
 1587 * This will cause files referenced only by the
 1588 * socket to be queued for close.
 1589 */
 1590 so = fp->f_data;
 1591 solock(so);
 1592 sorflush(so);
 1593 sounlock(so);
 1594
 1595 /* Re-lock and restart from where we left off. */
 1596 closef(fp);
 1597 mutex_enter(&filelist_lock);
 1598 np = LIST_NEXT(dp, f_list);
 1599 LIST_REMOVE(dp, f_list);
1585 } 1600 }
1586 mutex_exit(&filelist_lock); 1601}
1587 1602
1588 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { 1603/*
1589 fp = *fpp; 1604 * Garbage collector thread. While SCM_RIGHTS messages are in transit,
1590 if (fp->f_type == DTYPE_SOCKET) { 1605 * wake once per second to garbage collect. Run continually while we
1591 so = fp->f_data; 1606 * have deferred closes to process.
1592 solock(so); 1607 */
1593 sorflush(fp->f_data); 1608static void
1594 sounlock(so); 1609unp_thread(void *cookie)
 1610{
 1611 file_t *dp;
 1612
 1613 /* Allocate a dummy file for our scans. */
 1614 if ((dp = fgetdummy()) == NULL) {
 1615 panic("unp_thread");
 1616 }
 1617
 1618 mutex_enter(&filelist_lock);
 1619 for (;;) {
 1620 KASSERT(mutex_owned(&filelist_lock));
 1621 if (SLIST_EMPTY(&unp_thread_discard)) {
 1622 if (unp_rights != 0) {
 1623 (void)cv_timedwait(&unp_thread_cv,
 1624 &filelist_lock, hz);
 1625 } else {
 1626 cv_wait(&unp_thread_cv, &filelist_lock);
 1627 }
1595 } 1628 }
 1629 unp_gc(dp);
1596 } 1630 }
1597 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { 1631 /* NOTREACHED */
1598 closef(*fpp); 1632}
 1633
 1634/*
 1635 * Kick the garbage collector into action if there is something for
 1636 * it to process.
 1637 */
 1638static void
 1639unp_thread_kick(void)
 1640{
 1641
 1642 if (!SLIST_EMPTY(&unp_thread_discard) || unp_rights != 0) {
 1643 mutex_enter(&filelist_lock);
 1644 cv_signal(&unp_thread_cv);
 1645 mutex_exit(&filelist_lock);
1599 } 1646 }
1600 kmem_free(extra_ref, nslots * sizeof(file_t *)); 
1601 atomic_swap_uint(&unp_gcing, 0); 
1602} 1647}
1603 1648
1604void 1649void
1605unp_dispose(struct mbuf *m) 1650unp_dispose(struct mbuf *m)
1606{ 1651{
1607 1652
1608 if (m) 1653 if (m)
1609 unp_scan(m, unp_discard, 1); 1654 unp_scan(m, unp_discard_later, 1);
1610} 1655}
1611 1656
1612void 1657void
1613unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard) 1658unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard)
1614{ 1659{
1615 struct mbuf *m; 1660 struct mbuf *m;
1616 file_t **rp; 1661 file_t **rp, *fp;
1617 struct cmsghdr *cm; 1662 struct cmsghdr *cm;
1618 int i; 1663 int i, qfds;
1619 int qfds; 
1620 1664
1621 while (m0) { 1665 while (m0) {
1622 for (m = m0; m; m = m->m_next) { 1666 for (m = m0; m; m = m->m_next) {
1623 if (m->m_type == MT_CONTROL && 1667 if (m->m_type != MT_CONTROL ||
1624 m->m_len >= sizeof(*cm)) { 1668 m->m_len < sizeof(*cm)) {
1625 cm = mtod(m, struct cmsghdr *); 1669 continue;
1626 if (cm->cmsg_level != SOL_SOCKET || 1670 }
1627 cm->cmsg_type != SCM_RIGHTS) 1671 cm = mtod(m, struct cmsghdr *);
1628 continue; 1672 if (cm->cmsg_level != SOL_SOCKET ||
1629 qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) 1673 cm->cmsg_type != SCM_RIGHTS)
1630 / sizeof(file_t *); 1674 continue;
1631 rp = (file_t **)CMSG_DATA(cm); 1675 qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm)))
1632 for (i = 0; i < qfds; i++) { 1676 / sizeof(file_t *);
1633 file_t *fp = *rp; 1677 rp = (file_t **)CMSG_DATA(cm);
1634 if (discard) 1678 for (i = 0; i < qfds; i++) {
1635 *rp = 0; 1679 fp = *rp;
1636 (*op)(fp); 1680 if (discard) {
1637 rp++; 1681 *rp = 0;
1638 } 1682 }
1639 break; /* XXX, but saves time */ 1683 (*op)(fp);
 1684 rp++;
1640 } 1685 }
1641 } 1686 }
1642 m0 = m0->m_nextpkt; 1687 m0 = m0->m_nextpkt;
1643 } 1688 }
1644} 1689}
1645 1690
1646void 1691void
1647unp_mark(file_t *fp) 1692unp_mark(file_t *fp)
1648{ 1693{
1649 1694
1650 if (fp == NULL) 1695 if (fp == NULL)
1651 return; 1696 return;
1652 1697
1653 /* If we're already deferred, don't screw up the defer count */ 1698 /* If we're already deferred, don't screw up the defer count */
1654 mutex_enter(&fp->f_lock); 1699 mutex_enter(&fp->f_lock);
1655 if (fp->f_flag & (FMARK | FDEFER)) { 1700 if (fp->f_flag & (FMARK | FDEFER)) {
1656 mutex_exit(&fp->f_lock); 1701 mutex_exit(&fp->f_lock);
1657 return; 1702 return;
1658 } 1703 }
1659 1704
1660 /* 1705 /*
1661 * Minimize the number of deferrals... Sockets are the only 1706 * Minimize the number of deferrals... Sockets are the only type of
1662 * type of descriptor which can hold references to another 1707 * file which can hold references to another file, so just mark
1663 * descriptor, so just mark other descriptors, and defer 1708 * other files, and defer unmarked sockets for the next pass.
1664 * unmarked sockets for the next pass. 
1665 */ 1709 */
1666 if (fp->f_type == DTYPE_SOCKET) { 1710 if (fp->f_type == DTYPE_SOCKET) {
1667 unp_defer++; 1711 unp_defer++;
1668 KASSERT(fp->f_count != 0); 1712 KASSERT(fp->f_count != 0);
1669 atomic_or_uint(&fp->f_flag, FDEFER); 1713 atomic_or_uint(&fp->f_flag, FDEFER);
1670 } else { 1714 } else {
1671 atomic_or_uint(&fp->f_flag, FMARK); 1715 atomic_or_uint(&fp->f_flag, FMARK);
1672 } 1716 }
1673 mutex_exit(&fp->f_lock); 1717 mutex_exit(&fp->f_lock);
1674 return; 
1675} 1718}
1676 1719
1677void 1720static void
1678unp_discard(file_t *fp) 1721unp_discard_now(file_t *fp)
1679{ 1722{
1680 1723
1681 if (fp == NULL) 1724 if (fp == NULL)
1682 return; 1725 return;
1683 1726
1684 mutex_enter(&fp->f_lock); 
1685 KASSERT(fp->f_count > 0); 1727 KASSERT(fp->f_count > 0);
 1728 KASSERT(fp->f_msgcount > 0);
 1729
 1730 mutex_enter(&fp->f_lock);
1686 fp->f_msgcount--; 1731 fp->f_msgcount--;
1687 mutex_exit(&fp->f_lock); 1732 mutex_exit(&fp->f_lock);
1688 atomic_dec_uint(&unp_rights); 1733 atomic_dec_uint(&unp_rights);
1689 (void)closef(fp); 1734 (void)closef(fp);
1690} 1735}
 1736
 1737static void
 1738unp_discard_later(file_t *fp)
 1739{
 1740
 1741 if (fp == NULL)
 1742 return;
 1743
 1744 KASSERT(fp->f_count > 0);
 1745 KASSERT(fp->f_msgcount > 0);
 1746
 1747 mutex_enter(&filelist_lock);
 1748 if (fp->f_unpcount++ == 0) {
 1749 SLIST_INSERT_HEAD(&unp_thread_discard, fp, f_unplist);
 1750 }
 1751 mutex_exit(&filelist_lock);
 1752}

cvs diff -r1.34 -r1.34.64.1 src/sys/sys/fcntl.h (switch to unified diff)

--- src/sys/sys/fcntl.h 2006/10/05 14:48:33 1.34
+++ src/sys/sys/fcntl.h 2009/03/18 05:33:23 1.34.64.1
@@ -1,287 +1,288 @@ @@ -1,287 +1,288 @@
1/* $NetBSD: fcntl.h,v 1.34 2006/10/05 14:48:33 chs Exp $ */ 1/* $NetBSD: fcntl.h,v 1.34.64.1 2009/03/18 05:33:23 snj Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 1983, 1990, 1993 4 * Copyright (c) 1983, 1990, 1993
5 * The Regents of the University of California. All rights reserved. 5 * The Regents of the University of California. All rights reserved.
6 * (c) UNIX System Laboratories, Inc. 6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed 7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph 8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc. 10 * the permission of UNIX System Laboratories, Inc.
11 * 11 *
12 * Redistribution and use in source and binary forms, with or without 12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions 13 * modification, are permitted provided that the following conditions
14 * are met: 14 * are met:
15 * 1. Redistributions of source code must retain the above copyright 15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer. 16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright 17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the 18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution. 19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors 20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software 21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission. 22 * without specific prior written permission.
23 * 23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE. 34 * SUCH DAMAGE.
35 * 35 *
36 * @(#)fcntl.h 8.3 (Berkeley) 1/21/94 36 * @(#)fcntl.h 8.3 (Berkeley) 1/21/94
37 */ 37 */
38 38
39#ifndef _SYS_FCNTL_H_ 39#ifndef _SYS_FCNTL_H_
40#define _SYS_FCNTL_H_ 40#define _SYS_FCNTL_H_
41 41
42/* 42/*
43 * This file includes the definitions for open and fcntl 43 * This file includes the definitions for open and fcntl
44 * described by POSIX for <fcntl.h>; it also includes 44 * described by POSIX for <fcntl.h>; it also includes
45 * related kernel definitions. 45 * related kernel definitions.
46 */ 46 */
47 47
48#ifndef _KERNEL 48#ifndef _KERNEL
49#include <sys/featuretest.h> 49#include <sys/featuretest.h>
50#include <sys/types.h> 50#include <sys/types.h>
51#if defined(_XOPEN_SOURCE) || defined(_NETBSD_SOURCE) 51#if defined(_XOPEN_SOURCE) || defined(_NETBSD_SOURCE)
52#include <sys/stat.h> 52#include <sys/stat.h>
53#endif /* _XOPEN_SOURCE || _NETBSD_SOURCE */ 53#endif /* _XOPEN_SOURCE || _NETBSD_SOURCE */
54#endif /* !_KERNEL */ 54#endif /* !_KERNEL */
55 55
56/* 56/*
57 * File status flags: these are used by open(2), fcntl(2). 57 * File status flags: these are used by open(2), fcntl(2).
58 * They are also used (indirectly) in the kernel file structure f_flags, 58 * They are also used (indirectly) in the kernel file structure f_flags,
59 * which is a superset of the open/fcntl flags. Open flags and f_flags 59 * which is a superset of the open/fcntl flags. Open flags and f_flags
60 * are inter-convertible using OFLAGS(fflags) and FFLAGS(oflags). 60 * are inter-convertible using OFLAGS(fflags) and FFLAGS(oflags).
61 * Open/fcntl flags begin with O_; kernel-internal flags begin with F. 61 * Open/fcntl flags begin with O_; kernel-internal flags begin with F.
62 */ 62 */
63/* open-only flags */ 63/* open-only flags */
64#define O_RDONLY 0x00000000 /* open for reading only */ 64#define O_RDONLY 0x00000000 /* open for reading only */
65#define O_WRONLY 0x00000001 /* open for writing only */ 65#define O_WRONLY 0x00000001 /* open for writing only */
66#define O_RDWR 0x00000002 /* open for reading and writing */ 66#define O_RDWR 0x00000002 /* open for reading and writing */
67#define O_ACCMODE 0x00000003 /* mask for above modes */ 67#define O_ACCMODE 0x00000003 /* mask for above modes */
68 68
69/* 69/*
70 * Kernel encoding of open mode; separate read and write bits that are 70 * Kernel encoding of open mode; separate read and write bits that are
71 * independently testable: 1 greater than the above. 71 * independently testable: 1 greater than the above.
72 * 72 *
73 * XXX 73 * XXX
74 * FREAD and FWRITE are excluded from the #ifdef _KERNEL so that TIOCFLUSH, 74 * FREAD and FWRITE are excluded from the #ifdef _KERNEL so that TIOCFLUSH,
75 * which was documented to use FREAD/FWRITE, continues to work. 75 * which was documented to use FREAD/FWRITE, continues to work.
76 */ 76 */
77#if defined(_NETBSD_SOURCE) 77#if defined(_NETBSD_SOURCE)
78#define FREAD 0x00000001 78#define FREAD 0x00000001
79#define FWRITE 0x00000002 79#define FWRITE 0x00000002
80#endif 80#endif
81#define O_NONBLOCK 0x00000004 /* no delay */ 81#define O_NONBLOCK 0x00000004 /* no delay */
82#define O_APPEND 0x00000008 /* set append mode */ 82#define O_APPEND 0x00000008 /* set append mode */
83#if defined(_NETBSD_SOURCE) 83#if defined(_NETBSD_SOURCE)
84#define O_SHLOCK 0x00000010 /* open with shared file lock */ 84#define O_SHLOCK 0x00000010 /* open with shared file lock */
85#define O_EXLOCK 0x00000020 /* open with exclusive file lock */ 85#define O_EXLOCK 0x00000020 /* open with exclusive file lock */
86#define O_ASYNC 0x00000040 /* signal pgrp when data ready */ 86#define O_ASYNC 0x00000040 /* signal pgrp when data ready */
87#endif 87#endif
88#if (_POSIX_C_SOURCE - 0) >= 199309L || \ 88#if (_POSIX_C_SOURCE - 0) >= 199309L || \
89 (defined(_XOPEN_SOURCE) && defined(_XOPEN_SOURCE_EXTENDED)) || \ 89 (defined(_XOPEN_SOURCE) && defined(_XOPEN_SOURCE_EXTENDED)) || \
90 (_XOPEN_SOURCE - 0) >= 500 || defined(_NETBSD_SOURCE) 90 (_XOPEN_SOURCE - 0) >= 500 || defined(_NETBSD_SOURCE)
91#define O_SYNC 0x00000080 /* synchronous writes */ 91#define O_SYNC 0x00000080 /* synchronous writes */
92#endif 92#endif
93#if defined(_NETBSD_SOURCE) 93#if defined(_NETBSD_SOURCE)
94#define O_NOFOLLOW 0x00000100 /* don't follow symlinks on the last */ 94#define O_NOFOLLOW 0x00000100 /* don't follow symlinks on the last */
95 /* path component */ 95 /* path component */
96#endif 96#endif
97#define O_CREAT 0x00000200 /* create if nonexistent */ 97#define O_CREAT 0x00000200 /* create if nonexistent */
98#define O_TRUNC 0x00000400 /* truncate to zero length */ 98#define O_TRUNC 0x00000400 /* truncate to zero length */
99#define O_EXCL 0x00000800 /* error if already exists */ 99#define O_EXCL 0x00000800 /* error if already exists */
100 100
101#if (_POSIX_C_SOURCE - 0) >= 199309L || (_XOPEN_SOURCE - 0) >= 500 || \ 101#if (_POSIX_C_SOURCE - 0) >= 199309L || (_XOPEN_SOURCE - 0) >= 500 || \
102 defined(_NETBSD_SOURCE) 102 defined(_NETBSD_SOURCE)
103#define O_DSYNC 0x00010000 /* write: I/O data completion */ 103#define O_DSYNC 0x00010000 /* write: I/O data completion */
104#define O_RSYNC 0x00020000 /* read: I/O completion as for write */ 104#define O_RSYNC 0x00020000 /* read: I/O completion as for write */
105#endif 105#endif
106 106
107#if defined(_NETBSD_SOURCE) 107#if defined(_NETBSD_SOURCE)
108#define O_ALT_IO 0x00040000 /* use alternate i/o semantics */ 108#define O_ALT_IO 0x00040000 /* use alternate i/o semantics */
109#define O_DIRECT 0x00080000 /* direct I/O hint */ 109#define O_DIRECT 0x00080000 /* direct I/O hint */
110#endif 110#endif
111 111
112/* defined by POSIX 1003.1; BSD default, but required to be bitwise distinct */ 112/* defined by POSIX 1003.1; BSD default, but required to be bitwise distinct */
113#define O_NOCTTY 0x00008000 /* don't assign controlling terminal */ 113#define O_NOCTTY 0x00008000 /* don't assign controlling terminal */
114 114
115#ifdef _KERNEL 115#ifdef _KERNEL
116/* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */ 116/* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */
117#define FFLAGS(oflags) ((oflags) + 1) 117#define FFLAGS(oflags) ((oflags) + 1)
118#define OFLAGS(fflags) ((fflags) - 1) 118#define OFLAGS(fflags) ((fflags) - 1)
119 119
120/* all bits settable during open(2) */ 120/* all bits settable during open(2) */
121#define O_MASK (O_ACCMODE|O_NONBLOCK|O_APPEND|O_SHLOCK|O_EXLOCK|\ 121#define O_MASK (O_ACCMODE|O_NONBLOCK|O_APPEND|O_SHLOCK|O_EXLOCK|\
122 O_ASYNC|O_SYNC|O_CREAT|O_TRUNC|O_EXCL|O_DSYNC|\ 122 O_ASYNC|O_SYNC|O_CREAT|O_TRUNC|O_EXCL|O_DSYNC|\
123 O_RSYNC|O_NOCTTY|O_ALT_IO|O_NOFOLLOW|O_DIRECT) 123 O_RSYNC|O_NOCTTY|O_ALT_IO|O_NOFOLLOW|O_DIRECT)
124 124
125#define FMARK 0x00001000 /* mark during gc() */ 125#define FMARK 0x00001000 /* mark during gc() */
126#define FDEFER 0x00002000 /* defer for next gc pass */ 126#define FDEFER 0x00002000 /* defer for next gc pass */
127#define FHASLOCK 0x00004000 /* descriptor holds advisory lock */ 127#define FHASLOCK 0x00004000 /* descriptor holds advisory lock */
 128#define FSCAN 0x00100000 /* scan during gc passes */
128#define FKIOCTL 0x80000000 /* kernel originated ioctl */ 129#define FKIOCTL 0x80000000 /* kernel originated ioctl */
129/* bits settable by fcntl(F_SETFL, ...) */ 130/* bits settable by fcntl(F_SETFL, ...) */
130#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FDSYNC|FRSYNC|FALTIO|\ 131#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FDSYNC|FRSYNC|FALTIO|\
131 FDIRECT) 132 FDIRECT)
132/* bits to save after open(2) */ 133/* bits to save after open(2) */
133#define FMASK (FREAD|FWRITE|FCNTLFLAGS) 134#define FMASK (FREAD|FWRITE|FCNTLFLAGS)
134#endif /* _KERNEL */ 135#endif /* _KERNEL */
135 136
136/* 137/*
137 * The O_* flags used to have only F* names, which were used in the kernel 138 * The O_* flags used to have only F* names, which were used in the kernel
138 * and by fcntl. We retain the F* names for the kernel f_flags field 139 * and by fcntl. We retain the F* names for the kernel f_flags field
139 * and for backward compatibility for fcntl. 140 * and for backward compatibility for fcntl.
140 */ 141 */
141#if defined(_NETBSD_SOURCE) 142#if defined(_NETBSD_SOURCE)
142#define FAPPEND O_APPEND /* kernel/compat */ 143#define FAPPEND O_APPEND /* kernel/compat */
143#define FASYNC O_ASYNC /* kernel/compat */ 144#define FASYNC O_ASYNC /* kernel/compat */
144#define O_FSYNC O_SYNC /* compat */ 145#define O_FSYNC O_SYNC /* compat */
145#define FNDELAY O_NONBLOCK /* compat */ 146#define FNDELAY O_NONBLOCK /* compat */
146#define O_NDELAY O_NONBLOCK /* compat */ 147#define O_NDELAY O_NONBLOCK /* compat */
147#endif 148#endif
148#if defined(_KERNEL) 149#if defined(_KERNEL)
149#define FNONBLOCK O_NONBLOCK /* kernel */ 150#define FNONBLOCK O_NONBLOCK /* kernel */
150#define FFSYNC O_SYNC /* kernel */ 151#define FFSYNC O_SYNC /* kernel */
151#define FDSYNC O_DSYNC /* kernel */ 152#define FDSYNC O_DSYNC /* kernel */
152#define FRSYNC O_RSYNC /* kernel */ 153#define FRSYNC O_RSYNC /* kernel */
153#define FALTIO O_ALT_IO /* kernel */ 154#define FALTIO O_ALT_IO /* kernel */
154#define FDIRECT O_DIRECT /* kernel */ 155#define FDIRECT O_DIRECT /* kernel */
155#endif 156#endif
156 157
157/* 158/*
158 * Constants used for fcntl(2) 159 * Constants used for fcntl(2)
159 */ 160 */
160 161
161/* command values */ 162/* command values */
162#define F_DUPFD 0 /* duplicate file descriptor */ 163#define F_DUPFD 0 /* duplicate file descriptor */
163#define F_GETFD 1 /* get file descriptor flags */ 164#define F_GETFD 1 /* get file descriptor flags */
164#define F_SETFD 2 /* set file descriptor flags */ 165#define F_SETFD 2 /* set file descriptor flags */
165#define F_GETFL 3 /* get file status flags */ 166#define F_GETFL 3 /* get file status flags */
166#define F_SETFL 4 /* set file status flags */ 167#define F_SETFL 4 /* set file status flags */
167#if (_POSIX_C_SOURCE - 0) >= 200112L || (_XOPEN_SOURCE - 0) >= 500 || \ 168#if (_POSIX_C_SOURCE - 0) >= 200112L || (_XOPEN_SOURCE - 0) >= 500 || \
168 defined(_NETBSD_SOURCE) 169 defined(_NETBSD_SOURCE)
169#define F_GETOWN 5 /* get SIGIO/SIGURG proc/pgrp */ 170#define F_GETOWN 5 /* get SIGIO/SIGURG proc/pgrp */
170#define F_SETOWN 6 /* set SIGIO/SIGURG proc/pgrp */ 171#define F_SETOWN 6 /* set SIGIO/SIGURG proc/pgrp */
171#endif 172#endif
172#define F_GETLK 7 /* get record locking information */ 173#define F_GETLK 7 /* get record locking information */
173#define F_SETLK 8 /* set record locking information */ 174#define F_SETLK 8 /* set record locking information */
174#define F_SETLKW 9 /* F_SETLK; wait if blocked */ 175#define F_SETLKW 9 /* F_SETLK; wait if blocked */
175#if defined(_NETBSD_SOURCE) 176#if defined(_NETBSD_SOURCE)
176#define F_CLOSEM 10 /* close all fds >= to the one given */ 177#define F_CLOSEM 10 /* close all fds >= to the one given */
177#define F_MAXFD 11 /* return the max open fd */ 178#define F_MAXFD 11 /* return the max open fd */
178#endif 179#endif
179 180
180/* file descriptor flags (F_GETFD, F_SETFD) */ 181/* file descriptor flags (F_GETFD, F_SETFD) */
181#define FD_CLOEXEC 1 /* close-on-exec flag */ 182#define FD_CLOEXEC 1 /* close-on-exec flag */
182 183
183/* record locking flags (F_GETLK, F_SETLK, F_SETLKW) */ 184/* record locking flags (F_GETLK, F_SETLK, F_SETLKW) */
184#define F_RDLCK 1 /* shared or read lock */ 185#define F_RDLCK 1 /* shared or read lock */
185#define F_UNLCK 2 /* unlock */ 186#define F_UNLCK 2 /* unlock */
186#define F_WRLCK 3 /* exclusive or write lock */ 187#define F_WRLCK 3 /* exclusive or write lock */
187#ifdef _KERNEL 188#ifdef _KERNEL
188#define F_WAIT 0x010 /* Wait until lock is granted */ 189#define F_WAIT 0x010 /* Wait until lock is granted */
189#define F_FLOCK 0x020 /* Use flock(2) semantics for lock */ 190#define F_FLOCK 0x020 /* Use flock(2) semantics for lock */
190#define F_POSIX 0x040 /* Use POSIX semantics for lock */ 191#define F_POSIX 0x040 /* Use POSIX semantics for lock */
191#endif 192#endif
192 193
193/* Constants for fcntl's passed to the underlying fs - like ioctl's. */ 194/* Constants for fcntl's passed to the underlying fs - like ioctl's. */
194#if defined(_NETBSD_SOURCE) 195#if defined(_NETBSD_SOURCE)
195#define F_PARAM_MASK 0xfff 196#define F_PARAM_MASK 0xfff
196#define F_PARAM_LEN(x) (((x) >> 16) & F_PARAM_MASK) 197#define F_PARAM_LEN(x) (((x) >> 16) & F_PARAM_MASK)
197#define F_PARAM_MAX 4095 198#define F_PARAM_MAX 4095
198#define F_FSCTL (int)0x80000000 /* This fcntl goes to the fs */ 199#define F_FSCTL (int)0x80000000 /* This fcntl goes to the fs */
199#define F_FSVOID (int)0x40000000 /* no parameters */ 200#define F_FSVOID (int)0x40000000 /* no parameters */
200#define F_FSOUT (int)0x20000000 /* copy out parameter */ 201#define F_FSOUT (int)0x20000000 /* copy out parameter */
201#define F_FSIN (int)0x10000000 /* copy in parameter */ 202#define F_FSIN (int)0x10000000 /* copy in parameter */
202#define F_FSINOUT (F_FSIN | F_FSOUT) 203#define F_FSINOUT (F_FSIN | F_FSOUT)
203#define F_FSDIRMASK (int)0x70000000 /* mask for IN/OUT/VOID */ 204#define F_FSDIRMASK (int)0x70000000 /* mask for IN/OUT/VOID */
204#define F_FSPRIV (int)0x00008000 /* command is fs-specific */ 205#define F_FSPRIV (int)0x00008000 /* command is fs-specific */
205 206
206/* 207/*
207 * Define command macros for operations which, if implemented, must be 208 * Define command macros for operations which, if implemented, must be
208 * the same for all fs's. 209 * the same for all fs's.
209 */ 210 */
210#define _FCN(inout, num, len) \ 211#define _FCN(inout, num, len) \
211 (F_FSCTL | inout | ((len & F_PARAM_MASK) << 16) | (num)) 212 (F_FSCTL | inout | ((len & F_PARAM_MASK) << 16) | (num))
212#define _FCNO(c) _FCN(F_FSVOID, (c), 0) 213#define _FCNO(c) _FCN(F_FSVOID, (c), 0)
213#define _FCNR(c, t) _FCN(F_FSIN, (c), (int)sizeof(t)) 214#define _FCNR(c, t) _FCN(F_FSIN, (c), (int)sizeof(t))
214#define _FCNW(c, t) _FCN(F_FSOUT, (c), (int)sizeof(t)) 215#define _FCNW(c, t) _FCN(F_FSOUT, (c), (int)sizeof(t))
215#define _FCNRW(c, t) _FCN(F_FSINOUT, (c), (int)sizeof(t)) 216#define _FCNRW(c, t) _FCN(F_FSINOUT, (c), (int)sizeof(t))
216 217
217/* 218/*
218 * Define command macros for fs-specific commands. 219 * Define command macros for fs-specific commands.
219 */ 220 */
220#define _FCN_FSPRIV(inout, fs, num, len) \ 221#define _FCN_FSPRIV(inout, fs, num, len) \
221 (F_FSCTL | F_FSPRIV | inout | ((len & F_PARAM_MASK) << 16) | \ 222 (F_FSCTL | F_FSPRIV | inout | ((len & F_PARAM_MASK) << 16) | \
222 (fs) << 8 | (num)) 223 (fs) << 8 | (num))
223#define _FCNO_FSPRIV(f, c) _FCN_FSPRIV(F_FSVOID, (f), (c), 0) 224#define _FCNO_FSPRIV(f, c) _FCN_FSPRIV(F_FSVOID, (f), (c), 0)
224#define _FCNR_FSPRIV(f, c, t) _FCN_FSPRIV(F_FSIN, (f), (c), (int)sizeof(t)) 225#define _FCNR_FSPRIV(f, c, t) _FCN_FSPRIV(F_FSIN, (f), (c), (int)sizeof(t))
225#define _FCNW_FSPRIV(f, c, t) _FCN_FSPRIV(F_FSOUT, (f), (c), (int)sizeof(t)) 226#define _FCNW_FSPRIV(f, c, t) _FCN_FSPRIV(F_FSOUT, (f), (c), (int)sizeof(t))
226#define _FCNRW_FSPRIV(f, c, t) _FCN_FSPRIV(F_FSINOUT, (f), (c), (int)sizeof(t)) 227#define _FCNRW_FSPRIV(f, c, t) _FCN_FSPRIV(F_FSINOUT, (f), (c), (int)sizeof(t))
227 228
228#endif /* _NETBSD_SOURCE */ 229#endif /* _NETBSD_SOURCE */
229 230
230/* 231/*
231 * Advisory file segment locking data type - 232 * Advisory file segment locking data type -
232 * information passed to system by user 233 * information passed to system by user
233 */ 234 */
234struct flock { 235struct flock {
235 off_t l_start; /* starting offset */ 236 off_t l_start; /* starting offset */
236 off_t l_len; /* len = 0 means until end of file */ 237 off_t l_len; /* len = 0 means until end of file */
237 pid_t l_pid; /* lock owner */ 238 pid_t l_pid; /* lock owner */
238 short l_type; /* lock type: read/write, etc. */ 239 short l_type; /* lock type: read/write, etc. */
239 short l_whence; /* type of l_start */ 240 short l_whence; /* type of l_start */
240}; 241};
241 242
242 243
243#if defined(_NETBSD_SOURCE) 244#if defined(_NETBSD_SOURCE)
244/* lock operations for flock(2) */ 245/* lock operations for flock(2) */
245#define LOCK_SH 0x01 /* shared file lock */ 246#define LOCK_SH 0x01 /* shared file lock */
246#define LOCK_EX 0x02 /* exclusive file lock */ 247#define LOCK_EX 0x02 /* exclusive file lock */
247#define LOCK_NB 0x04 /* don't block when locking */ 248#define LOCK_NB 0x04 /* don't block when locking */
248#define LOCK_UN 0x08 /* unlock file */ 249#define LOCK_UN 0x08 /* unlock file */
249#endif 250#endif
250 251
251/* Always ensure that these are consistent with <stdio.h> and <unistd.h>! */ 252/* Always ensure that these are consistent with <stdio.h> and <unistd.h>! */
252#ifndef SEEK_SET 253#ifndef SEEK_SET
253#define SEEK_SET 0 /* set file offset to offset */ 254#define SEEK_SET 0 /* set file offset to offset */
254#endif 255#endif
255#ifndef SEEK_CUR 256#ifndef SEEK_CUR
256#define SEEK_CUR 1 /* set file offset to current plus offset */ 257#define SEEK_CUR 1 /* set file offset to current plus offset */
257#endif 258#endif
258#ifndef SEEK_END 259#ifndef SEEK_END
259#define SEEK_END 2 /* set file offset to EOF plus offset */ 260#define SEEK_END 2 /* set file offset to EOF plus offset */
260#endif 261#endif
261 262
262/* 263/*
263 * posix_fadvise advisories. 264 * posix_fadvise advisories.
264 */ 265 */
265 266
266#define POSIX_FADV_NORMAL 0 /* default advice / no advice */ 267#define POSIX_FADV_NORMAL 0 /* default advice / no advice */
267#define POSIX_FADV_RANDOM 1 /* random access */ 268#define POSIX_FADV_RANDOM 1 /* random access */
268#define POSIX_FADV_SEQUENTIAL 2 /* sequential access(lower to higher) */ 269#define POSIX_FADV_SEQUENTIAL 2 /* sequential access(lower to higher) */
269#define POSIX_FADV_WILLNEED 3 /* be needed in near future */ 270#define POSIX_FADV_WILLNEED 3 /* be needed in near future */
270#define POSIX_FADV_DONTNEED 4 /* not be needed in near future */ 271#define POSIX_FADV_DONTNEED 4 /* not be needed in near future */
271#define POSIX_FADV_NOREUSE 5 /* be accessed once */ 272#define POSIX_FADV_NOREUSE 5 /* be accessed once */
272 273
273#ifndef _KERNEL 274#ifndef _KERNEL
274#include <sys/cdefs.h> 275#include <sys/cdefs.h>
275 276
276__BEGIN_DECLS 277__BEGIN_DECLS
277int open(const char *, int, ...); 278int open(const char *, int, ...);
278int creat(const char *, mode_t); 279int creat(const char *, mode_t);
279int fcntl(int, int, ...); 280int fcntl(int, int, ...);
280#if defined(_NETBSD_SOURCE) 281#if defined(_NETBSD_SOURCE)
281int flock(int, int); 282int flock(int, int);
282#endif /* _NETBSD_SOURCE */ 283#endif /* _NETBSD_SOURCE */
283int posix_fadvise(int, off_t, off_t, int); 284int posix_fadvise(int, off_t, off_t, int);
284__END_DECLS 285__END_DECLS
285#endif /* !_KERNEL */ 286#endif /* !_KERNEL */
286 287
287#endif /* !_SYS_FCNTL_H_ */ 288#endif /* !_SYS_FCNTL_H_ */

cvs diff -r1.65 -r1.65.6.1 src/sys/sys/file.h (switch to unified diff)

--- src/sys/sys/file.h 2008/06/24 10:26:27 1.65
+++ src/sys/sys/file.h 2009/03/18 05:33:23 1.65.6.1
@@ -1,134 +1,168 @@ @@ -1,134 +1,168 @@
1/* $NetBSD: file.h,v 1.65 2008/06/24 10:26:27 gmcgarry Exp $ */ 1/* $NetBSD: file.h,v 1.65.6.1 2009/03/18 05:33:23 snj Exp $ */
 2
 3/*-
 4 * Copyright (c) 2009 The NetBSD Foundation, Inc.
 5 * All rights reserved.
 6 *
 7 * This code is derived from software contributed to The NetBSD Foundation
 8 * by Andrew Doran.
 9 *
 10 * Redistribution and use in source and binary forms, with or without
 11 * modification, are permitted provided that the following conditions
 12 * are met:
 13 * 1. Redistributions of source code must retain the above copyright
 14 * notice, this list of conditions and the following disclaimer.
 15 * 2. Redistributions in binary form must reproduce the above copyright
 16 * notice, this list of conditions and the following disclaimer in the
 17 * documentation and/or other materials provided with the distribution.
 18 *
 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 29 * POSSIBILITY OF SUCH DAMAGE.
 30 */
2 31
3/* 32/*
4 * Copyright (c) 1982, 1986, 1989, 1993 33 * Copyright (c) 1982, 1986, 1989, 1993
5 * The Regents of the University of California. All rights reserved. 34 * The Regents of the University of California. All rights reserved.
6 * 35 *
7 * Redistribution and use in source and binary forms, with or without 36 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 37 * modification, are permitted provided that the following conditions
9 * are met: 38 * are met:
10 * 1. Redistributions of source code must retain the above copyright 39 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 40 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 41 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 42 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 43 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors 44 * 3. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software 45 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission. 46 * without specific prior written permission.
18 * 47 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE. 58 * SUCH DAMAGE.
30 * 59 *
31 * @(#)file.h 8.3 (Berkeley) 1/9/95 60 * @(#)file.h 8.3 (Berkeley) 1/9/95
32 */ 61 */
33 62
34#ifndef _SYS_FILE_H_ 63#ifndef _SYS_FILE_H_
35#define _SYS_FILE_H_ 64#define _SYS_FILE_H_
36 65
37#include <sys/fcntl.h> 66#include <sys/fcntl.h>
38#include <sys/unistd.h> 67#include <sys/unistd.h>
39 68
40#ifdef _KERNEL 69#ifdef _KERNEL
41#include <sys/mallocvar.h> 70#include <sys/mallocvar.h>
42#include <sys/queue.h> 71#include <sys/queue.h>
43#include <sys/mutex.h> 72#include <sys/mutex.h>
44#include <sys/condvar.h> 73#include <sys/condvar.h>
45 74
46struct proc; 75struct proc;
47struct lwp; 76struct lwp;
48struct uio; 77struct uio;
49struct iovec; 78struct iovec;
50struct stat; 79struct stat;
51struct knote; 80struct knote;
52 81
53/* 82/*
54 * Kernel file descriptor. One entry for each open kernel vnode and 83 * Kernel file descriptor. One entry for each open kernel vnode and
55 * socket. 84 * socket.
 85 *
 86 * This structure is exported via the KERN_FILE and KERN_FILE2 sysctl
 87 * calls. Only add members to the end, do not delete them.
56 */ 88 */
57struct file { 89struct file {
58 off_t f_offset; /* first, is 64-bit */ 90 off_t f_offset; /* first, is 64-bit */
59 kauth_cred_t f_cred; /* creds associated with descriptor */ 91 kauth_cred_t f_cred; /* creds associated with descriptor */
60 const struct fileops { 92 const struct fileops {
61 int (*fo_read) (struct file *, off_t *, struct uio *, 93 int (*fo_read) (struct file *, off_t *, struct uio *,
62 kauth_cred_t, int); 94 kauth_cred_t, int);
63 int (*fo_write) (struct file *, off_t *, struct uio *, 95 int (*fo_write) (struct file *, off_t *, struct uio *,
64 kauth_cred_t, int); 96 kauth_cred_t, int);
65 int (*fo_ioctl) (struct file *, u_long, void *); 97 int (*fo_ioctl) (struct file *, u_long, void *);
66 int (*fo_fcntl) (struct file *, u_int, void *); 98 int (*fo_fcntl) (struct file *, u_int, void *);
67 int (*fo_poll) (struct file *, int); 99 int (*fo_poll) (struct file *, int);
68 int (*fo_stat) (struct file *, struct stat *); 100 int (*fo_stat) (struct file *, struct stat *);
69 int (*fo_close) (struct file *); 101 int (*fo_close) (struct file *);
70 int (*fo_kqfilter) (struct file *, struct knote *); 102 int (*fo_kqfilter) (struct file *, struct knote *);
71 } *f_ops; 103 } *f_ops;
72 void *f_data; /* descriptor data, e.g. vnode/socket */ 104 void *f_data; /* descriptor data, e.g. vnode/socket */
73 LIST_ENTRY(file) f_list; /* list of active files */ 105 LIST_ENTRY(file) f_list; /* list of active files */
74 kmutex_t f_lock; /* lock on structure */ 106 kmutex_t f_lock; /* lock on structure */
75 int f_flag; /* see fcntl.h */ 107 int f_flag; /* see fcntl.h */
76 u_int f_iflags; /* internal flags; FIF_* */ 108 u_int f_unused1; /* unused; was internal flags; FIF_* */
77#define DTYPE_VNODE 1 /* file */ 109#define DTYPE_VNODE 1 /* file */
78#define DTYPE_SOCKET 2 /* communications endpoint */ 110#define DTYPE_SOCKET 2 /* communications endpoint */
79#define DTYPE_PIPE 3 /* pipe */ 111#define DTYPE_PIPE 3 /* pipe */
80#define DTYPE_KQUEUE 4 /* event queue */ 112#define DTYPE_KQUEUE 4 /* event queue */
81#define DTYPE_MISC 5 /* misc file descriptor type */ 113#define DTYPE_MISC 5 /* misc file descriptor type */
82#define DTYPE_CRYPTO 6 /* crypto */ 114#define DTYPE_CRYPTO 6 /* crypto */
83#define DTYPE_MQUEUE 7 /* message queue */ 115#define DTYPE_MQUEUE 7 /* message queue */
84#define DTYPE_NAMES \ 116#define DTYPE_NAMES \
85 "0", "file", "socket", "pipe", "kqueue", "misc", "crypto", "mqueue" 117 "0", "file", "socket", "pipe", "kqueue", "misc", "crypto", "mqueue"
86 u_int f_type; /* descriptor type */ 118 u_int f_type; /* descriptor type */
87 u_int f_advice; /* access pattern hint; UVM_ADV_* */ 119 u_int f_advice; /* access pattern hint; UVM_ADV_* */
88 u_int f_count; /* reference count */ 120 u_int f_count; /* reference count */
89 u_int f_msgcount; /* references from message queue */ 121 u_int f_msgcount; /* references from message queue */
 122 u_int f_unpcount; /* deferred close: see uipc_usrreq.c */
 123 SLIST_ENTRY(file) f_unplist; /* deferred close: see uipc_usrreq.c */
90}; 124};
91 125
92#define FILE_LOCK(fp) mutex_enter(&(fp)->f_lock) 126#define FILE_LOCK(fp) mutex_enter(&(fp)->f_lock)
93#define FILE_UNLOCK(fp) mutex_exit(&(fp)->f_lock) 127#define FILE_UNLOCK(fp) mutex_exit(&(fp)->f_lock)
94 128
95/* 129/*
96 * Flags for fo_read and fo_write and do_fileread/write/v 130 * Flags for fo_read and fo_write and do_fileread/write/v
97 */ 131 */
98#define FOF_UPDATE_OFFSET 0x0001 /* update the file offset */ 132#define FOF_UPDATE_OFFSET 0x0001 /* update the file offset */
99#define FOF_IOV_SYSSPACE 0x0100 /* iov structure in kernel memory */ 133#define FOF_IOV_SYSSPACE 0x0100 /* iov structure in kernel memory */
100 134
101LIST_HEAD(filelist, file); 135LIST_HEAD(filelist, file);
102extern struct filelist filehead; /* head of list of open files */ 136extern struct filelist filehead; /* head of list of open files */
103extern u_int maxfiles; /* kernel limit on # of open files */ 137extern u_int maxfiles; /* kernel limit on # of open files */
104extern u_int nfiles; /* actual number of open files */ 138extern u_int nfiles; /* actual number of open files */
105 139
106extern const struct fileops vnops; /* vnode operations for files */ 140extern const struct fileops vnops; /* vnode operations for files */
107 141
108int dofileread(int, struct file *, void *, size_t, 142int dofileread(int, struct file *, void *, size_t,
109 off_t *, int, register_t *); 143 off_t *, int, register_t *);
110int dofilewrite(int, struct file *, const void *, 144int dofilewrite(int, struct file *, const void *,
111 size_t, off_t *, int, register_t *); 145 size_t, off_t *, int, register_t *);
112 146
113int do_filereadv(int, const struct iovec *, int, off_t *, 147int do_filereadv(int, const struct iovec *, int, off_t *,
114 int, register_t *); 148 int, register_t *);
115int do_filewritev(int, const struct iovec *, int, off_t *, 149int do_filewritev(int, const struct iovec *, int, off_t *,
116 int, register_t *); 150 int, register_t *);
117 151
118int fsetown(pid_t *, u_long, const void *); 152int fsetown(pid_t *, u_long, const void *);
119int fgetown(pid_t, u_long, void *); 153int fgetown(pid_t, u_long, void *);
120void fownsignal(pid_t, int, int, int, void *); 154void fownsignal(pid_t, int, int, int, void *);
121 155
122/* Commonly used fileops */ 156/* Commonly used fileops */
123int fnullop_fcntl(struct file *, u_int, void *); 157int fnullop_fcntl(struct file *, u_int, void *);
124int fnullop_poll(struct file *, int); 158int fnullop_poll(struct file *, int);
125int fnullop_kqfilter(struct file *, struct knote *); 159int fnullop_kqfilter(struct file *, struct knote *);
126int fbadop_read(struct file *, off_t *, struct uio *, kauth_cred_t, int); 160int fbadop_read(struct file *, off_t *, struct uio *, kauth_cred_t, int);
127int fbadop_write(struct file *, off_t *, struct uio *, kauth_cred_t, int); 161int fbadop_write(struct file *, off_t *, struct uio *, kauth_cred_t, int);
128int fbadop_ioctl(struct file *, u_long, void *); 162int fbadop_ioctl(struct file *, u_long, void *);
129int fbadop_close(struct file *); 163int fbadop_close(struct file *);
130int fbadop_stat(struct file *, struct stat *); 164int fbadop_stat(struct file *, struct stat *);
131 165
132#endif /* _KERNEL */ 166#endif /* _KERNEL */
133 167
134#endif /* _SYS_FILE_H_ */ 168#endif /* _SYS_FILE_H_ */

cvs diff -r1.330.4.3 -r1.330.4.4 src/sys/sys/param.h (switch to unified diff)

--- src/sys/sys/param.h 2009/02/09 00:22:09 1.330.4.3
+++ src/sys/sys/param.h 2009/03/18 05:33:23 1.330.4.4
@@ -1,432 +1,432 @@ @@ -1,432 +1,432 @@
1/* $NetBSD: param.h,v 1.330.4.3 2009/02/09 00:22:09 snj Exp $ */ 1/* $NetBSD: param.h,v 1.330.4.4 2009/03/18 05:33:23 snj Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 1982, 1986, 1989, 1993 4 * Copyright (c) 1982, 1986, 1989, 1993
5 * The Regents of the University of California. All rights reserved. 5 * The Regents of the University of California. All rights reserved.
6 * (c) UNIX System Laboratories, Inc. 6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed 7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph 8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc. 10 * the permission of UNIX System Laboratories, Inc.
11 * 11 *
12 * Redistribution and use in source and binary forms, with or without 12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions 13 * modification, are permitted provided that the following conditions
14 * are met: 14 * are met:
15 * 1. Redistributions of source code must retain the above copyright 15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer. 16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright 17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the 18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution. 19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors 20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software 21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission. 22 * without specific prior written permission.
23 * 23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE. 34 * SUCH DAMAGE.
35 * 35 *
36 * @(#)param.h 8.3 (Berkeley) 4/4/95 36 * @(#)param.h 8.3 (Berkeley) 4/4/95
37 */ 37 */
38 38
39#ifndef _SYS_PARAM_H_ 39#ifndef _SYS_PARAM_H_
40#define _SYS_PARAM_H_ 40#define _SYS_PARAM_H_
41 41
42/* 42/*
43 * Historic BSD #defines -- probably will remain untouched for all time. 43 * Historic BSD #defines -- probably will remain untouched for all time.
44 */ 44 */
45#define BSD 199506 /* System version (year & month). */ 45#define BSD 199506 /* System version (year & month). */
46#define BSD4_3 1 46#define BSD4_3 1
47#define BSD4_4 1 47#define BSD4_4 1
48 48
49/* 49/*
50 * #define __NetBSD_Version__ MMmmrrpp00 50 * #define __NetBSD_Version__ MMmmrrpp00
51 * 51 *
52 * M = major version 52 * M = major version
53 * m = minor version; a minor number of 99 indicates current. 53 * m = minor version; a minor number of 99 indicates current.
54 * r = 0 (*) 54 * r = 0 (*)
55 * p = patchlevel 55 * p = patchlevel
56 * 56 *
57 * When new releases are made, src/gnu/usr.bin/groff/tmac/mdoc.local 57 * When new releases are made, src/gnu/usr.bin/groff/tmac/mdoc.local
58 * needs to be updated and the changes sent back to the groff maintainers. 58 * needs to be updated and the changes sent back to the groff maintainers.
59 * 59 *
60 * (*) Up to 2.0I "release" used to be "",A-Z,Z[A-Z] but numeric 60 * (*) Up to 2.0I "release" used to be "",A-Z,Z[A-Z] but numeric
61 * e.g. NetBSD-1.2D = 102040000 ('D' == 4) 61 * e.g. NetBSD-1.2D = 102040000 ('D' == 4)
62 * NetBSD-2.0H (200080000) was changed on 20041001 to: 62 * NetBSD-2.0H (200080000) was changed on 20041001 to:
63 * 2.99.9 (299000900) 63 * 2.99.9 (299000900)
64 */ 64 */
65 65
66#define __NetBSD_Version__ 500000000 /* NetBSD 5.0_RC2 */ 66#define __NetBSD_Version__ 500000001 /* NetBSD 5.0_RC2 */
67 67
68#define __NetBSD_Prereq__(M,m,p) (((((M) * 100000000) + \ 68#define __NetBSD_Prereq__(M,m,p) (((((M) * 100000000) + \
69 (m) * 1000000) + (p) * 100) <= __NetBSD_Version__) 69 (m) * 1000000) + (p) * 100) <= __NetBSD_Version__)
70 70
71/* 71/*
72 * Historical NetBSD #define 72 * Historical NetBSD #define
73 * 73 *
74 * NetBSD 1.4 was the last release for which this value was incremented. 74 * NetBSD 1.4 was the last release for which this value was incremented.
75 * The value is now permanently fixed at 199905. It will never be 75 * The value is now permanently fixed at 199905. It will never be
76 * changed again. 76 * changed again.
77 * 77 *
78 * New code must use __NetBSD_Version__ instead, and should not even 78 * New code must use __NetBSD_Version__ instead, and should not even
79 * count on NetBSD being defined. 79 * count on NetBSD being defined.
80 * 80 *
81 */ 81 */
82 82
83#define NetBSD 199905 /* NetBSD version (year & month). */ 83#define NetBSD 199905 /* NetBSD version (year & month). */
84 84
85#include <sys/null.h> 85#include <sys/null.h>
86 86
87#ifndef _LOCORE 87#ifndef _LOCORE
88#include <sys/inttypes.h> 88#include <sys/inttypes.h>
89#include <sys/types.h> 89#include <sys/types.h>
90#endif 90#endif
91 91
92/* 92/*
93 * Machine-independent constants (some used in following include files). 93 * Machine-independent constants (some used in following include files).
94 * Redefined constants are from POSIX 1003.1 limits file. 94 * Redefined constants are from POSIX 1003.1 limits file.
95 * 95 *
96 * MAXCOMLEN should be >= sizeof(ac_comm) (see <acct.h>) 96 * MAXCOMLEN should be >= sizeof(ac_comm) (see <acct.h>)
97 * MAXHOSTNAMELEN should be >= (_POSIX_HOST_NAME_MAX + 1) (see <limits.h>) 97 * MAXHOSTNAMELEN should be >= (_POSIX_HOST_NAME_MAX + 1) (see <limits.h>)
98 * MAXLOGNAME should be >= UT_NAMESIZE (see <utmp.h>) 98 * MAXLOGNAME should be >= UT_NAMESIZE (see <utmp.h>)
99 */ 99 */
100#include <sys/syslimits.h> 100#include <sys/syslimits.h>
101 101
102#define MAXCOMLEN 16 /* max command name remembered */ 102#define MAXCOMLEN 16 /* max command name remembered */
103#define MAXINTERP PATH_MAX /* max interpreter file name length */ 103#define MAXINTERP PATH_MAX /* max interpreter file name length */
104/* DEPRECATED: use LOGIN_NAME_MAX instead. */ 104/* DEPRECATED: use LOGIN_NAME_MAX instead. */
105#define MAXLOGNAME (LOGIN_NAME_MAX - 1) /* max login name length */ 105#define MAXLOGNAME (LOGIN_NAME_MAX - 1) /* max login name length */
106#define NCARGS ARG_MAX /* max bytes for an exec function */ 106#define NCARGS ARG_MAX /* max bytes for an exec function */
107#define NGROUPS NGROUPS_MAX /* max number groups */ 107#define NGROUPS NGROUPS_MAX /* max number groups */
108#define NOGROUP 65535 /* marker for empty group set member */ 108#define NOGROUP 65535 /* marker for empty group set member */
109#define MAXHOSTNAMELEN 256 /* max hostname size */ 109#define MAXHOSTNAMELEN 256 /* max hostname size */
110 110
111#ifndef NOFILE 111#ifndef NOFILE
112#define NOFILE OPEN_MAX /* max open files per process */ 112#define NOFILE OPEN_MAX /* max open files per process */
113#endif 113#endif
114#ifndef MAXUPRC /* max simultaneous processes */ 114#ifndef MAXUPRC /* max simultaneous processes */
115#define MAXUPRC CHILD_MAX /* POSIX 1003.1-compliant default */ 115#define MAXUPRC CHILD_MAX /* POSIX 1003.1-compliant default */
116#else 116#else
117#if (MAXUPRC - 0) < CHILD_MAX 117#if (MAXUPRC - 0) < CHILD_MAX
118#error MAXUPRC less than CHILD_MAX. See options(4) for details. 118#error MAXUPRC less than CHILD_MAX. See options(4) for details.
119#endif /* (MAXUPRC - 0) < CHILD_MAX */ 119#endif /* (MAXUPRC - 0) < CHILD_MAX */
120#endif /* !defined(MAXUPRC) */ 120#endif /* !defined(MAXUPRC) */
121 121
122/* More types and definitions used throughout the kernel. */ 122/* More types and definitions used throughout the kernel. */
123#ifdef _KERNEL 123#ifdef _KERNEL
124#include <sys/cdefs.h> 124#include <sys/cdefs.h>
125#include <sys/errno.h> 125#include <sys/errno.h>
126#include <sys/time.h> 126#include <sys/time.h>
127#include <sys/resource.h> 127#include <sys/resource.h>
128#include <sys/ucred.h> 128#include <sys/ucred.h>
129#include <sys/uio.h> 129#include <sys/uio.h>
130#ifndef NPROC 130#ifndef NPROC
131#define NPROC (20 + 16 * MAXUSERS) 131#define NPROC (20 + 16 * MAXUSERS)
132#endif 132#endif
133#ifndef NTEXT 133#ifndef NTEXT
134#define NTEXT (80 + NPROC / 8) /* actually the object cache */ 134#define NTEXT (80 + NPROC / 8) /* actually the object cache */
135#endif 135#endif
136#ifndef NVNODE 136#ifndef NVNODE
137#define NVNODE (NPROC + NTEXT + 100) 137#define NVNODE (NPROC + NTEXT + 100)
138#define NVNODE_IMPLICIT 138#define NVNODE_IMPLICIT
139#endif 139#endif
140#ifndef VNODE_VA_MAXPCT 140#ifndef VNODE_VA_MAXPCT
141#define VNODE_VA_MAXPCT 20 141#define VNODE_VA_MAXPCT 20
142#endif 142#endif
143#ifndef BUFCACHE_VA_MAXPCT 143#ifndef BUFCACHE_VA_MAXPCT
144#define BUFCACHE_VA_MAXPCT 20 144#define BUFCACHE_VA_MAXPCT 20
145#endif 145#endif
146#define VNODE_COST 2048 /* assumed space in bytes */ 146#define VNODE_COST 2048 /* assumed space in bytes */
147#endif /* _KERNEL */ 147#endif /* _KERNEL */
148 148
149/* Signals. */ 149/* Signals. */
150#include <sys/signal.h> 150#include <sys/signal.h>
151 151
152/* Machine type dependent parameters. */ 152/* Machine type dependent parameters. */
153#include <machine/param.h> 153#include <machine/param.h>
154#include <machine/limits.h> 154#include <machine/limits.h>
155 155
156/* pages ("clicks") to disk blocks */ 156/* pages ("clicks") to disk blocks */
157#define ctod(x) ((x) << (PGSHIFT - DEV_BSHIFT)) 157#define ctod(x) ((x) << (PGSHIFT - DEV_BSHIFT))
158#define dtoc(x) ((x) >> (PGSHIFT - DEV_BSHIFT)) 158#define dtoc(x) ((x) >> (PGSHIFT - DEV_BSHIFT))
159 159
160/* bytes to pages */ 160/* bytes to pages */
161#define ctob(x) ((x) << PGSHIFT) 161#define ctob(x) ((x) << PGSHIFT)
162#define btoc(x) (((x) + PGOFSET) >> PGSHIFT) 162#define btoc(x) (((x) + PGOFSET) >> PGSHIFT)
163 163
164/* bytes to disk blocks */ 164/* bytes to disk blocks */
165#define dbtob(x) ((x) << DEV_BSHIFT) 165#define dbtob(x) ((x) << DEV_BSHIFT)
166#define btodb(x) ((x) >> DEV_BSHIFT) 166#define btodb(x) ((x) >> DEV_BSHIFT)
167 167
168#ifndef COHERENCY_UNIT 168#ifndef COHERENCY_UNIT
169#define COHERENCY_UNIT 64 169#define COHERENCY_UNIT 64
170#endif 170#endif
171#ifndef CACHE_LINE_SIZE 171#ifndef CACHE_LINE_SIZE
172#define CACHE_LINE_SIZE 64 172#define CACHE_LINE_SIZE 64
173#endif 173#endif
174#ifndef MAXCPUS 174#ifndef MAXCPUS
175#define MAXCPUS 32 175#define MAXCPUS 32
176#endif 176#endif
177#ifndef MAX_LWP_PER_PROC 177#ifndef MAX_LWP_PER_PROC
178#define MAX_LWP_PER_PROC 8000 178#define MAX_LWP_PER_PROC 8000
179#endif 179#endif
180 180
181/* 181/*
182 * Stack macros. On most architectures, the stack grows down, 182 * Stack macros. On most architectures, the stack grows down,
183 * towards lower addresses; it is the rare architecture where 183 * towards lower addresses; it is the rare architecture where
184 * it grows up, towards higher addresses. 184 * it grows up, towards higher addresses.
185 * 185 *
186 * STACK_GROW and STACK_SHRINK adjust a stack pointer by some 186 * STACK_GROW and STACK_SHRINK adjust a stack pointer by some
187 * size, no questions asked. STACK_ALIGN aligns a stack pointer. 187 * size, no questions asked. STACK_ALIGN aligns a stack pointer.
188 * 188 *
189 * STACK_ALLOC returns a pointer to allocated stack space of 189 * STACK_ALLOC returns a pointer to allocated stack space of
190 * some size; given such a pointer and a size, STACK_MAX gives 190 * some size; given such a pointer and a size, STACK_MAX gives
191 * the maximum (in the "maxsaddr" sense) stack address of the 191 * the maximum (in the "maxsaddr" sense) stack address of the
192 * allocated memory. 192 * allocated memory.
193 */ 193 */
194#if defined(_KERNEL) || defined(__EXPOSE_STACK) 194#if defined(_KERNEL) || defined(__EXPOSE_STACK)
195#ifdef __MACHINE_STACK_GROWS_UP 195#ifdef __MACHINE_STACK_GROWS_UP
196#define STACK_GROW(sp, _size) (((char *)(void *)(sp)) + (_size)) 196#define STACK_GROW(sp, _size) (((char *)(void *)(sp)) + (_size))
197#define STACK_SHRINK(sp, _size) (((char *)(void *)(sp)) - (_size)) 197#define STACK_SHRINK(sp, _size) (((char *)(void *)(sp)) - (_size))
198#define STACK_ALIGN(sp, bytes) \ 198#define STACK_ALIGN(sp, bytes) \
199 ((char *)((((unsigned long)(sp)) + (bytes)) & ~(bytes))) 199 ((char *)((((unsigned long)(sp)) + (bytes)) & ~(bytes)))
200#define STACK_ALLOC(sp, _size) ((char *)(void *)(sp)) 200#define STACK_ALLOC(sp, _size) ((char *)(void *)(sp))
201#define STACK_MAX(p, _size) (((char *)(void *)(p)) + (_size)) 201#define STACK_MAX(p, _size) (((char *)(void *)(p)) + (_size))
202#else 202#else
203#define STACK_GROW(sp, _size) (((char *)(void *)(sp)) - (_size)) 203#define STACK_GROW(sp, _size) (((char *)(void *)(sp)) - (_size))
204#define STACK_SHRINK(sp, _size) (((char *)(void *)(sp)) + (_size)) 204#define STACK_SHRINK(sp, _size) (((char *)(void *)(sp)) + (_size))
205#define STACK_ALIGN(sp, bytes) \ 205#define STACK_ALIGN(sp, bytes) \
206 ((char *)(((unsigned long)(sp)) & ~(bytes))) 206 ((char *)(((unsigned long)(sp)) & ~(bytes)))
207#define STACK_ALLOC(sp, _size) (((char *)(void *)(sp)) - (_size)) 207#define STACK_ALLOC(sp, _size) (((char *)(void *)(sp)) - (_size))
208#define STACK_MAX(p, _size) ((char *)(void *)(p)) 208#define STACK_MAX(p, _size) ((char *)(void *)(p))
209#endif 209#endif
210#endif /* defined(_KERNEL) || defined(__EXPOSE_STACK) */ 210#endif /* defined(_KERNEL) || defined(__EXPOSE_STACK) */
211 211
212/* 212/*
213 * Historic priority levels. These are meaningless and remain only 213 * Historic priority levels. These are meaningless and remain only
214 * for source compatibility. Do not use in new code. 214 * for source compatibility. Do not use in new code.
215 */ 215 */
216#define PSWP 0 216#define PSWP 0
217#define PVM 4 217#define PVM 4
218#define PINOD 8 218#define PINOD 8
219#define PRIBIO 16 219#define PRIBIO 16
220#define PVFS 20 220#define PVFS 20
221#define PZERO 22 221#define PZERO 22
222#define PSOCK 24 222#define PSOCK 24
223#define PWAIT 32 223#define PWAIT 32
224#define PLOCK 36 224#define PLOCK 36
225#define PPAUSE 40 225#define PPAUSE 40
226#define PUSER 50 226#define PUSER 50
227#define MAXPRI 127 227#define MAXPRI 127
228 228
229#define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ 229#define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */
230#define PNORELOCK 0x200 /* OR'd with pri for tsleep to not relock */ 230#define PNORELOCK 0x200 /* OR'd with pri for tsleep to not relock */
231 231
232/* 232/*
233 * New priority levels. 233 * New priority levels.
234 */ 234 */
235#define PRI_COUNT 224 235#define PRI_COUNT 224
236#define PRI_NONE (-1) 236#define PRI_NONE (-1)
237 237
238#define PRI_KERNEL_RT 192 238#define PRI_KERNEL_RT 192
239#define NPRI_KERNEL_RT 32 239#define NPRI_KERNEL_RT 32
240#define MAXPRI_KERNEL_RT (PRI_KERNEL_RT + NPRI_KERNEL_RT - 1) 240#define MAXPRI_KERNEL_RT (PRI_KERNEL_RT + NPRI_KERNEL_RT - 1)
241 241
242#define PRI_USER_RT 128 242#define PRI_USER_RT 128
243#define NPRI_USER_RT 64 243#define NPRI_USER_RT 64
244#define MAXPRI_USER_RT (PRI_USER_RT + NPRI_USER_RT - 1) 244#define MAXPRI_USER_RT (PRI_USER_RT + NPRI_USER_RT - 1)
245 245
246#define PRI_KTHREAD 96 246#define PRI_KTHREAD 96
247#define NPRI_KTHREAD 32 247#define NPRI_KTHREAD 32
248#define MAXPRI_KTHREAD (PRI_KTHREAD + NPRI_KTHREAD - 1) 248#define MAXPRI_KTHREAD (PRI_KTHREAD + NPRI_KTHREAD - 1)
249 249
250#define PRI_KERNEL 64 250#define PRI_KERNEL 64
251#define NPRI_KERNEL 32 251#define NPRI_KERNEL 32
252#define MAXPRI_KERNEL (PRI_KERNEL + NPRI_KERNEL - 1) 252#define MAXPRI_KERNEL (PRI_KERNEL + NPRI_KERNEL - 1)
253 253
254#define PRI_USER 0 254#define PRI_USER 0
255#define NPRI_USER 64 255#define NPRI_USER 64
256#define MAXPRI_USER (PRI_USER + NPRI_USER - 1) 256#define MAXPRI_USER (PRI_USER + NPRI_USER - 1)
257 257
258/* Priority range used by POSIX real-time features */ 258/* Priority range used by POSIX real-time features */
259#define SCHED_PRI_MIN 0 259#define SCHED_PRI_MIN 0
260#define SCHED_PRI_MAX 63 260#define SCHED_PRI_MAX 63
261 261
262/* 262/*
263 * Kernel thread priorities. 263 * Kernel thread priorities.
264 */ 264 */
265#define PRI_SOFTSERIAL MAXPRI_KERNEL_RT 265#define PRI_SOFTSERIAL MAXPRI_KERNEL_RT
266#define PRI_SOFTNET (MAXPRI_KERNEL_RT - schedppq * 1) 266#define PRI_SOFTNET (MAXPRI_KERNEL_RT - schedppq * 1)
267#define PRI_SOFTBIO (MAXPRI_KERNEL_RT - schedppq * 2) 267#define PRI_SOFTBIO (MAXPRI_KERNEL_RT - schedppq * 2)
268#define PRI_SOFTCLOCK (MAXPRI_KERNEL_RT - schedppq * 3) 268#define PRI_SOFTCLOCK (MAXPRI_KERNEL_RT - schedppq * 3)
269 269
270#define PRI_XCALL MAXPRI_KTHREAD 270#define PRI_XCALL MAXPRI_KTHREAD
271#define PRI_PGDAEMON (MAXPRI_KTHREAD - schedppq * 1) 271#define PRI_PGDAEMON (MAXPRI_KTHREAD - schedppq * 1)
272#define PRI_VM (MAXPRI_KTHREAD - schedppq * 2) 272#define PRI_VM (MAXPRI_KTHREAD - schedppq * 2)
273#define PRI_IOFLUSH (MAXPRI_KTHREAD - schedppq * 3) 273#define PRI_IOFLUSH (MAXPRI_KTHREAD - schedppq * 3)
274#define PRI_BIO (MAXPRI_KTHREAD - schedppq * 4) 274#define PRI_BIO (MAXPRI_KTHREAD - schedppq * 4)
275 275
276#define PRI_IDLE PRI_USER 276#define PRI_IDLE PRI_USER
277 277
278/* 278/*
279 * Miscellaneous. 279 * Miscellaneous.
280 */ 280 */
281#define NBPW sizeof(int) /* number of bytes per word (integer) */ 281#define NBPW sizeof(int) /* number of bytes per word (integer) */
282 282
283#define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ 283#define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */
284#define NODEV (dev_t)(-1) /* non-existent device */ 284#define NODEV (dev_t)(-1) /* non-existent device */
285 285
286#define CBLOCK 64 /* Clist block size, must be a power of 2. */ 286#define CBLOCK 64 /* Clist block size, must be a power of 2. */
287#define CBQSIZE (CBLOCK/NBBY) /* Quote bytes/cblock - can do better. */ 287#define CBQSIZE (CBLOCK/NBBY) /* Quote bytes/cblock - can do better. */
288 /* Data chars/clist. */ 288 /* Data chars/clist. */
289#define CBSIZE (CBLOCK - (int)sizeof(struct cblock *) - CBQSIZE) 289#define CBSIZE (CBLOCK - (int)sizeof(struct cblock *) - CBQSIZE)
290#define CROUND (CBLOCK - 1) /* Clist rounding. */ 290#define CROUND (CBLOCK - 1) /* Clist rounding. */
291 291
292/* 292/*
293 * File system parameters and macros. 293 * File system parameters and macros.
294 * 294 *
295 * The file system is made out of blocks of at most MAXBSIZE units, with 295 * The file system is made out of blocks of at most MAXBSIZE units, with
296 * smaller units (fragments) only in the last direct block. MAXBSIZE 296 * smaller units (fragments) only in the last direct block. MAXBSIZE
297 * primarily determines the size of buffers in the buffer pool. It may be 297 * primarily determines the size of buffers in the buffer pool. It may be
298 * made larger without any effect on existing file systems; however making 298 * made larger without any effect on existing file systems; however making
299 * it smaller may make some file systems unmountable. 299 * it smaller may make some file systems unmountable.
300 */ 300 */
301#ifndef MAXBSIZE /* XXX */ 301#ifndef MAXBSIZE /* XXX */
302#define MAXBSIZE MAXPHYS 302#define MAXBSIZE MAXPHYS
303#endif 303#endif
304#define MAXFRAG 8 304#define MAXFRAG 8
305 305
306/* 306/*
307 * MAXPATHLEN defines the longest permissible path length after expanding 307 * MAXPATHLEN defines the longest permissible path length after expanding
308 * symbolic links. It is used to allocate a temporary buffer from the buffer 308 * symbolic links. It is used to allocate a temporary buffer from the buffer
309 * pool in which to do the name expansion, hence should be a power of two, 309 * pool in which to do the name expansion, hence should be a power of two,
310 * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the 310 * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the
311 * maximum number of symbolic links that may be expanded in a path name. 311 * maximum number of symbolic links that may be expanded in a path name.
312 * It should be set high enough to allow all legitimate uses, but halt 312 * It should be set high enough to allow all legitimate uses, but halt
313 * infinite loops reasonably quickly. 313 * infinite loops reasonably quickly.
314 * 314 *
315 * MAXSYMLINKS should be >= _POSIX_SYMLOOP_MAX (see <limits.h>) 315 * MAXSYMLINKS should be >= _POSIX_SYMLOOP_MAX (see <limits.h>)
316 */ 316 */
317#define MAXPATHLEN PATH_MAX 317#define MAXPATHLEN PATH_MAX
318#define MAXSYMLINKS 32 318#define MAXSYMLINKS 32
319 319
320/* Bit map related macros. */ 320/* Bit map related macros. */
321#define setbit(a,i) ((a)[(i)/NBBY] |= 1<<((i)%NBBY)) 321#define setbit(a,i) ((a)[(i)/NBBY] |= 1<<((i)%NBBY))
322#define clrbit(a,i) ((a)[(i)/NBBY] &= ~(1<<((i)%NBBY))) 322#define clrbit(a,i) ((a)[(i)/NBBY] &= ~(1<<((i)%NBBY)))
323#define isset(a,i) ((a)[(i)/NBBY] & (1<<((i)%NBBY))) 323#define isset(a,i) ((a)[(i)/NBBY] & (1<<((i)%NBBY)))
324#define isclr(a,i) (((a)[(i)/NBBY] & (1<<((i)%NBBY))) == 0) 324#define isclr(a,i) (((a)[(i)/NBBY] & (1<<((i)%NBBY))) == 0)
325 325
326/* Macros for counting and rounding. */ 326/* Macros for counting and rounding. */
327#ifndef howmany 327#ifndef howmany
328#define howmany(x, y) (((x)+((y)-1))/(y)) 328#define howmany(x, y) (((x)+((y)-1))/(y))
329#endif 329#endif
330#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) 330#define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
331#define rounddown(x,y) (((x)/(y))*(y)) 331#define rounddown(x,y) (((x)/(y))*(y))
332#define roundup2(x, m) (((x) + m - 1) & ~(m - 1)) 332#define roundup2(x, m) (((x) + m - 1) & ~(m - 1))
333#define powerof2(x) ((((x)-1)&(x))==0) 333#define powerof2(x) ((((x)-1)&(x))==0)
334 334
335/* Macros for min/max. */ 335/* Macros for min/max. */
336#define MIN(a,b) (((a)<(b))?(a):(b)) 336#define MIN(a,b) (((a)<(b))?(a):(b))
337#define MAX(a,b) (((a)>(b))?(a):(b)) 337#define MAX(a,b) (((a)>(b))?(a):(b))
338 338
339/* 339/*
340 * Constants for setting the parameters of the kernel memory allocator. 340 * Constants for setting the parameters of the kernel memory allocator.
341 * 341 *
342 * 2 ** MINBUCKET is the smallest unit of memory that will be 342 * 2 ** MINBUCKET is the smallest unit of memory that will be
343 * allocated. It must be at least large enough to hold a pointer. 343 * allocated. It must be at least large enough to hold a pointer.
344 * 344 *
345 * Units of memory less than or equal to MAXALLOCSAVE will permanently 345 * Units of memory less than or equal to MAXALLOCSAVE will permanently
346 * allocate physical memory; requests for these size pieces of 346 * allocate physical memory; requests for these size pieces of
347 * memory are quite fast. Allocations greater than MAXALLOCSAVE must 347 * memory are quite fast. Allocations greater than MAXALLOCSAVE must
348 * always allocate and free physical memory; requests for these 348 * always allocate and free physical memory; requests for these
349 * size allocations should be done infrequently as they will be slow. 349 * size allocations should be done infrequently as they will be slow.
350 * 350 *
351 * Constraints: NBPG <= MAXALLOCSAVE <= 2 ** (MINBUCKET + 14), and 351 * Constraints: NBPG <= MAXALLOCSAVE <= 2 ** (MINBUCKET + 14), and
352 * MAXALLOCSAVE must be a power of two. 352 * MAXALLOCSAVE must be a power of two.
353 */ 353 */
354#ifdef _LP64 354#ifdef _LP64
355#define MINBUCKET 5 /* 5 => min allocation of 32 bytes */ 355#define MINBUCKET 5 /* 5 => min allocation of 32 bytes */
356#else 356#else
357#define MINBUCKET 4 /* 4 => min allocation of 16 bytes */ 357#define MINBUCKET 4 /* 4 => min allocation of 16 bytes */
358#endif 358#endif
359#define MAXALLOCSAVE (2 * NBPG) 359#define MAXALLOCSAVE (2 * NBPG)
360 360
361/* 361/*
362 * Scale factor for scaled integers used to count %cpu time and load avgs. 362 * Scale factor for scaled integers used to count %cpu time and load avgs.
363 * 363 *
364 * The number of CPU `tick's that map to a unique `%age' can be expressed 364 * The number of CPU `tick's that map to a unique `%age' can be expressed
365 * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that 365 * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that
366 * can be calculated (assuming 32 bits) can be closely approximated using 366 * can be calculated (assuming 32 bits) can be closely approximated using
367 * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). 367 * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15).
368 * 368 *
369 * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', 369 * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age',
370 * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. 370 * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024.
371 */ 371 */
372#define FSHIFT 11 /* bits to right of fixed binary point */ 372#define FSHIFT 11 /* bits to right of fixed binary point */
373#define FSCALE (1<<FSHIFT) 373#define FSCALE (1<<FSHIFT)
374 374
375/* 375/*
376 * The time for a process to be blocked before being very swappable. 376 * The time for a process to be blocked before being very swappable.
377 * This is a number of seconds which the system takes as being a non-trivial 377 * This is a number of seconds which the system takes as being a non-trivial
378 * amount of real time. You probably shouldn't change this; 378 * amount of real time. You probably shouldn't change this;
379 * it is used in subtle ways (fractions and multiples of it are, that is, like 379 * it is used in subtle ways (fractions and multiples of it are, that is, like
380 * half of a ``long time'', almost a long time, etc.) 380 * half of a ``long time'', almost a long time, etc.)
381 * It is related to human patience and other factors which don't really 381 * It is related to human patience and other factors which don't really
382 * change over time. 382 * change over time.
383 */ 383 */
384#define MAXSLP 20 384#define MAXSLP 20
385 385
386/* 386/*
387 * Defaults for Unified Buffer Cache parameters. 387 * Defaults for Unified Buffer Cache parameters.
388 * These may be overridden in <machine/param.h>. 388 * These may be overridden in <machine/param.h>.
389 */ 389 */
390 390
391#ifndef UBC_WINSHIFT 391#ifndef UBC_WINSHIFT
392#define UBC_WINSHIFT 13 392#define UBC_WINSHIFT 13
393#endif 393#endif
394#ifndef UBC_NWINS 394#ifndef UBC_NWINS
395#define UBC_NWINS 1024 395#define UBC_NWINS 1024
396#endif 396#endif
397 397
398#ifdef _KERNEL 398#ifdef _KERNEL
399/* 399/*
400 * macro to convert from milliseconds to hz without integer overflow 400 * macro to convert from milliseconds to hz without integer overflow
401 * Default version using only 32-bit arithmetic. 401 * Default version using only 32-bit arithmetic.
402 * 64-bit ports can define a 64-bit version in their <machine/param.h> 402 * 64-bit ports can define a 64-bit version in their <machine/param.h>
403 * 0x20000 is safe for hz < 20000 403 * 0x20000 is safe for hz < 20000
404 */ 404 */
405#ifndef mstohz 405#ifndef mstohz
406#define mstohz(ms) \ 406#define mstohz(ms) \
407 (__predict_false((ms) >= 0x20000) ? \ 407 (__predict_false((ms) >= 0x20000) ? \
408 ((ms +0u) / 1000u) * hz : \ 408 ((ms +0u) / 1000u) * hz : \
409 ((ms +0u) * hz) / 1000u) 409 ((ms +0u) * hz) / 1000u)
410#endif 410#endif
411#ifndef hztoms 411#ifndef hztoms
412#define hztoms(t) \ 412#define hztoms(t) \
413 (__predict_false((t) >= 0x20000) ? \ 413 (__predict_false((t) >= 0x20000) ? \
414 ((t +0u) / hz) * 1000u : \ 414 ((t +0u) / hz) * 1000u : \
415 ((t +0u) * 1000u) / hz) 415 ((t +0u) * 1000u) / hz)
416#endif 416#endif
417 417
418extern const int schedppq; 418extern const int schedppq;
419extern size_t coherency_unit; 419extern size_t coherency_unit;
420 420
421#endif /* _KERNEL */ 421#endif /* _KERNEL */
422 422
423/* 423/*
424 * Minimum alignment of "struct lwp" needed by the architecture. 424 * Minimum alignment of "struct lwp" needed by the architecture.
425 * This counts when packing a lock byte into a word alongside a 425 * This counts when packing a lock byte into a word alongside a
426 * pointer to an LWP. 426 * pointer to an LWP.
427 */ 427 */
428#ifndef MIN_LWP_ALIGNMENT 428#ifndef MIN_LWP_ALIGNMENT
429#define MIN_LWP_ALIGNMENT 32 429#define MIN_LWP_ALIGNMENT 32
430#endif 430#endif
431 431
432#endif /* !_SYS_PARAM_H_ */ 432#endif /* !_SYS_PARAM_H_ */

cvs diff -r1.44 -r1.44.4.1 src/sys/sys/un.h (switch to unified diff)

--- src/sys/sys/un.h 2008/08/06 15:01:24 1.44
+++ src/sys/sys/un.h 2009/03/18 05:33:23 1.44.4.1
@@ -1,113 +1,110 @@ @@ -1,113 +1,110 @@
1/* $NetBSD: un.h,v 1.44 2008/08/06 15:01:24 plunky Exp $ */ 1/* $NetBSD: un.h,v 1.44.4.1 2009/03/18 05:33:23 snj Exp $ */
2 2
3/* 3/*
4 * Copyright (c) 1982, 1986, 1993 4 * Copyright (c) 1982, 1986, 1993
5 * The Regents of the University of California. All rights reserved. 5 * The Regents of the University of California. All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors 15 * 3. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software 16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission. 17 * without specific prior written permission.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE. 29 * SUCH DAMAGE.
30 * 30 *
31 * @(#)un.h 8.3 (Berkeley) 2/19/95 31 * @(#)un.h 8.3 (Berkeley) 2/19/95
32 */ 32 */
33 33
34#ifndef _SYS_UN_H_ 34#ifndef _SYS_UN_H_
35#define _SYS_UN_H_ 35#define _SYS_UN_H_
36 36
37#include <sys/ansi.h> 37#include <sys/ansi.h>
38#include <sys/featuretest.h> 38#include <sys/featuretest.h>
39#include <sys/types.h> 39#include <sys/types.h>
40 40
41#ifndef sa_family_t 41#ifndef sa_family_t
42typedef __sa_family_t sa_family_t; 42typedef __sa_family_t sa_family_t;
43#define sa_family_t __sa_family_t 43#define sa_family_t __sa_family_t
44#endif 44#endif
45 45
46/* 46/*
47 * Definitions for UNIX IPC domain. 47 * Definitions for UNIX IPC domain.
48 */ 48 */
49struct sockaddr_un { 49struct sockaddr_un {
50 uint8_t sun_len; /* total sockaddr length */ 50 uint8_t sun_len; /* total sockaddr length */
51 sa_family_t sun_family; /* AF_LOCAL */ 51 sa_family_t sun_family; /* AF_LOCAL */
52 char sun_path[104]; /* path name (gag) */ 52 char sun_path[104]; /* path name (gag) */
53}; 53};
54 54
55/* 55/*
56 * Socket options for UNIX IPC domain. 56 * Socket options for UNIX IPC domain.
57 */ 57 */
58#if defined(_NETBSD_SOURCE) 58#if defined(_NETBSD_SOURCE)
59#define LOCAL_CREDS 0x0001 /* pass credentials to receiver */ 59#define LOCAL_CREDS 0x0001 /* pass credentials to receiver */
60#define LOCAL_CONNWAIT 0x0002 /* connects block until accepted */ 60#define LOCAL_CONNWAIT 0x0002 /* connects block until accepted */
61#define LOCAL_PEEREID 0x0003 /* get peer identification */ 61#define LOCAL_PEEREID 0x0003 /* get peer identification */
62#endif 62#endif
63 63
64/* 64/*
65 * Data automatically stored inside connect() for use by LOCAL_PEEREID 65 * Data automatically stored inside connect() for use by LOCAL_PEEREID
66 */ 66 */
67struct unpcbid { 67struct unpcbid {
68 pid_t unp_pid; /* process id */ 68 pid_t unp_pid; /* process id */
69 uid_t unp_euid; /* effective user id */ 69 uid_t unp_euid; /* effective user id */
70 gid_t unp_egid; /* effective group id */ 70 gid_t unp_egid; /* effective group id */
71}; 71};
72 72
73#ifdef _KERNEL 73#ifdef _KERNEL
74struct unpcb; 74struct unpcb;
75struct socket; 75struct socket;
76struct sockopt; 76struct sockopt;
77 77
78int uipc_usrreq(struct socket *, int, struct mbuf *, 78int uipc_usrreq(struct socket *, int, struct mbuf *,
79 struct mbuf *, struct mbuf *, struct lwp *); 79 struct mbuf *, struct mbuf *, struct lwp *);
80int uipc_ctloutput(int, struct socket *, struct sockopt *); 80int uipc_ctloutput(int, struct socket *, struct sockopt *);
81void uipc_init (void); 81void uipc_init (void);
82kmutex_t *uipc_dgramlock (void); 82kmutex_t *uipc_dgramlock (void);
83kmutex_t *uipc_streamlock (void); 83kmutex_t *uipc_streamlock (void);
84kmutex_t *uipc_rawlock (void); 84kmutex_t *uipc_rawlock (void);
85 85
86int unp_attach (struct socket *); 86int unp_attach (struct socket *);
87int unp_bind (struct socket *, struct mbuf *, struct lwp *); 87int unp_bind (struct socket *, struct mbuf *, struct lwp *);
88int unp_connect (struct socket *, struct mbuf *, struct lwp *); 88int unp_connect (struct socket *, struct mbuf *, struct lwp *);
89int unp_connect2 (struct socket *, struct socket *, int); 89int unp_connect2 (struct socket *, struct socket *, int);
90void unp_detach (struct unpcb *); 90void unp_detach (struct unpcb *);
91void unp_discard (struct file *); 91void unp_discard (struct file *);
92void unp_disconnect (struct unpcb *); 92void unp_disconnect (struct unpcb *);
93bool unp_drop (struct unpcb *, int); 93bool unp_drop (struct unpcb *, int);
94void unp_gc (void); 
95void unp_mark (struct file *); 
96void unp_scan (struct mbuf *, void (*)(struct file *), int); 
97void unp_shutdown (struct unpcb *); 94void unp_shutdown (struct unpcb *);
98int unp_externalize (struct mbuf *, struct lwp *); 95int unp_externalize (struct mbuf *, struct lwp *);
99int unp_internalize (struct mbuf **); 96int unp_internalize (struct mbuf **);
100void unp_dispose (struct mbuf *); 97void unp_dispose (struct mbuf *);
101int unp_output (struct mbuf *, struct mbuf *, struct unpcb *, 98int unp_output (struct mbuf *, struct mbuf *, struct unpcb *,
102 struct lwp *); 99 struct lwp *);
103void unp_setaddr (struct socket *, struct mbuf *, bool); 100void unp_setaddr (struct socket *, struct mbuf *, bool);
104#else /* !_KERNEL */ 101#else /* !_KERNEL */
105 102
106/* actual length of an initialized sockaddr_un */ 103/* actual length of an initialized sockaddr_un */
107#if defined(_NETBSD_SOURCE) 104#if defined(_NETBSD_SOURCE)
108#define SUN_LEN(su) \ 105#define SUN_LEN(su) \
109 (sizeof(*(su)) - sizeof((su)->sun_path) + strlen((su)->sun_path)) 106 (sizeof(*(su)) - sizeof((su)->sun_path) + strlen((su)->sun_path))
110#endif /* _NETBSD_SOURCE */ 107#endif /* _NETBSD_SOURCE */
111#endif /* _KERNEL */ 108#endif /* _KERNEL */
112 109
113#endif /* !_SYS_UN_H_ */ 110#endif /* !_SYS_UN_H_ */