Tue Oct 27 02:58:29 2009 UTC
- Amend fd_hold() to take an argument and add an assert (this reflects two
  cases: fork1(), and the rest, e.g. kthread_create(), when creating from lwp0).

- lwp_create(): do not touch filedesc internals; use fd_hold() instead (see the
  sketch below).


(rmind)
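
For readers of this log: the new fd_hold() body itself falls outside the excerpt
below (the kern_descrip.c diff is cut off before it reaches fd_hold()), so the
following is only an illustrative sketch of the interface change being
described. The lwp_t * argument type and the exact KASSERT() condition are
inferred from the log message above, not quoted from revision 1.200, and may
differ from the committed code.

/*
 * Sketch only -- not the literal committed code.  In the context of
 * kern_descrip.c, fd_hold() would take the LWP whose descriptor table
 * should gain a reference, instead of implicitly using curlwp.
 */
void
fd_hold(lwp_t *l)
{
	filedesc_t *fdp = l->l_fd;

	/*
	 * Per the log message, two cases reach here: fork1() (creating
	 * from the current LWP) and the rest, e.g. kthread_create(),
	 * which create from lwp0.  The condition below is a guess at an
	 * assert expressing that; r1.200 may word it differently.
	 */
	KASSERT(l == curlwp || l == &lwp0);

	atomic_inc_uint(&fdp->fd_refcnt);
}

With an interface of this shape, lwp_create() goes through fd_hold() rather
than adjusting fd_refcnt by hand, keeping filedesc_t internals private to
kern_descrip.c.
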
diff -r1.199 -r1.200 src/sys/kern/kern_descrip.c
diff -r1.135 -r1.136 src/sys/kern/kern_lwp.c
diff -r1.56 -r1.57 src/sys/sys/filedesc.h

cvs diff -r1.199 -r1.200 src/sys/kern/kern_descrip.c

--- src/sys/kern/kern_descrip.c 2009/08/16 11:00:20 1.199
+++ src/sys/kern/kern_descrip.c 2009/10/27 02:58:28 1.200
@@ -1,1828 +1,1830 @@
1/* $NetBSD: kern_descrip.c,v 1.199 2009/08/16 11:00:20 yamt Exp $ */ 1/* $NetBSD: kern_descrip.c,v 1.200 2009/10/27 02:58:28 rmind Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. 4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran. 8 * by Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright 15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the 16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution. 17 * documentation and/or other materials provided with the distribution.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32/* 32/*
33 * Copyright (c) 1982, 1986, 1989, 1991, 1993 33 * Copyright (c) 1982, 1986, 1989, 1991, 1993
34 * The Regents of the University of California. All rights reserved. 34 * The Regents of the University of California. All rights reserved.
35 * (c) UNIX System Laboratories, Inc. 35 * (c) UNIX System Laboratories, Inc.
36 * All or some portions of this file are derived from material licensed 36 * All or some portions of this file are derived from material licensed
37 * to the University of California by American Telephone and Telegraph 37 * to the University of California by American Telephone and Telegraph
38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39 * the permission of UNIX System Laboratories, Inc. 39 * the permission of UNIX System Laboratories, Inc.
40 * 40 *
41 * Redistribution and use in source and binary forms, with or without 41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions 42 * modification, are permitted provided that the following conditions
43 * are met: 43 * are met:
44 * 1. Redistributions of source code must retain the above copyright 44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer. 45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright 46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the 47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution. 48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors 49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software 50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission. 51 * without specific prior written permission.
52 * 52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE. 63 * SUCH DAMAGE.
64 * 64 *
65 * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95 65 * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
66 */ 66 */
67 67
68/* 68/*
69 * File descriptor management. 69 * File descriptor management.
70 */ 70 */
71 71
72#include <sys/cdefs.h> 72#include <sys/cdefs.h>
73__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.199 2009/08/16 11:00:20 yamt Exp $"); 73__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.200 2009/10/27 02:58:28 rmind Exp $");
74 74
75#include <sys/param.h> 75#include <sys/param.h>
76#include <sys/systm.h> 76#include <sys/systm.h>
77#include <sys/filedesc.h> 77#include <sys/filedesc.h>
78#include <sys/kernel.h> 78#include <sys/kernel.h>
79#include <sys/proc.h> 79#include <sys/proc.h>
80#include <sys/file.h> 80#include <sys/file.h>
81#include <sys/socket.h> 81#include <sys/socket.h>
82#include <sys/socketvar.h> 82#include <sys/socketvar.h>
83#include <sys/stat.h> 83#include <sys/stat.h>
84#include <sys/ioctl.h> 84#include <sys/ioctl.h>
85#include <sys/fcntl.h> 85#include <sys/fcntl.h>
86#include <sys/pool.h> 86#include <sys/pool.h>
87#include <sys/unistd.h> 87#include <sys/unistd.h>
88#include <sys/resourcevar.h> 88#include <sys/resourcevar.h>
89#include <sys/conf.h> 89#include <sys/conf.h>
90#include <sys/event.h> 90#include <sys/event.h>
91#include <sys/kauth.h> 91#include <sys/kauth.h>
92#include <sys/atomic.h> 92#include <sys/atomic.h>
93#include <sys/syscallargs.h> 93#include <sys/syscallargs.h>
94#include <sys/cpu.h> 94#include <sys/cpu.h>
95#include <sys/kmem.h> 95#include <sys/kmem.h>
96#include <sys/vnode.h> 96#include <sys/vnode.h>
97 97
98static int file_ctor(void *, void *, int); 98static int file_ctor(void *, void *, int);
99static void file_dtor(void *, void *); 99static void file_dtor(void *, void *);
100static int fdfile_ctor(void *, void *, int); 100static int fdfile_ctor(void *, void *, int);
101static void fdfile_dtor(void *, void *); 101static void fdfile_dtor(void *, void *);
102static int filedesc_ctor(void *, void *, int); 102static int filedesc_ctor(void *, void *, int);
103static void filedesc_dtor(void *, void *); 103static void filedesc_dtor(void *, void *);
104static int filedescopen(dev_t, int, int, lwp_t *); 104static int filedescopen(dev_t, int, int, lwp_t *);
105 105
106kmutex_t filelist_lock; /* lock on filehead */ 106kmutex_t filelist_lock; /* lock on filehead */
107struct filelist filehead; /* head of list of open files */ 107struct filelist filehead; /* head of list of open files */
108u_int nfiles; /* actual number of open files */ 108u_int nfiles; /* actual number of open files */
109 109
110static pool_cache_t filedesc_cache; 110static pool_cache_t filedesc_cache;
111static pool_cache_t file_cache; 111static pool_cache_t file_cache;
112static pool_cache_t fdfile_cache; 112static pool_cache_t fdfile_cache;
113 113
114const struct cdevsw filedesc_cdevsw = { 114const struct cdevsw filedesc_cdevsw = {
115 filedescopen, noclose, noread, nowrite, noioctl, 115 filedescopen, noclose, noread, nowrite, noioctl,
116 nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE, 116 nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE,
117}; 117};
118 118
119/* For ease of reading. */ 119/* For ease of reading. */
120__strong_alias(fd_putvnode,fd_putfile) 120__strong_alias(fd_putvnode,fd_putfile)
121__strong_alias(fd_putsock,fd_putfile) 121__strong_alias(fd_putsock,fd_putfile)
122 122
123/* 123/*
124 * Initialize the descriptor system. 124 * Initialize the descriptor system.
125 */ 125 */
126void 126void
127fd_sys_init(void) 127fd_sys_init(void)
128{ 128{
129 129
130 mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE); 130 mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);
131 131
132 file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0, 132 file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
133 0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL); 133 0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
134 KASSERT(file_cache != NULL); 134 KASSERT(file_cache != NULL);
135 135
136 fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0, 136 fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
137 PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor, 137 PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
138 NULL); 138 NULL);
139 KASSERT(fdfile_cache != NULL); 139 KASSERT(fdfile_cache != NULL);
140 140
141 filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit, 141 filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
142 0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor, 142 0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
143 NULL); 143 NULL);
144 KASSERT(filedesc_cache != NULL); 144 KASSERT(filedesc_cache != NULL);
145} 145}
146 146
147static bool 147static bool
148fd_isused(filedesc_t *fdp, unsigned fd) 148fd_isused(filedesc_t *fdp, unsigned fd)
149{ 149{
150 u_int off = fd >> NDENTRYSHIFT; 150 u_int off = fd >> NDENTRYSHIFT;
151 151
152 KASSERT(fd < fdp->fd_dt->dt_nfiles); 152 KASSERT(fd < fdp->fd_dt->dt_nfiles);
153 153
154 return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0; 154 return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0;
155} 155}
156 156
157/* 157/*
158 * Verify that the bitmaps match the descriptor table. 158 * Verify that the bitmaps match the descriptor table.
159 */ 159 */
160static inline void 160static inline void
161fd_checkmaps(filedesc_t *fdp) 161fd_checkmaps(filedesc_t *fdp)
162{ 162{
163#ifdef DEBUG 163#ifdef DEBUG
164 fdtab_t *dt; 164 fdtab_t *dt;
165 u_int fd; 165 u_int fd;
166 166
167 dt = fdp->fd_dt; 167 dt = fdp->fd_dt;
168 if (fdp->fd_refcnt == -1) { 168 if (fdp->fd_refcnt == -1) {
169 /* 169 /*
170 * fd_free tears down the table without maintaining its bitmap. 170 * fd_free tears down the table without maintaining its bitmap.
171 */ 171 */
172 return; 172 return;
173 } 173 }
174 for (fd = 0; fd < dt->dt_nfiles; fd++) { 174 for (fd = 0; fd < dt->dt_nfiles; fd++) {
175 if (fd < NDFDFILE) { 175 if (fd < NDFDFILE) {
176 KASSERT(dt->dt_ff[fd] == 176 KASSERT(dt->dt_ff[fd] ==
177 (fdfile_t *)fdp->fd_dfdfile[fd]); 177 (fdfile_t *)fdp->fd_dfdfile[fd]);
178 } 178 }
179 if (dt->dt_ff[fd] == NULL) { 179 if (dt->dt_ff[fd] == NULL) {
180 KASSERT(!fd_isused(fdp, fd)); 180 KASSERT(!fd_isused(fdp, fd));
181 } else if (dt->dt_ff[fd]->ff_file != NULL) { 181 } else if (dt->dt_ff[fd]->ff_file != NULL) {
182 KASSERT(fd_isused(fdp, fd)); 182 KASSERT(fd_isused(fdp, fd));
183 } 183 }
184 } 184 }
185#else /* DEBUG */ 185#else /* DEBUG */
186 /* nothing */ 186 /* nothing */
187#endif /* DEBUG */ 187#endif /* DEBUG */
188} 188}
189 189
190static int 190static int
191fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits) 191fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
192{ 192{
193 int i, off, maxoff; 193 int i, off, maxoff;
194 uint32_t sub; 194 uint32_t sub;
195 195
196 KASSERT(mutex_owned(&fdp->fd_lock)); 196 KASSERT(mutex_owned(&fdp->fd_lock));
197 197
198 fd_checkmaps(fdp); 198 fd_checkmaps(fdp);
199 199
200 if (want > bits) 200 if (want > bits)
201 return -1; 201 return -1;
202 202
203 off = want >> NDENTRYSHIFT; 203 off = want >> NDENTRYSHIFT;
204 i = want & NDENTRYMASK; 204 i = want & NDENTRYMASK;
205 if (i) { 205 if (i) {
206 sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i)); 206 sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
207 if (sub != ~0) 207 if (sub != ~0)
208 goto found; 208 goto found;
209 off++; 209 off++;
210 } 210 }
211 211
212 maxoff = NDLOSLOTS(bits); 212 maxoff = NDLOSLOTS(bits);
213 while (off < maxoff) { 213 while (off < maxoff) {
214 if ((sub = bitmap[off]) != ~0) 214 if ((sub = bitmap[off]) != ~0)
215 goto found; 215 goto found;
216 off++; 216 off++;
217 } 217 }
218 218
219 return (-1); 219 return (-1);
220 220
221 found: 221 found:
222 return (off << NDENTRYSHIFT) + ffs(~sub) - 1; 222 return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
223} 223}
224 224
225static int 225static int
226fd_last_set(filedesc_t *fd, int last) 226fd_last_set(filedesc_t *fd, int last)
227{ 227{
228 int off, i; 228 int off, i;
229 fdfile_t **ff = fd->fd_dt->dt_ff; 229 fdfile_t **ff = fd->fd_dt->dt_ff;
230 uint32_t *bitmap = fd->fd_lomap; 230 uint32_t *bitmap = fd->fd_lomap;
231 231
232 KASSERT(mutex_owned(&fd->fd_lock)); 232 KASSERT(mutex_owned(&fd->fd_lock));
233 233
234 fd_checkmaps(fd); 234 fd_checkmaps(fd);
235 235
236 off = (last - 1) >> NDENTRYSHIFT; 236 off = (last - 1) >> NDENTRYSHIFT;
237 237
238 while (off >= 0 && !bitmap[off]) 238 while (off >= 0 && !bitmap[off])
239 off--; 239 off--;
240 240
241 if (off < 0) 241 if (off < 0)
242 return (-1); 242 return (-1);
243 243
244 i = ((off + 1) << NDENTRYSHIFT) - 1; 244 i = ((off + 1) << NDENTRYSHIFT) - 1;
245 if (i >= last) 245 if (i >= last)
246 i = last - 1; 246 i = last - 1;
247 247
248 /* XXX should use bitmap */ 248 /* XXX should use bitmap */
249 while (i > 0 && (ff[i] == NULL || !ff[i]->ff_allocated)) 249 while (i > 0 && (ff[i] == NULL || !ff[i]->ff_allocated))
250 i--; 250 i--;
251 251
252 return (i); 252 return (i);
253} 253}
254 254
255static inline void 255static inline void
256fd_used(filedesc_t *fdp, unsigned fd) 256fd_used(filedesc_t *fdp, unsigned fd)
257{ 257{
258 u_int off = fd >> NDENTRYSHIFT; 258 u_int off = fd >> NDENTRYSHIFT;
259 fdfile_t *ff; 259 fdfile_t *ff;
260 260
261 ff = fdp->fd_dt->dt_ff[fd]; 261 ff = fdp->fd_dt->dt_ff[fd];
262 262
263 KASSERT(mutex_owned(&fdp->fd_lock)); 263 KASSERT(mutex_owned(&fdp->fd_lock));
264 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0); 264 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0);
265 KASSERT(ff != NULL); 265 KASSERT(ff != NULL);
266 KASSERT(ff->ff_file == NULL); 266 KASSERT(ff->ff_file == NULL);
267 KASSERT(!ff->ff_allocated); 267 KASSERT(!ff->ff_allocated);
268 268
269 ff->ff_allocated = 1; 269 ff->ff_allocated = 1;
270 fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK); 270 fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
271 if (__predict_false(fdp->fd_lomap[off] == ~0)) { 271 if (__predict_false(fdp->fd_lomap[off] == ~0)) {
272 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] & 272 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
273 (1 << (off & NDENTRYMASK))) == 0); 273 (1 << (off & NDENTRYMASK))) == 0);
274 fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK); 274 fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
275 } 275 }
276 276
277 if ((int)fd > fdp->fd_lastfile) { 277 if ((int)fd > fdp->fd_lastfile) {
278 fdp->fd_lastfile = fd; 278 fdp->fd_lastfile = fd;
279 } 279 }
280 280
281 fd_checkmaps(fdp); 281 fd_checkmaps(fdp);
282} 282}
283 283
284static inline void 284static inline void
285fd_unused(filedesc_t *fdp, unsigned fd) 285fd_unused(filedesc_t *fdp, unsigned fd)
286{ 286{
287 u_int off = fd >> NDENTRYSHIFT; 287 u_int off = fd >> NDENTRYSHIFT;
288 fdfile_t *ff; 288 fdfile_t *ff;
289 289
290 ff = fdp->fd_dt->dt_ff[fd]; 290 ff = fdp->fd_dt->dt_ff[fd];
291 291
292 /* 292 /*
293 * Don't assert the lock is held here, as we may be copying 293 * Don't assert the lock is held here, as we may be copying
294 * the table during exec() and it is not needed there. 294 * the table during exec() and it is not needed there.
295 * procfs and sysctl are locked out by proc::p_reflock. 295 * procfs and sysctl are locked out by proc::p_reflock.
296 * 296 *
297 * KASSERT(mutex_owned(&fdp->fd_lock)); 297 * KASSERT(mutex_owned(&fdp->fd_lock));
298 */ 298 */
299 KASSERT(ff != NULL); 299 KASSERT(ff != NULL);
300 KASSERT(ff->ff_file == NULL); 300 KASSERT(ff->ff_file == NULL);
301 KASSERT(ff->ff_allocated); 301 KASSERT(ff->ff_allocated);
302 302
303 if (fd < fdp->fd_freefile) { 303 if (fd < fdp->fd_freefile) {
304 fdp->fd_freefile = fd; 304 fdp->fd_freefile = fd;
305 } 305 }
306 306
307 if (fdp->fd_lomap[off] == ~0) { 307 if (fdp->fd_lomap[off] == ~0) {
308 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] & 308 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
309 (1 << (off & NDENTRYMASK))) != 0); 309 (1 << (off & NDENTRYMASK))) != 0);
310 fdp->fd_himap[off >> NDENTRYSHIFT] &= 310 fdp->fd_himap[off >> NDENTRYSHIFT] &=
311 ~(1 << (off & NDENTRYMASK)); 311 ~(1 << (off & NDENTRYMASK));
312 } 312 }
313 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0); 313 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
314 fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK)); 314 fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
315 ff->ff_allocated = 0; 315 ff->ff_allocated = 0;
316 316
317 KASSERT(fd <= fdp->fd_lastfile); 317 KASSERT(fd <= fdp->fd_lastfile);
318 if (fd == fdp->fd_lastfile) { 318 if (fd == fdp->fd_lastfile) {
319 fdp->fd_lastfile = fd_last_set(fdp, fd); 319 fdp->fd_lastfile = fd_last_set(fdp, fd);
320 } 320 }
321 fd_checkmaps(fdp); 321 fd_checkmaps(fdp);
322} 322}
323 323
324/* 324/*
325 * Look up the file structure corresponding to a file descriptor 325 * Look up the file structure corresponding to a file descriptor
326 * and return the file, holding a reference on the descriptor. 326 * and return the file, holding a reference on the descriptor.
327 */ 327 */
328inline file_t * 328inline file_t *
329fd_getfile(unsigned fd) 329fd_getfile(unsigned fd)
330{ 330{
331 filedesc_t *fdp; 331 filedesc_t *fdp;
332 fdfile_t *ff; 332 fdfile_t *ff;
333 file_t *fp; 333 file_t *fp;
334 fdtab_t *dt; 334 fdtab_t *dt;
335 335
336 /* 336 /*
337 * Look up the fdfile structure representing this descriptor. 337 * Look up the fdfile structure representing this descriptor.
338 * We are doing this unlocked. See fd_tryexpand(). 338 * We are doing this unlocked. See fd_tryexpand().
339 */ 339 */
340 fdp = curlwp->l_fd; 340 fdp = curlwp->l_fd;
341 dt = fdp->fd_dt; 341 dt = fdp->fd_dt;
342 if (__predict_false(fd >= dt->dt_nfiles)) { 342 if (__predict_false(fd >= dt->dt_nfiles)) {
343 return NULL; 343 return NULL;
344 } 344 }
345 ff = dt->dt_ff[fd]; 345 ff = dt->dt_ff[fd];
346 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]); 346 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
347 if (__predict_false(ff == NULL)) { 347 if (__predict_false(ff == NULL)) {
348 return NULL; 348 return NULL;
349 } 349 }
350 350
351 /* Now get a reference to the descriptor. */ 351 /* Now get a reference to the descriptor. */
352 if (fdp->fd_refcnt == 1) { 352 if (fdp->fd_refcnt == 1) {
353 /* 353 /*
354 * Single threaded: don't need to worry about concurrent 354 * Single threaded: don't need to worry about concurrent
355 * access (other than earlier calls to kqueue, which may 355 * access (other than earlier calls to kqueue, which may
356 * hold a reference to the descriptor). 356 * hold a reference to the descriptor).
357 */ 357 */
358 ff->ff_refcnt++; 358 ff->ff_refcnt++;
359 } else { 359 } else {
360 /* 360 /*
361 * Multi threaded: issue a memory barrier to ensure that we 361 * Multi threaded: issue a memory barrier to ensure that we
362 * acquire the file pointer _after_ adding a reference. If 362 * acquire the file pointer _after_ adding a reference. If
363 * no memory barrier, we could fetch a stale pointer. 363 * no memory barrier, we could fetch a stale pointer.
364 */ 364 */
365 atomic_inc_uint(&ff->ff_refcnt); 365 atomic_inc_uint(&ff->ff_refcnt);
366#ifndef __HAVE_ATOMIC_AS_MEMBAR 366#ifndef __HAVE_ATOMIC_AS_MEMBAR
367 membar_enter(); 367 membar_enter();
368#endif 368#endif
369 } 369 }
370 370
371 /* 371 /*
372 * If the file is not open or is being closed then put the 372 * If the file is not open or is being closed then put the
373 * reference back. 373 * reference back.
374 */ 374 */
375 fp = ff->ff_file; 375 fp = ff->ff_file;
376 if (__predict_true(fp != NULL)) { 376 if (__predict_true(fp != NULL)) {
377 return fp; 377 return fp;
378 } 378 }
379 fd_putfile(fd); 379 fd_putfile(fd);
380 return NULL; 380 return NULL;
381} 381}
382 382
383/* 383/*
384 * Release a reference to a file descriptor acquired with fd_getfile(). 384 * Release a reference to a file descriptor acquired with fd_getfile().
385 */ 385 */
386void 386void
387fd_putfile(unsigned fd) 387fd_putfile(unsigned fd)
388{ 388{
389 filedesc_t *fdp; 389 filedesc_t *fdp;
390 fdfile_t *ff; 390 fdfile_t *ff;
391 u_int u, v; 391 u_int u, v;
392 392
393 fdp = curlwp->l_fd; 393 fdp = curlwp->l_fd;
394 ff = fdp->fd_dt->dt_ff[fd]; 394 ff = fdp->fd_dt->dt_ff[fd];
395 395
396 KASSERT(fd < fdp->fd_dt->dt_nfiles); 396 KASSERT(fd < fdp->fd_dt->dt_nfiles);
397 KASSERT(ff != NULL); 397 KASSERT(ff != NULL);
398 KASSERT((ff->ff_refcnt & FR_MASK) > 0); 398 KASSERT((ff->ff_refcnt & FR_MASK) > 0);
399 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]); 399 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
400 400
401 if (fdp->fd_refcnt == 1) { 401 if (fdp->fd_refcnt == 1) {
402 /* 402 /*
403 * Single threaded: don't need to worry about concurrent 403 * Single threaded: don't need to worry about concurrent
404 * access (other than earlier calls to kqueue, which may 404 * access (other than earlier calls to kqueue, which may
405 * hold a reference to the descriptor). 405 * hold a reference to the descriptor).
406 */ 406 */
407 if (__predict_false((ff->ff_refcnt & FR_CLOSING) != 0)) { 407 if (__predict_false((ff->ff_refcnt & FR_CLOSING) != 0)) {
408 fd_close(fd); 408 fd_close(fd);
409 return; 409 return;
410 } 410 }
411 ff->ff_refcnt--; 411 ff->ff_refcnt--;
412 return; 412 return;
413 } 413 }
414 414
415 /* 415 /*
416 * Ensure that any use of the file is complete and globally 416 * Ensure that any use of the file is complete and globally
417 * visible before dropping the final reference. If no membar, 417 * visible before dropping the final reference. If no membar,
418 * the current CPU could still access memory associated with 418 * the current CPU could still access memory associated with
419 * the file after it has been freed or recycled by another 419 * the file after it has been freed or recycled by another
420 * CPU. 420 * CPU.
421 */ 421 */
422#ifndef __HAVE_ATOMIC_AS_MEMBAR 422#ifndef __HAVE_ATOMIC_AS_MEMBAR
423 membar_exit(); 423 membar_exit();
424#endif 424#endif
425 425
426 /* 426 /*
427 * Be optimistic and start out with the assumption that no other 427 * Be optimistic and start out with the assumption that no other
428 * threads are trying to close the descriptor. If the CAS fails, 428 * threads are trying to close the descriptor. If the CAS fails,
429 * we lost a race and/or it's being closed. 429 * we lost a race and/or it's being closed.
430 */ 430 */
431 for (u = ff->ff_refcnt & FR_MASK;; u = v) { 431 for (u = ff->ff_refcnt & FR_MASK;; u = v) {
432 v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1); 432 v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
433 if (__predict_true(u == v)) { 433 if (__predict_true(u == v)) {
434 return; 434 return;
435 } 435 }
436 if (__predict_false((v & FR_CLOSING) != 0)) { 436 if (__predict_false((v & FR_CLOSING) != 0)) {
437 break; 437 break;
438 } 438 }
439 } 439 }
440 440
441 /* Another thread is waiting to close the file: join it. */ 441 /* Another thread is waiting to close the file: join it. */
442 (void)fd_close(fd); 442 (void)fd_close(fd);
443} 443}
444 444
445/* 445/*
446 * Convenience wrapper around fd_getfile() that returns reference 446 * Convenience wrapper around fd_getfile() that returns reference
447 * to a vnode. 447 * to a vnode.
448 */ 448 */
449int 449int
450fd_getvnode(unsigned fd, file_t **fpp) 450fd_getvnode(unsigned fd, file_t **fpp)
451{ 451{
452 vnode_t *vp; 452 vnode_t *vp;
453 file_t *fp; 453 file_t *fp;
454 454
455 fp = fd_getfile(fd); 455 fp = fd_getfile(fd);
456 if (__predict_false(fp == NULL)) { 456 if (__predict_false(fp == NULL)) {
457 return EBADF; 457 return EBADF;
458 } 458 }
459 if (__predict_false(fp->f_type != DTYPE_VNODE)) { 459 if (__predict_false(fp->f_type != DTYPE_VNODE)) {
460 fd_putfile(fd); 460 fd_putfile(fd);
461 return EINVAL; 461 return EINVAL;
462 } 462 }
463 vp = fp->f_data; 463 vp = fp->f_data;
464 if (__predict_false(vp->v_type == VBAD)) { 464 if (__predict_false(vp->v_type == VBAD)) {
465 /* XXX Is this case really necessary? */ 465 /* XXX Is this case really necessary? */
466 fd_putfile(fd); 466 fd_putfile(fd);
467 return EBADF; 467 return EBADF;
468 } 468 }
469 *fpp = fp; 469 *fpp = fp;
470 return 0; 470 return 0;
471} 471}
472 472
473/* 473/*
474 * Convenience wrapper around fd_getfile() that returns reference 474 * Convenience wrapper around fd_getfile() that returns reference
475 * to a socket. 475 * to a socket.
476 */ 476 */
477int 477int
478fd_getsock(unsigned fd, struct socket **sop) 478fd_getsock(unsigned fd, struct socket **sop)
479{ 479{
480 file_t *fp; 480 file_t *fp;
481 481
482 fp = fd_getfile(fd); 482 fp = fd_getfile(fd);
483 if (__predict_false(fp == NULL)) { 483 if (__predict_false(fp == NULL)) {
484 return EBADF; 484 return EBADF;
485 } 485 }
486 if (__predict_false(fp->f_type != DTYPE_SOCKET)) { 486 if (__predict_false(fp->f_type != DTYPE_SOCKET)) {
487 fd_putfile(fd); 487 fd_putfile(fd);
488 return ENOTSOCK; 488 return ENOTSOCK;
489 } 489 }
490 *sop = fp->f_data; 490 *sop = fp->f_data;
491 return 0; 491 return 0;
492} 492}
493 493
494/* 494/*
495 * Look up the file structure corresponding to a file descriptor 495 * Look up the file structure corresponding to a file descriptor
496 * and return it with a reference held on the file, not the 496 * and return it with a reference held on the file, not the
497 * descriptor. 497 * descriptor.
498 * 498 *
499 * This is heavyweight and only used when accessing descriptors 499 * This is heavyweight and only used when accessing descriptors
500 * from a foreign process. The caller must ensure that `p' does 500 * from a foreign process. The caller must ensure that `p' does
501 * not exit or fork across this call. 501 * not exit or fork across this call.
502 * 502 *
503 * To release the file (not descriptor) reference, use closef(). 503 * To release the file (not descriptor) reference, use closef().
504 */ 504 */
505file_t * 505file_t *
506fd_getfile2(proc_t *p, unsigned fd) 506fd_getfile2(proc_t *p, unsigned fd)
507{ 507{
508 filedesc_t *fdp; 508 filedesc_t *fdp;
509 fdfile_t *ff; 509 fdfile_t *ff;
510 file_t *fp; 510 file_t *fp;
511 fdtab_t *dt; 511 fdtab_t *dt;
512 512
513 fdp = p->p_fd; 513 fdp = p->p_fd;
514 mutex_enter(&fdp->fd_lock); 514 mutex_enter(&fdp->fd_lock);
515 dt = fdp->fd_dt; 515 dt = fdp->fd_dt;
516 if (fd >= dt->dt_nfiles) { 516 if (fd >= dt->dt_nfiles) {
517 mutex_exit(&fdp->fd_lock); 517 mutex_exit(&fdp->fd_lock);
518 return NULL; 518 return NULL;
519 } 519 }
520 if ((ff = dt->dt_ff[fd]) == NULL) { 520 if ((ff = dt->dt_ff[fd]) == NULL) {
521 mutex_exit(&fdp->fd_lock); 521 mutex_exit(&fdp->fd_lock);
522 return NULL; 522 return NULL;
523 } 523 }
524 if ((fp = ff->ff_file) == NULL) { 524 if ((fp = ff->ff_file) == NULL) {
525 mutex_exit(&fdp->fd_lock); 525 mutex_exit(&fdp->fd_lock);
526 return NULL; 526 return NULL;
527 } 527 }
528 mutex_enter(&fp->f_lock); 528 mutex_enter(&fp->f_lock);
529 fp->f_count++; 529 fp->f_count++;
530 mutex_exit(&fp->f_lock); 530 mutex_exit(&fp->f_lock);
531 mutex_exit(&fdp->fd_lock); 531 mutex_exit(&fdp->fd_lock);
532 532
533 return fp; 533 return fp;
534} 534}
535 535
536/* 536/*
537 * Internal form of close. Must be called with a reference to the 537 * Internal form of close. Must be called with a reference to the
538 * descriptor, and will drop the reference. When all descriptor 538 * descriptor, and will drop the reference. When all descriptor
539 * references are dropped, releases the descriptor slot and a single 539 * references are dropped, releases the descriptor slot and a single
540 * reference to the file structure. 540 * reference to the file structure.
541 */ 541 */
542int 542int
543fd_close(unsigned fd) 543fd_close(unsigned fd)
544{ 544{
545 struct flock lf; 545 struct flock lf;
546 filedesc_t *fdp; 546 filedesc_t *fdp;
547 fdfile_t *ff; 547 fdfile_t *ff;
548 file_t *fp; 548 file_t *fp;
549 proc_t *p; 549 proc_t *p;
550 lwp_t *l; 550 lwp_t *l;
551 u_int refcnt; 551 u_int refcnt;
552 552
553 l = curlwp; 553 l = curlwp;
554 p = l->l_proc; 554 p = l->l_proc;
555 fdp = l->l_fd; 555 fdp = l->l_fd;
556 ff = fdp->fd_dt->dt_ff[fd]; 556 ff = fdp->fd_dt->dt_ff[fd];
557 557
558 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]); 558 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
559 559
560 mutex_enter(&fdp->fd_lock); 560 mutex_enter(&fdp->fd_lock);
561 KASSERT((ff->ff_refcnt & FR_MASK) > 0); 561 KASSERT((ff->ff_refcnt & FR_MASK) > 0);
562 if (__predict_false(ff->ff_file == NULL)) { 562 if (__predict_false(ff->ff_file == NULL)) {
563 /* 563 /*
564 * Another user of the file is already closing, and is 564 * Another user of the file is already closing, and is
565 * waiting for other users of the file to drain. Release 565 * waiting for other users of the file to drain. Release
566 * our reference, and wake up the closer. 566 * our reference, and wake up the closer.
567 */ 567 */
568 atomic_dec_uint(&ff->ff_refcnt); 568 atomic_dec_uint(&ff->ff_refcnt);
569 cv_broadcast(&ff->ff_closing); 569 cv_broadcast(&ff->ff_closing);
570 mutex_exit(&fdp->fd_lock); 570 mutex_exit(&fdp->fd_lock);
571 571
572 /* 572 /*
573 * An application error, so pretend that the descriptor 573 * An application error, so pretend that the descriptor
574 * was already closed. We can't safely wait for it to 574 * was already closed. We can't safely wait for it to
575 * be closed without potentially deadlocking. 575 * be closed without potentially deadlocking.
576 */ 576 */
577 return (EBADF); 577 return (EBADF);
578 } 578 }
579 KASSERT((ff->ff_refcnt & FR_CLOSING) == 0); 579 KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
580 580
581 /* 581 /*
582 * There may be multiple users of this file within the process. 582 * There may be multiple users of this file within the process.
583 * Notify existing and new users that the file is closing. This 583 * Notify existing and new users that the file is closing. This
584 * will prevent them from adding additional uses to this file 584 * will prevent them from adding additional uses to this file
585 * while we are closing it. 585 * while we are closing it.
586 */ 586 */
587 fp = ff->ff_file; 587 fp = ff->ff_file;
588 ff->ff_file = NULL; 588 ff->ff_file = NULL;
589 ff->ff_exclose = false; 589 ff->ff_exclose = false;
590 590
591 /* 591 /*
592 * We expect the caller to hold a descriptor reference - drop it. 592 * We expect the caller to hold a descriptor reference - drop it.
593 * The reference count may increase beyond zero at this point due 593 * The reference count may increase beyond zero at this point due
594 * to an erroneous descriptor reference by an application, but 594 * to an erroneous descriptor reference by an application, but
595 * fd_getfile() will notice that the file is being closed and drop 595 * fd_getfile() will notice that the file is being closed and drop
596 * the reference again. 596 * the reference again.
597 */ 597 */
598 if (fdp->fd_refcnt == 1) { 598 if (fdp->fd_refcnt == 1) {
599 /* Single threaded. */ 599 /* Single threaded. */
600 refcnt = --(ff->ff_refcnt); 600 refcnt = --(ff->ff_refcnt);
601 } else { 601 } else {
602 /* Multi threaded. */ 602 /* Multi threaded. */
603#ifndef __HAVE_ATOMIC_AS_MEMBAR 603#ifndef __HAVE_ATOMIC_AS_MEMBAR
604 membar_producer(); 604 membar_producer();
605#endif 605#endif
606 refcnt = atomic_dec_uint_nv(&ff->ff_refcnt); 606 refcnt = atomic_dec_uint_nv(&ff->ff_refcnt);
607 } 607 }
608 if (__predict_false(refcnt != 0)) { 608 if (__predict_false(refcnt != 0)) {
609 /* 609 /*
610 * Wait for other references to drain. This is typically 610 * Wait for other references to drain. This is typically
611 * an application error - the descriptor is being closed 611 * an application error - the descriptor is being closed
612 * while still in use. 612 * while still in use.
613 * 613 *
614 */ 614 */
615 atomic_or_uint(&ff->ff_refcnt, FR_CLOSING); 615 atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
616 616
617 /* 617 /*
618 * Remove any knotes attached to the file. A knote 618 * Remove any knotes attached to the file. A knote
619 * attached to the descriptor can hold references on it. 619 * attached to the descriptor can hold references on it.
620 */ 620 */
621 mutex_exit(&fdp->fd_lock); 621 mutex_exit(&fdp->fd_lock);
622 if (!SLIST_EMPTY(&ff->ff_knlist)) { 622 if (!SLIST_EMPTY(&ff->ff_knlist)) {
623 knote_fdclose(fd); 623 knote_fdclose(fd);
624 } 624 }
625 625
626 /* Try to drain out descriptor references. */ 626 /* Try to drain out descriptor references. */
627 (*fp->f_ops->fo_drain)(fp); 627 (*fp->f_ops->fo_drain)(fp);
628 mutex_enter(&fdp->fd_lock); 628 mutex_enter(&fdp->fd_lock);
629 629
630 /* 630 /*
631 * We need to see the count drop to zero at least once, 631 * We need to see the count drop to zero at least once,
632 * in order to ensure that all pre-existing references 632 * in order to ensure that all pre-existing references
633 * have been drained. New references past this point are 633 * have been drained. New references past this point are
634 * of no interest. 634 * of no interest.
635 */ 635 */
636 while ((ff->ff_refcnt & FR_MASK) != 0) { 636 while ((ff->ff_refcnt & FR_MASK) != 0) {
637 cv_wait(&ff->ff_closing, &fdp->fd_lock); 637 cv_wait(&ff->ff_closing, &fdp->fd_lock);
638 } 638 }
639 atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING); 639 atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
640 } else { 640 } else {
641 /* If no references, there must be no knotes. */ 641 /* If no references, there must be no knotes. */
642 KASSERT(SLIST_EMPTY(&ff->ff_knlist)); 642 KASSERT(SLIST_EMPTY(&ff->ff_knlist));
643 } 643 }
644 644
645 /* 645 /*
646 * POSIX record locking dictates that any close releases ALL 646 * POSIX record locking dictates that any close releases ALL
647 * locks owned by this process. This is handled by setting 647 * locks owned by this process. This is handled by setting
648 * a flag in the unlock to free ONLY locks obeying POSIX 648 * a flag in the unlock to free ONLY locks obeying POSIX
649 * semantics, and not to free BSD-style file locks. 649 * semantics, and not to free BSD-style file locks.
650 * If the descriptor was in a message, POSIX-style locks 650 * If the descriptor was in a message, POSIX-style locks
651 * aren't passed with the descriptor. 651 * aren't passed with the descriptor.
652 */ 652 */
653 if (__predict_false((p->p_flag & PK_ADVLOCK) != 0 && 653 if (__predict_false((p->p_flag & PK_ADVLOCK) != 0 &&
654 fp->f_type == DTYPE_VNODE)) { 654 fp->f_type == DTYPE_VNODE)) {
655 lf.l_whence = SEEK_SET; 655 lf.l_whence = SEEK_SET;
656 lf.l_start = 0; 656 lf.l_start = 0;
657 lf.l_len = 0; 657 lf.l_len = 0;
658 lf.l_type = F_UNLCK; 658 lf.l_type = F_UNLCK;
659 mutex_exit(&fdp->fd_lock); 659 mutex_exit(&fdp->fd_lock);
660 (void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX); 660 (void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX);
661 mutex_enter(&fdp->fd_lock); 661 mutex_enter(&fdp->fd_lock);
662 } 662 }
663 663
664 /* Free descriptor slot. */ 664 /* Free descriptor slot. */
665 fd_unused(fdp, fd); 665 fd_unused(fdp, fd);
666 mutex_exit(&fdp->fd_lock); 666 mutex_exit(&fdp->fd_lock);
667 667
668 /* Now drop reference to the file itself. */ 668 /* Now drop reference to the file itself. */
669 return closef(fp); 669 return closef(fp);
670} 670}
671 671
672/* 672/*
673 * Duplicate a file descriptor. 673 * Duplicate a file descriptor.
674 */ 674 */
675int 675int
676fd_dup(file_t *fp, int minfd, int *newp, bool exclose) 676fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
677{ 677{
678 proc_t *p; 678 proc_t *p;
679 int error; 679 int error;
680 680
681 p = curproc; 681 p = curproc;
682 682
683 while ((error = fd_alloc(p, minfd, newp)) != 0) { 683 while ((error = fd_alloc(p, minfd, newp)) != 0) {
684 if (error != ENOSPC) { 684 if (error != ENOSPC) {
685 return error; 685 return error;
686 } 686 }
687 fd_tryexpand(p); 687 fd_tryexpand(p);
688 } 688 }
689 689
690 curlwp->l_fd->fd_dt->dt_ff[*newp]->ff_exclose = exclose; 690 curlwp->l_fd->fd_dt->dt_ff[*newp]->ff_exclose = exclose;
691 fd_affix(p, fp, *newp); 691 fd_affix(p, fp, *newp);
692 return 0; 692 return 0;
693} 693}
694 694
695/* 695/*
696 * dup2 operation. 696 * dup2 operation.
697 */ 697 */
698int 698int
699fd_dup2(file_t *fp, unsigned new) 699fd_dup2(file_t *fp, unsigned new)
700{ 700{
701 filedesc_t *fdp; 701 filedesc_t *fdp;
702 fdfile_t *ff; 702 fdfile_t *ff;
703 fdtab_t *dt; 703 fdtab_t *dt;
704 704
705 fdp = curlwp->l_fd; 705 fdp = curlwp->l_fd;
706 706
707 /* 707 /*
708 * Ensure there are enough slots in the descriptor table, 708 * Ensure there are enough slots in the descriptor table,
709 * and allocate an fdfile_t up front in case we need it. 709 * and allocate an fdfile_t up front in case we need it.
710 */ 710 */
711 while (new >= fdp->fd_dt->dt_nfiles) { 711 while (new >= fdp->fd_dt->dt_nfiles) {
712 fd_tryexpand(curproc); 712 fd_tryexpand(curproc);
713 } 713 }
714 ff = pool_cache_get(fdfile_cache, PR_WAITOK); 714 ff = pool_cache_get(fdfile_cache, PR_WAITOK);
715 715
716 /* 716 /*
717 * If there is already a file open, close it. If the file is 717 * If there is already a file open, close it. If the file is
718 * half open, wait for it to be constructed before closing it. 718 * half open, wait for it to be constructed before closing it.
719 * XXX Potential for deadlock here? 719 * XXX Potential for deadlock here?
720 */ 720 */
721 mutex_enter(&fdp->fd_lock); 721 mutex_enter(&fdp->fd_lock);
722 while (fd_isused(fdp, new)) { 722 while (fd_isused(fdp, new)) {
723 mutex_exit(&fdp->fd_lock); 723 mutex_exit(&fdp->fd_lock);
724 if (fd_getfile(new) != NULL) { 724 if (fd_getfile(new) != NULL) {
725 (void)fd_close(new); 725 (void)fd_close(new);
726 } else { 726 } else {
727 /* 727 /*
728 * Crummy, but unlikely to happen. 728 * Crummy, but unlikely to happen.
729 * Can occur if we interrupt another 729 * Can occur if we interrupt another
730 * thread while it is opening a file. 730 * thread while it is opening a file.
731 */ 731 */
732 kpause("dup2", false, 1, NULL); 732 kpause("dup2", false, 1, NULL);
733 } 733 }
734 mutex_enter(&fdp->fd_lock); 734 mutex_enter(&fdp->fd_lock);
735 } 735 }
736 dt = fdp->fd_dt; 736 dt = fdp->fd_dt;
737 if (dt->dt_ff[new] == NULL) { 737 if (dt->dt_ff[new] == NULL) {
738 KASSERT(new >= NDFDFILE); 738 KASSERT(new >= NDFDFILE);
739 dt->dt_ff[new] = ff; 739 dt->dt_ff[new] = ff;
740 ff = NULL; 740 ff = NULL;
741 }  741 }
742 fd_used(fdp, new); 742 fd_used(fdp, new);
743 mutex_exit(&fdp->fd_lock); 743 mutex_exit(&fdp->fd_lock);
744 744
745 /* Slot is now allocated. Insert copy of the file. */ 745 /* Slot is now allocated. Insert copy of the file. */
746 fd_affix(curproc, fp, new); 746 fd_affix(curproc, fp, new);
747 if (ff != NULL) { 747 if (ff != NULL) {
748 pool_cache_put(fdfile_cache, ff); 748 pool_cache_put(fdfile_cache, ff);
749 } 749 }
750 return 0; 750 return 0;
751} 751}
752 752
753/* 753/*
754 * Drop reference to a file structure. 754 * Drop reference to a file structure.
755 */ 755 */
756int 756int
757closef(file_t *fp) 757closef(file_t *fp)
758{ 758{
759 struct flock lf; 759 struct flock lf;
760 int error; 760 int error;
761 761
762 /* 762 /*
763 * Drop reference. If referenced elsewhere it's still open 763 * Drop reference. If referenced elsewhere it's still open
764 * and we have nothing more to do. 764 * and we have nothing more to do.
765 */ 765 */
766 mutex_enter(&fp->f_lock); 766 mutex_enter(&fp->f_lock);
767 KASSERT(fp->f_count > 0); 767 KASSERT(fp->f_count > 0);
768 if (--fp->f_count > 0) { 768 if (--fp->f_count > 0) {
769 mutex_exit(&fp->f_lock); 769 mutex_exit(&fp->f_lock);
770 return 0; 770 return 0;
771 } 771 }
772 KASSERT(fp->f_count == 0); 772 KASSERT(fp->f_count == 0);
773 mutex_exit(&fp->f_lock); 773 mutex_exit(&fp->f_lock);
774 774
775 /* We held the last reference - release locks, close and free. */ 775 /* We held the last reference - release locks, close and free. */
776 if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) { 776 if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
777 lf.l_whence = SEEK_SET; 777 lf.l_whence = SEEK_SET;
778 lf.l_start = 0; 778 lf.l_start = 0;
779 lf.l_len = 0; 779 lf.l_len = 0;
780 lf.l_type = F_UNLCK; 780 lf.l_type = F_UNLCK;
781 (void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK); 781 (void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK);
782 } 782 }
783 if (fp->f_ops != NULL) { 783 if (fp->f_ops != NULL) {
784 error = (*fp->f_ops->fo_close)(fp); 784 error = (*fp->f_ops->fo_close)(fp);
785 } else { 785 } else {
786 error = 0; 786 error = 0;
787 } 787 }
788 KASSERT(fp->f_count == 0); 788 KASSERT(fp->f_count == 0);
789 KASSERT(fp->f_cred != NULL); 789 KASSERT(fp->f_cred != NULL);
790 pool_cache_put(file_cache, fp); 790 pool_cache_put(file_cache, fp);
791 791
792 return error; 792 return error;
793} 793}
794 794
795/* 795/*
796 * Allocate a file descriptor for the process. 796 * Allocate a file descriptor for the process.
797 */ 797 */
798int 798int
799fd_alloc(proc_t *p, int want, int *result) 799fd_alloc(proc_t *p, int want, int *result)
800{ 800{
801 filedesc_t *fdp; 801 filedesc_t *fdp;
802 int i, lim, last, error; 802 int i, lim, last, error;
803 u_int off, new; 803 u_int off, new;
804 fdtab_t *dt; 804 fdtab_t *dt;
805 805
806 KASSERT(p == curproc || p == &proc0); 806 KASSERT(p == curproc || p == &proc0);
807 807
808 fdp = p->p_fd; 808 fdp = p->p_fd;
809 809
810 /* 810 /*
811 * Search for a free descriptor starting at the higher 811 * Search for a free descriptor starting at the higher
812 * of want or fd_freefile. 812 * of want or fd_freefile.
813 */ 813 */
814 mutex_enter(&fdp->fd_lock); 814 mutex_enter(&fdp->fd_lock);
815 fd_checkmaps(fdp); 815 fd_checkmaps(fdp);
816 dt = fdp->fd_dt; 816 dt = fdp->fd_dt;
817 KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]); 817 KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
818 lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); 818 lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
819 last = min(dt->dt_nfiles, lim); 819 last = min(dt->dt_nfiles, lim);
820 for (;;) { 820 for (;;) {
821 if ((i = want) < fdp->fd_freefile) 821 if ((i = want) < fdp->fd_freefile)
822 i = fdp->fd_freefile; 822 i = fdp->fd_freefile;
823 off = i >> NDENTRYSHIFT; 823 off = i >> NDENTRYSHIFT;
824 new = fd_next_zero(fdp, fdp->fd_himap, off, 824 new = fd_next_zero(fdp, fdp->fd_himap, off,
825 (last + NDENTRIES - 1) >> NDENTRYSHIFT); 825 (last + NDENTRIES - 1) >> NDENTRYSHIFT);
826 if (new == -1) 826 if (new == -1)
827 break; 827 break;
828 i = fd_next_zero(fdp, &fdp->fd_lomap[new], 828 i = fd_next_zero(fdp, &fdp->fd_lomap[new],
829 new > off ? 0 : i & NDENTRYMASK, NDENTRIES); 829 new > off ? 0 : i & NDENTRYMASK, NDENTRIES);
830 if (i == -1) { 830 if (i == -1) {
831 /* 831 /*
832 * Free file descriptor in this block was 832 * Free file descriptor in this block was
833 * below want, try again with higher want. 833 * below want, try again with higher want.
834 */ 834 */
835 want = (new + 1) << NDENTRYSHIFT; 835 want = (new + 1) << NDENTRYSHIFT;
836 continue; 836 continue;
837 } 837 }
838 i += (new << NDENTRYSHIFT); 838 i += (new << NDENTRYSHIFT);
839 if (i >= last) { 839 if (i >= last) {
840 break; 840 break;
841 } 841 }
842 if (dt->dt_ff[i] == NULL) { 842 if (dt->dt_ff[i] == NULL) {
843 KASSERT(i >= NDFDFILE); 843 KASSERT(i >= NDFDFILE);
844 dt->dt_ff[i] = pool_cache_get(fdfile_cache, PR_WAITOK); 844 dt->dt_ff[i] = pool_cache_get(fdfile_cache, PR_WAITOK);
845 } 845 }
846 KASSERT(dt->dt_ff[i]->ff_refcnt == 0); 846 KASSERT(dt->dt_ff[i]->ff_refcnt == 0);
847 KASSERT(dt->dt_ff[i]->ff_file == NULL); 847 KASSERT(dt->dt_ff[i]->ff_file == NULL);
848 fd_used(fdp, i); 848 fd_used(fdp, i);
849 if (want <= fdp->fd_freefile) { 849 if (want <= fdp->fd_freefile) {
850 fdp->fd_freefile = i; 850 fdp->fd_freefile = i;
851 } 851 }
852 *result = i; 852 *result = i;
853 KASSERT(i >= NDFDFILE || 853 KASSERT(i >= NDFDFILE ||
854 dt->dt_ff[i] == (fdfile_t *)fdp->fd_dfdfile[i]); 854 dt->dt_ff[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
855 fd_checkmaps(fdp); 855 fd_checkmaps(fdp);
856 mutex_exit(&fdp->fd_lock); 856 mutex_exit(&fdp->fd_lock);
857 return 0; 857 return 0;
858 } 858 }
859 859
860 /* No space in current array. Let the caller expand and retry. */ 860 /* No space in current array. Let the caller expand and retry. */
861 error = (dt->dt_nfiles >= lim) ? EMFILE : ENOSPC; 861 error = (dt->dt_nfiles >= lim) ? EMFILE : ENOSPC;
862 mutex_exit(&fdp->fd_lock); 862 mutex_exit(&fdp->fd_lock);
863 return error; 863 return error;
864} 864}
865 865
866/* 866/*
867 * Allocate memory for a descriptor table. 867 * Allocate memory for a descriptor table.
868 */ 868 */
869static fdtab_t * 869static fdtab_t *
870fd_dtab_alloc(int n) 870fd_dtab_alloc(int n)
871{ 871{
872 fdtab_t *dt; 872 fdtab_t *dt;
873 size_t sz; 873 size_t sz;
874 874
875 KASSERT(n > NDFILE); 875 KASSERT(n > NDFILE);
876 876
877 sz = sizeof(*dt) + (n - NDFILE) * sizeof(dt->dt_ff[0]); 877 sz = sizeof(*dt) + (n - NDFILE) * sizeof(dt->dt_ff[0]);
878 dt = kmem_alloc(sz, KM_SLEEP); 878 dt = kmem_alloc(sz, KM_SLEEP);
879#ifdef DIAGNOSTIC 879#ifdef DIAGNOSTIC
880 memset(dt, 0xff, sz); 880 memset(dt, 0xff, sz);
881#endif 881#endif
882 dt->dt_nfiles = n; 882 dt->dt_nfiles = n;
883 dt->dt_link = NULL; 883 dt->dt_link = NULL;
884 return dt; 884 return dt;
885} 885}
886 886
887/* 887/*
888 * Free a descriptor table, and all tables linked for deferred free. 888 * Free a descriptor table, and all tables linked for deferred free.
889 */ 889 */
890static void 890static void
891fd_dtab_free(fdtab_t *dt) 891fd_dtab_free(fdtab_t *dt)
892{ 892{
893 fdtab_t *next; 893 fdtab_t *next;
894 size_t sz; 894 size_t sz;
895 895
896 do { 896 do {
897 next = dt->dt_link; 897 next = dt->dt_link;
898 KASSERT(dt->dt_nfiles > NDFILE); 898 KASSERT(dt->dt_nfiles > NDFILE);
899 sz = sizeof(*dt) + 899 sz = sizeof(*dt) +
900 (dt->dt_nfiles - NDFILE) * sizeof(dt->dt_ff[0]); 900 (dt->dt_nfiles - NDFILE) * sizeof(dt->dt_ff[0]);
901#ifdef DIAGNOSTIC 901#ifdef DIAGNOSTIC
902 memset(dt, 0xff, sz); 902 memset(dt, 0xff, sz);
903#endif 903#endif
904 kmem_free(dt, sz); 904 kmem_free(dt, sz);
905 dt = next; 905 dt = next;
906 } while (dt != NULL); 906 } while (dt != NULL);
907} 907}
908 908
909/* 909/*
910 * Allocate descriptor bitmap. 910 * Allocate descriptor bitmap.
911 */ 911 */
912static void 912static void
913fd_map_alloc(int n, uint32_t **lo, uint32_t **hi) 913fd_map_alloc(int n, uint32_t **lo, uint32_t **hi)
914{ 914{
915 uint8_t *ptr; 915 uint8_t *ptr;
916 size_t szlo, szhi; 916 size_t szlo, szhi;
917 917
918 KASSERT(n > NDENTRIES); 918 KASSERT(n > NDENTRIES);
919 919
920 szlo = NDLOSLOTS(n) * sizeof(uint32_t); 920 szlo = NDLOSLOTS(n) * sizeof(uint32_t);
921 szhi = NDHISLOTS(n) * sizeof(uint32_t); 921 szhi = NDHISLOTS(n) * sizeof(uint32_t);
922 ptr = kmem_alloc(szlo + szhi, KM_SLEEP); 922 ptr = kmem_alloc(szlo + szhi, KM_SLEEP);
923 *lo = (uint32_t *)ptr; 923 *lo = (uint32_t *)ptr;
924 *hi = (uint32_t *)(ptr + szlo); 924 *hi = (uint32_t *)(ptr + szlo);
925} 925}
926 926
927/* 927/*
928 * Free descriptor bitmap. 928 * Free descriptor bitmap.
929 */ 929 */
930static void 930static void
931fd_map_free(int n, uint32_t *lo, uint32_t *hi) 931fd_map_free(int n, uint32_t *lo, uint32_t *hi)
932{ 932{
933 size_t szlo, szhi; 933 size_t szlo, szhi;
934 934
935 KASSERT(n > NDENTRIES); 935 KASSERT(n > NDENTRIES);
936 936
937 szlo = NDLOSLOTS(n) * sizeof(uint32_t); 937 szlo = NDLOSLOTS(n) * sizeof(uint32_t);
938 szhi = NDHISLOTS(n) * sizeof(uint32_t); 938 szhi = NDHISLOTS(n) * sizeof(uint32_t);
939 KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo)); 939 KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo));
940 kmem_free(lo, szlo + szhi); 940 kmem_free(lo, szlo + szhi);
941} 941}
942 942
943/* 943/*
944 * Expand a process' descriptor table. 944 * Expand a process' descriptor table.
945 */ 945 */
946void 946void
947fd_tryexpand(proc_t *p) 947fd_tryexpand(proc_t *p)
948{ 948{
949 filedesc_t *fdp; 949 filedesc_t *fdp;
950 int i, numfiles, oldnfiles; 950 int i, numfiles, oldnfiles;
951 fdtab_t *newdt, *dt; 951 fdtab_t *newdt, *dt;
952 uint32_t *newhimap, *newlomap; 952 uint32_t *newhimap, *newlomap;
953 953
954 KASSERT(p == curproc || p == &proc0); 954 KASSERT(p == curproc || p == &proc0);
955 955
956 fdp = p->p_fd; 956 fdp = p->p_fd;
957 newhimap = NULL; 957 newhimap = NULL;
958 newlomap = NULL; 958 newlomap = NULL;
959 oldnfiles = fdp->fd_dt->dt_nfiles; 959 oldnfiles = fdp->fd_dt->dt_nfiles;
960 960
961 if (oldnfiles < NDEXTENT) 961 if (oldnfiles < NDEXTENT)
962 numfiles = NDEXTENT; 962 numfiles = NDEXTENT;
963 else 963 else
964 numfiles = 2 * oldnfiles; 964 numfiles = 2 * oldnfiles;
965 965
966 newdt = fd_dtab_alloc(numfiles); 966 newdt = fd_dtab_alloc(numfiles);
967 if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) { 967 if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
968 fd_map_alloc(numfiles, &newlomap, &newhimap); 968 fd_map_alloc(numfiles, &newlomap, &newhimap);
969 } 969 }
970 970
971 mutex_enter(&fdp->fd_lock); 971 mutex_enter(&fdp->fd_lock);
972 dt = fdp->fd_dt; 972 dt = fdp->fd_dt;
973 KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]); 973 KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
974 if (dt->dt_nfiles != oldnfiles) { 974 if (dt->dt_nfiles != oldnfiles) {
975 /* fdp changed; caller must retry */ 975 /* fdp changed; caller must retry */
976 mutex_exit(&fdp->fd_lock); 976 mutex_exit(&fdp->fd_lock);
977 fd_dtab_free(newdt); 977 fd_dtab_free(newdt);
978 if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) { 978 if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
979 fd_map_free(numfiles, newlomap, newhimap); 979 fd_map_free(numfiles, newlomap, newhimap);
980 } 980 }
981 return; 981 return;
982 } 982 }
983 983
984 /* Copy the existing descriptor table and zero the new portion. */ 984 /* Copy the existing descriptor table and zero the new portion. */
985 i = sizeof(fdfile_t *) * oldnfiles; 985 i = sizeof(fdfile_t *) * oldnfiles;
986 memcpy(newdt->dt_ff, dt->dt_ff, i); 986 memcpy(newdt->dt_ff, dt->dt_ff, i);
987 memset((uint8_t *)newdt->dt_ff + i, 0, 987 memset((uint8_t *)newdt->dt_ff + i, 0,
988 numfiles * sizeof(fdfile_t *) - i); 988 numfiles * sizeof(fdfile_t *) - i);
989 989
990 /* 990 /*
991 * Link old descriptor array into list to be discarded. We defer 991 * Link old descriptor array into list to be discarded. We defer
992 * freeing until the last reference to the descriptor table goes 992 * freeing until the last reference to the descriptor table goes
993 * away (usually process exit). This allows us to do lockless 993 * away (usually process exit). This allows us to do lockless
994 * lookups in fd_getfile(). 994 * lookups in fd_getfile().
995 */ 995 */
996 if (oldnfiles > NDFILE) { 996 if (oldnfiles > NDFILE) {
997 if (fdp->fd_refcnt > 1) { 997 if (fdp->fd_refcnt > 1) {
998 newdt->dt_link = dt; 998 newdt->dt_link = dt;
999 } else { 999 } else {
1000 fd_dtab_free(dt); 1000 fd_dtab_free(dt);
1001 } 1001 }
1002 } 1002 }
1003 1003
1004 if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) { 1004 if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
1005 i = NDHISLOTS(oldnfiles) * sizeof(uint32_t); 1005 i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
1006 memcpy(newhimap, fdp->fd_himap, i); 1006 memcpy(newhimap, fdp->fd_himap, i);
1007 memset((uint8_t *)newhimap + i, 0, 1007 memset((uint8_t *)newhimap + i, 0,
1008 NDHISLOTS(numfiles) * sizeof(uint32_t) - i); 1008 NDHISLOTS(numfiles) * sizeof(uint32_t) - i);
1009 1009
1010 i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t); 1010 i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
1011 memcpy(newlomap, fdp->fd_lomap, i); 1011 memcpy(newlomap, fdp->fd_lomap, i);
1012 memset((uint8_t *)newlomap + i, 0, 1012 memset((uint8_t *)newlomap + i, 0,
1013 NDLOSLOTS(numfiles) * sizeof(uint32_t) - i); 1013 NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);
1014 1014
1015 if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) { 1015 if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
1016 fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap); 1016 fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap);
1017 } 1017 }
1018 fdp->fd_himap = newhimap; 1018 fdp->fd_himap = newhimap;
1019 fdp->fd_lomap = newlomap; 1019 fdp->fd_lomap = newlomap;
1020 } 1020 }
1021 1021
1022 /* 1022 /*
1023 * All other modifications must become globally visible before 1023 * All other modifications must become globally visible before
1024 * the change to fd_dt. See fd_getfile(). 1024 * the change to fd_dt. See fd_getfile().
1025 */ 1025 */
1026 membar_producer(); 1026 membar_producer();
1027 fdp->fd_dt = newdt; 1027 fdp->fd_dt = newdt;
1028 KASSERT(newdt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]); 1028 KASSERT(newdt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
1029 fd_checkmaps(fdp); 1029 fd_checkmaps(fdp);
1030 mutex_exit(&fdp->fd_lock); 1030 mutex_exit(&fdp->fd_lock);
1031} 1031}
1032 1032
1033/* 1033/*
1034 * Create a new open file structure and allocate a file descriptor 1034 * Create a new open file structure and allocate a file descriptor
1035 * for the current process. 1035 * for the current process.
1036 */ 1036 */
1037int 1037int
1038fd_allocfile(file_t **resultfp, int *resultfd) 1038fd_allocfile(file_t **resultfp, int *resultfd)
1039{ 1039{
1040 kauth_cred_t cred; 1040 kauth_cred_t cred;
1041 file_t *fp; 1041 file_t *fp;
1042 proc_t *p; 1042 proc_t *p;
1043 int error; 1043 int error;
1044 1044
1045 p = curproc; 1045 p = curproc;
1046 1046
1047 while ((error = fd_alloc(p, 0, resultfd)) != 0) { 1047 while ((error = fd_alloc(p, 0, resultfd)) != 0) {
1048 if (error != ENOSPC) { 1048 if (error != ENOSPC) {
1049 return error; 1049 return error;
1050 } 1050 }
1051 fd_tryexpand(p); 1051 fd_tryexpand(p);
1052 } 1052 }
1053 1053
1054 fp = pool_cache_get(file_cache, PR_WAITOK); 1054 fp = pool_cache_get(file_cache, PR_WAITOK);
1055 if (fp == NULL) { 1055 if (fp == NULL) {
1056 return ENFILE; 1056 return ENFILE;
1057 } 1057 }
1058 KASSERT(fp->f_count == 0); 1058 KASSERT(fp->f_count == 0);
1059 KASSERT(fp->f_msgcount == 0); 1059 KASSERT(fp->f_msgcount == 0);
1060 KASSERT(fp->f_unpcount == 0); 1060 KASSERT(fp->f_unpcount == 0);
1061 1061
1062 /* Replace cached credentials if not what we need. */ 1062 /* Replace cached credentials if not what we need. */
1063 cred = curlwp->l_cred; 1063 cred = curlwp->l_cred;
1064 if (__predict_false(cred != fp->f_cred)) { 1064 if (__predict_false(cred != fp->f_cred)) {
1065 kauth_cred_free(fp->f_cred); 1065 kauth_cred_free(fp->f_cred);
1066 kauth_cred_hold(cred); 1066 kauth_cred_hold(cred);
1067 fp->f_cred = cred; 1067 fp->f_cred = cred;
1068 } 1068 }
1069 1069
1070 /* 1070 /*
1071 * Don't allow recycled files to be scanned. 1071 * Don't allow recycled files to be scanned.
1072 * See uipc_usrreq.c. 1072 * See uipc_usrreq.c.
1073 */ 1073 */
1074 if (__predict_false((fp->f_flag & FSCAN) != 0)) { 1074 if (__predict_false((fp->f_flag & FSCAN) != 0)) {
1075 mutex_enter(&fp->f_lock); 1075 mutex_enter(&fp->f_lock);
1076 atomic_and_uint(&fp->f_flag, ~FSCAN); 1076 atomic_and_uint(&fp->f_flag, ~FSCAN);
1077 mutex_exit(&fp->f_lock); 1077 mutex_exit(&fp->f_lock);
1078 } 1078 }
1079 1079
1080 fp->f_advice = 0; 1080 fp->f_advice = 0;
1081 fp->f_offset = 0; 1081 fp->f_offset = 0;
1082 *resultfp = fp; 1082 *resultfp = fp;
1083 1083
1084 return 0; 1084 return 0;
1085} 1085}
1086 1086
1087/* 1087/*
1088 * Successful creation of a new descriptor: make visible to the process. 1088 * Successful creation of a new descriptor: make visible to the process.
1089 */ 1089 */
1090void 1090void
1091fd_affix(proc_t *p, file_t *fp, unsigned fd) 1091fd_affix(proc_t *p, file_t *fp, unsigned fd)
1092{ 1092{
1093 fdfile_t *ff; 1093 fdfile_t *ff;
1094 filedesc_t *fdp; 1094 filedesc_t *fdp;
1095 1095
1096 KASSERT(p == curproc || p == &proc0); 1096 KASSERT(p == curproc || p == &proc0);
1097 1097
1098 /* Add a reference to the file structure. */ 1098 /* Add a reference to the file structure. */
1099 mutex_enter(&fp->f_lock); 1099 mutex_enter(&fp->f_lock);
1100 fp->f_count++; 1100 fp->f_count++;
1101 mutex_exit(&fp->f_lock); 1101 mutex_exit(&fp->f_lock);
1102 1102
1103 /* 1103 /*
1104 * Insert the new file into the descriptor slot. 1104 * Insert the new file into the descriptor slot.
1105 * 1105 *
1106 * The memory barriers provided by lock activity in this routine 1106 * The memory barriers provided by lock activity in this routine
1107 * ensure that any updates to the file structure become globally 1107 * ensure that any updates to the file structure become globally
1108 * visible before the file becomes visible to other LWPs in the 1108 * visible before the file becomes visible to other LWPs in the
1109 * current process. 1109 * current process.
1110 */ 1110 */
1111 fdp = p->p_fd; 1111 fdp = p->p_fd;
1112 ff = fdp->fd_dt->dt_ff[fd]; 1112 ff = fdp->fd_dt->dt_ff[fd];
1113 1113
1114 KASSERT(ff != NULL); 1114 KASSERT(ff != NULL);
1115 KASSERT(ff->ff_file == NULL); 1115 KASSERT(ff->ff_file == NULL);
1116 KASSERT(ff->ff_allocated); 1116 KASSERT(ff->ff_allocated);
1117 KASSERT(fd_isused(fdp, fd)); 1117 KASSERT(fd_isused(fdp, fd));
1118 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]); 1118 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1119 1119
1120 /* No need to lock in order to make file initially visible. */ 1120 /* No need to lock in order to make file initially visible. */
1121 ff->ff_file = fp; 1121 ff->ff_file = fp;
1122} 1122}
1123 1123
1124/* 1124/*
1125 * Abort creation of a new descriptor: free descriptor slot and file. 1125 * Abort creation of a new descriptor: free descriptor slot and file.
1126 */ 1126 */
1127void 1127void
1128fd_abort(proc_t *p, file_t *fp, unsigned fd) 1128fd_abort(proc_t *p, file_t *fp, unsigned fd)
1129{ 1129{
1130 filedesc_t *fdp; 1130 filedesc_t *fdp;
1131 fdfile_t *ff; 1131 fdfile_t *ff;
1132 1132
1133 KASSERT(p == curproc || p == &proc0); 1133 KASSERT(p == curproc || p == &proc0);
1134 1134
1135 fdp = p->p_fd; 1135 fdp = p->p_fd;
1136 ff = fdp->fd_dt->dt_ff[fd]; 1136 ff = fdp->fd_dt->dt_ff[fd];
1137 1137
1138 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]); 1138 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1139 1139
1140 mutex_enter(&fdp->fd_lock); 1140 mutex_enter(&fdp->fd_lock);
1141 KASSERT(fd_isused(fdp, fd)); 1141 KASSERT(fd_isused(fdp, fd));
1142 fd_unused(fdp, fd); 1142 fd_unused(fdp, fd);
1143 mutex_exit(&fdp->fd_lock); 1143 mutex_exit(&fdp->fd_lock);
1144 1144
1145 if (fp != NULL) { 1145 if (fp != NULL) {
1146 KASSERT(fp->f_count == 0); 1146 KASSERT(fp->f_count == 0);
1147 KASSERT(fp->f_cred != NULL); 1147 KASSERT(fp->f_cred != NULL);
1148 pool_cache_put(file_cache, fp); 1148 pool_cache_put(file_cache, fp);
1149 } 1149 }
1150} 1150}
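Taken together, fd_allocfile(), fd_affix() and fd_abort() form the usual open-path protocol: reserve a descriptor slot and a file, initialize the file, then either publish it or back out. A hedged sketch of a caller following that protocol; my_setup_file() is a hypothetical initialization step, not an existing kernel function:

int
open_example(int *fd_out)
{
	file_t *fp;
	int fd, error;

	error = fd_allocfile(&fp, &fd);		/* reserve slot and file */
	if (error != 0)
		return error;

	error = my_setup_file(fp);		/* hypothetical: set f_type, f_ops, f_data */
	if (error != 0) {
		fd_abort(curproc, fp, fd);	/* release the slot and the file */
		return error;
	}

	fd_affix(curproc, fp, fd);		/* make the descriptor visible */
	*fd_out = fd;
	return 0;
}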
1151 1151
1152static int 1152static int
1153file_ctor(void *arg, void *obj, int flags) 1153file_ctor(void *arg, void *obj, int flags)
1154{ 1154{
1155 file_t *fp = obj; 1155 file_t *fp = obj;
1156 1156
1157 memset(fp, 0, sizeof(*fp)); 1157 memset(fp, 0, sizeof(*fp));
1158 1158
1159 mutex_enter(&filelist_lock); 1159 mutex_enter(&filelist_lock);
1160 if (__predict_false(nfiles >= maxfiles)) { 1160 if (__predict_false(nfiles >= maxfiles)) {
1161 mutex_exit(&filelist_lock); 1161 mutex_exit(&filelist_lock);
1162 tablefull("file", "increase kern.maxfiles or MAXFILES"); 1162 tablefull("file", "increase kern.maxfiles or MAXFILES");
1163 return ENFILE; 1163 return ENFILE;
1164 } 1164 }
1165 nfiles++; 1165 nfiles++;
1166 LIST_INSERT_HEAD(&filehead, fp, f_list); 1166 LIST_INSERT_HEAD(&filehead, fp, f_list);
1167 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE); 1167 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1168 fp->f_cred = curlwp->l_cred; 1168 fp->f_cred = curlwp->l_cred;
1169 kauth_cred_hold(fp->f_cred); 1169 kauth_cred_hold(fp->f_cred);
1170 mutex_exit(&filelist_lock); 1170 mutex_exit(&filelist_lock);
1171 1171
1172 return 0; 1172 return 0;
1173} 1173}
1174 1174
1175static void 1175static void
1176file_dtor(void *arg, void *obj) 1176file_dtor(void *arg, void *obj)
1177{ 1177{
1178 file_t *fp = obj; 1178 file_t *fp = obj;
1179 1179
1180 mutex_enter(&filelist_lock); 1180 mutex_enter(&filelist_lock);
1181 nfiles--; 1181 nfiles--;
1182 LIST_REMOVE(fp, f_list); 1182 LIST_REMOVE(fp, f_list);
1183 mutex_exit(&filelist_lock); 1183 mutex_exit(&filelist_lock);
1184 1184
1185 kauth_cred_free(fp->f_cred); 1185 kauth_cred_free(fp->f_cred);
1186 mutex_destroy(&fp->f_lock); 1186 mutex_destroy(&fp->f_lock);
1187} 1187}
1188 1188
1189static int 1189static int
1190fdfile_ctor(void *arg, void *obj, int flags) 1190fdfile_ctor(void *arg, void *obj, int flags)
1191{ 1191{
1192 fdfile_t *ff = obj; 1192 fdfile_t *ff = obj;
1193 1193
1194 memset(ff, 0, sizeof(*ff)); 1194 memset(ff, 0, sizeof(*ff));
1195 cv_init(&ff->ff_closing, "fdclose"); 1195 cv_init(&ff->ff_closing, "fdclose");
1196 1196
1197 return 0; 1197 return 0;
1198} 1198}
1199 1199
1200static void 1200static void
1201fdfile_dtor(void *arg, void *obj) 1201fdfile_dtor(void *arg, void *obj)
1202{ 1202{
1203 fdfile_t *ff = obj; 1203 fdfile_t *ff = obj;
1204 1204
1205 cv_destroy(&ff->ff_closing); 1205 cv_destroy(&ff->ff_closing);
1206} 1206}
1207 1207
1208file_t * 1208file_t *
1209fgetdummy(void) 1209fgetdummy(void)
1210{ 1210{
1211 file_t *fp; 1211 file_t *fp;
1212 1212
1213 fp = kmem_alloc(sizeof(*fp), KM_SLEEP); 1213 fp = kmem_alloc(sizeof(*fp), KM_SLEEP);
1214 if (fp != NULL) { 1214 if (fp != NULL) {
1215 memset(fp, 0, sizeof(*fp)); 1215 memset(fp, 0, sizeof(*fp));
1216 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE); 1216 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1217 } 1217 }
1218 return fp; 1218 return fp;
1219} 1219}
1220 1220
1221void 1221void
1222fputdummy(file_t *fp) 1222fputdummy(file_t *fp)
1223{ 1223{
1224 1224
1225 mutex_destroy(&fp->f_lock); 1225 mutex_destroy(&fp->f_lock);
1226 kmem_free(fp, sizeof(*fp)); 1226 kmem_free(fp, sizeof(*fp));
1227} 1227}
1228 1228
1229/* 1229/*
1230 * Create an initial filedesc structure. 1230 * Create an initial filedesc structure.
1231 */ 1231 */
1232filedesc_t * 1232filedesc_t *
1233fd_init(filedesc_t *fdp) 1233fd_init(filedesc_t *fdp)
1234{ 1234{
1235#ifdef DIAGNOSTIC 1235#ifdef DIAGNOSTIC
1236 unsigned fd; 1236 unsigned fd;
1237#endif 1237#endif
1238 1238
1239 if (__predict_true(fdp == NULL)) { 1239 if (__predict_true(fdp == NULL)) {
1240 fdp = pool_cache_get(filedesc_cache, PR_WAITOK); 1240 fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1241 } else { 1241 } else {
1242 /* XXXRUMP KASSERT(fdp == &filedesc0); */ 1242 /* XXXRUMP KASSERT(fdp == &filedesc0); */
1243 filedesc_ctor(NULL, fdp, PR_WAITOK); 1243 filedesc_ctor(NULL, fdp, PR_WAITOK);
1244 } 1244 }
1245 1245
1246#ifdef DIAGNOSTIC 1246#ifdef DIAGNOSTIC
1247 KASSERT(fdp->fd_lastfile == -1); 1247 KASSERT(fdp->fd_lastfile == -1);
1248 KASSERT(fdp->fd_lastkqfile == -1); 1248 KASSERT(fdp->fd_lastkqfile == -1);
1249 KASSERT(fdp->fd_knhash == NULL); 1249 KASSERT(fdp->fd_knhash == NULL);
1250 KASSERT(fdp->fd_freefile == 0); 1250 KASSERT(fdp->fd_freefile == 0);
1251 KASSERT(fdp->fd_exclose == false); 1251 KASSERT(fdp->fd_exclose == false);
1252 KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin); 1252 KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
1253 KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE); 1253 KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
1254 for (fd = 0; fd < NDFDFILE; fd++) { 1254 for (fd = 0; fd < NDFDFILE; fd++) {
1255 KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] == 1255 KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] ==
1256 (fdfile_t *)fdp->fd_dfdfile[fd]); 1256 (fdfile_t *)fdp->fd_dfdfile[fd]);
1257 } 1257 }
1258 for (fd = NDFDFILE; fd < NDFILE; fd++) { 1258 for (fd = NDFDFILE; fd < NDFILE; fd++) {
1259 KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] == NULL); 1259 KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] == NULL);
1260 } 1260 }
1261 KASSERT(fdp->fd_himap == fdp->fd_dhimap); 1261 KASSERT(fdp->fd_himap == fdp->fd_dhimap);
1262 KASSERT(fdp->fd_lomap == fdp->fd_dlomap); 1262 KASSERT(fdp->fd_lomap == fdp->fd_dlomap);
1263#endif /* DIAGNOSTIC */ 1263#endif /* DIAGNOSTIC */
1264 1264
1265 fdp->fd_refcnt = 1; 1265 fdp->fd_refcnt = 1;
1266 fd_checkmaps(fdp); 1266 fd_checkmaps(fdp);
1267 1267
1268 return fdp; 1268 return fdp;
1269} 1269}
1270 1270
1271/* 1271/*
1272 * Initialize a file descriptor table. 1272 * Initialize a file descriptor table.
1273 */ 1273 */
1274static int 1274static int
1275filedesc_ctor(void *arg, void *obj, int flag) 1275filedesc_ctor(void *arg, void *obj, int flag)
1276{ 1276{
1277 filedesc_t *fdp = obj; 1277 filedesc_t *fdp = obj;
1278 fdfile_t **ffp; 1278 fdfile_t **ffp;
1279 int i; 1279 int i;
1280 1280
1281 memset(fdp, 0, sizeof(*fdp)); 1281 memset(fdp, 0, sizeof(*fdp));
1282 mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE); 1282 mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
1283 fdp->fd_lastfile = -1; 1283 fdp->fd_lastfile = -1;
1284 fdp->fd_lastkqfile = -1; 1284 fdp->fd_lastkqfile = -1;
1285 fdp->fd_dt = &fdp->fd_dtbuiltin; 1285 fdp->fd_dt = &fdp->fd_dtbuiltin;
1286 fdp->fd_dtbuiltin.dt_nfiles = NDFILE; 1286 fdp->fd_dtbuiltin.dt_nfiles = NDFILE;
1287 fdp->fd_himap = fdp->fd_dhimap; 1287 fdp->fd_himap = fdp->fd_dhimap;
1288 fdp->fd_lomap = fdp->fd_dlomap; 1288 fdp->fd_lomap = fdp->fd_dlomap;
1289 1289
1290 CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t)); 1290 CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
1291 for (i = 0, ffp = fdp->fd_dt->dt_ff; i < NDFDFILE; i++, ffp++) { 1291 for (i = 0, ffp = fdp->fd_dt->dt_ff; i < NDFDFILE; i++, ffp++) {
1292 *ffp = (fdfile_t *)fdp->fd_dfdfile[i]; 1292 *ffp = (fdfile_t *)fdp->fd_dfdfile[i];
1293 (void)fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK); 1293 (void)fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK);
1294 } 1294 }
1295 1295
1296 return 0; 1296 return 0;
1297} 1297}
1298 1298
1299static void 1299static void
1300filedesc_dtor(void *arg, void *obj) 1300filedesc_dtor(void *arg, void *obj)
1301{ 1301{
1302 filedesc_t *fdp = obj; 1302 filedesc_t *fdp = obj;
1303 int i; 1303 int i;
1304 1304
1305 for (i = 0; i < NDFDFILE; i++) { 1305 for (i = 0; i < NDFDFILE; i++) {
1306 fdfile_dtor(NULL, fdp->fd_dfdfile[i]); 1306 fdfile_dtor(NULL, fdp->fd_dfdfile[i]);
1307 } 1307 }
1308 1308
1309 mutex_destroy(&fdp->fd_lock); 1309 mutex_destroy(&fdp->fd_lock);
1310} 1310}
1311 1311
1312/* 1312/*
1313 * Make p2 share p1's filedesc structure. 1313 * Make p2 share p1's filedesc structure.
1314 */ 1314 */
1315void 1315void
1316fd_share(struct proc *p2) 1316fd_share(struct proc *p2)
1317{ 1317{
1318 filedesc_t *fdp; 1318 filedesc_t *fdp;
1319 1319
1320 fdp = curlwp->l_fd; 1320 fdp = curlwp->l_fd;
1321 p2->p_fd = fdp; 1321 p2->p_fd = fdp;
1322 atomic_inc_uint(&fdp->fd_refcnt); 1322 atomic_inc_uint(&fdp->fd_refcnt);
1323} 1323}
1324 1324
1325/* 1325/*
1326 * Acquire a hold on a filedesc structure. 1326 * Acquire a hold on a filedesc structure.
1327 */ 1327 */
1328void 1328void
1329fd_hold(void) 1329fd_hold(lwp_t *l)
1330{ 1330{
 1331 filedesc_t *fdp = l->l_fd;
1331 1332
1332 atomic_inc_uint(&curlwp->l_fd->fd_refcnt); 1333 KASSERT(fdp == curlwp->l_fd || fdp == lwp0.l_fd);
 1334 atomic_inc_uint(&fdp->fd_refcnt);
1333} 1335}
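With the lwp_t argument, a caller that is wiring up a new LWP can assign its l_fd and take the reference through fd_hold() instead of touching fd_refcnt directly; the KASSERT admits both the common case (the table of the creating LWP) and creation from lwp0. A rough sketch of the intended call site, simplified from what an LWP-creation path would do; the corresponding caller change is part of the kern_lwp.c diff:

	/* l2 is the LWP being created; p2 is the process it will belong to. */
	l2->l_fd = p2->p_fd;	/* inherit the descriptor table pointer */
	fd_hold(l2);		/* take the reference via the new interface */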
1334 1336
1335/* 1337/*
1336 * Copy a filedesc structure. 1338 * Copy a filedesc structure.
1337 */ 1339 */
1338filedesc_t * 1340filedesc_t *
1339fd_copy(void) 1341fd_copy(void)
1340{ 1342{
1341 filedesc_t *newfdp, *fdp; 1343 filedesc_t *newfdp, *fdp;
1342 fdfile_t *ff, **ffp, **nffp, *ff2; 1344 fdfile_t *ff, **ffp, **nffp, *ff2;
1343 int i, j, numfiles, lastfile, newlast; 1345 int i, j, numfiles, lastfile, newlast;
1344 file_t *fp; 1346 file_t *fp;
1345 fdtab_t *newdt; 1347 fdtab_t *newdt;
1346 1348
1347 fdp = curproc->p_fd; 1349 fdp = curproc->p_fd;
1348 newfdp = pool_cache_get(filedesc_cache, PR_WAITOK); 1350 newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1349 newfdp->fd_refcnt = 1; 1351 newfdp->fd_refcnt = 1;
1350 1352
1351#ifdef DIAGNOSTIC 1353#ifdef DIAGNOSTIC
1352 KASSERT(newfdp->fd_lastfile == -1); 1354 KASSERT(newfdp->fd_lastfile == -1);
1353 KASSERT(newfdp->fd_lastkqfile == -1); 1355 KASSERT(newfdp->fd_lastkqfile == -1);
1354 KASSERT(newfdp->fd_knhash == NULL); 1356 KASSERT(newfdp->fd_knhash == NULL);
1355 KASSERT(newfdp->fd_freefile == 0); 1357 KASSERT(newfdp->fd_freefile == 0);
1356 KASSERT(newfdp->fd_exclose == false); 1358 KASSERT(newfdp->fd_exclose == false);
1357 KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin); 1359 KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
1358 KASSERT(newfdp->fd_dtbuiltin.dt_nfiles == NDFILE); 1360 KASSERT(newfdp->fd_dtbuiltin.dt_nfiles == NDFILE);
1359 for (i = 0; i < NDFDFILE; i++) { 1361 for (i = 0; i < NDFDFILE; i++) {
1360 KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] == 1362 KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] ==
1361 (fdfile_t *)&newfdp->fd_dfdfile[i]); 1363 (fdfile_t *)&newfdp->fd_dfdfile[i]);
1362 } 1364 }
1363 for (i = NDFDFILE; i < NDFILE; i++) { 1365 for (i = NDFDFILE; i < NDFILE; i++) {
1364 KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] == NULL); 1366 KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] == NULL);
1365 } 1367 }
1366#endif /* DIAGNOSTIC */ 1368#endif /* DIAGNOSTIC */
1367 1369
1368 mutex_enter(&fdp->fd_lock); 1370 mutex_enter(&fdp->fd_lock);
1369 fd_checkmaps(fdp); 1371 fd_checkmaps(fdp);
1370 numfiles = fdp->fd_dt->dt_nfiles; 1372 numfiles = fdp->fd_dt->dt_nfiles;
1371 lastfile = fdp->fd_lastfile; 1373 lastfile = fdp->fd_lastfile;
1372 1374
1373 /* 1375 /*
1374 * If the number of open files fits in the internal arrays 1376 * If the number of open files fits in the internal arrays
1375 * of the open file structure, use them, otherwise allocate 1377 * of the open file structure, use them, otherwise allocate
1376 * additional memory for the number of descriptors currently 1378 * additional memory for the number of descriptors currently
1377 * in use. 1379 * in use.
1378 */ 1380 */
1379 if (lastfile < NDFILE) { 1381 if (lastfile < NDFILE) {
1380 i = NDFILE; 1382 i = NDFILE;
1381 newdt = newfdp->fd_dt; 1383 newdt = newfdp->fd_dt;
1382 KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin); 1384 KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
1383 } else { 1385 } else {
1384 /* 1386 /*
1385 * Compute the smallest multiple of NDEXTENT needed 1387 * Compute the smallest multiple of NDEXTENT needed
1386 * for the file descriptors currently in use, 1388 * for the file descriptors currently in use,
1387 * allowing the table to shrink. 1389 * allowing the table to shrink.
1388 */ 1390 */
1389 i = numfiles; 1391 i = numfiles;
1390 while (i >= 2 * NDEXTENT && i > lastfile * 2) { 1392 while (i >= 2 * NDEXTENT && i > lastfile * 2) {
1391 i /= 2; 1393 i /= 2;
1392 } 1394 }
1393 KASSERT(i > NDFILE); 1395 KASSERT(i > NDFILE);
1394 newdt = fd_dtab_alloc(i); 1396 newdt = fd_dtab_alloc(i);
1395 newfdp->fd_dt = newdt; 1397 newfdp->fd_dt = newdt;
1396 memcpy(newdt->dt_ff, newfdp->fd_dtbuiltin.dt_ff, 1398 memcpy(newdt->dt_ff, newfdp->fd_dtbuiltin.dt_ff,
1397 NDFDFILE * sizeof(fdfile_t **)); 1399 NDFDFILE * sizeof(fdfile_t **));
1398 memset(newdt->dt_ff + NDFDFILE, 0, 1400 memset(newdt->dt_ff + NDFDFILE, 0,
1399 (i - NDFDFILE) * sizeof(fdfile_t **)); 1401 (i - NDFDFILE) * sizeof(fdfile_t **));
1400 } 1402 }
1401 if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) { 1403 if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
1402 newfdp->fd_himap = newfdp->fd_dhimap; 1404 newfdp->fd_himap = newfdp->fd_dhimap;
1403 newfdp->fd_lomap = newfdp->fd_dlomap; 1405 newfdp->fd_lomap = newfdp->fd_dlomap;
1404 } else { 1406 } else {
1405 fd_map_alloc(i, &newfdp->fd_lomap, &newfdp->fd_himap); 1407 fd_map_alloc(i, &newfdp->fd_lomap, &newfdp->fd_himap);
1406 KASSERT(i >= NDENTRIES * NDENTRIES); 1408 KASSERT(i >= NDENTRIES * NDENTRIES);
1407 memset(newfdp->fd_himap, 0, NDHISLOTS(i)*sizeof(uint32_t)); 1409 memset(newfdp->fd_himap, 0, NDHISLOTS(i)*sizeof(uint32_t));
1408 memset(newfdp->fd_lomap, 0, NDLOSLOTS(i)*sizeof(uint32_t)); 1410 memset(newfdp->fd_lomap, 0, NDLOSLOTS(i)*sizeof(uint32_t));
1409 } 1411 }
1410 newfdp->fd_freefile = fdp->fd_freefile; 1412 newfdp->fd_freefile = fdp->fd_freefile;
1411 newfdp->fd_exclose = fdp->fd_exclose; 1413 newfdp->fd_exclose = fdp->fd_exclose;
1412 1414
1413 ffp = fdp->fd_dt->dt_ff; 1415 ffp = fdp->fd_dt->dt_ff;
1414 nffp = newdt->dt_ff; 1416 nffp = newdt->dt_ff;
1415 newlast = -1; 1417 newlast = -1;
1416 for (i = 0; i <= (int)lastfile; i++, ffp++, nffp++) { 1418 for (i = 0; i <= (int)lastfile; i++, ffp++, nffp++) {
1417 KASSERT(i >= NDFDFILE || 1419 KASSERT(i >= NDFDFILE ||
1418 *nffp == (fdfile_t *)newfdp->fd_dfdfile[i]); 1420 *nffp == (fdfile_t *)newfdp->fd_dfdfile[i]);
1419 ff = *ffp; 1421 ff = *ffp;
1420 if (ff == NULL || (fp = ff->ff_file) == NULL) { 1422 if (ff == NULL || (fp = ff->ff_file) == NULL) {
1421 /* Descriptor unused, or descriptor half open. */ 1423 /* Descriptor unused, or descriptor half open. */
1422 KASSERT(!fd_isused(newfdp, i)); 1424 KASSERT(!fd_isused(newfdp, i));
1423 continue; 1425 continue;
1424 } 1426 }
1425 if (__predict_false(fp->f_type == DTYPE_KQUEUE)) { 1427 if (__predict_false(fp->f_type == DTYPE_KQUEUE)) {
1426 /* kqueue descriptors cannot be copied. */ 1428 /* kqueue descriptors cannot be copied. */
1427 if (i < newfdp->fd_freefile) 1429 if (i < newfdp->fd_freefile)
1428 newfdp->fd_freefile = i; 1430 newfdp->fd_freefile = i;
1429 continue; 1431 continue;
1430 } 1432 }
1431 /* It's active: add a reference to the file. */ 1433 /* It's active: add a reference to the file. */
1432 mutex_enter(&fp->f_lock); 1434 mutex_enter(&fp->f_lock);
1433 fp->f_count++; 1435 fp->f_count++;
1434 mutex_exit(&fp->f_lock); 1436 mutex_exit(&fp->f_lock);
1435 1437
1436 /* Allocate an fdfile_t to represent it. */ 1438 /* Allocate an fdfile_t to represent it. */
1437 if (i >= NDFDFILE) { 1439 if (i >= NDFDFILE) {
1438 ff2 = pool_cache_get(fdfile_cache, PR_WAITOK); 1440 ff2 = pool_cache_get(fdfile_cache, PR_WAITOK);
1439 *nffp = ff2; 1441 *nffp = ff2;
1440 } else { 1442 } else {
1441 ff2 = newdt->dt_ff[i]; 1443 ff2 = newdt->dt_ff[i];
1442 } 1444 }
1443 ff2->ff_file = fp; 1445 ff2->ff_file = fp;
1444 ff2->ff_exclose = ff->ff_exclose; 1446 ff2->ff_exclose = ff->ff_exclose;
1445 ff2->ff_allocated = true; 1447 ff2->ff_allocated = true;
1446 1448
1447 /* Fix up bitmaps. */ 1449 /* Fix up bitmaps. */
1448 j = i >> NDENTRYSHIFT; 1450 j = i >> NDENTRYSHIFT;
1449 KASSERT((newfdp->fd_lomap[j] & (1 << (i & NDENTRYMASK))) == 0); 1451 KASSERT((newfdp->fd_lomap[j] & (1 << (i & NDENTRYMASK))) == 0);
1450 newfdp->fd_lomap[j] |= 1 << (i & NDENTRYMASK); 1452 newfdp->fd_lomap[j] |= 1 << (i & NDENTRYMASK);
1451 if (__predict_false(newfdp->fd_lomap[j] == ~0)) { 1453 if (__predict_false(newfdp->fd_lomap[j] == ~0)) {
1452 KASSERT((newfdp->fd_himap[j >> NDENTRYSHIFT] & 1454 KASSERT((newfdp->fd_himap[j >> NDENTRYSHIFT] &
1453 (1 << (j & NDENTRYMASK))) == 0); 1455 (1 << (j & NDENTRYMASK))) == 0);
1454 newfdp->fd_himap[j >> NDENTRYSHIFT] |= 1456 newfdp->fd_himap[j >> NDENTRYSHIFT] |=
1455 1 << (j & NDENTRYMASK); 1457 1 << (j & NDENTRYMASK);
1456 } 1458 }
1457 newlast = i; 1459 newlast = i;
1458 } 1460 }
1459 KASSERT(newdt->dt_ff[0] == (fdfile_t *)newfdp->fd_dfdfile[0]); 1461 KASSERT(newdt->dt_ff[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
1460 newfdp->fd_lastfile = newlast; 1462 newfdp->fd_lastfile = newlast;
1461 fd_checkmaps(newfdp); 1463 fd_checkmaps(newfdp);
1462 mutex_exit(&fdp->fd_lock); 1464 mutex_exit(&fdp->fd_lock);
1463  1465
1464 return (newfdp); 1466 return (newfdp);
1465} 1467}
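fd_copy() is the fork-time counterpart of fd_share(): the child gets a private table carrying an extra reference on every open file, while kqueue descriptors are skipped because they cannot be meaningfully copied. Roughly, process creation chooses between the two as sketched below; this is an illustration of the decision, not a verbatim fork1() excerpt:

	if (flags & FORK_SHAREFILES)
		fd_share(p2);		/* child references the parent's filedesc_t */
	else
		p2->p_fd = fd_copy();	/* child gets its own copy */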
1466 1468
1467/* 1469/*
1468 * Release a filedesc structure. 1470 * Release a filedesc structure.
1469 */ 1471 */
1470void 1472void
1471fd_free(void) 1473fd_free(void)
1472{ 1474{
1473 fdfile_t *ff; 1475 fdfile_t *ff;
1474 file_t *fp; 1476 file_t *fp;
1475 int fd, nf; 1477 int fd, nf;
1476 fdtab_t *dt; 1478 fdtab_t *dt;
1477 lwp_t * const l = curlwp; 1479 lwp_t * const l = curlwp;
1478 filedesc_t * const fdp = l->l_fd; 1480 filedesc_t * const fdp = l->l_fd;
1479 const bool noadvlock = (l->l_proc->p_flag & PK_ADVLOCK) == 0; 1481 const bool noadvlock = (l->l_proc->p_flag & PK_ADVLOCK) == 0;
1480 1482
1481 KASSERT(fdp->fd_dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]); 1483 KASSERT(fdp->fd_dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
1482 KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE); 1484 KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
1483 KASSERT(fdp->fd_dtbuiltin.dt_link == NULL); 1485 KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
1484 1486
1485#ifndef __HAVE_ATOMIC_AS_MEMBAR 1487#ifndef __HAVE_ATOMIC_AS_MEMBAR
1486 membar_exit(); 1488 membar_exit();
1487#endif 1489#endif
1488 if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0) 1490 if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
1489 return; 1491 return;
1490 1492
1491 /* 1493 /*
1492 * Close any files that the process holds open. 1494 * Close any files that the process holds open.
1493 */ 1495 */
1494 dt = fdp->fd_dt; 1496 dt = fdp->fd_dt;
1495 fd_checkmaps(fdp); 1497 fd_checkmaps(fdp);
1496#ifdef DEBUG 1498#ifdef DEBUG
1497 fdp->fd_refcnt = -1; /* see fd_checkmaps */ 1499 fdp->fd_refcnt = -1; /* see fd_checkmaps */
1498#endif 1500#endif
1499 for (fd = 0, nf = dt->dt_nfiles; fd < nf; fd++) { 1501 for (fd = 0, nf = dt->dt_nfiles; fd < nf; fd++) {
1500 ff = dt->dt_ff[fd]; 1502 ff = dt->dt_ff[fd];
1501 KASSERT(fd >= NDFDFILE || 1503 KASSERT(fd >= NDFDFILE ||
1502 ff == (fdfile_t *)fdp->fd_dfdfile[fd]); 1504 ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1503 if (ff == NULL) 1505 if (ff == NULL)
1504 continue; 1506 continue;
1505 if ((fp = ff->ff_file) != NULL) { 1507 if ((fp = ff->ff_file) != NULL) {
1506 /* 1508 /*
1507 * Must use fd_close() here if there is 1509 * Must use fd_close() here if there is
1508 * a reference from kqueue or we might have posix 1510 * a reference from kqueue or we might have posix
1509 * advisory locks. 1511 * advisory locks.
1510 */ 1512 */
1511 if (__predict_true(ff->ff_refcnt == 0) && 1513 if (__predict_true(ff->ff_refcnt == 0) &&
1512 (noadvlock || fp->f_type != DTYPE_VNODE)) { 1514 (noadvlock || fp->f_type != DTYPE_VNODE)) {
1513 ff->ff_file = NULL; 1515 ff->ff_file = NULL;
1514 ff->ff_exclose = false; 1516 ff->ff_exclose = false;
1515 ff->ff_allocated = false; 1517 ff->ff_allocated = false;
1516 closef(fp); 1518 closef(fp);
1517 } else { 1519 } else {
1518 ff->ff_refcnt++; 1520 ff->ff_refcnt++;
1519 fd_close(fd); 1521 fd_close(fd);
1520 } 1522 }
1521 } 1523 }
1522 KASSERT(ff->ff_refcnt == 0); 1524 KASSERT(ff->ff_refcnt == 0);
1523 KASSERT(ff->ff_file == NULL); 1525 KASSERT(ff->ff_file == NULL);
1524 KASSERT(!ff->ff_exclose); 1526 KASSERT(!ff->ff_exclose);
1525 KASSERT(!ff->ff_allocated); 1527 KASSERT(!ff->ff_allocated);
1526 if (fd >= NDFDFILE) { 1528 if (fd >= NDFDFILE) {
1527 pool_cache_put(fdfile_cache, ff); 1529 pool_cache_put(fdfile_cache, ff);
1528 dt->dt_ff[fd] = NULL; 1530 dt->dt_ff[fd] = NULL;
1529 } 1531 }
1530 } 1532 }
1531 1533
1532 /* 1534 /*
1533 * Clean out the descriptor table for the next user and return 1535 * Clean out the descriptor table for the next user and return
1534 * to the cache. 1536 * to the cache.
1535 */ 1537 */
1536 if (__predict_false(dt != &fdp->fd_dtbuiltin)) { 1538 if (__predict_false(dt != &fdp->fd_dtbuiltin)) {
1537 fd_dtab_free(fdp->fd_dt); 1539 fd_dtab_free(fdp->fd_dt);
1538 /* Otherwise, done above. */ 1540 /* Otherwise, done above. */
1539 memset(&fdp->fd_dtbuiltin.dt_ff[NDFDFILE], 0, 1541 memset(&fdp->fd_dtbuiltin.dt_ff[NDFDFILE], 0,
1540 (NDFILE - NDFDFILE) * sizeof(fdp->fd_dtbuiltin.dt_ff[0])); 1542 (NDFILE - NDFDFILE) * sizeof(fdp->fd_dtbuiltin.dt_ff[0]));
1541 fdp->fd_dt = &fdp->fd_dtbuiltin; 1543 fdp->fd_dt = &fdp->fd_dtbuiltin;
1542 } 1544 }
1543 if (__predict_false(NDHISLOTS(nf) > NDHISLOTS(NDFILE))) { 1545 if (__predict_false(NDHISLOTS(nf) > NDHISLOTS(NDFILE))) {
1544 KASSERT(fdp->fd_himap != fdp->fd_dhimap); 1546 KASSERT(fdp->fd_himap != fdp->fd_dhimap);
1545 KASSERT(fdp->fd_lomap != fdp->fd_dlomap); 1547 KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
1546 fd_map_free(nf, fdp->fd_lomap, fdp->fd_himap); 1548 fd_map_free(nf, fdp->fd_lomap, fdp->fd_himap);
1547 } 1549 }
1548 if (__predict_false(fdp->fd_knhash != NULL)) { 1550 if (__predict_false(fdp->fd_knhash != NULL)) {
1549 hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask); 1551 hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
1550 fdp->fd_knhash = NULL; 1552 fdp->fd_knhash = NULL;
1551 fdp->fd_knhashmask = 0; 1553 fdp->fd_knhashmask = 0;
1552 } else { 1554 } else {
1553 KASSERT(fdp->fd_knhashmask == 0); 1555 KASSERT(fdp->fd_knhashmask == 0);
1554 } 1556 }
1555 fdp->fd_dt = &fdp->fd_dtbuiltin; 1557 fdp->fd_dt = &fdp->fd_dtbuiltin;
1556 fdp->fd_lastkqfile = -1; 1558 fdp->fd_lastkqfile = -1;
1557 fdp->fd_lastfile = -1; 1559 fdp->fd_lastfile = -1;
1558 fdp->fd_freefile = 0; 1560 fdp->fd_freefile = 0;
1559 fdp->fd_exclose = false; 1561 fdp->fd_exclose = false;
1560 memset(&fdp->fd_startzero, 0, sizeof(*fdp) - 1562 memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
1561 offsetof(filedesc_t, fd_startzero)); 1563 offsetof(filedesc_t, fd_startzero));
1562 fdp->fd_himap = fdp->fd_dhimap; 1564 fdp->fd_himap = fdp->fd_dhimap;
1563 fdp->fd_lomap = fdp->fd_dlomap; 1565 fdp->fd_lomap = fdp->fd_dlomap;
1564 KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE); 1566 KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
1565 KASSERT(fdp->fd_dtbuiltin.dt_link == NULL); 1567 KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
1566 KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin); 1568 KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
1567#ifdef DEBUG 1569#ifdef DEBUG
1568 fdp->fd_refcnt = 0; /* see fd_checkmaps */ 1570 fdp->fd_refcnt = 0; /* see fd_checkmaps */
1569#endif 1571#endif
1570 fd_checkmaps(fdp); 1572 fd_checkmaps(fdp);
1571 pool_cache_put(filedesc_cache, fdp); 1573 pool_cache_put(filedesc_cache, fdp);
1572} 1574}
1573 1575
1574/* 1576/*
1575 * File Descriptor pseudo-device driver (/dev/fd/). 1577 * File Descriptor pseudo-device driver (/dev/fd/).
1576 * 1578 *
1577 * Opening minor device N dup()s the file (if any) connected to file 1579 * Opening minor device N dup()s the file (if any) connected to file
1578 * descriptor N belonging to the calling process. Note that this driver 1580 * descriptor N belonging to the calling process. Note that this driver
1579 * consists of only the ``open()'' routine, because all subsequent 1581 * consists of only the ``open()'' routine, because all subsequent
1580 * references to this file will be direct to the other driver. 1582 * references to this file will be direct to the other driver.
1581 */ 1583 */
1582static int 1584static int
1583filedescopen(dev_t dev, int mode, int type, lwp_t *l) 1585filedescopen(dev_t dev, int mode, int type, lwp_t *l)
1584{ 1586{
1585 1587
1586 /* 1588 /*
1587 * XXX Kludge: set dupfd to contain the value of the 1589 * XXX Kludge: set dupfd to contain the value of the
1588 * file descriptor being sought for duplication. The error 1590 * file descriptor being sought for duplication. The error
1589 * return ensures that the vnode for this device will be released 1591 * return ensures that the vnode for this device will be released
1590 * by vn_open. Open will detect this special error and take the 1592 * by vn_open. Open will detect this special error and take the
1591 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN 1593 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1592 * will simply report the error. 1594 * will simply report the error.
1593 */ 1595 */
1594 l->l_dupfd = minor(dev); /* XXX */ 1596 l->l_dupfd = minor(dev); /* XXX */
1595 return EDUPFD; 1597 return EDUPFD;
1596} 1598}
1597 1599
1598/* 1600/*
1599 * Duplicate the specified descriptor to a free descriptor. 1601 * Duplicate the specified descriptor to a free descriptor.
1600 */ 1602 */
1601int 1603int
1602fd_dupopen(int old, int *new, int mode, int error) 1604fd_dupopen(int old, int *new, int mode, int error)
1603{ 1605{
1604 filedesc_t *fdp; 1606 filedesc_t *fdp;
1605 fdfile_t *ff; 1607 fdfile_t *ff;
1606 file_t *fp; 1608 file_t *fp;
1607 fdtab_t *dt; 1609 fdtab_t *dt;
1608 1610
1609 if ((fp = fd_getfile(old)) == NULL) { 1611 if ((fp = fd_getfile(old)) == NULL) {
1610 return EBADF; 1612 return EBADF;
1611 } 1613 }
1612 fdp = curlwp->l_fd; 1614 fdp = curlwp->l_fd;
1613 dt = fdp->fd_dt; 1615 dt = fdp->fd_dt;
1614 ff = dt->dt_ff[old]; 1616 ff = dt->dt_ff[old];
1615 1617
1616 /* 1618 /*
1617 * There are two cases of interest here. 1619 * There are two cases of interest here.
1618 * 1620 *
1619 * For EDUPFD simply dup (dfd) to file descriptor 1621 * For EDUPFD simply dup (dfd) to file descriptor
1620 * (indx) and return. 1622 * (indx) and return.
1621 * 1623 *
1622 * For EMOVEFD steal away the file structure from (dfd) and 1624 * For EMOVEFD steal away the file structure from (dfd) and
1623 * store it in (indx). (dfd) is effectively closed by 1625 * store it in (indx). (dfd) is effectively closed by
1624 * this operation. 1626 * this operation.
1625 * 1627 *
1626 * Any other error code is just returned. 1628 * Any other error code is just returned.
1627 */ 1629 */
1628 switch (error) { 1630 switch (error) {
1629 case EDUPFD: 1631 case EDUPFD:
1630 /* 1632 /*
1631 * Check that the mode the file is being opened for is a 1633 * Check that the mode the file is being opened for is a
1632 * subset of the mode of the existing descriptor. 1634 * subset of the mode of the existing descriptor.
1633 */ 1635 */
1634 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { 1636 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
1635 error = EACCES; 1637 error = EACCES;
1636 break; 1638 break;
1637 } 1639 }
1638 1640
1639 /* Copy it. */ 1641 /* Copy it. */
1640 error = fd_dup(fp, 0, new, ff->ff_exclose); 1642 error = fd_dup(fp, 0, new, ff->ff_exclose);
1641 break; 1643 break;
1642 1644
1643 case EMOVEFD: 1645 case EMOVEFD:
1644 /* Copy it. */ 1646 /* Copy it. */
1645 error = fd_dup(fp, 0, new, ff->ff_exclose); 1647 error = fd_dup(fp, 0, new, ff->ff_exclose);
1646 if (error != 0) { 1648 if (error != 0) {
1647 break; 1649 break;
1648 } 1650 }
1649 1651
1650 /* Steal away the file pointer from 'old'. */ 1652 /* Steal away the file pointer from 'old'. */
1651 (void)fd_close(old); 1653 (void)fd_close(old);
1652 return 0; 1654 return 0;
1653 } 1655 }
1654 1656
1655 fd_putfile(old); 1657 fd_putfile(old);
1656 return error; 1658 return error;
1657} 1659}
1658 1660
1659/* 1661/*
1660 * Sets descriptor owner. If the owner is a process, 'pgid' 1662 * Sets descriptor owner. If the owner is a process, 'pgid'
1661 * is set to a positive value, the process ID. If the owner is a process group, 1663 * is set to a positive value, the process ID. If the owner is a process group,
1662 * 'pgid' is set to -pg_id. 1664 * 'pgid' is set to -pg_id.
1663 */ 1665 */
1664int 1666int
1665fsetown(pid_t *pgid, u_long cmd, const void *data) 1667fsetown(pid_t *pgid, u_long cmd, const void *data)
1666{ 1668{
1667 int id = *(const int *)data; 1669 int id = *(const int *)data;
1668 int error; 1670 int error;
1669 1671
1670 switch (cmd) { 1672 switch (cmd) {
1671 case TIOCSPGRP: 1673 case TIOCSPGRP:
1672 if (id < 0) 1674 if (id < 0)
1673 return (EINVAL); 1675 return (EINVAL);
1674 id = -id; 1676 id = -id;
1675 break; 1677 break;
1676 default: 1678 default:
1677 break; 1679 break;
1678 } 1680 }
1679 1681
1680 if (id > 0 && !pfind(id)) 1682 if (id > 0 && !pfind(id))
1681 return (ESRCH); 1683 return (ESRCH);
1682 else if (id < 0 && (error = pgid_in_session(curproc, -id))) 1684 else if (id < 0 && (error = pgid_in_session(curproc, -id)))
1683 return (error); 1685 return (error);
1684 1686
1685 *pgid = id; 1687 *pgid = id;
1686 return (0); 1688 return (0);
1687} 1689}
1688 1690
1689/* 1691/*
1690 * Return descriptor owner information. If the value is positive, 1692 * Return descriptor owner information. If the value is positive,
1691 * it's process ID. If it's negative, it's process group ID and 1693 * it's process ID. If it's negative, it's process group ID and
1692 * needs the sign removed before use. 1694 * needs the sign removed before use.
1693 */ 1695 */
1694int 1696int
1695fgetown(pid_t pgid, u_long cmd, void *data) 1697fgetown(pid_t pgid, u_long cmd, void *data)
1696{ 1698{
1697 1699
1698 switch (cmd) { 1700 switch (cmd) {
1699 case TIOCGPGRP: 1701 case TIOCGPGRP:
1700 *(int *)data = -pgid; 1702 *(int *)data = -pgid;
1701 break; 1703 break;
1702 default: 1704 default:
1703 *(int *)data = pgid; 1705 *(int *)data = pgid;
1704 break; 1706 break;
1705 } 1707 }
1706 return (0); 1708 return (0);
1707} 1709}
1708 1710
1709/* 1711/*
1710 * Send signal to descriptor owner, either process or process group. 1712 * Send signal to descriptor owner, either process or process group.
1711 */ 1713 */
1712void 1714void
1713fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata) 1715fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
1714{ 1716{
1715 ksiginfo_t ksi; 1717 ksiginfo_t ksi;
1716 1718
1717 KASSERT(!cpu_intr_p()); 1719 KASSERT(!cpu_intr_p());
1718 1720
1719 if (pgid == 0) { 1721 if (pgid == 0) {
1720 return; 1722 return;
1721 } 1723 }
1722 1724
1723 KSI_INIT(&ksi); 1725 KSI_INIT(&ksi);
1724 ksi.ksi_signo = signo; 1726 ksi.ksi_signo = signo;
1725 ksi.ksi_code = code; 1727 ksi.ksi_code = code;
1726 ksi.ksi_band = band; 1728 ksi.ksi_band = band;
1727 1729
1728 mutex_enter(proc_lock); 1730 mutex_enter(proc_lock);
1729 if (pgid > 0) { 1731 if (pgid > 0) {
1730 struct proc *p1; 1732 struct proc *p1;
1731 1733
1732 p1 = p_find(pgid, PFIND_LOCKED); 1734 p1 = p_find(pgid, PFIND_LOCKED);
1733 if (p1 != NULL) { 1735 if (p1 != NULL) {
1734 kpsignal(p1, &ksi, fdescdata); 1736 kpsignal(p1, &ksi, fdescdata);
1735 } 1737 }
1736 } else { 1738 } else {
1737 struct pgrp *pgrp; 1739 struct pgrp *pgrp;
1738 1740
1739 KASSERT(pgid < 0); 1741 KASSERT(pgid < 0);
1740 pgrp = pg_find(-pgid, PFIND_LOCKED); 1742 pgrp = pg_find(-pgid, PFIND_LOCKED);
1741 if (pgrp != NULL) { 1743 if (pgrp != NULL) {
1742 kpgsignal(pgrp, &ksi, fdescdata, 0); 1744 kpgsignal(pgrp, &ksi, fdescdata, 0);
1743 } 1745 }
1744 } 1746 }
1745 mutex_exit(proc_lock); 1747 mutex_exit(proc_lock);
1746} 1748}
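fownsignal() is the delivery half of the descriptor-owner machinery: a subsystem that supports SIGIO records the owner set via fsetown() and calls fownsignal() with it when an event occurs, the sign of the value selecting process versus process group as described above. An illustrative call, assuming a hypothetical object that caches its owner in an xx_pgid field:

	/* Notify the registered owner that data has arrived. */
	if (xx->xx_pgid != 0)
		fownsignal(xx->xx_pgid, SIGIO, POLL_IN, POLLIN, xx);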
1747 1749
1748int 1750int
1749fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops, 1751fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
1750 void *data) 1752 void *data)
1751{ 1753{
1752 1754
1753 fp->f_flag = flag; 1755 fp->f_flag = flag;
1754 fp->f_type = DTYPE_MISC; 1756 fp->f_type = DTYPE_MISC;
1755 fp->f_ops = fops; 1757 fp->f_ops = fops;
1756 fp->f_data = data; 1758 fp->f_data = data;
1757 curlwp->l_dupfd = fd; 1759 curlwp->l_dupfd = fd;
1758 fd_affix(curproc, fp, fd); 1760 fd_affix(curproc, fp, fd);
1759 1761
1760 return EMOVEFD; 1762 return EMOVEFD;
1761} 1763}
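fd_clone() is the hook used by cloning devices: the device open routine builds its own file, attaches private fileops and per-open state, and the EMOVEFD return value tells the open path to hand the caller this prepared descriptor instead of the device vnode. A hedged sketch of such an open routine; example_fileops and struct example_softc are placeholders, not existing symbols:

static int
example_open(dev_t dev, int flag, int mode, lwp_t *l)
{
	struct example_softc *sc;
	file_t *fp;
	int fd, error;

	error = fd_allocfile(&fp, &fd);
	if (error != 0)
		return error;
	sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);	/* per-open state */
	/* Attaches sc and the ops to fp, affixes fd, and returns EMOVEFD. */
	return fd_clone(fp, fd, flag, &example_fileops, sc);
}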
1762 1764
1763int 1765int
1764fnullop_fcntl(file_t *fp, u_int cmd, void *data) 1766fnullop_fcntl(file_t *fp, u_int cmd, void *data)
1765{ 1767{
1766 1768
1767 if (cmd == F_SETFL) 1769 if (cmd == F_SETFL)
1768 return 0; 1770 return 0;
1769 1771
1770 return EOPNOTSUPP; 1772 return EOPNOTSUPP;
1771} 1773}
1772 1774
1773int 1775int
1774fnullop_poll(file_t *fp, int which) 1776fnullop_poll(file_t *fp, int which)
1775{ 1777{
1776 1778
1777 return 0; 1779 return 0;
1778} 1780}
1779 1781
1780int 1782int
1781fnullop_kqfilter(file_t *fp, struct knote *kn) 1783fnullop_kqfilter(file_t *fp, struct knote *kn)
1782{ 1784{
1783 1785
1784 return 0; 1786 return 0;
1785} 1787}
1786 1788
1787void 1789void
1788fnullop_drain(file_t *fp) 1790fnullop_drain(file_t *fp)
1789{ 1791{
1790 1792
1791} 1793}
1792 1794
1793int 1795int
1794fbadop_read(file_t *fp, off_t *offset, struct uio *uio, 1796fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
1795 kauth_cred_t cred, int flags) 1797 kauth_cred_t cred, int flags)
1796{ 1798{
1797 1799
1798 return EOPNOTSUPP; 1800 return EOPNOTSUPP;
1799} 1801}
1800 1802
1801int 1803int
1802fbadop_write(file_t *fp, off_t *offset, struct uio *uio, 1804fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
1803 kauth_cred_t cred, int flags) 1805 kauth_cred_t cred, int flags)
1804{ 1806{
1805 1807
1806 return EOPNOTSUPP; 1808 return EOPNOTSUPP;
1807} 1809}
1808 1810
1809int 1811int
1810fbadop_ioctl(file_t *fp, u_long com, void *data) 1812fbadop_ioctl(file_t *fp, u_long com, void *data)
1811{ 1813{
1812 1814
1813 return EOPNOTSUPP; 1815 return EOPNOTSUPP;
1814} 1816}
1815 1817
1816int 1818int
1817fbadop_stat(file_t *fp, struct stat *sb) 1819fbadop_stat(file_t *fp, struct stat *sb)
1818{ 1820{
1819 1821
1820 return EOPNOTSUPP; 1822 return EOPNOTSUPP;
1821} 1823}
1822 1824
1823int 1825int
1824fbadop_close(file_t *fp) 1826fbadop_close(file_t *fp)
1825{ 1827{
1826 1828
1827 return EOPNOTSUPP; 1829 return EOPNOTSUPP;
1828} 1830}

cvs diff -r1.135 -r1.136 src/sys/kern/kern_lwp.c

--- src/sys/kern/kern_lwp.c 2009/10/22 22:28:57 1.135
+++ src/sys/kern/kern_lwp.c 2009/10/27 02:58:28 1.136
@@ -1,1608 +1,1608 @@ @@ -1,1608 +1,1608 @@
1/* $NetBSD: kern_lwp.c,v 1.135 2009/10/22 22:28:57 rmind Exp $ */ 1/* $NetBSD: kern_lwp.c,v 1.136 2009/10/27 02:58:28 rmind Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2001, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc. 4 * Copyright (c) 2001, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Nathan J. Williams, and Andrew Doran. 8 * by Nathan J. Williams, and Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright 15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the 16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution. 17 * documentation and/or other materials provided with the distribution.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32/* 32/*
33 * Overview 33 * Overview
34 * 34 *
35 * Lightweight processes (LWPs) are the basic unit or thread of 35 * Lightweight processes (LWPs) are the basic unit or thread of
36 * execution within the kernel. The core state of an LWP is described 36 * execution within the kernel. The core state of an LWP is described
37 * by "struct lwp", also known as lwp_t. 37 * by "struct lwp", also known as lwp_t.
38 * 38 *
39 * Each LWP is contained within a process (described by "struct proc"). 39 * Each LWP is contained within a process (described by "struct proc").
40 * Every process contains at least one LWP, but may contain more. The 40 * Every process contains at least one LWP, but may contain more. The
41 * process describes attributes shared among all of its LWPs such as a 41 * process describes attributes shared among all of its LWPs such as a
42 * private address space, global execution state (stopped, active, 42 * private address space, global execution state (stopped, active,
43 * zombie, ...), signal disposition and so on. On a multiprocessor 43 * zombie, ...), signal disposition and so on. On a multiprocessor
44 * machine, multiple LWPs may be executing concurrently in the kernel. 44 * machine, multiple LWPs may be executing concurrently in the kernel.
45 * 45 *
46 * Execution states 46 * Execution states
47 * 47 *
48 * At any given time, an LWP has overall state that is described by 48 * At any given time, an LWP has overall state that is described by
49 * lwp::l_stat. The states are broken into two sets below. The first 49 * lwp::l_stat. The states are broken into two sets below. The first
50 * set is guaranteed to represent the absolute, current state of the 50 * set is guaranteed to represent the absolute, current state of the
51 * LWP: 51 * LWP:
52 * 52 *
53 * LSONPROC 53 * LSONPROC
54 * 54 *
55 * On processor: the LWP is executing on a CPU, either in the 55 * On processor: the LWP is executing on a CPU, either in the
56 * kernel or in user space. 56 * kernel or in user space.
57 * 57 *
58 * LSRUN 58 * LSRUN
59 * 59 *
60 * Runnable: the LWP is parked on a run queue, and may soon be 60 * Runnable: the LWP is parked on a run queue, and may soon be
61 * chosen to run by an idle processor, or by a processor that 61 * chosen to run by an idle processor, or by a processor that
62 * has been asked to preempt a currently running but lower 62 * has been asked to preempt a currently running but lower
63 * priority LWP. 63 * priority LWP.
64 * 64 *
65 * LSIDL 65 * LSIDL
66 * 66 *
67 * Idle: the LWP has been created but has not yet executed, 67 * Idle: the LWP has been created but has not yet executed,
68 * or it has ceased executing a unit of work and is waiting 68 * or it has ceased executing a unit of work and is waiting
69 * to be started again. 69 * to be started again.
70 * 70 *
71 * LSSUSPENDED: 71 * LSSUSPENDED:
72 * 72 *
73 * Suspended: the LWP has had its execution suspended by 73 * Suspended: the LWP has had its execution suspended by
74 * another LWP in the same process using the _lwp_suspend() 74 * another LWP in the same process using the _lwp_suspend()
75 * system call. User-level LWPs also enter the suspended 75 * system call. User-level LWPs also enter the suspended
76 * state when the system is shutting down. 76 * state when the system is shutting down.
77 * 77 *
78 * The second set represents a "statement of intent" on behalf of the 78 * The second set represents a "statement of intent" on behalf of the
79 * LWP. The LWP may in fact be executing on a processor, may be 79 * LWP. The LWP may in fact be executing on a processor, may be
80 * sleeping or idle. It is expected to take the necessary action to 80 * sleeping or idle. It is expected to take the necessary action to
81 * stop executing or become "running" again within a short timeframe. 81 * stop executing or become "running" again within a short timeframe.
82 * The LP_RUNNING flag in lwp::l_pflag indicates that an LWP is running. 82 * The LP_RUNNING flag in lwp::l_pflag indicates that an LWP is running.
83 * Importantly, it indicates that its state is tied to a CPU. 83 * Importantly, it indicates that its state is tied to a CPU.
84 * 84 *
85 * LSZOMB: 85 * LSZOMB:
86 * 86 *
87 * Dead or dying: the LWP has released most of its resources 87 * Dead or dying: the LWP has released most of its resources
88 * and is about to switch away into oblivion, or has already 88 * and is about to switch away into oblivion, or has already
89 * switched away. When it switches away, its few remaining 89 * switched away. When it switches away, its few remaining
90 * resources can be collected. 90 * resources can be collected.
91 * 91 *
92 * LSSLEEP: 92 * LSSLEEP:
93 * 93 *
94 * Sleeping: the LWP has entered itself onto a sleep queue, and 94 * Sleeping: the LWP has entered itself onto a sleep queue, and
95 * has switched away or will switch away shortly to allow other 95 * has switched away or will switch away shortly to allow other
96 * LWPs to run on the CPU. 96 * LWPs to run on the CPU.
97 * 97 *
98 * LSSTOP: 98 * LSSTOP:
99 * 99 *
100 * Stopped: the LWP has been stopped as a result of a job 100 * Stopped: the LWP has been stopped as a result of a job
101 * control signal, or as a result of the ptrace() interface.  101 * control signal, or as a result of the ptrace() interface.
102 * 102 *
103 * Stopped LWPs may run briefly within the kernel to handle 103 * Stopped LWPs may run briefly within the kernel to handle
104 * signals that they receive, but will not return to user space 104 * signals that they receive, but will not return to user space
105 * until their process' state is changed away from stopped.  105 * until their process' state is changed away from stopped.
106 * 106 *
107 * Single LWPs within a process can not be set stopped 107 * Single LWPs within a process can not be set stopped
108 * selectively: all actions that can stop or continue LWPs 108 * selectively: all actions that can stop or continue LWPs
109 * occur at the process level. 109 * occur at the process level.
110 * 110 *
111 * State transitions 111 * State transitions
112 * 112 *
113 * Note that the LSSTOP state may only be set when returning to 113 * Note that the LSSTOP state may only be set when returning to
114 * user space in userret(), or when sleeping interruptibly. The 114 * user space in userret(), or when sleeping interruptibly. The
115 * LSSUSPENDED state may only be set in userret(). Before setting 115 * LSSUSPENDED state may only be set in userret(). Before setting
116 * those states, we try to ensure that the LWPs will release all 116 * those states, we try to ensure that the LWPs will release all
117 * locks that they hold, and at a minimum try to ensure that the 117 * locks that they hold, and at a minimum try to ensure that the
118 * LWP can be set runnable again by a signal. 118 * LWP can be set runnable again by a signal.
119 * 119 *
120 * LWPs may transition states in the following ways: 120 * LWPs may transition states in the following ways:
121 * 121 *
122 * RUN -------> ONPROC ONPROC -----> RUN 122 * RUN -------> ONPROC ONPROC -----> RUN
123 * > SLEEP 123 * > SLEEP
124 * > STOPPED 124 * > STOPPED
125 * > SUSPENDED 125 * > SUSPENDED
126 * > ZOMB 126 * > ZOMB
127 * > IDL (special cases) 127 * > IDL (special cases)
128 * 128 *
129 * STOPPED ---> RUN SUSPENDED --> RUN 129 * STOPPED ---> RUN SUSPENDED --> RUN
130 * > SLEEP 130 * > SLEEP
131 * 131 *
132 * SLEEP -----> ONPROC IDL --------> RUN 132 * SLEEP -----> ONPROC IDL --------> RUN
133 * > RUN > SUSPENDED 133 * > RUN > SUSPENDED
134 * > STOPPED > STOPPED 134 * > STOPPED > STOPPED
135 * > ONPROC (special cases) 135 * > ONPROC (special cases)
136 * 136 *
137 * Some state transitions are only possible with kernel threads (eg 137 * Some state transitions are only possible with kernel threads (eg
138 * ONPROC -> IDL) and happen under tightly controlled circumstances 138 * ONPROC -> IDL) and happen under tightly controlled circumstances
139 * free of unwanted side effects. 139 * free of unwanted side effects.
140 * 140 *
141 * Migration 141 * Migration
142 * 142 *
143 * Migration of threads from one CPU to another could be performed 143 * Migration of threads from one CPU to another could be performed
144 * internally by the scheduler via sched_takecpu() or sched_catchlwp() 144 * internally by the scheduler via sched_takecpu() or sched_catchlwp()
145 * functions. The universal lwp_migrate() function should be used for 145 * functions. The universal lwp_migrate() function should be used for
146 * any other cases. Subsystems in the kernel must be aware that the CPU 146 * any other cases. Subsystems in the kernel must be aware that the CPU
147 * of an LWP may change while it is not locked. 147 * of an LWP may change while it is not locked.
148 * 148 *
149 * Locking 149 * Locking
150 * 150 *
151 * The majority of fields in 'struct lwp' are covered by a single, 151 * The majority of fields in 'struct lwp' are covered by a single,
152 * general spin lock pointed to by lwp::l_mutex. The locks covering 152 * general spin lock pointed to by lwp::l_mutex. The locks covering
153 * each field are documented in sys/lwp.h. 153 * each field are documented in sys/lwp.h.
154 * 154 *
155 * State transitions must be made with the LWP's general lock held, 155 * State transitions must be made with the LWP's general lock held,
156 * and may cause the LWP's lock pointer to change. Manipulation of 156 * and may cause the LWP's lock pointer to change. Manipulation of
157 * the general lock is not performed directly, but through calls to 157 * the general lock is not performed directly, but through calls to
158 * lwp_lock(), lwp_relock() and similar. 158 * lwp_lock(), lwp_relock() and similar.
159 * 159 *
160 * States and their associated locks: 160 * States and their associated locks:
161 * 161 *
162 * LSONPROC, LSZOMB: 162 * LSONPROC, LSZOMB:
163 * 163 *
164 * Always covered by spc_lwplock, which protects running LWPs. 164 * Always covered by spc_lwplock, which protects running LWPs.
165 * This is a per-CPU lock and matches lwp::l_cpu. 165 * This is a per-CPU lock and matches lwp::l_cpu.
166 * 166 *
167 * LSIDL, LSRUN: 167 * LSIDL, LSRUN:
168 * 168 *
169 * Always covered by spc_mutex, which protects the run queues. 169 * Always covered by spc_mutex, which protects the run queues.
170 * This is a per-CPU lock and matches lwp::l_cpu. 170 * This is a per-CPU lock and matches lwp::l_cpu.
171 * 171 *
172 * LSSLEEP: 172 * LSSLEEP:
173 * 173 *
174 * Covered by a lock associated with the sleep queue that the 174 * Covered by a lock associated with the sleep queue that the
175 * LWP resides on. Matches lwp::l_sleepq::sq_mutex. 175 * LWP resides on. Matches lwp::l_sleepq::sq_mutex.
176 * 176 *
177 * LSSTOP, LSSUSPENDED: 177 * LSSTOP, LSSUSPENDED:
178 * 178 *
179 * If the LWP was previously sleeping (l_wchan != NULL), then 179 * If the LWP was previously sleeping (l_wchan != NULL), then
180 * l_mutex references the sleep queue lock. If the LWP was 180 * l_mutex references the sleep queue lock. If the LWP was
181 * runnable or on the CPU when halted, or has been removed from 181 * runnable or on the CPU when halted, or has been removed from
182 * the sleep queue since halted, then the lock is spc_lwplock. 182 * the sleep queue since halted, then the lock is spc_lwplock.
183 * 183 *
184 * The lock order is as follows: 184 * The lock order is as follows:
185 * 185 *
186 * spc::spc_lwplock -> 186 * spc::spc_lwplock ->
187 * sleeptab::st_mutex -> 187 * sleeptab::st_mutex ->
188 * tschain_t::tc_mutex -> 188 * tschain_t::tc_mutex ->
189 * spc::spc_mutex 189 * spc::spc_mutex
190 * 190 *
191 * Each process has a scheduler state lock (proc::p_lock), and a 191 * Each process has a scheduler state lock (proc::p_lock), and a
192 * number of counters on LWPs and their states: p_nzlwps, p_nrlwps, and 192 * number of counters on LWPs and their states: p_nzlwps, p_nrlwps, and
193 * so on. When an LWP is to be entered into or removed from one of the 193 * so on. When an LWP is to be entered into or removed from one of the
194 * following states, p_lock must be held and the process wide counters 194 * following states, p_lock must be held and the process wide counters
195 * adjusted: 195 * adjusted:
196 * 196 *
197 * LSIDL, LSZOMB, LSSTOP, LSSUSPENDED 197 * LSIDL, LSZOMB, LSSTOP, LSSUSPENDED
198 * 198 *
199 * (But not always for kernel threads. There are some special cases 199 * (But not always for kernel threads. There are some special cases
200 * as mentioned above. See kern_softint.c.) 200 * as mentioned above. See kern_softint.c.)
201 * 201 *
202 * Note that an LWP is considered running or likely to run soon if in 202 * Note that an LWP is considered running or likely to run soon if in
203 * one of the following states. This affects the value of p_nrlwps: 203 * one of the following states. This affects the value of p_nrlwps:
204 * 204 *
205 * LSRUN, LSONPROC, LSSLEEP 205 * LSRUN, LSONPROC, LSSLEEP
206 * 206 *
207 * p_lock does not need to be held when transitioning among these 207 * p_lock does not need to be held when transitioning among these
208 * three states, hence p_lock is rarely taken for state transitions. 208 * three states, hence p_lock is rarely taken for state transitions.
209 */ 209 */
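For the states that touch the per-process counters, the rules above reduce to a simple pattern: take p_lock, lock the LWP, make the transition and adjust the counter, then unlock in the reverse order. A hedged, illustrative sketch of that discipline; real transitions go through helpers such as lwp_suspend() below and also deal with run queues and sleep queues:

	mutex_enter(p->p_lock);		/* covers p_nrlwps and friends */
	lwp_lock(l);			/* the LWP's current general lock */
	l->l_stat = LSSUSPENDED;	/* one of the p_lock-covered states */
	p->p_nrlwps--;			/* counter adjusted under p_lock */
	lwp_unlock(l);
	mutex_exit(p->p_lock);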
210 210
211#include <sys/cdefs.h> 211#include <sys/cdefs.h>
212__KERNEL_RCSID(0, "$NetBSD: kern_lwp.c,v 1.135 2009/10/22 22:28:57 rmind Exp $"); 212__KERNEL_RCSID(0, "$NetBSD: kern_lwp.c,v 1.136 2009/10/27 02:58:28 rmind Exp $");
213 213
214#include "opt_ddb.h" 214#include "opt_ddb.h"
215#include "opt_lockdebug.h" 215#include "opt_lockdebug.h"
216#include "opt_sa.h" 216#include "opt_sa.h"
217 217
218#define _LWP_API_PRIVATE 218#define _LWP_API_PRIVATE
219 219
220#include <sys/param.h> 220#include <sys/param.h>
221#include <sys/systm.h> 221#include <sys/systm.h>
222#include <sys/cpu.h> 222#include <sys/cpu.h>
223#include <sys/pool.h> 223#include <sys/pool.h>
224#include <sys/proc.h> 224#include <sys/proc.h>
225#include <sys/sa.h> 225#include <sys/sa.h>
226#include <sys/savar.h> 226#include <sys/savar.h>
227#include <sys/syscallargs.h> 227#include <sys/syscallargs.h>
228#include <sys/syscall_stats.h> 228#include <sys/syscall_stats.h>
229#include <sys/kauth.h> 229#include <sys/kauth.h>
230#include <sys/sleepq.h> 230#include <sys/sleepq.h>
231#include <sys/user.h> 231#include <sys/user.h>
232#include <sys/lockdebug.h> 232#include <sys/lockdebug.h>
233#include <sys/kmem.h> 233#include <sys/kmem.h>
234#include <sys/pset.h> 234#include <sys/pset.h>
235#include <sys/intr.h> 235#include <sys/intr.h>
236#include <sys/lwpctl.h> 236#include <sys/lwpctl.h>
237#include <sys/atomic.h> 237#include <sys/atomic.h>
238#include <sys/filedesc.h> 238#include <sys/filedesc.h>
239 239
240#include <uvm/uvm_extern.h> 240#include <uvm/uvm_extern.h>
241#include <uvm/uvm_object.h> 241#include <uvm/uvm_object.h>
242 242
243struct lwplist alllwp = LIST_HEAD_INITIALIZER(alllwp); 243struct lwplist alllwp = LIST_HEAD_INITIALIZER(alllwp);
244 244
245struct pool lwp_uc_pool; 245struct pool lwp_uc_pool;
246 246
247static pool_cache_t lwp_cache; 247static pool_cache_t lwp_cache;
248static specificdata_domain_t lwp_specificdata_domain; 248static specificdata_domain_t lwp_specificdata_domain;
249 249
250void 250void
251lwpinit(void) 251lwpinit(void)
252{ 252{
253 253
254 pool_init(&lwp_uc_pool, sizeof(ucontext_t), 0, 0, 0, "lwpucpl", 254 pool_init(&lwp_uc_pool, sizeof(ucontext_t), 0, 0, 0, "lwpucpl",
255 &pool_allocator_nointr, IPL_NONE); 255 &pool_allocator_nointr, IPL_NONE);
256 lwp_specificdata_domain = specificdata_domain_create(); 256 lwp_specificdata_domain = specificdata_domain_create();
257 KASSERT(lwp_specificdata_domain != NULL); 257 KASSERT(lwp_specificdata_domain != NULL);
258 lwp_sys_init(); 258 lwp_sys_init();
259 lwp_cache = pool_cache_init(sizeof(lwp_t), MIN_LWP_ALIGNMENT, 0, 0, 259 lwp_cache = pool_cache_init(sizeof(lwp_t), MIN_LWP_ALIGNMENT, 0, 0,
260 "lwppl", NULL, IPL_NONE, NULL, NULL, NULL); 260 "lwppl", NULL, IPL_NONE, NULL, NULL, NULL);
261} 261}
262 262
263/* 263/*
 264 * Set an LWP suspended. 264 * Set an LWP suspended.
265 * 265 *
266 * Must be called with p_lock held, and the LWP locked. Will unlock the 266 * Must be called with p_lock held, and the LWP locked. Will unlock the
267 * LWP before return. 267 * LWP before return.
268 */ 268 */
269int 269int
270lwp_suspend(struct lwp *curl, struct lwp *t) 270lwp_suspend(struct lwp *curl, struct lwp *t)
271{ 271{
272 int error; 272 int error;
273 273
274 KASSERT(mutex_owned(t->l_proc->p_lock)); 274 KASSERT(mutex_owned(t->l_proc->p_lock));
275 KASSERT(lwp_locked(t, NULL)); 275 KASSERT(lwp_locked(t, NULL));
276 276
277 KASSERT(curl != t || curl->l_stat == LSONPROC); 277 KASSERT(curl != t || curl->l_stat == LSONPROC);
278 278
279 /* 279 /*
280 * If the current LWP has been told to exit, we must not suspend anyone 280 * If the current LWP has been told to exit, we must not suspend anyone
281 * else or deadlock could occur. We won't return to userspace. 281 * else or deadlock could occur. We won't return to userspace.
282 */ 282 */
283 if ((curl->l_flag & (LW_WEXIT | LW_WCORE)) != 0) { 283 if ((curl->l_flag & (LW_WEXIT | LW_WCORE)) != 0) {
284 lwp_unlock(t); 284 lwp_unlock(t);
285 return (EDEADLK); 285 return (EDEADLK);
286 } 286 }
287 287
288 error = 0; 288 error = 0;
289 289
290 switch (t->l_stat) { 290 switch (t->l_stat) {
291 case LSRUN: 291 case LSRUN:
292 case LSONPROC: 292 case LSONPROC:
293 t->l_flag |= LW_WSUSPEND; 293 t->l_flag |= LW_WSUSPEND;
294 lwp_need_userret(t); 294 lwp_need_userret(t);
295 lwp_unlock(t); 295 lwp_unlock(t);
296 break; 296 break;
297 297
298 case LSSLEEP: 298 case LSSLEEP:
299 t->l_flag |= LW_WSUSPEND; 299 t->l_flag |= LW_WSUSPEND;
300 300
301 /* 301 /*
302 * Kick the LWP and try to get it to the kernel boundary 302 * Kick the LWP and try to get it to the kernel boundary
303 * so that it will release any locks that it holds. 303 * so that it will release any locks that it holds.
304 * setrunnable() will release the lock. 304 * setrunnable() will release the lock.
305 */ 305 */
306 if ((t->l_flag & LW_SINTR) != 0) 306 if ((t->l_flag & LW_SINTR) != 0)
307 setrunnable(t); 307 setrunnable(t);
308 else 308 else
309 lwp_unlock(t); 309 lwp_unlock(t);
310 break; 310 break;
311 311
312 case LSSUSPENDED: 312 case LSSUSPENDED:
313 lwp_unlock(t); 313 lwp_unlock(t);
314 break; 314 break;
315 315
316 case LSSTOP: 316 case LSSTOP:
317 t->l_flag |= LW_WSUSPEND; 317 t->l_flag |= LW_WSUSPEND;
318 setrunnable(t); 318 setrunnable(t);
319 break; 319 break;
320 320
321 case LSIDL: 321 case LSIDL:
322 case LSZOMB: 322 case LSZOMB:
323 error = EINTR; /* It's what Solaris does..... */ 323 error = EINTR; /* It's what Solaris does..... */
324 lwp_unlock(t); 324 lwp_unlock(t);
325 break; 325 break;
326 } 326 }
327 327
328 return (error); 328 return (error);
329} 329}
330 330
331/* 331/*
332 * Restart a suspended LWP. 332 * Restart a suspended LWP.
333 * 333 *
334 * Must be called with p_lock held, and the LWP locked. Will unlock the 334 * Must be called with p_lock held, and the LWP locked. Will unlock the
335 * LWP before return. 335 * LWP before return.
336 */ 336 */
337void 337void
338lwp_continue(struct lwp *l) 338lwp_continue(struct lwp *l)
339{ 339{
340 340
341 KASSERT(mutex_owned(l->l_proc->p_lock)); 341 KASSERT(mutex_owned(l->l_proc->p_lock));
342 KASSERT(lwp_locked(l, NULL)); 342 KASSERT(lwp_locked(l, NULL));
343 343
344 /* If rebooting or not suspended, then just bail out. */ 344 /* If rebooting or not suspended, then just bail out. */
345 if ((l->l_flag & LW_WREBOOT) != 0) { 345 if ((l->l_flag & LW_WREBOOT) != 0) {
346 lwp_unlock(l); 346 lwp_unlock(l);
347 return; 347 return;
348 } 348 }
349 349
350 l->l_flag &= ~LW_WSUSPEND; 350 l->l_flag &= ~LW_WSUSPEND;
351 351
352 if (l->l_stat != LSSUSPENDED) { 352 if (l->l_stat != LSSUSPENDED) {
353 lwp_unlock(l); 353 lwp_unlock(l);
354 return; 354 return;
355 } 355 }
356 356
357 /* setrunnable() will release the lock. */ 357 /* setrunnable() will release the lock. */
358 setrunnable(l); 358 setrunnable(l);
359} 359}
360 360
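A hedged sketch of how a caller is expected to use lwp_suspend() and lwp_continue() given the rules above; "t" is assumed to be another LWP of the current process located by the caller, and "p" its process.

	mutex_enter(p->p_lock);
	lwp_lock(t);
	error = lwp_suspend(curlwp, t);	/* releases the LWP lock */
	mutex_exit(p->p_lock);

	/* ... later, resume it ... */
	mutex_enter(p->p_lock);
	lwp_lock(t);
	lwp_continue(t);		/* releases the LWP lock */
	mutex_exit(p->p_lock);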
361/* 361/*
362 * Wait for an LWP within the current process to exit. If 'lid' is 362 * Wait for an LWP within the current process to exit. If 'lid' is
363 * non-zero, we are waiting for a specific LWP. 363 * non-zero, we are waiting for a specific LWP.
364 * 364 *
365 * Must be called with p->p_lock held. 365 * Must be called with p->p_lock held.
366 */ 366 */
367int 367int
368lwp_wait1(struct lwp *l, lwpid_t lid, lwpid_t *departed, int flags) 368lwp_wait1(struct lwp *l, lwpid_t lid, lwpid_t *departed, int flags)
369{ 369{
370 struct proc *p = l->l_proc; 370 struct proc *p = l->l_proc;
371 struct lwp *l2; 371 struct lwp *l2;
372 int nfound, error; 372 int nfound, error;
373 lwpid_t curlid; 373 lwpid_t curlid;
374 bool exiting; 374 bool exiting;
375 375
376 KASSERT(mutex_owned(p->p_lock)); 376 KASSERT(mutex_owned(p->p_lock));
377 377
378 p->p_nlwpwait++; 378 p->p_nlwpwait++;
379 l->l_waitingfor = lid; 379 l->l_waitingfor = lid;
380 curlid = l->l_lid; 380 curlid = l->l_lid;
381 exiting = ((flags & LWPWAIT_EXITCONTROL) != 0); 381 exiting = ((flags & LWPWAIT_EXITCONTROL) != 0);
382 382
383 for (;;) { 383 for (;;) {
384 /* 384 /*
385 * Avoid a race between exit1() and sigexit(): if the 385 * Avoid a race between exit1() and sigexit(): if the
386 * process is dumping core, then we need to bail out: call 386 * process is dumping core, then we need to bail out: call
387 * into lwp_userret() where we will be suspended until the 387 * into lwp_userret() where we will be suspended until the
388 * deed is done. 388 * deed is done.
389 */ 389 */
390 if ((p->p_sflag & PS_WCORE) != 0) { 390 if ((p->p_sflag & PS_WCORE) != 0) {
391 mutex_exit(p->p_lock); 391 mutex_exit(p->p_lock);
392 lwp_userret(l); 392 lwp_userret(l);
393#ifdef DIAGNOSTIC 393#ifdef DIAGNOSTIC
394 panic("lwp_wait1"); 394 panic("lwp_wait1");
395#endif 395#endif
396 /* NOTREACHED */ 396 /* NOTREACHED */
397 } 397 }
398 398
399 /* 399 /*
400 * First off, drain any detached LWP that is waiting to be 400 * First off, drain any detached LWP that is waiting to be
401 * reaped. 401 * reaped.
402 */ 402 */
403 while ((l2 = p->p_zomblwp) != NULL) { 403 while ((l2 = p->p_zomblwp) != NULL) {
404 p->p_zomblwp = NULL; 404 p->p_zomblwp = NULL;
405 lwp_free(l2, false, false);/* releases proc mutex */ 405 lwp_free(l2, false, false);/* releases proc mutex */
406 mutex_enter(p->p_lock); 406 mutex_enter(p->p_lock);
407 } 407 }
408 408
409 /* 409 /*
410 * Now look for an LWP to collect. If the whole process is 410 * Now look for an LWP to collect. If the whole process is
411 * exiting, count detached LWPs as eligible to be collected, 411 * exiting, count detached LWPs as eligible to be collected,
412 * but don't drain them here. 412 * but don't drain them here.
413 */ 413 */
414 nfound = 0; 414 nfound = 0;
415 error = 0; 415 error = 0;
416 LIST_FOREACH(l2, &p->p_lwps, l_sibling) { 416 LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
417 /* 417 /*
418 * If a specific wait and the target is waiting on 418 * If a specific wait and the target is waiting on
419 * us, then avoid deadlock. This also traps LWPs 419 * us, then avoid deadlock. This also traps LWPs
420 * that try to wait on themselves. 420 * that try to wait on themselves.
421 * 421 *
422 * Note that this does not handle more complicated 422 * Note that this does not handle more complicated
423 * cycles, like: t1 -> t2 -> t3 -> t1. The process 423 * cycles, like: t1 -> t2 -> t3 -> t1. The process
424 * can still be killed so it is not a major problem. 424 * can still be killed so it is not a major problem.
425 */ 425 */
426 if (l2->l_lid == lid && l2->l_waitingfor == curlid) { 426 if (l2->l_lid == lid && l2->l_waitingfor == curlid) {
427 error = EDEADLK; 427 error = EDEADLK;
428 break; 428 break;
429 } 429 }
430 if (l2 == l) 430 if (l2 == l)
431 continue; 431 continue;
432 if ((l2->l_prflag & LPR_DETACHED) != 0) { 432 if ((l2->l_prflag & LPR_DETACHED) != 0) {
433 nfound += exiting; 433 nfound += exiting;
434 continue; 434 continue;
435 } 435 }
436 if (lid != 0) { 436 if (lid != 0) {
437 if (l2->l_lid != lid) 437 if (l2->l_lid != lid)
438 continue; 438 continue;
439 /* 439 /*
440 * Mark this LWP as the first waiter, if there 440 * Mark this LWP as the first waiter, if there
441 * is no other. 441 * is no other.
442 */ 442 */
443 if (l2->l_waiter == 0) 443 if (l2->l_waiter == 0)
444 l2->l_waiter = curlid; 444 l2->l_waiter = curlid;
445 } else if (l2->l_waiter != 0) { 445 } else if (l2->l_waiter != 0) {
446 /* 446 /*
447 * It already has a waiter - so don't 447 * It already has a waiter - so don't
448 * collect it. If the waiter doesn't 448 * collect it. If the waiter doesn't
449 * grab it we'll get another chance 449 * grab it we'll get another chance
450 * later. 450 * later.
451 */ 451 */
452 nfound++; 452 nfound++;
453 continue; 453 continue;
454 } 454 }
455 nfound++; 455 nfound++;
456 456
457 /* No need to lock the LWP in order to see LSZOMB. */ 457 /* No need to lock the LWP in order to see LSZOMB. */
458 if (l2->l_stat != LSZOMB) 458 if (l2->l_stat != LSZOMB)
459 continue; 459 continue;
460 460
461 /* 461 /*
462 * We're no longer waiting. Reset the "first waiter" 462 * We're no longer waiting. Reset the "first waiter"
463 * pointer on the target, in case it was us. 463 * pointer on the target, in case it was us.
464 */ 464 */
465 l->l_waitingfor = 0; 465 l->l_waitingfor = 0;
466 l2->l_waiter = 0; 466 l2->l_waiter = 0;
467 p->p_nlwpwait--; 467 p->p_nlwpwait--;
468 if (departed) 468 if (departed)
469 *departed = l2->l_lid; 469 *departed = l2->l_lid;
470 sched_lwp_collect(l2); 470 sched_lwp_collect(l2);
471 471
472 /* lwp_free() releases the proc lock. */ 472 /* lwp_free() releases the proc lock. */
473 lwp_free(l2, false, false); 473 lwp_free(l2, false, false);
474 mutex_enter(p->p_lock); 474 mutex_enter(p->p_lock);
475 return 0; 475 return 0;
476 } 476 }
477 477
478 if (error != 0) 478 if (error != 0)
479 break; 479 break;
480 if (nfound == 0) { 480 if (nfound == 0) {
481 error = ESRCH; 481 error = ESRCH;
482 break; 482 break;
483 } 483 }
484 484
485 /* 485 /*
486 * The kernel is careful to ensure that it can not deadlock 486 * The kernel is careful to ensure that it can not deadlock
487 * when exiting - just keep waiting. 487 * when exiting - just keep waiting.
488 */ 488 */
489 if (exiting) { 489 if (exiting) {
490 KASSERT(p->p_nlwps > 1); 490 KASSERT(p->p_nlwps > 1);
491 cv_wait(&p->p_lwpcv, p->p_lock); 491 cv_wait(&p->p_lwpcv, p->p_lock);
492 continue; 492 continue;
493 } 493 }
494 494
495 /* 495 /*
496 * If all other LWPs are waiting for exits or suspends 496 * If all other LWPs are waiting for exits or suspends
497 * and the supply of zombies and potential zombies is 497 * and the supply of zombies and potential zombies is
498 * exhausted, then we are about to deadlock. 498 * exhausted, then we are about to deadlock.
499 * 499 *
500 * If the process is exiting (and this LWP is not the one 500 * If the process is exiting (and this LWP is not the one
501 * that is coordinating the exit) then bail out now. 501 * that is coordinating the exit) then bail out now.
502 */ 502 */
503 if ((p->p_sflag & PS_WEXIT) != 0 || 503 if ((p->p_sflag & PS_WEXIT) != 0 ||
504 p->p_nrlwps + p->p_nzlwps - p->p_ndlwps <= p->p_nlwpwait) { 504 p->p_nrlwps + p->p_nzlwps - p->p_ndlwps <= p->p_nlwpwait) {
505 error = EDEADLK; 505 error = EDEADLK;
506 break; 506 break;
507 } 507 }
508 508
509 /* 509 /*
 510 * Sit around and wait for something to happen. We'll be 510 * Sit around and wait for something to happen. We'll be
511 * awoken if any of the conditions examined change: if an 511 * awoken if any of the conditions examined change: if an
512 * LWP exits, is collected, or is detached. 512 * LWP exits, is collected, or is detached.
513 */ 513 */
514 if ((error = cv_wait_sig(&p->p_lwpcv, p->p_lock)) != 0) 514 if ((error = cv_wait_sig(&p->p_lwpcv, p->p_lock)) != 0)
515 break; 515 break;
516 } 516 }
517 517
518 /* 518 /*
 519 * We didn't find any LWPs to collect, we may have received a 519 * We didn't find any LWPs to collect, we may have received a
520 * signal, or some other condition has caused us to bail out. 520 * signal, or some other condition has caused us to bail out.
521 * 521 *
522 * If waiting on a specific LWP, clear the waiters marker: some 522 * If waiting on a specific LWP, clear the waiters marker: some
523 * other LWP may want it. Then, kick all the remaining waiters 523 * other LWP may want it. Then, kick all the remaining waiters
524 * so that they can re-check for zombies and for deadlock. 524 * so that they can re-check for zombies and for deadlock.
525 */ 525 */
526 if (lid != 0) { 526 if (lid != 0) {
527 LIST_FOREACH(l2, &p->p_lwps, l_sibling) { 527 LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
528 if (l2->l_lid == lid) { 528 if (l2->l_lid == lid) {
529 if (l2->l_waiter == curlid) 529 if (l2->l_waiter == curlid)
530 l2->l_waiter = 0; 530 l2->l_waiter = 0;
531 break; 531 break;
532 } 532 }
533 } 533 }
534 } 534 }
535 p->p_nlwpwait--; 535 p->p_nlwpwait--;
536 l->l_waitingfor = 0; 536 l->l_waitingfor = 0;
537 cv_broadcast(&p->p_lwpcv); 537 cv_broadcast(&p->p_lwpcv);
538 538
539 return error; 539 return error;
540} 540}
541 541
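A sketch of the calling convention for lwp_wait1(): the caller enters p_lock, and the function returns with p_lock still held. In this sketch "p" is curlwp->l_proc, "lid" is assumed to be the target LWP ID (0 waits for any LWP), and flags of 0 give a normal, interruptible wait.

	lwpid_t departed;
	int error;

	mutex_enter(p->p_lock);
	error = lwp_wait1(curlwp, lid, &departed, 0);
	mutex_exit(p->p_lock);		/* lwp_wait1() returns with p_lock held */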
542/* 542/*
543 * Create a new LWP within process 'p2', using LWP 'l1' as a template. 543 * Create a new LWP within process 'p2', using LWP 'l1' as a template.
544 * The new LWP is created in state LSIDL and must be set running, 544 * The new LWP is created in state LSIDL and must be set running,
545 * suspended, or stopped by the caller. 545 * suspended, or stopped by the caller.
546 */ 546 */
547int 547int
548lwp_create(lwp_t *l1, proc_t *p2, vaddr_t uaddr, int flags, 548lwp_create(lwp_t *l1, proc_t *p2, vaddr_t uaddr, int flags,
549 void *stack, size_t stacksize, void (*func)(void *), void *arg, 549 void *stack, size_t stacksize, void (*func)(void *), void *arg,
550 lwp_t **rnewlwpp, int sclass) 550 lwp_t **rnewlwpp, int sclass)
551{ 551{
552 struct lwp *l2, *isfree; 552 struct lwp *l2, *isfree;
553 turnstile_t *ts; 553 turnstile_t *ts;
554 554
555 KASSERT(l1 == curlwp || l1->l_proc == &proc0); 555 KASSERT(l1 == curlwp || l1->l_proc == &proc0);
556 556
557 /* 557 /*
558 * First off, reap any detached LWP waiting to be collected. 558 * First off, reap any detached LWP waiting to be collected.
559 * We can re-use its LWP structure and turnstile. 559 * We can re-use its LWP structure and turnstile.
560 */ 560 */
561 isfree = NULL; 561 isfree = NULL;
562 if (p2->p_zomblwp != NULL) { 562 if (p2->p_zomblwp != NULL) {
563 mutex_enter(p2->p_lock); 563 mutex_enter(p2->p_lock);
564 if ((isfree = p2->p_zomblwp) != NULL) { 564 if ((isfree = p2->p_zomblwp) != NULL) {
565 p2->p_zomblwp = NULL; 565 p2->p_zomblwp = NULL;
566 lwp_free(isfree, true, false);/* releases proc mutex */ 566 lwp_free(isfree, true, false);/* releases proc mutex */
567 } else 567 } else
568 mutex_exit(p2->p_lock); 568 mutex_exit(p2->p_lock);
569 } 569 }
570 if (isfree == NULL) { 570 if (isfree == NULL) {
571 l2 = pool_cache_get(lwp_cache, PR_WAITOK); 571 l2 = pool_cache_get(lwp_cache, PR_WAITOK);
572 memset(l2, 0, sizeof(*l2)); 572 memset(l2, 0, sizeof(*l2));
573 l2->l_ts = pool_cache_get(turnstile_cache, PR_WAITOK); 573 l2->l_ts = pool_cache_get(turnstile_cache, PR_WAITOK);
574 SLIST_INIT(&l2->l_pi_lenders); 574 SLIST_INIT(&l2->l_pi_lenders);
575 } else { 575 } else {
576 l2 = isfree; 576 l2 = isfree;
577 ts = l2->l_ts; 577 ts = l2->l_ts;
578 KASSERT(l2->l_inheritedprio == -1); 578 KASSERT(l2->l_inheritedprio == -1);
579 KASSERT(SLIST_EMPTY(&l2->l_pi_lenders)); 579 KASSERT(SLIST_EMPTY(&l2->l_pi_lenders));
580 memset(l2, 0, sizeof(*l2)); 580 memset(l2, 0, sizeof(*l2));
581 l2->l_ts = ts; 581 l2->l_ts = ts;
582 } 582 }
583 583
584 l2->l_stat = LSIDL; 584 l2->l_stat = LSIDL;
585 l2->l_proc = p2; 585 l2->l_proc = p2;
586 l2->l_refcnt = 1; 586 l2->l_refcnt = 1;
587 l2->l_class = sclass; 587 l2->l_class = sclass;
588 588
589 /* 589 /*
590 * If vfork(), we want the LWP to run fast and on the same CPU 590 * If vfork(), we want the LWP to run fast and on the same CPU
591 * as its parent, so that it can reuse the VM context and cache 591 * as its parent, so that it can reuse the VM context and cache
592 * footprint on the local CPU. 592 * footprint on the local CPU.
593 */ 593 */
594 l2->l_kpriority = ((flags & LWP_VFORK) ? true : false); 594 l2->l_kpriority = ((flags & LWP_VFORK) ? true : false);
595 l2->l_kpribase = PRI_KERNEL; 595 l2->l_kpribase = PRI_KERNEL;
596 l2->l_priority = l1->l_priority; 596 l2->l_priority = l1->l_priority;
597 l2->l_inheritedprio = -1; 597 l2->l_inheritedprio = -1;
598 l2->l_flag = 0; 598 l2->l_flag = 0;
599 l2->l_pflag = LP_MPSAFE; 599 l2->l_pflag = LP_MPSAFE;
600 TAILQ_INIT(&l2->l_ld_locks); 600 TAILQ_INIT(&l2->l_ld_locks);
601 601
602 /* 602 /*
603 * If not the first LWP in the process, grab a reference to the 603 * If not the first LWP in the process, grab a reference to the
604 * descriptor table. 604 * descriptor table.
605 */ 605 */
606 l2->l_fd = p2->p_fd; 606 l2->l_fd = p2->p_fd;
607 if (p2->p_nlwps != 0) { 607 if (p2->p_nlwps != 0) {
608 KASSERT(l1->l_proc == p2); 608 KASSERT(l1->l_proc == p2);
609 atomic_inc_uint(&l2->l_fd->fd_refcnt); 609 fd_hold(l2);
610 } else { 610 } else {
611 KASSERT(l1->l_proc != p2); 611 KASSERT(l1->l_proc != p2);
612 } 612 }
613 613
614 if (p2->p_flag & PK_SYSTEM) { 614 if (p2->p_flag & PK_SYSTEM) {
615 /* Mark it as a system LWP. */ 615 /* Mark it as a system LWP. */
616 l2->l_flag |= LW_SYSTEM; 616 l2->l_flag |= LW_SYSTEM;
617 } 617 }
618 618
619 kpreempt_disable(); 619 kpreempt_disable();
620 l2->l_mutex = l1->l_cpu->ci_schedstate.spc_mutex; 620 l2->l_mutex = l1->l_cpu->ci_schedstate.spc_mutex;
621 l2->l_cpu = l1->l_cpu; 621 l2->l_cpu = l1->l_cpu;
622 kpreempt_enable(); 622 kpreempt_enable();
623 623
624 lwp_initspecific(l2); 624 lwp_initspecific(l2);
625 sched_lwp_fork(l1, l2); 625 sched_lwp_fork(l1, l2);
626 lwp_update_creds(l2); 626 lwp_update_creds(l2);
627 callout_init(&l2->l_timeout_ch, CALLOUT_MPSAFE); 627 callout_init(&l2->l_timeout_ch, CALLOUT_MPSAFE);
628 callout_setfunc(&l2->l_timeout_ch, sleepq_timeout, l2); 628 callout_setfunc(&l2->l_timeout_ch, sleepq_timeout, l2);
629 cv_init(&l2->l_sigcv, "sigwait"); 629 cv_init(&l2->l_sigcv, "sigwait");
630 l2->l_syncobj = &sched_syncobj; 630 l2->l_syncobj = &sched_syncobj;
631 631
632 if (rnewlwpp != NULL) 632 if (rnewlwpp != NULL)
633 *rnewlwpp = l2; 633 *rnewlwpp = l2;
634 634
635 l2->l_addr = UAREA_TO_USER(uaddr); 635 l2->l_addr = UAREA_TO_USER(uaddr);
636 uvm_lwp_fork(l1, l2, stack, stacksize, func, 636 uvm_lwp_fork(l1, l2, stack, stacksize, func,
637 (arg != NULL) ? arg : l2); 637 (arg != NULL) ? arg : l2);
638 638
639 mutex_enter(p2->p_lock); 639 mutex_enter(p2->p_lock);
640 640
641 if ((flags & LWP_DETACHED) != 0) { 641 if ((flags & LWP_DETACHED) != 0) {
642 l2->l_prflag = LPR_DETACHED; 642 l2->l_prflag = LPR_DETACHED;
643 p2->p_ndlwps++; 643 p2->p_ndlwps++;
644 } else 644 } else
645 l2->l_prflag = 0; 645 l2->l_prflag = 0;
646 646
647 l2->l_sigmask = l1->l_sigmask; 647 l2->l_sigmask = l1->l_sigmask;
648 CIRCLEQ_INIT(&l2->l_sigpend.sp_info); 648 CIRCLEQ_INIT(&l2->l_sigpend.sp_info);
649 sigemptyset(&l2->l_sigpend.sp_set); 649 sigemptyset(&l2->l_sigpend.sp_set);
650 650
651 p2->p_nlwpid++; 651 p2->p_nlwpid++;
652 if (p2->p_nlwpid == 0) 652 if (p2->p_nlwpid == 0)
653 p2->p_nlwpid++; 653 p2->p_nlwpid++;
654 l2->l_lid = p2->p_nlwpid; 654 l2->l_lid = p2->p_nlwpid;
655 LIST_INSERT_HEAD(&p2->p_lwps, l2, l_sibling); 655 LIST_INSERT_HEAD(&p2->p_lwps, l2, l_sibling);
656 p2->p_nlwps++; 656 p2->p_nlwps++;
657 657
658 if ((p2->p_flag & PK_SYSTEM) == 0) { 658 if ((p2->p_flag & PK_SYSTEM) == 0) {
659 /* Inherit an affinity */ 659 /* Inherit an affinity */
660 if (l1->l_flag & LW_AFFINITY) { 660 if (l1->l_flag & LW_AFFINITY) {
661 /* 661 /*
662 * Note that we hold the state lock while inheriting 662 * Note that we hold the state lock while inheriting
663 * the affinity to avoid race with sched_setaffinity(). 663 * the affinity to avoid race with sched_setaffinity().
664 */ 664 */
665 lwp_lock(l1); 665 lwp_lock(l1);
666 if (l1->l_flag & LW_AFFINITY) { 666 if (l1->l_flag & LW_AFFINITY) {
667 kcpuset_use(l1->l_affinity); 667 kcpuset_use(l1->l_affinity);
668 l2->l_affinity = l1->l_affinity; 668 l2->l_affinity = l1->l_affinity;
669 l2->l_flag |= LW_AFFINITY; 669 l2->l_flag |= LW_AFFINITY;
670 } 670 }
671 lwp_unlock(l1); 671 lwp_unlock(l1);
672 } 672 }
673 lwp_lock(l2); 673 lwp_lock(l2);
674 /* Inherit a processor-set */ 674 /* Inherit a processor-set */
675 l2->l_psid = l1->l_psid; 675 l2->l_psid = l1->l_psid;
676 /* Look for a CPU to start */ 676 /* Look for a CPU to start */
677 l2->l_cpu = sched_takecpu(l2); 677 l2->l_cpu = sched_takecpu(l2);
678 lwp_unlock_to(l2, l2->l_cpu->ci_schedstate.spc_mutex); 678 lwp_unlock_to(l2, l2->l_cpu->ci_schedstate.spc_mutex);
679 } 679 }
680 mutex_exit(p2->p_lock); 680 mutex_exit(p2->p_lock);
681 681
682 mutex_enter(proc_lock); 682 mutex_enter(proc_lock);
683 LIST_INSERT_HEAD(&alllwp, l2, l_list); 683 LIST_INSERT_HEAD(&alllwp, l2, l_list);
684 mutex_exit(proc_lock); 684 mutex_exit(proc_lock);
685 685
686 SYSCALL_TIME_LWP_INIT(l2); 686 SYSCALL_TIME_LWP_INIT(l2);
687 687
688 if (p2->p_emul->e_lwp_fork) 688 if (p2->p_emul->e_lwp_fork)
689 (*p2->p_emul->e_lwp_fork)(l1, l2); 689 (*p2->p_emul->e_lwp_fork)(l1, l2);
690 690
691 return (0); 691 return (0);
692} 692}
693 693
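The fd_hold(l2) call above replaces the open-coded atomic increment of fd_refcnt shown in the left column. The kern_descrip.c half of this commit is not reproduced here, so the following is only a hedged sketch of what the helper presumably does, inferred from the line it replaces; the filedesc_t spelling is an assumption.

void
fd_hold(lwp_t *l)
{
	filedesc_t *fdp = l->l_fd;	/* sketch: descriptor table of the given LWP */

	atomic_inc_uint(&fdp->fd_refcnt);
}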
694/* 694/*
695 * Called by MD code when a new LWP begins execution. Must be called 695 * Called by MD code when a new LWP begins execution. Must be called
696 * with the previous LWP locked (so at splsched), or if there is no 696 * with the previous LWP locked (so at splsched), or if there is no
697 * previous LWP, at splsched. 697 * previous LWP, at splsched.
698 */ 698 */
699void 699void
700lwp_startup(struct lwp *prev, struct lwp *new) 700lwp_startup(struct lwp *prev, struct lwp *new)
701{ 701{
702 702
703 KASSERT(kpreempt_disabled()); 703 KASSERT(kpreempt_disabled());
704 if (prev != NULL) { 704 if (prev != NULL) {
705 /* 705 /*
706 * Normalize the count of the spin-mutexes, it was 706 * Normalize the count of the spin-mutexes, it was
707 * increased in mi_switch(). Unmark the state of 707 * increased in mi_switch(). Unmark the state of
708 * context switch - it is finished for previous LWP. 708 * context switch - it is finished for previous LWP.
709 */ 709 */
710 curcpu()->ci_mtx_count++; 710 curcpu()->ci_mtx_count++;
711 membar_exit(); 711 membar_exit();
712 prev->l_ctxswtch = 0; 712 prev->l_ctxswtch = 0;
713 } 713 }
714 KPREEMPT_DISABLE(new); 714 KPREEMPT_DISABLE(new);
715 spl0(); 715 spl0();
716 pmap_activate(new); 716 pmap_activate(new);
717 LOCKDEBUG_BARRIER(NULL, 0); 717 LOCKDEBUG_BARRIER(NULL, 0);
718 KPREEMPT_ENABLE(new); 718 KPREEMPT_ENABLE(new);
719 if ((new->l_pflag & LP_MPSAFE) == 0) { 719 if ((new->l_pflag & LP_MPSAFE) == 0) {
720 KERNEL_LOCK(1, new); 720 KERNEL_LOCK(1, new);
721 } 721 }
722} 722}
723 723
724/* 724/*
725 * Exit an LWP. 725 * Exit an LWP.
726 */ 726 */
727void 727void
728lwp_exit(struct lwp *l) 728lwp_exit(struct lwp *l)
729{ 729{
730 struct proc *p = l->l_proc; 730 struct proc *p = l->l_proc;
731 struct lwp *l2; 731 struct lwp *l2;
732 bool current; 732 bool current;
733 733
734 current = (l == curlwp); 734 current = (l == curlwp);
735 735
736 KASSERT(current || (l->l_stat == LSIDL && l->l_target_cpu == NULL)); 736 KASSERT(current || (l->l_stat == LSIDL && l->l_target_cpu == NULL));
737 KASSERT(p == curproc); 737 KASSERT(p == curproc);
738 738
739 /* 739 /*
740 * Verify that we hold no locks other than the kernel lock. 740 * Verify that we hold no locks other than the kernel lock.
741 */ 741 */
742 LOCKDEBUG_BARRIER(&kernel_lock, 0); 742 LOCKDEBUG_BARRIER(&kernel_lock, 0);
743 743
744 /* 744 /*
745 * If we are the last live LWP in a process, we need to exit the 745 * If we are the last live LWP in a process, we need to exit the
746 * entire process. We do so with an exit status of zero, because 746 * entire process. We do so with an exit status of zero, because
747 * it's a "controlled" exit, and because that's what Solaris does. 747 * it's a "controlled" exit, and because that's what Solaris does.
748 * 748 *
749 * We are not quite a zombie yet, but for accounting purposes we 749 * We are not quite a zombie yet, but for accounting purposes we
750 * must increment the count of zombies here. 750 * must increment the count of zombies here.
751 * 751 *
752 * Note: the last LWP's specificdata will be deleted here. 752 * Note: the last LWP's specificdata will be deleted here.
753 */ 753 */
754 mutex_enter(p->p_lock); 754 mutex_enter(p->p_lock);
755 if (p->p_nlwps - p->p_nzlwps == 1) { 755 if (p->p_nlwps - p->p_nzlwps == 1) {
756 KASSERT(current == true); 756 KASSERT(current == true);
757 /* XXXSMP kernel_lock not held */ 757 /* XXXSMP kernel_lock not held */
758 exit1(l, 0); 758 exit1(l, 0);
759 /* NOTREACHED */ 759 /* NOTREACHED */
760 } 760 }
761 p->p_nzlwps++; 761 p->p_nzlwps++;
762 mutex_exit(p->p_lock); 762 mutex_exit(p->p_lock);
763 763
764 if (p->p_emul->e_lwp_exit) 764 if (p->p_emul->e_lwp_exit)
765 (*p->p_emul->e_lwp_exit)(l); 765 (*p->p_emul->e_lwp_exit)(l);
766 766
767 /* Drop filedesc reference. */ 767 /* Drop filedesc reference. */
768 fd_free(); 768 fd_free();
769 769
770 /* Delete the specificdata while it's still safe to sleep. */ 770 /* Delete the specificdata while it's still safe to sleep. */
771 specificdata_fini(lwp_specificdata_domain, &l->l_specdataref); 771 specificdata_fini(lwp_specificdata_domain, &l->l_specdataref);
772 772
773 /* 773 /*
774 * Release our cached credentials. 774 * Release our cached credentials.
775 */ 775 */
776 kauth_cred_free(l->l_cred); 776 kauth_cred_free(l->l_cred);
777 callout_destroy(&l->l_timeout_ch); 777 callout_destroy(&l->l_timeout_ch);
778 778
779 /* 779 /*
780 * Remove the LWP from the global list. 780 * Remove the LWP from the global list.
781 */ 781 */
782 mutex_enter(proc_lock); 782 mutex_enter(proc_lock);
783 LIST_REMOVE(l, l_list); 783 LIST_REMOVE(l, l_list);
784 mutex_exit(proc_lock); 784 mutex_exit(proc_lock);
785 785
786 /* 786 /*
787 * Get rid of all references to the LWP that others (e.g. procfs) 787 * Get rid of all references to the LWP that others (e.g. procfs)
788 * may have, and mark the LWP as a zombie. If the LWP is detached, 788 * may have, and mark the LWP as a zombie. If the LWP is detached,
789 * mark it waiting for collection in the proc structure. Note that 789 * mark it waiting for collection in the proc structure. Note that
 790 * before we can do that, we need to free any other dead, detached 790 * before we can do that, we need to free any other dead, detached
791 * LWP waiting to meet its maker. 791 * LWP waiting to meet its maker.
792 */ 792 */
793 mutex_enter(p->p_lock); 793 mutex_enter(p->p_lock);
794 lwp_drainrefs(l); 794 lwp_drainrefs(l);
795 795
796 if ((l->l_prflag & LPR_DETACHED) != 0) { 796 if ((l->l_prflag & LPR_DETACHED) != 0) {
797 while ((l2 = p->p_zomblwp) != NULL) { 797 while ((l2 = p->p_zomblwp) != NULL) {
798 p->p_zomblwp = NULL; 798 p->p_zomblwp = NULL;
799 lwp_free(l2, false, false);/* releases proc mutex */ 799 lwp_free(l2, false, false);/* releases proc mutex */
800 mutex_enter(p->p_lock); 800 mutex_enter(p->p_lock);
801 l->l_refcnt++; 801 l->l_refcnt++;
802 lwp_drainrefs(l); 802 lwp_drainrefs(l);
803 } 803 }
804 p->p_zomblwp = l; 804 p->p_zomblwp = l;
805 } 805 }
806 806
807 /* 807 /*
808 * If we find a pending signal for the process and we have been 808 * If we find a pending signal for the process and we have been
 809 * asked to check for signals, then we lose: arrange to have 809 * asked to check for signals, then we lose: arrange to have
810 * all other LWPs in the process check for signals. 810 * all other LWPs in the process check for signals.
811 */ 811 */
812 if ((l->l_flag & LW_PENDSIG) != 0 && 812 if ((l->l_flag & LW_PENDSIG) != 0 &&
813 firstsig(&p->p_sigpend.sp_set) != 0) { 813 firstsig(&p->p_sigpend.sp_set) != 0) {
814 LIST_FOREACH(l2, &p->p_lwps, l_sibling) { 814 LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
815 lwp_lock(l2); 815 lwp_lock(l2);
816 l2->l_flag |= LW_PENDSIG; 816 l2->l_flag |= LW_PENDSIG;
817 lwp_unlock(l2); 817 lwp_unlock(l2);
818 } 818 }
819 } 819 }
820 820
821 lwp_lock(l); 821 lwp_lock(l);
822 l->l_stat = LSZOMB; 822 l->l_stat = LSZOMB;
823 if (l->l_name != NULL) 823 if (l->l_name != NULL)
824 strcpy(l->l_name, "(zombie)"); 824 strcpy(l->l_name, "(zombie)");
825 if (l->l_flag & LW_AFFINITY) { 825 if (l->l_flag & LW_AFFINITY) {
826 l->l_flag &= ~LW_AFFINITY; 826 l->l_flag &= ~LW_AFFINITY;
827 } else { 827 } else {
828 KASSERT(l->l_affinity == NULL); 828 KASSERT(l->l_affinity == NULL);
829 } 829 }
830 lwp_unlock(l); 830 lwp_unlock(l);
831 p->p_nrlwps--; 831 p->p_nrlwps--;
832 cv_broadcast(&p->p_lwpcv); 832 cv_broadcast(&p->p_lwpcv);
833 if (l->l_lwpctl != NULL) 833 if (l->l_lwpctl != NULL)
834 l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED; 834 l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;
835 mutex_exit(p->p_lock); 835 mutex_exit(p->p_lock);
836 836
837 /* Safe without lock since LWP is in zombie state */ 837 /* Safe without lock since LWP is in zombie state */
838 if (l->l_affinity) { 838 if (l->l_affinity) {
839 kcpuset_unuse(l->l_affinity, NULL); 839 kcpuset_unuse(l->l_affinity, NULL);
840 l->l_affinity = NULL; 840 l->l_affinity = NULL;
841 } 841 }
842 842
843 /* 843 /*
844 * We can no longer block. At this point, lwp_free() may already 844 * We can no longer block. At this point, lwp_free() may already
845 * be gunning for us. On a multi-CPU system, we may be off p_lwps. 845 * be gunning for us. On a multi-CPU system, we may be off p_lwps.
846 * 846 *
847 * Free MD LWP resources. 847 * Free MD LWP resources.
848 */ 848 */
849 cpu_lwp_free(l, 0); 849 cpu_lwp_free(l, 0);
850 850
851 if (current) { 851 if (current) {
852 pmap_deactivate(l); 852 pmap_deactivate(l);
853 853
854 /* 854 /*
855 * Release the kernel lock, and switch away into 855 * Release the kernel lock, and switch away into
856 * oblivion. 856 * oblivion.
857 */ 857 */
858#ifdef notyet 858#ifdef notyet
859 /* XXXSMP hold in lwp_userret() */ 859 /* XXXSMP hold in lwp_userret() */
860 KERNEL_UNLOCK_LAST(l); 860 KERNEL_UNLOCK_LAST(l);
861#else 861#else
862 KERNEL_UNLOCK_ALL(l, NULL); 862 KERNEL_UNLOCK_ALL(l, NULL);
863#endif 863#endif
864 lwp_exit_switchaway(l); 864 lwp_exit_switchaway(l);
865 } 865 }
866} 866}
867 867
868/* 868/*
869 * Free a dead LWP's remaining resources. 869 * Free a dead LWP's remaining resources.
870 * 870 *
871 * XXXLWP limits. 871 * XXXLWP limits.
872 */ 872 */
873void 873void
874lwp_free(struct lwp *l, bool recycle, bool last) 874lwp_free(struct lwp *l, bool recycle, bool last)
875{ 875{
876 struct proc *p = l->l_proc; 876 struct proc *p = l->l_proc;
877 struct rusage *ru; 877 struct rusage *ru;
878 ksiginfoq_t kq; 878 ksiginfoq_t kq;
879 879
880 KASSERT(l != curlwp); 880 KASSERT(l != curlwp);
881 881
882 /* 882 /*
883 * If this was not the last LWP in the process, then adjust 883 * If this was not the last LWP in the process, then adjust
884 * counters and unlock. 884 * counters and unlock.
885 */ 885 */
886 if (!last) { 886 if (!last) {
887 /* 887 /*
888 * Add the LWP's run time to the process' base value. 888 * Add the LWP's run time to the process' base value.
 889 * This needs to coincide with coming off p_lwps. 889 * This needs to coincide with coming off p_lwps.
890 */ 890 */
891 bintime_add(&p->p_rtime, &l->l_rtime); 891 bintime_add(&p->p_rtime, &l->l_rtime);
892 p->p_pctcpu += l->l_pctcpu; 892 p->p_pctcpu += l->l_pctcpu;
893 ru = &p->p_stats->p_ru; 893 ru = &p->p_stats->p_ru;
894 ruadd(ru, &l->l_ru); 894 ruadd(ru, &l->l_ru);
895 ru->ru_nvcsw += (l->l_ncsw - l->l_nivcsw); 895 ru->ru_nvcsw += (l->l_ncsw - l->l_nivcsw);
896 ru->ru_nivcsw += l->l_nivcsw; 896 ru->ru_nivcsw += l->l_nivcsw;
897 LIST_REMOVE(l, l_sibling); 897 LIST_REMOVE(l, l_sibling);
898 p->p_nlwps--; 898 p->p_nlwps--;
899 p->p_nzlwps--; 899 p->p_nzlwps--;
900 if ((l->l_prflag & LPR_DETACHED) != 0) 900 if ((l->l_prflag & LPR_DETACHED) != 0)
901 p->p_ndlwps--; 901 p->p_ndlwps--;
902 902
903 /* 903 /*
904 * Have any LWPs sleeping in lwp_wait() recheck for 904 * Have any LWPs sleeping in lwp_wait() recheck for
905 * deadlock. 905 * deadlock.
906 */ 906 */
907 cv_broadcast(&p->p_lwpcv); 907 cv_broadcast(&p->p_lwpcv);
908 mutex_exit(p->p_lock); 908 mutex_exit(p->p_lock);
909 } 909 }
910 910
911#ifdef MULTIPROCESSOR 911#ifdef MULTIPROCESSOR
912 /* 912 /*
913 * In the unlikely event that the LWP is still on the CPU, 913 * In the unlikely event that the LWP is still on the CPU,
914 * then spin until it has switched away. We need to release 914 * then spin until it has switched away. We need to release
915 * all locks to avoid deadlock against interrupt handlers on 915 * all locks to avoid deadlock against interrupt handlers on
916 * the target CPU. 916 * the target CPU.
917 */ 917 */
918 if ((l->l_pflag & LP_RUNNING) != 0 || l->l_cpu->ci_curlwp == l) { 918 if ((l->l_pflag & LP_RUNNING) != 0 || l->l_cpu->ci_curlwp == l) {
919 int count; 919 int count;
920 (void)count; /* XXXgcc */ 920 (void)count; /* XXXgcc */
921 KERNEL_UNLOCK_ALL(curlwp, &count); 921 KERNEL_UNLOCK_ALL(curlwp, &count);
922 while ((l->l_pflag & LP_RUNNING) != 0 || 922 while ((l->l_pflag & LP_RUNNING) != 0 ||
923 l->l_cpu->ci_curlwp == l) 923 l->l_cpu->ci_curlwp == l)
924 SPINLOCK_BACKOFF_HOOK; 924 SPINLOCK_BACKOFF_HOOK;
925 KERNEL_LOCK(count, curlwp); 925 KERNEL_LOCK(count, curlwp);
926 } 926 }
927#endif 927#endif
928 928
929 /* 929 /*
930 * Destroy the LWP's remaining signal information. 930 * Destroy the LWP's remaining signal information.
931 */ 931 */
932 ksiginfo_queue_init(&kq); 932 ksiginfo_queue_init(&kq);
933 sigclear(&l->l_sigpend, NULL, &kq); 933 sigclear(&l->l_sigpend, NULL, &kq);
934 ksiginfo_queue_drain(&kq); 934 ksiginfo_queue_drain(&kq);
935 cv_destroy(&l->l_sigcv); 935 cv_destroy(&l->l_sigcv);
936 936
937 /* 937 /*
938 * Free the LWP's turnstile and the LWP structure itself unless the 938 * Free the LWP's turnstile and the LWP structure itself unless the
939 * caller wants to recycle them. Also, free the scheduler specific 939 * caller wants to recycle them. Also, free the scheduler specific
940 * data. 940 * data.
941 * 941 *
942 * We can't return turnstile0 to the pool (it didn't come from it), 942 * We can't return turnstile0 to the pool (it didn't come from it),
943 * so if it comes up just drop it quietly and move on. 943 * so if it comes up just drop it quietly and move on.
944 * 944 *
945 * We don't recycle the VM resources at this time. 945 * We don't recycle the VM resources at this time.
946 */ 946 */
947 if (l->l_lwpctl != NULL) 947 if (l->l_lwpctl != NULL)
948 lwp_ctl_free(l); 948 lwp_ctl_free(l);
949 949
950 if (!recycle && l->l_ts != &turnstile0) 950 if (!recycle && l->l_ts != &turnstile0)
951 pool_cache_put(turnstile_cache, l->l_ts); 951 pool_cache_put(turnstile_cache, l->l_ts);
952 if (l->l_name != NULL) 952 if (l->l_name != NULL)
953 kmem_free(l->l_name, MAXCOMLEN); 953 kmem_free(l->l_name, MAXCOMLEN);
954 954
955 cpu_lwp_free2(l); 955 cpu_lwp_free2(l);
956 uvm_lwp_exit(l); 956 uvm_lwp_exit(l);
957 957
958 KASSERT(SLIST_EMPTY(&l->l_pi_lenders)); 958 KASSERT(SLIST_EMPTY(&l->l_pi_lenders));
959 KASSERT(l->l_inheritedprio == -1); 959 KASSERT(l->l_inheritedprio == -1);
960 if (!recycle) 960 if (!recycle)
961 pool_cache_put(lwp_cache, l); 961 pool_cache_put(lwp_cache, l);
962} 962}
963 963
964/* 964/*
 965 * Migrate the LWP to another CPU. Unlocks the LWP. 965 * Migrate the LWP to another CPU. Unlocks the LWP.
966 */ 966 */
967void 967void
968lwp_migrate(lwp_t *l, struct cpu_info *tci) 968lwp_migrate(lwp_t *l, struct cpu_info *tci)
969{ 969{
970 struct schedstate_percpu *tspc; 970 struct schedstate_percpu *tspc;
971 int lstat = l->l_stat; 971 int lstat = l->l_stat;
972 972
973 KASSERT(lwp_locked(l, NULL)); 973 KASSERT(lwp_locked(l, NULL));
974 KASSERT(tci != NULL); 974 KASSERT(tci != NULL);
975 975
976 /* If LWP is still on the CPU, it must be handled like LSONPROC */ 976 /* If LWP is still on the CPU, it must be handled like LSONPROC */
977 if ((l->l_pflag & LP_RUNNING) != 0) { 977 if ((l->l_pflag & LP_RUNNING) != 0) {
978 lstat = LSONPROC; 978 lstat = LSONPROC;
979 } 979 }
980 980
981 /* 981 /*
982 * The destination CPU could be changed while previous migration 982 * The destination CPU could be changed while previous migration
983 * was not finished. 983 * was not finished.
984 */ 984 */
985 if (l->l_target_cpu != NULL) { 985 if (l->l_target_cpu != NULL) {
986 l->l_target_cpu = tci; 986 l->l_target_cpu = tci;
987 lwp_unlock(l); 987 lwp_unlock(l);
988 return; 988 return;
989 } 989 }
990 990
991 /* Nothing to do if trying to migrate to the same CPU */ 991 /* Nothing to do if trying to migrate to the same CPU */
992 if (l->l_cpu == tci) { 992 if (l->l_cpu == tci) {
993 lwp_unlock(l); 993 lwp_unlock(l);
994 return; 994 return;
995 } 995 }
996 996
997 KASSERT(l->l_target_cpu == NULL); 997 KASSERT(l->l_target_cpu == NULL);
998 tspc = &tci->ci_schedstate; 998 tspc = &tci->ci_schedstate;
999 switch (lstat) { 999 switch (lstat) {
1000 case LSRUN: 1000 case LSRUN:
1001 l->l_target_cpu = tci; 1001 l->l_target_cpu = tci;
1002 break; 1002 break;
1003 case LSIDL: 1003 case LSIDL:
1004 l->l_cpu = tci; 1004 l->l_cpu = tci;
1005 lwp_unlock_to(l, tspc->spc_mutex); 1005 lwp_unlock_to(l, tspc->spc_mutex);
1006 return; 1006 return;
1007 case LSSLEEP: 1007 case LSSLEEP:
1008 l->l_cpu = tci; 1008 l->l_cpu = tci;
1009 break; 1009 break;
1010 case LSSTOP: 1010 case LSSTOP:
1011 case LSSUSPENDED: 1011 case LSSUSPENDED:
1012 l->l_cpu = tci; 1012 l->l_cpu = tci;
1013 if (l->l_wchan == NULL) { 1013 if (l->l_wchan == NULL) {
1014 lwp_unlock_to(l, tspc->spc_lwplock); 1014 lwp_unlock_to(l, tspc->spc_lwplock);
1015 return; 1015 return;
1016 } 1016 }
1017 break; 1017 break;
1018 case LSONPROC: 1018 case LSONPROC:
1019 l->l_target_cpu = tci; 1019 l->l_target_cpu = tci;
1020 spc_lock(l->l_cpu); 1020 spc_lock(l->l_cpu);
1021 cpu_need_resched(l->l_cpu, RESCHED_KPREEMPT); 1021 cpu_need_resched(l->l_cpu, RESCHED_KPREEMPT);
1022 spc_unlock(l->l_cpu); 1022 spc_unlock(l->l_cpu);
1023 break; 1023 break;
1024 } 1024 }
1025 lwp_unlock(l); 1025 lwp_unlock(l);
1026} 1026}
1027 1027
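Usage sketch: the caller picks a target CPU and holds the LWP lock; lwp_migrate() drops it. "tci" is assumed to be a struct cpu_info pointer chosen elsewhere (e.g. by the scheduler or by affinity code).

	lwp_lock(l);
	lwp_migrate(l, tci);		/* unlocks the LWP */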
1028/* 1028/*
 1029 * Find the LWP in the process. Arguments may be zero, in which case 1029 * Find the LWP in the process. Arguments may be zero, in which case
1030 * the calling process and first LWP in the list will be used. 1030 * the calling process and first LWP in the list will be used.
1031 * On success - returns proc locked. 1031 * On success - returns proc locked.
1032 */ 1032 */
1033struct lwp * 1033struct lwp *
1034lwp_find2(pid_t pid, lwpid_t lid) 1034lwp_find2(pid_t pid, lwpid_t lid)
1035{ 1035{
1036 proc_t *p; 1036 proc_t *p;
1037 lwp_t *l; 1037 lwp_t *l;
1038 1038
1039 /* Find the process */ 1039 /* Find the process */
1040 p = (pid == 0) ? curlwp->l_proc : p_find(pid, PFIND_UNLOCK_FAIL); 1040 p = (pid == 0) ? curlwp->l_proc : p_find(pid, PFIND_UNLOCK_FAIL);
1041 if (p == NULL) 1041 if (p == NULL)
1042 return NULL; 1042 return NULL;
1043 mutex_enter(p->p_lock); 1043 mutex_enter(p->p_lock);
1044 if (pid != 0) { 1044 if (pid != 0) {
1045 /* Case of p_find */ 1045 /* Case of p_find */
1046 mutex_exit(proc_lock); 1046 mutex_exit(proc_lock);
1047 } 1047 }
1048 1048
1049 /* Find the thread */ 1049 /* Find the thread */
1050 l = (lid == 0) ? LIST_FIRST(&p->p_lwps) : lwp_find(p, lid); 1050 l = (lid == 0) ? LIST_FIRST(&p->p_lwps) : lwp_find(p, lid);
1051 if (l == NULL) { 1051 if (l == NULL) {
1052 mutex_exit(p->p_lock); 1052 mutex_exit(p->p_lock);
1053 } 1053 }
1054 1054
1055 return l; 1055 return l;
1056} 1056}
1057 1057
1058/* 1058/*
 1059 * Look up a live LWP within the specified process, and return it locked. 1059 * Look up a live LWP within the specified process, and return it locked.
1060 * 1060 *
1061 * Must be called with p->p_lock held. 1061 * Must be called with p->p_lock held.
1062 */ 1062 */
1063struct lwp * 1063struct lwp *
1064lwp_find(struct proc *p, int id) 1064lwp_find(struct proc *p, int id)
1065{ 1065{
1066 struct lwp *l; 1066 struct lwp *l;
1067 1067
1068 KASSERT(mutex_owned(p->p_lock)); 1068 KASSERT(mutex_owned(p->p_lock));
1069 1069
1070 LIST_FOREACH(l, &p->p_lwps, l_sibling) { 1070 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1071 if (l->l_lid == id) 1071 if (l->l_lid == id)
1072 break; 1072 break;
1073 } 1073 }
1074 1074
1075 /* 1075 /*
1076 * No need to lock - all of these conditions will 1076 * No need to lock - all of these conditions will
1077 * be visible with the process level mutex held. 1077 * be visible with the process level mutex held.
1078 */ 1078 */
1079 if (l != NULL && (l->l_stat == LSIDL || l->l_stat == LSZOMB)) 1079 if (l != NULL && (l->l_stat == LSIDL || l->l_stat == LSZOMB))
1080 l = NULL; 1080 l = NULL;
1081 1081
1082 return l; 1082 return l;
1083} 1083}
1084 1084
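A sketch of the lookup pattern: p_lock keeps the result stable, and lwp_find() filters out LSIDL and LSZOMB LWPs, so the LWP stays live for as long as the lock is held. "lid" is an assumed caller-supplied LWP ID.

	mutex_enter(p->p_lock);
	l = lwp_find(p, lid);
	if (l != NULL) {
		/* l is live while p_lock is held */
	}
	mutex_exit(p->p_lock);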
1085/* 1085/*
1086 * Update an LWP's cached credentials to mirror the process' master copy. 1086 * Update an LWP's cached credentials to mirror the process' master copy.
1087 * 1087 *
1088 * This happens early in the syscall path, on user trap, and on LWP 1088 * This happens early in the syscall path, on user trap, and on LWP
1089 * creation. A long-running LWP can also voluntarily choose to update 1089 * creation. A long-running LWP can also voluntarily choose to update
 1090 * its credentials by calling this routine. This may be called from 1090 * its credentials by calling this routine. This may be called from
1091 * LWP_CACHE_CREDS(), which checks l->l_cred != p->p_cred beforehand. 1091 * LWP_CACHE_CREDS(), which checks l->l_cred != p->p_cred beforehand.
1092 */ 1092 */
1093void 1093void
1094lwp_update_creds(struct lwp *l) 1094lwp_update_creds(struct lwp *l)
1095{ 1095{
1096 kauth_cred_t oc; 1096 kauth_cred_t oc;
1097 struct proc *p; 1097 struct proc *p;
1098 1098
1099 p = l->l_proc; 1099 p = l->l_proc;
1100 oc = l->l_cred; 1100 oc = l->l_cred;
1101 1101
1102 mutex_enter(p->p_lock); 1102 mutex_enter(p->p_lock);
1103 kauth_cred_hold(p->p_cred); 1103 kauth_cred_hold(p->p_cred);
1104 l->l_cred = p->p_cred; 1104 l->l_cred = p->p_cred;
1105 l->l_prflag &= ~LPR_CRMOD; 1105 l->l_prflag &= ~LPR_CRMOD;
1106 mutex_exit(p->p_lock); 1106 mutex_exit(p->p_lock);
1107 if (oc != NULL) 1107 if (oc != NULL)
1108 kauth_cred_free(oc); 1108 kauth_cred_free(oc);
1109} 1109}
1110 1110
1111/* 1111/*
1112 * Verify that an LWP is locked, and optionally verify that the lock matches 1112 * Verify that an LWP is locked, and optionally verify that the lock matches
1113 * one we specify. 1113 * one we specify.
1114 */ 1114 */
1115int 1115int
1116lwp_locked(struct lwp *l, kmutex_t *mtx) 1116lwp_locked(struct lwp *l, kmutex_t *mtx)
1117{ 1117{
1118 kmutex_t *cur = l->l_mutex; 1118 kmutex_t *cur = l->l_mutex;
1119 1119
1120 return mutex_owned(cur) && (mtx == cur || mtx == NULL); 1120 return mutex_owned(cur) && (mtx == cur || mtx == NULL);
1121} 1121}
1122 1122
1123/* 1123/*
1124 * Lock an LWP. 1124 * Lock an LWP.
1125 */ 1125 */
1126kmutex_t * 1126kmutex_t *
1127lwp_lock_retry(struct lwp *l, kmutex_t *old) 1127lwp_lock_retry(struct lwp *l, kmutex_t *old)
1128{ 1128{
1129 1129
1130 /* 1130 /*
1131 * XXXgcc ignoring kmutex_t * volatile on i386 1131 * XXXgcc ignoring kmutex_t * volatile on i386
1132 * 1132 *
1133 * gcc version 4.1.2 20061021 prerelease (NetBSD nb1 20061021) 1133 * gcc version 4.1.2 20061021 prerelease (NetBSD nb1 20061021)
1134 */ 1134 */
1135#if 1 1135#if 1
1136 while (l->l_mutex != old) { 1136 while (l->l_mutex != old) {
1137#else 1137#else
1138 for (;;) { 1138 for (;;) {
1139#endif 1139#endif
1140 mutex_spin_exit(old); 1140 mutex_spin_exit(old);
1141 old = l->l_mutex; 1141 old = l->l_mutex;
1142 mutex_spin_enter(old); 1142 mutex_spin_enter(old);
1143 1143
1144 /* 1144 /*
1145 * mutex_enter() will have posted a read barrier. Re-test 1145 * mutex_enter() will have posted a read barrier. Re-test
1146 * l->l_mutex. If it has changed, we need to try again. 1146 * l->l_mutex. If it has changed, we need to try again.
1147 */ 1147 */
1148#if 1 1148#if 1
1149 } 1149 }
1150#else 1150#else
1151 } while (__predict_false(l->l_mutex != old)); 1151 } while (__predict_false(l->l_mutex != old));
1152#endif 1152#endif
1153 1153
1154 return old; 1154 return old;
1155} 1155}
1156 1156
1157/* 1157/*
1158 * Lend a new mutex to an LWP. The old mutex must be held. 1158 * Lend a new mutex to an LWP. The old mutex must be held.
1159 */ 1159 */
1160void 1160void
1161lwp_setlock(struct lwp *l, kmutex_t *new) 1161lwp_setlock(struct lwp *l, kmutex_t *new)
1162{ 1162{
1163 1163
1164 KASSERT(mutex_owned(l->l_mutex)); 1164 KASSERT(mutex_owned(l->l_mutex));
1165 1165
1166 membar_exit(); 1166 membar_exit();
1167 l->l_mutex = new; 1167 l->l_mutex = new;
1168} 1168}
1169 1169
1170/* 1170/*
1171 * Lend a new mutex to an LWP, and release the old mutex. The old mutex 1171 * Lend a new mutex to an LWP, and release the old mutex. The old mutex
1172 * must be held. 1172 * must be held.
1173 */ 1173 */
1174void 1174void
1175lwp_unlock_to(struct lwp *l, kmutex_t *new) 1175lwp_unlock_to(struct lwp *l, kmutex_t *new)
1176{ 1176{
1177 kmutex_t *old; 1177 kmutex_t *old;
1178 1178
1179 KASSERT(mutex_owned(l->l_mutex)); 1179 KASSERT(mutex_owned(l->l_mutex));
1180 1180
1181 old = l->l_mutex; 1181 old = l->l_mutex;
1182 membar_exit(); 1182 membar_exit();
1183 l->l_mutex = new; 1183 l->l_mutex = new;
1184 mutex_spin_exit(old); 1184 mutex_spin_exit(old);
1185} 1185}
1186 1186
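This lock-lending primitive is what lwp_create() above uses when handing a new LWP to its chosen CPU's run-queue lock; repeated here as a usage sketch:

	lwp_lock(l2);
	l2->l_cpu = sched_takecpu(l2);
	lwp_unlock_to(l2, l2->l_cpu->ci_schedstate.spc_mutex);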
1187/* 1187/*
1188 * Acquire a new mutex, and donate it to an LWP. The LWP must already be 1188 * Acquire a new mutex, and donate it to an LWP. The LWP must already be
1189 * locked. 1189 * locked.
1190 */ 1190 */
1191void 1191void
1192lwp_relock(struct lwp *l, kmutex_t *new) 1192lwp_relock(struct lwp *l, kmutex_t *new)
1193{ 1193{
1194 kmutex_t *old; 1194 kmutex_t *old;
1195 1195
1196 KASSERT(mutex_owned(l->l_mutex)); 1196 KASSERT(mutex_owned(l->l_mutex));
1197 1197
1198 old = l->l_mutex; 1198 old = l->l_mutex;
1199 if (old != new) { 1199 if (old != new) {
1200 mutex_spin_enter(new); 1200 mutex_spin_enter(new);
1201 l->l_mutex = new; 1201 l->l_mutex = new;
1202 mutex_spin_exit(old); 1202 mutex_spin_exit(old);
1203 } 1203 }
1204} 1204}
1205 1205
1206int 1206int
1207lwp_trylock(struct lwp *l) 1207lwp_trylock(struct lwp *l)
1208{ 1208{
1209 kmutex_t *old; 1209 kmutex_t *old;
1210 1210
1211 for (;;) { 1211 for (;;) {
1212 if (!mutex_tryenter(old = l->l_mutex)) 1212 if (!mutex_tryenter(old = l->l_mutex))
1213 return 0; 1213 return 0;
1214 if (__predict_true(l->l_mutex == old)) 1214 if (__predict_true(l->l_mutex == old))
1215 return 1; 1215 return 1;
1216 mutex_spin_exit(old); 1216 mutex_spin_exit(old);
1217 } 1217 }
1218} 1218}
1219 1219
1220void 1220void
1221lwp_unsleep(lwp_t *l, bool cleanup) 1221lwp_unsleep(lwp_t *l, bool cleanup)
1222{ 1222{
1223 1223
1224 KASSERT(mutex_owned(l->l_mutex)); 1224 KASSERT(mutex_owned(l->l_mutex));
1225 (*l->l_syncobj->sobj_unsleep)(l, cleanup); 1225 (*l->l_syncobj->sobj_unsleep)(l, cleanup);
1226} 1226}
1227 1227
1228 1228
1229/* 1229/*
 1230 * Handle exceptions for mi_userret(). Called if any flag in LW_USERRET is 1230 * Handle exceptions for mi_userret(). Called if any flag in LW_USERRET is
1231 * set. 1231 * set.
1232 */ 1232 */
1233void 1233void
1234lwp_userret(struct lwp *l) 1234lwp_userret(struct lwp *l)
1235{ 1235{
1236 struct proc *p; 1236 struct proc *p;
1237 void (*hook)(void); 1237 void (*hook)(void);
1238 int sig; 1238 int sig;
1239 1239
1240 KASSERT(l == curlwp); 1240 KASSERT(l == curlwp);
1241 KASSERT(l->l_stat == LSONPROC); 1241 KASSERT(l->l_stat == LSONPROC);
1242 p = l->l_proc; 1242 p = l->l_proc;
1243 1243
1244#ifndef __HAVE_FAST_SOFTINTS 1244#ifndef __HAVE_FAST_SOFTINTS
1245 /* Run pending soft interrupts. */ 1245 /* Run pending soft interrupts. */
1246 if (l->l_cpu->ci_data.cpu_softints != 0) 1246 if (l->l_cpu->ci_data.cpu_softints != 0)
1247 softint_overlay(); 1247 softint_overlay();
1248#endif 1248#endif
1249 1249
1250#ifdef KERN_SA 1250#ifdef KERN_SA
1251 /* Generate UNBLOCKED upcall if needed */ 1251 /* Generate UNBLOCKED upcall if needed */
1252 if (l->l_flag & LW_SA_BLOCKING) { 1252 if (l->l_flag & LW_SA_BLOCKING) {
1253 sa_unblock_userret(l); 1253 sa_unblock_userret(l);
1254 /* NOTREACHED */ 1254 /* NOTREACHED */
1255 } 1255 }
1256#endif 1256#endif
1257 1257
1258 /* 1258 /*
1259 * It should be safe to do this read unlocked on a multiprocessor 1259 * It should be safe to do this read unlocked on a multiprocessor
 1260 * system. 1260 * system.
1261 * 1261 *
1262 * LW_SA_UPCALL will be handled after the while() loop, so don't 1262 * LW_SA_UPCALL will be handled after the while() loop, so don't
1263 * consider it now. 1263 * consider it now.
1264 */ 1264 */
1265 while ((l->l_flag & (LW_USERRET & ~(LW_SA_UPCALL))) != 0) { 1265 while ((l->l_flag & (LW_USERRET & ~(LW_SA_UPCALL))) != 0) {
1266 /* 1266 /*
1267 * Process pending signals first, unless the process 1267 * Process pending signals first, unless the process
1268 * is dumping core or exiting, where we will instead 1268 * is dumping core or exiting, where we will instead
1269 * enter the LW_WSUSPEND case below. 1269 * enter the LW_WSUSPEND case below.
1270 */ 1270 */
1271 if ((l->l_flag & (LW_PENDSIG | LW_WCORE | LW_WEXIT)) == 1271 if ((l->l_flag & (LW_PENDSIG | LW_WCORE | LW_WEXIT)) ==
1272 LW_PENDSIG) { 1272 LW_PENDSIG) {
1273 mutex_enter(p->p_lock); 1273 mutex_enter(p->p_lock);
1274 while ((sig = issignal(l)) != 0) 1274 while ((sig = issignal(l)) != 0)
1275 postsig(sig); 1275 postsig(sig);
1276 mutex_exit(p->p_lock); 1276 mutex_exit(p->p_lock);
1277 } 1277 }
1278 1278
1279 /* 1279 /*
1280 * Core-dump or suspend pending. 1280 * Core-dump or suspend pending.
1281 * 1281 *
1282 * In case of core dump, suspend ourselves, so that the 1282 * In case of core dump, suspend ourselves, so that the
1283 * kernel stack and therefore the userland registers saved 1283 * kernel stack and therefore the userland registers saved
1284 * in the trapframe are around for coredump() to write them 1284 * in the trapframe are around for coredump() to write them
1285 * out. We issue a wakeup on p->p_lwpcv so that sigexit() 1285 * out. We issue a wakeup on p->p_lwpcv so that sigexit()
1286 * will write the core file out once all other LWPs are 1286 * will write the core file out once all other LWPs are
1287 * suspended. 1287 * suspended.
1288 */ 1288 */
1289 if ((l->l_flag & LW_WSUSPEND) != 0) { 1289 if ((l->l_flag & LW_WSUSPEND) != 0) {
1290 mutex_enter(p->p_lock); 1290 mutex_enter(p->p_lock);
1291 p->p_nrlwps--; 1291 p->p_nrlwps--;
1292 cv_broadcast(&p->p_lwpcv); 1292 cv_broadcast(&p->p_lwpcv);
1293 lwp_lock(l); 1293 lwp_lock(l);
1294 l->l_stat = LSSUSPENDED; 1294 l->l_stat = LSSUSPENDED;
1295 lwp_unlock(l); 1295 lwp_unlock(l);
1296 mutex_exit(p->p_lock); 1296 mutex_exit(p->p_lock);
1297 lwp_lock(l); 1297 lwp_lock(l);
1298 mi_switch(l); 1298 mi_switch(l);
1299 } 1299 }
1300 1300
1301 /* Process is exiting. */ 1301 /* Process is exiting. */
1302 if ((l->l_flag & LW_WEXIT) != 0) { 1302 if ((l->l_flag & LW_WEXIT) != 0) {
1303 lwp_exit(l); 1303 lwp_exit(l);
1304 KASSERT(0); 1304 KASSERT(0);
1305 /* NOTREACHED */ 1305 /* NOTREACHED */
1306 } 1306 }
1307 1307
1308 /* Call userret hook; used by Linux emulation. */ 1308 /* Call userret hook; used by Linux emulation. */
1309 if ((l->l_flag & LW_WUSERRET) != 0) { 1309 if ((l->l_flag & LW_WUSERRET) != 0) {
1310 lwp_lock(l); 1310 lwp_lock(l);
1311 l->l_flag &= ~LW_WUSERRET; 1311 l->l_flag &= ~LW_WUSERRET;
1312 lwp_unlock(l); 1312 lwp_unlock(l);
1313 hook = p->p_userret; 1313 hook = p->p_userret;
1314 p->p_userret = NULL; 1314 p->p_userret = NULL;
1315 (*hook)(); 1315 (*hook)();
1316 } 1316 }
1317 } 1317 }
1318 1318
1319#ifdef KERN_SA 1319#ifdef KERN_SA
1320 /* 1320 /*
1321 * Timer events are handled specially. We only try once to deliver 1321 * Timer events are handled specially. We only try once to deliver
 1322 * pending timer upcalls; if it fails, we can try again on the next 1322 * pending timer upcalls; if it fails, we can try again on the next
1323 * loop around. If we need to re-enter lwp_userret(), MD code will 1323 * loop around. If we need to re-enter lwp_userret(), MD code will
1324 * bounce us back here through the trap path after we return. 1324 * bounce us back here through the trap path after we return.
1325 */ 1325 */
1326 if (p->p_timerpend) 1326 if (p->p_timerpend)
1327 timerupcall(l); 1327 timerupcall(l);
1328 if (l->l_flag & LW_SA_UPCALL) 1328 if (l->l_flag & LW_SA_UPCALL)
1329 sa_upcall_userret(l); 1329 sa_upcall_userret(l);
1330#endif /* KERN_SA */ 1330#endif /* KERN_SA */
1331} 1331}
1332 1332
1333/* 1333/*
1334 * Force an LWP to enter the kernel, to take a trip through lwp_userret(). 1334 * Force an LWP to enter the kernel, to take a trip through lwp_userret().
1335 */ 1335 */
1336void 1336void
1337lwp_need_userret(struct lwp *l) 1337lwp_need_userret(struct lwp *l)
1338{ 1338{
1339 KASSERT(lwp_locked(l, NULL)); 1339 KASSERT(lwp_locked(l, NULL));
1340 1340
1341 /* 1341 /*
1342 * Since the tests in lwp_userret() are done unlocked, make sure 1342 * Since the tests in lwp_userret() are done unlocked, make sure
1343 * that the condition will be seen before forcing the LWP to enter 1343 * that the condition will be seen before forcing the LWP to enter
1344 * kernel mode. 1344 * kernel mode.
1345 */ 1345 */
1346 membar_producer(); 1346 membar_producer();
1347 cpu_signotify(l); 1347 cpu_signotify(l);
1348} 1348}
1349 1349
1350/* 1350/*
1351 * Add one reference to an LWP. This will prevent the LWP from 1351 * Add one reference to an LWP. This will prevent the LWP from
 1352 * exiting, thus keeping the lwp structure and PCB around to inspect. 1352 * exiting, thus keeping the lwp structure and PCB around to inspect.
1353 */ 1353 */
1354void 1354void
1355lwp_addref(struct lwp *l) 1355lwp_addref(struct lwp *l)
1356{ 1356{
1357 1357
1358 KASSERT(mutex_owned(l->l_proc->p_lock)); 1358 KASSERT(mutex_owned(l->l_proc->p_lock));
1359 KASSERT(l->l_stat != LSZOMB); 1359 KASSERT(l->l_stat != LSZOMB);
1360 KASSERT(l->l_refcnt != 0); 1360 KASSERT(l->l_refcnt != 0);
1361 1361
1362 l->l_refcnt++; 1362 l->l_refcnt++;
1363} 1363}
1364 1364
1365/* 1365/*
1366 * Remove one reference to an LWP. If this is the last reference, 1366 * Remove one reference to an LWP. If this is the last reference,
1367 * then we must finalize the LWP's death. 1367 * then we must finalize the LWP's death.
1368 */ 1368 */
1369void 1369void
1370lwp_delref(struct lwp *l) 1370lwp_delref(struct lwp *l)
1371{ 1371{
1372 struct proc *p = l->l_proc; 1372 struct proc *p = l->l_proc;
1373 1373
1374 mutex_enter(p->p_lock); 1374 mutex_enter(p->p_lock);
1375 KASSERT(l->l_stat != LSZOMB); 1375 KASSERT(l->l_stat != LSZOMB);
1376 KASSERT(l->l_refcnt > 0); 1376 KASSERT(l->l_refcnt > 0);
1377 if (--l->l_refcnt == 0) 1377 if (--l->l_refcnt == 0)
1378 cv_broadcast(&p->p_lwpcv); 1378 cv_broadcast(&p->p_lwpcv);
1379 mutex_exit(p->p_lock); 1379 mutex_exit(p->p_lock);
1380} 1380}
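
lwp_addref() and lwp_delref() support the hold pattern described in the comments above: look the LWP up under p->p_lock, take a reference so it cannot finish exiting, drop the lock, inspect the LWP, then drop the reference. A sketch of that pattern, assuming the existing lwp_find() lookup; the inspect callback and the wrapper function are hypothetical:

	static int
	inspect_lwp_by_id(struct proc *p, lwpid_t lid, void (*inspect)(struct lwp *))
	{
		struct lwp *l;

		mutex_enter(p->p_lock);
		l = lwp_find(p, lid);		/* lookup requires p->p_lock */
		if (l == NULL) {
			mutex_exit(p->p_lock);
			return ESRCH;
		}
		lwp_addref(l);			/* pin the LWP and its PCB */
		mutex_exit(p->p_lock);

		(*inspect)(l);			/* safe: the LWP cannot finish exiting */

		lwp_delref(l);			/* may wake a waiter in lwp_drainrefs() */
		return 0;
	}
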
1381 1381
1382/* 1382/*
1383 * Drain all references to the current LWP. 1383 * Drain all references to the current LWP.
1384 */ 1384 */
1385void 1385void
1386lwp_drainrefs(struct lwp *l) 1386lwp_drainrefs(struct lwp *l)
1387{ 1387{
1388 struct proc *p = l->l_proc; 1388 struct proc *p = l->l_proc;
1389 1389
1390 KASSERT(mutex_owned(p->p_lock)); 1390 KASSERT(mutex_owned(p->p_lock));
1391 KASSERT(l->l_refcnt != 0); 1391 KASSERT(l->l_refcnt != 0);
1392 1392
1393 l->l_refcnt--; 1393 l->l_refcnt--;
1394 while (l->l_refcnt != 0) 1394 while (l->l_refcnt != 0)
1395 cv_wait(&p->p_lwpcv, p->p_lock); 1395 cv_wait(&p->p_lwpcv, p->p_lock);
1396} 1396}
1397 1397
1398/* 1398/*
1399 * Return true if the specified LWP is 'alive'. Only p->p_lock need 1399 * Return true if the specified LWP is 'alive'. Only p->p_lock need
1400 * be held. 1400 * be held.
1401 */ 1401 */
1402bool 1402bool
1403lwp_alive(lwp_t *l) 1403lwp_alive(lwp_t *l)
1404{ 1404{
1405 1405
1406 KASSERT(mutex_owned(l->l_proc->p_lock)); 1406 KASSERT(mutex_owned(l->l_proc->p_lock));
1407 1407
1408 switch (l->l_stat) { 1408 switch (l->l_stat) {
1409 case LSSLEEP: 1409 case LSSLEEP:
1410 case LSRUN: 1410 case LSRUN:
1411 case LSONPROC: 1411 case LSONPROC:
1412 case LSSTOP: 1412 case LSSTOP:
1413 case LSSUSPENDED: 1413 case LSSUSPENDED:
1414 return true; 1414 return true;
1415 default: 1415 default:
1416 return false; 1416 return false;
1417 } 1417 }
1418} 1418}
1419 1419
1420/* 1420/*
1421 * Return first live LWP in the process. 1421 * Return first live LWP in the process.
1422 */ 1422 */
1423lwp_t * 1423lwp_t *
1424lwp_find_first(proc_t *p) 1424lwp_find_first(proc_t *p)
1425{ 1425{
1426 lwp_t *l; 1426 lwp_t *l;
1427 1427
1428 KASSERT(mutex_owned(p->p_lock)); 1428 KASSERT(mutex_owned(p->p_lock));
1429 1429
1430 LIST_FOREACH(l, &p->p_lwps, l_sibling) { 1430 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1431 if (lwp_alive(l)) { 1431 if (lwp_alive(l)) {
1432 return l; 1432 return l;
1433 } 1433 }
1434 } 1434 }
1435 1435
1436 return NULL; 1436 return NULL;
1437} 1437}
1438 1438
1439/* 1439/*
1440 * lwp_specific_key_create -- 1440 * lwp_specific_key_create --
1441 * Create a key for subsystem lwp-specific data. 1441 * Create a key for subsystem lwp-specific data.
1442 */ 1442 */
1443int 1443int
1444lwp_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor) 1444lwp_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
1445{ 1445{
1446 1446
1447 return (specificdata_key_create(lwp_specificdata_domain, keyp, dtor)); 1447 return (specificdata_key_create(lwp_specificdata_domain, keyp, dtor));
1448} 1448}
1449 1449
1450/* 1450/*
1451 * lwp_specific_key_delete -- 1451 * lwp_specific_key_delete --
1452 * Delete a key for subsystem lwp-specific data. 1452 * Delete a key for subsystem lwp-specific data.
1453 */ 1453 */
1454void 1454void
1455lwp_specific_key_delete(specificdata_key_t key) 1455lwp_specific_key_delete(specificdata_key_t key)
1456{ 1456{
1457 1457
1458 specificdata_key_delete(lwp_specificdata_domain, key); 1458 specificdata_key_delete(lwp_specificdata_domain, key);
1459} 1459}
1460 1460
1461/* 1461/*
1462 * lwp_initspecific -- 1462 * lwp_initspecific --
1463 * Initialize an LWP's specificdata container. 1463 * Initialize an LWP's specificdata container.
1464 */ 1464 */
1465void 1465void
1466lwp_initspecific(struct lwp *l) 1466lwp_initspecific(struct lwp *l)
1467{ 1467{
1468 int error; 1468 int error;
1469 1469
1470 error = specificdata_init(lwp_specificdata_domain, &l->l_specdataref); 1470 error = specificdata_init(lwp_specificdata_domain, &l->l_specdataref);
1471 KASSERT(error == 0); 1471 KASSERT(error == 0);
1472} 1472}
1473 1473
1474/* 1474/*
1475 * lwp_finispecific -- 1475 * lwp_finispecific --
1476 * Finalize an LWP's specificdata container. 1476 * Finalize an LWP's specificdata container.
1477 */ 1477 */
1478void 1478void
1479lwp_finispecific(struct lwp *l) 1479lwp_finispecific(struct lwp *l)
1480{ 1480{
1481 1481
1482 specificdata_fini(lwp_specificdata_domain, &l->l_specdataref); 1482 specificdata_fini(lwp_specificdata_domain, &l->l_specdataref);
1483} 1483}
1484 1484
1485/* 1485/*
1486 * lwp_getspecific -- 1486 * lwp_getspecific --
1487 * Return lwp-specific data corresponding to the specified key. 1487 * Return lwp-specific data corresponding to the specified key.
1488 * 1488 *
1489 * Note: LWP specific data is NOT INTERLOCKED. An LWP should access 1489 * Note: LWP specific data is NOT INTERLOCKED. An LWP should access
1490 * only its OWN SPECIFIC DATA. If it is necessary to access another 1490 * only its OWN SPECIFIC DATA. If it is necessary to access another
1491 * LWP's specific data, care must be taken to ensure that doing so 1491 * LWP's specific data, care must be taken to ensure that doing so
1492 * would not cause internal data structure inconsistency (i.e. caller 1492 * would not cause internal data structure inconsistency (i.e. caller
1493 * can guarantee that the target LWP is not inside an lwp_getspecific() 1493 * can guarantee that the target LWP is not inside an lwp_getspecific()
1494 * or lwp_setspecific() call). 1494 * or lwp_setspecific() call).
1495 */ 1495 */
1496void * 1496void *
1497lwp_getspecific(specificdata_key_t key) 1497lwp_getspecific(specificdata_key_t key)
1498{ 1498{
1499 1499
1500 return (specificdata_getspecific_unlocked(lwp_specificdata_domain, 1500 return (specificdata_getspecific_unlocked(lwp_specificdata_domain,
1501 &curlwp->l_specdataref, key)); 1501 &curlwp->l_specdataref, key));
1502} 1502}
1503 1503
1504void * 1504void *
1505_lwp_getspecific_by_lwp(struct lwp *l, specificdata_key_t key) 1505_lwp_getspecific_by_lwp(struct lwp *l, specificdata_key_t key)
1506{ 1506{
1507 1507
1508 return (specificdata_getspecific_unlocked(lwp_specificdata_domain, 1508 return (specificdata_getspecific_unlocked(lwp_specificdata_domain,
1509 &l->l_specdataref, key)); 1509 &l->l_specdataref, key));
1510} 1510}
1511 1511
1512/* 1512/*
1513 * lwp_setspecific -- 1513 * lwp_setspecific --
1514 * Set lwp-specific data corresponding to the specified key. 1514 * Set lwp-specific data corresponding to the specified key.
1515 */ 1515 */
1516void 1516void
1517lwp_setspecific(specificdata_key_t key, void *data) 1517lwp_setspecific(specificdata_key_t key, void *data)
1518{ 1518{
1519 1519
1520 specificdata_setspecific(lwp_specificdata_domain, 1520 specificdata_setspecific(lwp_specificdata_domain,
1521 &curlwp->l_specdataref, key, data); 1521 &curlwp->l_specdataref, key, data);
1522} 1522}
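
The lwp_specific_* functions above form a small API: a subsystem creates a key once (optionally with a destructor), and each LWP then stores and retrieves only its own pointer, per the locking note above. A minimal sketch of that usage; the subsystem name, state structure and key variable are all hypothetical:

	struct foo_state {
		int	fs_count;	/* hypothetical per-LWP state */
	};

	static specificdata_key_t foo_lwp_key;

	static void
	foo_lwp_dtor(void *data)
	{
		kmem_free(data, sizeof(struct foo_state));
	}

	void
	foo_subsys_init(void)
	{
		int error;

		error = lwp_specific_key_create(&foo_lwp_key, foo_lwp_dtor);
		KASSERT(error == 0);
	}

	void
	foo_set_curlwp(struct foo_state *fs)
	{
		lwp_setspecific(foo_lwp_key, fs);	/* current LWP's own slot */
	}

	struct foo_state *
	foo_get_curlwp(void)
	{
		return lwp_getspecific(foo_lwp_key);
	}
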
1523 1523
1524/* 1524/*
1525 * Allocate a new lwpctl structure for a user LWP. 1525 * Allocate a new lwpctl structure for a user LWP.
1526 */ 1526 */
1527int 1527int
1528lwp_ctl_alloc(vaddr_t *uaddr) 1528lwp_ctl_alloc(vaddr_t *uaddr)
1529{ 1529{
1530 lcproc_t *lp; 1530 lcproc_t *lp;
1531 u_int bit, i, offset; 1531 u_int bit, i, offset;
1532 struct uvm_object *uao; 1532 struct uvm_object *uao;
1533 int error; 1533 int error;
1534 lcpage_t *lcp; 1534 lcpage_t *lcp;
1535 proc_t *p; 1535 proc_t *p;
1536 lwp_t *l; 1536 lwp_t *l;
1537 1537
1538 l = curlwp; 1538 l = curlwp;
1539 p = l->l_proc; 1539 p = l->l_proc;
1540 1540
1541 if (l->l_lcpage != NULL) { 1541 if (l->l_lcpage != NULL) {
1542 lcp = l->l_lcpage; 1542 lcp = l->l_lcpage;
1543 *uaddr = lcp->lcp_uaddr + (vaddr_t)l->l_lwpctl - lcp->lcp_kaddr; 1543 *uaddr = lcp->lcp_uaddr + (vaddr_t)l->l_lwpctl - lcp->lcp_kaddr;
1544 return (EINVAL); 1544 return (EINVAL);
1545 } 1545 }
1546 1546
1547 /* First time around, allocate header structure for the process. */ 1547 /* First time around, allocate header structure for the process. */
1548 if ((lp = p->p_lwpctl) == NULL) { 1548 if ((lp = p->p_lwpctl) == NULL) {
1549 lp = kmem_alloc(sizeof(*lp), KM_SLEEP); 1549 lp = kmem_alloc(sizeof(*lp), KM_SLEEP);
1550 mutex_init(&lp->lp_lock, MUTEX_DEFAULT, IPL_NONE); 1550 mutex_init(&lp->lp_lock, MUTEX_DEFAULT, IPL_NONE);
1551 lp->lp_uao = NULL; 1551 lp->lp_uao = NULL;
1552 TAILQ_INIT(&lp->lp_pages); 1552 TAILQ_INIT(&lp->lp_pages);
1553 mutex_enter(p->p_lock); 1553 mutex_enter(p->p_lock);
1554 if (p->p_lwpctl == NULL) { 1554 if (p->p_lwpctl == NULL) {
1555 p->p_lwpctl = lp; 1555 p->p_lwpctl = lp;
1556 mutex_exit(p->p_lock); 1556 mutex_exit(p->p_lock);
1557 } else { 1557 } else {
1558 mutex_exit(p->p_lock); 1558 mutex_exit(p->p_lock);
1559 mutex_destroy(&lp->lp_lock); 1559 mutex_destroy(&lp->lp_lock);
1560 kmem_free(lp, sizeof(*lp)); 1560 kmem_free(lp, sizeof(*lp));
1561 lp = p->p_lwpctl; 1561 lp = p->p_lwpctl;
1562 } 1562 }
1563 } 1563 }
1564 1564
1565 /* 1565 /*
1566 * Set up an anonymous memory region to hold the shared pages. 1566 * Set up an anonymous memory region to hold the shared pages.
1567 * Map them into the process' address space. The user vmspace 1567 * Map them into the process' address space. The user vmspace
1568 * gets the first reference on the UAO. 1568 * gets the first reference on the UAO.
1569 */ 1569 */
1570 mutex_enter(&lp->lp_lock); 1570 mutex_enter(&lp->lp_lock);
1571 if (lp->lp_uao == NULL) { 1571 if (lp->lp_uao == NULL) {
1572 lp->lp_uao = uao_create(LWPCTL_UAREA_SZ, 0); 1572 lp->lp_uao = uao_create(LWPCTL_UAREA_SZ, 0);
1573 lp->lp_cur = 0; 1573 lp->lp_cur = 0;
1574 lp->lp_max = LWPCTL_UAREA_SZ; 1574 lp->lp_max = LWPCTL_UAREA_SZ;
1575 lp->lp_uva = p->p_emul->e_vm_default_addr(p, 1575 lp->lp_uva = p->p_emul->e_vm_default_addr(p,
1576 (vaddr_t)p->p_vmspace->vm_daddr, LWPCTL_UAREA_SZ); 1576 (vaddr_t)p->p_vmspace->vm_daddr, LWPCTL_UAREA_SZ);
1577 error = uvm_map(&p->p_vmspace->vm_map, &lp->lp_uva, 1577 error = uvm_map(&p->p_vmspace->vm_map, &lp->lp_uva,
1578 LWPCTL_UAREA_SZ, lp->lp_uao, 0, 0, UVM_MAPFLAG(UVM_PROT_RW, 1578 LWPCTL_UAREA_SZ, lp->lp_uao, 0, 0, UVM_MAPFLAG(UVM_PROT_RW,
1579 UVM_PROT_RW, UVM_INH_NONE, UVM_ADV_NORMAL, 0)); 1579 UVM_PROT_RW, UVM_INH_NONE, UVM_ADV_NORMAL, 0));
1580 if (error != 0) { 1580 if (error != 0) {
1581 uao_detach(lp->lp_uao); 1581 uao_detach(lp->lp_uao);
1582 lp->lp_uao = NULL; 1582 lp->lp_uao = NULL;
1583 mutex_exit(&lp->lp_lock); 1583 mutex_exit(&lp->lp_lock);
1584 return error; 1584 return error;
1585 } 1585 }
1586 } 1586 }
1587 1587
1588 /* Get a free block and allocate for this LWP. */ 1588 /* Get a free block and allocate for this LWP. */
1589 TAILQ_FOREACH(lcp, &lp->lp_pages, lcp_chain) { 1589 TAILQ_FOREACH(lcp, &lp->lp_pages, lcp_chain) {
1590 if (lcp->lcp_nfree != 0) 1590 if (lcp->lcp_nfree != 0)
1591 break; 1591 break;
1592 } 1592 }
1593 if (lcp == NULL) { 1593 if (lcp == NULL) {
1594 /* Nothing available - try to set up a free page. */ 1594 /* Nothing available - try to set up a free page. */
1595 if (lp->lp_cur == lp->lp_max) { 1595 if (lp->lp_cur == lp->lp_max) {
1596 mutex_exit(&lp->lp_lock); 1596 mutex_exit(&lp->lp_lock);
1597 return ENOMEM; 1597 return ENOMEM;
1598 } 1598 }
1599 lcp = kmem_alloc(LWPCTL_LCPAGE_SZ, KM_SLEEP); 1599 lcp = kmem_alloc(LWPCTL_LCPAGE_SZ, KM_SLEEP);
1600 if (lcp == NULL) { 1600 if (lcp == NULL) {
1601 mutex_exit(&lp->lp_lock); 1601 mutex_exit(&lp->lp_lock);
1602 return ENOMEM; 1602 return ENOMEM;
1603 } 1603 }
1604 /* 1604 /*
1605 * Wire the next page down in kernel space. Since this 1605 * Wire the next page down in kernel space. Since this
1606 * is a new mapping, we must add a reference. 1606 * is a new mapping, we must add a reference.
1607 */ 1607 */
1608 uao = lp->lp_uao; 1608 uao = lp->lp_uao;
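
The start of lwp_ctl_alloc() above shows a common pattern for lazily attaching a structure to the process: the presence check is unlocked, the allocation happens before taking p->p_lock (KM_SLEEP may block), and the pointer is only published, or the losing copy freed, under the lock. A condensed sketch of just that pattern; the structure and the p_bar field are purely hypothetical:

	typedef struct bar {
		kmutex_t	b_lock;
	} bar_t;

	static bar_t *
	proc_get_bar(struct proc *p)
	{
		bar_t *b;

		if (p->p_bar != NULL)		/* hypothetical field; unlocked check */
			return p->p_bar;

		b = kmem_alloc(sizeof(*b), KM_SLEEP);
		mutex_init(&b->b_lock, MUTEX_DEFAULT, IPL_NONE);

		mutex_enter(p->p_lock);
		if (p->p_bar == NULL) {
			p->p_bar = b;		/* won the race: publish */
			mutex_exit(p->p_lock);
		} else {
			mutex_exit(p->p_lock);	/* lost the race: discard our copy */
			mutex_destroy(&b->b_lock);
			kmem_free(b, sizeof(*b));
			b = p->p_bar;
		}
		return b;
	}
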

cvs diff -r1.56 -r1.57 src/sys/sys/filedesc.h (switch to unified diff)

--- src/sys/sys/filedesc.h 2009/05/25 03:59:45 1.56
+++ src/sys/sys/filedesc.h 2009/10/27 02:58:28 1.57
@@ -1,236 +1,236 @@ @@ -1,236 +1,236 @@
1/* $NetBSD: filedesc.h,v 1.56 2009/05/25 03:59:45 yamt Exp $ */ 1/* $NetBSD: filedesc.h,v 1.57 2009/10/27 02:58:28 rmind Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2008 The NetBSD Foundation, Inc. 4 * Copyright (c) 2008 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29/* 29/*
30 * Copyright (c) 1990, 1993 30 * Copyright (c) 1990, 1993
31 * The Regents of the University of California. All rights reserved. 31 * The Regents of the University of California. All rights reserved.
32 * 32 *
33 * Redistribution and use in source and binary forms, with or without 33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions 34 * modification, are permitted provided that the following conditions
35 * are met: 35 * are met:
36 * 1. Redistributions of source code must retain the above copyright 36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer. 37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright 38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the 39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution. 40 * documentation and/or other materials provided with the distribution.
41 * 3. Neither the name of the University nor the names of its contributors 41 * 3. Neither the name of the University nor the names of its contributors
42 * may be used to endorse or promote products derived from this software 42 * may be used to endorse or promote products derived from this software
43 * without specific prior written permission. 43 * without specific prior written permission.
44 * 44 *
45 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 45 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 48 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * SUCH DAMAGE. 55 * SUCH DAMAGE.
56 * 56 *
57 * @(#)filedesc.h 8.1 (Berkeley) 6/2/93 57 * @(#)filedesc.h 8.1 (Berkeley) 6/2/93
58 */ 58 */
59 59
60#ifndef _SYS_FILEDESC_H_ 60#ifndef _SYS_FILEDESC_H_
61#define _SYS_FILEDESC_H_ 61#define _SYS_FILEDESC_H_
62 62
63#include <sys/param.h> 63#include <sys/param.h>
64#include <sys/queue.h> 64#include <sys/queue.h>
65#include <sys/mutex.h> 65#include <sys/mutex.h>
66#include <sys/rwlock.h> 66#include <sys/rwlock.h>
67#include <sys/condvar.h> 67#include <sys/condvar.h>
68 68
69/* 69/*
70 * This structure is used for the management of descriptors. It may be 70 * This structure is used for the management of descriptors. It may be
71 * shared by multiple processes. 71 * shared by multiple processes.
72 * 72 *
73 * A process is initially started out with NDFILE descriptors stored within 73 * A process is initially started out with NDFILE descriptors stored within
74 * this structure, selected to be enough for typical applications based on 74 * this structure, selected to be enough for typical applications based on
75 * the historical limit of 20 open files (and the usage of descriptors by 75 * the historical limit of 20 open files (and the usage of descriptors by
76 * shells). If these descriptors are exhausted, a larger descriptor table 76 * shells). If these descriptors are exhausted, a larger descriptor table
77 * may be allocated, up to a process' resource limit; the internal arrays 77 * may be allocated, up to a process' resource limit; the internal arrays
78 * are then unused. The initial expansion is set to NDEXTENT; each time 78 * are then unused. The initial expansion is set to NDEXTENT; each time
79 * it runs out, it is doubled until the resource limit is reached. NDEXTENT 79 * it runs out, it is doubled until the resource limit is reached. NDEXTENT
80 * should be selected to be the biggest multiple of OFILESIZE (see below) 80 * should be selected to be the biggest multiple of OFILESIZE (see below)
81 * that will fit in a power-of-two sized piece of memory. 81 * that will fit in a power-of-two sized piece of memory.
82 */ 82 */
83#define NDFILE 20 83#define NDFILE 20
84#define NDEXTENT 50 /* 250 bytes in 256-byte alloc */ 84#define NDEXTENT 50 /* 250 bytes in 256-byte alloc */
85#define NDENTRIES 32 /* 32 fds per entry */ 85#define NDENTRIES 32 /* 32 fds per entry */
86#define NDENTRYMASK (NDENTRIES - 1) 86#define NDENTRYMASK (NDENTRIES - 1)
87#define NDENTRYSHIFT 5 /* bits per entry */ 87#define NDENTRYSHIFT 5 /* bits per entry */
88#define NDLOSLOTS(x) (((x) + NDENTRIES - 1) >> NDENTRYSHIFT) 88#define NDLOSLOTS(x) (((x) + NDENTRIES - 1) >> NDENTRYSHIFT)
89#define NDHISLOTS(x) ((NDLOSLOTS(x) + NDENTRIES - 1) >> NDENTRYSHIFT) 89#define NDHISLOTS(x) ((NDLOSLOTS(x) + NDENTRIES - 1) >> NDENTRYSHIFT)
90#define NDFDFILE 6 /* first 6 descriptors are free */ 90#define NDFDFILE 6 /* first 6 descriptors are free */
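
As a worked example of the two-level bitmap arithmetic above (not part of the header): for a table of 1024 descriptors, NDLOSLOTS(1024) = (1024 + 31) >> 5 = 32 words in fd_lomap (one bit per descriptor), and NDHISLOTS(1024) = (32 + 31) >> 5 = 1 word in fd_himap (one bit per fd_lomap word), so locating a free slot takes at most two word scans.
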
91 91
92/* 92/*
93 * Process-private descriptor reference, one for each descriptor slot 93 * Process-private descriptor reference, one for each descriptor slot
94 * in use. Locks: 94 * in use. Locks:
95 * 95 *
96 * : unlocked 96 * : unlocked
97 * a atomic operations + filedesc_t::fd_lock in some cases 97 * a atomic operations + filedesc_t::fd_lock in some cases
98 * d filedesc_t::fd_lock 98 * d filedesc_t::fd_lock
99 * 99 *
100 * Note that ff_exclose and ff_allocated are likely to be byte sized 100 * Note that ff_exclose and ff_allocated are likely to be byte sized
101 * (bool). In general adjacent sub-word sized fields must be locked 101 * (bool). In general adjacent sub-word sized fields must be locked
102 * the same way, but in this case it's ok: ff_exclose can only be 102 * the same way, but in this case it's ok: ff_exclose can only be
103 * modified while the descriptor slot is live, and ff_allocated when 103 * modified while the descriptor slot is live, and ff_allocated when
104 * it's invalid. 104 * it's invalid.
105 */ 105 */
106typedef struct fdfile { 106typedef struct fdfile {
107 bool ff_exclose; /* :: close on exec flag */ 107 bool ff_exclose; /* :: close on exec flag */
108 bool ff_allocated; /* d: descriptor slot is allocated */ 108 bool ff_allocated; /* d: descriptor slot is allocated */
109 u_int ff_refcnt; /* a: reference count on structure */ 109 u_int ff_refcnt; /* a: reference count on structure */
110 struct file *ff_file; /* d: pointer to file if open */ 110 struct file *ff_file; /* d: pointer to file if open */
111 SLIST_HEAD(,knote) ff_knlist; /* d: knotes attached to this fd */ 111 SLIST_HEAD(,knote) ff_knlist; /* d: knotes attached to this fd */
112 kcondvar_t ff_closing; /* d: notifier for close */ 112 kcondvar_t ff_closing; /* d: notifier for close */
113} fdfile_t; 113} fdfile_t;
114 114
115/* Reference count */ 115/* Reference count */
116#define FR_CLOSING (0x80000000) /* closing: must interlock */ 116#define FR_CLOSING (0x80000000) /* closing: must interlock */
117#define FR_MASK (~FR_CLOSING) /* reference count */ 117#define FR_MASK (~FR_CLOSING) /* reference count */
118 118
119/* 119/*
120 * Open file table, potentially many 'active' tables per filedesc_t 120 * Open file table, potentially many 'active' tables per filedesc_t
121 * in a multi-threaded process, or with a shared filedesc_t (clone()). 121 * in a multi-threaded process, or with a shared filedesc_t (clone()).
122 * nfiles is first to avoid pointer arithmetic. 122 * nfiles is first to avoid pointer arithmetic.
123 */ 123 */
124typedef struct fdtab { 124typedef struct fdtab {
125 u_int dt_nfiles; /* number of open files allocated */ 125 u_int dt_nfiles; /* number of open files allocated */
126 struct fdtab *dt_link; /* for lists of dtab */ 126 struct fdtab *dt_link; /* for lists of dtab */
127 fdfile_t *dt_ff[NDFILE]; /* file structures for open fds */ 127 fdfile_t *dt_ff[NDFILE]; /* file structures for open fds */
128} fdtab_t; 128} fdtab_t;
129 129
130typedef struct filedesc { 130typedef struct filedesc {
131 /* 131 /*
132 * Built-in fdfile_t records first, since they have strict 132 * Built-in fdfile_t records first, since they have strict
133 * alignment requirements. 133 * alignment requirements.
134 */ 134 */
135 uint8_t fd_dfdfile[NDFDFILE][CACHE_LINE_SIZE]; 135 uint8_t fd_dfdfile[NDFDFILE][CACHE_LINE_SIZE];
136 /* 136 /*
137 * All of the remaining fields are locked by fd_lock. 137 * All of the remaining fields are locked by fd_lock.
138 */ 138 */
139 kmutex_t fd_lock; /* lock on structure */ 139 kmutex_t fd_lock; /* lock on structure */
140 fdtab_t * volatile fd_dt; /* active descriptor table */ 140 fdtab_t * volatile fd_dt; /* active descriptor table */
141 uint32_t *fd_himap; /* each bit points to 32 fds */ 141 uint32_t *fd_himap; /* each bit points to 32 fds */
142 uint32_t *fd_lomap; /* bitmap of free fds */ 142 uint32_t *fd_lomap; /* bitmap of free fds */
143 struct klist *fd_knhash; /* hash of attached non-fd knotes */ 143 struct klist *fd_knhash; /* hash of attached non-fd knotes */
144 int fd_lastkqfile; /* max descriptor for kqueue */ 144 int fd_lastkqfile; /* max descriptor for kqueue */
145 int fd_lastfile; /* high-water mark of fd_ofiles */ 145 int fd_lastfile; /* high-water mark of fd_ofiles */
146 int fd_refcnt; /* reference count */ 146 int fd_refcnt; /* reference count */
147 u_long fd_knhashmask; /* size of fd_knhash */ 147 u_long fd_knhashmask; /* size of fd_knhash */
148 int fd_freefile; /* approx. next free file */ 148 int fd_freefile; /* approx. next free file */
149 int fd_unused; /* unused */ 149 int fd_unused; /* unused */
150 bool fd_exclose; /* non-zero if >0 fd with EXCLOSE */ 150 bool fd_exclose; /* non-zero if >0 fd with EXCLOSE */
151 /* 151 /*
152 * This structure is used when the number of open files is 152 * This structure is used when the number of open files is
153 * <= NDFILE, and is then pointed to by the pointers above. 153 * <= NDFILE, and is then pointed to by the pointers above.
154 */ 154 */
155 fdtab_t fd_dtbuiltin; 155 fdtab_t fd_dtbuiltin;
156 /* 156 /*
157 * These arrays are used when the number of open files is 157 * These arrays are used when the number of open files is
158 * <= 1024, and are then pointed to by the pointers above. 158 * <= 1024, and are then pointed to by the pointers above.
159 */ 159 */
160#define fd_startzero fd_dhimap /* area to zero on return to cache */ 160#define fd_startzero fd_dhimap /* area to zero on return to cache */
161 uint32_t fd_dhimap[NDENTRIES >> NDENTRYSHIFT]; 161 uint32_t fd_dhimap[NDENTRIES >> NDENTRYSHIFT];
162 uint32_t fd_dlomap[NDENTRIES]; 162 uint32_t fd_dlomap[NDENTRIES];
163} filedesc_t; 163} filedesc_t;
164 164
165typedef struct cwdinfo { 165typedef struct cwdinfo {
166 struct vnode *cwdi_cdir; /* current directory */ 166 struct vnode *cwdi_cdir; /* current directory */
167 struct vnode *cwdi_rdir; /* root directory */ 167 struct vnode *cwdi_rdir; /* root directory */
168 struct vnode *cwdi_edir; /* emulation root (if known) */ 168 struct vnode *cwdi_edir; /* emulation root (if known) */
169 krwlock_t cwdi_lock; /* lock on entire struct */ 169 krwlock_t cwdi_lock; /* lock on entire struct */
170 u_short cwdi_cmask; /* mask for file creation */ 170 u_short cwdi_cmask; /* mask for file creation */
171 u_int cwdi_refcnt; /* reference count */ 171 u_int cwdi_refcnt; /* reference count */
172} cwdinfo_t; 172} cwdinfo_t;
173 173
174#ifdef _KERNEL 174#ifdef _KERNEL
175 175
176struct fileops; 176struct fileops;
177struct socket; 177struct socket;
178struct proc; 178struct proc;
179 179
180/* 180/*
181 * Kernel global variables and routines. 181 * Kernel global variables and routines.
182 */ 182 */
183void fd_sys_init(void); 183void fd_sys_init(void);
184int fd_dupopen(int, int *, int, int); 184int fd_dupopen(int, int *, int, int);
185int fd_alloc(struct proc *, int, int *); 185int fd_alloc(struct proc *, int, int *);
186void fd_tryexpand(struct proc *); 186void fd_tryexpand(struct proc *);
187int fd_allocfile(file_t **, int *); 187int fd_allocfile(file_t **, int *);
188void fd_affix(struct proc *, file_t *, unsigned); 188void fd_affix(struct proc *, file_t *, unsigned);
189void fd_abort(struct proc *, file_t *, unsigned); 189void fd_abort(struct proc *, file_t *, unsigned);
190filedesc_t *fd_copy(void); 190filedesc_t *fd_copy(void);
191filedesc_t *fd_init(filedesc_t *); 191filedesc_t *fd_init(filedesc_t *);
192void fd_share(proc_t *); 192void fd_share(proc_t *);
193void fd_hold(void); 193void fd_hold(lwp_t *);
194void fd_free(void); 194void fd_free(void);
195void fd_closeexec(void); 195void fd_closeexec(void);
196int fd_checkstd(void); 196int fd_checkstd(void);
197file_t *fd_getfile(unsigned); 197file_t *fd_getfile(unsigned);
198file_t *fd_getfile2(proc_t *, unsigned); 198file_t *fd_getfile2(proc_t *, unsigned);
199void fd_putfile(unsigned); 199void fd_putfile(unsigned);
200int fd_getvnode(unsigned, file_t **); 200int fd_getvnode(unsigned, file_t **);
201int fd_getsock(unsigned, struct socket **); 201int fd_getsock(unsigned, struct socket **);
202void fd_putvnode(unsigned); 202void fd_putvnode(unsigned);
203void fd_putsock(unsigned); 203void fd_putsock(unsigned);
204int fd_close(unsigned); 204int fd_close(unsigned);
205int fd_dup(file_t *, int, int *, bool); 205int fd_dup(file_t *, int, int *, bool);
206int fd_dup2(file_t *, unsigned); 206int fd_dup2(file_t *, unsigned);
207int fd_clone(file_t *, unsigned, int, const struct fileops *, void *); 207int fd_clone(file_t *, unsigned, int, const struct fileops *, void *);
208 208
209void cwd_sys_init(void); 209void cwd_sys_init(void);
210struct cwdinfo *cwdinit(void); 210struct cwdinfo *cwdinit(void);
211void cwdshare(proc_t *); 211void cwdshare(proc_t *);
212void cwdunshare(proc_t *); 212void cwdunshare(proc_t *);
213void cwdfree(struct cwdinfo *); 213void cwdfree(struct cwdinfo *);
214 214
215#define GETCWD_CHECK_ACCESS 0x0001 215#define GETCWD_CHECK_ACCESS 0x0001
216int getcwd_common(struct vnode *, struct vnode *, char **, char *, int, 216int getcwd_common(struct vnode *, struct vnode *, char **, char *, int,
217 int, struct lwp *); 217 int, struct lwp *);
218int vnode_to_path(char *, size_t, struct vnode *, struct lwp *, 218int vnode_to_path(char *, size_t, struct vnode *, struct lwp *,
219 struct proc *); 219 struct proc *);
220 220
221int closef(file_t *); 221int closef(file_t *);
222file_t *fgetdummy(void); 222file_t *fgetdummy(void);
223void fputdummy(file_t *); 223void fputdummy(file_t *);
224 224
225struct stat; 225struct stat;
226int do_sys_fstat(int, struct stat *); 226int do_sys_fstat(int, struct stat *);
227struct flock; 227struct flock;
228int do_fcntl_lock(int, int, struct flock *); 228int do_fcntl_lock(int, int, struct flock *);
229int do_posix_fadvise(int, off_t, off_t, int); 229int do_posix_fadvise(int, off_t, off_t, int);
230 230
231extern kmutex_t filelist_lock; 231extern kmutex_t filelist_lock;
232extern filedesc_t filedesc0; 232extern filedesc_t filedesc0;
233 233
234#endif /* _KERNEL */ 234#endif /* _KERNEL */
235 235
236#endif /* !_SYS_FILEDESC_H_ */ 236#endif /* !_SYS_FILEDESC_H_ */
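
The one functional change in this header is the fd_hold() prototype, which now takes the LWP whose descriptor table gains the reference, covering both fork1() and creation from lwp0 (e.g. kthread_create()) per the commit message. An illustrative fragment of the intended caller side in lwp_create(); the kern_lwp.c hunk itself is not part of this excerpt:

	/* Inside lwp_create(): the new LWP l2 shares p2's descriptor table. */
	l2->l_fd = p2->p_fd;
	fd_hold(l2);		/* take the reference through the interface,
				   instead of touching filedesc internals */
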