Wed Jul 27 14:35:34 2011 UTC
These don't need uvm/uvm_extern.h.


(uebayasi)
diff -r1.127 -r1.128 src/sys/kern/kern_clock.c
diff -r1.29 -r1.30 src/sys/kern/kern_condvar.c
diff -r1.29 -r1.30 src/sys/kern/kern_turnstile.c
diff -r1.63 -r1.64 src/sys/kern/kern_ksyms.c
diff -r1.40 -r1.41 src/sys/kern/kern_sleepq.c
diff -r1.208 -r1.209 src/sys/kern/kern_subr.c
diff -r1.168 -r1.169 src/sys/kern/kern_time.c
diff -r1.26 -r1.27 src/sys/kern/sched_4bsd.c
diff -r1.13 -r1.14 src/sys/kern/subr_percpu.c
diff -r1.30 -r1.31 src/sys/kern/subr_workqueue.c
diff -r1.126 -r1.127 src/sys/kern/sys_generic.c
diff -r1.140 -r1.141 src/sys/kern/uipc_mbuf.c
diff -r1.145 -r1.146 src/sys/kern/uipc_syscalls.c

cvs diff -r1.127 -r1.128 src/sys/kern/kern_clock.c

--- src/sys/kern/kern_clock.c 2010/12/20 00:25:46 1.127
+++ src/sys/kern/kern_clock.c 2011/07/27 14:35:33 1.128
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_clock.c,v 1.127 2010/12/20 00:25:46 matt Exp $	*/
+/*	$NetBSD: kern_clock.c,v 1.128 2011/07/27 14:35:33 uebayasi Exp $	*/
 
 /*-
  * Copyright (c) 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
@@ -69,7 +69,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.127 2010/12/20 00:25:46 matt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.128 2011/07/27 14:35:33 uebayasi Exp $");
 
 #include "opt_ntp.h"
 #include "opt_perfctrs.h"
@@ -88,9 +88,7 @@
 #include <sys/timetc.h>
 #include <sys/cpu.h>
 #include <sys/atomic.h>
 
-#include <uvm/uvm_extern.h>
-
 #ifdef GPROF
 #include <sys/gmon.h>
 #endif

cvs diff -r1.29 -r1.30 src/sys/kern/kern_condvar.c

--- src/sys/kern/kern_condvar.c 2011/04/14 20:19:35 1.29
+++ src/sys/kern/kern_condvar.c 2011/07/27 14:35:33 1.30
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_condvar.c,v 1.29 2011/04/14 20:19:35 jym Exp $	*/
+/*	$NetBSD: kern_condvar.c,v 1.30 2011/07/27 14:35:33 uebayasi Exp $	*/
 
 /*-
  * Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.29 2011/04/14 20:19:35 jym Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.30 2011/07/27 14:35:33 uebayasi Exp $");
 
 #include <sys/param.h>
 #include <sys/proc.h>
@@ -45,8 +45,6 @@
 #include <sys/lockdebug.h>
 #include <sys/cpu.h>
 
-#include <uvm/uvm_extern.h>
-
 /*
  * Accessors for the private contents of the kcondvar_t data type.
  *

cvs diff -r1.29 -r1.30 src/sys/kern/kern_turnstile.c

--- src/sys/kern/kern_turnstile.c 2011/05/13 22:19:41 1.29
+++ src/sys/kern/kern_turnstile.c 2011/07/27 14:35:34 1.30
@@ -1,515 +1,513 @@ @@ -1,515 +1,513 @@
1/* $NetBSD: kern_turnstile.c,v 1.29 2011/05/13 22:19:41 rmind Exp $ */ 1/* $NetBSD: kern_turnstile.c,v 1.30 2011/07/27 14:35:34 uebayasi Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2002, 2006, 2007, 2009 The NetBSD Foundation, Inc. 4 * Copyright (c) 2002, 2006, 2007, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe and Andrew Doran. 8 * by Jason R. Thorpe and Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright 15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the 16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution. 17 * documentation and/or other materials provided with the distribution.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32/* 32/*
33 * Turnstiles are described in detail in: 33 * Turnstiles are described in detail in:
34 * 34 *
35 * Solaris Internals: Core Kernel Architecture, Jim Mauro and 35 * Solaris Internals: Core Kernel Architecture, Jim Mauro and
36 * Richard McDougall. 36 * Richard McDougall.
37 * 37 *
38 * Turnstiles are kept in a hash table. There are likely to be many more 38 * Turnstiles are kept in a hash table. There are likely to be many more
39 * synchronisation objects than there are threads. Since a thread can block 39 * synchronisation objects than there are threads. Since a thread can block
40 * on only one lock at a time, we only need one turnstile per thread, and 40 * on only one lock at a time, we only need one turnstile per thread, and
41 * so they are allocated at thread creation time. 41 * so they are allocated at thread creation time.
42 * 42 *
43 * When a thread decides it needs to block on a lock, it looks up the 43 * When a thread decides it needs to block on a lock, it looks up the
44 * active turnstile for that lock. If no active turnstile exists, then 44 * active turnstile for that lock. If no active turnstile exists, then
45 * the process lends its turnstile to the lock. If there is already an 45 * the process lends its turnstile to the lock. If there is already an
46 * active turnstile for the lock, the thread places its turnstile on a 46 * active turnstile for the lock, the thread places its turnstile on a
47 * list of free turnstiles, and references the active one instead. 47 * list of free turnstiles, and references the active one instead.
48 * 48 *
49 * The act of looking up the turnstile acquires an interlock on the sleep 49 * The act of looking up the turnstile acquires an interlock on the sleep
50 * queue. If a thread decides it doesn't need to block after all, then this 50 * queue. If a thread decides it doesn't need to block after all, then this
51 * interlock must be released by explicitly aborting the turnstile 51 * interlock must be released by explicitly aborting the turnstile
52 * operation. 52 * operation.
53 * 53 *
54 * When a thread is awakened, it needs to get its turnstile back. If there 54 * When a thread is awakened, it needs to get its turnstile back. If there
55 * are still other threads waiting in the active turnstile, the thread 55 * are still other threads waiting in the active turnstile, the thread
56 * grabs a free turnstile off the free list. Otherwise, it can take back 56 * grabs a free turnstile off the free list. Otherwise, it can take back
57 * the active turnstile from the lock (thus deactivating the turnstile). 57 * the active turnstile from the lock (thus deactivating the turnstile).
58 * 58 *
59 * Turnstiles are the place to do priority inheritence. 59 * Turnstiles are the place to do priority inheritence.
60 */ 60 */
61 61
62#include <sys/cdefs.h> 62#include <sys/cdefs.h>
63__KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.29 2011/05/13 22:19:41 rmind Exp $"); 63__KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.30 2011/07/27 14:35:34 uebayasi Exp $");
64 64
65#include <sys/param.h> 65#include <sys/param.h>
66#include <sys/lockdebug.h> 66#include <sys/lockdebug.h>
67#include <sys/pool.h> 67#include <sys/pool.h>
68#include <sys/proc.h>  68#include <sys/proc.h>
69#include <sys/sleepq.h> 69#include <sys/sleepq.h>
70#include <sys/systm.h> 70#include <sys/systm.h>
71 71
72#include <uvm/uvm_extern.h> 
73 
74#define TS_HASH_SIZE 64 72#define TS_HASH_SIZE 64
75#define TS_HASH_MASK (TS_HASH_SIZE - 1) 73#define TS_HASH_MASK (TS_HASH_SIZE - 1)
76#define TS_HASH(obj) (((uintptr_t)(obj) >> 3) & TS_HASH_MASK) 74#define TS_HASH(obj) (((uintptr_t)(obj) >> 3) & TS_HASH_MASK)
77 75
78static tschain_t turnstile_tab[TS_HASH_SIZE] __cacheline_aligned; 76static tschain_t turnstile_tab[TS_HASH_SIZE] __cacheline_aligned;
79pool_cache_t turnstile_cache __read_mostly; 77pool_cache_t turnstile_cache __read_mostly;
80 78
81static int turnstile_ctor(void *, void *, int); 79static int turnstile_ctor(void *, void *, int);
82 80
83extern turnstile_t turnstile0; 81extern turnstile_t turnstile0;
84 82
85/* 83/*
86 * turnstile_init: 84 * turnstile_init:
87 * 85 *
88 * Initialize the turnstile mechanism. 86 * Initialize the turnstile mechanism.
89 */ 87 */
90void 88void
91turnstile_init(void) 89turnstile_init(void)
92{ 90{
93 tschain_t *tc; 91 tschain_t *tc;
94 int i; 92 int i;
95 93
96 for (i = 0; i < TS_HASH_SIZE; i++) { 94 for (i = 0; i < TS_HASH_SIZE; i++) {
97 tc = &turnstile_tab[i]; 95 tc = &turnstile_tab[i];
98 LIST_INIT(&tc->tc_chain); 96 LIST_INIT(&tc->tc_chain);
99 tc->tc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED); 97 tc->tc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
100 } 98 }
101 99
102 turnstile_cache = pool_cache_init(sizeof(turnstile_t), 0, 0, 0, 100 turnstile_cache = pool_cache_init(sizeof(turnstile_t), 0, 0, 0,
103 "tstilepl", NULL, IPL_NONE, turnstile_ctor, NULL, NULL); 101 "tstilepl", NULL, IPL_NONE, turnstile_ctor, NULL, NULL);
104 KASSERT(turnstile_cache != NULL); 102 KASSERT(turnstile_cache != NULL);
105 103
106 (void)turnstile_ctor(NULL, &turnstile0, 0); 104 (void)turnstile_ctor(NULL, &turnstile0, 0);
107} 105}
108 106
109/* 107/*
110 * turnstile_ctor: 108 * turnstile_ctor:
111 * 109 *
112 * Constructor for turnstiles. 110 * Constructor for turnstiles.
113 */ 111 */
114static int 112static int
115turnstile_ctor(void *arg, void *obj, int flags) 113turnstile_ctor(void *arg, void *obj, int flags)
116{ 114{
117 turnstile_t *ts = obj; 115 turnstile_t *ts = obj;
118 116
119 memset(ts, 0, sizeof(*ts)); 117 memset(ts, 0, sizeof(*ts));
120 sleepq_init(&ts->ts_sleepq[TS_READER_Q]); 118 sleepq_init(&ts->ts_sleepq[TS_READER_Q]);
121 sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]); 119 sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]);
122 return (0); 120 return (0);
123} 121}
124 122
125/* 123/*
126 * turnstile_remove: 124 * turnstile_remove:
127 * 125 *
128 * Remove an LWP from a turnstile sleep queue and wake it. 126 * Remove an LWP from a turnstile sleep queue and wake it.
129 */ 127 */
130static inline void 128static inline void
131turnstile_remove(turnstile_t *ts, lwp_t *l, int q) 129turnstile_remove(turnstile_t *ts, lwp_t *l, int q)
132{ 130{
133 turnstile_t *nts; 131 turnstile_t *nts;
134 132
135 KASSERT(l->l_ts == ts); 133 KASSERT(l->l_ts == ts);
136 134
137 /* 135 /*
138 * This process is no longer using the active turnstile. 136 * This process is no longer using the active turnstile.
139 * Find an inactive one on the free list to give to it. 137 * Find an inactive one on the free list to give to it.
140 */ 138 */
141 if ((nts = ts->ts_free) != NULL) { 139 if ((nts = ts->ts_free) != NULL) {
142 KASSERT(TS_ALL_WAITERS(ts) > 1); 140 KASSERT(TS_ALL_WAITERS(ts) > 1);
143 l->l_ts = nts; 141 l->l_ts = nts;
144 ts->ts_free = nts->ts_free; 142 ts->ts_free = nts->ts_free;
145 nts->ts_free = NULL; 143 nts->ts_free = NULL;
146 } else { 144 } else {
147 /* 145 /*
148 * If the free list is empty, this is the last 146 * If the free list is empty, this is the last
149 * waiter. 147 * waiter.
150 */ 148 */
151 KASSERT(TS_ALL_WAITERS(ts) == 1); 149 KASSERT(TS_ALL_WAITERS(ts) == 1);
152 LIST_REMOVE(ts, ts_chain); 150 LIST_REMOVE(ts, ts_chain);
153 } 151 }
154 152
155 ts->ts_waiters[q]--; 153 ts->ts_waiters[q]--;
156 sleepq_remove(&ts->ts_sleepq[q], l); 154 sleepq_remove(&ts->ts_sleepq[q], l);
157} 155}
158 156
159/* 157/*
160 * turnstile_lookup: 158 * turnstile_lookup:
161 * 159 *
162 * Look up the turnstile for the specified lock. This acquires and 160 * Look up the turnstile for the specified lock. This acquires and
163 * holds the turnstile chain lock (sleep queue interlock). 161 * holds the turnstile chain lock (sleep queue interlock).
164 */ 162 */
165turnstile_t * 163turnstile_t *
166turnstile_lookup(wchan_t obj) 164turnstile_lookup(wchan_t obj)
167{ 165{
168 turnstile_t *ts; 166 turnstile_t *ts;
169 tschain_t *tc; 167 tschain_t *tc;
170 168
171 tc = &turnstile_tab[TS_HASH(obj)]; 169 tc = &turnstile_tab[TS_HASH(obj)];
172 mutex_spin_enter(tc->tc_mutex); 170 mutex_spin_enter(tc->tc_mutex);
173 171
174 LIST_FOREACH(ts, &tc->tc_chain, ts_chain) 172 LIST_FOREACH(ts, &tc->tc_chain, ts_chain)
175 if (ts->ts_obj == obj) 173 if (ts->ts_obj == obj)
176 return (ts); 174 return (ts);
177 175
178 /* 176 /*
179 * No turnstile yet for this lock. No problem, turnstile_block() 177 * No turnstile yet for this lock. No problem, turnstile_block()
180 * handles this by fetching the turnstile from the blocking thread. 178 * handles this by fetching the turnstile from the blocking thread.
181 */ 179 */
182 return (NULL); 180 return (NULL);
183} 181}
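
turnstile_lookup() is plain hash-with-chaining: the lock address is hashed into a fixed-size table (discarding the low bits that are always zero due to alignment), the per-chain spin mutex is taken, and the chain is scanned for a matching ts_obj. Note that the chain mutex stays held whether or not a turnstile is found; turnstile_exit() is the matching unlock. A minimal standalone sketch of the same pattern, with hypothetical names and no locking:

	#include <stddef.h>
	#include <stdint.h>

	#define HASH_SIZE	64
	#define HASH_MASK	(HASH_SIZE - 1)
	/* Pointers are at least 8-byte aligned, so drop the low 3 bits. */
	#define HASH(obj)	(((uintptr_t)(obj) >> 3) & HASH_MASK)

	struct node {
		struct node *next;
		const void *obj;
	};
	static struct node *table[HASH_SIZE];

	/* Return the node chained under 'obj', or NULL if none exists yet. */
	static struct node *
	lookup(const void *obj)
	{
		struct node *n;

		for (n = table[HASH(obj)]; n != NULL; n = n->next)
			if (n->obj == obj)
				return n;
		return NULL;
	}
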
184 182
185/* 183/*
186 * turnstile_exit: 184 * turnstile_exit:
187 * 185 *
188 * Abort a turnstile operation. 186 * Abort a turnstile operation.
189 */ 187 */
190void 188void
191turnstile_exit(wchan_t obj) 189turnstile_exit(wchan_t obj)
192{ 190{
193 tschain_t *tc; 191 tschain_t *tc;
194 192
195 tc = &turnstile_tab[TS_HASH(obj)]; 193 tc = &turnstile_tab[TS_HASH(obj)];
196 mutex_spin_exit(tc->tc_mutex); 194 mutex_spin_exit(tc->tc_mutex);
197} 195}
198 196
199/* 197/*
200 * turnstile_block: 198 * turnstile_block:
201 * 199 *
202 * Enter an object into the turnstile chain and prepare the current 200 * Enter an object into the turnstile chain and prepare the current
203 * LWP for sleep. 201 * LWP for sleep.
204 */ 202 */
205void 203void
206turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj) 204turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj)
207{ 205{
208 lwp_t *l; 206 lwp_t *l;
209 lwp_t *cur; /* cached curlwp */ 207 lwp_t *cur; /* cached curlwp */
210 lwp_t *owner; 208 lwp_t *owner;
211 turnstile_t *ots; 209 turnstile_t *ots;
212 tschain_t *tc; 210 tschain_t *tc;
213 sleepq_t *sq; 211 sleepq_t *sq;
214 pri_t prio, obase; 212 pri_t prio, obase;
215 213
216 tc = &turnstile_tab[TS_HASH(obj)]; 214 tc = &turnstile_tab[TS_HASH(obj)];
217 l = cur = curlwp; 215 l = cur = curlwp;
218 216
219 KASSERT(q == TS_READER_Q || q == TS_WRITER_Q); 217 KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
220 KASSERT(mutex_owned(tc->tc_mutex)); 218 KASSERT(mutex_owned(tc->tc_mutex));
221 KASSERT(l != NULL && l->l_ts != NULL); 219 KASSERT(l != NULL && l->l_ts != NULL);
222 220
223 if (ts == NULL) { 221 if (ts == NULL) {
224 /* 222 /*
225 * We are the first thread to wait for this object; 223 * We are the first thread to wait for this object;
226 * lend our turnstile to it. 224 * lend our turnstile to it.
227 */ 225 */
228 ts = l->l_ts; 226 ts = l->l_ts;
229 KASSERT(TS_ALL_WAITERS(ts) == 0); 227 KASSERT(TS_ALL_WAITERS(ts) == 0);
230 KASSERT(TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q]) && 228 KASSERT(TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q]) &&
231 TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q])); 229 TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
232 ts->ts_obj = obj; 230 ts->ts_obj = obj;
233 ts->ts_inheritor = NULL; 231 ts->ts_inheritor = NULL;
234 LIST_INSERT_HEAD(&tc->tc_chain, ts, ts_chain); 232 LIST_INSERT_HEAD(&tc->tc_chain, ts, ts_chain);
235 } else { 233 } else {
236 /* 234 /*
237 * Object already has a turnstile. Put our turnstile 235 * Object already has a turnstile. Put our turnstile
238 * onto the free list, and reference the existing 236 * onto the free list, and reference the existing
239 * turnstile instead. 237 * turnstile instead.
240 */ 238 */
241 ots = l->l_ts; 239 ots = l->l_ts;
242 KASSERT(ots->ts_free == NULL); 240 KASSERT(ots->ts_free == NULL);
243 ots->ts_free = ts->ts_free; 241 ots->ts_free = ts->ts_free;
244 ts->ts_free = ots; 242 ts->ts_free = ots;
245 l->l_ts = ts; 243 l->l_ts = ts;
246 244
247 KASSERT(ts->ts_obj == obj); 245 KASSERT(ts->ts_obj == obj);
248 KASSERT(TS_ALL_WAITERS(ts) != 0); 246 KASSERT(TS_ALL_WAITERS(ts) != 0);
249 KASSERT(!TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q]) || 247 KASSERT(!TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q]) ||
250 !TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q])); 248 !TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
251 } 249 }
252 250
253 sq = &ts->ts_sleepq[q]; 251 sq = &ts->ts_sleepq[q];
254 ts->ts_waiters[q]++; 252 ts->ts_waiters[q]++;
255 sleepq_enter(sq, l, tc->tc_mutex); 253 sleepq_enter(sq, l, tc->tc_mutex);
256 LOCKDEBUG_BARRIER(tc->tc_mutex, 1); 254 LOCKDEBUG_BARRIER(tc->tc_mutex, 1);
257 l->l_kpriority = true; 255 l->l_kpriority = true;
258 obase = l->l_kpribase; 256 obase = l->l_kpribase;
259 if (obase < PRI_KTHREAD) 257 if (obase < PRI_KTHREAD)
260 l->l_kpribase = PRI_KTHREAD; 258 l->l_kpribase = PRI_KTHREAD;
261 sleepq_enqueue(sq, obj, "tstile", sobj); 259 sleepq_enqueue(sq, obj, "tstile", sobj);
262 260
263 /* 261 /*
264 * Disable preemption across this entire block, as we may drop 262 * Disable preemption across this entire block, as we may drop
265 * scheduler locks (allowing preemption), and would prefer not 263 * scheduler locks (allowing preemption), and would prefer not
266 * to be interrupted while in a state of flux. 264 * to be interrupted while in a state of flux.
267 */ 265 */
268 KPREEMPT_DISABLE(l); 266 KPREEMPT_DISABLE(l);
269 267
270 /* 268 /*
271 * Lend our priority to lwps on the blocking chain. 269 * Lend our priority to lwps on the blocking chain.
272 * 270 *
273 * NOTE: if you get a panic in this code block, it is likely that 271 * NOTE: if you get a panic in this code block, it is likely that
274 * a lock has been destroyed or corrupted while still in use. Try 272 * a lock has been destroyed or corrupted while still in use. Try
275 * compiling a kernel with LOCKDEBUG to pinpoint the problem. 273 * compiling a kernel with LOCKDEBUG to pinpoint the problem.
276 */ 274 */
277 prio = lwp_eprio(l); 275 prio = lwp_eprio(l);
278 KASSERT(cur == l); 276 KASSERT(cur == l);
279 KASSERT(tc->tc_mutex == cur->l_mutex); 277 KASSERT(tc->tc_mutex == cur->l_mutex);
280 for (;;) { 278 for (;;) {
281 bool dolock; 279 bool dolock;
282 280
283 if (l->l_wchan == NULL) 281 if (l->l_wchan == NULL)
284 break; 282 break;
285 283
286 owner = (*l->l_syncobj->sobj_owner)(l->l_wchan); 284 owner = (*l->l_syncobj->sobj_owner)(l->l_wchan);
287 if (owner == NULL) 285 if (owner == NULL)
288 break; 286 break;
289 287
290 /* The owner may have changed as we have dropped the tc lock */ 288 /* The owner may have changed as we have dropped the tc lock */
291 if (cur == owner) { 289 if (cur == owner) {
292 /* 290 /*
293 * we own the lock: stop here, sleepq_block() 291 * we own the lock: stop here, sleepq_block()
 294 * should wake up immediately 292 * should wake up immediately
295 */ 293 */
296 break; 294 break;
297 } 295 }
298 if (l->l_mutex != owner->l_mutex) 296 if (l->l_mutex != owner->l_mutex)
299 dolock = true; 297 dolock = true;
300 else 298 else
301 dolock = false; 299 dolock = false;
302 if (l == owner || (dolock && !lwp_trylock(owner))) { 300 if (l == owner || (dolock && !lwp_trylock(owner))) {
303 /* 301 /*
304 * restart from curlwp. 302 * restart from curlwp.
305 * Note that there may be a livelock here: 303 * Note that there may be a livelock here:
 306 * the owner may try grabbing cur's lock (which is 304 * the owner may try grabbing cur's lock (which is
307 * the tc lock) while we're trying to grab 305 * the tc lock) while we're trying to grab
308 * the owner's lock. 306 * the owner's lock.
309 */ 307 */
310 lwp_unlock(l); 308 lwp_unlock(l);
311 l = cur; 309 l = cur;
312 lwp_lock(l); 310 lwp_lock(l);
313 prio = lwp_eprio(l); 311 prio = lwp_eprio(l);
314 continue; 312 continue;
315 } 313 }
316 if (prio <= lwp_eprio(owner)) { 314 if (prio <= lwp_eprio(owner)) {
317 if (dolock) 315 if (dolock)
318 lwp_unlock(owner); 316 lwp_unlock(owner);
319 break; 317 break;
320 } 318 }
321 ts = l->l_ts; 319 ts = l->l_ts;
322 KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL); 320 KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL);
323 if (ts->ts_inheritor == NULL) { 321 if (ts->ts_inheritor == NULL) {
324 ts->ts_inheritor = owner; 322 ts->ts_inheritor = owner;
325 ts->ts_eprio = prio; 323 ts->ts_eprio = prio;
326 SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain); 324 SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain);
327 lwp_lendpri(owner, prio); 325 lwp_lendpri(owner, prio);
328 } else if (prio > ts->ts_eprio) { 326 } else if (prio > ts->ts_eprio) {
329 ts->ts_eprio = prio; 327 ts->ts_eprio = prio;
330 lwp_lendpri(owner, prio); 328 lwp_lendpri(owner, prio);
331 } 329 }
332 if (dolock) 330 if (dolock)
333 lwp_unlock(l); 331 lwp_unlock(l);
334 l = owner; 332 l = owner;
335 } 333 }
336 LOCKDEBUG_BARRIER(l->l_mutex, 1); 334 LOCKDEBUG_BARRIER(l->l_mutex, 1);
337 if (cur->l_mutex != l->l_mutex) { 335 if (cur->l_mutex != l->l_mutex) {
338 lwp_unlock(l); 336 lwp_unlock(l);
339 lwp_lock(cur); 337 lwp_lock(cur);
340 } 338 }
341 LOCKDEBUG_BARRIER(cur->l_mutex, 1); 339 LOCKDEBUG_BARRIER(cur->l_mutex, 1);
342 340
343 sleepq_block(0, false); 341 sleepq_block(0, false);
344 cur->l_kpribase = obase; 342 cur->l_kpribase = obase;
345 KPREEMPT_ENABLE(cur); 343 KPREEMPT_ENABLE(cur);
346} 344}
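
Stripped of the per-LWP locking and restart protocol, the lending loop above implements a simple rule: walk the blocking chain cur -> owner -> owner's owner -> ... and raise each owner's effective priority to at least ours, stopping once an owner already runs at an equal or higher priority. A simplified model of just that propagation, with hypothetical types (the real code additionally records the lending turnstile on the owner's l_pi_lenders list so the donation can be withdrawn in turnstile_wakeup()):

	struct mlwp {
		int eprio;		/* effective priority; larger is better */
		struct mlwp *owner;	/* owner of the lock this LWP sleeps on */
	};

	/* Propagate 'prio' down the blocking chain (model only). */
	static void
	lend_priority(struct mlwp *l, int prio)
	{
		while (l != NULL && l->eprio < prio) {
			l->eprio = prio;	/* real code: lwp_lendpri() */
			l = l->owner;
		}
	}
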
347 345
348/* 346/*
349 * turnstile_wakeup: 347 * turnstile_wakeup:
350 * 348 *
351 * Wake up the specified number of threads that are blocked 349 * Wake up the specified number of threads that are blocked
352 * in a turnstile. 350 * in a turnstile.
353 */ 351 */
354void 352void
355turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl) 353turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl)
356{ 354{
357 sleepq_t *sq; 355 sleepq_t *sq;
358 tschain_t *tc; 356 tschain_t *tc;
359 lwp_t *l; 357 lwp_t *l;
360 358
361 tc = &turnstile_tab[TS_HASH(ts->ts_obj)]; 359 tc = &turnstile_tab[TS_HASH(ts->ts_obj)];
362 sq = &ts->ts_sleepq[q]; 360 sq = &ts->ts_sleepq[q];
363 361
364 KASSERT(q == TS_READER_Q || q == TS_WRITER_Q); 362 KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
365 KASSERT(count > 0 && count <= TS_WAITERS(ts, q)); 363 KASSERT(count > 0 && count <= TS_WAITERS(ts, q));
366 KASSERT(mutex_owned(tc->tc_mutex)); 364 KASSERT(mutex_owned(tc->tc_mutex));
367 KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL); 365 KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL);
368 366
369 /* 367 /*
370 * restore inherited priority if necessary. 368 * restore inherited priority if necessary.
371 */ 369 */
372 370
373 if (ts->ts_inheritor != NULL) { 371 if (ts->ts_inheritor != NULL) {
374 turnstile_t *iter; 372 turnstile_t *iter;
375 turnstile_t *next; 373 turnstile_t *next;
376 turnstile_t *prev = NULL; 374 turnstile_t *prev = NULL;
377 pri_t prio; 375 pri_t prio;
378 bool dolock; 376 bool dolock;
379 377
380 ts->ts_inheritor = NULL; 378 ts->ts_inheritor = NULL;
381 l = curlwp; 379 l = curlwp;
382 380
383 dolock = l->l_mutex == l->l_cpu->ci_schedstate.spc_lwplock; 381 dolock = l->l_mutex == l->l_cpu->ci_schedstate.spc_lwplock;
384 if (dolock) { 382 if (dolock) {
385 lwp_lock(l); 383 lwp_lock(l);
386 } 384 }
387 385
388 /* 386 /*
389 * the following loop does two things. 387 * the following loop does two things.
390 * 388 *
391 * - remove ts from the list. 389 * - remove ts from the list.
392 * 390 *
393 * - from the rest of the list, find the highest priority. 391 * - from the rest of the list, find the highest priority.
394 */ 392 */
395 393
396 prio = -1; 394 prio = -1;
397 KASSERT(!SLIST_EMPTY(&l->l_pi_lenders)); 395 KASSERT(!SLIST_EMPTY(&l->l_pi_lenders));
398 for (iter = SLIST_FIRST(&l->l_pi_lenders); 396 for (iter = SLIST_FIRST(&l->l_pi_lenders);
399 iter != NULL; iter = next) { 397 iter != NULL; iter = next) {
400 KASSERT(lwp_eprio(l) >= ts->ts_eprio); 398 KASSERT(lwp_eprio(l) >= ts->ts_eprio);
401 next = SLIST_NEXT(iter, ts_pichain); 399 next = SLIST_NEXT(iter, ts_pichain);
402 if (iter == ts) { 400 if (iter == ts) {
403 if (prev == NULL) { 401 if (prev == NULL) {
404 SLIST_REMOVE_HEAD(&l->l_pi_lenders, 402 SLIST_REMOVE_HEAD(&l->l_pi_lenders,
405 ts_pichain); 403 ts_pichain);
406 } else { 404 } else {
407 SLIST_REMOVE_AFTER(prev, ts_pichain); 405 SLIST_REMOVE_AFTER(prev, ts_pichain);
408 } 406 }
409 } else if (prio < iter->ts_eprio) { 407 } else if (prio < iter->ts_eprio) {
410 prio = iter->ts_eprio; 408 prio = iter->ts_eprio;
411 } 409 }
412 prev = iter; 410 prev = iter;
413 } 411 }
414 412
415 lwp_lendpri(l, prio); 413 lwp_lendpri(l, prio);
416 414
417 if (dolock) { 415 if (dolock) {
418 lwp_unlock(l); 416 lwp_unlock(l);
419 } 417 }
420 } 418 }
421 419
422 if (nl != NULL) { 420 if (nl != NULL) {
423#if defined(DEBUG) || defined(LOCKDEBUG) 421#if defined(DEBUG) || defined(LOCKDEBUG)
424 TAILQ_FOREACH(l, sq, l_sleepchain) { 422 TAILQ_FOREACH(l, sq, l_sleepchain) {
425 if (l == nl) 423 if (l == nl)
426 break; 424 break;
427 } 425 }
428 if (l == NULL) 426 if (l == NULL)
429 panic("turnstile_wakeup: nl not on sleepq"); 427 panic("turnstile_wakeup: nl not on sleepq");
430#endif 428#endif
431 turnstile_remove(ts, nl, q); 429 turnstile_remove(ts, nl, q);
432 } else { 430 } else {
433 while (count-- > 0) { 431 while (count-- > 0) {
434 l = TAILQ_FIRST(sq); 432 l = TAILQ_FIRST(sq);
435 KASSERT(l != NULL); 433 KASSERT(l != NULL);
436 turnstile_remove(ts, l, q); 434 turnstile_remove(ts, l, q);
437 } 435 }
438 } 436 }
439 mutex_spin_exit(tc->tc_mutex); 437 mutex_spin_exit(tc->tc_mutex);
440} 438}
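
Withdrawing the donation in turnstile_wakeup() is one pass over the singly linked l_pi_lenders list: unlink the turnstile that is releasing its lent priority and, in the same pass, find the highest priority still being lent by the remaining entries, which becomes the new lent priority. The same pass in standalone form, with hypothetical types and the pointer-to-pointer unlinking idiom:

	struct lender {
		struct lender *next;
		int eprio;
	};

	/*
	 * Unlink 't' from the list at '*headp' and return the maximum
	 * eprio among the remaining entries (-1 if the list is left empty).
	 */
	static int
	remove_and_max(struct lender **headp, struct lender *t)
	{
		struct lender **pp, *it;
		int max = -1;

		for (pp = headp; (it = *pp) != NULL; ) {
			if (it == t) {
				*pp = it->next;		/* unlink, keep scanning */
			} else {
				if (it->eprio > max)
					max = it->eprio;
				pp = &it->next;
			}
		}
		return max;
	}
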
441 439
442/* 440/*
443 * turnstile_unsleep: 441 * turnstile_unsleep:
444 * 442 *
445 * Remove an LWP from the turnstile. This is called when the LWP has 443 * Remove an LWP from the turnstile. This is called when the LWP has
446 * not been awoken normally but instead interrupted: for example, if it 444 * not been awoken normally but instead interrupted: for example, if it
447 * has received a signal. It's not a valid action for turnstiles, 445 * has received a signal. It's not a valid action for turnstiles,
 448 * since LWPs blocking on a turnstile are not interruptible. 446 * since LWPs blocking on a turnstile are not interruptible.
449 */ 447 */
450void 448void
451turnstile_unsleep(lwp_t *l, bool cleanup) 449turnstile_unsleep(lwp_t *l, bool cleanup)
452{ 450{
453 451
454 lwp_unlock(l); 452 lwp_unlock(l);
455 panic("turnstile_unsleep"); 453 panic("turnstile_unsleep");
456} 454}
457 455
458/* 456/*
459 * turnstile_changepri: 457 * turnstile_changepri:
460 * 458 *
461 * Adjust the priority of an LWP residing on a turnstile. 459 * Adjust the priority of an LWP residing on a turnstile.
462 */ 460 */
463void 461void
464turnstile_changepri(lwp_t *l, pri_t pri) 462turnstile_changepri(lwp_t *l, pri_t pri)
465{ 463{
466 464
467 /* XXX priority inheritance */ 465 /* XXX priority inheritance */
468 sleepq_changepri(l, pri); 466 sleepq_changepri(l, pri);
469} 467}
470 468
471#if defined(LOCKDEBUG) 469#if defined(LOCKDEBUG)
472/* 470/*
473 * turnstile_print: 471 * turnstile_print:
474 * 472 *
475 * Given the address of a lock object, print the contents of a 473 * Given the address of a lock object, print the contents of a
476 * turnstile. 474 * turnstile.
477 */ 475 */
478void 476void
479turnstile_print(volatile void *obj, void (*pr)(const char *, ...)) 477turnstile_print(volatile void *obj, void (*pr)(const char *, ...))
480{ 478{
481 turnstile_t *ts; 479 turnstile_t *ts;
482 tschain_t *tc; 480 tschain_t *tc;
483 sleepq_t *rsq, *wsq; 481 sleepq_t *rsq, *wsq;
484 lwp_t *l; 482 lwp_t *l;
485 483
486 tc = &turnstile_tab[TS_HASH(obj)]; 484 tc = &turnstile_tab[TS_HASH(obj)];
487 485
488 LIST_FOREACH(ts, &tc->tc_chain, ts_chain) 486 LIST_FOREACH(ts, &tc->tc_chain, ts_chain)
489 if (ts->ts_obj == obj) 487 if (ts->ts_obj == obj)
490 break; 488 break;
491 489
492 (*pr)("Turnstile chain at %p.\n", tc); 490 (*pr)("Turnstile chain at %p.\n", tc);
493 if (ts == NULL) { 491 if (ts == NULL) {
494 (*pr)("=> No active turnstile for this lock.\n"); 492 (*pr)("=> No active turnstile for this lock.\n");
495 return; 493 return;
496 } 494 }
497 495
498 rsq = &ts->ts_sleepq[TS_READER_Q]; 496 rsq = &ts->ts_sleepq[TS_READER_Q];
499 wsq = &ts->ts_sleepq[TS_WRITER_Q]; 497 wsq = &ts->ts_sleepq[TS_WRITER_Q];
500 498
501 (*pr)("=> Turnstile at %p (wrq=%p, rdq=%p).\n", ts, rsq, wsq); 499 (*pr)("=> Turnstile at %p (wrq=%p, rdq=%p).\n", ts, rsq, wsq);
502 500
503 (*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q)); 501 (*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q));
504 TAILQ_FOREACH(l, rsq, l_sleepchain) { 502 TAILQ_FOREACH(l, rsq, l_sleepchain) {
505 (*pr)(" %p", l); 503 (*pr)(" %p", l);
506 } 504 }
507 (*pr)("\n"); 505 (*pr)("\n");
508 506
509 (*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q)); 507 (*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q));
510 TAILQ_FOREACH(l, wsq, l_sleepchain) { 508 TAILQ_FOREACH(l, wsq, l_sleepchain) {
511 (*pr)(" %p", l); 509 (*pr)(" %p", l);
512 } 510 }
513 (*pr)("\n"); 511 (*pr)("\n");
514} 512}
515#endif /* LOCKDEBUG */ 513#endif /* LOCKDEBUG */

cvs diff -r1.63 -r1.64 src/sys/kern/kern_ksyms.c (switch to unified diff)

--- src/sys/kern/kern_ksyms.c 2011/04/24 18:46:22 1.63
+++ src/sys/kern/kern_ksyms.c 2011/07/27 14:35:34 1.64
@@ -1,1094 +1,1092 @@ @@ -1,1094 +1,1092 @@
1/* $NetBSD: kern_ksyms.c,v 1.63 2011/04/24 18:46:22 rmind Exp $ */ 1/* $NetBSD: kern_ksyms.c,v 1.64 2011/07/27 14:35:34 uebayasi Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2008 The NetBSD Foundation, Inc. 4 * Copyright (c) 2008 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software developed for The NetBSD Foundation 7 * This code is derived from software developed for The NetBSD Foundation
8 * by Andrew Doran. 8 * by Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright 15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the 16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution. 17 * documentation and/or other materials provided with the distribution.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32/* 32/*
33 * Copyright (c) 2001, 2003 Anders Magnusson (ragge@ludd.luth.se). 33 * Copyright (c) 2001, 2003 Anders Magnusson (ragge@ludd.luth.se).
34 * All rights reserved. 34 * All rights reserved.
35 * 35 *
36 * Redistribution and use in source and binary forms, with or without 36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions 37 * modification, are permitted provided that the following conditions
38 * are met: 38 * are met:
39 * 1. Redistributions of source code must retain the above copyright 39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer. 40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright 41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the 42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution. 43 * documentation and/or other materials provided with the distribution.
44 * 3. The name of the author may not be used to endorse or promote products 44 * 3. The name of the author may not be used to endorse or promote products
45 * derived from this software without specific prior written permission 45 * derived from this software without specific prior written permission
46 * 46 *
47 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 47 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
48 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 48 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
49 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 49 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
50 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 50 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
51 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 51 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
52 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 52 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
53 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 53 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
54 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 54 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
55 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 55 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
56 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 56 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
57 */ 57 */
58 58
59/* 59/*
60 * Code to deal with in-kernel symbol table management + /dev/ksyms. 60 * Code to deal with in-kernel symbol table management + /dev/ksyms.
61 * 61 *
62 * For each loaded module the symbol table info is kept track of by a 62 * For each loaded module the symbol table info is kept track of by a
63 * struct, placed in a circular list. The first entry is the kernel 63 * struct, placed in a circular list. The first entry is the kernel
64 * symbol table. 64 * symbol table.
65 */ 65 */
66 66
67/* 67/*
68 * TODO: 68 * TODO:
69 * 69 *
70 * Add support for mmap, poll. 70 * Add support for mmap, poll.
71 */ 71 */
72 72
73#include <sys/cdefs.h> 73#include <sys/cdefs.h>
74__KERNEL_RCSID(0, "$NetBSD: kern_ksyms.c,v 1.63 2011/04/24 18:46:22 rmind Exp $"); 74__KERNEL_RCSID(0, "$NetBSD: kern_ksyms.c,v 1.64 2011/07/27 14:35:34 uebayasi Exp $");
75 75
76#if defined(_KERNEL) && defined(_KERNEL_OPT) 76#if defined(_KERNEL) && defined(_KERNEL_OPT)
77#include "opt_ddb.h" 77#include "opt_ddb.h"
78#include "opt_ddbparam.h" /* for SYMTAB_SPACE */ 78#include "opt_ddbparam.h" /* for SYMTAB_SPACE */
79#include "opt_dtrace.h" 79#include "opt_dtrace.h"
80#endif 80#endif
81 81
82#define _KSYMS_PRIVATE 82#define _KSYMS_PRIVATE
83 83
84#include <sys/param.h> 84#include <sys/param.h>
85#include <sys/queue.h> 85#include <sys/queue.h>
86#include <sys/exec.h> 86#include <sys/exec.h>
87#include <sys/systm.h> 87#include <sys/systm.h>
88#include <sys/conf.h> 88#include <sys/conf.h>
89#include <sys/kmem.h> 89#include <sys/kmem.h>
90#include <sys/proc.h> 90#include <sys/proc.h>
91#include <sys/atomic.h> 91#include <sys/atomic.h>
92#include <sys/ksyms.h> 92#include <sys/ksyms.h>
93 93
94#include <uvm/uvm_extern.h> 
95 
96#ifdef DDB 94#ifdef DDB
97#include <ddb/db_output.h> 95#include <ddb/db_output.h>
98#endif 96#endif
99 97
100#include "ksyms.h" 98#include "ksyms.h"
101 99
102#define KSYMS_MAX_ID 65536 100#define KSYMS_MAX_ID 65536
103#ifdef KDTRACE_HOOKS 101#ifdef KDTRACE_HOOKS
104static uint32_t ksyms_nmap[KSYMS_MAX_ID]; /* sorted symbol table map */ 102static uint32_t ksyms_nmap[KSYMS_MAX_ID]; /* sorted symbol table map */
105#else 103#else
106static uint32_t *ksyms_nmap = NULL; 104static uint32_t *ksyms_nmap = NULL;
107#endif 105#endif
108 106
109static int ksyms_maxlen; 107static int ksyms_maxlen;
110static bool ksyms_isopen; 108static bool ksyms_isopen;
111static bool ksyms_initted; 109static bool ksyms_initted;
112static struct ksyms_hdr ksyms_hdr; 110static struct ksyms_hdr ksyms_hdr;
113static kmutex_t ksyms_lock; 111static kmutex_t ksyms_lock;
114 112
115void ksymsattach(int); 113void ksymsattach(int);
116static void ksyms_hdr_init(void *); 114static void ksyms_hdr_init(void *);
117static void ksyms_sizes_calc(void); 115static void ksyms_sizes_calc(void);
118 116
119#ifdef KSYMS_DEBUG 117#ifdef KSYMS_DEBUG
120#define FOLLOW_CALLS 1 118#define FOLLOW_CALLS 1
121#define FOLLOW_MORE_CALLS 2 119#define FOLLOW_MORE_CALLS 2
122#define FOLLOW_DEVKSYMS 4 120#define FOLLOW_DEVKSYMS 4
123static int ksyms_debug; 121static int ksyms_debug;
124#endif 122#endif
125 123
126#ifdef SYMTAB_SPACE 124#ifdef SYMTAB_SPACE
127#define SYMTAB_FILLER "|This is the symbol table!" 125#define SYMTAB_FILLER "|This is the symbol table!"
128 126
129char db_symtab[SYMTAB_SPACE] = SYMTAB_FILLER; 127char db_symtab[SYMTAB_SPACE] = SYMTAB_FILLER;
130int db_symtabsize = SYMTAB_SPACE; 128int db_symtabsize = SYMTAB_SPACE;
131#endif 129#endif
132 130
133int ksyms_symsz; 131int ksyms_symsz;
134int ksyms_strsz; 132int ksyms_strsz;
135int ksyms_ctfsz; 133int ksyms_ctfsz;
136TAILQ_HEAD(, ksyms_symtab) ksyms_symtabs = 134TAILQ_HEAD(, ksyms_symtab) ksyms_symtabs =
137 TAILQ_HEAD_INITIALIZER(ksyms_symtabs); 135 TAILQ_HEAD_INITIALIZER(ksyms_symtabs);
138static struct ksyms_symtab kernel_symtab; 136static struct ksyms_symtab kernel_symtab;
139 137
140static int 138static int
141ksyms_verify(void *symstart, void *strstart) 139ksyms_verify(void *symstart, void *strstart)
142{ 140{
143#if defined(DIAGNOSTIC) || defined(DEBUG) 141#if defined(DIAGNOSTIC) || defined(DEBUG)
144 if (symstart == NULL) 142 if (symstart == NULL)
145 printf("ksyms: Symbol table not found\n"); 143 printf("ksyms: Symbol table not found\n");
146 if (strstart == NULL) 144 if (strstart == NULL)
147 printf("ksyms: String table not found\n"); 145 printf("ksyms: String table not found\n");
148 if (symstart == NULL || strstart == NULL) 146 if (symstart == NULL || strstart == NULL)
149 printf("ksyms: Perhaps the kernel is stripped?\n"); 147 printf("ksyms: Perhaps the kernel is stripped?\n");
150#endif 148#endif
151 if (symstart == NULL || strstart == NULL) 149 if (symstart == NULL || strstart == NULL)
152 return 0; 150 return 0;
153 return 1; 151 return 1;
154} 152}
155 153
156/* 154/*
 157 * Find a given symbol name in a given symbol table. 155 * Find a given symbol name in a given symbol table.
158 */ 156 */
159static Elf_Sym * 157static Elf_Sym *
160findsym(const char *name, struct ksyms_symtab *table, int type) 158findsym(const char *name, struct ksyms_symtab *table, int type)
161{ 159{
162 Elf_Sym *sym, *maxsym; 160 Elf_Sym *sym, *maxsym;
163 int low, mid, high, nglob; 161 int low, mid, high, nglob;
164 char *str, *cmp; 162 char *str, *cmp;
165 163
166 sym = table->sd_symstart; 164 sym = table->sd_symstart;
167 str = table->sd_strstart - table->sd_usroffset; 165 str = table->sd_strstart - table->sd_usroffset;
168 nglob = table->sd_nglob; 166 nglob = table->sd_nglob;
169 low = 0; 167 low = 0;
170 high = nglob; 168 high = nglob;
171 169
172 /* 170 /*
173 * Start with a binary search of all global symbols in this table. 171 * Start with a binary search of all global symbols in this table.
174 * Global symbols must have unique names. 172 * Global symbols must have unique names.
175 */ 173 */
176 while (low < high) { 174 while (low < high) {
177 mid = (low + high) >> 1; 175 mid = (low + high) >> 1;
178 cmp = sym[mid].st_name + str; 176 cmp = sym[mid].st_name + str;
179 if (cmp[0] < name[0] || strcmp(cmp, name) < 0) { 177 if (cmp[0] < name[0] || strcmp(cmp, name) < 0) {
180 low = mid + 1;  178 low = mid + 1;
181 } else { 179 } else {
182 high = mid; 180 high = mid;
183 } 181 }
184 } 182 }
185 KASSERT(low == high); 183 KASSERT(low == high);
186 if (__predict_true(low < nglob && 184 if (__predict_true(low < nglob &&
187 strcmp(sym[low].st_name + str, name) == 0)) { 185 strcmp(sym[low].st_name + str, name) == 0)) {
188 KASSERT(ELF_ST_BIND(sym[low].st_info) == STB_GLOBAL); 186 KASSERT(ELF_ST_BIND(sym[low].st_info) == STB_GLOBAL);
189 return &sym[low]; 187 return &sym[low];
190 } 188 }
191 189
192 /* 190 /*
193 * Perform a linear search of local symbols (rare). Many local 191 * Perform a linear search of local symbols (rare). Many local
 194 * symbols with the same name can exist, so they are not included in 192 * symbols with the same name can exist, so they are not included in
195 * the binary search. 193 * the binary search.
196 */ 194 */
197 if (type != KSYMS_EXTERN) { 195 if (type != KSYMS_EXTERN) {
198 maxsym = sym + table->sd_symsize / sizeof(Elf_Sym); 196 maxsym = sym + table->sd_symsize / sizeof(Elf_Sym);
199 for (sym += nglob; sym < maxsym; sym++) { 197 for (sym += nglob; sym < maxsym; sym++) {
200 if (strcmp(name, sym->st_name + str) == 0) { 198 if (strcmp(name, sym->st_name + str) == 0) {
201 return sym; 199 return sym;
202 } 200 }
203 } 201 }
204 } 202 }
205 return NULL; 203 return NULL;
206} 204}
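
findsym() relies on the two-band layout built by addsymtab(): globals sorted by name at the front, locals behind them. The binary search over the global band is a standard lower bound, with a first-character comparison short-circuiting the full strcmp() on most probes. The same lower-bound search over a plain sorted string array, as a standalone sketch under those assumptions:

	#include <string.h>

	/*
	 * Return the first index whose entry is >= 'name'; the caller then
	 * checks for an exact match (i < n && strcmp(names[i], name) == 0).
	 */
	static int
	lower_bound(const char *const names[], int n, const char *name)
	{
		int low = 0, high = n, mid;

		while (low < high) {
			mid = (low + high) >> 1;
			if (strcmp(names[mid], name) < 0)
				low = mid + 1;
			else
				high = mid;
		}
		return low;
	}
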
207 205
208/* 206/*
209 * The "attach" is in reality done in ksyms_init(). 207 * The "attach" is in reality done in ksyms_init().
210 */ 208 */
211void 209void
212ksymsattach(int arg) 210ksymsattach(int arg)
213{ 211{
214 212
215} 213}
216 214
217void 215void
218ksyms_init(void) 216ksyms_init(void)
219{ 217{
220 218
221#ifdef SYMTAB_SPACE 219#ifdef SYMTAB_SPACE
222 if (!ksyms_initted && 220 if (!ksyms_initted &&
223 strncmp(db_symtab, SYMTAB_FILLER, sizeof(SYMTAB_FILLER))) { 221 strncmp(db_symtab, SYMTAB_FILLER, sizeof(SYMTAB_FILLER))) {
224 ksyms_addsyms_elf(db_symtabsize, db_symtab, 222 ksyms_addsyms_elf(db_symtabsize, db_symtab,
225 db_symtab + db_symtabsize); 223 db_symtab + db_symtabsize);
226 } 224 }
227#endif 225#endif
228 226
229 mutex_init(&ksyms_lock, MUTEX_DEFAULT, IPL_NONE); 227 mutex_init(&ksyms_lock, MUTEX_DEFAULT, IPL_NONE);
230} 228}
231 229
232/* 230/*
233 * Add a symbol table. 231 * Add a symbol table.
234 * This is intended for use when the symbol table and its corresponding 232 * This is intended for use when the symbol table and its corresponding
235 * string table are easily available. If they are embedded in an ELF 233 * string table are easily available. If they are embedded in an ELF
236 * image, use addsymtab_elf() instead. 234 * image, use addsymtab_elf() instead.
237 * 235 *
 238 * name - Name of the symbol table. 236 * name - Name of the symbol table.
239 * symstart, symsize - Address and size of the symbol table. 237 * symstart, symsize - Address and size of the symbol table.
240 * strstart, strsize - Address and size of the string table. 238 * strstart, strsize - Address and size of the string table.
241 * tab - Symbol table to be updated with this information. 239 * tab - Symbol table to be updated with this information.
242 * newstart - Address to which the symbol table has to be copied during 240 * newstart - Address to which the symbol table has to be copied during
243 * shrinking. If NULL, it is not moved. 241 * shrinking. If NULL, it is not moved.
244 */ 242 */
245static const char *addsymtab_strstart; 243static const char *addsymtab_strstart;
246 244
247static int 245static int
248addsymtab_compar(const void *a, const void *b) 246addsymtab_compar(const void *a, const void *b)
249{ 247{
250 const Elf_Sym *sa, *sb; 248 const Elf_Sym *sa, *sb;
251 249
252 sa = a; 250 sa = a;
253 sb = b; 251 sb = b;
254 252
255 /* 253 /*
256 * Split the symbol table into two, with globals at the start 254 * Split the symbol table into two, with globals at the start
257 * and locals at the end. 255 * and locals at the end.
258 */ 256 */
259 if (ELF_ST_BIND(sa->st_info) != ELF_ST_BIND(sb->st_info)) { 257 if (ELF_ST_BIND(sa->st_info) != ELF_ST_BIND(sb->st_info)) {
260 if (ELF_ST_BIND(sa->st_info) == STB_GLOBAL) { 258 if (ELF_ST_BIND(sa->st_info) == STB_GLOBAL) {
261 return -1; 259 return -1;
262 } 260 }
263 if (ELF_ST_BIND(sb->st_info) == STB_GLOBAL) { 261 if (ELF_ST_BIND(sb->st_info) == STB_GLOBAL) {
264 return 1; 262 return 1;
265 } 263 }
266 } 264 }
267 265
268 /* Within each band, sort by name. */ 266 /* Within each band, sort by name. */
269 return strcmp(sa->st_name + addsymtab_strstart, 267 return strcmp(sa->st_name + addsymtab_strstart,
270 sb->st_name + addsymtab_strstart); 268 sb->st_name + addsymtab_strstart);
271} 269}
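
This comparator is what establishes the two-band layout: any global sorts before any local, and within each band names sort lexicographically, which is exactly the precondition the binary search in findsym() depends on. A userland analogue wired into qsort(3), with hypothetical types:

	#include <stdlib.h>
	#include <string.h>

	struct sym {
		const char *name;
		int global;	/* nonzero for STB_GLOBAL-like symbols */
	};

	static int
	band_compar(const void *a, const void *b)
	{
		const struct sym *sa = a, *sb = b;

		if (sa->global != sb->global)
			return sa->global ? -1 : 1;	/* globals sort first */
		return strcmp(sa->name, sb->name);
	}

	/* Usage: qsort(syms, nsyms, sizeof(syms[0]), band_compar); */
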
272 270
273static void 271static void
274addsymtab(const char *name, void *symstart, size_t symsize, 272addsymtab(const char *name, void *symstart, size_t symsize,
275 void *strstart, size_t strsize, struct ksyms_symtab *tab, 273 void *strstart, size_t strsize, struct ksyms_symtab *tab,
276 void *newstart, void *ctfstart, size_t ctfsize, uint32_t *nmap) 274 void *newstart, void *ctfstart, size_t ctfsize, uint32_t *nmap)
277{ 275{
278 Elf_Sym *sym, *nsym, ts; 276 Elf_Sym *sym, *nsym, ts;
279 int i, j, n, nglob; 277 int i, j, n, nglob;
280 char *str; 278 char *str;
281 int nsyms = symsize / sizeof(Elf_Sym); 279 int nsyms = symsize / sizeof(Elf_Sym);
282 280
283 /* Sanity check for pre-allocated map table used during startup. */ 281 /* Sanity check for pre-allocated map table used during startup. */
284 if ((nmap == ksyms_nmap) && (nsyms >= KSYMS_MAX_ID)) { 282 if ((nmap == ksyms_nmap) && (nsyms >= KSYMS_MAX_ID)) {
285 printf("kern_ksyms: ERROR %d > %d, increase KSYMS_MAX_ID\n", 283 printf("kern_ksyms: ERROR %d > %d, increase KSYMS_MAX_ID\n",
286 nsyms, KSYMS_MAX_ID); 284 nsyms, KSYMS_MAX_ID);
287 285
288 /* truncate for now */ 286 /* truncate for now */
289 nsyms = KSYMS_MAX_ID - 1; 287 nsyms = KSYMS_MAX_ID - 1;
290 } 288 }
291 289
292 tab->sd_symstart = symstart; 290 tab->sd_symstart = symstart;
293 tab->sd_symsize = symsize; 291 tab->sd_symsize = symsize;
294 tab->sd_strstart = strstart; 292 tab->sd_strstart = strstart;
295 tab->sd_strsize = strsize; 293 tab->sd_strsize = strsize;
296 tab->sd_name = name; 294 tab->sd_name = name;
297 tab->sd_minsym = UINTPTR_MAX; 295 tab->sd_minsym = UINTPTR_MAX;
298 tab->sd_maxsym = 0; 296 tab->sd_maxsym = 0;
299 tab->sd_usroffset = 0; 297 tab->sd_usroffset = 0;
300 tab->sd_gone = false; 298 tab->sd_gone = false;
301#ifdef KDTRACE_HOOKS 299#ifdef KDTRACE_HOOKS
302 tab->sd_ctfstart = ctfstart; 300 tab->sd_ctfstart = ctfstart;
303 tab->sd_ctfsize = ctfsize; 301 tab->sd_ctfsize = ctfsize;
304 tab->sd_nmap = nmap; 302 tab->sd_nmap = nmap;
305 tab->sd_nmapsize = nsyms; 303 tab->sd_nmapsize = nsyms;
306#endif 304#endif
307#ifdef KSYMS_DEBUG 305#ifdef KSYMS_DEBUG
308 printf("newstart %p sym %p ksyms_symsz %zu str %p strsz %zu send %p\n", 306 printf("newstart %p sym %p ksyms_symsz %zu str %p strsz %zu send %p\n",
309 newstart, symstart, symsize, strstart, strsize, 307 newstart, symstart, symsize, strstart, strsize,
310 tab->sd_strstart + tab->sd_strsize); 308 tab->sd_strstart + tab->sd_strsize);
311#endif 309#endif
312 310
313 if (nmap) { 311 if (nmap) {
314 memset(nmap, 0, nsyms * sizeof(uint32_t)); 312 memset(nmap, 0, nsyms * sizeof(uint32_t));
315 } 313 }
316 314
317 /* Pack symbol table by removing all file name references. */ 315 /* Pack symbol table by removing all file name references. */
318 sym = tab->sd_symstart; 316 sym = tab->sd_symstart;
319 nsym = (Elf_Sym *)newstart; 317 nsym = (Elf_Sym *)newstart;
320 str = tab->sd_strstart; 318 str = tab->sd_strstart;
321 nglob = 0; 319 nglob = 0;
322 for (i = n = 0; i < nsyms; i++) { 320 for (i = n = 0; i < nsyms; i++) {
323 321
324 /* This breaks CTF mapping, so don't do it when 322 /* This breaks CTF mapping, so don't do it when
325 * DTrace is enabled 323 * DTrace is enabled
326 */ 324 */
327#ifndef KDTRACE_HOOKS 325#ifndef KDTRACE_HOOKS
328 /* 326 /*
329 * Remove useless symbols. 327 * Remove useless symbols.
330 * Should actually remove all typeless symbols. 328 * Should actually remove all typeless symbols.
331 */ 329 */
332 if (sym[i].st_name == 0) 330 if (sym[i].st_name == 0)
333 continue; /* Skip nameless entries */ 331 continue; /* Skip nameless entries */
334 if (sym[i].st_shndx == SHN_UNDEF) 332 if (sym[i].st_shndx == SHN_UNDEF)
335 continue; /* Skip external references */ 333 continue; /* Skip external references */
336 if (ELF_ST_TYPE(sym[i].st_info) == STT_FILE) 334 if (ELF_ST_TYPE(sym[i].st_info) == STT_FILE)
337 continue; /* Skip filenames */ 335 continue; /* Skip filenames */
338 if (ELF_ST_TYPE(sym[i].st_info) == STT_NOTYPE && 336 if (ELF_ST_TYPE(sym[i].st_info) == STT_NOTYPE &&
339 sym[i].st_value == 0 && 337 sym[i].st_value == 0 &&
340 strcmp(str + sym[i].st_name, "*ABS*") == 0) 338 strcmp(str + sym[i].st_name, "*ABS*") == 0)
341 continue; /* XXX */ 339 continue; /* XXX */
342 if (ELF_ST_TYPE(sym[i].st_info) == STT_NOTYPE && 340 if (ELF_ST_TYPE(sym[i].st_info) == STT_NOTYPE &&
343 strcmp(str + sym[i].st_name, "gcc2_compiled.") == 0) 341 strcmp(str + sym[i].st_name, "gcc2_compiled.") == 0)
344 continue; /* XXX */ 342 continue; /* XXX */
345#endif 343#endif
346 344
347 /* Save symbol. Set it as an absolute offset */ 345 /* Save symbol. Set it as an absolute offset */
348 nsym[n] = sym[i]; 346 nsym[n] = sym[i];
349 347
350#ifdef KDTRACE_HOOKS 348#ifdef KDTRACE_HOOKS
351 if (nmap != NULL) { 349 if (nmap != NULL) {
352 /* 350 /*
353 * Save the size, replace it with the symbol id so 351 * Save the size, replace it with the symbol id so
354 * the mapping can be done after the cleanup and sort. 352 * the mapping can be done after the cleanup and sort.
355 */ 353 */
356 nmap[i] = nsym[n].st_size; 354 nmap[i] = nsym[n].st_size;
357 nsym[n].st_size = i + 1; /* zero is reserved */ 355 nsym[n].st_size = i + 1; /* zero is reserved */
358 } 356 }
359#endif 357#endif
360 358
361 nsym[n].st_shndx = SHBSS; 359 nsym[n].st_shndx = SHBSS;
362 j = strlen(nsym[n].st_name + str) + 1; 360 j = strlen(nsym[n].st_name + str) + 1;
363 if (j > ksyms_maxlen) 361 if (j > ksyms_maxlen)
364 ksyms_maxlen = j; 362 ksyms_maxlen = j;
365 nglob += (ELF_ST_BIND(nsym[n].st_info) == STB_GLOBAL); 363 nglob += (ELF_ST_BIND(nsym[n].st_info) == STB_GLOBAL);
366 364
367 /* Compute min and max symbols. */ 365 /* Compute min and max symbols. */
368 if (strcmp(str + sym[i].st_name, "*ABS*") != 0 366 if (strcmp(str + sym[i].st_name, "*ABS*") != 0
369 && ELF_ST_TYPE(nsym[n].st_info) != STT_NOTYPE) { 367 && ELF_ST_TYPE(nsym[n].st_info) != STT_NOTYPE) {
370 if (nsym[n].st_value < tab->sd_minsym) { 368 if (nsym[n].st_value < tab->sd_minsym) {
371 tab->sd_minsym = nsym[n].st_value; 369 tab->sd_minsym = nsym[n].st_value;
372 } 370 }
373 if (nsym[n].st_value > tab->sd_maxsym) { 371 if (nsym[n].st_value > tab->sd_maxsym) {
374 tab->sd_maxsym = nsym[n].st_value; 372 tab->sd_maxsym = nsym[n].st_value;
375 } 373 }
376 } 374 }
377 n++; 375 n++;
378 } 376 }
379 377
380 /* Fill the rest of the record, and sort the symbols. */ 378 /* Fill the rest of the record, and sort the symbols. */
381 tab->sd_symstart = nsym; 379 tab->sd_symstart = nsym;
382 tab->sd_symsize = n * sizeof(Elf_Sym); 380 tab->sd_symsize = n * sizeof(Elf_Sym);
383 tab->sd_nglob = nglob; 381 tab->sd_nglob = nglob;
384 addsymtab_strstart = str; 382 addsymtab_strstart = str;
385 if (kheapsort(nsym, n, sizeof(Elf_Sym), addsymtab_compar, &ts) != 0) 383 if (kheapsort(nsym, n, sizeof(Elf_Sym), addsymtab_compar, &ts) != 0)
386 panic("addsymtab"); 384 panic("addsymtab");
387 385
388#ifdef KDTRACE_HOOKS 386#ifdef KDTRACE_HOOKS
389 /*  387 /*
390 * Build the mapping from original symbol id to new symbol table. 388 * Build the mapping from original symbol id to new symbol table.
 391 * Deleted symbols will have a zero map entry; indices will be one-based 389 * Deleted symbols will have a zero map entry; indices will be one-based
 392 * instead of zero-based. 390 * instead of zero-based.
393 * Resulting map is sd_nmap[original_index] = new_index + 1 391 * Resulting map is sd_nmap[original_index] = new_index + 1
394 */ 392 */
395 if (nmap != NULL) { 393 if (nmap != NULL) {
396 int new; 394 int new;
397 for (new = 0; new < n; new++) { 395 for (new = 0; new < n; new++) {
398 uint32_t orig = nsym[new].st_size - 1; 396 uint32_t orig = nsym[new].st_size - 1;
399 uint32_t size = nmap[orig]; 397 uint32_t size = nmap[orig];
400  398
401 nmap[orig] = new + 1; 399 nmap[orig] = new + 1;
402 400
403 /* restore the size */ 401 /* restore the size */
404 nsym[new].st_size = size; 402 nsym[new].st_size = size;
405 } 403 }
406 } 404 }
407#endif 405#endif
408 406
409 /* ksymsread() is unlocked, so membar. */ 407 /* ksymsread() is unlocked, so membar. */
410 membar_producer(); 408 membar_producer();
411 TAILQ_INSERT_TAIL(&ksyms_symtabs, tab, sd_queue); 409 TAILQ_INSERT_TAIL(&ksyms_symtabs, tab, sd_queue);
412 ksyms_sizes_calc(); 410 ksyms_sizes_calc();
413 ksyms_initted = true; 411 ksyms_initted = true;
414} 412}
415 413
416/* 414/*
 417 * Set up the kernel symbol table. 415 * Set up the kernel symbol table.
418 */ 416 */
419void 417void
420ksyms_addsyms_elf(int symsize, void *start, void *end) 418ksyms_addsyms_elf(int symsize, void *start, void *end)
421{ 419{
422 int i, j; 420 int i, j;
423 Elf_Shdr *shdr; 421 Elf_Shdr *shdr;
424 char *symstart = NULL, *strstart = NULL; 422 char *symstart = NULL, *strstart = NULL;
425 size_t strsize = 0; 423 size_t strsize = 0;
426 Elf_Ehdr *ehdr; 424 Elf_Ehdr *ehdr;
427 char *ctfstart = NULL; 425 char *ctfstart = NULL;
428 size_t ctfsize = 0; 426 size_t ctfsize = 0;
429 427
430 if (symsize <= 0) { 428 if (symsize <= 0) {
431 printf("[ Kernel symbol table missing! ]\n"); 429 printf("[ Kernel symbol table missing! ]\n");
432 return; 430 return;
433 } 431 }
434 432
435 /* Sanity check */ 433 /* Sanity check */
436 if (ALIGNED_POINTER(start, long) == 0) { 434 if (ALIGNED_POINTER(start, long) == 0) {
437 printf("[ Kernel symbol table has bad start address %p ]\n", 435 printf("[ Kernel symbol table has bad start address %p ]\n",
438 start); 436 start);
439 return; 437 return;
440 } 438 }
441 439
442 ehdr = (Elf_Ehdr *)start; 440 ehdr = (Elf_Ehdr *)start;
443 441
444 /* check if this is a valid ELF header */ 442 /* check if this is a valid ELF header */
445 /* No reason to verify arch type, the kernel is actually running! */ 443 /* No reason to verify arch type, the kernel is actually running! */
446 if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) || 444 if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) ||
447 ehdr->e_ident[EI_CLASS] != ELFCLASS || 445 ehdr->e_ident[EI_CLASS] != ELFCLASS ||
448 ehdr->e_version > 1) { 446 ehdr->e_version > 1) {
449 printf("[ Kernel symbol table invalid! ]\n"); 447 printf("[ Kernel symbol table invalid! ]\n");
450 return; /* nothing to do */ 448 return; /* nothing to do */
451 } 449 }
452 450
453 /* Loaded header will be scratched in addsymtab */ 451 /* Loaded header will be scratched in addsymtab */
454 ksyms_hdr_init(start); 452 ksyms_hdr_init(start);
455 453
456 /* Find the symbol table and the corresponding string table. */ 454 /* Find the symbol table and the corresponding string table. */
457 shdr = (Elf_Shdr *)((uint8_t *)start + ehdr->e_shoff); 455 shdr = (Elf_Shdr *)((uint8_t *)start + ehdr->e_shoff);
458 for (i = 1; i < ehdr->e_shnum; i++) { 456 for (i = 1; i < ehdr->e_shnum; i++) {
459 if (shdr[i].sh_type != SHT_SYMTAB) 457 if (shdr[i].sh_type != SHT_SYMTAB)
460 continue; 458 continue;
461 if (shdr[i].sh_offset == 0) 459 if (shdr[i].sh_offset == 0)
462 continue; 460 continue;
463 symstart = (uint8_t *)start + shdr[i].sh_offset; 461 symstart = (uint8_t *)start + shdr[i].sh_offset;
464 symsize = shdr[i].sh_size; 462 symsize = shdr[i].sh_size;
465 j = shdr[i].sh_link; 463 j = shdr[i].sh_link;
466 if (shdr[j].sh_offset == 0) 464 if (shdr[j].sh_offset == 0)
467 continue; /* Can this happen? */ 465 continue; /* Can this happen? */
468 strstart = (uint8_t *)start + shdr[j].sh_offset; 466 strstart = (uint8_t *)start + shdr[j].sh_offset;
469 strsize = shdr[j].sh_size; 467 strsize = shdr[j].sh_size;
470 break; 468 break;
471 } 469 }
472 470
473#ifdef KDTRACE_HOOKS 471#ifdef KDTRACE_HOOKS
474 /* Find the CTF section */ 472 /* Find the CTF section */
475 shdr = (Elf_Shdr *)((uint8_t *)start + ehdr->e_shoff); 473 shdr = (Elf_Shdr *)((uint8_t *)start + ehdr->e_shoff);
476 if (ehdr->e_shstrndx != 0) { 474 if (ehdr->e_shstrndx != 0) {
477 char *shstr = (uint8_t *)start + 475 char *shstr = (uint8_t *)start +
478 shdr[ehdr->e_shstrndx].sh_offset; 476 shdr[ehdr->e_shstrndx].sh_offset;
479 for (i = 1; i < ehdr->e_shnum; i++) { 477 for (i = 1; i < ehdr->e_shnum; i++) {
480#ifdef DEBUG 478#ifdef DEBUG
481 printf("ksyms: checking %s\n", &shstr[shdr[i].sh_name]); 479 printf("ksyms: checking %s\n", &shstr[shdr[i].sh_name]);
482#endif 480#endif
483 if (shdr[i].sh_type != SHT_PROGBITS) 481 if (shdr[i].sh_type != SHT_PROGBITS)
484 continue; 482 continue;
485 if (strncmp(".SUNW_ctf", &shstr[shdr[i].sh_name], 10) 483 if (strncmp(".SUNW_ctf", &shstr[shdr[i].sh_name], 10)
486 != 0) 484 != 0)
487 continue; 485 continue;
488 ctfstart = (uint8_t *)start + shdr[i].sh_offset; 486 ctfstart = (uint8_t *)start + shdr[i].sh_offset;
489 ctfsize = shdr[i].sh_size; 487 ctfsize = shdr[i].sh_size;
490 ksyms_ctfsz = ctfsize; 488 ksyms_ctfsz = ctfsize;
491#ifdef DEBUG 489#ifdef DEBUG
492 aprint_normal("Found CTF at %p, size 0x%zx\n", 490 aprint_normal("Found CTF at %p, size 0x%zx\n",
493 ctfstart, ctfsize); 491 ctfstart, ctfsize);
494#endif 492#endif
495 break; 493 break;
496 } 494 }
497#ifdef DEBUG 495#ifdef DEBUG
498 } else { 496 } else {
499 printf("ksyms: e_shstrndx == 0\n"); 497 printf("ksyms: e_shstrndx == 0\n");
500#endif 498#endif
501 } 499 }
502#endif 500#endif
503 501
504 if (!ksyms_verify(symstart, strstart)) 502 if (!ksyms_verify(symstart, strstart))
505 return; 503 return;
506 504
507 addsymtab("netbsd", symstart, symsize, strstart, strsize, 505 addsymtab("netbsd", symstart, symsize, strstart, strsize,
508 &kernel_symtab, start, ctfstart, ctfsize, ksyms_nmap); 506 &kernel_symtab, start, ctfstart, ctfsize, ksyms_nmap);
509 507
510#ifdef DEBUG 508#ifdef DEBUG
511 aprint_normal("Loaded initial symtab at %p, strtab at %p, # entries %ld\n", 509 aprint_normal("Loaded initial symtab at %p, strtab at %p, # entries %ld\n",
512 kernel_symtab.sd_symstart, kernel_symtab.sd_strstart, 510 kernel_symtab.sd_symstart, kernel_symtab.sd_strstart,
513 (long)kernel_symtab.sd_symsize/sizeof(Elf_Sym)); 511 (long)kernel_symtab.sd_symsize/sizeof(Elf_Sym));
514#endif 512#endif
515} 513}
516 514
517/* 515/*
 518 * Set up the kernel symbol table. 516 * Set up the kernel symbol table.
 519 * Use this when the addresses of the symbol and string tables are known; 517 * Use this when the addresses of the symbol and string tables are known;
520 * otherwise use ksyms_init with an ELF image. 518 * otherwise use ksyms_init with an ELF image.
521 * We need to pass a minimal ELF header which will later be completed by 519 * We need to pass a minimal ELF header which will later be completed by
522 * ksyms_hdr_init and handed off to userland through /dev/ksyms. We use 520 * ksyms_hdr_init and handed off to userland through /dev/ksyms. We use
 523 * a void * rather than a typed pointer to avoid exposing the Elf_Ehdr type. 521 * a void * rather than a typed pointer to avoid exposing the Elf_Ehdr type.
524 */ 522 */
525void 523void
526ksyms_addsyms_explicit(void *ehdr, void *symstart, size_t symsize, 524ksyms_addsyms_explicit(void *ehdr, void *symstart, size_t symsize,
527 void *strstart, size_t strsize) 525 void *strstart, size_t strsize)
528{ 526{
529 527
530 if (!ksyms_verify(symstart, strstart)) 528 if (!ksyms_verify(symstart, strstart))
531 return; 529 return;
532 530
533 ksyms_hdr_init(ehdr); 531 ksyms_hdr_init(ehdr);
534 addsymtab("netbsd", symstart, symsize, strstart, strsize, 532 addsymtab("netbsd", symstart, symsize, strstart, strsize,
535 &kernel_symtab, symstart, NULL, 0, ksyms_nmap); 533 &kernel_symtab, symstart, NULL, 0, ksyms_nmap);
536} 534}
537 535
538/* 536/*
539 * Get the value associated with a symbol. 537 * Get the value associated with a symbol.
540 * "mod" is the module name, or null if any module. 538 * "mod" is the module name, or null if any module.
541 * "sym" is the symbol name. 539 * "sym" is the symbol name.
542 * "val" is a pointer to the corresponding value, if call succeeded. 540 * "val" is a pointer to the corresponding value, if call succeeded.
543 * Returns 0 if success or ENOENT if no such entry. 541 * Returns 0 if success or ENOENT if no such entry.
544 * 542 *
 545 * Call with ksyms_lock held, unless the symbol table is known not to change. 543 * Call with ksyms_lock held, unless the symbol table is known not to change.
546 */ 544 */
547int 545int
548ksyms_getval_unlocked(const char *mod, const char *sym, unsigned long *val, 546ksyms_getval_unlocked(const char *mod, const char *sym, unsigned long *val,
549 int type) 547 int type)
550{ 548{
551 struct ksyms_symtab *st; 549 struct ksyms_symtab *st;
552 Elf_Sym *es; 550 Elf_Sym *es;
553 551
554#ifdef KSYMS_DEBUG 552#ifdef KSYMS_DEBUG
555 if (ksyms_debug & FOLLOW_CALLS) 553 if (ksyms_debug & FOLLOW_CALLS)
556 printf("ksyms_getval_unlocked: mod %s sym %s valp %p\n", 554 printf("ksyms_getval_unlocked: mod %s sym %s valp %p\n",
557 mod, sym, val); 555 mod, sym, val);
558#endif 556#endif
559 557
560 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { 558 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
561 if (__predict_false(st->sd_gone)) 559 if (__predict_false(st->sd_gone))
562 continue; 560 continue;
563 if (mod != NULL && strcmp(st->sd_name, mod)) 561 if (mod != NULL && strcmp(st->sd_name, mod))
564 continue; 562 continue;
565 if ((es = findsym(sym, st, type)) != NULL) { 563 if ((es = findsym(sym, st, type)) != NULL) {
566 *val = es->st_value; 564 *val = es->st_value;
567 return 0; 565 return 0;
568 } 566 }
569 } 567 }
570 return ENOENT; 568 return ENOENT;
571} 569}
572 570
573int 571int
574ksyms_getval(const char *mod, const char *sym, unsigned long *val, int type) 572ksyms_getval(const char *mod, const char *sym, unsigned long *val, int type)
575{ 573{
576 int rc; 574 int rc;
577 575
578 if (!ksyms_initted) 576 if (!ksyms_initted)
579 return ENOENT; 577 return ENOENT;
580 578
581 mutex_enter(&ksyms_lock); 579 mutex_enter(&ksyms_lock);
582 rc = ksyms_getval_unlocked(mod, sym, val, type); 580 rc = ksyms_getval_unlocked(mod, sym, val, type);
583 mutex_exit(&ksyms_lock); 581 mutex_exit(&ksyms_lock);
584 return rc; 582 return rc;
585} 583}
586 584
587struct ksyms_symtab * 585struct ksyms_symtab *
588ksyms_get_mod(const char *mod) 586ksyms_get_mod(const char *mod)
589{ 587{
590 struct ksyms_symtab *st; 588 struct ksyms_symtab *st;
591 589
592 mutex_enter(&ksyms_lock); 590 mutex_enter(&ksyms_lock);
593 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { 591 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
594 if (__predict_false(st->sd_gone)) 592 if (__predict_false(st->sd_gone))
595 continue; 593 continue;
596 if (mod != NULL && strcmp(st->sd_name, mod)) 594 if (mod != NULL && strcmp(st->sd_name, mod))
597 continue; 595 continue;
598 break; 596 break;
599 } 597 }
600 mutex_exit(&ksyms_lock); 598 mutex_exit(&ksyms_lock);
601 599
602 return st; 600 return st;
603} 601}
604 602
605 603
606/* 604/*
607 * ksyms_mod_foreach() 605 * ksyms_mod_foreach()
608 * 606 *
609 * Iterate over the symbol table of the specified module, calling the callback 607 * Iterate over the symbol table of the specified module, calling the callback
 610 * handler for each symbol. Stop iterating if the handler returns non-zero. 608 * handler for each symbol. Stop iterating if the handler returns non-zero.
611 * 609 *
612 */ 610 */
613 611
614int 612int
615ksyms_mod_foreach(const char *mod, ksyms_callback_t callback, void *opaque) 613ksyms_mod_foreach(const char *mod, ksyms_callback_t callback, void *opaque)
616{ 614{
617 struct ksyms_symtab *st; 615 struct ksyms_symtab *st;
618 Elf_Sym *sym, *maxsym; 616 Elf_Sym *sym, *maxsym;
619 char *str; 617 char *str;
620 int symindx; 618 int symindx;
621 619
622 if (!ksyms_initted) 620 if (!ksyms_initted)
623 return ENOENT; 621 return ENOENT;
624 622
625 mutex_enter(&ksyms_lock); 623 mutex_enter(&ksyms_lock);
626 624
627 /* find the module */ 625 /* find the module */
628 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { 626 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
629 if (__predict_false(st->sd_gone)) 627 if (__predict_false(st->sd_gone))
630 continue; 628 continue;
631 if (mod != NULL && strcmp(st->sd_name, mod)) 629 if (mod != NULL && strcmp(st->sd_name, mod))
632 continue; 630 continue;
633 631
634 sym = st->sd_symstart; 632 sym = st->sd_symstart;
635 str = st->sd_strstart - st->sd_usroffset; 633 str = st->sd_strstart - st->sd_usroffset;
636 634
637 /* now iterate through the symbols */ 635 /* now iterate through the symbols */
638 maxsym = sym + st->sd_symsize / sizeof(Elf_Sym); 636 maxsym = sym + st->sd_symsize / sizeof(Elf_Sym);
639 for (symindx = 0; sym < maxsym; sym++, symindx++) { 637 for (symindx = 0; sym < maxsym; sym++, symindx++) {
640 if (callback(str + sym->st_name, symindx, 638 if (callback(str + sym->st_name, symindx,
641 (void *)sym->st_value, 639 (void *)sym->st_value,
642 sym->st_size, 640 sym->st_size,
643 sym->st_info, 641 sym->st_info,
644 opaque) != 0) { 642 opaque) != 0) {
645 break; 643 break;
646 } 644 }
647 } 645 }
648 } 646 }
649 mutex_exit(&ksyms_lock); 647 mutex_exit(&ksyms_lock);
650 648
651 return 0; 649 return 0;
652} 650}
653 651
654/* 652/*
655 * Get "mod" and "symbol" associated with an address. 653 * Get "mod" and "symbol" associated with an address.
 656 * Returns 0 on success or ENOENT if no such entry exists. 654 * Returns 0 on success or ENOENT if no such entry exists.
657 * 655 *
658 * Call with ksyms_lock held, unless it is known that the symbol table 656 * Call with ksyms_lock held, unless it is known that the symbol table
659 */ 657 */
660int 658int
661ksyms_getname(const char **mod, const char **sym, vaddr_t v, int f) 659ksyms_getname(const char **mod, const char **sym, vaddr_t v, int f)
662{ 660{
663 struct ksyms_symtab *st; 661 struct ksyms_symtab *st;
664 Elf_Sym *les, *es = NULL; 662 Elf_Sym *les, *es = NULL;
665 vaddr_t laddr = 0; 663 vaddr_t laddr = 0;
666 const char *lmod = NULL; 664 const char *lmod = NULL;
667 char *stable = NULL; 665 char *stable = NULL;
668 int type, i, sz; 666 int type, i, sz;
669 667
670 if (!ksyms_initted) 668 if (!ksyms_initted)
671 return ENOENT; 669 return ENOENT;
672 670
673 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { 671 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
674 if (st->sd_gone) 672 if (st->sd_gone)
675 continue; 673 continue;
676 if (v < st->sd_minsym || v > st->sd_maxsym) 674 if (v < st->sd_minsym || v > st->sd_maxsym)
677 continue; 675 continue;
678 sz = st->sd_symsize/sizeof(Elf_Sym); 676 sz = st->sd_symsize/sizeof(Elf_Sym);
679 for (i = 0; i < sz; i++) { 677 for (i = 0; i < sz; i++) {
680 les = st->sd_symstart + i; 678 les = st->sd_symstart + i;
681 type = ELF_ST_TYPE(les->st_info); 679 type = ELF_ST_TYPE(les->st_info);
682 680
683 if ((f & KSYMS_PROC) && (type != STT_FUNC)) 681 if ((f & KSYMS_PROC) && (type != STT_FUNC))
684 continue; 682 continue;
685 683
686 if (type == STT_NOTYPE) 684 if (type == STT_NOTYPE)
687 continue; 685 continue;
688 686
689 if (((f & KSYMS_ANY) == 0) && 687 if (((f & KSYMS_ANY) == 0) &&
690 (type != STT_FUNC) && (type != STT_OBJECT)) 688 (type != STT_FUNC) && (type != STT_OBJECT))
691 continue; 689 continue;
692 690
693 if ((les->st_value <= v) && (les->st_value > laddr)) { 691 if ((les->st_value <= v) && (les->st_value > laddr)) {
694 laddr = les->st_value; 692 laddr = les->st_value;
695 es = les; 693 es = les;
696 lmod = st->sd_name; 694 lmod = st->sd_name;
697 stable = st->sd_strstart - st->sd_usroffset; 695 stable = st->sd_strstart - st->sd_usroffset;
698 } 696 }
699 } 697 }
700 } 698 }
701 if (es == NULL) 699 if (es == NULL)
702 return ENOENT; 700 return ENOENT;
703 if ((f & KSYMS_EXACT) && (v != es->st_value)) 701 if ((f & KSYMS_EXACT) && (v != es->st_value))
704 return ENOENT; 702 return ENOENT;
705 if (mod) 703 if (mod)
706 *mod = lmod; 704 *mod = lmod;
707 if (sym) 705 if (sym)
708 *sym = stable + es->st_name; 706 *sym = stable + es->st_name;
709 return 0; 707 return 0;
710} 708}
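
A sketch of a typical ksyms_getname() caller, e.g. a stack-trace printer resolving a code address; the pc variable is an assumed vaddr_t input. KSYMS_PROC restricts matches to STT_FUNC symbols, per the filter above.

        const char *mod, *sym;

        if (ksyms_getname(&mod, &sym, pc, KSYMS_PROC) == 0)
                printf("%s:%s\n", mod, sym);
        else
                printf("%#" PRIxVADDR "\n", pc);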
711 709
712/* 710/*
713 * Add a symbol table from a loadable module. 711 * Add a symbol table from a loadable module.
714 */ 712 */
715void 713void
716ksyms_modload(const char *name, void *symstart, vsize_t symsize, 714ksyms_modload(const char *name, void *symstart, vsize_t symsize,
717 char *strstart, vsize_t strsize) 715 char *strstart, vsize_t strsize)
718{ 716{
719 struct ksyms_symtab *st; 717 struct ksyms_symtab *st;
720 718
721 st = kmem_zalloc(sizeof(*st), KM_SLEEP); 719 st = kmem_zalloc(sizeof(*st), KM_SLEEP);
722 mutex_enter(&ksyms_lock); 720 mutex_enter(&ksyms_lock);
723 addsymtab(name, symstart, symsize, strstart, strsize, st, symstart, 721 addsymtab(name, symstart, symsize, strstart, strsize, st, symstart,
724 NULL, 0, NULL); 722 NULL, 0, NULL);
725 mutex_exit(&ksyms_lock); 723 mutex_exit(&ksyms_lock);
726} 724}
727 725
728/* 726/*
729 * Remove a symbol table from a loadable module. 727 * Remove a symbol table from a loadable module.
730 */ 728 */
731void 729void
732ksyms_modunload(const char *name) 730ksyms_modunload(const char *name)
733{ 731{
734 struct ksyms_symtab *st; 732 struct ksyms_symtab *st;
735 733
736 mutex_enter(&ksyms_lock); 734 mutex_enter(&ksyms_lock);
737 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { 735 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
738 if (st->sd_gone) 736 if (st->sd_gone)
739 continue; 737 continue;
740 if (strcmp(name, st->sd_name) != 0) 738 if (strcmp(name, st->sd_name) != 0)
741 continue; 739 continue;
742 st->sd_gone = true; 740 st->sd_gone = true;
743 if (!ksyms_isopen) { 741 if (!ksyms_isopen) {
744 TAILQ_REMOVE(&ksyms_symtabs, st, sd_queue); 742 TAILQ_REMOVE(&ksyms_symtabs, st, sd_queue);
745 ksyms_sizes_calc(); 743 ksyms_sizes_calc();
746 kmem_free(st, sizeof(*st)); 744 kmem_free(st, sizeof(*st));
747 } 745 }
748 break; 746 break;
749 } 747 }
750 mutex_exit(&ksyms_lock); 748 mutex_exit(&ksyms_lock);
751 KASSERT(st != NULL); 749 KASSERT(st != NULL);
752} 750}
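
The expected pairing from a module loader, sketched with illustrative arguments. Note that ksyms_modunload() only marks the table sd_gone while /dev/ksyms is open; the actual free is deferred to ksymsclose().

        /* On attach: hand the module's ELF symbol/string tables to ksyms. */
        ksyms_modload("example", symstart, symsize, strstart, strsize);

        /* ... module lifetime ... */

        /* On detach: freed now, or when the last /dev/ksyms reader closes. */
        ksyms_modunload("example");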
753 751
754#ifdef DDB 752#ifdef DDB
755/* 753/*
756 * Keep the sifting code here, to avoid exporting ksyms internals. 754 * Keep the sifting code here, to avoid exporting ksyms internals.
757 * 755 *
758 * The system is expected to be quiescent, so no locking is done. 756 * The system is expected to be quiescent, so no locking is done.
759 */ 757 */
760int 758int
761ksyms_sift(char *mod, char *sym, int mode) 759ksyms_sift(char *mod, char *sym, int mode)
762{ 760{
763 struct ksyms_symtab *st; 761 struct ksyms_symtab *st;
764 char *sb; 762 char *sb;
765 int i, sz; 763 int i, sz;
766 764
767 if (!ksyms_initted) 765 if (!ksyms_initted)
768 return ENOENT; 766 return ENOENT;
769 767
770 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { 768 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
771 if (st->sd_gone) 769 if (st->sd_gone)
772 continue; 770 continue;
773 if (mod && strcmp(mod, st->sd_name)) 771 if (mod && strcmp(mod, st->sd_name))
774 continue; 772 continue;
775 sb = st->sd_strstart - st->sd_usroffset; 773 sb = st->sd_strstart - st->sd_usroffset;
776 774
777 sz = st->sd_symsize/sizeof(Elf_Sym); 775 sz = st->sd_symsize/sizeof(Elf_Sym);
778 for (i = 0; i < sz; i++) { 776 for (i = 0; i < sz; i++) {
779 Elf_Sym *les = st->sd_symstart + i; 777 Elf_Sym *les = st->sd_symstart + i;
780 char c; 778 char c;
781 779
782 if (strstr(sb + les->st_name, sym) == NULL) 780 if (strstr(sb + les->st_name, sym) == NULL)
783 continue; 781 continue;
784 782
785 if (mode == 'F') { 783 if (mode == 'F') {
786 switch (ELF_ST_TYPE(les->st_info)) { 784 switch (ELF_ST_TYPE(les->st_info)) {
787 case STT_OBJECT: 785 case STT_OBJECT:
788 c = '+'; 786 c = '+';
789 break; 787 break;
790 case STT_FUNC: 788 case STT_FUNC:
791 c = '*'; 789 c = '*';
792 break; 790 break;
793 case STT_SECTION: 791 case STT_SECTION:
794 c = '&'; 792 c = '&';
795 break; 793 break;
796 case STT_FILE: 794 case STT_FILE:
797 c = '/'; 795 c = '/';
798 break; 796 break;
799 default: 797 default:
800 c = ' '; 798 c = ' ';
801 break; 799 break;
802 } 800 }
803 db_printf("%s%c ", sb + les->st_name, c); 801 db_printf("%s%c ", sb + les->st_name, c);
804 } else 802 } else
805 db_printf("%s ", sb + les->st_name); 803 db_printf("%s ", sb + les->st_name);
806 } 804 }
807 } 805 }
808 return ENOENT; 806 return ENOENT;
809} 807}
810#endif /* DDB */ 808#endif /* DDB */
811 809
812/* 810/*
813 * In case we are exposing the symbol table to userland using the pseudo- 811 * In case we are exposing the symbol table to userland using the pseudo-
814 * device /dev/ksyms, it is easier to provide all the tables as one. 812 * device /dev/ksyms, it is easier to provide all the tables as one.
815 * However, it means we have to change all the st_name fields for the 813 * However, it means we have to change all the st_name fields for the
816 * symbols so they match the ELF image that the userland will read 814 * symbols so they match the ELF image that the userland will read
817 * through the device. 815 * through the device.
818 * 816 *
819 * The actual (correct) value of st_name is preserved through a global 817 * The actual (correct) value of st_name is preserved through a global
820 * offset stored in the symbol table structure. 818 * offset stored in the symbol table structure.
821 * 819 *
822 * Call with ksyms_lock held. 820 * Call with ksyms_lock held.
823 */ 821 */
824static void 822static void
825ksyms_sizes_calc(void) 823ksyms_sizes_calc(void)
826{ 824{
827 struct ksyms_symtab *st; 825 struct ksyms_symtab *st;
828 int i, delta; 826 int i, delta;
829 827
830 ksyms_symsz = ksyms_strsz = 0; 828 ksyms_symsz = ksyms_strsz = 0;
831 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { 829 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
832 delta = ksyms_strsz - st->sd_usroffset; 830 delta = ksyms_strsz - st->sd_usroffset;
833 if (delta != 0) { 831 if (delta != 0) {
834 for (i = 0; i < st->sd_symsize/sizeof(Elf_Sym); i++) 832 for (i = 0; i < st->sd_symsize/sizeof(Elf_Sym); i++)
835 st->sd_symstart[i].st_name += delta; 833 st->sd_symstart[i].st_name += delta;
836 st->sd_usroffset = ksyms_strsz; 834 st->sd_usroffset = ksyms_strsz;
837 } 835 }
838 ksyms_symsz += st->sd_symsize; 836 ksyms_symsz += st->sd_symsize;
839 ksyms_strsz += st->sd_strsize; 837 ksyms_strsz += st->sd_strsize;
840 } 838 }
841} 839}
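
A worked example of the rebiasing above, with illustrative sizes:

        /*
         * Two tables with string tables of 0x1000 and 0x200 bytes.
         * After ksyms_sizes_calc():
         *
         *      table 0: sd_usroffset = 0x0000, st_name unchanged
         *      table 1: sd_usroffset = 0x1000, every st_name += 0x1000
         *
         * Userland thus sees a single 0x1200-byte string table, while
         * sd_strstart - sd_usroffset recovers the in-kernel base, as
         * used by ksyms_getname() and ksyms_sift() above.
         */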
842 840
843static void 841static void
844ksyms_hdr_init(void *hdraddr) 842ksyms_hdr_init(void *hdraddr)
845{ 843{
846 844
847 /* Copy the loaded elf exec header */ 845 /* Copy the loaded elf exec header */
848 memcpy(&ksyms_hdr.kh_ehdr, hdraddr, sizeof(Elf_Ehdr)); 846 memcpy(&ksyms_hdr.kh_ehdr, hdraddr, sizeof(Elf_Ehdr));
849 847
850 /* Set correct program/section header sizes, offsets and numbers */ 848 /* Set correct program/section header sizes, offsets and numbers */
851 ksyms_hdr.kh_ehdr.e_phoff = offsetof(struct ksyms_hdr, kh_phdr[0]); 849 ksyms_hdr.kh_ehdr.e_phoff = offsetof(struct ksyms_hdr, kh_phdr[0]);
852 ksyms_hdr.kh_ehdr.e_phentsize = sizeof(Elf_Phdr); 850 ksyms_hdr.kh_ehdr.e_phentsize = sizeof(Elf_Phdr);
853 ksyms_hdr.kh_ehdr.e_phnum = NPRGHDR; 851 ksyms_hdr.kh_ehdr.e_phnum = NPRGHDR;
854 ksyms_hdr.kh_ehdr.e_shoff = offsetof(struct ksyms_hdr, kh_shdr[0]); 852 ksyms_hdr.kh_ehdr.e_shoff = offsetof(struct ksyms_hdr, kh_shdr[0]);
855 ksyms_hdr.kh_ehdr.e_shentsize = sizeof(Elf_Shdr); 853 ksyms_hdr.kh_ehdr.e_shentsize = sizeof(Elf_Shdr);
856 ksyms_hdr.kh_ehdr.e_shnum = NSECHDR; 854 ksyms_hdr.kh_ehdr.e_shnum = NSECHDR;
857 ksyms_hdr.kh_ehdr.e_shstrndx = SHSTRTAB; 855 ksyms_hdr.kh_ehdr.e_shstrndx = SHSTRTAB;
858 856
859 /* Text/data - fake */ 857 /* Text/data - fake */
860 ksyms_hdr.kh_phdr[0].p_type = PT_LOAD; 858 ksyms_hdr.kh_phdr[0].p_type = PT_LOAD;
861 ksyms_hdr.kh_phdr[0].p_memsz = (unsigned long)-1L; 859 ksyms_hdr.kh_phdr[0].p_memsz = (unsigned long)-1L;
862 ksyms_hdr.kh_phdr[0].p_flags = PF_R | PF_X | PF_W; 860 ksyms_hdr.kh_phdr[0].p_flags = PF_R | PF_X | PF_W;
863 861
864 /* First section is null */ 862 /* First section is null */
865 863
866 /* Second section header; ".symtab" */ 864 /* Second section header; ".symtab" */
867 ksyms_hdr.kh_shdr[SYMTAB].sh_name = 1; /* Section 3 offset */ 865 ksyms_hdr.kh_shdr[SYMTAB].sh_name = 1; /* Section 3 offset */
868 ksyms_hdr.kh_shdr[SYMTAB].sh_type = SHT_SYMTAB; 866 ksyms_hdr.kh_shdr[SYMTAB].sh_type = SHT_SYMTAB;
869 ksyms_hdr.kh_shdr[SYMTAB].sh_offset = sizeof(struct ksyms_hdr); 867 ksyms_hdr.kh_shdr[SYMTAB].sh_offset = sizeof(struct ksyms_hdr);
870/* ksyms_hdr.kh_shdr[SYMTAB].sh_size = filled in at open */ 868/* ksyms_hdr.kh_shdr[SYMTAB].sh_size = filled in at open */
871 ksyms_hdr.kh_shdr[SYMTAB].sh_link = 2; /* Corresponding strtab */ 869 ksyms_hdr.kh_shdr[SYMTAB].sh_link = 2; /* Corresponding strtab */
872 ksyms_hdr.kh_shdr[SYMTAB].sh_addralign = sizeof(long); 870 ksyms_hdr.kh_shdr[SYMTAB].sh_addralign = sizeof(long);
873 ksyms_hdr.kh_shdr[SYMTAB].sh_entsize = sizeof(Elf_Sym); 871 ksyms_hdr.kh_shdr[SYMTAB].sh_entsize = sizeof(Elf_Sym);
874 872
875 /* Third section header; ".strtab" */ 873 /* Third section header; ".strtab" */
876 ksyms_hdr.kh_shdr[STRTAB].sh_name = 9; /* Section 3 offset */ 874 ksyms_hdr.kh_shdr[STRTAB].sh_name = 9; /* Section 3 offset */
877 ksyms_hdr.kh_shdr[STRTAB].sh_type = SHT_STRTAB; 875 ksyms_hdr.kh_shdr[STRTAB].sh_type = SHT_STRTAB;
878/* ksyms_hdr.kh_shdr[STRTAB].sh_offset = filled in at open */ 876/* ksyms_hdr.kh_shdr[STRTAB].sh_offset = filled in at open */
879/* ksyms_hdr.kh_shdr[STRTAB].sh_size = filled in at open */ 877/* ksyms_hdr.kh_shdr[STRTAB].sh_size = filled in at open */
880 ksyms_hdr.kh_shdr[STRTAB].sh_addralign = sizeof(char); 878 ksyms_hdr.kh_shdr[STRTAB].sh_addralign = sizeof(char);
881 879
882 /* Fourth section, ".shstrtab" */ 880 /* Fourth section, ".shstrtab" */
883 ksyms_hdr.kh_shdr[SHSTRTAB].sh_name = 17; /* This section name offset */ 881 ksyms_hdr.kh_shdr[SHSTRTAB].sh_name = 17; /* This section name offset */
884 ksyms_hdr.kh_shdr[SHSTRTAB].sh_type = SHT_STRTAB; 882 ksyms_hdr.kh_shdr[SHSTRTAB].sh_type = SHT_STRTAB;
885 ksyms_hdr.kh_shdr[SHSTRTAB].sh_offset = 883 ksyms_hdr.kh_shdr[SHSTRTAB].sh_offset =
886 offsetof(struct ksyms_hdr, kh_strtab); 884 offsetof(struct ksyms_hdr, kh_strtab);
887 ksyms_hdr.kh_shdr[SHSTRTAB].sh_size = SHSTRSIZ; 885 ksyms_hdr.kh_shdr[SHSTRTAB].sh_size = SHSTRSIZ;
888 ksyms_hdr.kh_shdr[SHSTRTAB].sh_addralign = sizeof(char); 886 ksyms_hdr.kh_shdr[SHSTRTAB].sh_addralign = sizeof(char);
889 887
890 /* Fifth section, ".bss". All symbols reside here. */ 888 /* Fifth section, ".bss". All symbols reside here. */
891 ksyms_hdr.kh_shdr[SHBSS].sh_name = 27; /* This section name offset */ 889 ksyms_hdr.kh_shdr[SHBSS].sh_name = 27; /* This section name offset */
892 ksyms_hdr.kh_shdr[SHBSS].sh_type = SHT_NOBITS; 890 ksyms_hdr.kh_shdr[SHBSS].sh_type = SHT_NOBITS;
893 ksyms_hdr.kh_shdr[SHBSS].sh_offset = 0; 891 ksyms_hdr.kh_shdr[SHBSS].sh_offset = 0;
894 ksyms_hdr.kh_shdr[SHBSS].sh_size = (unsigned long)-1L; 892 ksyms_hdr.kh_shdr[SHBSS].sh_size = (unsigned long)-1L;
895 ksyms_hdr.kh_shdr[SHBSS].sh_addralign = PAGE_SIZE; 893 ksyms_hdr.kh_shdr[SHBSS].sh_addralign = PAGE_SIZE;
896 ksyms_hdr.kh_shdr[SHBSS].sh_flags = SHF_ALLOC | SHF_EXECINSTR; 894 ksyms_hdr.kh_shdr[SHBSS].sh_flags = SHF_ALLOC | SHF_EXECINSTR;
897 895
898#ifdef KDTRACE_HOOKS 896#ifdef KDTRACE_HOOKS
899 /* Sixth section header; ".SUNW_ctf" */ 897 /* Sixth section header; ".SUNW_ctf" */
900 ksyms_hdr.kh_shdr[SHCTF].sh_name = 32; /* Section 6 offset */ 898 ksyms_hdr.kh_shdr[SHCTF].sh_name = 32; /* Section 6 offset */
901 ksyms_hdr.kh_shdr[SHCTF].sh_type = SHT_PROGBITS; 899 ksyms_hdr.kh_shdr[SHCTF].sh_type = SHT_PROGBITS;
902/* ksyms_hdr.kh_shdr[SHCTF].sh_offset = filled in at open */ 900/* ksyms_hdr.kh_shdr[SHCTF].sh_offset = filled in at open */
903/* ksyms_hdr.kh_shdr[SHCTF].sh_size = filled in at open */ 901/* ksyms_hdr.kh_shdr[SHCTF].sh_size = filled in at open */
904 ksyms_hdr.kh_shdr[SHCTF].sh_link = SYMTAB; /* Corresponding symtab */ 902 ksyms_hdr.kh_shdr[SHCTF].sh_link = SYMTAB; /* Corresponding symtab */
905 ksyms_hdr.kh_shdr[SHCTF].sh_addralign = sizeof(char); 903 ksyms_hdr.kh_shdr[SHCTF].sh_addralign = sizeof(char);
906#endif 904#endif
907 905
908 /* Set section names */ 906 /* Set section names */
909 strlcpy(&ksyms_hdr.kh_strtab[1], ".symtab", 907 strlcpy(&ksyms_hdr.kh_strtab[1], ".symtab",
910 sizeof(ksyms_hdr.kh_strtab) - 1); 908 sizeof(ksyms_hdr.kh_strtab) - 1);
911 strlcpy(&ksyms_hdr.kh_strtab[9], ".strtab", 909 strlcpy(&ksyms_hdr.kh_strtab[9], ".strtab",
912 sizeof(ksyms_hdr.kh_strtab) - 9); 910 sizeof(ksyms_hdr.kh_strtab) - 9);
913 strlcpy(&ksyms_hdr.kh_strtab[17], ".shstrtab", 911 strlcpy(&ksyms_hdr.kh_strtab[17], ".shstrtab",
914 sizeof(ksyms_hdr.kh_strtab) - 17); 912 sizeof(ksyms_hdr.kh_strtab) - 17);
915 strlcpy(&ksyms_hdr.kh_strtab[27], ".bss", 913 strlcpy(&ksyms_hdr.kh_strtab[27], ".bss",
916 sizeof(ksyms_hdr.kh_strtab) - 27); 914 sizeof(ksyms_hdr.kh_strtab) - 27);
917#ifdef KDTRACE_HOOKS 915#ifdef KDTRACE_HOOKS
918 strlcpy(&ksyms_hdr.kh_strtab[32], ".SUNW_ctf", 916 strlcpy(&ksyms_hdr.kh_strtab[32], ".SUNW_ctf",
919 sizeof(ksyms_hdr.kh_strtab) - 32); 917 sizeof(ksyms_hdr.kh_strtab) - 32);
920#endif 918#endif
921} 919}
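
The sh_name constants above (1, 9, 17, 27, 32) are simply cumulative lengths, including NUL terminators, of the strings copied into kh_strtab:

        /*
         * offset  string       len (incl. NUL)
         *      0  ""            1
         *      1  ".symtab"     8  -> next at  9
         *      9  ".strtab"     8  -> next at 17
         *     17  ".shstrtab"  10  -> next at 27
         *     27  ".bss"        5  -> next at 32
         *     32  ".SUNW_ctf"  10  (KDTRACE_HOOKS only)
         */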
922 920
923static int 921static int
924ksymsopen(dev_t dev, int oflags, int devtype, struct lwp *l) 922ksymsopen(dev_t dev, int oflags, int devtype, struct lwp *l)
925{ 923{
926 924
927 if (minor(dev) != 0 || !ksyms_initted) 925 if (minor(dev) != 0 || !ksyms_initted)
928 return ENXIO; 926 return ENXIO;
929 927
930 /* 928 /*
931 * Create a "snapshot" of the kernel symbol table. Setting 929 * Create a "snapshot" of the kernel symbol table. Setting
932 * ksyms_isopen will prevent symbol tables from being freed. 930 * ksyms_isopen will prevent symbol tables from being freed.
933 */ 931 */
934 mutex_enter(&ksyms_lock); 932 mutex_enter(&ksyms_lock);
935 ksyms_hdr.kh_shdr[SYMTAB].sh_size = ksyms_symsz; 933 ksyms_hdr.kh_shdr[SYMTAB].sh_size = ksyms_symsz;
936 ksyms_hdr.kh_shdr[SYMTAB].sh_info = ksyms_symsz / sizeof(Elf_Sym); 934 ksyms_hdr.kh_shdr[SYMTAB].sh_info = ksyms_symsz / sizeof(Elf_Sym);
937 ksyms_hdr.kh_shdr[STRTAB].sh_offset = ksyms_symsz + 935 ksyms_hdr.kh_shdr[STRTAB].sh_offset = ksyms_symsz +
938 ksyms_hdr.kh_shdr[SYMTAB].sh_offset; 936 ksyms_hdr.kh_shdr[SYMTAB].sh_offset;
939 ksyms_hdr.kh_shdr[STRTAB].sh_size = ksyms_strsz; 937 ksyms_hdr.kh_shdr[STRTAB].sh_size = ksyms_strsz;
940#ifdef KDTRACE_HOOKS 938#ifdef KDTRACE_HOOKS
941 ksyms_hdr.kh_shdr[SHCTF].sh_offset = ksyms_strsz + 939 ksyms_hdr.kh_shdr[SHCTF].sh_offset = ksyms_strsz +
942 ksyms_hdr.kh_shdr[STRTAB].sh_offset; 940 ksyms_hdr.kh_shdr[STRTAB].sh_offset;
943 ksyms_hdr.kh_shdr[SHCTF].sh_size = ksyms_ctfsz; 941 ksyms_hdr.kh_shdr[SHCTF].sh_size = ksyms_ctfsz;
944#endif 942#endif
945 ksyms_isopen = true; 943 ksyms_isopen = true;
946 mutex_exit(&ksyms_lock); 944 mutex_exit(&ksyms_lock);
947 945
948 return 0; 946 return 0;
949} 947}
950 948
951static int 949static int
952ksymsclose(dev_t dev, int oflags, int devtype, struct lwp *l) 950ksymsclose(dev_t dev, int oflags, int devtype, struct lwp *l)
953{ 951{
954 struct ksyms_symtab *st, *next; 952 struct ksyms_symtab *st, *next;
955 bool resize; 953 bool resize;
956 954
957 /* Discard references to symbol tables. */ 955 /* Discard references to symbol tables. */
958 mutex_enter(&ksyms_lock); 956 mutex_enter(&ksyms_lock);
959 ksyms_isopen = false; 957 ksyms_isopen = false;
960 resize = false; 958 resize = false;
961 for (st = TAILQ_FIRST(&ksyms_symtabs); st != NULL; st = next) { 959 for (st = TAILQ_FIRST(&ksyms_symtabs); st != NULL; st = next) {
962 next = TAILQ_NEXT(st, sd_queue); 960 next = TAILQ_NEXT(st, sd_queue);
963 if (st->sd_gone) { 961 if (st->sd_gone) {
964 TAILQ_REMOVE(&ksyms_symtabs, st, sd_queue); 962 TAILQ_REMOVE(&ksyms_symtabs, st, sd_queue);
965 kmem_free(st, sizeof(*st)); 963 kmem_free(st, sizeof(*st));
966 resize = true; 964 resize = true;
967 } 965 }
968 } 966 }
969 if (resize) 967 if (resize)
970 ksyms_sizes_calc(); 968 ksyms_sizes_calc();
971 mutex_exit(&ksyms_lock); 969 mutex_exit(&ksyms_lock);
972 970
973 return 0; 971 return 0;
974} 972}
975 973
976static int 974static int
977ksymsread(dev_t dev, struct uio *uio, int ioflag) 975ksymsread(dev_t dev, struct uio *uio, int ioflag)
978{ 976{
979 struct ksyms_symtab *st; 977 struct ksyms_symtab *st;
980 size_t filepos, inpos, off; 978 size_t filepos, inpos, off;
981 int error; 979 int error;
982#ifdef KDTRACE_HOOKS 980#ifdef KDTRACE_HOOKS
983 struct ksyms_symtab *cst; 981 struct ksyms_symtab *cst;
984#endif 982#endif
985 983
986 /* 984 /*
987 * First: Copy out the ELF header. XXX Lose if ksymsopen() 985 * First: Copy out the ELF header. XXX Lose if ksymsopen()
988 * occurs during read of the header. 986 * occurs during read of the header.
989 */ 987 */
990 off = uio->uio_offset; 988 off = uio->uio_offset;
991 if (off < sizeof(struct ksyms_hdr)) { 989 if (off < sizeof(struct ksyms_hdr)) {
992 error = uiomove((char *)&ksyms_hdr + off, 990 error = uiomove((char *)&ksyms_hdr + off,
993 sizeof(struct ksyms_hdr) - off, uio); 991 sizeof(struct ksyms_hdr) - off, uio);
994 if (error != 0) 992 if (error != 0)
995 return error; 993 return error;
996 } 994 }
997 995
998 /* 996 /*
999 * Copy out the symbol table. 997 * Copy out the symbol table.
1000 */ 998 */
1001 filepos = sizeof(struct ksyms_hdr); 999 filepos = sizeof(struct ksyms_hdr);
1002 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { 1000 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
1003 if (uio->uio_resid == 0) 1001 if (uio->uio_resid == 0)
1004 return 0; 1002 return 0;
1005 if (uio->uio_offset <= st->sd_symsize + filepos) { 1003 if (uio->uio_offset <= st->sd_symsize + filepos) {
1006 inpos = uio->uio_offset - filepos; 1004 inpos = uio->uio_offset - filepos;
1007 error = uiomove((char *)st->sd_symstart + inpos, 1005 error = uiomove((char *)st->sd_symstart + inpos,
1008 st->sd_symsize - inpos, uio); 1006 st->sd_symsize - inpos, uio);
1009 if (error != 0) 1007 if (error != 0)
1010 return error; 1008 return error;
1011 } 1009 }
1012 filepos += st->sd_symsize; 1010 filepos += st->sd_symsize;
1013 } 1011 }
1014 1012
1015 /* 1013 /*
1016 * Copy out the string table 1014 * Copy out the string table
1017 */ 1015 */
1018 KASSERT(filepos == sizeof(struct ksyms_hdr) + 1016 KASSERT(filepos == sizeof(struct ksyms_hdr) +
1019 ksyms_hdr.kh_shdr[SYMTAB].sh_size); 1017 ksyms_hdr.kh_shdr[SYMTAB].sh_size);
1020 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { 1018 TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
1021 if (uio->uio_resid == 0) 1019 if (uio->uio_resid == 0)
1022 return 0; 1020 return 0;
1023 if (uio->uio_offset <= st->sd_strsize + filepos) { 1021 if (uio->uio_offset <= st->sd_strsize + filepos) {
1024 inpos = uio->uio_offset - filepos; 1022 inpos = uio->uio_offset - filepos;
1025 error = uiomove((char *)st->sd_strstart + inpos, 1023 error = uiomove((char *)st->sd_strstart + inpos,
1026 st->sd_strsize - inpos, uio); 1024 st->sd_strsize - inpos, uio);
1027 if (error != 0) 1025 if (error != 0)
1028 return error; 1026 return error;
1029 } 1027 }
1030 filepos += st->sd_strsize; 1028 filepos += st->sd_strsize;
1031 } 1029 }
1032 1030
1033#ifdef KDTRACE_HOOKS 1031#ifdef KDTRACE_HOOKS
1034 /* 1032 /*
1035 * Copy out the CTF table. 1033 * Copy out the CTF table.
1036 */ 1034 */
1037 cst = TAILQ_FIRST(&ksyms_symtabs); 1035 cst = TAILQ_FIRST(&ksyms_symtabs);
1038 if (cst->sd_ctfstart != NULL) { 1036 if (cst->sd_ctfstart != NULL) {
1039 if (uio->uio_resid == 0) 1037 if (uio->uio_resid == 0)
1040 return 0; 1038 return 0;
1041 if (uio->uio_offset <= cst->sd_ctfsize + filepos) { 1039 if (uio->uio_offset <= cst->sd_ctfsize + filepos) {
1042 inpos = uio->uio_offset - filepos; 1040 inpos = uio->uio_offset - filepos;
1043 error = uiomove((char *)cst->sd_ctfstart + inpos, 1041 error = uiomove((char *)cst->sd_ctfstart + inpos,
1044 cst->sd_ctfsize - inpos, uio); 1042 cst->sd_ctfsize - inpos, uio);
1045 if (error != 0) 1043 if (error != 0)
1046 return error; 1044 return error;
1047 } 1045 }
1048 filepos += cst->sd_ctfsize; 1046 filepos += cst->sd_ctfsize;
1049 } 1047 }
1050#endif 1048#endif
1051 1049
1052 return 0; 1050 return 0;
1053} 1051}
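
For reference, the image a /dev/ksyms reader sees, as traversed above; the offsets follow from the sh_offset fields filled in by ksymsopen():

        /*
         * offset 0:                  struct ksyms_hdr (ehdr/phdr/shdr)
         * sizeof(ksyms_hdr):         concatenated symbol tables
         *   + ksyms_symsz:           concatenated string tables
         *   + ksyms_strsz:           CTF data (KDTRACE_HOOKS only)
         */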
1054 1052
1055static int 1053static int
1056ksymswrite(dev_t dev, struct uio *uio, int ioflag) 1054ksymswrite(dev_t dev, struct uio *uio, int ioflag)
1057{ 1055{
1058 1056
1059 return EROFS; 1057 return EROFS;
1060} 1058}
1061 1059
1062static int 1060static int
1063ksymsioctl(dev_t dev, u_long cmd, void *data, int fflag, struct lwp *l) 1061ksymsioctl(dev_t dev, u_long cmd, void *data, int fflag, struct lwp *l)
1064{ 1062{
1065 struct ksyms_gsymbol *kg = (struct ksyms_gsymbol *)data; 1063 struct ksyms_gsymbol *kg = (struct ksyms_gsymbol *)data;
1066 struct ksyms_symtab *st; 1064 struct ksyms_symtab *st;
1067 Elf_Sym *sym = NULL, copy; 1065 Elf_Sym *sym = NULL, copy;
1068 unsigned long val; 1066 unsigned long val;
1069 int error = 0; 1067 int error = 0;
1070 char *str = NULL; 1068 char *str = NULL;
1071 int len; 1069 int len;
1072 1070
1073 /* Read ksyms_maxlen only once while not holding the lock. */ 1071 /* Read ksyms_maxlen only once while not holding the lock. */
1074 len = ksyms_maxlen; 1072 len = ksyms_maxlen;
1075 1073
1076 if (cmd == KIOCGVALUE || cmd == KIOCGSYMBOL) { 1074 if (cmd == KIOCGVALUE || cmd == KIOCGSYMBOL) {
1077 str = kmem_alloc(len, KM_SLEEP); 1075 str = kmem_alloc(len, KM_SLEEP);
1078 if ((error = copyinstr(kg->kg_name, str, len, NULL)) != 0) { 1076 if ((error = copyinstr(kg->kg_name, str, len, NULL)) != 0) {
1079 kmem_free(str, len); 1077 kmem_free(str, len);
1080 return error; 1078 return error;
1081 } 1079 }
1082 } 1080 }
1083 1081
1084 switch (cmd) { 1082 switch (cmd) {
1085 case KIOCGVALUE: 1083 case KIOCGVALUE:
1086 /* 1084 /*
1087 * Use the in-kernel symbol lookup code for fast 1085 * Use the in-kernel symbol lookup code for fast
1088 * retrieval of a value. 1086 * retrieval of a value.
1089 */ 1087 */
1090 error = ksyms_getval(NULL, str, &val, KSYMS_EXTERN); 1088 error = ksyms_getval(NULL, str, &val, KSYMS_EXTERN);
1091 if (error == 0) 1089 if (error == 0)
1092 error = copyout(&val, kg->kg_value, sizeof(long)); 1090 error = copyout(&val, kg->kg_value, sizeof(long));
1093 kmem_free(str, len); 1091 kmem_free(str, len);
1094 break; 1092 break;

cvs diff -r1.40 -r1.41 src/sys/kern/kern_sleepq.c (switch to unified diff)

--- src/sys/kern/kern_sleepq.c 2011/07/26 13:04:51 1.40
+++ src/sys/kern/kern_sleepq.c 2011/07/27 14:35:34 1.41
@@ -1,491 +1,489 @@ @@ -1,491 +1,489 @@
1/* $NetBSD: kern_sleepq.c,v 1.40 2011/07/26 13:04:51 yamt Exp $ */ 1/* $NetBSD: kern_sleepq.c,v 1.41 2011/07/27 14:35:34 uebayasi Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc. 4 * Copyright (c) 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran. 8 * by Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright 15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the 16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution. 17 * documentation and/or other materials provided with the distribution.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32/* 32/*
33 * Sleep queue implementation, used by turnstiles and general sleep/wakeup 33 * Sleep queue implementation, used by turnstiles and general sleep/wakeup
34 * interfaces. 34 * interfaces.
35 */ 35 */
36 36
37#include <sys/cdefs.h> 37#include <sys/cdefs.h>
38__KERNEL_RCSID(0, "$NetBSD: kern_sleepq.c,v 1.40 2011/07/26 13:04:51 yamt Exp $"); 38__KERNEL_RCSID(0, "$NetBSD: kern_sleepq.c,v 1.41 2011/07/27 14:35:34 uebayasi Exp $");
39 39
40#include <sys/param.h> 40#include <sys/param.h>
41#include <sys/kernel.h> 41#include <sys/kernel.h>
42#include <sys/cpu.h> 42#include <sys/cpu.h>
43#include <sys/pool.h> 43#include <sys/pool.h>
44#include <sys/proc.h>  44#include <sys/proc.h>
45#include <sys/resourcevar.h> 45#include <sys/resourcevar.h>
46#include <sys/sa.h> 46#include <sys/sa.h>
47#include <sys/savar.h> 47#include <sys/savar.h>
48#include <sys/sched.h> 48#include <sys/sched.h>
49#include <sys/systm.h> 49#include <sys/systm.h>
50#include <sys/sleepq.h> 50#include <sys/sleepq.h>
51#include <sys/ktrace.h> 51#include <sys/ktrace.h>
52 52
53#include <uvm/uvm_extern.h> 
54 
55#include "opt_sa.h" 53#include "opt_sa.h"
56 54
57static int sleepq_sigtoerror(lwp_t *, int); 55static int sleepq_sigtoerror(lwp_t *, int);
58 56
59/* General purpose sleep table, used by ltsleep() and condition variables. */ 57/* General purpose sleep table, used by ltsleep() and condition variables. */
60sleeptab_t sleeptab __cacheline_aligned; 58sleeptab_t sleeptab __cacheline_aligned;
61 59
62/* 60/*
63 * sleeptab_init: 61 * sleeptab_init:
64 * 62 *
65 * Initialize a sleep table. 63 * Initialize a sleep table.
66 */ 64 */
67void 65void
68sleeptab_init(sleeptab_t *st) 66sleeptab_init(sleeptab_t *st)
69{ 67{
70 sleepq_t *sq; 68 sleepq_t *sq;
71 int i; 69 int i;
72 70
73 for (i = 0; i < SLEEPTAB_HASH_SIZE; i++) { 71 for (i = 0; i < SLEEPTAB_HASH_SIZE; i++) {
74 sq = &st->st_queues[i].st_queue; 72 sq = &st->st_queues[i].st_queue;
75 st->st_queues[i].st_mutex = 73 st->st_queues[i].st_mutex =
76 mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED); 74 mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
77 sleepq_init(sq); 75 sleepq_init(sq);
78 } 76 }
79} 77}
80 78
81/* 79/*
82 * sleepq_init: 80 * sleepq_init:
83 * 81 *
84 * Prepare a sleep queue for use. 82 * Prepare a sleep queue for use.
85 */ 83 */
86void 84void
87sleepq_init(sleepq_t *sq) 85sleepq_init(sleepq_t *sq)
88{ 86{
89 87
90 TAILQ_INIT(sq); 88 TAILQ_INIT(sq);
91} 89}
92 90
93/* 91/*
94 * sleepq_remove: 92 * sleepq_remove:
95 * 93 *
96 * Remove an LWP from a sleep queue and wake it up. 94 * Remove an LWP from a sleep queue and wake it up.
97 */ 95 */
98void 96void
99sleepq_remove(sleepq_t *sq, lwp_t *l) 97sleepq_remove(sleepq_t *sq, lwp_t *l)
100{ 98{
101 struct schedstate_percpu *spc; 99 struct schedstate_percpu *spc;
102 struct cpu_info *ci; 100 struct cpu_info *ci;
103 101
104 KASSERT(lwp_locked(l, NULL)); 102 KASSERT(lwp_locked(l, NULL));
105 103
106 TAILQ_REMOVE(sq, l, l_sleepchain); 104 TAILQ_REMOVE(sq, l, l_sleepchain);
107 l->l_syncobj = &sched_syncobj; 105 l->l_syncobj = &sched_syncobj;
108 l->l_wchan = NULL; 106 l->l_wchan = NULL;
109 l->l_sleepq = NULL; 107 l->l_sleepq = NULL;
110 l->l_flag &= ~LW_SINTR; 108 l->l_flag &= ~LW_SINTR;
111 109
112 ci = l->l_cpu; 110 ci = l->l_cpu;
113 spc = &ci->ci_schedstate; 111 spc = &ci->ci_schedstate;
114 112
115 /* 113 /*
116 * If not sleeping, the LWP must have been suspended. Let whoever 114 * If not sleeping, the LWP must have been suspended. Let whoever
117 * holds it in the stopped state set it running again. 115 * holds it in the stopped state set it running again.
118 */ 116 */
119 if (l->l_stat != LSSLEEP) { 117 if (l->l_stat != LSSLEEP) {
120 KASSERT(l->l_stat == LSSTOP || l->l_stat == LSSUSPENDED); 118 KASSERT(l->l_stat == LSSTOP || l->l_stat == LSSUSPENDED);
121 lwp_setlock(l, spc->spc_lwplock); 119 lwp_setlock(l, spc->spc_lwplock);
122 return; 120 return;
123 } 121 }
124 122
125 /* 123 /*
126 * If the LWP is still on the CPU, mark it as LSONPROC. It may be 124 * If the LWP is still on the CPU, mark it as LSONPROC. It may be
127 * about to call mi_switch(), in which case it will yield. 125 * about to call mi_switch(), in which case it will yield.
128 */ 126 */
129 if ((l->l_pflag & LP_RUNNING) != 0) { 127 if ((l->l_pflag & LP_RUNNING) != 0) {
130 l->l_stat = LSONPROC; 128 l->l_stat = LSONPROC;
131 l->l_slptime = 0; 129 l->l_slptime = 0;
132 lwp_setlock(l, spc->spc_lwplock); 130 lwp_setlock(l, spc->spc_lwplock);
133 return; 131 return;
134 } 132 }
135 133
136 /* Update the sleep time delta and call the scheduler's wake-up handler */ 134 /* Update the sleep time delta and call the scheduler's wake-up handler */
137 l->l_slpticksum += (hardclock_ticks - l->l_slpticks); 135 l->l_slpticksum += (hardclock_ticks - l->l_slpticks);
138 sched_wakeup(l); 136 sched_wakeup(l);
139 137
140 /* Look for a CPU to wake up */ 138 /* Look for a CPU to wake up */
141 l->l_cpu = sched_takecpu(l); 139 l->l_cpu = sched_takecpu(l);
142 ci = l->l_cpu; 140 ci = l->l_cpu;
143 spc = &ci->ci_schedstate; 141 spc = &ci->ci_schedstate;
144 142
145 /* 143 /*
146 * Set it running. 144 * Set it running.
147 */ 145 */
148 spc_lock(ci); 146 spc_lock(ci);
149 lwp_setlock(l, spc->spc_mutex); 147 lwp_setlock(l, spc->spc_mutex);
150#ifdef KERN_SA 148#ifdef KERN_SA
151 if (l->l_proc->p_sa != NULL) 149 if (l->l_proc->p_sa != NULL)
152 sa_awaken(l); 150 sa_awaken(l);
153#endif /* KERN_SA */ 151#endif /* KERN_SA */
154 sched_setrunnable(l); 152 sched_setrunnable(l);
155 l->l_stat = LSRUN; 153 l->l_stat = LSRUN;
156 l->l_slptime = 0; 154 l->l_slptime = 0;
157 sched_enqueue(l, false); 155 sched_enqueue(l, false);
158 spc_unlock(ci); 156 spc_unlock(ci);
159} 157}
160 158
161/* 159/*
162 * sleepq_insert: 160 * sleepq_insert:
163 * 161 *
164 * Insert an LWP into the sleep queue, optionally sorting by priority. 162 * Insert an LWP into the sleep queue, optionally sorting by priority.
165 */ 163 */
166void 164void
167sleepq_insert(sleepq_t *sq, lwp_t *l, syncobj_t *sobj) 165sleepq_insert(sleepq_t *sq, lwp_t *l, syncobj_t *sobj)
168{ 166{
169 167
170 if ((sobj->sobj_flag & SOBJ_SLEEPQ_SORTED) != 0) { 168 if ((sobj->sobj_flag & SOBJ_SLEEPQ_SORTED) != 0) {
171 lwp_t *l2; 169 lwp_t *l2;
172 const int pri = lwp_eprio(l); 170 const int pri = lwp_eprio(l);
173 171
174 TAILQ_FOREACH(l2, sq, l_sleepchain) { 172 TAILQ_FOREACH(l2, sq, l_sleepchain) {
175 if (lwp_eprio(l2) < pri) { 173 if (lwp_eprio(l2) < pri) {
176 TAILQ_INSERT_BEFORE(l2, l, l_sleepchain); 174 TAILQ_INSERT_BEFORE(l2, l, l_sleepchain);
177 return; 175 return;
178 } 176 }
179 } 177 }
180 } 178 }
181 179
182 if ((sobj->sobj_flag & SOBJ_SLEEPQ_LIFO) != 0) 180 if ((sobj->sobj_flag & SOBJ_SLEEPQ_LIFO) != 0)
183 TAILQ_INSERT_HEAD(sq, l, l_sleepchain); 181 TAILQ_INSERT_HEAD(sq, l, l_sleepchain);
184 else 182 else
185 TAILQ_INSERT_TAIL(sq, l, l_sleepchain); 183 TAILQ_INSERT_TAIL(sq, l, l_sleepchain);
186} 184}
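
An ordering sketch with illustrative priorities: for a SOBJ_SLEEPQ_SORTED object the queue is kept in descending effective priority, so the loop inserts before the first strictly lower-priority LWP.

        /*
         * Queue before (eprio):   90 - 70 - 50
         * Insert l, eprio 60:     90 - 70 - 60 - 50
         *
         * The strict '<' keeps insertion FIFO among equal priorities;
         * SOBJ_SLEEPQ_LIFO objects skip sorting and push at the head.
         */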
187 185
188/* 186/*
189 * sleepq_enqueue: 187 * sleepq_enqueue:
190 * 188 *
191 * Enter an LWP into the sleep queue and prepare for sleep. The sleep 189 * Enter an LWP into the sleep queue and prepare for sleep. The sleep
192 * queue must already be locked, and any interlock (such as the kernel 190 * queue must already be locked, and any interlock (such as the kernel
193 * lock) must have been released (see sleeptab_lookup(), sleepq_enter()). 191 * lock) must have been released (see sleeptab_lookup(), sleepq_enter()).
194 */ 192 */
195void 193void
196sleepq_enqueue(sleepq_t *sq, wchan_t wchan, const char *wmesg, syncobj_t *sobj) 194sleepq_enqueue(sleepq_t *sq, wchan_t wchan, const char *wmesg, syncobj_t *sobj)
197{ 195{
198 lwp_t *l = curlwp; 196 lwp_t *l = curlwp;
199 197
200 KASSERT(lwp_locked(l, NULL)); 198 KASSERT(lwp_locked(l, NULL));
201 KASSERT(l->l_stat == LSONPROC); 199 KASSERT(l->l_stat == LSONPROC);
202 KASSERT(l->l_wchan == NULL && l->l_sleepq == NULL); 200 KASSERT(l->l_wchan == NULL && l->l_sleepq == NULL);
203 201
204 l->l_syncobj = sobj; 202 l->l_syncobj = sobj;
205 l->l_wchan = wchan; 203 l->l_wchan = wchan;
206 l->l_sleepq = sq; 204 l->l_sleepq = sq;
207 l->l_wmesg = wmesg; 205 l->l_wmesg = wmesg;
208 l->l_slptime = 0; 206 l->l_slptime = 0;
209 l->l_stat = LSSLEEP; 207 l->l_stat = LSSLEEP;
210 l->l_sleeperr = 0; 208 l->l_sleeperr = 0;
211 209
212 sleepq_insert(sq, l, sobj); 210 sleepq_insert(sq, l, sobj);
213 211
214 /* Save the time when the thread went to sleep */ 212 /* Save the time when the thread went to sleep */
215 l->l_slpticks = hardclock_ticks; 213 l->l_slpticks = hardclock_ticks;
216 sched_slept(l); 214 sched_slept(l);
217} 215}
218 216
219/* 217/*
220 * sleepq_block: 218 * sleepq_block:
221 * 219 *
222 * After any intermediate step such as releasing an interlock, switch. 220 * After any intermediate step such as releasing an interlock, switch.
223 * sleepq_block() may return early under exceptional conditions, for 221 * sleepq_block() may return early under exceptional conditions, for
224 * example if the LWP's containing process is exiting. 222 * example if the LWP's containing process is exiting.
225 */ 223 */
226int 224int
227sleepq_block(int timo, bool catch) 225sleepq_block(int timo, bool catch)
228{ 226{
229 int error = 0, sig; 227 int error = 0, sig;
230 struct proc *p; 228 struct proc *p;
231 lwp_t *l = curlwp; 229 lwp_t *l = curlwp;
232 bool early = false; 230 bool early = false;
233 int biglocks = l->l_biglocks; 231 int biglocks = l->l_biglocks;
234 232
235 ktrcsw(1, 0); 233 ktrcsw(1, 0);
236 234
237 /* 235 /*
238 * If sleeping interruptibly, check for pending signals, exits or 236 * If sleeping interruptibly, check for pending signals, exits or
239 * core dump events. 237 * core dump events.
240 */ 238 */
241 if (catch) { 239 if (catch) {
242 l->l_flag |= LW_SINTR; 240 l->l_flag |= LW_SINTR;
243 if ((l->l_flag & (LW_CANCELLED|LW_WEXIT|LW_WCORE)) != 0) { 241 if ((l->l_flag & (LW_CANCELLED|LW_WEXIT|LW_WCORE)) != 0) {
244 l->l_flag &= ~LW_CANCELLED; 242 l->l_flag &= ~LW_CANCELLED;
245 error = EINTR; 243 error = EINTR;
246 early = true; 244 early = true;
247 } else if ((l->l_flag & LW_PENDSIG) != 0 && sigispending(l, 0)) 245 } else if ((l->l_flag & LW_PENDSIG) != 0 && sigispending(l, 0))
248 early = true; 246 early = true;
249 } 247 }
250 248
251 if (early) { 249 if (early) {
252 /* lwp_unsleep() will release the lock */ 250 /* lwp_unsleep() will release the lock */
253 lwp_unsleep(l, true); 251 lwp_unsleep(l, true);
254 } else { 252 } else {
255 if (timo) 253 if (timo)
256 callout_schedule(&l->l_timeout_ch, timo); 254 callout_schedule(&l->l_timeout_ch, timo);
257 255
258#ifdef KERN_SA 256#ifdef KERN_SA
259 if (((l->l_flag & LW_SA) != 0) && (~l->l_pflag & LP_SA_NOBLOCK)) 257 if (((l->l_flag & LW_SA) != 0) && (~l->l_pflag & LP_SA_NOBLOCK))
260 sa_switch(l); 258 sa_switch(l);
261 else 259 else
262#endif 260#endif
263 mi_switch(l); 261 mi_switch(l);
264 262
265 /* The LWP and sleep queue are now unlocked. */ 263 /* The LWP and sleep queue are now unlocked. */
266 if (timo) { 264 if (timo) {
267 /* 265 /*
268 * Even if the callout appears to have fired, we need to 266 * Even if the callout appears to have fired, we need to
269 * stop it in order to synchronise with other CPUs. 267 * stop it in order to synchronise with other CPUs.
270 */ 268 */
271 if (callout_halt(&l->l_timeout_ch, NULL)) 269 if (callout_halt(&l->l_timeout_ch, NULL))
272 error = EWOULDBLOCK; 270 error = EWOULDBLOCK;
273 } 271 }
274 } 272 }
275 273
276 if (catch && error == 0) { 274 if (catch && error == 0) {
277 p = l->l_proc; 275 p = l->l_proc;
278 if ((l->l_flag & (LW_CANCELLED | LW_WEXIT | LW_WCORE)) != 0) 276 if ((l->l_flag & (LW_CANCELLED | LW_WEXIT | LW_WCORE)) != 0)
279 error = EINTR; 277 error = EINTR;
280 else if ((l->l_flag & LW_PENDSIG) != 0) { 278 else if ((l->l_flag & LW_PENDSIG) != 0) {
281 /* 279 /*
282 * Acquiring p_lock may cause us to recurse 280 * Acquiring p_lock may cause us to recurse
283 * through the sleep path and back into this 281 * through the sleep path and back into this
284 * routine, but is safe because LWPs sleeping 282 * routine, but is safe because LWPs sleeping
285 * on locks are non-interruptible. We will 283 * on locks are non-interruptible. We will
286 * not recurse again. 284 * not recurse again.
287 */ 285 */
288 mutex_enter(p->p_lock); 286 mutex_enter(p->p_lock);
289 if ((sig = issignal(l)) != 0) 287 if ((sig = issignal(l)) != 0)
290 error = sleepq_sigtoerror(l, sig); 288 error = sleepq_sigtoerror(l, sig);
291 mutex_exit(p->p_lock); 289 mutex_exit(p->p_lock);
292 } 290 }
293 } 291 }
294 292
295 ktrcsw(0, 0); 293 ktrcsw(0, 0);
296 if (__predict_false(biglocks != 0)) { 294 if (__predict_false(biglocks != 0)) {
297 KERNEL_LOCK(biglocks, NULL); 295 KERNEL_LOCK(biglocks, NULL);
298 } 296 }
299 return error; 297 return error;
300} 298}
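
For context, a sketch of the canonical enqueue/block sequence used by the sleep/wakeup and condition-variable paths. This assumes the sleeptab_lookup()/sleepq_enter() interfaces of this era and the sleep_syncobj from kern_synch.c; it is an illustration, not part of the diff.

        kmutex_t *mp;
        sleepq_t *sq;
        int error;

        /* Find and lock the hashed sleep queue for this wait channel. */
        sq = sleeptab_lookup(&sleeptab, wchan, &mp);
        sleepq_enter(sq, curlwp, mp);
        sleepq_enqueue(sq, wchan, "example", &sleep_syncobj);

        /* Switch away; may return 0, EINTR, ERESTART or EWOULDBLOCK. */
        error = sleepq_block(timo, true);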
301 299
302/* 300/*
303 * sleepq_wake: 301 * sleepq_wake:
304 * 302 *
305 * Wake zero or more LWPs blocked on a single wait channel. 303 * Wake zero or more LWPs blocked on a single wait channel.
306 */ 304 */
307lwp_t * 305lwp_t *
308sleepq_wake(sleepq_t *sq, wchan_t wchan, u_int expected, kmutex_t *mp) 306sleepq_wake(sleepq_t *sq, wchan_t wchan, u_int expected, kmutex_t *mp)
309{ 307{
310 lwp_t *l, *next; 308 lwp_t *l, *next;
311 309
312 KASSERT(mutex_owned(mp)); 310 KASSERT(mutex_owned(mp));
313 311
314 for (l = TAILQ_FIRST(sq); l != NULL; l = next) { 312 for (l = TAILQ_FIRST(sq); l != NULL; l = next) {
315 KASSERT(l->l_sleepq == sq); 313 KASSERT(l->l_sleepq == sq);
316 KASSERT(l->l_mutex == mp); 314 KASSERT(l->l_mutex == mp);
317 next = TAILQ_NEXT(l, l_sleepchain); 315 next = TAILQ_NEXT(l, l_sleepchain);
318 if (l->l_wchan != wchan) 316 if (l->l_wchan != wchan)
319 continue; 317 continue;
320 sleepq_remove(sq, l); 318 sleepq_remove(sq, l);
321 if (--expected == 0) 319 if (--expected == 0)
322 break; 320 break;
323 } 321 }
324 322
325 mutex_spin_exit(mp); 323 mutex_spin_exit(mp);
326 return l; 324 return l;
327} 325}
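
Caller's view, sketched with illustrative locking: the queue mutex mp must be held on entry and sleepq_wake() always releases it; expected bounds how many matching LWPs are woken.

        /* Wake at most one waiter (cv_signal-style)... */
        mutex_spin_enter(mp);
        (void)sleepq_wake(sq, wchan, 1, mp);

        /* ...or every waiter (cv_broadcast-style). */
        mutex_spin_enter(mp);
        (void)sleepq_wake(sq, wchan, (u_int)-1, mp);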
328 326
329/* 327/*
330 * sleepq_unsleep: 328 * sleepq_unsleep:
331 * 329 *
332 * Remove an LWP from its sleep queue and set it runnable again.  330 * Remove an LWP from its sleep queue and set it runnable again.
333 * sleepq_unsleep() is called with the LWP's mutex held, and will 331 * sleepq_unsleep() is called with the LWP's mutex held, and will
334 * always release it. 332 * always release it.
335 */ 333 */
336void 334void
337sleepq_unsleep(lwp_t *l, bool cleanup) 335sleepq_unsleep(lwp_t *l, bool cleanup)
338{ 336{
339 sleepq_t *sq = l->l_sleepq; 337 sleepq_t *sq = l->l_sleepq;
340 kmutex_t *mp = l->l_mutex; 338 kmutex_t *mp = l->l_mutex;
341 339
342 KASSERT(lwp_locked(l, mp)); 340 KASSERT(lwp_locked(l, mp));
343 KASSERT(l->l_wchan != NULL); 341 KASSERT(l->l_wchan != NULL);
344 342
345 sleepq_remove(sq, l); 343 sleepq_remove(sq, l);
346 if (cleanup) { 344 if (cleanup) {
347 mutex_spin_exit(mp); 345 mutex_spin_exit(mp);
348 } 346 }
349} 347}
350 348
351/* 349/*
352 * sleepq_timeout: 350 * sleepq_timeout:
353 * 351 *
354 * Entered via the callout(9) subsystem to time out an LWP that is on a 352 * Entered via the callout(9) subsystem to time out an LWP that is on a
355 * sleep queue. 353 * sleep queue.
356 */ 354 */
357void 355void
358sleepq_timeout(void *arg) 356sleepq_timeout(void *arg)
359{ 357{
360 lwp_t *l = arg; 358 lwp_t *l = arg;
361 359
362 /* 360 /*
363 * Lock the LWP. Assuming it's still on the sleep queue, its 361 * Lock the LWP. Assuming it's still on the sleep queue, its
364 * current mutex will also be the sleep queue mutex. 362 * current mutex will also be the sleep queue mutex.
365 */ 363 */
366 lwp_lock(l); 364 lwp_lock(l);
367 365
368 if (l->l_wchan == NULL) { 366 if (l->l_wchan == NULL) {
369 /* Somebody beat us to it. */ 367 /* Somebody beat us to it. */
370 lwp_unlock(l); 368 lwp_unlock(l);
371 return; 369 return;
372 } 370 }
373 371
374 lwp_unsleep(l, true); 372 lwp_unsleep(l, true);
375} 373}
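
For reference, the callout fired here is armed by sleepq_block() above and bound to this handler when the LWP is set up, roughly as below; a sketch only, see the LWP creation path for the real wiring.

        /* At LWP initialization (sketch): */
        callout_init(&l->l_timeout_ch, CALLOUT_MPSAFE);
        callout_setfunc(&l->l_timeout_ch, sleepq_timeout, l);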
376 374
377/* 375/*
378 * sleepq_sigtoerror: 376 * sleepq_sigtoerror:
379 * 377 *
380 * Given a signal number, interpret and return an error code. 378 * Given a signal number, interpret and return an error code.
381 */ 379 */
382static int 380static int
383sleepq_sigtoerror(lwp_t *l, int sig) 381sleepq_sigtoerror(lwp_t *l, int sig)
384{ 382{
385 struct proc *p = l->l_proc; 383 struct proc *p = l->l_proc;
386 int error; 384 int error;
387 385
388 KASSERT(mutex_owned(p->p_lock)); 386 KASSERT(mutex_owned(p->p_lock));
389 387
390 /* 388 /*
391 * If this sleep was canceled, don't let the syscall restart. 389 * If this sleep was canceled, don't let the syscall restart.
392 */ 390 */
393 if ((SIGACTION(p, sig).sa_flags & SA_RESTART) == 0) 391 if ((SIGACTION(p, sig).sa_flags & SA_RESTART) == 0)
394 error = EINTR; 392 error = EINTR;
395 else 393 else
396 error = ERESTART; 394 error = ERESTART;
397 395
398 return error; 396 return error;
399} 397}
400 398
401/* 399/*
402 * sleepq_abort: 400 * sleepq_abort:
403 * 401 *
404 * After a panic or during autoconfiguration, lower the interrupt 402 * After a panic or during autoconfiguration, lower the interrupt
405 * priority level to give pending interrupts a chance to run, and 403 * priority level to give pending interrupts a chance to run, and
406 * then return. Called if sleepq_dontsleep() returns non-zero, and 404 * then return. Called if sleepq_dontsleep() returns non-zero, and
407 * always returns zero. 405 * always returns zero.
408 */ 406 */
409int 407int
410sleepq_abort(kmutex_t *mtx, int unlock) 408sleepq_abort(kmutex_t *mtx, int unlock)
411{  409{
412 extern int safepri; 410 extern int safepri;
413 int s; 411 int s;
414 412
415 s = splhigh(); 413 s = splhigh();
416 splx(safepri); 414 splx(safepri);
417 splx(s); 415 splx(s);
418 if (mtx != NULL && unlock != 0) 416 if (mtx != NULL && unlock != 0)
419 mutex_exit(mtx); 417 mutex_exit(mtx);
420 418
421 return 0; 419 return 0;
422} 420}
423 421
424/* 422/*
425 * sleepq_changepri: 423 * sleepq_changepri:
426 * 424 *
427 * Adjust the priority of an LWP residing on a sleepq. This method 425 * Adjust the priority of an LWP residing on a sleepq. This method
428 * will only alter the user priority; the effective priority is 426 * will only alter the user priority; the effective priority is
429 * assumed to have been fixed at the time of insertion into the queue. 427 * assumed to have been fixed at the time of insertion into the queue.
430 */ 428 */
431void 429void
432sleepq_changepri(lwp_t *l, pri_t pri) 430sleepq_changepri(lwp_t *l, pri_t pri)
433{ 431{
434 sleepq_t *sq = l->l_sleepq; 432 sleepq_t *sq = l->l_sleepq;
435 pri_t opri; 433 pri_t opri;
436 434
437 KASSERT(lwp_locked(l, NULL)); 435 KASSERT(lwp_locked(l, NULL));
438 436
439 opri = lwp_eprio(l); 437 opri = lwp_eprio(l);
440 l->l_priority = pri; 438 l->l_priority = pri;
441 439
442 if (lwp_eprio(l) == opri) { 440 if (lwp_eprio(l) == opri) {
443 return; 441 return;
444 } 442 }
445 if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_SORTED) == 0) { 443 if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_SORTED) == 0) {
446 return; 444 return;
447 } 445 }
448 446
449 /* 447 /*
450 * Don't let the sleep queue become empty, even briefly. 448 * Don't let the sleep queue become empty, even briefly.
451 * cv_signal() and cv_broadcast() inspect it without the 449 * cv_signal() and cv_broadcast() inspect it without the
452 * sleep queue lock held and need to see a non-empty queue 450 * sleep queue lock held and need to see a non-empty queue
453 * head if there are waiters. 451 * head if there are waiters.
454 */ 452 */
455 if (TAILQ_FIRST(sq) == l && TAILQ_NEXT(l, l_sleepchain) == NULL) { 453 if (TAILQ_FIRST(sq) == l && TAILQ_NEXT(l, l_sleepchain) == NULL) {
456 return; 454 return;
457 } 455 }
458 TAILQ_REMOVE(sq, l, l_sleepchain); 456 TAILQ_REMOVE(sq, l, l_sleepchain);
459 sleepq_insert(sq, l, l->l_syncobj); 457 sleepq_insert(sq, l, l->l_syncobj);
460} 458}
461 459
462void 460void
463sleepq_lendpri(lwp_t *l, pri_t pri) 461sleepq_lendpri(lwp_t *l, pri_t pri)
464{ 462{
465 sleepq_t *sq = l->l_sleepq; 463 sleepq_t *sq = l->l_sleepq;
466 pri_t opri; 464 pri_t opri;
467 465
468 KASSERT(lwp_locked(l, NULL)); 466 KASSERT(lwp_locked(l, NULL));
469 467
470 opri = lwp_eprio(l); 468 opri = lwp_eprio(l);
471 l->l_inheritedprio = pri; 469 l->l_inheritedprio = pri;
472 470
473 if (lwp_eprio(l) == opri) { 471 if (lwp_eprio(l) == opri) {
474 return; 472 return;
475 } 473 }
476 if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_SORTED) == 0) { 474 if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_SORTED) == 0) {
477 return; 475 return;
478 } 476 }
479 477
480 /* 478 /*
481 * Don't let the sleep queue become empty, even briefly. 479 * Don't let the sleep queue become empty, even briefly.
482 * cv_signal() and cv_broadcast() inspect it without the 480 * cv_signal() and cv_broadcast() inspect it without the
483 * sleep queue lock held and need to see a non-empty queue 481 * sleep queue lock held and need to see a non-empty queue
484 * head if there are waiters. 482 * head if there are waiters.
485 */ 483 */
486 if (TAILQ_FIRST(sq) == l && TAILQ_NEXT(l, l_sleepchain) == NULL) { 484 if (TAILQ_FIRST(sq) == l && TAILQ_NEXT(l, l_sleepchain) == NULL) {
487 return; 485 return;
488 } 486 }
489 TAILQ_REMOVE(sq, l, l_sleepchain); 487 TAILQ_REMOVE(sq, l, l_sleepchain);
490 sleepq_insert(sq, l, l->l_syncobj); 488 sleepq_insert(sq, l, l->l_syncobj);
491} 489}

cvs diff -r1.208 -r1.209 src/sys/kern/kern_subr.c (switch to unified diff)

--- src/sys/kern/kern_subr.c 2010/11/11 11:07:07 1.208
+++ src/sys/kern/kern_subr.c 2011/07/27 14:35:34 1.209
@@ -1,726 +1,724 @@ @@ -1,726 +1,724 @@
1/* $NetBSD: kern_subr.c,v 1.208 2010/11/11 11:07:07 hannken Exp $ */ 1/* $NetBSD: kern_subr.c,v 1.209 2011/07/27 14:35:34 uebayasi Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 1997, 1998, 1999, 2002, 2007, 2008 The NetBSD Foundation, Inc. 4 * Copyright (c) 1997, 1998, 1999, 2002, 2007, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, and by Luke Mewburn. 9 * NASA Ames Research Center, and by Luke Mewburn.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions 12 * modification, are permitted provided that the following conditions
13 * are met: 13 * are met:
14 * 1. Redistributions of source code must retain the above copyright 14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer. 15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright 16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the 17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution. 18 * documentation and/or other materials provided with the distribution.
19 * 19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE. 30 * POSSIBILITY OF SUCH DAMAGE.
31 */ 31 */
32 32
33/* 33/*
34 * Copyright (c) 1982, 1986, 1991, 1993 34 * Copyright (c) 1982, 1986, 1991, 1993
35 * The Regents of the University of California. All rights reserved. 35 * The Regents of the University of California. All rights reserved.
36 * (c) UNIX System Laboratories, Inc. 36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed 37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph 38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc. 40 * the permission of UNIX System Laboratories, Inc.
41 * 41 *
42 * Copyright (c) 1992, 1993 42 * Copyright (c) 1992, 1993
43 * The Regents of the University of California. All rights reserved. 43 * The Regents of the University of California. All rights reserved.
44 * 44 *
45 * This software was developed by the Computer Systems Engineering group 45 * This software was developed by the Computer Systems Engineering group
46 * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and 46 * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
47 * contributed to Berkeley. 47 * contributed to Berkeley.
48 * 48 *
49 * All advertising materials mentioning features or use of this software 49 * All advertising materials mentioning features or use of this software
50 * must display the following acknowledgement: 50 * must display the following acknowledgement:
51 * This product includes software developed by the University of 51 * This product includes software developed by the University of
52 * California, Lawrence Berkeley Laboratory. 52 * California, Lawrence Berkeley Laboratory.
53 * 53 *
54 * Redistribution and use in source and binary forms, with or without 54 * Redistribution and use in source and binary forms, with or without
55 * modification, are permitted provided that the following conditions 55 * modification, are permitted provided that the following conditions
56 * are met: 56 * are met:
57 * 1. Redistributions of source code must retain the above copyright 57 * 1. Redistributions of source code must retain the above copyright
58 * notice, this list of conditions and the following disclaimer. 58 * notice, this list of conditions and the following disclaimer.
59 * 2. Redistributions in binary form must reproduce the above copyright 59 * 2. Redistributions in binary form must reproduce the above copyright
60 * notice, this list of conditions and the following disclaimer in the 60 * notice, this list of conditions and the following disclaimer in the
61 * documentation and/or other materials provided with the distribution. 61 * documentation and/or other materials provided with the distribution.
62 * 3. Neither the name of the University nor the names of its contributors 62 * 3. Neither the name of the University nor the names of its contributors
63 * may be used to endorse or promote products derived from this software 63 * may be used to endorse or promote products derived from this software
64 * without specific prior written permission. 64 * without specific prior written permission.
65 * 65 *
66 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 66 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 69 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE. 76 * SUCH DAMAGE.
77 * 77 *
78 * @(#)kern_subr.c 8.4 (Berkeley) 2/14/95 78 * @(#)kern_subr.c 8.4 (Berkeley) 2/14/95
79 */ 79 */
80 80
81#include <sys/cdefs.h> 81#include <sys/cdefs.h>
82__KERNEL_RCSID(0, "$NetBSD: kern_subr.c,v 1.208 2010/11/11 11:07:07 hannken Exp $"); 82__KERNEL_RCSID(0, "$NetBSD: kern_subr.c,v 1.209 2011/07/27 14:35:34 uebayasi Exp $");
83 83
84#include "opt_ddb.h" 84#include "opt_ddb.h"
85#include "opt_md.h" 85#include "opt_md.h"
86#include "opt_syscall_debug.h" 86#include "opt_syscall_debug.h"
87#include "opt_ktrace.h" 87#include "opt_ktrace.h"
88#include "opt_ptrace.h" 88#include "opt_ptrace.h"
89#include "opt_tftproot.h" 89#include "opt_tftproot.h"
90 90
91#include <sys/param.h> 91#include <sys/param.h>
92#include <sys/systm.h> 92#include <sys/systm.h>
93#include <sys/proc.h> 93#include <sys/proc.h>
94#include <sys/mount.h> 94#include <sys/mount.h>
95#include <sys/device.h> 95#include <sys/device.h>
96#include <sys/reboot.h> 96#include <sys/reboot.h>
97#include <sys/conf.h> 97#include <sys/conf.h>
98#include <sys/disk.h> 98#include <sys/disk.h>
99#include <sys/disklabel.h> 99#include <sys/disklabel.h>
100#include <sys/queue.h> 100#include <sys/queue.h>
101#include <sys/ktrace.h> 101#include <sys/ktrace.h>
102#include <sys/ptrace.h> 102#include <sys/ptrace.h>
103#include <sys/fcntl.h> 103#include <sys/fcntl.h>
104#include <sys/kauth.h> 104#include <sys/kauth.h>
105#include <sys/stat.h> 105#include <sys/stat.h>
106#include <sys/vnode.h> 106#include <sys/vnode.h>
107#include <sys/module.h> 107#include <sys/module.h>
108 108
109#include <uvm/uvm_extern.h> 
110 
111#include <dev/cons.h> 109#include <dev/cons.h>
112 110
113#include <net/if.h> 111#include <net/if.h>
114 112
115/* XXX these should eventually move to subr_autoconf.c */ 113/* XXX these should eventually move to subr_autoconf.c */
116static device_t finddevice(const char *); 114static device_t finddevice(const char *);
117static device_t getdisk(char *, int, int, dev_t *, int); 115static device_t getdisk(char *, int, int, dev_t *, int);
118static device_t parsedisk(char *, int, int, dev_t *); 116static device_t parsedisk(char *, int, int, dev_t *);
119static const char *getwedgename(const char *, int); 117static const char *getwedgename(const char *, int);
120 118
121#ifdef TFTPROOT 119#ifdef TFTPROOT
122int tftproot_dhcpboot(device_t); 120int tftproot_dhcpboot(device_t);
123#endif 121#endif
124 122
125dev_t dumpcdev; /* for savecore */ 123dev_t dumpcdev; /* for savecore */
126 124
127static int 125static int
128isswap(device_t dv) 126isswap(device_t dv)
129{ 127{
130 struct dkwedge_info wi; 128 struct dkwedge_info wi;
131 struct vnode *vn; 129 struct vnode *vn;
132 int error; 130 int error;
133 131
134 if (device_class(dv) != DV_DISK || !device_is_a(dv, "dk")) 132 if (device_class(dv) != DV_DISK || !device_is_a(dv, "dk"))
135 return 0; 133 return 0;
136 134
137 if ((vn = opendisk(dv)) == NULL) 135 if ((vn = opendisk(dv)) == NULL)
138 return 0; 136 return 0;
139 137
140 error = VOP_IOCTL(vn, DIOCGWEDGEINFO, &wi, FREAD, NOCRED); 138 error = VOP_IOCTL(vn, DIOCGWEDGEINFO, &wi, FREAD, NOCRED);
141 VOP_CLOSE(vn, FREAD, NOCRED); 139 VOP_CLOSE(vn, FREAD, NOCRED);
142 vput(vn); 140 vput(vn);
143 if (error) { 141 if (error) {
144#ifdef DEBUG_WEDGE 142#ifdef DEBUG_WEDGE
145 printf("%s: Get wedge info returned %d\n", device_xname(dv), error); 143 printf("%s: Get wedge info returned %d\n", device_xname(dv), error);
146#endif 144#endif
147 return 0; 145 return 0;
148 } 146 }
149 return strcmp(wi.dkw_ptype, DKW_PTYPE_SWAP) == 0; 147 return strcmp(wi.dkw_ptype, DKW_PTYPE_SWAP) == 0;
150} 148}
151 149
152/* 150/*
153 * Determine the root device and, if instructed to, the root file system. 151 * Determine the root device and, if instructed to, the root file system.
154 */ 152 */
155 153
156#ifdef MEMORY_DISK_IS_ROOT 154#ifdef MEMORY_DISK_IS_ROOT
157int md_is_root = 1; 155int md_is_root = 1;
158#else 156#else
159int md_is_root = 0; 157int md_is_root = 0;
160#endif 158#endif
161 159
162/* 160/*
163 * The device and wedge that we booted from. If booted_wedge is NULL, 161 * The device and wedge that we booted from. If booted_wedge is NULL,
164 * then we might consult booted_partition. 162 * then we might consult booted_partition.
165 */ 163 */
166device_t booted_device; 164device_t booted_device;
167device_t booted_wedge; 165device_t booted_wedge;
168int booted_partition; 166int booted_partition;
169 167
170/* 168/*
171 * Use partition letters if it's a disk class but not a wedge. 169 * Use partition letters if it's a disk class but not a wedge.
172 * XXX Check for wedge is kinda gross. 170 * XXX Check for wedge is kinda gross.
173 */ 171 */
174#define DEV_USES_PARTITIONS(dv) \ 172#define DEV_USES_PARTITIONS(dv) \
175 (device_class((dv)) == DV_DISK && \ 173 (device_class((dv)) == DV_DISK && \
176 !device_is_a((dv), "dk")) 174 !device_is_a((dv), "dk"))
177 175
178void 176void
179setroot(device_t bootdv, int bootpartition) 177setroot(device_t bootdv, int bootpartition)
180{ 178{
181 device_t dv; 179 device_t dv;
182 deviter_t di; 180 deviter_t di;
183 int len, majdev; 181 int len, majdev;
184 dev_t nrootdev; 182 dev_t nrootdev;
185 dev_t ndumpdev = NODEV; 183 dev_t ndumpdev = NODEV;
186 char buf[128]; 184 char buf[128];
187 const char *rootdevname; 185 const char *rootdevname;
188 const char *dumpdevname; 186 const char *dumpdevname;
189 device_t rootdv = NULL; /* XXX gcc -Wuninitialized */ 187 device_t rootdv = NULL; /* XXX gcc -Wuninitialized */
190 device_t dumpdv = NULL; 188 device_t dumpdv = NULL;
191 struct ifnet *ifp; 189 struct ifnet *ifp;
192 const char *deffsname; 190 const char *deffsname;
193 struct vfsops *vops; 191 struct vfsops *vops;
194 192
195#ifdef TFTPROOT 193#ifdef TFTPROOT
196 if (tftproot_dhcpboot(bootdv) != 0) 194 if (tftproot_dhcpboot(bootdv) != 0)
197 boothowto |= RB_ASKNAME; 195 boothowto |= RB_ASKNAME;
198#endif 196#endif
199 197
200 /* 198 /*
201 * For root on md0 we have to force the attachment of md0. 199 * For root on md0 we have to force the attachment of md0.
202 */ 200 */
203 if (md_is_root) { 201 if (md_is_root) {
204 int md_major; 202 int md_major;
205 dev_t md_dev; 203 dev_t md_dev;
206 204
207 bootdv = NULL; 205 bootdv = NULL;
208 md_major = devsw_name2blk("md", NULL, 0); 206 md_major = devsw_name2blk("md", NULL, 0);
209 if (md_major >= 0) { 207 if (md_major >= 0) {
210 md_dev = MAKEDISKDEV(md_major, 0, RAW_PART); 208 md_dev = MAKEDISKDEV(md_major, 0, RAW_PART);
211 if (bdev_open(md_dev, FREAD, S_IFBLK, curlwp) == 0) 209 if (bdev_open(md_dev, FREAD, S_IFBLK, curlwp) == 0)
212 bootdv = device_find_by_xname("md0"); 210 bootdv = device_find_by_xname("md0");
213 } 211 }
214 if (bootdv == NULL) 212 if (bootdv == NULL)
215 panic("Cannot open \"md0\" (root)"); 213 panic("Cannot open \"md0\" (root)");
216 } 214 }
217 215
218 /* 216 /*
219 * If NFS is specified as the file system, and we found 217 * If NFS is specified as the file system, and we found
220 * a DV_DISK boot device (or no boot device at all), then 218 * a DV_DISK boot device (or no boot device at all), then
221 * find a reasonable network interface for "rootspec". 219 * find a reasonable network interface for "rootspec".
222 */ 220 */
223 vops = vfs_getopsbyname(MOUNT_NFS); 221 vops = vfs_getopsbyname(MOUNT_NFS);
224 if (vops != NULL && strcmp(rootfstype, MOUNT_NFS) == 0 && 222 if (vops != NULL && strcmp(rootfstype, MOUNT_NFS) == 0 &&
225 rootspec == NULL && 223 rootspec == NULL &&
226 (bootdv == NULL || device_class(bootdv) != DV_IFNET)) { 224 (bootdv == NULL || device_class(bootdv) != DV_IFNET)) {
227 IFNET_FOREACH(ifp) { 225 IFNET_FOREACH(ifp) {
228 if ((ifp->if_flags & 226 if ((ifp->if_flags &
229 (IFF_LOOPBACK|IFF_POINTOPOINT)) == 0) 227 (IFF_LOOPBACK|IFF_POINTOPOINT)) == 0)
230 break; 228 break;
231 } 229 }
232 if (ifp == NULL) { 230 if (ifp == NULL) {
233 /* 231 /*
234 * Can't find a suitable interface; ask the 232 * Can't find a suitable interface; ask the
235 * user. 233 * user.
236 */ 234 */
237 boothowto |= RB_ASKNAME; 235 boothowto |= RB_ASKNAME;
238 } else { 236 } else {
239 /* 237 /*
240 * Have a suitable interface; behave as if 238 * Have a suitable interface; behave as if
241 * the user specified this interface. 239 * the user specified this interface.
242 */ 240 */
243 rootspec = (const char *)ifp->if_xname; 241 rootspec = (const char *)ifp->if_xname;
244 } 242 }
245 } 243 }
246 if (vops != NULL) 244 if (vops != NULL)
247 vfs_delref(vops); 245 vfs_delref(vops);
248 246
249 /* 247 /*
250 * If wildcarded root and the boot device wasn't determined, 248 * If wildcarded root and the boot device wasn't determined,
251 * ask the user. 249 * ask the user.
252 */ 250 */
253 if (rootspec == NULL && bootdv == NULL) 251 if (rootspec == NULL && bootdv == NULL)
254 boothowto |= RB_ASKNAME; 252 boothowto |= RB_ASKNAME;
255 253
256 top: 254 top:
257 if (boothowto & RB_ASKNAME) { 255 if (boothowto & RB_ASKNAME) {
258 device_t defdumpdv; 256 device_t defdumpdv;
259 257
260 for (;;) { 258 for (;;) {
261 printf("root device"); 259 printf("root device");
262 if (bootdv != NULL) { 260 if (bootdv != NULL) {
263 printf(" (default %s", device_xname(bootdv)); 261 printf(" (default %s", device_xname(bootdv));
264 if (DEV_USES_PARTITIONS(bootdv)) 262 if (DEV_USES_PARTITIONS(bootdv))
265 printf("%c", bootpartition + 'a'); 263 printf("%c", bootpartition + 'a');
266 printf(")"); 264 printf(")");
267 } 265 }
268 printf(": "); 266 printf(": ");
269 len = cngetsn(buf, sizeof(buf)); 267 len = cngetsn(buf, sizeof(buf));
270 if (len == 0 && bootdv != NULL) { 268 if (len == 0 && bootdv != NULL) {
271 strlcpy(buf, device_xname(bootdv), sizeof(buf)); 269 strlcpy(buf, device_xname(bootdv), sizeof(buf));
272 len = strlen(buf); 270 len = strlen(buf);
273 } 271 }
274 if (len > 0 && buf[len - 1] == '*') { 272 if (len > 0 && buf[len - 1] == '*') {
275 buf[--len] = '\0'; 273 buf[--len] = '\0';
276 dv = getdisk(buf, len, 1, &nrootdev, 0); 274 dv = getdisk(buf, len, 1, &nrootdev, 0);
277 if (dv != NULL) { 275 if (dv != NULL) {
278 rootdv = dv; 276 rootdv = dv;
279 break; 277 break;
280 } 278 }
281 } 279 }
282 dv = getdisk(buf, len, bootpartition, &nrootdev, 0); 280 dv = getdisk(buf, len, bootpartition, &nrootdev, 0);
283 if (dv != NULL) { 281 if (dv != NULL) {
284 rootdv = dv; 282 rootdv = dv;
285 break; 283 break;
286 } 284 }
287 } 285 }
288 286
289 /* 287 /*
290 * Set up the default dump device. If root is on 288 * Set up the default dump device. If root is on
291 * a network device, there is no default dump 289 * a network device, there is no default dump
292 * device, since we don't support dumps to the 290 * device, since we don't support dumps to the
293 * network. 291 * network.
294 */ 292 */
295 if (DEV_USES_PARTITIONS(rootdv) == 0) 293 if (DEV_USES_PARTITIONS(rootdv) == 0)
296 defdumpdv = NULL; 294 defdumpdv = NULL;
297 else 295 else
298 defdumpdv = rootdv; 296 defdumpdv = rootdv;
299 297
300 for (;;) { 298 for (;;) {
301 printf("dump device"); 299 printf("dump device");
302 if (defdumpdv != NULL) { 300 if (defdumpdv != NULL) {
303 /* 301 /*
304 * Note, we know it's a disk if we get here. 302 * Note, we know it's a disk if we get here.
305 */ 303 */
306 printf(" (default %sb)", device_xname(defdumpdv)); 304 printf(" (default %sb)", device_xname(defdumpdv));
307 } 305 }
308 printf(": "); 306 printf(": ");
309 len = cngetsn(buf, sizeof(buf)); 307 len = cngetsn(buf, sizeof(buf));
310 if (len == 0) { 308 if (len == 0) {
311 if (defdumpdv != NULL) { 309 if (defdumpdv != NULL) {
312 ndumpdev = MAKEDISKDEV(major(nrootdev), 310 ndumpdev = MAKEDISKDEV(major(nrootdev),
313 DISKUNIT(nrootdev), 1); 311 DISKUNIT(nrootdev), 1);
314 } 312 }
315 dumpdv = defdumpdv; 313 dumpdv = defdumpdv;
316 break; 314 break;
317 } 315 }
318 if (len == 4 && strcmp(buf, "none") == 0) { 316 if (len == 4 && strcmp(buf, "none") == 0) {
319 dumpdv = NULL; 317 dumpdv = NULL;
320 break; 318 break;
321 } 319 }
322 dv = getdisk(buf, len, 1, &ndumpdev, 1); 320 dv = getdisk(buf, len, 1, &ndumpdev, 1);
323 if (dv != NULL) { 321 if (dv != NULL) {
324 dumpdv = dv; 322 dumpdv = dv;
325 break; 323 break;
326 } 324 }
327 } 325 }
328 326
329 rootdev = nrootdev; 327 rootdev = nrootdev;
330 dumpdev = ndumpdev; 328 dumpdev = ndumpdev;
331 329
332 for (vops = LIST_FIRST(&vfs_list); vops != NULL; 330 for (vops = LIST_FIRST(&vfs_list); vops != NULL;
333 vops = LIST_NEXT(vops, vfs_list)) { 331 vops = LIST_NEXT(vops, vfs_list)) {
334 if (vops->vfs_mountroot != NULL && 332 if (vops->vfs_mountroot != NULL &&
335 strcmp(rootfstype, vops->vfs_name) == 0) 333 strcmp(rootfstype, vops->vfs_name) == 0)
336 break; 334 break;
337 } 335 }
338 336
339 if (vops == NULL) { 337 if (vops == NULL) {
340 deffsname = "generic"; 338 deffsname = "generic";
341 } else 339 } else
342 deffsname = vops->vfs_name; 340 deffsname = vops->vfs_name;
343 341
344 for (;;) { 342 for (;;) {
345 printf("file system (default %s): ", deffsname); 343 printf("file system (default %s): ", deffsname);
346 len = cngetsn(buf, sizeof(buf)); 344 len = cngetsn(buf, sizeof(buf));
347 if (len == 0) { 345 if (len == 0) {
348 if (strcmp(deffsname, "generic") == 0) 346 if (strcmp(deffsname, "generic") == 0)
349 rootfstype = ROOT_FSTYPE_ANY; 347 rootfstype = ROOT_FSTYPE_ANY;
350 break; 348 break;
351 } 349 }
352 if (len == 4 && strcmp(buf, "halt") == 0) 350 if (len == 4 && strcmp(buf, "halt") == 0)
353 cpu_reboot(RB_HALT, NULL); 351 cpu_reboot(RB_HALT, NULL);
354 else if (len == 6 && strcmp(buf, "reboot") == 0) 352 else if (len == 6 && strcmp(buf, "reboot") == 0)
355 cpu_reboot(0, NULL); 353 cpu_reboot(0, NULL);
356#if defined(DDB) 354#if defined(DDB)
357 else if (len == 3 && strcmp(buf, "ddb") == 0) { 355 else if (len == 3 && strcmp(buf, "ddb") == 0) {
358 console_debugger(); 356 console_debugger();
359 } 357 }
360#endif 358#endif
361 else if (len == 7 && strcmp(buf, "generic") == 0) { 359 else if (len == 7 && strcmp(buf, "generic") == 0) {
362 rootfstype = ROOT_FSTYPE_ANY; 360 rootfstype = ROOT_FSTYPE_ANY;
363 break; 361 break;
364 } 362 }
365 vops = vfs_getopsbyname(buf); 363 vops = vfs_getopsbyname(buf);
366 if (vops == NULL || vops->vfs_mountroot == NULL) { 364 if (vops == NULL || vops->vfs_mountroot == NULL) {
367 printf("use one of: generic"); 365 printf("use one of: generic");
368 for (vops = LIST_FIRST(&vfs_list); 366 for (vops = LIST_FIRST(&vfs_list);
369 vops != NULL; 367 vops != NULL;
370 vops = LIST_NEXT(vops, vfs_list)) { 368 vops = LIST_NEXT(vops, vfs_list)) {
371 if (vops->vfs_mountroot != NULL) 369 if (vops->vfs_mountroot != NULL)
372 printf(" %s", vops->vfs_name); 370 printf(" %s", vops->vfs_name);
373 } 371 }
374 if (vops != NULL) 372 if (vops != NULL)
375 vfs_delref(vops); 373 vfs_delref(vops);
376#if defined(DDB) 374#if defined(DDB)
377 printf(" ddb"); 375 printf(" ddb");
378#endif 376#endif
379 printf(" halt reboot\n"); 377 printf(" halt reboot\n");
380 } else { 378 } else {
381 /* 379 /*
382 * XXX If *vops gets freed between here and 380 * XXX If *vops gets freed between here and
383 * the call to mountroot(), rootfstype will 381 * the call to mountroot(), rootfstype will
384 * point to something unexpected. But in 382 * point to something unexpected. But in
385 * this case the system will fail anyway. 383 * this case the system will fail anyway.
386 */ 384 */
387 rootfstype = vops->vfs_name; 385 rootfstype = vops->vfs_name;
388 vfs_delref(vops); 386 vfs_delref(vops);
389 break; 387 break;
390 } 388 }
391 } 389 }
392 390
393 } else if (rootspec == NULL) { 391 } else if (rootspec == NULL) {
394 /* 392 /*
395 * Wildcarded root; use the boot device. 393 * Wildcarded root; use the boot device.
396 */ 394 */
397 rootdv = bootdv; 395 rootdv = bootdv;
398 396
399 if (bootdv) 397 if (bootdv)
400 majdev = devsw_name2blk(device_xname(bootdv), NULL, 0); 398 majdev = devsw_name2blk(device_xname(bootdv), NULL, 0);
401 else 399 else
402 majdev = -1; 400 majdev = -1;
403 if (majdev >= 0) { 401 if (majdev >= 0) {
404 /* 402 /*
405 * Root is on a disk. `bootpartition' is the root partition, 403 * Root is on a disk. `bootpartition' is the root partition,
406 * unless the device does not use partitions. 404 * unless the device does not use partitions.
407 */ 405 */
408 if (DEV_USES_PARTITIONS(bootdv)) 406 if (DEV_USES_PARTITIONS(bootdv))
409 rootdev = MAKEDISKDEV(majdev, 407 rootdev = MAKEDISKDEV(majdev,
410 device_unit(bootdv), 408 device_unit(bootdv),
411 bootpartition); 409 bootpartition);
412 else 410 else
413 rootdev = makedev(majdev, device_unit(bootdv)); 411 rootdev = makedev(majdev, device_unit(bootdv));
414 } 412 }
415 } else { 413 } else {
416 414
417 /* 415 /*
418 * `root on <dev> ...' 416 * `root on <dev> ...'
419 */ 417 */
420 418
421 /* 419 /*
422 * If it's a network interface, we can bail out 420 * If it's a network interface, we can bail out
423 * early. 421 * early.
424 */ 422 */
425 dv = finddevice(rootspec); 423 dv = finddevice(rootspec);
426 if (dv != NULL && device_class(dv) == DV_IFNET) { 424 if (dv != NULL && device_class(dv) == DV_IFNET) {
427 rootdv = dv; 425 rootdv = dv;
428 goto haveroot; 426 goto haveroot;
429 } 427 }
430 428
431 if (rootdev == NODEV && 429 if (rootdev == NODEV &&
432 device_class(dv) == DV_DISK && device_is_a(dv, "dk") && 430 device_class(dv) == DV_DISK && device_is_a(dv, "dk") &&
433 (majdev = devsw_name2blk(device_xname(dv), NULL, 0)) >= 0) 431 (majdev = devsw_name2blk(device_xname(dv), NULL, 0)) >= 0)
434 rootdev = makedev(majdev, device_unit(dv)); 432 rootdev = makedev(majdev, device_unit(dv));
435 433
436 rootdevname = devsw_blk2name(major(rootdev)); 434 rootdevname = devsw_blk2name(major(rootdev));
437 if (rootdevname == NULL) { 435 if (rootdevname == NULL) {
438 printf("unknown device major 0x%llx\n", 436 printf("unknown device major 0x%llx\n",
439 (unsigned long long)rootdev); 437 (unsigned long long)rootdev);
440 boothowto |= RB_ASKNAME; 438 boothowto |= RB_ASKNAME;
441 goto top; 439 goto top;
442 } 440 }
443 memset(buf, 0, sizeof(buf)); 441 memset(buf, 0, sizeof(buf));
444 snprintf(buf, sizeof(buf), "%s%llu", rootdevname, 442 snprintf(buf, sizeof(buf), "%s%llu", rootdevname,
445 (unsigned long long)DISKUNIT(rootdev)); 443 (unsigned long long)DISKUNIT(rootdev));
446 444
447 rootdv = finddevice(buf); 445 rootdv = finddevice(buf);
448 if (rootdv == NULL) { 446 if (rootdv == NULL) {
449 printf("device %s (0x%llx) not configured\n", 447 printf("device %s (0x%llx) not configured\n",
450 buf, (unsigned long long)rootdev); 448 buf, (unsigned long long)rootdev);
451 boothowto |= RB_ASKNAME; 449 boothowto |= RB_ASKNAME;
452 goto top; 450 goto top;
453 } 451 }
454 } 452 }
455 453
456 haveroot: 454 haveroot:
457 455
458 root_device = rootdv; 456 root_device = rootdv;
459 457
460 switch (device_class(rootdv)) { 458 switch (device_class(rootdv)) {
461 case DV_IFNET: 459 case DV_IFNET:
462 case DV_DISK: 460 case DV_DISK:
463 aprint_normal("root on %s", device_xname(rootdv)); 461 aprint_normal("root on %s", device_xname(rootdv));
464 if (DEV_USES_PARTITIONS(rootdv)) 462 if (DEV_USES_PARTITIONS(rootdv))
465 aprint_normal("%c", (int)DISKPART(rootdev) + 'a'); 463 aprint_normal("%c", (int)DISKPART(rootdev) + 'a');
466 break; 464 break;
467 465
468 default: 466 default:
469 printf("can't determine root device\n"); 467 printf("can't determine root device\n");
470 boothowto |= RB_ASKNAME; 468 boothowto |= RB_ASKNAME;
471 goto top; 469 goto top;
472 } 470 }
473 471
474 /* 472 /*
475 * Now configure the dump device. 473 * Now configure the dump device.
476 * 474 *
477 * If we haven't figured out the dump device, do so, with 475 * If we haven't figured out the dump device, do so, with
478 * the following rules: 476 * the following rules:
479 * 477 *
480 * (a) We already know dumpdv in the RB_ASKNAME case. 478 * (a) We already know dumpdv in the RB_ASKNAME case.
481 * 479 *
482 * (b) If dumpspec is set, try to use it. If the device 480 * (b) If dumpspec is set, try to use it. If the device
483 * is not available, punt. 481 * is not available, punt.
484 * 482 *
485 * (c) If dumpspec is not set, the dump device is 483 * (c) If dumpspec is not set, the dump device is
486 * wildcarded or unspecified. If the root device 484 * wildcarded or unspecified. If the root device
487 * is DV_IFNET, punt. Otherwise, use partition b 485 * is DV_IFNET, punt. Otherwise, use partition b
488 * of the root device. 486 * of the root device.
489 */ 487 */
490 488
491 if (boothowto & RB_ASKNAME) { /* (a) */ 489 if (boothowto & RB_ASKNAME) { /* (a) */
492 if (dumpdv == NULL) 490 if (dumpdv == NULL)
493 goto nodumpdev; 491 goto nodumpdev;
494 } else if (dumpspec != NULL) { /* (b) */ 492 } else if (dumpspec != NULL) { /* (b) */
495 if (strcmp(dumpspec, "none") == 0 || dumpdev == NODEV) { 493 if (strcmp(dumpspec, "none") == 0 || dumpdev == NODEV) {
496 /* 494 /*
497 * Operator doesn't want a dump device. 495 * Operator doesn't want a dump device.
498 * Or looks like they tried to pick a network 496 * Or looks like they tried to pick a network
499 * device. Oops. 497 * device. Oops.
500 */ 498 */
501 goto nodumpdev; 499 goto nodumpdev;
502 } 500 }
503 501
504 dumpdevname = devsw_blk2name(major(dumpdev)); 502 dumpdevname = devsw_blk2name(major(dumpdev));
505 if (dumpdevname == NULL) 503 if (dumpdevname == NULL)
506 goto nodumpdev; 504 goto nodumpdev;
507 memset(buf, 0, sizeof(buf)); 505 memset(buf, 0, sizeof(buf));
508 snprintf(buf, sizeof(buf), "%s%llu", dumpdevname, 506 snprintf(buf, sizeof(buf), "%s%llu", dumpdevname,
509 (unsigned long long)DISKUNIT(dumpdev)); 507 (unsigned long long)DISKUNIT(dumpdev));
510 508
511 dumpdv = finddevice(buf); 509 dumpdv = finddevice(buf);
512 if (dumpdv == NULL) { 510 if (dumpdv == NULL) {
513 /* 511 /*
514 * Device not configured. 512 * Device not configured.
515 */ 513 */
516 goto nodumpdev; 514 goto nodumpdev;
517 } 515 }
518 } else { /* (c) */ 516 } else { /* (c) */
519 if (DEV_USES_PARTITIONS(rootdv) == 0) { 517 if (DEV_USES_PARTITIONS(rootdv) == 0) {
520 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); 518 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST);
521 dv != NULL; 519 dv != NULL;
522 dv = deviter_next(&di)) 520 dv = deviter_next(&di))
523 if (isswap(dv)) 521 if (isswap(dv))
524 break; 522 break;
525 deviter_release(&di); 523 deviter_release(&di);
526 if (dv == NULL) 524 if (dv == NULL)
527 goto nodumpdev; 525 goto nodumpdev;
528 526
529 majdev = devsw_name2blk(device_xname(dv), NULL, 0); 527 majdev = devsw_name2blk(device_xname(dv), NULL, 0);
530 if (majdev < 0) 528 if (majdev < 0)
531 goto nodumpdev; 529 goto nodumpdev;
532 dumpdv = dv; 530 dumpdv = dv;
533 dumpdev = makedev(majdev, device_unit(dumpdv)); 531 dumpdev = makedev(majdev, device_unit(dumpdv));
534 } else { 532 } else {
535 dumpdv = rootdv; 533 dumpdv = rootdv;
536 dumpdev = MAKEDISKDEV(major(rootdev), 534 dumpdev = MAKEDISKDEV(major(rootdev),
537 device_unit(dumpdv), 1); 535 device_unit(dumpdv), 1);
538 } 536 }
539 } 537 }
540 538
541 dumpcdev = devsw_blk2chr(dumpdev); 539 dumpcdev = devsw_blk2chr(dumpdev);
542 aprint_normal(" dumps on %s", device_xname(dumpdv)); 540 aprint_normal(" dumps on %s", device_xname(dumpdv));
543 if (DEV_USES_PARTITIONS(dumpdv)) 541 if (DEV_USES_PARTITIONS(dumpdv))
544 aprint_normal("%c", (int)DISKPART(dumpdev) + 'a'); 542 aprint_normal("%c", (int)DISKPART(dumpdev) + 'a');
545 aprint_normal("\n"); 543 aprint_normal("\n");
546 return; 544 return;
547 545
548 nodumpdev: 546 nodumpdev:
549 dumpdev = NODEV; 547 dumpdev = NODEV;
550 dumpcdev = NODEV; 548 dumpcdev = NODEV;
551 aprint_normal("\n"); 549 aprint_normal("\n");
552} 550}
553 551
554static device_t 552static device_t
555finddevice(const char *name) 553finddevice(const char *name)
556{ 554{
557 const char *wname; 555 const char *wname;
558 556
559 if ((wname = getwedgename(name, strlen(name))) != NULL) 557 if ((wname = getwedgename(name, strlen(name))) != NULL)
560 return dkwedge_find_by_wname(wname); 558 return dkwedge_find_by_wname(wname);
561 559
562 return device_find_by_xname(name); 560 return device_find_by_xname(name);
563} 561}
564 562
565static device_t 563static device_t
566getdisk(char *str, int len, int defpart, dev_t *devp, int isdump) 564getdisk(char *str, int len, int defpart, dev_t *devp, int isdump)
567{ 565{
568 device_t dv; 566 device_t dv;
569 deviter_t di; 567 deviter_t di;
570 568
571 if ((dv = parsedisk(str, len, defpart, devp)) == NULL) { 569 if ((dv = parsedisk(str, len, defpart, devp)) == NULL) {
572 printf("use one of:"); 570 printf("use one of:");
573 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL; 571 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
574 dv = deviter_next(&di)) { 572 dv = deviter_next(&di)) {
575 if (DEV_USES_PARTITIONS(dv)) 573 if (DEV_USES_PARTITIONS(dv))
576 printf(" %s[a-%c]", device_xname(dv), 574 printf(" %s[a-%c]", device_xname(dv),
577 'a' + MAXPARTITIONS - 1); 575 'a' + MAXPARTITIONS - 1);
578 else if (device_class(dv) == DV_DISK) 576 else if (device_class(dv) == DV_DISK)
579 printf(" %s", device_xname(dv)); 577 printf(" %s", device_xname(dv));
580 if (isdump == 0 && device_class(dv) == DV_IFNET) 578 if (isdump == 0 && device_class(dv) == DV_IFNET)
581 printf(" %s", device_xname(dv)); 579 printf(" %s", device_xname(dv));
582 } 580 }
583 deviter_release(&di); 581 deviter_release(&di);
584 dkwedge_print_wnames(); 582 dkwedge_print_wnames();
585 if (isdump) 583 if (isdump)
586 printf(" none"); 584 printf(" none");
587#if defined(DDB) 585#if defined(DDB)
588 printf(" ddb"); 586 printf(" ddb");
589#endif 587#endif
590 printf(" halt reboot\n"); 588 printf(" halt reboot\n");
591 } 589 }
592 return dv; 590 return dv;
593} 591}
594 592
595static const char * 593static const char *
596getwedgename(const char *name, int namelen) 594getwedgename(const char *name, int namelen)
597{ 595{
598 const char *wpfx = "wedge:"; 596 const char *wpfx = "wedge:";
599 const int wpfxlen = strlen(wpfx); 597 const int wpfxlen = strlen(wpfx);
600 598
601 if (namelen < wpfxlen || strncmp(name, wpfx, wpfxlen) != 0) 599 if (namelen < wpfxlen || strncmp(name, wpfx, wpfxlen) != 0)
602 return NULL; 600 return NULL;
603 601
604 return name + wpfxlen; 602 return name + wpfxlen;
605} 603}
606 604
607static device_t 605static device_t
608parsedisk(char *str, int len, int defpart, dev_t *devp) 606parsedisk(char *str, int len, int defpart, dev_t *devp)
609{ 607{
610 device_t dv; 608 device_t dv;
611 const char *wname; 609 const char *wname;
612 char *cp, c; 610 char *cp, c;
613 int majdev, part; 611 int majdev, part;
614 if (len == 0) 612 if (len == 0)
615 return (NULL); 613 return (NULL);
616 614
617 if (len == 4 && strcmp(str, "halt") == 0) 615 if (len == 4 && strcmp(str, "halt") == 0)
618 cpu_reboot(RB_HALT, NULL); 616 cpu_reboot(RB_HALT, NULL);
619 else if (len == 6 && strcmp(str, "reboot") == 0) 617 else if (len == 6 && strcmp(str, "reboot") == 0)
620 cpu_reboot(0, NULL); 618 cpu_reboot(0, NULL);
621#if defined(DDB) 619#if defined(DDB)
622 else if (len == 3 && strcmp(str, "ddb") == 0) 620 else if (len == 3 && strcmp(str, "ddb") == 0)
623 console_debugger(); 621 console_debugger();
624#endif 622#endif
625 623
626 cp = str + len - 1; 624 cp = str + len - 1;
627 c = *cp; 625 c = *cp;
628 626
629 if ((wname = getwedgename(str, len)) != NULL) { 627 if ((wname = getwedgename(str, len)) != NULL) {
630 if ((dv = dkwedge_find_by_wname(wname)) == NULL) 628 if ((dv = dkwedge_find_by_wname(wname)) == NULL)
631 return NULL; 629 return NULL;
632 part = defpart; 630 part = defpart;
633 goto gotdisk; 631 goto gotdisk;
634 } else if (c >= 'a' && c <= ('a' + MAXPARTITIONS - 1)) { 632 } else if (c >= 'a' && c <= ('a' + MAXPARTITIONS - 1)) {
635 part = c - 'a'; 633 part = c - 'a';
636 *cp = '\0'; 634 *cp = '\0';
637 } else 635 } else
638 part = defpart; 636 part = defpart;
639 637
640 dv = finddevice(str); 638 dv = finddevice(str);
641 if (dv != NULL) { 639 if (dv != NULL) {
642 if (device_class(dv) == DV_DISK) { 640 if (device_class(dv) == DV_DISK) {
643 gotdisk: 641 gotdisk:
644 majdev = devsw_name2blk(device_xname(dv), NULL, 0); 642 majdev = devsw_name2blk(device_xname(dv), NULL, 0);
645 if (majdev < 0) 643 if (majdev < 0)
646 panic("parsedisk"); 644 panic("parsedisk");
647 if (DEV_USES_PARTITIONS(dv)) 645 if (DEV_USES_PARTITIONS(dv))
648 *devp = MAKEDISKDEV(majdev, device_unit(dv), 646 *devp = MAKEDISKDEV(majdev, device_unit(dv),
649 part); 647 part);
650 else 648 else
651 *devp = makedev(majdev, device_unit(dv)); 649 *devp = makedev(majdev, device_unit(dv));
652 } 650 }
653 651
654 if (device_class(dv) == DV_IFNET) 652 if (device_class(dv) == DV_IFNET)
655 *devp = NODEV; 653 *devp = NODEV;
656 } 654 }
657 655
658 *cp = c; 656 *cp = c;
659 return (dv); 657 return (dv);
660} 658}
661 659
662/* 660/*
663 * Return true if system call tracing is enabled for the specified process. 661 * Return true if system call tracing is enabled for the specified process.
664 */ 662 */
665bool 663bool
666trace_is_enabled(struct proc *p) 664trace_is_enabled(struct proc *p)
667{ 665{
668#ifdef SYSCALL_DEBUG 666#ifdef SYSCALL_DEBUG
669 return (true); 667 return (true);
670#endif 668#endif
671#ifdef KTRACE 669#ifdef KTRACE
672 if (ISSET(p->p_traceflag, (KTRFAC_SYSCALL | KTRFAC_SYSRET))) 670 if (ISSET(p->p_traceflag, (KTRFAC_SYSCALL | KTRFAC_SYSRET)))
673 return (true); 671 return (true);
674#endif 672#endif
675#ifdef PTRACE 673#ifdef PTRACE
676 if (ISSET(p->p_slflag, PSL_SYSCALL)) 674 if (ISSET(p->p_slflag, PSL_SYSCALL))
677 return (true); 675 return (true);
678#endif 676#endif
679 677
680 return (false); 678 return (false);
681} 679}
682 680
683/* 681/*
684 * Start trace of particular system call. If process is being traced, 682 * Start trace of particular system call. If process is being traced,
685 * this routine is called by MD syscall dispatch code just before 683 * this routine is called by MD syscall dispatch code just before
686 * a system call is actually executed. 684 * a system call is actually executed.
687 */ 685 */
688int 686int
689trace_enter(register_t code, const register_t *args, int narg) 687trace_enter(register_t code, const register_t *args, int narg)
690{ 688{
691#ifdef SYSCALL_DEBUG 689#ifdef SYSCALL_DEBUG
692 scdebug_call(code, args); 690 scdebug_call(code, args);
693#endif /* SYSCALL_DEBUG */ 691#endif /* SYSCALL_DEBUG */
694 692
695 ktrsyscall(code, args, narg); 693 ktrsyscall(code, args, narg);
696 694
697#ifdef PTRACE 695#ifdef PTRACE
698 if ((curlwp->l_proc->p_slflag & (PSL_SYSCALL|PSL_TRACED)) == 696 if ((curlwp->l_proc->p_slflag & (PSL_SYSCALL|PSL_TRACED)) ==
699 (PSL_SYSCALL|PSL_TRACED)) 697 (PSL_SYSCALL|PSL_TRACED))
700 process_stoptrace(); 698 process_stoptrace();
701#endif 699#endif
702 return 0; 700 return 0;
703} 701}
704 702
705/* 703/*
706 * End trace of particular system call. If process is being traced, 704 * End trace of particular system call. If process is being traced,
707 * this routine is called by MD syscall dispatch code just after 705 * this routine is called by MD syscall dispatch code just after
708 * a system call finishes. 706 * a system call finishes.
709 * MD caller guarantees the passed 'code' is within the supported 707 * MD caller guarantees the passed 'code' is within the supported
710 * system call number range for the emulation the process runs under. 708 * system call number range for the emulation the process runs under.
711 */ 709 */
712void 710void
713trace_exit(register_t code, register_t rval[], int error) 711trace_exit(register_t code, register_t rval[], int error)
714{ 712{
715#ifdef SYSCALL_DEBUG 713#ifdef SYSCALL_DEBUG
716 scdebug_ret(code, error, rval); 714 scdebug_ret(code, error, rval);
717#endif /* SYSCALL_DEBUG */ 715#endif /* SYSCALL_DEBUG */
718 716
719 ktrsysret(code, error, rval); 717 ktrsysret(code, error, rval);
720  718
721#ifdef PTRACE 719#ifdef PTRACE
722 if ((curlwp->l_proc->p_slflag & (PSL_SYSCALL|PSL_TRACED)) == 720 if ((curlwp->l_proc->p_slflag & (PSL_SYSCALL|PSL_TRACED)) ==
723 (PSL_SYSCALL|PSL_TRACED)) 721 (PSL_SYSCALL|PSL_TRACED))
724 process_stoptrace(); 722 process_stoptrace();
725#endif 723#endif
726} 724}
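
For reference, the root-device name parsing in setroot()/parsedisk() above
follows a simple rule: a trailing letter in the range 'a'..'a'+MAXPARTITIONS-1
selects a partition, any other name uses the default partition, and names
carrying the "wedge:" prefix (handled by getwedgename()) bypass partition
letters entirely. The following standalone sketch is not kernel code; the
toy_* names and the simplified major/minor packing are illustrative
assumptions, not the real MAKEDISKDEV() encoding. It only shows the same
trailing-letter logic in isolation:

	#include <stdio.h>
	#include <string.h>

	#define TOY_MAXPARTITIONS 8

	/* Toy stand-in for the kernel's MAKEDISKDEV() macro. */
	static unsigned
	toy_makediskdev(int major, int unit, int part)
	{
		return (major << 16) | (unit * TOY_MAXPARTITIONS + part);
	}

	/*
	 * Parse "wd0a"-style names: strip a trailing partition letter
	 * if present, otherwise fall back to the default partition,
	 * mirroring the tail of parsedisk() above.
	 */
	static int
	toy_parsedisk(const char *str, int defpart, unsigned *devp)
	{
		char buf[32];
		size_t len = strlen(str);
		int part = defpart;

		if (len == 0 || len >= sizeof(buf))
			return -1;
		strcpy(buf, str);
		if (buf[len - 1] >= 'a' &&
		    buf[len - 1] < 'a' + TOY_MAXPARTITIONS) {
			part = buf[len - 1] - 'a';
			buf[len - 1] = '\0';
		}
		/* The kernel would now look buf up with finddevice(). */
		*devp = toy_makediskdev(/* major */ 0, /* unit */ 0, part);
		printf("%s -> \"%s\" partition %c (toy devno 0x%x)\n",
		    str, buf, 'a' + part, *devp);
		return 0;
	}

	int
	main(void)
	{
		unsigned dev;

		toy_parsedisk("wd0a", 0, &dev);	/* explicit partition a */
		toy_parsedisk("wd0", 3, &dev);	/* falls back to defpart */
		return 0;
	}

The in-kernel version above additionally recognizes the keywords halt,
reboot and (with DDB) ddb before attempting any device lookup.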

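A similar note on the kern_time.c diff below: nanosleep1() re-sleeps until
the full request has elapsed, recomputing the remainder from monotonic
timestamps after each wakeup and clamping it at zero. A minimal userland
sketch of that remainder computation follows; ts_sub() is a plain-function
re-implementation of the kernel's timespecsub() macro, and the timestamps
are made-up values for illustration:

	#include <stdio.h>
	#include <time.h>

	/* Userland re-implementation of the kernel's timespecsub(). */
	static void
	ts_sub(const struct timespec *a, const struct timespec *b,
	    struct timespec *res)
	{
		res->tv_sec = a->tv_sec - b->tv_sec;
		res->tv_nsec = a->tv_nsec - b->tv_nsec;
		if (res->tv_nsec < 0) {
			res->tv_sec--;
			res->tv_nsec += 1000000000L;
		}
	}

	int
	main(void)
	{
		/* Requested a 2.5s sleep; woke up 1.2s in (e.g. EINTR). */
		struct timespec rqt = { 2, 500000000L };
		struct timespec start = { 100, 0 };
		struct timespec end = { 101, 200000000L };
		struct timespec elapsed, rmt;

		ts_sub(&end, &start, &elapsed);
		ts_sub(&rqt, &elapsed, &rmt); /* remaining = rqt - elapsed */
		if (rmt.tv_sec < 0)	/* clamp, as timespecclear() does */
			rmt.tv_sec = rmt.tv_nsec = 0;
		printf("remaining: %lld.%09ld s\n",
		    (long long)rmt.tv_sec, rmt.tv_nsec);
		return 0;
	}

In the kernel, a positive remainder is converted back to ticks with
tstohz() and the lwp re-sleeps; the remainder is also what gets copied
out through the rmtp pointer on EINTR.
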
cvs diff -r1.168 -r1.169 src/sys/kern/kern_time.c

--- src/sys/kern/kern_time.c 2011/04/08 10:35:37 1.168
+++ src/sys/kern/kern_time.c 2011/07/27 14:35:34 1.169
@@ -1,1083 +1,1081 @@ @@ -1,1083 +1,1081 @@
1/* $NetBSD: kern_time.c,v 1.168 2011/04/08 10:35:37 yamt Exp $ */ 1/* $NetBSD: kern_time.c,v 1.169 2011/07/27 14:35:34 uebayasi Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2000, 2004, 2005, 2007, 2008, 2009 The NetBSD Foundation, Inc. 4 * Copyright (c) 2000, 2004, 2005, 2007, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Christopher G. Demetriou, and by Andrew Doran. 8 * by Christopher G. Demetriou, and by Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright 15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the 16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution. 17 * documentation and/or other materials provided with the distribution.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32/* 32/*
33 * Copyright (c) 1982, 1986, 1989, 1993 33 * Copyright (c) 1982, 1986, 1989, 1993
34 * The Regents of the University of California. All rights reserved. 34 * The Regents of the University of California. All rights reserved.
35 * 35 *
36 * Redistribution and use in source and binary forms, with or without 36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions 37 * modification, are permitted provided that the following conditions
38 * are met: 38 * are met:
39 * 1. Redistributions of source code must retain the above copyright 39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer. 40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright 41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the 42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution. 43 * documentation and/or other materials provided with the distribution.
44 * 3. Neither the name of the University nor the names of its contributors 44 * 3. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software 45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission. 46 * without specific prior written permission.
47 * 47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE. 58 * SUCH DAMAGE.
59 * 59 *
60 * @(#)kern_time.c 8.4 (Berkeley) 5/26/95 60 * @(#)kern_time.c 8.4 (Berkeley) 5/26/95
61 */ 61 */
62 62
63#include <sys/cdefs.h> 63#include <sys/cdefs.h>
64__KERNEL_RCSID(0, "$NetBSD: kern_time.c,v 1.168 2011/04/08 10:35:37 yamt Exp $"); 64__KERNEL_RCSID(0, "$NetBSD: kern_time.c,v 1.169 2011/07/27 14:35:34 uebayasi Exp $");
65 65
66#include <sys/param.h> 66#include <sys/param.h>
67#include <sys/resourcevar.h> 67#include <sys/resourcevar.h>
68#include <sys/kernel.h> 68#include <sys/kernel.h>
69#include <sys/systm.h> 69#include <sys/systm.h>
70#include <sys/proc.h> 70#include <sys/proc.h>
71#include <sys/vnode.h> 71#include <sys/vnode.h>
72#include <sys/signalvar.h> 72#include <sys/signalvar.h>
73#include <sys/syslog.h> 73#include <sys/syslog.h>
74#include <sys/timetc.h> 74#include <sys/timetc.h>
75#include <sys/timex.h> 75#include <sys/timex.h>
76#include <sys/kauth.h> 76#include <sys/kauth.h>
77#include <sys/mount.h> 77#include <sys/mount.h>
78#include <sys/sa.h> 78#include <sys/sa.h>
79#include <sys/savar.h> 79#include <sys/savar.h>
80#include <sys/syscallargs.h> 80#include <sys/syscallargs.h>
81#include <sys/cpu.h> 81#include <sys/cpu.h>
82 82
83#include <uvm/uvm_extern.h> 
84 
85#include "opt_sa.h" 83#include "opt_sa.h"
86 84
87static void timer_intr(void *); 85static void timer_intr(void *);
88static void itimerfire(struct ptimer *); 86static void itimerfire(struct ptimer *);
89static void itimerfree(struct ptimers *, int); 87static void itimerfree(struct ptimers *, int);
90 88
91kmutex_t timer_lock; 89kmutex_t timer_lock;
92 90
93static void *timer_sih; 91static void *timer_sih;
94static TAILQ_HEAD(, ptimer) timer_queue; 92static TAILQ_HEAD(, ptimer) timer_queue;
95 93
96struct pool ptimer_pool, ptimers_pool; 94struct pool ptimer_pool, ptimers_pool;
97 95
98#define CLOCK_VIRTUAL_P(clockid) \ 96#define CLOCK_VIRTUAL_P(clockid) \
99 ((clockid) == CLOCK_VIRTUAL || (clockid) == CLOCK_PROF) 97 ((clockid) == CLOCK_VIRTUAL || (clockid) == CLOCK_PROF)
100 98
101CTASSERT(ITIMER_REAL == CLOCK_REALTIME); 99CTASSERT(ITIMER_REAL == CLOCK_REALTIME);
102CTASSERT(ITIMER_VIRTUAL == CLOCK_VIRTUAL); 100CTASSERT(ITIMER_VIRTUAL == CLOCK_VIRTUAL);
103CTASSERT(ITIMER_PROF == CLOCK_PROF); 101CTASSERT(ITIMER_PROF == CLOCK_PROF);
104 102
105/* 103/*
106 * Initialize timekeeping. 104 * Initialize timekeeping.
107 */ 105 */
108void 106void
109time_init(void) 107time_init(void)
110{ 108{
111 109
112 pool_init(&ptimer_pool, sizeof(struct ptimer), 0, 0, 0, "ptimerpl", 110 pool_init(&ptimer_pool, sizeof(struct ptimer), 0, 0, 0, "ptimerpl",
113 &pool_allocator_nointr, IPL_NONE); 111 &pool_allocator_nointr, IPL_NONE);
114 pool_init(&ptimers_pool, sizeof(struct ptimers), 0, 0, 0, "ptimerspl", 112 pool_init(&ptimers_pool, sizeof(struct ptimers), 0, 0, 0, "ptimerspl",
115 &pool_allocator_nointr, IPL_NONE); 113 &pool_allocator_nointr, IPL_NONE);
116} 114}
117 115
118void 116void
119time_init2(void) 117time_init2(void)
120{ 118{
121 119
122 TAILQ_INIT(&timer_queue); 120 TAILQ_INIT(&timer_queue);
123 mutex_init(&timer_lock, MUTEX_DEFAULT, IPL_SCHED); 121 mutex_init(&timer_lock, MUTEX_DEFAULT, IPL_SCHED);
124 timer_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE, 122 timer_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE,
125 timer_intr, NULL); 123 timer_intr, NULL);
126} 124}
127 125
128/* Time of day and interval timer support. 126/* Time of day and interval timer support.
129 * 127 *
130 * These routines provide the kernel entry points to get and set 128 * These routines provide the kernel entry points to get and set
131 * the time-of-day and per-process interval timers. Subroutines 129 * the time-of-day and per-process interval timers. Subroutines
132 * here provide support for adding and subtracting timeval structures 130 * here provide support for adding and subtracting timeval structures
133 * and decrementing interval timers, optionally reloading the interval 131 * and decrementing interval timers, optionally reloading the interval
134 * timers when they expire. 132 * timers when they expire.
135 */ 133 */
136 134
137/* This function is used by clock_settime and settimeofday */ 135/* This function is used by clock_settime and settimeofday */
138static int 136static int
139settime1(struct proc *p, const struct timespec *ts, bool check_kauth) 137settime1(struct proc *p, const struct timespec *ts, bool check_kauth)
140{ 138{
141 struct timespec delta, now; 139 struct timespec delta, now;
142 int s; 140 int s;
143 141
144 /* WHAT DO WE DO ABOUT PENDING REAL-TIME TIMEOUTS??? */ 142 /* WHAT DO WE DO ABOUT PENDING REAL-TIME TIMEOUTS??? */
145 s = splclock(); 143 s = splclock();
146 nanotime(&now); 144 nanotime(&now);
147 timespecsub(ts, &now, &delta); 145 timespecsub(ts, &now, &delta);
148 146
149 if (check_kauth && kauth_authorize_system(kauth_cred_get(), 147 if (check_kauth && kauth_authorize_system(kauth_cred_get(),
150 KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_SYSTEM, __UNCONST(ts), 148 KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_SYSTEM, __UNCONST(ts),
151 &delta, KAUTH_ARG(check_kauth ? false : true)) != 0) { 149 &delta, KAUTH_ARG(check_kauth ? false : true)) != 0) {
152 splx(s); 150 splx(s);
153 return (EPERM); 151 return (EPERM);
154 } 152 }
155 153
156#ifdef notyet 154#ifdef notyet
157 if ((delta.tv_sec < 86400) && securelevel > 0) { /* XXX elad - notyet */ 155 if ((delta.tv_sec < 86400) && securelevel > 0) { /* XXX elad - notyet */
158 splx(s); 156 splx(s);
159 return (EPERM); 157 return (EPERM);
160 } 158 }
161#endif 159#endif
162 160
163 tc_setclock(ts); 161 tc_setclock(ts);
164 162
165 timespecadd(&boottime, &delta, &boottime); 163 timespecadd(&boottime, &delta, &boottime);
166 164
167 resettodr(); 165 resettodr();
168 splx(s); 166 splx(s);
169 167
170 return (0); 168 return (0);
171} 169}
172 170
173int 171int
174settime(struct proc *p, struct timespec *ts) 172settime(struct proc *p, struct timespec *ts)
175{ 173{
176 return (settime1(p, ts, true)); 174 return (settime1(p, ts, true));
177} 175}
178 176
179/* ARGSUSED */ 177/* ARGSUSED */
180int 178int
181sys___clock_gettime50(struct lwp *l, 179sys___clock_gettime50(struct lwp *l,
182 const struct sys___clock_gettime50_args *uap, register_t *retval) 180 const struct sys___clock_gettime50_args *uap, register_t *retval)
183{ 181{
184 /* { 182 /* {
185 syscallarg(clockid_t) clock_id; 183 syscallarg(clockid_t) clock_id;
186 syscallarg(struct timespec *) tp; 184 syscallarg(struct timespec *) tp;
187 } */ 185 } */
188 int error; 186 int error;
189 struct timespec ats; 187 struct timespec ats;
190 188
191 error = clock_gettime1(SCARG(uap, clock_id), &ats); 189 error = clock_gettime1(SCARG(uap, clock_id), &ats);
192 if (error != 0) 190 if (error != 0)
193 return error; 191 return error;
194 192
195 return copyout(&ats, SCARG(uap, tp), sizeof(ats)); 193 return copyout(&ats, SCARG(uap, tp), sizeof(ats));
196} 194}
197 195
198int 196int
199clock_gettime1(clockid_t clock_id, struct timespec *ts) 197clock_gettime1(clockid_t clock_id, struct timespec *ts)
200{ 198{
201 199
202 switch (clock_id) { 200 switch (clock_id) {
203 case CLOCK_REALTIME: 201 case CLOCK_REALTIME:
204 nanotime(ts); 202 nanotime(ts);
205 break; 203 break;
206 case CLOCK_MONOTONIC: 204 case CLOCK_MONOTONIC:
207 nanouptime(ts); 205 nanouptime(ts);
208 break; 206 break;
209 default: 207 default:
210 return EINVAL; 208 return EINVAL;
211 } 209 }
212 210
213 return 0; 211 return 0;
214} 212}
215 213
216/* ARGSUSED */ 214/* ARGSUSED */
217int 215int
218sys___clock_settime50(struct lwp *l, 216sys___clock_settime50(struct lwp *l,
219 const struct sys___clock_settime50_args *uap, register_t *retval) 217 const struct sys___clock_settime50_args *uap, register_t *retval)
220{ 218{
221 /* { 219 /* {
222 syscallarg(clockid_t) clock_id; 220 syscallarg(clockid_t) clock_id;
223 syscallarg(const struct timespec *) tp; 221 syscallarg(const struct timespec *) tp;
224 } */ 222 } */
225 int error; 223 int error;
226 struct timespec ats; 224 struct timespec ats;
227 225
228 if ((error = copyin(SCARG(uap, tp), &ats, sizeof(ats))) != 0) 226 if ((error = copyin(SCARG(uap, tp), &ats, sizeof(ats))) != 0)
229 return error; 227 return error;
230 228
231 return clock_settime1(l->l_proc, SCARG(uap, clock_id), &ats, true); 229 return clock_settime1(l->l_proc, SCARG(uap, clock_id), &ats, true);
232} 230}
233 231
234 232
235int 233int
236clock_settime1(struct proc *p, clockid_t clock_id, const struct timespec *tp, 234clock_settime1(struct proc *p, clockid_t clock_id, const struct timespec *tp,
237 bool check_kauth) 235 bool check_kauth)
238{ 236{
239 int error; 237 int error;
240 238
241 switch (clock_id) { 239 switch (clock_id) {
242 case CLOCK_REALTIME: 240 case CLOCK_REALTIME:
243 if ((error = settime1(p, tp, check_kauth)) != 0) 241 if ((error = settime1(p, tp, check_kauth)) != 0)
244 return (error); 242 return (error);
245 break; 243 break;
246 case CLOCK_MONOTONIC: 244 case CLOCK_MONOTONIC:
247 return (EINVAL); /* read-only clock */ 245 return (EINVAL); /* read-only clock */
248 default: 246 default:
249 return (EINVAL); 247 return (EINVAL);
250 } 248 }
251 249
252 return 0; 250 return 0;
253} 251}
254 252
255int 253int
256sys___clock_getres50(struct lwp *l, const struct sys___clock_getres50_args *uap, 254sys___clock_getres50(struct lwp *l, const struct sys___clock_getres50_args *uap,
257 register_t *retval) 255 register_t *retval)
258{ 256{
259 /* { 257 /* {
260 syscallarg(clockid_t) clock_id; 258 syscallarg(clockid_t) clock_id;
261 syscallarg(struct timespec *) tp; 259 syscallarg(struct timespec *) tp;
262 } */ 260 } */
263 struct timespec ts; 261 struct timespec ts;
264 int error = 0; 262 int error = 0;
265 263
266 if ((error = clock_getres1(SCARG(uap, clock_id), &ts)) != 0) 264 if ((error = clock_getres1(SCARG(uap, clock_id), &ts)) != 0)
267 return error; 265 return error;
268 266
269 if (SCARG(uap, tp)) 267 if (SCARG(uap, tp))
270 error = copyout(&ts, SCARG(uap, tp), sizeof(ts)); 268 error = copyout(&ts, SCARG(uap, tp), sizeof(ts));
271 269
272 return error; 270 return error;
273} 271}
274 272
275int 273int
276clock_getres1(clockid_t clock_id, struct timespec *ts) 274clock_getres1(clockid_t clock_id, struct timespec *ts)
277{ 275{
278 276
279 switch (clock_id) { 277 switch (clock_id) {
280 case CLOCK_REALTIME: 278 case CLOCK_REALTIME:
281 case CLOCK_MONOTONIC: 279 case CLOCK_MONOTONIC:
282 ts->tv_sec = 0; 280 ts->tv_sec = 0;
283 if (tc_getfrequency() > 1000000000) 281 if (tc_getfrequency() > 1000000000)
284 ts->tv_nsec = 1; 282 ts->tv_nsec = 1;
285 else 283 else
286 ts->tv_nsec = 1000000000 / tc_getfrequency(); 284 ts->tv_nsec = 1000000000 / tc_getfrequency();
287 break; 285 break;
288 default: 286 default:
289 return EINVAL; 287 return EINVAL;
290 } 288 }
291 289
292 return 0; 290 return 0;
293} 291}
294 292
295/* ARGSUSED */ 293/* ARGSUSED */
296int 294int
297sys___nanosleep50(struct lwp *l, const struct sys___nanosleep50_args *uap, 295sys___nanosleep50(struct lwp *l, const struct sys___nanosleep50_args *uap,
298 register_t *retval) 296 register_t *retval)
299{ 297{
300 /* { 298 /* {
301 syscallarg(struct timespec *) rqtp; 299 syscallarg(struct timespec *) rqtp;
302 syscallarg(struct timespec *) rmtp; 300 syscallarg(struct timespec *) rmtp;
303 } */ 301 } */
304 struct timespec rmt, rqt; 302 struct timespec rmt, rqt;
305 int error, error1; 303 int error, error1;
306 304
307 error = copyin(SCARG(uap, rqtp), &rqt, sizeof(struct timespec)); 305 error = copyin(SCARG(uap, rqtp), &rqt, sizeof(struct timespec));
308 if (error) 306 if (error)
309 return (error); 307 return (error);
310 308
311 error = nanosleep1(l, &rqt, SCARG(uap, rmtp) ? &rmt : NULL); 309 error = nanosleep1(l, &rqt, SCARG(uap, rmtp) ? &rmt : NULL);
312 if (SCARG(uap, rmtp) == NULL || (error != 0 && error != EINTR)) 310 if (SCARG(uap, rmtp) == NULL || (error != 0 && error != EINTR))
313 return error; 311 return error;
314 312
315 error1 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt)); 313 error1 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt));
316 return error1 ? error1 : error; 314 return error1 ? error1 : error;
317} 315}
318 316
319int 317int
320nanosleep1(struct lwp *l, struct timespec *rqt, struct timespec *rmt) 318nanosleep1(struct lwp *l, struct timespec *rqt, struct timespec *rmt)
321{ 319{
322 struct timespec rmtstart; 320 struct timespec rmtstart;
323 int error, timo; 321 int error, timo;
324 322
325 if ((error = itimespecfix(rqt)) != 0) 323 if ((error = itimespecfix(rqt)) != 0)
326 return error; 324 return error;
327 325
328 timo = tstohz(rqt); 326 timo = tstohz(rqt);
329 /* 327 /*
330 * Avoid inadvertently sleeping forever 328 * Avoid inadvertently sleeping forever
331 */ 329 */
332 if (timo == 0) 330 if (timo == 0)
333 timo = 1; 331 timo = 1;
334 getnanouptime(&rmtstart); 332 getnanouptime(&rmtstart);
335again: 333again:
336 error = kpause("nanoslp", true, timo, NULL); 334 error = kpause("nanoslp", true, timo, NULL);
337 if (rmt != NULL || error == 0) { 335 if (rmt != NULL || error == 0) {
338 struct timespec rmtend; 336 struct timespec rmtend;
339 struct timespec t0; 337 struct timespec t0;
340 struct timespec *t; 338 struct timespec *t;
341 339
342 getnanouptime(&rmtend); 340 getnanouptime(&rmtend);
343 t = (rmt != NULL) ? rmt : &t0; 341 t = (rmt != NULL) ? rmt : &t0;
344 timespecsub(&rmtend, &rmtstart, t); 342 timespecsub(&rmtend, &rmtstart, t);
345 timespecsub(rqt, t, t); 343 timespecsub(rqt, t, t);
346 if (t->tv_sec < 0) 344 if (t->tv_sec < 0)
347 timespecclear(t); 345 timespecclear(t);
348 if (error == 0) { 346 if (error == 0) {
349 timo = tstohz(t); 347 timo = tstohz(t);
350 if (timo > 0) 348 if (timo > 0)
351 goto again; 349 goto again;
352 } 350 }
353 } 351 }
354 352
355 if (error == ERESTART) 353 if (error == ERESTART)
356 error = EINTR; 354 error = EINTR;
357 if (error == EWOULDBLOCK) 355 if (error == EWOULDBLOCK)
358 error = 0; 356 error = 0;
359 357
360 return error; 358 return error;
361} 359}
362 360
363/* ARGSUSED */ 361/* ARGSUSED */
364int 362int
365sys___gettimeofday50(struct lwp *l, const struct sys___gettimeofday50_args *uap, 363sys___gettimeofday50(struct lwp *l, const struct sys___gettimeofday50_args *uap,
366 register_t *retval) 364 register_t *retval)
367{ 365{
368 /* { 366 /* {
369 syscallarg(struct timeval *) tp; 367 syscallarg(struct timeval *) tp;
370 syscallarg(void *) tzp; really "struct timezone *"; 368 syscallarg(void *) tzp; really "struct timezone *";
371 } */ 369 } */
372 struct timeval atv; 370 struct timeval atv;
373 int error = 0; 371 int error = 0;
374 struct timezone tzfake; 372 struct timezone tzfake;
375 373
376 if (SCARG(uap, tp)) { 374 if (SCARG(uap, tp)) {
377 microtime(&atv); 375 microtime(&atv);
378 error = copyout(&atv, SCARG(uap, tp), sizeof(atv)); 376 error = copyout(&atv, SCARG(uap, tp), sizeof(atv));
379 if (error) 377 if (error)
380 return (error); 378 return (error);
381 } 379 }
382 if (SCARG(uap, tzp)) { 380 if (SCARG(uap, tzp)) {
383 /* 381 /*
384 * NetBSD has no kernel notion of time zone, so we just 382 * NetBSD has no kernel notion of time zone, so we just
385 * fake up a timezone struct and return it if demanded. 383 * fake up a timezone struct and return it if demanded.
386 */ 384 */
387 tzfake.tz_minuteswest = 0; 385 tzfake.tz_minuteswest = 0;
388 tzfake.tz_dsttime = 0; 386 tzfake.tz_dsttime = 0;
389 error = copyout(&tzfake, SCARG(uap, tzp), sizeof(tzfake)); 387 error = copyout(&tzfake, SCARG(uap, tzp), sizeof(tzfake));
390 } 388 }
391 return (error); 389 return (error);
392} 390}
393 391
394/* ARGSUSED */ 392/* ARGSUSED */
395int 393int
396sys___settimeofday50(struct lwp *l, const struct sys___settimeofday50_args *uap, 394sys___settimeofday50(struct lwp *l, const struct sys___settimeofday50_args *uap,
397 register_t *retval) 395 register_t *retval)
398{ 396{
399 /* { 397 /* {
400 syscallarg(const struct timeval *) tv; 398 syscallarg(const struct timeval *) tv;
401 syscallarg(const void *) tzp; really "const struct timezone *"; 399 syscallarg(const void *) tzp; really "const struct timezone *";
402 } */ 400 } */
403 401
404 return settimeofday1(SCARG(uap, tv), true, SCARG(uap, tzp), l, true); 402 return settimeofday1(SCARG(uap, tv), true, SCARG(uap, tzp), l, true);
405} 403}
406 404
407int 405int
408settimeofday1(const struct timeval *utv, bool userspace, 406settimeofday1(const struct timeval *utv, bool userspace,
409 const void *utzp, struct lwp *l, bool check_kauth) 407 const void *utzp, struct lwp *l, bool check_kauth)
410{ 408{
411 struct timeval atv; 409 struct timeval atv;
412 struct timespec ts; 410 struct timespec ts;
413 int error; 411 int error;
414 412
415 /* Verify all parameters before changing time. */ 413 /* Verify all parameters before changing time. */
416 414
417 /* 415 /*
418 * NetBSD has no kernel notion of time zone, and only an 416 * NetBSD has no kernel notion of time zone, and only an
419 * obsolete program would try to set it, so we log a warning. 417 * obsolete program would try to set it, so we log a warning.
420 */ 418 */
421 if (utzp) 419 if (utzp)
422 log(LOG_WARNING, "pid %d attempted to set the " 420 log(LOG_WARNING, "pid %d attempted to set the "
423 "(obsolete) kernel time zone\n", l->l_proc->p_pid); 421 "(obsolete) kernel time zone\n", l->l_proc->p_pid);
424 422
425 if (utv == NULL)  423 if (utv == NULL)
426 return 0; 424 return 0;
427 425
428 if (userspace) { 426 if (userspace) {
429 if ((error = copyin(utv, &atv, sizeof(atv))) != 0) 427 if ((error = copyin(utv, &atv, sizeof(atv))) != 0)
430 return error; 428 return error;
431 utv = &atv; 429 utv = &atv;
432 } 430 }
433 431
434 TIMEVAL_TO_TIMESPEC(utv, &ts); 432 TIMEVAL_TO_TIMESPEC(utv, &ts);
435 return settime1(l->l_proc, &ts, check_kauth); 433 return settime1(l->l_proc, &ts, check_kauth);
436} 434}
437 435
438int time_adjusted; /* set if an adjustment is made */ 436int time_adjusted; /* set if an adjustment is made */
439 437
440/* ARGSUSED */ 438/* ARGSUSED */
441int 439int
442sys___adjtime50(struct lwp *l, const struct sys___adjtime50_args *uap, 440sys___adjtime50(struct lwp *l, const struct sys___adjtime50_args *uap,
443 register_t *retval) 441 register_t *retval)
444{ 442{
445 /* { 443 /* {
446 syscallarg(const struct timeval *) delta; 444 syscallarg(const struct timeval *) delta;
447 syscallarg(struct timeval *) olddelta; 445 syscallarg(struct timeval *) olddelta;
448 } */ 446 } */
449 int error = 0; 447 int error = 0;
450 struct timeval atv, oldatv; 448 struct timeval atv, oldatv;
451 449
452 if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_TIME, 450 if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_TIME,
453 KAUTH_REQ_SYSTEM_TIME_ADJTIME, NULL, NULL, NULL)) != 0) 451 KAUTH_REQ_SYSTEM_TIME_ADJTIME, NULL, NULL, NULL)) != 0)
454 return error; 452 return error;
455 453
456 if (SCARG(uap, delta)) { 454 if (SCARG(uap, delta)) {
457 error = copyin(SCARG(uap, delta), &atv, 455 error = copyin(SCARG(uap, delta), &atv,
458 sizeof(*SCARG(uap, delta))); 456 sizeof(*SCARG(uap, delta)));
459 if (error) 457 if (error)
460 return (error); 458 return (error);
461 } 459 }
462 adjtime1(SCARG(uap, delta) ? &atv : NULL, 460 adjtime1(SCARG(uap, delta) ? &atv : NULL,
463 SCARG(uap, olddelta) ? &oldatv : NULL, l->l_proc); 461 SCARG(uap, olddelta) ? &oldatv : NULL, l->l_proc);
464 if (SCARG(uap, olddelta)) 462 if (SCARG(uap, olddelta))
465 error = copyout(&oldatv, SCARG(uap, olddelta), 463 error = copyout(&oldatv, SCARG(uap, olddelta),
466 sizeof(*SCARG(uap, olddelta))); 464 sizeof(*SCARG(uap, olddelta)));
467 return error; 465 return error;
468} 466}
469 467
470void 468void
471adjtime1(const struct timeval *delta, struct timeval *olddelta, struct proc *p) 469adjtime1(const struct timeval *delta, struct timeval *olddelta, struct proc *p)
472{ 470{
473 extern int64_t time_adjtime; /* in kern_ntptime.c */ 471 extern int64_t time_adjtime; /* in kern_ntptime.c */
474 472
475 if (olddelta) { 473 if (olddelta) {
476 mutex_spin_enter(&timecounter_lock); 474 mutex_spin_enter(&timecounter_lock);
477 olddelta->tv_sec = time_adjtime / 1000000; 475 olddelta->tv_sec = time_adjtime / 1000000;
478 olddelta->tv_usec = time_adjtime % 1000000; 476 olddelta->tv_usec = time_adjtime % 1000000;
479 if (olddelta->tv_usec < 0) { 477 if (olddelta->tv_usec < 0) {
480 olddelta->tv_usec += 1000000; 478 olddelta->tv_usec += 1000000;
481 olddelta->tv_sec--; 479 olddelta->tv_sec--;
482 } 480 }
483 mutex_spin_exit(&timecounter_lock); 481 mutex_spin_exit(&timecounter_lock);
484 } 482 }
485  483
486 if (delta) { 484 if (delta) {
487 mutex_spin_enter(&timecounter_lock); 485 mutex_spin_enter(&timecounter_lock);
488 time_adjtime = delta->tv_sec * 1000000 + delta->tv_usec; 486 time_adjtime = delta->tv_sec * 1000000 + delta->tv_usec;
489 487
490 if (time_adjtime) { 488 if (time_adjtime) {
491 /* We need to save the system time during shutdown */ 489 /* We need to save the system time during shutdown */
492 time_adjusted |= 1; 490 time_adjusted |= 1;
493 } 491 }
494 mutex_spin_exit(&timecounter_lock); 492 mutex_spin_exit(&timecounter_lock);
495 } 493 }
496} 494}
497 495
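The olddelta conversion above is a classic sign normalization: C's integer division and modulo both truncate toward zero, so a negative time_adjtime leaves a negative tv_usec that has to borrow a second. A standalone userland sketch of the same computation (usec_to_timeval is a hypothetical helper, not kernel API):

#include <stdio.h>
#include <stdint.h>
#include <sys/time.h>

/*
 * Convert a signed microsecond count into a normalized timeval,
 * mirroring the olddelta computation in adjtime1() above.
 */
static void
usec_to_timeval(int64_t us, struct timeval *tv)
{
        tv->tv_sec = us / 1000000;
        tv->tv_usec = us % 1000000;     /* may be negative in C */
        if (tv->tv_usec < 0) {
                tv->tv_usec += 1000000; /* borrow one second */
                tv->tv_sec--;
        }
}

int
main(void)
{
        struct timeval tv;

        usec_to_timeval(-1500000, &tv);         /* -1.5 seconds */
        printf("%lld sec %ld usec\n",
            (long long)tv.tv_sec, (long)tv.tv_usec);
        /* prints "-2 sec 500000 usec", i.e. -2s + 0.5s = -1.5s */
        return 0;
}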
498/* 496/*
499 * Interval timer support. Both the BSD getitimer() family and the POSIX 497 * Interval timer support. Both the BSD getitimer() family and the POSIX
500 * timer_*() family of routines are supported. 498 * timer_*() family of routines are supported.
501 * 499 *
502 * All timers are kept in an array pointed to by p_timers, which is 500 * All timers are kept in an array pointed to by p_timers, which is
503 * allocated on demand - many processes don't use timers at all. The 501 * allocated on demand - many processes don't use timers at all. The
504 * first three elements in this array are reserved for the BSD timers: 502 * first three elements in this array are reserved for the BSD timers:
505 * element 0 is ITIMER_REAL, element 1 is ITIMER_VIRTUAL, and element 503 * element 0 is ITIMER_REAL, element 1 is ITIMER_VIRTUAL, and element
506 * 2 is ITIMER_PROF. The rest may be allocated by the timer_create() 504 * 2 is ITIMER_PROF. The rest may be allocated by the timer_create()
507 * syscall. 505 * syscall.
508 * 506 *
509 * Realtime timers are kept in the ptimer structure as an absolute 507 * Realtime timers are kept in the ptimer structure as an absolute
510 * time; virtual time timers are kept as a linked list of deltas. 508 * time; virtual time timers are kept as a linked list of deltas.
511 * Virtual time timers are processed in the hardclock() routine of 509 * Virtual time timers are processed in the hardclock() routine of
512 * kern_clock.c. The real time timer is processed by a callout 510 * kern_clock.c. The real time timer is processed by a callout
513 * routine, called from the softclock() routine. Since a callout may 511 * routine, called from the softclock() routine. Since a callout may
514 * be delayed in real time due to interrupt processing in the system, 512 * be delayed in real time due to interrupt processing in the system,
515 * it is possible for the real time timeout routine (realtimerexpire, 513 * it is possible for the real time timeout routine (realtimerexpire,
516 * given below) to be delayed in real time past when it is supposed 514 * given below) to be delayed in real time past when it is supposed
517 * to occur. It does not suffice, therefore, to reload the real timer 515 * to occur. It does not suffice, therefore, to reload the real timer
518 * .it_value from the real time timer's .it_interval. Rather, we 516 * .it_value from the real time timer's .it_interval. Rather, we
519 * compute the next time in absolute time the timer should go off. */ 517 * compute the next time in absolute time the timer should go off. */
520 518
521/* Allocate a POSIX realtime timer. */ 519/* Allocate a POSIX realtime timer. */
522int 520int
523sys_timer_create(struct lwp *l, const struct sys_timer_create_args *uap, 521sys_timer_create(struct lwp *l, const struct sys_timer_create_args *uap,
524 register_t *retval) 522 register_t *retval)
525{ 523{
526 /* { 524 /* {
527 syscallarg(clockid_t) clock_id; 525 syscallarg(clockid_t) clock_id;
528 syscallarg(struct sigevent *) evp; 526 syscallarg(struct sigevent *) evp;
529 syscallarg(timer_t *) timerid; 527 syscallarg(timer_t *) timerid;
530 } */ 528 } */
531 529
532 return timer_create1(SCARG(uap, timerid), SCARG(uap, clock_id), 530 return timer_create1(SCARG(uap, timerid), SCARG(uap, clock_id),
533 SCARG(uap, evp), copyin, l); 531 SCARG(uap, evp), copyin, l);
534} 532}
535 533
536int 534int
537timer_create1(timer_t *tid, clockid_t id, struct sigevent *evp, 535timer_create1(timer_t *tid, clockid_t id, struct sigevent *evp,
538 copyin_t fetch_event, struct lwp *l) 536 copyin_t fetch_event, struct lwp *l)
539{ 537{
540 int error; 538 int error;
541 timer_t timerid; 539 timer_t timerid;
542 struct ptimers *pts; 540 struct ptimers *pts;
543 struct ptimer *pt; 541 struct ptimer *pt;
544 struct proc *p; 542 struct proc *p;
545 543
546 p = l->l_proc; 544 p = l->l_proc;
547 545
548 if (id != CLOCK_REALTIME && id != CLOCK_VIRTUAL && 546 if (id != CLOCK_REALTIME && id != CLOCK_VIRTUAL &&
549 id != CLOCK_PROF && id != CLOCK_MONOTONIC) 547 id != CLOCK_PROF && id != CLOCK_MONOTONIC)
550 return (EINVAL); 548 return (EINVAL);
551 549
552 if ((pts = p->p_timers) == NULL) 550 if ((pts = p->p_timers) == NULL)
553 pts = timers_alloc(p); 551 pts = timers_alloc(p);
554 552
555 pt = pool_get(&ptimer_pool, PR_WAITOK); 553 pt = pool_get(&ptimer_pool, PR_WAITOK);
556 if (evp != NULL) { 554 if (evp != NULL) {
557 if (((error = 555 if (((error =
558 (*fetch_event)(evp, &pt->pt_ev, sizeof(pt->pt_ev))) != 0) || 556 (*fetch_event)(evp, &pt->pt_ev, sizeof(pt->pt_ev))) != 0) ||
559 ((pt->pt_ev.sigev_notify < SIGEV_NONE) || 557 ((pt->pt_ev.sigev_notify < SIGEV_NONE) ||
560 (pt->pt_ev.sigev_notify > SIGEV_SA)) || 558 (pt->pt_ev.sigev_notify > SIGEV_SA)) ||
561 (pt->pt_ev.sigev_notify == SIGEV_SIGNAL && 559 (pt->pt_ev.sigev_notify == SIGEV_SIGNAL &&
562 (pt->pt_ev.sigev_signo <= 0 || 560 (pt->pt_ev.sigev_signo <= 0 ||
563 pt->pt_ev.sigev_signo >= NSIG))) { 561 pt->pt_ev.sigev_signo >= NSIG))) {
564 pool_put(&ptimer_pool, pt); 562 pool_put(&ptimer_pool, pt);
565 return (error ? error : EINVAL); 563 return (error ? error : EINVAL);
566 } 564 }
567 } 565 }
568 566
569 /* Find a free timer slot, skipping those reserved for setitimer(). */ 567 /* Find a free timer slot, skipping those reserved for setitimer(). */
570 mutex_spin_enter(&timer_lock); 568 mutex_spin_enter(&timer_lock);
571 for (timerid = 3; timerid < TIMER_MAX; timerid++) 569 for (timerid = 3; timerid < TIMER_MAX; timerid++)
572 if (pts->pts_timers[timerid] == NULL) 570 if (pts->pts_timers[timerid] == NULL)
573 break; 571 break;
574 if (timerid == TIMER_MAX) { 572 if (timerid == TIMER_MAX) {
575 mutex_spin_exit(&timer_lock); 573 mutex_spin_exit(&timer_lock);
576 pool_put(&ptimer_pool, pt); 574 pool_put(&ptimer_pool, pt);
577 return EAGAIN; 575 return EAGAIN;
578 } 576 }
579 if (evp == NULL) { 577 if (evp == NULL) {
580 pt->pt_ev.sigev_notify = SIGEV_SIGNAL; 578 pt->pt_ev.sigev_notify = SIGEV_SIGNAL;
581 switch (id) { 579 switch (id) {
582 case CLOCK_REALTIME: 580 case CLOCK_REALTIME:
583 case CLOCK_MONOTONIC: 581 case CLOCK_MONOTONIC:
584 pt->pt_ev.sigev_signo = SIGALRM; 582 pt->pt_ev.sigev_signo = SIGALRM;
585 break; 583 break;
586 case CLOCK_VIRTUAL: 584 case CLOCK_VIRTUAL:
587 pt->pt_ev.sigev_signo = SIGVTALRM; 585 pt->pt_ev.sigev_signo = SIGVTALRM;
588 break; 586 break;
589 case CLOCK_PROF: 587 case CLOCK_PROF:
590 pt->pt_ev.sigev_signo = SIGPROF; 588 pt->pt_ev.sigev_signo = SIGPROF;
591 break; 589 break;
592 } 590 }
593 pt->pt_ev.sigev_value.sival_int = timerid; 591 pt->pt_ev.sigev_value.sival_int = timerid;
594 } 592 }
595 pt->pt_info.ksi_signo = pt->pt_ev.sigev_signo; 593 pt->pt_info.ksi_signo = pt->pt_ev.sigev_signo;
596 pt->pt_info.ksi_errno = 0; 594 pt->pt_info.ksi_errno = 0;
597 pt->pt_info.ksi_code = 0; 595 pt->pt_info.ksi_code = 0;
598 pt->pt_info.ksi_pid = p->p_pid; 596 pt->pt_info.ksi_pid = p->p_pid;
599 pt->pt_info.ksi_uid = kauth_cred_getuid(l->l_cred); 597 pt->pt_info.ksi_uid = kauth_cred_getuid(l->l_cred);
600 pt->pt_info.ksi_value = pt->pt_ev.sigev_value; 598 pt->pt_info.ksi_value = pt->pt_ev.sigev_value;
601 pt->pt_type = id; 599 pt->pt_type = id;
602 pt->pt_proc = p; 600 pt->pt_proc = p;
603 pt->pt_overruns = 0; 601 pt->pt_overruns = 0;
604 pt->pt_poverruns = 0; 602 pt->pt_poverruns = 0;
605 pt->pt_entry = timerid; 603 pt->pt_entry = timerid;
606 pt->pt_queued = false; 604 pt->pt_queued = false;
607 timespecclear(&pt->pt_time.it_value); 605 timespecclear(&pt->pt_time.it_value);
608 if (!CLOCK_VIRTUAL_P(id)) 606 if (!CLOCK_VIRTUAL_P(id))
609 callout_init(&pt->pt_ch, CALLOUT_MPSAFE); 607 callout_init(&pt->pt_ch, CALLOUT_MPSAFE);
610 else 608 else
611 pt->pt_active = 0; 609 pt->pt_active = 0;
612 610
613 pts->pts_timers[timerid] = pt; 611 pts->pts_timers[timerid] = pt;
614 mutex_spin_exit(&timer_lock); 612 mutex_spin_exit(&timer_lock);
615 613
616 return copyout(&timerid, tid, sizeof(timerid)); 614 return copyout(&timerid, tid, sizeof(timerid));
617} 615}
618 616
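The evp == NULL path above is what the simplest POSIX caller gets: the kernel fills in SIGEV_SIGNAL with a clock-appropriate signal and stores the timer id in sival_int. A minimal userland sketch, assuming only the standard timer_create(2)/timer_delete(2) interface:

#include <stdio.h>
#include <time.h>

int
main(void)
{
        timer_t tid;

        /*
         * NULL sigevent: timer_create1() above defaults to
         * SIGEV_SIGNAL with SIGALRM for CLOCK_REALTIME, and puts
         * the timer in slot 3 or higher (0-2 are the BSD timers).
         */
        if (timer_create(CLOCK_REALTIME, NULL, &tid) == -1) {
                perror("timer_create");
                return 1;
        }
        puts("timer created");
        timer_delete(tid);
        return 0;
}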
619/* Delete a POSIX realtime timer */ 617/* Delete a POSIX realtime timer */
620int 618int
621sys_timer_delete(struct lwp *l, const struct sys_timer_delete_args *uap, 619sys_timer_delete(struct lwp *l, const struct sys_timer_delete_args *uap,
622 register_t *retval) 620 register_t *retval)
623{ 621{
624 /* { 622 /* {
625 syscallarg(timer_t) timerid; 623 syscallarg(timer_t) timerid;
626 } */ 624 } */
627 struct proc *p = l->l_proc; 625 struct proc *p = l->l_proc;
628 timer_t timerid; 626 timer_t timerid;
629 struct ptimers *pts; 627 struct ptimers *pts;
630 struct ptimer *pt, *ptn; 628 struct ptimer *pt, *ptn;
631 629
632 timerid = SCARG(uap, timerid); 630 timerid = SCARG(uap, timerid);
633 pts = p->p_timers; 631 pts = p->p_timers;
634  632
635 if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX) 633 if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
636 return (EINVAL); 634 return (EINVAL);
637 635
638 mutex_spin_enter(&timer_lock); 636 mutex_spin_enter(&timer_lock);
639 if ((pt = pts->pts_timers[timerid]) == NULL) { 637 if ((pt = pts->pts_timers[timerid]) == NULL) {
640 mutex_spin_exit(&timer_lock); 638 mutex_spin_exit(&timer_lock);
641 return (EINVAL); 639 return (EINVAL);
642 } 640 }
643 if (CLOCK_VIRTUAL_P(pt->pt_type)) { 641 if (CLOCK_VIRTUAL_P(pt->pt_type)) {
644 if (pt->pt_active) { 642 if (pt->pt_active) {
645 ptn = LIST_NEXT(pt, pt_list); 643 ptn = LIST_NEXT(pt, pt_list);
646 LIST_REMOVE(pt, pt_list); 644 LIST_REMOVE(pt, pt_list);
647 for ( ; ptn; ptn = LIST_NEXT(ptn, pt_list)) 645 for ( ; ptn; ptn = LIST_NEXT(ptn, pt_list))
648 timespecadd(&pt->pt_time.it_value, 646 timespecadd(&pt->pt_time.it_value,
649 &ptn->pt_time.it_value, 647 &ptn->pt_time.it_value,
650 &ptn->pt_time.it_value); 648 &ptn->pt_time.it_value);
651 pt->pt_active = 0; 649 pt->pt_active = 0;
652 } 650 }
653 } 651 }
654 itimerfree(pts, timerid); 652 itimerfree(pts, timerid);
655 653
656 return (0); 654 return (0);
657} 655}
658 656
659/* 657/*
660 * Set up the given timer. The value in pt->pt_time.it_value is taken 658 * Set up the given timer. The value in pt->pt_time.it_value is taken
661 * to be an absolute time for CLOCK_REALTIME/CLOCK_MONOTONIC timers and 659 * to be an absolute time for CLOCK_REALTIME/CLOCK_MONOTONIC timers and
662 * a relative time for CLOCK_VIRTUAL/CLOCK_PROF timers. 660 * a relative time for CLOCK_VIRTUAL/CLOCK_PROF timers.
663 */ 661 */
664void 662void
665timer_settime(struct ptimer *pt) 663timer_settime(struct ptimer *pt)
666{ 664{
667 struct ptimer *ptn, *pptn; 665 struct ptimer *ptn, *pptn;
668 struct ptlist *ptl; 666 struct ptlist *ptl;
669 667
670 KASSERT(mutex_owned(&timer_lock)); 668 KASSERT(mutex_owned(&timer_lock));
671 669
672 if (!CLOCK_VIRTUAL_P(pt->pt_type)) { 670 if (!CLOCK_VIRTUAL_P(pt->pt_type)) {
673 callout_halt(&pt->pt_ch, &timer_lock); 671 callout_halt(&pt->pt_ch, &timer_lock);
674 if (timespecisset(&pt->pt_time.it_value)) { 672 if (timespecisset(&pt->pt_time.it_value)) {
675 /* 673 /*
676 * Don't need to check tshzto() return value, here. 674 * Don't need to check tshzto() return value, here.
677 * callout_reset() does it for us. 675 * callout_reset() does it for us.
678 */ 676 */
679 callout_reset(&pt->pt_ch, tshzto(&pt->pt_time.it_value), 677 callout_reset(&pt->pt_ch, tshzto(&pt->pt_time.it_value),
680 realtimerexpire, pt); 678 realtimerexpire, pt);
681 } 679 }
682 } else { 680 } else {
683 if (pt->pt_active) { 681 if (pt->pt_active) {
684 ptn = LIST_NEXT(pt, pt_list); 682 ptn = LIST_NEXT(pt, pt_list);
685 LIST_REMOVE(pt, pt_list); 683 LIST_REMOVE(pt, pt_list);
686 for ( ; ptn; ptn = LIST_NEXT(ptn, pt_list)) 684 for ( ; ptn; ptn = LIST_NEXT(ptn, pt_list))
687 timespecadd(&pt->pt_time.it_value, 685 timespecadd(&pt->pt_time.it_value,
688 &ptn->pt_time.it_value, 686 &ptn->pt_time.it_value,
689 &ptn->pt_time.it_value); 687 &ptn->pt_time.it_value);
690 } 688 }
691 if (timespecisset(&pt->pt_time.it_value)) { 689 if (timespecisset(&pt->pt_time.it_value)) {
692 if (pt->pt_type == CLOCK_VIRTUAL) 690 if (pt->pt_type == CLOCK_VIRTUAL)
693 ptl = &pt->pt_proc->p_timers->pts_virtual; 691 ptl = &pt->pt_proc->p_timers->pts_virtual;
694 else 692 else
695 ptl = &pt->pt_proc->p_timers->pts_prof; 693 ptl = &pt->pt_proc->p_timers->pts_prof;
696 694
697 for (ptn = LIST_FIRST(ptl), pptn = NULL; 695 for (ptn = LIST_FIRST(ptl), pptn = NULL;
698 ptn && timespeccmp(&pt->pt_time.it_value, 696 ptn && timespeccmp(&pt->pt_time.it_value,
699 &ptn->pt_time.it_value, >); 697 &ptn->pt_time.it_value, >);
700 pptn = ptn, ptn = LIST_NEXT(ptn, pt_list)) 698 pptn = ptn, ptn = LIST_NEXT(ptn, pt_list))
701 timespecsub(&pt->pt_time.it_value, 699 timespecsub(&pt->pt_time.it_value,
702 &ptn->pt_time.it_value, 700 &ptn->pt_time.it_value,
703 &pt->pt_time.it_value); 701 &pt->pt_time.it_value);
704 702
705 if (pptn) 703 if (pptn)
706 LIST_INSERT_AFTER(pptn, pt, pt_list); 704 LIST_INSERT_AFTER(pptn, pt, pt_list);
707 else 705 else
708 LIST_INSERT_HEAD(ptl, pt, pt_list); 706 LIST_INSERT_HEAD(ptl, pt, pt_list);
709 707
710 for ( ; ptn ; ptn = LIST_NEXT(ptn, pt_list)) 708 for ( ; ptn ; ptn = LIST_NEXT(ptn, pt_list))
711 timespecsub(&ptn->pt_time.it_value, 709 timespecsub(&ptn->pt_time.it_value,
712 &pt->pt_time.it_value, 710 &pt->pt_time.it_value,
713 &ptn->pt_time.it_value); 711 &ptn->pt_time.it_value);
714 712
715 pt->pt_active = 1; 713 pt->pt_active = 1;
716 } else 714 } else
717 pt->pt_active = 0; 715 pt->pt_active = 0;
718 } 716 }
719} 717}
720 718
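The virtual-timer branch above maintains a delta queue: each list entry's it_value holds the time remaining after its predecessor fires, so hardclock() only ever has to decrement the head. A standalone sketch of the insertion step, using plain integer ticks instead of timespecs (all names hypothetical):

#include <stddef.h>

struct dtimer {
        int ticks;              /* delta relative to predecessor */
        struct dtimer *next;
};

/*
 * Insert a timer that fires 'when' ticks from now into a list
 * sorted by cumulative expiry, converting the absolute offset
 * into a delta as we walk -- the timespecsub() loop above does
 * the same with struct timespec.
 */
static void
delta_insert(struct dtimer **head, struct dtimer *t, int when)
{
        struct dtimer **pp = head;

        while (*pp != NULL && when > (*pp)->ticks) {
                when -= (*pp)->ticks;   /* consume predecessor's delta */
                pp = &(*pp)->next;
        }
        t->ticks = when;
        t->next = *pp;
        *pp = t;
        if (t->next != NULL)
                t->next->ticks -= when; /* successor now relative to us */
}

Removal works the other way around: the departing entry's delta has to be folded back into what follows it before unlinking, which is what the timespecadd() loops in sys_timer_delete() and timer_settime() above are doing.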
721void 719void
722timer_gettime(struct ptimer *pt, struct itimerspec *aits) 720timer_gettime(struct ptimer *pt, struct itimerspec *aits)
723{ 721{
724 struct timespec now; 722 struct timespec now;
725 struct ptimer *ptn; 723 struct ptimer *ptn;
726 724
727 KASSERT(mutex_owned(&timer_lock)); 725 KASSERT(mutex_owned(&timer_lock));
728 726
729 *aits = pt->pt_time; 727 *aits = pt->pt_time;
730 if (!CLOCK_VIRTUAL_P(pt->pt_type)) { 728 if (!CLOCK_VIRTUAL_P(pt->pt_type)) {
731 /* 729 /*
732 * Convert from absolute to relative time in .it_value 730 * Convert from absolute to relative time in .it_value
733 * part of the real time timer. If the time for the 731 * part of the real time timer. If the time for the
734 * timer has passed, return 0; else return the difference 732 * timer has passed, return 0; else return the difference
735 * between the current time and the time for the timer to go 733 * between the current time and the time for the timer to go
736 * off. 734 * off.
737 */ 735 */
738 if (timespecisset(&aits->it_value)) { 736 if (timespecisset(&aits->it_value)) {
739 if (pt->pt_type == CLOCK_REALTIME) { 737 if (pt->pt_type == CLOCK_REALTIME) {
740 getnanotime(&now); 738 getnanotime(&now);
741 } else { /* CLOCK_MONOTONIC */ 739 } else { /* CLOCK_MONOTONIC */
742 getnanouptime(&now); 740 getnanouptime(&now);
743 } 741 }
744 if (timespeccmp(&aits->it_value, &now, <)) 742 if (timespeccmp(&aits->it_value, &now, <))
745 timespecclear(&aits->it_value); 743 timespecclear(&aits->it_value);
746 else 744 else
747 timespecsub(&aits->it_value, &now, 745 timespecsub(&aits->it_value, &now,
748 &aits->it_value); 746 &aits->it_value);
749 } 747 }
750 } else if (pt->pt_active) { 748 } else if (pt->pt_active) {
751 if (pt->pt_type == CLOCK_VIRTUAL) 749 if (pt->pt_type == CLOCK_VIRTUAL)
752 ptn = LIST_FIRST(&pt->pt_proc->p_timers->pts_virtual); 750 ptn = LIST_FIRST(&pt->pt_proc->p_timers->pts_virtual);
753 else 751 else
754 ptn = LIST_FIRST(&pt->pt_proc->p_timers->pts_prof); 752 ptn = LIST_FIRST(&pt->pt_proc->p_timers->pts_prof);
755 for ( ; ptn && ptn != pt; ptn = LIST_NEXT(ptn, pt_list)) 753 for ( ; ptn && ptn != pt; ptn = LIST_NEXT(ptn, pt_list))
756 timespecadd(&aits->it_value, 754 timespecadd(&aits->it_value,
757 &ptn->pt_time.it_value, &aits->it_value); 755 &ptn->pt_time.it_value, &aits->it_value);
758 KASSERT(ptn != NULL); /* pt should be findable on the list */ 756 KASSERT(ptn != NULL); /* pt should be findable on the list */
759 } else 757 } else
760 timespecclear(&aits->it_value); 758 timespecclear(&aits->it_value);
761} 759}
762 760
763 761
764 762
765/* Set and arm a POSIX realtime timer */ 763/* Set and arm a POSIX realtime timer */
766int 764int
767sys___timer_settime50(struct lwp *l, 765sys___timer_settime50(struct lwp *l,
768 const struct sys___timer_settime50_args *uap, 766 const struct sys___timer_settime50_args *uap,
769 register_t *retval) 767 register_t *retval)
770{ 768{
771 /* { 769 /* {
772 syscallarg(timer_t) timerid; 770 syscallarg(timer_t) timerid;
773 syscallarg(int) flags; 771 syscallarg(int) flags;
774 syscallarg(const struct itimerspec *) value; 772 syscallarg(const struct itimerspec *) value;
775 syscallarg(struct itimerspec *) ovalue; 773 syscallarg(struct itimerspec *) ovalue;
776 } */ 774 } */
777 int error; 775 int error;
778 struct itimerspec value, ovalue, *ovp = NULL; 776 struct itimerspec value, ovalue, *ovp = NULL;
779 777
780 if ((error = copyin(SCARG(uap, value), &value, 778 if ((error = copyin(SCARG(uap, value), &value,
781 sizeof(struct itimerspec))) != 0) 779 sizeof(struct itimerspec))) != 0)
782 return (error); 780 return (error);
783 781
784 if (SCARG(uap, ovalue)) 782 if (SCARG(uap, ovalue))
785 ovp = &ovalue; 783 ovp = &ovalue;
786 784
787 if ((error = dotimer_settime(SCARG(uap, timerid), &value, ovp, 785 if ((error = dotimer_settime(SCARG(uap, timerid), &value, ovp,
788 SCARG(uap, flags), l->l_proc)) != 0) 786 SCARG(uap, flags), l->l_proc)) != 0)
789 return error; 787 return error;
790 788
791 if (ovp) 789 if (ovp)
792 return copyout(&ovalue, SCARG(uap, ovalue), 790 return copyout(&ovalue, SCARG(uap, ovalue),
793 sizeof(struct itimerspec)); 791 sizeof(struct itimerspec));
794 return 0; 792 return 0;
795} 793}
796 794
797int 795int
798dotimer_settime(int timerid, struct itimerspec *value, 796dotimer_settime(int timerid, struct itimerspec *value,
799 struct itimerspec *ovalue, int flags, struct proc *p) 797 struct itimerspec *ovalue, int flags, struct proc *p)
800{ 798{
801 struct timespec now; 799 struct timespec now;
802 struct itimerspec val, oval; 800 struct itimerspec val, oval;
803 struct ptimers *pts; 801 struct ptimers *pts;
804 struct ptimer *pt; 802 struct ptimer *pt;
805 int error; 803 int error;
806 804
807 pts = p->p_timers; 805 pts = p->p_timers;
808 806
809 if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX) 807 if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
810 return EINVAL; 808 return EINVAL;
811 val = *value; 809 val = *value;
812 if ((error = itimespecfix(&val.it_value)) != 0 || 810 if ((error = itimespecfix(&val.it_value)) != 0 ||
813 (error = itimespecfix(&val.it_interval)) != 0) 811 (error = itimespecfix(&val.it_interval)) != 0)
814 return error; 812 return error;
815 813
816 mutex_spin_enter(&timer_lock); 814 mutex_spin_enter(&timer_lock);
817 if ((pt = pts->pts_timers[timerid]) == NULL) { 815 if ((pt = pts->pts_timers[timerid]) == NULL) {
818 mutex_spin_exit(&timer_lock); 816 mutex_spin_exit(&timer_lock);
819 return EINVAL; 817 return EINVAL;
820 } 818 }
821 819
822 oval = pt->pt_time; 820 oval = pt->pt_time;
823 pt->pt_time = val; 821 pt->pt_time = val;
824 822
825 /* 823 /*
826 * If we've been passed a relative time for a realtime timer, 824 * If we've been passed a relative time for a realtime timer,
827 * convert it to absolute; if an absolute time for a virtual 825 * convert it to absolute; if an absolute time for a virtual
828 * timer, convert it to relative and make sure we don't set it 826 * timer, convert it to relative and make sure we don't set it
829 * to zero, which would cancel the timer, or let it go 827 * to zero, which would cancel the timer, or let it go
830 * negative, which would confuse the comparison tests. 828 * negative, which would confuse the comparison tests.
831 */ 829 */
832 if (timespecisset(&pt->pt_time.it_value)) { 830 if (timespecisset(&pt->pt_time.it_value)) {
833 if (!CLOCK_VIRTUAL_P(pt->pt_type)) { 831 if (!CLOCK_VIRTUAL_P(pt->pt_type)) {
834 if ((flags & TIMER_ABSTIME) == 0) { 832 if ((flags & TIMER_ABSTIME) == 0) {
835 if (pt->pt_type == CLOCK_REALTIME) { 833 if (pt->pt_type == CLOCK_REALTIME) {
836 getnanotime(&now); 834 getnanotime(&now);
837 } else { /* CLOCK_MONOTONIC */ 835 } else { /* CLOCK_MONOTONIC */
838 getnanouptime(&now); 836 getnanouptime(&now);
839 } 837 }
840 timespecadd(&pt->pt_time.it_value, &now, 838 timespecadd(&pt->pt_time.it_value, &now,
841 &pt->pt_time.it_value); 839 &pt->pt_time.it_value);
842 } 840 }
843 } else { 841 } else {
844 if ((flags & TIMER_ABSTIME) != 0) { 842 if ((flags & TIMER_ABSTIME) != 0) {
845 getnanotime(&now); 843 getnanotime(&now);
846 timespecsub(&pt->pt_time.it_value, &now, 844 timespecsub(&pt->pt_time.it_value, &now,
847 &pt->pt_time.it_value); 845 &pt->pt_time.it_value);
848 if (!timespecisset(&pt->pt_time.it_value) || 846 if (!timespecisset(&pt->pt_time.it_value) ||
849 pt->pt_time.it_value.tv_sec < 0) { 847 pt->pt_time.it_value.tv_sec < 0) {
850 pt->pt_time.it_value.tv_sec = 0; 848 pt->pt_time.it_value.tv_sec = 0;
851 pt->pt_time.it_value.tv_nsec = 1; 849 pt->pt_time.it_value.tv_nsec = 1;
852 } 850 }
853 } 851 }
854 } 852 }
855 } 853 }
856 854
857 timer_settime(pt); 855 timer_settime(pt);
858 mutex_spin_exit(&timer_lock); 856 mutex_spin_exit(&timer_lock);
859 857
860 if (ovalue) 858 if (ovalue)
861 *ovalue = oval; 859 *ovalue = oval;
862 860
863 return (0); 861 return (0);
864} 862}
865 863
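In userland terms, the conversion above means a TIMER_ABSTIME one-shot on a realtime clock passes through unmodified, while a relative value gets the current clock reading added. A sketch of the absolute case, assuming only standard POSIX calls:

#include <stdio.h>
#include <time.h>
#include <unistd.h>

int
main(void)
{
        timer_t tid;
        struct itimerspec its;
        struct timespec now;

        if (timer_create(CLOCK_MONOTONIC, NULL, &tid) == -1) {
                perror("timer_create");
                return 1;
        }
        /*
         * Absolute one-shot two seconds from the current monotonic
         * time; dotimer_settime() above stores it as-is, since
         * TIMER_ABSTIME is set and the clock is not virtual.
         */
        clock_gettime(CLOCK_MONOTONIC, &now);
        its.it_value.tv_sec = now.tv_sec + 2;
        its.it_value.tv_nsec = now.tv_nsec;
        its.it_interval.tv_sec = 0;     /* one-shot, no reload */
        its.it_interval.tv_nsec = 0;
        if (timer_settime(tid, TIMER_ABSTIME, &its, NULL) == -1) {
                perror("timer_settime");
                return 1;
        }
        pause();        /* default SIGALRM disposition terminates us */
        return 0;
}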
866/* Return the time remaining until a POSIX timer fires. */ 864/* Return the time remaining until a POSIX timer fires. */
867int 865int
868sys___timer_gettime50(struct lwp *l, 866sys___timer_gettime50(struct lwp *l,
869 const struct sys___timer_gettime50_args *uap, register_t *retval) 867 const struct sys___timer_gettime50_args *uap, register_t *retval)
870{ 868{
871 /* { 869 /* {
872 syscallarg(timer_t) timerid; 870 syscallarg(timer_t) timerid;
873 syscallarg(struct itimerspec *) value; 871 syscallarg(struct itimerspec *) value;
874 } */ 872 } */
875 struct itimerspec its; 873 struct itimerspec its;
876 int error; 874 int error;
877 875
878 if ((error = dotimer_gettime(SCARG(uap, timerid), l->l_proc, 876 if ((error = dotimer_gettime(SCARG(uap, timerid), l->l_proc,
879 &its)) != 0) 877 &its)) != 0)
880 return error; 878 return error;
881 879
882 return copyout(&its, SCARG(uap, value), sizeof(its)); 880 return copyout(&its, SCARG(uap, value), sizeof(its));
883} 881}
884 882
885int 883int
886dotimer_gettime(int timerid, struct proc *p, struct itimerspec *its) 884dotimer_gettime(int timerid, struct proc *p, struct itimerspec *its)
887{ 885{
888 struct ptimer *pt; 886 struct ptimer *pt;
889 struct ptimers *pts; 887 struct ptimers *pts;
890 888
891 pts = p->p_timers; 889 pts = p->p_timers;
892 if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX) 890 if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
893 return (EINVAL); 891 return (EINVAL);
894 mutex_spin_enter(&timer_lock); 892 mutex_spin_enter(&timer_lock);
895 if ((pt = pts->pts_timers[timerid]) == NULL) { 893 if ((pt = pts->pts_timers[timerid]) == NULL) {
896 mutex_spin_exit(&timer_lock); 894 mutex_spin_exit(&timer_lock);
897 return (EINVAL); 895 return (EINVAL);
898 } 896 }
899 timer_gettime(pt, its); 897 timer_gettime(pt, its);
900 mutex_spin_exit(&timer_lock); 898 mutex_spin_exit(&timer_lock);
901 899
902 return 0; 900 return 0;
903} 901}
904 902
905/* 903/*
906 * Return the number of times a periodic timer expired 904 * Return the number of times a periodic timer expired
907 * while a notification was already pending. The counter is reset when 905 * while a notification was already pending. The counter is reset when
908 * a timer expires and a notification can be posted. 906 * a timer expires and a notification can be posted.
909 */ 907 */
910int 908int
911sys_timer_getoverrun(struct lwp *l, const struct sys_timer_getoverrun_args *uap, 909sys_timer_getoverrun(struct lwp *l, const struct sys_timer_getoverrun_args *uap,
912 register_t *retval) 910 register_t *retval)
913{ 911{
914 /* { 912 /* {
915 syscallarg(timer_t) timerid; 913 syscallarg(timer_t) timerid;
916 } */ 914 } */
917 struct proc *p = l->l_proc; 915 struct proc *p = l->l_proc;
918 struct ptimers *pts; 916 struct ptimers *pts;
919 int timerid; 917 int timerid;
920 struct ptimer *pt; 918 struct ptimer *pt;
921 919
922 timerid = SCARG(uap, timerid); 920 timerid = SCARG(uap, timerid);
923 921
924 pts = p->p_timers; 922 pts = p->p_timers;
925 if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX) 923 if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
926 return (EINVAL); 924 return (EINVAL);
927 mutex_spin_enter(&timer_lock); 925 mutex_spin_enter(&timer_lock);
928 if ((pt = pts->pts_timers[timerid]) == NULL) { 926 if ((pt = pts->pts_timers[timerid]) == NULL) {
929 mutex_spin_exit(&timer_lock); 927 mutex_spin_exit(&timer_lock);
930 return (EINVAL); 928 return (EINVAL);
931 } 929 }
932 *retval = pt->pt_poverruns; 930 *retval = pt->pt_poverruns;
933 mutex_spin_exit(&timer_lock); 931 mutex_spin_exit(&timer_lock);
934 932
935 return (0); 933 return (0);
936} 934}
937 935
938#ifdef KERN_SA 936#ifdef KERN_SA
939/* Glue function that triggers an upcall; called from userret(). */ 937/* Glue function that triggers an upcall; called from userret(). */
940void 938void
941timerupcall(struct lwp *l) 939timerupcall(struct lwp *l)
942{ 940{
943 struct ptimers *pt = l->l_proc->p_timers; 941 struct ptimers *pt = l->l_proc->p_timers;
944 struct proc *p = l->l_proc; 942 struct proc *p = l->l_proc;
945 unsigned int i, fired, done; 943 unsigned int i, fired, done;
946 944
947 KDASSERT(l->l_proc->p_sa); 945 KDASSERT(l->l_proc->p_sa);
948 /* Bail out if we do not own the virtual processor */ 946 /* Bail out if we do not own the virtual processor */
949 if (l->l_savp->savp_lwp != l) 947 if (l->l_savp->savp_lwp != l)
950 return ; 948 return ;
951 949
952 mutex_enter(p->p_lock); 950 mutex_enter(p->p_lock);
953 951
954 fired = pt->pts_fired; 952 fired = pt->pts_fired;
955 done = 0; 953 done = 0;
956 while ((i = ffs(fired)) != 0) { 954 while ((i = ffs(fired)) != 0) {
957 siginfo_t *si; 955 siginfo_t *si;
958 int mask = 1 << --i; 956 int mask = 1 << --i;
959 int f; 957 int f;
960 958
961 f = ~l->l_pflag & LP_SA_NOBLOCK; 959 f = ~l->l_pflag & LP_SA_NOBLOCK;
962 l->l_pflag |= LP_SA_NOBLOCK; 960 l->l_pflag |= LP_SA_NOBLOCK;
963 si = siginfo_alloc(PR_WAITOK); 961 si = siginfo_alloc(PR_WAITOK);
964 si->_info = pt->pts_timers[i]->pt_info.ksi_info; 962 si->_info = pt->pts_timers[i]->pt_info.ksi_info;
965 if (sa_upcall(l, SA_UPCALL_SIGEV | SA_UPCALL_DEFER, NULL, l, 963 if (sa_upcall(l, SA_UPCALL_SIGEV | SA_UPCALL_DEFER, NULL, l,
966 sizeof(*si), si, siginfo_free) != 0) { 964 sizeof(*si), si, siginfo_free) != 0) {
967 siginfo_free(si); 965 siginfo_free(si);
968 /* XXX What do we do here?? */ 966 /* XXX What do we do here?? */
969 } else 967 } else
970 done |= mask; 968 done |= mask;
971 fired &= ~mask; 969 fired &= ~mask;
972 l->l_pflag ^= f; 970 l->l_pflag ^= f;
973 } 971 }
974 pt->pts_fired &= ~done; 972 pt->pts_fired &= ~done;
975 if (pt->pts_fired == 0) 973 if (pt->pts_fired == 0)
976 l->l_proc->p_timerpend = 0; 974 l->l_proc->p_timerpend = 0;
977 975
978 mutex_exit(p->p_lock); 976 mutex_exit(p->p_lock);
979} 977}
980#endif /* KERN_SA */ 978#endif /* KERN_SA */
981 979
982/* 980/*
983 * Real interval timer expired: 981 * Real interval timer expired:
984 * send the process whose timer expired an alarm signal. 982 * send the process whose timer expired an alarm signal.
985 * If the timer is not set up to reload, just return. 983 * If the timer is not set up to reload, just return.
986 * Else compute the next time the timer should go off, which is > current time. 984 * Else compute the next time the timer should go off, which is > current time.
987 * This is where delay in processing this timeout causes multiple 985 * This is where delay in processing this timeout causes multiple
988 * SIGALRM calls to be compressed into one. 986 * SIGALRM calls to be compressed into one.
989 */ 987 */
990void 988void
991realtimerexpire(void *arg) 989realtimerexpire(void *arg)
992{ 990{
993 uint64_t last_val, next_val, interval, now_ns; 991 uint64_t last_val, next_val, interval, now_ns;
994 struct timespec now, next; 992 struct timespec now, next;
995 struct ptimer *pt; 993 struct ptimer *pt;
996 int backwards; 994 int backwards;
997 995
998 pt = arg; 996 pt = arg;
999 997
1000 mutex_spin_enter(&timer_lock); 998 mutex_spin_enter(&timer_lock);
1001 itimerfire(pt); 999 itimerfire(pt);
1002 1000
1003 if (!timespecisset(&pt->pt_time.it_interval)) { 1001 if (!timespecisset(&pt->pt_time.it_interval)) {
1004 timespecclear(&pt->pt_time.it_value); 1002 timespecclear(&pt->pt_time.it_value);
1005 mutex_spin_exit(&timer_lock); 1003 mutex_spin_exit(&timer_lock);
1006 return; 1004 return;
1007 } 1005 }
1008 1006
1009 getnanotime(&now); 1007 getnanotime(&now);
1010 backwards = (timespeccmp(&pt->pt_time.it_value, &now, >)); 1008 backwards = (timespeccmp(&pt->pt_time.it_value, &now, >));
1011 timespecadd(&pt->pt_time.it_value, &pt->pt_time.it_interval, &next); 1009 timespecadd(&pt->pt_time.it_value, &pt->pt_time.it_interval, &next);
1012 /* Handle the easy case of non-overflown timers first. */ 1010 /* Handle the easy case of non-overflown timers first. */
1013 if (!backwards && timespeccmp(&next, &now, >)) { 1011 if (!backwards && timespeccmp(&next, &now, >)) {
1014 pt->pt_time.it_value = next; 1012 pt->pt_time.it_value = next;
1015 } else { 1013 } else {
1016 now_ns = timespec2ns(&now); 1014 now_ns = timespec2ns(&now);
1017 last_val = timespec2ns(&pt->pt_time.it_value); 1015 last_val = timespec2ns(&pt->pt_time.it_value);
1018 interval = timespec2ns(&pt->pt_time.it_interval); 1016 interval = timespec2ns(&pt->pt_time.it_interval);
1019 1017
1020 next_val = now_ns + 1018 next_val = now_ns +
1021 (now_ns - last_val + interval - 1) % interval; 1019 (now_ns - last_val + interval - 1) % interval;
1022 1020
1023 if (backwards) 1021 if (backwards)
1024 next_val += interval; 1022 next_val += interval;
1025 else 1023 else
1026 pt->pt_overruns += (now_ns - last_val) / interval; 1024 pt->pt_overruns += (now_ns - last_val) / interval;
1027 1025
1028 pt->pt_time.it_value.tv_sec = next_val / 1000000000; 1026 pt->pt_time.it_value.tv_sec = next_val / 1000000000;
1029 pt->pt_time.it_value.tv_nsec = next_val % 1000000000; 1027 pt->pt_time.it_value.tv_nsec = next_val % 1000000000;
1030 } 1028 }
1031 1029
1032 /* 1030 /*
1033 * Don't need to check tshzto() return value, here. 1031 * Don't need to check tshzto() return value, here.
1034 * callout_reset() does it for us. 1032 * callout_reset() does it for us.
1035 */ 1033 */
1036 callout_reset(&pt->pt_ch, tshzto(&pt->pt_time.it_value), 1034 callout_reset(&pt->pt_ch, tshzto(&pt->pt_time.it_value),
1037 realtimerexpire, pt); 1035 realtimerexpire, pt);
1038 mutex_spin_exit(&timer_lock); 1036 mutex_spin_exit(&timer_lock);
1039} 1037}
1040 1038
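When the callout runs late, the else branch above re-anchors the timer: it picks a new it_value no more than one period past 'now' and charges the skipped periods to pt_overruns. A standalone rerun of the same expressions with made-up numbers (userland C, illustration only):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        /* All in nanoseconds; the values are invented for illustration. */
        uint64_t interval = 100000000;          /* 100 ms period */
        uint64_t last_val = 1000000000;         /* missed expiry at t = 1.0 s */
        uint64_t now_ns   = 1350000000;         /* callout finally ran at 1.35 s */

        /* Same expressions as the overflow branch in realtimerexpire(). */
        uint64_t next_val = now_ns +
            (now_ns - last_val + interval - 1) % interval;
        uint64_t overruns = (now_ns - last_val) / interval;

        printf("next %llu ns (+%llu past now), %llu overruns\n",
            (unsigned long long)next_val,
            (unsigned long long)(next_val - now_ns),
            (unsigned long long)overruns);
        /* next 1399999999 ns (+49999999 past now), 3 overruns */
        return 0;
}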
1041/* BSD routine to get the value of an interval timer. */ 1039/* BSD routine to get the value of an interval timer. */
1042/* ARGSUSED */ 1040/* ARGSUSED */
1043int 1041int
1044sys___getitimer50(struct lwp *l, const struct sys___getitimer50_args *uap, 1042sys___getitimer50(struct lwp *l, const struct sys___getitimer50_args *uap,
1045 register_t *retval) 1043 register_t *retval)
1046{ 1044{
1047 /* { 1045 /* {
1048 syscallarg(int) which; 1046 syscallarg(int) which;
1049 syscallarg(struct itimerval *) itv; 1047 syscallarg(struct itimerval *) itv;
1050 } */ 1048 } */
1051 struct proc *p = l->l_proc; 1049 struct proc *p = l->l_proc;
1052 struct itimerval aitv; 1050 struct itimerval aitv;
1053 int error; 1051 int error;
1054 1052
1055 error = dogetitimer(p, SCARG(uap, which), &aitv); 1053 error = dogetitimer(p, SCARG(uap, which), &aitv);
1056 if (error) 1054 if (error)
1057 return error; 1055 return error;
1058 return (copyout(&aitv, SCARG(uap, itv), sizeof(struct itimerval))); 1056 return (copyout(&aitv, SCARG(uap, itv), sizeof(struct itimerval)));
1059} 1057}
1060 1058
1061int 1059int
1062dogetitimer(struct proc *p, int which, struct itimerval *itvp) 1060dogetitimer(struct proc *p, int which, struct itimerval *itvp)
1063{ 1061{
1064 struct ptimers *pts; 1062 struct ptimers *pts;
1065 struct ptimer *pt; 1063 struct ptimer *pt;
1066 struct itimerspec its; 1064 struct itimerspec its;
1067 1065
1068 if ((u_int)which > ITIMER_PROF) 1066 if ((u_int)which > ITIMER_PROF)
1069 return (EINVAL); 1067 return (EINVAL);
1070 1068
1071 mutex_spin_enter(&timer_lock); 1069 mutex_spin_enter(&timer_lock);
1072 pts = p->p_timers; 1070 pts = p->p_timers;
1073 if (pts == NULL || (pt = pts->pts_timers[which]) == NULL) { 1071 if (pts == NULL || (pt = pts->pts_timers[which]) == NULL) {
1074 timerclear(&itvp->it_value); 1072 timerclear(&itvp->it_value);
1075 timerclear(&itvp->it_interval); 1073 timerclear(&itvp->it_interval);
1076 } else { 1074 } else {
1077 timer_gettime(pt, &its); 1075 timer_gettime(pt, &its);
1078 TIMESPEC_TO_TIMEVAL(&itvp->it_value, &its.it_value); 1076 TIMESPEC_TO_TIMEVAL(&itvp->it_value, &its.it_value);
1079 TIMESPEC_TO_TIMEVAL(&itvp->it_interval, &its.it_interval); 1077 TIMESPEC_TO_TIMEVAL(&itvp->it_interval, &its.it_interval);
1080 } 1078 }
1081 mutex_spin_exit(&timer_lock);  1079 mutex_spin_exit(&timer_lock);
1082 1080
1083 return 0; 1081 return 0;

cvs diff -r1.26 -r1.27 src/sys/kern/sched_4bsd.c

--- src/sys/kern/sched_4bsd.c 2011/04/14 16:19:35 1.26
+++ src/sys/kern/sched_4bsd.c 2011/07/27 14:35:34 1.27
@@ -1,541 +1,539 @@ @@ -1,541 +1,539 @@
1/* $NetBSD: sched_4bsd.c,v 1.26 2011/04/14 16:19:35 yamt Exp $ */ 1/* $NetBSD: sched_4bsd.c,v 1.27 2011/07/27 14:35:34 uebayasi Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc. 4 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran, and 9 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran, and
10 * Daniel Sieger. 10 * Daniel Sieger.
11 * 11 *
12 * Redistribution and use in source and binary forms, with or without 12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions 13 * modification, are permitted provided that the following conditions
14 * are met: 14 * are met:
15 * 1. Redistributions of source code must retain the above copyright 15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer. 16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright 17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the 18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution. 19 * documentation and/or other materials provided with the distribution.
20 * 20 *
21 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE. 31 * POSSIBILITY OF SUCH DAMAGE.
32 */ 32 */
33 33
34/*- 34/*-
35 * Copyright (c) 1982, 1986, 1990, 1991, 1993 35 * Copyright (c) 1982, 1986, 1990, 1991, 1993
36 * The Regents of the University of California. All rights reserved. 36 * The Regents of the University of California. All rights reserved.
37 * (c) UNIX System Laboratories, Inc. 37 * (c) UNIX System Laboratories, Inc.
38 * All or some portions of this file are derived from material licensed 38 * All or some portions of this file are derived from material licensed
39 * to the University of California by American Telephone and Telegraph 39 * to the University of California by American Telephone and Telegraph
40 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 40 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
41 * the permission of UNIX System Laboratories, Inc. 41 * the permission of UNIX System Laboratories, Inc.
42 * 42 *
43 * Redistribution and use in source and binary forms, with or without 43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions 44 * modification, are permitted provided that the following conditions
45 * are met: 45 * are met:
46 * 1. Redistributions of source code must retain the above copyright 46 * 1. Redistributions of source code must retain the above copyright
47 * notice, this list of conditions and the following disclaimer. 47 * notice, this list of conditions and the following disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright 48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the 49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution. 50 * documentation and/or other materials provided with the distribution.
51 * 3. Neither the name of the University nor the names of its contributors 51 * 3. Neither the name of the University nor the names of its contributors
52 * may be used to endorse or promote products derived from this software 52 * may be used to endorse or promote products derived from this software
53 * without specific prior written permission. 53 * without specific prior written permission.
54 * 54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE. 65 * SUCH DAMAGE.
66 * 66 *
67 * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 67 * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
68 */ 68 */
69 69
70#include <sys/cdefs.h> 70#include <sys/cdefs.h>
71__KERNEL_RCSID(0, "$NetBSD: sched_4bsd.c,v 1.26 2011/04/14 16:19:35 yamt Exp $"); 71__KERNEL_RCSID(0, "$NetBSD: sched_4bsd.c,v 1.27 2011/07/27 14:35:34 uebayasi Exp $");
72 72
73#include "opt_ddb.h" 73#include "opt_ddb.h"
74#include "opt_lockdebug.h" 74#include "opt_lockdebug.h"
75#include "opt_perfctrs.h" 75#include "opt_perfctrs.h"
76 76
77#include <sys/param.h> 77#include <sys/param.h>
78#include <sys/systm.h> 78#include <sys/systm.h>
79#include <sys/callout.h> 79#include <sys/callout.h>
80#include <sys/cpu.h> 80#include <sys/cpu.h>
81#include <sys/proc.h> 81#include <sys/proc.h>
82#include <sys/kernel.h> 82#include <sys/kernel.h>
83#include <sys/signalvar.h> 83#include <sys/signalvar.h>
84#include <sys/resourcevar.h> 84#include <sys/resourcevar.h>
85#include <sys/sched.h> 85#include <sys/sched.h>
86#include <sys/sysctl.h> 86#include <sys/sysctl.h>
87#include <sys/kauth.h> 87#include <sys/kauth.h>
88#include <sys/lockdebug.h> 88#include <sys/lockdebug.h>
89#include <sys/kmem.h> 89#include <sys/kmem.h>
90#include <sys/intr.h> 90#include <sys/intr.h>
91 91
92#include <uvm/uvm_extern.h> 
93 
94static void updatepri(struct lwp *); 92static void updatepri(struct lwp *);
95static void resetpriority(struct lwp *); 93static void resetpriority(struct lwp *);
96 94
97extern unsigned int sched_pstats_ticks; /* defined in kern_synch.c */ 95extern unsigned int sched_pstats_ticks; /* defined in kern_synch.c */
98 96
99/* Number of hardclock ticks per sched_tick() */ 97/* Number of hardclock ticks per sched_tick() */
100static int rrticks; 98static int rrticks;
101 99
102/* 100/*
103 * Force switch among equal priority processes every 100ms. 101 * Force switch among equal priority processes every 100ms.
104 * Called from hardclock every hz/10 == rrticks hardclock ticks. 102 * Called from hardclock every hz/10 == rrticks hardclock ticks.
105 * 103 *
106 * There's no need to lock anywhere in this routine, as it's 104 * There's no need to lock anywhere in this routine, as it's
107 * CPU-local and runs at IPL_SCHED (called from clock interrupt). 105 * CPU-local and runs at IPL_SCHED (called from clock interrupt).
108 */ 106 */
109/* ARGSUSED */ 107/* ARGSUSED */
110void 108void
111sched_tick(struct cpu_info *ci) 109sched_tick(struct cpu_info *ci)
112{ 110{
113 struct schedstate_percpu *spc = &ci->ci_schedstate; 111 struct schedstate_percpu *spc = &ci->ci_schedstate;
114 lwp_t *l; 112 lwp_t *l;
115 113
116 spc->spc_ticks = rrticks; 114 spc->spc_ticks = rrticks;
117 115
118 if (CURCPU_IDLE_P()) { 116 if (CURCPU_IDLE_P()) {
119 cpu_need_resched(ci, 0); 117 cpu_need_resched(ci, 0);
120 return; 118 return;
121 } 119 }
122 l = ci->ci_data.cpu_onproc; 120 l = ci->ci_data.cpu_onproc;
123 if (l == NULL) { 121 if (l == NULL) {
124 return; 122 return;
125 } 123 }
126 switch (l->l_class) { 124 switch (l->l_class) {
127 case SCHED_FIFO: 125 case SCHED_FIFO:
128 /* No timeslicing for FIFO jobs. */ 126 /* No timeslicing for FIFO jobs. */
129 break; 127 break;
130 case SCHED_RR: 128 case SCHED_RR:
131 /* Force it into mi_switch() to look for other jobs to run. */ 129 /* Force it into mi_switch() to look for other jobs to run. */
132 cpu_need_resched(ci, RESCHED_KPREEMPT); 130 cpu_need_resched(ci, RESCHED_KPREEMPT);
133 break; 131 break;
134 default: 132 default:
135 if (spc->spc_flags & SPCF_SHOULDYIELD) { 133 if (spc->spc_flags & SPCF_SHOULDYIELD) {
136 /* 134 /*
137 * Process is stuck in kernel somewhere, probably 135 * Process is stuck in kernel somewhere, probably
138 * due to buggy or inefficient code. Force a  136 * due to buggy or inefficient code. Force a
139 * kernel preemption. 137 * kernel preemption.
140 */ 138 */
141 cpu_need_resched(ci, RESCHED_KPREEMPT); 139 cpu_need_resched(ci, RESCHED_KPREEMPT);
142 } else if (spc->spc_flags & SPCF_SEENRR) { 140 } else if (spc->spc_flags & SPCF_SEENRR) {
143 /* 141 /*
144 * The process has already been through a roundrobin 142 * The process has already been through a roundrobin
145 * without switching and may be hogging the CPU. 143 * without switching and may be hogging the CPU.
146 * Indicate that the process should yield. 144 * Indicate that the process should yield.
147 */ 145 */
148 spc->spc_flags |= SPCF_SHOULDYIELD; 146 spc->spc_flags |= SPCF_SHOULDYIELD;
149 cpu_need_resched(ci, 0); 147 cpu_need_resched(ci, 0);
150 } else { 148 } else {
151 spc->spc_flags |= SPCF_SEENRR; 149 spc->spc_flags |= SPCF_SEENRR;
152 } 150 }
153 break; 151 break;
154 } 152 }
155} 153}
156 154
157/* 155/*
158 * Why PRIO_MAX - 2? From setpriority(2): 156 * Why PRIO_MAX - 2? From setpriority(2):
159 * 157 *
160 * prio is a value in the range -20 to 20. The default priority is 158 * prio is a value in the range -20 to 20. The default priority is
161 * 0; lower priorities cause more favorable scheduling. A value of 159 * 0; lower priorities cause more favorable scheduling. A value of
162 * 19 or 20 will schedule a process only when nothing at priority <= 160 * 19 or 20 will schedule a process only when nothing at priority <=
163 * 0 is runnable. 161 * 0 is runnable.
164 * 162 *
165 * This gives estcpu influence over 18 priority levels, and leaves nice 163 * This gives estcpu influence over 18 priority levels, and leaves nice
166 * with 40 levels. One way to think about it is that nice has 20 levels 164 * with 40 levels. One way to think about it is that nice has 20 levels
167 * either side of estcpu's 18. 165 * either side of estcpu's 18.
168 */ 166 */
169#define ESTCPU_SHIFT 11 167#define ESTCPU_SHIFT 11
170#define ESTCPU_MAX ((PRIO_MAX - 2) << ESTCPU_SHIFT) 168#define ESTCPU_MAX ((PRIO_MAX - 2) << ESTCPU_SHIFT)
171#define ESTCPU_ACCUM (1 << (ESTCPU_SHIFT - 1)) 169#define ESTCPU_ACCUM (1 << (ESTCPU_SHIFT - 1))
172#define ESTCPULIM(e) min((e), ESTCPU_MAX) 170#define ESTCPULIM(e) min((e), ESTCPU_MAX)
173 171
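Concretely, assuming the standard PRIO_MAX of 20 from <sys/resource.h>: ESTCPU_MAX = (20 - 2) << 11 = 36864, so ESTCPULIM() caps the estimate at 18 priority levels' worth of estcpu, at 1 << 11 = 2048 fixed-point units per level.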
174/* 172/*
175 * Constants for digital decay and forget: 173 * Constants for digital decay and forget:
176 * 90% of (l_estcpu) usage in 5 * loadav time 174 * 90% of (l_estcpu) usage in 5 * loadav time
177 * 95% of (l_pctcpu) usage in 60 seconds (load insensitive) 175 * 95% of (l_pctcpu) usage in 60 seconds (load insensitive)
178 * Note that, as ps(1) mentions, this can let percentages 176 * Note that, as ps(1) mentions, this can let percentages
179 * total over 100% (I've seen 137.9% for 3 processes). 177 * total over 100% (I've seen 137.9% for 3 processes).
180 * 178 *
181 * Note that hardclock updates l_estcpu and l_cpticks independently. 179 * Note that hardclock updates l_estcpu and l_cpticks independently.
182 * 180 *
183 * We wish to decay away 90% of l_estcpu in (5 * loadavg) seconds. 181 * We wish to decay away 90% of l_estcpu in (5 * loadavg) seconds.
184 * That is, the system wants to compute a value of decay such 182 * That is, the system wants to compute a value of decay such
185 * that the following for loop: 183 * that the following for loop:
186 * for (i = 0; i < (5 * loadavg); i++) 184 * for (i = 0; i < (5 * loadavg); i++)
187 * l_estcpu *= decay; 185 * l_estcpu *= decay;
188 * will compute 186 * will compute
189 * l_estcpu *= 0.1; 187 * l_estcpu *= 0.1;
190 * for all values of loadavg: 188 * for all values of loadavg:
191 * 189 *
192 * Mathematically this loop can be expressed by saying: 190 * Mathematically this loop can be expressed by saying:
193 * decay ** (5 * loadavg) ~= .1 191 * decay ** (5 * loadavg) ~= .1
194 * 192 *
195 * The system computes decay as: 193 * The system computes decay as:
196 * decay = (2 * loadavg) / (2 * loadavg + 1) 194 * decay = (2 * loadavg) / (2 * loadavg + 1)
197 * 195 *
198 * We wish to prove that the system's computation of decay 196 * We wish to prove that the system's computation of decay
199 * will always fulfill the equation: 197 * will always fulfill the equation:
200 * decay ** (5 * loadavg) ~= .1 198 * decay ** (5 * loadavg) ~= .1
201 * 199 *
202 * If we compute b as: 200 * If we compute b as:
203 * b = 2 * loadavg 201 * b = 2 * loadavg
204 * then 202 * then
205 * decay = b / (b + 1) 203 * decay = b / (b + 1)
206 * 204 *
207 * We now need to prove two things: 205 * We now need to prove two things:
208 * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) 206 * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
209 * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) 207 * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
210 * 208 *
211 * Facts: 209 * Facts:
212 * For x close to zero, exp(x) =~ 1 + x, since 210 * For x close to zero, exp(x) =~ 1 + x, since
213 * exp(x) = 0! + x**1/1! + x**2/2! + ... . 211 * exp(x) = 0! + x**1/1! + x**2/2! + ... .
214 * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. 212 * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
215 * For x close to zero, ln(1+x) =~ x, since 213 * For x close to zero, ln(1+x) =~ x, since
216 * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 214 * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1
217 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). 215 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
218 * ln(.1) =~ -2.30 216 * ln(.1) =~ -2.30
219 * 217 *
220 * Proof of (1): 218 * Proof of (1):
221 * Solve (factor)**(power) =~ .1 given power (5*loadav): 219 * Solve (factor)**(power) =~ .1 given power (5*loadav):
222 * solving for factor, 220 * solving for factor,
223 * ln(factor) =~ (-2.30/5*loadav), or 221 * ln(factor) =~ (-2.30/5*loadav), or
224 * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = 222 * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
225 * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED 223 * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
226 * 224 *
227 * Proof of (2): 225 * Proof of (2):
228 * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): 226 * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
229 * solving for power, 227 * solving for power,
230 * power*ln(b/(b+1)) =~ -2.30, or 228 * power*ln(b/(b+1)) =~ -2.30, or
231 * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED 229 * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
232 * 230 *
233 * Actual power values for the implemented algorithm are as follows: 231 * Actual power values for the implemented algorithm are as follows:
234 * loadav: 1 2 3 4 232 * loadav: 1 2 3 4
235 * power: 5.68 10.32 14.94 19.55 233 * power: 5.68 10.32 14.94 19.55
236 */ 234 */
237 235
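The "Actual power values" table above is easy to reproduce: with b = 2*loadav and decay = b/(b+1), the power needed to reach 0.1 is ln(0.1)/ln(decay). A throwaway userland check (plain C with libm, not kernel code):

#include <math.h>
#include <stdio.h>

int
main(void)
{
        for (int loadav = 1; loadav <= 4; loadav++) {
                double b = 2.0 * loadav;
                double decay = b / (b + 1.0);
                double power = log(0.1) / log(decay);

                printf("loadav %d: decay %.4f, power %.2f (5*loadav = %d)\n",
                    loadav, decay, power, 5 * loadav);
        }
        return 0;
}

This prints powers of 5.68, 10.32, 14.94 and 19.55 for load averages 1 through 4, matching the table.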
238/* calculations for digital decay to forget 90% of usage in 5*loadav sec */ 236/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
239#define loadfactor(loadav) (2 * (loadav) / ncpu) 237#define loadfactor(loadav) (2 * (loadav) / ncpu)
240 238
241static fixpt_t 239static fixpt_t
242decay_cpu(fixpt_t loadfac, fixpt_t estcpu) 240decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
243{ 241{
244 242
245 if (estcpu == 0) { 243 if (estcpu == 0) {
246 return 0; 244 return 0;
247 } 245 }
248 246
249#if !defined(_LP64) 247#if !defined(_LP64)
250 /* avoid 64bit arithmetics. */ 248 /* avoid 64bit arithmetics. */
251#define FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1)) 249#define FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
252 if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) { 250 if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
253 return estcpu * loadfac / (loadfac + FSCALE); 251 return estcpu * loadfac / (loadfac + FSCALE);
254 } 252 }
255#endif /* !defined(_LP64) */ 253#endif /* !defined(_LP64) */
256 254
257 return (uint64_t)estcpu * loadfac / (loadfac + FSCALE); 255 return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
258} 256}
259 257
260/* 258/*
261 * For all load averages >= 1 and max l_estcpu of (255 << ESTCPU_SHIFT), 259 * For all load averages >= 1 and max l_estcpu of (255 << ESTCPU_SHIFT),
262 * sleeping for at least seven times the loadfactor will decay l_estcpu to 260 * sleeping for at least seven times the loadfactor will decay l_estcpu to
263 * less than (1 << ESTCPU_SHIFT). 261 * less than (1 << ESTCPU_SHIFT).
264 * 262 *
265 * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT). 263 * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
266 */ 264 */
267static fixpt_t 265static fixpt_t
268decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n) 266decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
269{ 267{
270 268
271 if ((n << FSHIFT) >= 7 * loadfac) { 269 if ((n << FSHIFT) >= 7 * loadfac) {
272 return 0; 270 return 0;
273 } 271 }
274 272
275 while (estcpu != 0 && n > 1) { 273 while (estcpu != 0 && n > 1) {
276 estcpu = decay_cpu(loadfac, estcpu); 274 estcpu = decay_cpu(loadfac, estcpu);
277 n--; 275 n--;
278 } 276 }
279 277
280 return estcpu; 278 return estcpu;
281} 279}
282 280
283/* 281/*
284 * sched_pstats_hook: 282 * sched_pstats_hook:
285 * 283 *
286 * Periodically called from sched_pstats(); used to recalculate priorities. 284 * Periodically called from sched_pstats(); used to recalculate priorities.
287 */ 285 */
288void 286void
289sched_pstats_hook(struct lwp *l, int batch) 287sched_pstats_hook(struct lwp *l, int batch)
290{ 288{
291 fixpt_t loadfac; 289 fixpt_t loadfac;
292 290
293 /* 291 /*
294 * If the LWP has slept an entire second, stop recalculating 292 * If the LWP has slept an entire second, stop recalculating
295 * its priority until it wakes up. 293 * its priority until it wakes up.
296 */ 294 */
297 KASSERT(lwp_locked(l, NULL)); 295 KASSERT(lwp_locked(l, NULL));
298 if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP || 296 if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
299 l->l_stat == LSSUSPENDED) { 297 l->l_stat == LSSUSPENDED) {
300 if (l->l_slptime > 1) { 298 if (l->l_slptime > 1) {
301 return; 299 return;
302 } 300 }
303 } 301 }
304 loadfac = 2 * (averunnable.ldavg[0]); 302 loadfac = 2 * (averunnable.ldavg[0]);
305 l->l_estcpu = decay_cpu(loadfac, l->l_estcpu); 303 l->l_estcpu = decay_cpu(loadfac, l->l_estcpu);
306 resetpriority(l); 304 resetpriority(l);
307} 305}
308 306
309/* 307/*
310 * Recalculate the priority of a process after it has slept for a while. 308 * Recalculate the priority of a process after it has slept for a while.
311 */ 309 */
312static void 310static void
313updatepri(struct lwp *l) 311updatepri(struct lwp *l)
314{ 312{
315 fixpt_t loadfac; 313 fixpt_t loadfac;
316 314
317 KASSERT(lwp_locked(l, NULL)); 315 KASSERT(lwp_locked(l, NULL));
318 KASSERT(l->l_slptime > 1); 316 KASSERT(l->l_slptime > 1);
319 317
320 loadfac = loadfactor(averunnable.ldavg[0]); 318 loadfac = loadfactor(averunnable.ldavg[0]);
321 319
322 l->l_slptime--; /* the first time was done in sched_pstats */ 320 l->l_slptime--; /* the first time was done in sched_pstats */
323 l->l_estcpu = decay_cpu_batch(loadfac, l->l_estcpu, l->l_slptime); 321 l->l_estcpu = decay_cpu_batch(loadfac, l->l_estcpu, l->l_slptime);
324 resetpriority(l); 322 resetpriority(l);
325} 323}
326 324
327void 325void
328sched_rqinit(void) 326sched_rqinit(void)
329{ 327{
330 328
331} 329}
332 330
333void 331void
334sched_setrunnable(struct lwp *l) 332sched_setrunnable(struct lwp *l)
335{ 333{
336 334
337 if (l->l_slptime > 1) 335 if (l->l_slptime > 1)
338 updatepri(l); 336 updatepri(l);
339} 337}
340 338
341void 339void
342sched_nice(struct proc *p, int n) 340sched_nice(struct proc *p, int n)
343{ 341{
344 struct lwp *l; 342 struct lwp *l;
345 343
346 KASSERT(mutex_owned(p->p_lock)); 344 KASSERT(mutex_owned(p->p_lock));
347 345
348 p->p_nice = n; 346 p->p_nice = n;
349 LIST_FOREACH(l, &p->p_lwps, l_sibling) { 347 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
350 lwp_lock(l); 348 lwp_lock(l);
351 resetpriority(l); 349 resetpriority(l);
352 lwp_unlock(l); 350 lwp_unlock(l);
353 } 351 }
354} 352}
355 353
356/* 354/*
357 * Recompute the priority of an LWP. Arrange to reschedule if 355 * Recompute the priority of an LWP. Arrange to reschedule if
358 * the resulting priority is better than that of the current LWP. 356 * the resulting priority is better than that of the current LWP.
359 */ 357 */
360static void 358static void
361resetpriority(struct lwp *l) 359resetpriority(struct lwp *l)
362{ 360{
363 pri_t pri; 361 pri_t pri;
364 struct proc *p = l->l_proc; 362 struct proc *p = l->l_proc;
365 363
366 KASSERT(lwp_locked(l, NULL)); 364 KASSERT(lwp_locked(l, NULL));
367 365
368 if (l->l_class != SCHED_OTHER) 366 if (l->l_class != SCHED_OTHER)
369 return; 367 return;
370 368
371 /* See comments above ESTCPU_SHIFT definition. */ 369 /* See comments above ESTCPU_SHIFT definition. */
372 pri = (PRI_KERNEL - 1) - (l->l_estcpu >> ESTCPU_SHIFT) - p->p_nice; 370 pri = (PRI_KERNEL - 1) - (l->l_estcpu >> ESTCPU_SHIFT) - p->p_nice;
373 pri = imax(pri, 0); 371 pri = imax(pri, 0);
374 if (pri != l->l_priority) 372 if (pri != l->l_priority)
375 lwp_changepri(l, pri); 373 lwp_changepri(l, pri);
376} 374}
377 375
378/* 376/*
379 * We adjust the priority of the current process. The priority of a process 377 * We adjust the priority of the current process. The priority of a process
380 * gets worse as it accumulates CPU time. The CPU usage estimator (l_estcpu) 378 * gets worse as it accumulates CPU time. The CPU usage estimator (l_estcpu)
381 * is increased here. The formula for computing priorities (in kern_synch.c) 379 * is increased here. The formula for computing priorities (in kern_synch.c)
382 * will compute a different value each time l_estcpu increases. This can 380 * will compute a different value each time l_estcpu increases. This can
383 * cause a switch, but unless the priority crosses a PPQ boundary the actual 381 * cause a switch, but unless the priority crosses a PPQ boundary the actual
384 * queue will not change. The CPU usage estimator ramps up quite quickly 382 * queue will not change. The CPU usage estimator ramps up quite quickly
385 * when the process is running (linearly), and decays away exponentially, at 383 * when the process is running (linearly), and decays away exponentially, at
386 * a rate which is proportionally slower when the system is busy. The basic 384 * a rate which is proportionally slower when the system is busy. The basic
387 * principle is that the system will 90% forget that the process used a lot 385 * principle is that the system will 90% forget that the process used a lot
388 * of CPU time in 5 * loadav seconds. This causes the system to favor 386 * of CPU time in 5 * loadav seconds. This causes the system to favor
389 * processes which haven't run much recently, and to round-robin among other 387 * processes which haven't run much recently, and to round-robin among other
390 * processes. 388 * processes.
391 */ 389 */
392 390
393void 391void
394sched_schedclock(struct lwp *l) 392sched_schedclock(struct lwp *l)
395{ 393{
396 394
397 if (l->l_class != SCHED_OTHER) 395 if (l->l_class != SCHED_OTHER)
398 return; 396 return;
399 397
400 KASSERT(!CURCPU_IDLE_P()); 398 KASSERT(!CURCPU_IDLE_P());
401 l->l_estcpu = ESTCPULIM(l->l_estcpu + ESTCPU_ACCUM); 399 l->l_estcpu = ESTCPULIM(l->l_estcpu + ESTCPU_ACCUM);
402 lwp_lock(l); 400 lwp_lock(l);
403 resetpriority(l); 401 resetpriority(l);
404 lwp_unlock(l); 402 lwp_unlock(l);
405} 403}
406 404
407/* 405/*
408 * sched_proc_fork: 406 * sched_proc_fork:
409 * 407 *
410 * Inherit the parent's scheduler history. 408 * Inherit the parent's scheduler history.
411 */ 409 */
412void 410void
413sched_proc_fork(struct proc *parent, struct proc *child) 411sched_proc_fork(struct proc *parent, struct proc *child)
414{ 412{
415 lwp_t *pl; 413 lwp_t *pl;
416 414
417 KASSERT(mutex_owned(parent->p_lock)); 415 KASSERT(mutex_owned(parent->p_lock));
418 416
419 pl = LIST_FIRST(&parent->p_lwps); 417 pl = LIST_FIRST(&parent->p_lwps);
420 child->p_estcpu_inherited = pl->l_estcpu; 418 child->p_estcpu_inherited = pl->l_estcpu;
421 child->p_forktime = sched_pstats_ticks; 419 child->p_forktime = sched_pstats_ticks;
422} 420}
423 421
424/* 422/*
425 * sched_proc_exit: 423 * sched_proc_exit:
426 * 424 *
427 * Chargeback parents for the sins of their children. 425 * Chargeback parents for the sins of their children.
428 */ 426 */
429void 427void
430sched_proc_exit(struct proc *parent, struct proc *child) 428sched_proc_exit(struct proc *parent, struct proc *child)
431{ 429{
432 fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); 430 fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
433 fixpt_t estcpu; 431 fixpt_t estcpu;
434 lwp_t *pl, *cl; 432 lwp_t *pl, *cl;
435 433
436 /* XXX Only if parent != init?? */ 434 /* XXX Only if parent != init?? */
437 435
438 mutex_enter(parent->p_lock); 436 mutex_enter(parent->p_lock);
439 pl = LIST_FIRST(&parent->p_lwps); 437 pl = LIST_FIRST(&parent->p_lwps);
440 cl = LIST_FIRST(&child->p_lwps); 438 cl = LIST_FIRST(&child->p_lwps);
441 estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited, 439 estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
442 sched_pstats_ticks - child->p_forktime); 440 sched_pstats_ticks - child->p_forktime);
443 if (cl->l_estcpu > estcpu) { 441 if (cl->l_estcpu > estcpu) {
444 lwp_lock(pl); 442 lwp_lock(pl);
445 pl->l_estcpu = ESTCPULIM(pl->l_estcpu + cl->l_estcpu - estcpu); 443 pl->l_estcpu = ESTCPULIM(pl->l_estcpu + cl->l_estcpu - estcpu);
446 lwp_unlock(pl); 444 lwp_unlock(pl);
447 } 445 }
448 mutex_exit(parent->p_lock); 446 mutex_exit(parent->p_lock);
449} 447}
450 448
451void 449void
452sched_wakeup(struct lwp *l) 450sched_wakeup(struct lwp *l)
453{ 451{
454 452
455} 453}
456 454
457void 455void
458sched_slept(struct lwp *l) 456sched_slept(struct lwp *l)
459{ 457{
460 458
461} 459}
462 460
463void 461void
464sched_lwp_fork(struct lwp *l1, struct lwp *l2) 462sched_lwp_fork(struct lwp *l1, struct lwp *l2)
465{ 463{
466 464
467 l2->l_estcpu = l1->l_estcpu; 465 l2->l_estcpu = l1->l_estcpu;
468} 466}
469 467
470void 468void
471sched_lwp_collect(struct lwp *t) 469sched_lwp_collect(struct lwp *t)
472{ 470{
473 lwp_t *l; 471 lwp_t *l;
474 472
475 /* Absorb estcpu value of collected LWP. */ 473 /* Absorb estcpu value of collected LWP. */
476 l = curlwp; 474 l = curlwp;
477 lwp_lock(l); 475 lwp_lock(l);
478 l->l_estcpu += t->l_estcpu; 476 l->l_estcpu += t->l_estcpu;
479 lwp_unlock(l); 477 lwp_unlock(l);
480} 478}
481 479
482void 480void
483sched_oncpu(lwp_t *l) 481sched_oncpu(lwp_t *l)
484{ 482{
485 483
486} 484}
487 485
488void 486void
489sched_newts(lwp_t *l) 487sched_newts(lwp_t *l)
490{ 488{
491 489
492} 490}
493 491
494/* 492/*
495 * Sysctl nodes and initialization. 493 * Sysctl nodes and initialization.
496 */ 494 */
497 495
498static int 496static int
499sysctl_sched_rtts(SYSCTLFN_ARGS) 497sysctl_sched_rtts(SYSCTLFN_ARGS)
500{ 498{
501 struct sysctlnode node; 499 struct sysctlnode node;
502 int rttsms = hztoms(rrticks); 500 int rttsms = hztoms(rrticks);
503 501
504 node = *rnode; 502 node = *rnode;
505 node.sysctl_data = &rttsms; 503 node.sysctl_data = &rttsms;
506 return sysctl_lookup(SYSCTLFN_CALL(&node)); 504 return sysctl_lookup(SYSCTLFN_CALL(&node));
507} 505}
508 506
509SYSCTL_SETUP(sysctl_sched_4bsd_setup, "sysctl sched setup") 507SYSCTL_SETUP(sysctl_sched_4bsd_setup, "sysctl sched setup")
510{ 508{
511 const struct sysctlnode *node = NULL; 509 const struct sysctlnode *node = NULL;
512 510
513 sysctl_createv(clog, 0, NULL, NULL, 511 sysctl_createv(clog, 0, NULL, NULL,
514 CTLFLAG_PERMANENT, 512 CTLFLAG_PERMANENT,
515 CTLTYPE_NODE, "kern", NULL, 513 CTLTYPE_NODE, "kern", NULL,
516 NULL, 0, NULL, 0, 514 NULL, 0, NULL, 0,
517 CTL_KERN, CTL_EOL); 515 CTL_KERN, CTL_EOL);
518 sysctl_createv(clog, 0, NULL, &node, 516 sysctl_createv(clog, 0, NULL, &node,
519 CTLFLAG_PERMANENT, 517 CTLFLAG_PERMANENT,
520 CTLTYPE_NODE, "sched", 518 CTLTYPE_NODE, "sched",
521 SYSCTL_DESCR("Scheduler options"), 519 SYSCTL_DESCR("Scheduler options"),
522 NULL, 0, NULL, 0, 520 NULL, 0, NULL, 0,
523 CTL_KERN, CTL_CREATE, CTL_EOL); 521 CTL_KERN, CTL_CREATE, CTL_EOL);
524 522
525 if (node == NULL) 523 if (node == NULL)
526 return; 524 return;
527 525
528 rrticks = hz / 10; 526 rrticks = hz / 10;
529 527
530 sysctl_createv(NULL, 0, &node, NULL, 528 sysctl_createv(NULL, 0, &node, NULL,
531 CTLFLAG_PERMANENT, 529 CTLFLAG_PERMANENT,
532 CTLTYPE_STRING, "name", NULL, 530 CTLTYPE_STRING, "name", NULL,
533 NULL, 0, __UNCONST("4.4BSD"), 0, 531 NULL, 0, __UNCONST("4.4BSD"), 0,
534 CTL_CREATE, CTL_EOL); 532 CTL_CREATE, CTL_EOL);
535 sysctl_createv(NULL, 0, &node, NULL, 533 sysctl_createv(NULL, 0, &node, NULL,
536 CTLFLAG_PERMANENT, 534 CTLFLAG_PERMANENT,
537 CTLTYPE_INT, "rtts", 535 CTLTYPE_INT, "rtts",
 538 SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"), 536 SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
539 sysctl_sched_rtts, 0, NULL, 0, 537 sysctl_sched_rtts, 0, NULL, 0,
540 CTL_CREATE, CTL_EOL); 538 CTL_CREATE, CTL_EOL);
541} 539}
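
The decay arithmetic proved in the comment block above is easy to check
numerically. Below is a minimal user-space sketch, not part of the change
itself; FSHIFT/FSCALE are assumed to match sys/param.h (11 and 2048), and
the starting estcpu value is invented for illustration.

    #include <stdint.h>
    #include <stdio.h>

    #define FSHIFT  11              /* assumed, as in sys/param.h */
    #define FSCALE  (1 << FSHIFT)

    typedef uint32_t fixpt_t;

    /* one decay step: estcpu *= b/(b+1), with loadfac = b in fixed point */
    static fixpt_t
    decay(fixpt_t loadfac, fixpt_t estcpu)
    {
        return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
    }

    int
    main(void)
    {
        fixpt_t loadav = 1 * FSCALE;    /* load average 1.0 */
        fixpt_t loadfac = 2 * loadav;   /* b = 2 * loadav */
        fixpt_t estcpu = 1000;          /* invented starting value */
        int t;

        for (t = 1; t <= 5; t++) {
            estcpu = decay(loadfac, estcpu);
            printf("t=%d estcpu=%u\n", t, estcpu);
        }
        return 0;
    }

At loadav 1.0 the per-tick factor is b/(b+1) = 2/3, so the sequence runs
1000, 666, 444, 296, 197, 131: about 87% forgotten after five ticks, in
line with the table above, which puts a full 90% decay at 5.68 ticks.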

cvs diff -r1.13 -r1.14 src/sys/kern/subr_percpu.c

--- src/sys/kern/subr_percpu.c 2011/05/13 22:16:44 1.13
+++ src/sys/kern/subr_percpu.c 2011/07/27 14:35:34 1.14
@@ -1,370 +1,368 @@ @@ -1,370 +1,368 @@
1/* $NetBSD: subr_percpu.c,v 1.13 2011/05/13 22:16:44 rmind Exp $ */ 1/* $NetBSD: subr_percpu.c,v 1.14 2011/07/27 14:35:34 uebayasi Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c)2007,2008 YAMAMOTO Takashi, 4 * Copyright (c)2007,2008 YAMAMOTO Takashi,
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE. 26 * SUCH DAMAGE.
27 */ 27 */
28 28
29/* 29/*
30 * per-cpu storage. 30 * per-cpu storage.
31 */ 31 */
32 32
33#include <sys/cdefs.h> 33#include <sys/cdefs.h>
34__KERNEL_RCSID(0, "$NetBSD: subr_percpu.c,v 1.13 2011/05/13 22:16:44 rmind Exp $"); 34__KERNEL_RCSID(0, "$NetBSD: subr_percpu.c,v 1.14 2011/07/27 14:35:34 uebayasi Exp $");
35 35
36#include <sys/param.h> 36#include <sys/param.h>
37#include <sys/cpu.h> 37#include <sys/cpu.h>
38#include <sys/kmem.h> 38#include <sys/kmem.h>
39#include <sys/kernel.h> 39#include <sys/kernel.h>
40#include <sys/mutex.h> 40#include <sys/mutex.h>
41#include <sys/percpu.h> 41#include <sys/percpu.h>
42#include <sys/rwlock.h> 42#include <sys/rwlock.h>
43#include <sys/vmem.h> 43#include <sys/vmem.h>
44#include <sys/xcall.h> 44#include <sys/xcall.h>
45 45
46#include <uvm/uvm_extern.h> 
47 
48#define PERCPU_QUANTUM_SIZE (ALIGNBYTES + 1) 46#define PERCPU_QUANTUM_SIZE (ALIGNBYTES + 1)
49#define PERCPU_QCACHE_MAX 0 47#define PERCPU_QCACHE_MAX 0
50#define PERCPU_IMPORT_SIZE 2048 48#define PERCPU_IMPORT_SIZE 2048
51 49
52#if defined(DIAGNOSTIC) 50#if defined(DIAGNOSTIC)
53#define MAGIC 0x50435055 /* "PCPU" */ 51#define MAGIC 0x50435055 /* "PCPU" */
54#define percpu_encrypt(pc) ((pc) ^ MAGIC) 52#define percpu_encrypt(pc) ((pc) ^ MAGIC)
55#define percpu_decrypt(pc) ((pc) ^ MAGIC) 53#define percpu_decrypt(pc) ((pc) ^ MAGIC)
56#else /* defined(DIAGNOSTIC) */ 54#else /* defined(DIAGNOSTIC) */
57#define percpu_encrypt(pc) (pc) 55#define percpu_encrypt(pc) (pc)
58#define percpu_decrypt(pc) (pc) 56#define percpu_decrypt(pc) (pc)
59#endif /* defined(DIAGNOSTIC) */ 57#endif /* defined(DIAGNOSTIC) */
60 58
61static krwlock_t percpu_swap_lock __cacheline_aligned; 59static krwlock_t percpu_swap_lock __cacheline_aligned;
62static kmutex_t percpu_allocation_lock __cacheline_aligned; 60static kmutex_t percpu_allocation_lock __cacheline_aligned;
63static vmem_t * percpu_offset_arena __cacheline_aligned; 61static vmem_t * percpu_offset_arena __cacheline_aligned;
64static unsigned int percpu_nextoff __cacheline_aligned; 62static unsigned int percpu_nextoff __cacheline_aligned;
65 63
66static percpu_cpu_t * 64static percpu_cpu_t *
67cpu_percpu(struct cpu_info *ci) 65cpu_percpu(struct cpu_info *ci)
68{ 66{
69 67
70 return &ci->ci_data.cpu_percpu; 68 return &ci->ci_data.cpu_percpu;
71} 69}
72 70
73static unsigned int 71static unsigned int
74percpu_offset(percpu_t *pc) 72percpu_offset(percpu_t *pc)
75{ 73{
76 const unsigned int off = percpu_decrypt((uintptr_t)pc); 74 const unsigned int off = percpu_decrypt((uintptr_t)pc);
77 75
78 KASSERT(off < percpu_nextoff); 76 KASSERT(off < percpu_nextoff);
79 return off; 77 return off;
80} 78}
81 79
82/* 80/*
83 * percpu_cpu_swap: crosscall handler for percpu_cpu_enlarge 81 * percpu_cpu_swap: crosscall handler for percpu_cpu_enlarge
84 */ 82 */
85 83
86static void 84static void
87percpu_cpu_swap(void *p1, void *p2) 85percpu_cpu_swap(void *p1, void *p2)
88{ 86{
89 struct cpu_info * const ci = p1; 87 struct cpu_info * const ci = p1;
90 percpu_cpu_t * const newpcc = p2; 88 percpu_cpu_t * const newpcc = p2;
91 percpu_cpu_t * const pcc = cpu_percpu(ci); 89 percpu_cpu_t * const pcc = cpu_percpu(ci);
92 90
93 KASSERT(ci == curcpu() || !mp_online); 91 KASSERT(ci == curcpu() || !mp_online);
94 92
95 /* 93 /*
96 * swap *pcc and *newpcc unless someone has beaten us to it. 94 * swap *pcc and *newpcc unless someone has beaten us to it.
97 */ 95 */
98 rw_enter(&percpu_swap_lock, RW_WRITER); 96 rw_enter(&percpu_swap_lock, RW_WRITER);
99 if (newpcc->pcc_size > pcc->pcc_size) { 97 if (newpcc->pcc_size > pcc->pcc_size) {
100 percpu_cpu_t tmp; 98 percpu_cpu_t tmp;
101 int s; 99 int s;
102 100
103 tmp = *pcc; 101 tmp = *pcc;
104 102
105 /* 103 /*
106 * block interrupts so that we don't lose their modifications. 104 * block interrupts so that we don't lose their modifications.
107 */ 105 */
108 106
109 s = splhigh(); 107 s = splhigh();
110 108
111 /* 109 /*
112 * copy data to new storage. 110 * copy data to new storage.
113 */ 111 */
114 112
115 memcpy(newpcc->pcc_data, pcc->pcc_data, pcc->pcc_size); 113 memcpy(newpcc->pcc_data, pcc->pcc_data, pcc->pcc_size);
116 114
117 /* 115 /*
118 * this assignment needs to be atomic for percpu_getptr_remote. 116 * this assignment needs to be atomic for percpu_getptr_remote.
119 */ 117 */
120 118
121 pcc->pcc_data = newpcc->pcc_data; 119 pcc->pcc_data = newpcc->pcc_data;
122 120
123 splx(s); 121 splx(s);
124 122
125 pcc->pcc_size = newpcc->pcc_size; 123 pcc->pcc_size = newpcc->pcc_size;
126 *newpcc = tmp; 124 *newpcc = tmp;
127 } 125 }
128 rw_exit(&percpu_swap_lock); 126 rw_exit(&percpu_swap_lock);
129} 127}
130 128
131/* 129/*
132 * percpu_cpu_enlarge: ensure that each cpu's percpu_cpu_t has enough space 130 * percpu_cpu_enlarge: ensure that each cpu's percpu_cpu_t has enough space
133 */ 131 */
134 132
135static void 133static void
136percpu_cpu_enlarge(size_t size) 134percpu_cpu_enlarge(size_t size)
137{ 135{
138 CPU_INFO_ITERATOR cii; 136 CPU_INFO_ITERATOR cii;
139 struct cpu_info *ci; 137 struct cpu_info *ci;
140 138
141 for (CPU_INFO_FOREACH(cii, ci)) { 139 for (CPU_INFO_FOREACH(cii, ci)) {
142 percpu_cpu_t pcc; 140 percpu_cpu_t pcc;
143 141
144 pcc.pcc_data = kmem_alloc(size, KM_SLEEP); /* XXX cacheline */ 142 pcc.pcc_data = kmem_alloc(size, KM_SLEEP); /* XXX cacheline */
145 pcc.pcc_size = size; 143 pcc.pcc_size = size;
146 if (!mp_online) { 144 if (!mp_online) {
147 percpu_cpu_swap(ci, &pcc); 145 percpu_cpu_swap(ci, &pcc);
148 } else { 146 } else {
149 uint64_t where; 147 uint64_t where;
150 148
151 where = xc_unicast(0, percpu_cpu_swap, ci, &pcc, ci); 149 where = xc_unicast(0, percpu_cpu_swap, ci, &pcc, ci);
152 xc_wait(where); 150 xc_wait(where);
153 } 151 }
154 KASSERT(pcc.pcc_size < size); 152 KASSERT(pcc.pcc_size < size);
155 if (pcc.pcc_data != NULL) { 153 if (pcc.pcc_data != NULL) {
156 kmem_free(pcc.pcc_data, pcc.pcc_size); 154 kmem_free(pcc.pcc_data, pcc.pcc_size);
157 } 155 }
158 } 156 }
159} 157}
160 158
161/* 159/*
162 * percpu_backend_alloc: vmem import callback for percpu_offset_arena 160 * percpu_backend_alloc: vmem import callback for percpu_offset_arena
163 */ 161 */
164 162
165static vmem_addr_t 163static vmem_addr_t
166percpu_backend_alloc(vmem_t *dummy, vmem_size_t size, vmem_size_t *resultsize, 164percpu_backend_alloc(vmem_t *dummy, vmem_size_t size, vmem_size_t *resultsize,
167 vm_flag_t vmflags) 165 vm_flag_t vmflags)
168{ 166{
169 unsigned int offset; 167 unsigned int offset;
170 unsigned int nextoff; 168 unsigned int nextoff;
171 169
172 ASSERT_SLEEPABLE(); 170 ASSERT_SLEEPABLE();
173 KASSERT(dummy == NULL); 171 KASSERT(dummy == NULL);
174 172
175 if ((vmflags & VM_NOSLEEP) != 0) 173 if ((vmflags & VM_NOSLEEP) != 0)
176 return VMEM_ADDR_NULL; 174 return VMEM_ADDR_NULL;
177 175
178 size = roundup(size, PERCPU_IMPORT_SIZE); 176 size = roundup(size, PERCPU_IMPORT_SIZE);
179 mutex_enter(&percpu_allocation_lock); 177 mutex_enter(&percpu_allocation_lock);
180 offset = percpu_nextoff; 178 offset = percpu_nextoff;
181 percpu_nextoff = nextoff = percpu_nextoff + size; 179 percpu_nextoff = nextoff = percpu_nextoff + size;
182 mutex_exit(&percpu_allocation_lock); 180 mutex_exit(&percpu_allocation_lock);
183 181
184 percpu_cpu_enlarge(nextoff); 182 percpu_cpu_enlarge(nextoff);
185 183
186 *resultsize = size; 184 *resultsize = size;
187 return (vmem_addr_t)offset; 185 return (vmem_addr_t)offset;
188} 186}
189 187
190static void 188static void
191percpu_zero_cb(void *vp, void *vp2, struct cpu_info *ci) 189percpu_zero_cb(void *vp, void *vp2, struct cpu_info *ci)
192{ 190{
193 size_t sz = (uintptr_t)vp2; 191 size_t sz = (uintptr_t)vp2;
194 192
195 memset(vp, 0, sz); 193 memset(vp, 0, sz);
196} 194}
197 195
198/* 196/*
199 * percpu_zero: initialize percpu storage with zero. 197 * percpu_zero: initialize percpu storage with zero.
200 */ 198 */
201 199
202static void 200static void
203percpu_zero(percpu_t *pc, size_t sz) 201percpu_zero(percpu_t *pc, size_t sz)
204{ 202{
205 203
206 percpu_foreach(pc, percpu_zero_cb, (void *)(uintptr_t)sz); 204 percpu_foreach(pc, percpu_zero_cb, (void *)(uintptr_t)sz);
207} 205}
208 206
209/* 207/*
210 * percpu_init: subsystem initialization 208 * percpu_init: subsystem initialization
211 */ 209 */
212 210
213void 211void
214percpu_init(void) 212percpu_init(void)
215{ 213{
216 214
217 ASSERT_SLEEPABLE(); 215 ASSERT_SLEEPABLE();
218 rw_init(&percpu_swap_lock); 216 rw_init(&percpu_swap_lock);
219 mutex_init(&percpu_allocation_lock, MUTEX_DEFAULT, IPL_NONE); 217 mutex_init(&percpu_allocation_lock, MUTEX_DEFAULT, IPL_NONE);
220 percpu_nextoff = PERCPU_QUANTUM_SIZE; 218 percpu_nextoff = PERCPU_QUANTUM_SIZE;
221 219
222 percpu_offset_arena = vmem_create("percpu", 0, 0, PERCPU_QUANTUM_SIZE, 220 percpu_offset_arena = vmem_create("percpu", 0, 0, PERCPU_QUANTUM_SIZE,
223 percpu_backend_alloc, NULL, NULL, PERCPU_QCACHE_MAX, VM_SLEEP, 221 percpu_backend_alloc, NULL, NULL, PERCPU_QCACHE_MAX, VM_SLEEP,
224 IPL_NONE); 222 IPL_NONE);
225} 223}
226 224
227/* 225/*
228 * percpu_init_cpu: cpu initialization 226 * percpu_init_cpu: cpu initialization
229 * 227 *
230 * => should be called before the cpu appears on the list for CPU_INFO_FOREACH. 228 * => should be called before the cpu appears on the list for CPU_INFO_FOREACH.
231 */ 229 */
232 230
233void 231void
234percpu_init_cpu(struct cpu_info *ci) 232percpu_init_cpu(struct cpu_info *ci)
235{ 233{
236 percpu_cpu_t * const pcc = cpu_percpu(ci); 234 percpu_cpu_t * const pcc = cpu_percpu(ci);
237 size_t size = percpu_nextoff; /* XXX racy */ 235 size_t size = percpu_nextoff; /* XXX racy */
238 236
239 ASSERT_SLEEPABLE(); 237 ASSERT_SLEEPABLE();
240 pcc->pcc_size = size; 238 pcc->pcc_size = size;
241 if (size) { 239 if (size) {
242 pcc->pcc_data = kmem_zalloc(pcc->pcc_size, KM_SLEEP); 240 pcc->pcc_data = kmem_zalloc(pcc->pcc_size, KM_SLEEP);
243 } 241 }
244} 242}
245 243
246/* 244/*
247 * percpu_alloc: allocate percpu storage 245 * percpu_alloc: allocate percpu storage
248 * 246 *
249 * => called in thread context. 247 * => called in thread context.
250 * => considered an expensive and rare operation. 248 * => considered an expensive and rare operation.
251 * => allocated storage is initialized with zeros. 249 * => allocated storage is initialized with zeros.
252 */ 250 */
253 251
254percpu_t * 252percpu_t *
255percpu_alloc(size_t size) 253percpu_alloc(size_t size)
256{ 254{
257 unsigned int offset; 255 unsigned int offset;
258 percpu_t *pc; 256 percpu_t *pc;
259 257
260 ASSERT_SLEEPABLE(); 258 ASSERT_SLEEPABLE();
261 offset = vmem_alloc(percpu_offset_arena, size, VM_SLEEP | VM_BESTFIT); 259 offset = vmem_alloc(percpu_offset_arena, size, VM_SLEEP | VM_BESTFIT);
262 pc = (percpu_t *)percpu_encrypt((uintptr_t)offset); 260 pc = (percpu_t *)percpu_encrypt((uintptr_t)offset);
263 percpu_zero(pc, size); 261 percpu_zero(pc, size);
264 return pc; 262 return pc;
265} 263}
266 264
267/* 265/*
268 * percpu_free: free percpu storage 266 * percpu_free: free percpu storage
269 * 267 *
270 * => called in thread context. 268 * => called in thread context.
271 * => considered an expensive and rare operation. 269 * => considered an expensive and rare operation.
272 */ 270 */
273 271
274void 272void
275percpu_free(percpu_t *pc, size_t size) 273percpu_free(percpu_t *pc, size_t size)
276{ 274{
277 275
278 ASSERT_SLEEPABLE(); 276 ASSERT_SLEEPABLE();
279 vmem_free(percpu_offset_arena, (vmem_addr_t)percpu_offset(pc), size); 277 vmem_free(percpu_offset_arena, (vmem_addr_t)percpu_offset(pc), size);
280} 278}
281 279
282/* 280/*
283 * percpu_getref: 281 * percpu_getref:
284 * 282 *
285 * => safe to be used in either thread or interrupt context 283 * => safe to be used in either thread or interrupt context
286 * => disables preemption; must be bracketed with a percpu_putref() 284 * => disables preemption; must be bracketed with a percpu_putref()
287 */ 285 */
288 286
289void * 287void *
290percpu_getref(percpu_t *pc) 288percpu_getref(percpu_t *pc)
291{ 289{
292 290
293 KPREEMPT_DISABLE(curlwp); 291 KPREEMPT_DISABLE(curlwp);
294 return percpu_getptr_remote(pc, curcpu()); 292 return percpu_getptr_remote(pc, curcpu());
295} 293}
296 294
297/* 295/*
298 * percpu_putref: 296 * percpu_putref:
299 * 297 *
300 * => drops the preemption-disabled count after caller is done with per-cpu 298 * => drops the preemption-disabled count after caller is done with per-cpu
301 * data 299 * data
302 */ 300 */
303 301
304void 302void
305percpu_putref(percpu_t *pc) 303percpu_putref(percpu_t *pc)
306{ 304{
307 305
308 KPREEMPT_ENABLE(curlwp); 306 KPREEMPT_ENABLE(curlwp);
309} 307}
310 308
311/* 309/*
312 * percpu_traverse_enter, percpu_traverse_exit, percpu_getptr_remote: 310 * percpu_traverse_enter, percpu_traverse_exit, percpu_getptr_remote:
313 * helpers to access remote cpu's percpu data. 311 * helpers to access remote cpu's percpu data.
314 * 312 *
315 * => called in thread context. 313 * => called in thread context.
316 * => percpu_traverse_enter can block low-priority xcalls. 314 * => percpu_traverse_enter can block low-priority xcalls.
317 * => typical usage would be: 315 * => typical usage would be:
318 * 316 *
319 * sum = 0; 317 * sum = 0;
320 * percpu_traverse_enter(); 318 * percpu_traverse_enter();
321 * for (CPU_INFO_FOREACH(cii, ci)) { 319 * for (CPU_INFO_FOREACH(cii, ci)) {
322 * unsigned int *p = percpu_getptr_remote(pc, ci); 320 * unsigned int *p = percpu_getptr_remote(pc, ci);
323 * sum += *p; 321 * sum += *p;
324 * } 322 * }
325 * percpu_traverse_exit(); 323 * percpu_traverse_exit();
326 */ 324 */
327 325
328void 326void
329percpu_traverse_enter(void) 327percpu_traverse_enter(void)
330{ 328{
331 329
332 ASSERT_SLEEPABLE(); 330 ASSERT_SLEEPABLE();
333 rw_enter(&percpu_swap_lock, RW_READER); 331 rw_enter(&percpu_swap_lock, RW_READER);
334} 332}
335 333
336void 334void
337percpu_traverse_exit(void) 335percpu_traverse_exit(void)
338{ 336{
339 337
340 rw_exit(&percpu_swap_lock); 338 rw_exit(&percpu_swap_lock);
341} 339}
342 340
343void * 341void *
344percpu_getptr_remote(percpu_t *pc, struct cpu_info *ci) 342percpu_getptr_remote(percpu_t *pc, struct cpu_info *ci)
345{ 343{
346 344
347 return &((char *)cpu_percpu(ci)->pcc_data)[percpu_offset(pc)]; 345 return &((char *)cpu_percpu(ci)->pcc_data)[percpu_offset(pc)];
348} 346}
349 347
350/* 348/*
351 * percpu_foreach: call the specified callback function for each cpu. 349 * percpu_foreach: call the specified callback function for each cpu.
352 * 350 *
353 * => called in thread context. 351 * => called in thread context.
354 * => caller should not rely on the cpu iteration order. 352 * => caller should not rely on the cpu iteration order.
355 * => the callback function should be minimal because it is executed while 353 * => the callback function should be minimal because it is executed while
356 * holding a global lock, which can block low-priority xcalls. 354 * holding a global lock, which can block low-priority xcalls.
357 * e.g. it is illegal for a callback function to sleep for memory allocation. 355 * e.g. it is illegal for a callback function to sleep for memory allocation.
358 */ 356 */
359void 357void
360percpu_foreach(percpu_t *pc, percpu_callback_t cb, void *arg) 358percpu_foreach(percpu_t *pc, percpu_callback_t cb, void *arg)
361{ 359{
362 CPU_INFO_ITERATOR cii; 360 CPU_INFO_ITERATOR cii;
363 struct cpu_info *ci; 361 struct cpu_info *ci;
364 362
365 percpu_traverse_enter(); 363 percpu_traverse_enter();
366 for (CPU_INFO_FOREACH(cii, ci)) { 364 for (CPU_INFO_FOREACH(cii, ci)) {
367 (*cb)(percpu_getptr_remote(pc, ci), arg, ci); 365 (*cb)(percpu_getptr_remote(pc, ci), arg, ci);
368 } 366 }
369 percpu_traverse_exit(); 367 percpu_traverse_exit();
370} 368}
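
The getref/putref and foreach primitives above compose into, for example,
a lockless per-CPU event counter. The following is a hedged sketch only:
the ev_count_* names are hypothetical, and kernel context is assumed
(percpu_alloc and percpu_foreach from thread context, as documented above).

    #include <sys/types.h>
    #include <sys/percpu.h>

    static percpu_t *ev_count_pc;   /* hypothetical counter */

    void
    ev_count_init(void)
    {
        /* thread context; storage comes back zero-filled */
        ev_count_pc = percpu_alloc(sizeof(uint64_t));
    }

    void
    ev_count_bump(void)
    {
        uint64_t *p;

        /* disables preemption; ok from thread or interrupt context */
        p = percpu_getref(ev_count_pc);
        (*p)++;
        percpu_putref(ev_count_pc);
    }

    static void
    ev_count_sum_cb(void *p, void *arg, struct cpu_info *ci)
    {
        *(uint64_t *)arg += *(uint64_t *)p;
    }

    uint64_t
    ev_count_sum(void)
    {
        uint64_t sum = 0;

        /* non-sleeping callback, run once per cpu's slot */
        percpu_foreach(ev_count_pc, ev_count_sum_cb, &sum);
        return sum;
    }

Note the sum is only a snapshot; remote CPUs may keep incrementing their
slots while the traversal runs.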

cvs diff -r1.30 -r1.31 src/sys/kern/subr_workqueue.c

--- src/sys/kern/subr_workqueue.c 2009/11/11 14:54:40 1.30
+++ src/sys/kern/subr_workqueue.c 2011/07/27 14:35:34 1.31
@@ -1,303 +1,301 @@ @@ -1,303 +1,301 @@
1/* $NetBSD: subr_workqueue.c,v 1.30 2009/11/11 14:54:40 rmind Exp $ */ 1/* $NetBSD: subr_workqueue.c,v 1.31 2011/07/27 14:35:34 uebayasi Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c)2002, 2005, 2006, 2007 YAMAMOTO Takashi, 4 * Copyright (c)2002, 2005, 2006, 2007 YAMAMOTO Takashi,
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE. 26 * SUCH DAMAGE.
27 */ 27 */
28 28
29#include <sys/cdefs.h> 29#include <sys/cdefs.h>
30__KERNEL_RCSID(0, "$NetBSD: subr_workqueue.c,v 1.30 2009/11/11 14:54:40 rmind Exp $"); 30__KERNEL_RCSID(0, "$NetBSD: subr_workqueue.c,v 1.31 2011/07/27 14:35:34 uebayasi Exp $");
31 31
32#include <sys/param.h> 32#include <sys/param.h>
33#include <sys/cpu.h> 33#include <sys/cpu.h>
34#include <sys/systm.h> 34#include <sys/systm.h>
35#include <sys/kthread.h> 35#include <sys/kthread.h>
36#include <sys/kmem.h> 36#include <sys/kmem.h>
37#include <sys/proc.h> 37#include <sys/proc.h>
38#include <sys/workqueue.h> 38#include <sys/workqueue.h>
39#include <sys/mutex.h> 39#include <sys/mutex.h>
40#include <sys/condvar.h> 40#include <sys/condvar.h>
41#include <sys/queue.h> 41#include <sys/queue.h>
42 42
43#include <uvm/uvm_extern.h> 
44 
45typedef struct work_impl { 43typedef struct work_impl {
46 SIMPLEQ_ENTRY(work_impl) wk_entry; 44 SIMPLEQ_ENTRY(work_impl) wk_entry;
47} work_impl_t; 45} work_impl_t;
48 46
49SIMPLEQ_HEAD(workqhead, work_impl); 47SIMPLEQ_HEAD(workqhead, work_impl);
50 48
51struct workqueue_queue { 49struct workqueue_queue {
52 kmutex_t q_mutex; 50 kmutex_t q_mutex;
53 kcondvar_t q_cv; 51 kcondvar_t q_cv;
54 struct workqhead q_queue; 52 struct workqhead q_queue;
55 lwp_t *q_worker; 53 lwp_t *q_worker;
56}; 54};
57 55
58struct workqueue { 56struct workqueue {
59 void (*wq_func)(struct work *, void *); 57 void (*wq_func)(struct work *, void *);
60 void *wq_arg; 58 void *wq_arg;
61 int wq_flags; 59 int wq_flags;
62 60
63 const char *wq_name; 61 const char *wq_name;
64 pri_t wq_prio; 62 pri_t wq_prio;
65 void *wq_ptr; 63 void *wq_ptr;
66}; 64};
67 65
68#define WQ_SIZE (roundup2(sizeof(struct workqueue), coherency_unit)) 66#define WQ_SIZE (roundup2(sizeof(struct workqueue), coherency_unit))
69#define WQ_QUEUE_SIZE (roundup2(sizeof(struct workqueue_queue), coherency_unit)) 67#define WQ_QUEUE_SIZE (roundup2(sizeof(struct workqueue_queue), coherency_unit))
70 68
71#define POISON 0xaabbccdd 69#define POISON 0xaabbccdd
72 70
73static size_t 71static size_t
74workqueue_size(int flags) 72workqueue_size(int flags)
75{ 73{
76 74
77 return WQ_SIZE 75 return WQ_SIZE
78 + ((flags & WQ_PERCPU) != 0 ? ncpu : 1) * WQ_QUEUE_SIZE 76 + ((flags & WQ_PERCPU) != 0 ? ncpu : 1) * WQ_QUEUE_SIZE
79 + coherency_unit; 77 + coherency_unit;
80} 78}
81 79
82static struct workqueue_queue * 80static struct workqueue_queue *
83workqueue_queue_lookup(struct workqueue *wq, struct cpu_info *ci) 81workqueue_queue_lookup(struct workqueue *wq, struct cpu_info *ci)
84{ 82{
85 u_int idx = 0; 83 u_int idx = 0;
86 84
87 if (wq->wq_flags & WQ_PERCPU) { 85 if (wq->wq_flags & WQ_PERCPU) {
88 idx = ci ? cpu_index(ci) : cpu_index(curcpu()); 86 idx = ci ? cpu_index(ci) : cpu_index(curcpu());
89 } 87 }
90 88
91 return (void *)((uintptr_t)(wq) + WQ_SIZE + (idx * WQ_QUEUE_SIZE)); 89 return (void *)((uintptr_t)(wq) + WQ_SIZE + (idx * WQ_QUEUE_SIZE));
92} 90}
93 91
94static void 92static void
95workqueue_runlist(struct workqueue *wq, struct workqhead *list) 93workqueue_runlist(struct workqueue *wq, struct workqhead *list)
96{ 94{
97 work_impl_t *wk; 95 work_impl_t *wk;
98 work_impl_t *next; 96 work_impl_t *next;
99 97
100 /* 98 /*
101 * note that "list" is not a complete SIMPLEQ. 99 * note that "list" is not a complete SIMPLEQ.
102 */ 100 */
103 101
104 for (wk = SIMPLEQ_FIRST(list); wk != NULL; wk = next) { 102 for (wk = SIMPLEQ_FIRST(list); wk != NULL; wk = next) {
105 next = SIMPLEQ_NEXT(wk, wk_entry); 103 next = SIMPLEQ_NEXT(wk, wk_entry);
106 (*wq->wq_func)((void *)wk, wq->wq_arg); 104 (*wq->wq_func)((void *)wk, wq->wq_arg);
107 } 105 }
108} 106}
109 107
110static void 108static void
111workqueue_worker(void *cookie) 109workqueue_worker(void *cookie)
112{ 110{
113 struct workqueue *wq = cookie; 111 struct workqueue *wq = cookie;
114 struct workqueue_queue *q; 112 struct workqueue_queue *q;
115 113
116 /* find the workqueue of this kthread */ 114 /* find the workqueue of this kthread */
117 q = workqueue_queue_lookup(wq, curlwp->l_cpu); 115 q = workqueue_queue_lookup(wq, curlwp->l_cpu);
118 116
119 for (;;) { 117 for (;;) {
120 struct workqhead tmp; 118 struct workqhead tmp;
121 119
122 /* 120 /*
123 * we violate the abstraction of SIMPLEQ. 121 * we violate the abstraction of SIMPLEQ.
124 */ 122 */
125 123
126#if defined(DIAGNOSTIC) 124#if defined(DIAGNOSTIC)
127 tmp.sqh_last = (void *)POISON; 125 tmp.sqh_last = (void *)POISON;
128#endif /* defined(DIAGNOSTIC) */ 126#endif /* defined(DIAGNOSTIC) */
129 127
130 mutex_enter(&q->q_mutex); 128 mutex_enter(&q->q_mutex);
131 while (SIMPLEQ_EMPTY(&q->q_queue)) 129 while (SIMPLEQ_EMPTY(&q->q_queue))
132 cv_wait(&q->q_cv, &q->q_mutex); 130 cv_wait(&q->q_cv, &q->q_mutex);
133 tmp.sqh_first = q->q_queue.sqh_first; /* XXX */ 131 tmp.sqh_first = q->q_queue.sqh_first; /* XXX */
134 SIMPLEQ_INIT(&q->q_queue); 132 SIMPLEQ_INIT(&q->q_queue);
135 mutex_exit(&q->q_mutex); 133 mutex_exit(&q->q_mutex);
136 134
137 workqueue_runlist(wq, &tmp); 135 workqueue_runlist(wq, &tmp);
138 } 136 }
139} 137}
140 138
141static void 139static void
142workqueue_init(struct workqueue *wq, const char *name, 140workqueue_init(struct workqueue *wq, const char *name,
143 void (*callback_func)(struct work *, void *), void *callback_arg, 141 void (*callback_func)(struct work *, void *), void *callback_arg,
144 pri_t prio, int ipl) 142 pri_t prio, int ipl)
145{ 143{
146 144
147 wq->wq_prio = prio; 145 wq->wq_prio = prio;
148 wq->wq_name = name; 146 wq->wq_name = name;
149 wq->wq_func = callback_func; 147 wq->wq_func = callback_func;
150 wq->wq_arg = callback_arg; 148 wq->wq_arg = callback_arg;
151} 149}
152 150
153static int 151static int
154workqueue_initqueue(struct workqueue *wq, struct workqueue_queue *q, 152workqueue_initqueue(struct workqueue *wq, struct workqueue_queue *q,
155 int ipl, struct cpu_info *ci) 153 int ipl, struct cpu_info *ci)
156{ 154{
157 int error, ktf; 155 int error, ktf;
158 156
159 KASSERT(q->q_worker == NULL); 157 KASSERT(q->q_worker == NULL);
160 158
161 mutex_init(&q->q_mutex, MUTEX_DEFAULT, ipl); 159 mutex_init(&q->q_mutex, MUTEX_DEFAULT, ipl);
162 cv_init(&q->q_cv, wq->wq_name); 160 cv_init(&q->q_cv, wq->wq_name);
163 SIMPLEQ_INIT(&q->q_queue); 161 SIMPLEQ_INIT(&q->q_queue);
164 ktf = ((wq->wq_flags & WQ_MPSAFE) != 0 ? KTHREAD_MPSAFE : 0); 162 ktf = ((wq->wq_flags & WQ_MPSAFE) != 0 ? KTHREAD_MPSAFE : 0);
165 if (ci) { 163 if (ci) {
166 error = kthread_create(wq->wq_prio, ktf, ci, workqueue_worker, 164 error = kthread_create(wq->wq_prio, ktf, ci, workqueue_worker,
167 wq, &q->q_worker, "%s/%u", wq->wq_name, ci->ci_index); 165 wq, &q->q_worker, "%s/%u", wq->wq_name, ci->ci_index);
168 } else { 166 } else {
169 error = kthread_create(wq->wq_prio, ktf, ci, workqueue_worker, 167 error = kthread_create(wq->wq_prio, ktf, ci, workqueue_worker,
170 wq, &q->q_worker, "%s", wq->wq_name); 168 wq, &q->q_worker, "%s", wq->wq_name);
171 } 169 }
172 if (error != 0) { 170 if (error != 0) {
173 mutex_destroy(&q->q_mutex); 171 mutex_destroy(&q->q_mutex);
174 cv_destroy(&q->q_cv); 172 cv_destroy(&q->q_cv);
175 KASSERT(q->q_worker == NULL); 173 KASSERT(q->q_worker == NULL);
176 } 174 }
177 return error; 175 return error;
178} 176}
179 177
180struct workqueue_exitargs { 178struct workqueue_exitargs {
181 work_impl_t wqe_wk; 179 work_impl_t wqe_wk;
182 struct workqueue_queue *wqe_q; 180 struct workqueue_queue *wqe_q;
183}; 181};
184 182
185static void 183static void
186workqueue_exit(struct work *wk, void *arg) 184workqueue_exit(struct work *wk, void *arg)
187{ 185{
188 struct workqueue_exitargs *wqe = (void *)wk; 186 struct workqueue_exitargs *wqe = (void *)wk;
189 struct workqueue_queue *q = wqe->wqe_q; 187 struct workqueue_queue *q = wqe->wqe_q;
190 188
191 /* 189 /*
192 * only competition at this point is workqueue_finiqueue. 190 * only competition at this point is workqueue_finiqueue.
193 */ 191 */
194 192
195 KASSERT(q->q_worker == curlwp); 193 KASSERT(q->q_worker == curlwp);
196 KASSERT(SIMPLEQ_EMPTY(&q->q_queue)); 194 KASSERT(SIMPLEQ_EMPTY(&q->q_queue));
197 mutex_enter(&q->q_mutex); 195 mutex_enter(&q->q_mutex);
198 q->q_worker = NULL; 196 q->q_worker = NULL;
199 cv_signal(&q->q_cv); 197 cv_signal(&q->q_cv);
200 mutex_exit(&q->q_mutex); 198 mutex_exit(&q->q_mutex);
201 kthread_exit(0); 199 kthread_exit(0);
202} 200}
203 201
204static void 202static void
205workqueue_finiqueue(struct workqueue *wq, struct workqueue_queue *q) 203workqueue_finiqueue(struct workqueue *wq, struct workqueue_queue *q)
206{ 204{
207 struct workqueue_exitargs wqe; 205 struct workqueue_exitargs wqe;
208 206
209 KASSERT(wq->wq_func == workqueue_exit); 207 KASSERT(wq->wq_func == workqueue_exit);
210 208
211 wqe.wqe_q = q; 209 wqe.wqe_q = q;
212 KASSERT(SIMPLEQ_EMPTY(&q->q_queue)); 210 KASSERT(SIMPLEQ_EMPTY(&q->q_queue));
213 KASSERT(q->q_worker != NULL); 211 KASSERT(q->q_worker != NULL);
214 mutex_enter(&q->q_mutex); 212 mutex_enter(&q->q_mutex);
215 SIMPLEQ_INSERT_TAIL(&q->q_queue, &wqe.wqe_wk, wk_entry); 213 SIMPLEQ_INSERT_TAIL(&q->q_queue, &wqe.wqe_wk, wk_entry);
216 cv_signal(&q->q_cv); 214 cv_signal(&q->q_cv);
217 while (q->q_worker != NULL) { 215 while (q->q_worker != NULL) {
218 cv_wait(&q->q_cv, &q->q_mutex); 216 cv_wait(&q->q_cv, &q->q_mutex);
219 } 217 }
220 mutex_exit(&q->q_mutex); 218 mutex_exit(&q->q_mutex);
221 mutex_destroy(&q->q_mutex); 219 mutex_destroy(&q->q_mutex);
222 cv_destroy(&q->q_cv); 220 cv_destroy(&q->q_cv);
223} 221}
224 222
225/* --- */ 223/* --- */
226 224
227int 225int
228workqueue_create(struct workqueue **wqp, const char *name, 226workqueue_create(struct workqueue **wqp, const char *name,
229 void (*callback_func)(struct work *, void *), void *callback_arg, 227 void (*callback_func)(struct work *, void *), void *callback_arg,
230 pri_t prio, int ipl, int flags) 228 pri_t prio, int ipl, int flags)
231{ 229{
232 struct workqueue *wq; 230 struct workqueue *wq;
233 struct workqueue_queue *q; 231 struct workqueue_queue *q;
234 void *ptr; 232 void *ptr;
235 int error = 0; 233 int error = 0;
236 234
237 CTASSERT(sizeof(work_impl_t) <= sizeof(struct work)); 235 CTASSERT(sizeof(work_impl_t) <= sizeof(struct work));
238 236
239 ptr = kmem_zalloc(workqueue_size(flags), KM_SLEEP); 237 ptr = kmem_zalloc(workqueue_size(flags), KM_SLEEP);
240 wq = (void *)roundup2((uintptr_t)ptr, coherency_unit); 238 wq = (void *)roundup2((uintptr_t)ptr, coherency_unit);
241 wq->wq_ptr = ptr; 239 wq->wq_ptr = ptr;
242 wq->wq_flags = flags; 240 wq->wq_flags = flags;
243 241
244 workqueue_init(wq, name, callback_func, callback_arg, prio, ipl); 242 workqueue_init(wq, name, callback_func, callback_arg, prio, ipl);
245 243
246 if (flags & WQ_PERCPU) { 244 if (flags & WQ_PERCPU) {
247 struct cpu_info *ci; 245 struct cpu_info *ci;
248 CPU_INFO_ITERATOR cii; 246 CPU_INFO_ITERATOR cii;
249 247
250 /* create the work-queue for each CPU */ 248 /* create the work-queue for each CPU */
251 for (CPU_INFO_FOREACH(cii, ci)) { 249 for (CPU_INFO_FOREACH(cii, ci)) {
252 q = workqueue_queue_lookup(wq, ci); 250 q = workqueue_queue_lookup(wq, ci);
253 error = workqueue_initqueue(wq, q, ipl, ci); 251 error = workqueue_initqueue(wq, q, ipl, ci);
254 if (error) { 252 if (error) {
255 break; 253 break;
256 } 254 }
257 } 255 }
258 } else { 256 } else {
259 /* initialize a work-queue */ 257 /* initialize a work-queue */
260 q = workqueue_queue_lookup(wq, NULL); 258 q = workqueue_queue_lookup(wq, NULL);
261 error = workqueue_initqueue(wq, q, ipl, NULL); 259 error = workqueue_initqueue(wq, q, ipl, NULL);
262 } 260 }
263 261
264 if (error != 0) { 262 if (error != 0) {
265 workqueue_destroy(wq); 263 workqueue_destroy(wq);
266 } else { 264 } else {
267 *wqp = wq; 265 *wqp = wq;
268 } 266 }
269 267
270 return error; 268 return error;
271} 269}
272 270
273void 271void
274workqueue_destroy(struct workqueue *wq) 272workqueue_destroy(struct workqueue *wq)
275{ 273{
276 struct workqueue_queue *q; 274 struct workqueue_queue *q;
277 struct cpu_info *ci; 275 struct cpu_info *ci;
278 CPU_INFO_ITERATOR cii; 276 CPU_INFO_ITERATOR cii;
279 277
280 wq->wq_func = workqueue_exit; 278 wq->wq_func = workqueue_exit;
281 for (CPU_INFO_FOREACH(cii, ci)) { 279 for (CPU_INFO_FOREACH(cii, ci)) {
282 q = workqueue_queue_lookup(wq, ci); 280 q = workqueue_queue_lookup(wq, ci);
283 if (q->q_worker != NULL) { 281 if (q->q_worker != NULL) {
284 workqueue_finiqueue(wq, q); 282 workqueue_finiqueue(wq, q);
285 } 283 }
286 } 284 }
287 kmem_free(wq->wq_ptr, workqueue_size(wq->wq_flags)); 285 kmem_free(wq->wq_ptr, workqueue_size(wq->wq_flags));
288} 286}
289 287
290void 288void
291workqueue_enqueue(struct workqueue *wq, struct work *wk0, struct cpu_info *ci) 289workqueue_enqueue(struct workqueue *wq, struct work *wk0, struct cpu_info *ci)
292{ 290{
293 struct workqueue_queue *q; 291 struct workqueue_queue *q;
294 work_impl_t *wk = (void *)wk0; 292 work_impl_t *wk = (void *)wk0;
295 293
296 KASSERT(wq->wq_flags & WQ_PERCPU || ci == NULL); 294 KASSERT(wq->wq_flags & WQ_PERCPU || ci == NULL);
297 q = workqueue_queue_lookup(wq, ci); 295 q = workqueue_queue_lookup(wq, ci);
298 296
299 mutex_enter(&q->q_mutex); 297 mutex_enter(&q->q_mutex);
300 SIMPLEQ_INSERT_TAIL(&q->q_queue, wk, wk_entry); 298 SIMPLEQ_INSERT_TAIL(&q->q_queue, wk, wk_entry);
301 cv_signal(&q->q_cv); 299 cv_signal(&q->q_cv);
302 mutex_exit(&q->q_mutex); 300 mutex_exit(&q->q_mutex);
303} 301}
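
A typical consumer of this interface embeds a struct work in its own
record and recovers the record in the callback by pointer identity, much
as workqueue_exit() does above. This is a hedged sketch, not code from
the tree: the frob_* names are invented, and PRI_NONE/IPL_NONE are
assumed to be acceptable for the worker thread.

    #include <sys/workqueue.h>
    #include <sys/kmem.h>

    struct frob {
        struct work frb_wk;     /* first member, so the cast below works */
        int frb_arg;
    };

    static struct workqueue *frob_wq;

    static void
    frob_work(struct work *wk, void *arg)
    {
        struct frob *f = (struct frob *)wk;

        /* ... process f->frb_arg in worker-thread context ... */
        kmem_free(f, sizeof(*f));
    }

    int
    frob_init(void)
    {
        /* one MP-safe worker at default priority */
        return workqueue_create(&frob_wq, "frob", frob_work, NULL,
            PRI_NONE, IPL_NONE, WQ_MPSAFE);
    }

    void
    frob_defer(int arg)
    {
        struct frob *f = kmem_alloc(sizeof(*f), KM_SLEEP);

        f->frb_arg = arg;
        /* ci == NULL: this is not a WQ_PERCPU queue */
        workqueue_enqueue(frob_wq, &f->frb_wk, NULL);
    }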

cvs diff -r1.126 -r1.127 src/sys/kern/sys_generic.c

--- src/sys/kern/sys_generic.c 2011/04/10 15:45:33 1.126
+++ src/sys/kern/sys_generic.c 2011/07/27 14:35:34 1.127
@@ -1,677 +1,675 @@ @@ -1,677 +1,675 @@
1/* $NetBSD: sys_generic.c,v 1.126 2011/04/10 15:45:33 christos Exp $ */ 1/* $NetBSD: sys_generic.c,v 1.127 2011/07/27 14:35:34 uebayasi Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc. 4 * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran. 8 * by Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright 15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the 16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution. 17 * documentation and/or other materials provided with the distribution.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32/* 32/*
33 * Copyright (c) 1982, 1986, 1989, 1993 33 * Copyright (c) 1982, 1986, 1989, 1993
34 * The Regents of the University of California. All rights reserved. 34 * The Regents of the University of California. All rights reserved.
35 * (c) UNIX System Laboratories, Inc. 35 * (c) UNIX System Laboratories, Inc.
36 * All or some portions of this file are derived from material licensed 36 * All or some portions of this file are derived from material licensed
37 * to the University of California by American Telephone and Telegraph 37 * to the University of California by American Telephone and Telegraph
38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39 * the permission of UNIX System Laboratories, Inc. 39 * the permission of UNIX System Laboratories, Inc.
40 * 40 *
41 * Redistribution and use in source and binary forms, with or without 41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions 42 * modification, are permitted provided that the following conditions
43 * are met: 43 * are met:
44 * 1. Redistributions of source code must retain the above copyright 44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer. 45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright 46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the 47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution. 48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors 49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software 50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission. 51 * without specific prior written permission.
52 * 52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE. 63 * SUCH DAMAGE.
64 * 64 *
65 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95 65 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
66 */ 66 */
67 67
68/* 68/*
69 * System calls relating to files. 69 * System calls relating to files.
70 */ 70 */
71 71
72#include <sys/cdefs.h> 72#include <sys/cdefs.h>
73__KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.126 2011/04/10 15:45:33 christos Exp $"); 73__KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.127 2011/07/27 14:35:34 uebayasi Exp $");
74 74
75#include <sys/param.h> 75#include <sys/param.h>
76#include <sys/systm.h> 76#include <sys/systm.h>
77#include <sys/filedesc.h> 77#include <sys/filedesc.h>
78#include <sys/ioctl.h> 78#include <sys/ioctl.h>
79#include <sys/file.h> 79#include <sys/file.h>
80#include <sys/proc.h> 80#include <sys/proc.h>
81#include <sys/socketvar.h> 81#include <sys/socketvar.h>
82#include <sys/signalvar.h> 82#include <sys/signalvar.h>
83#include <sys/uio.h> 83#include <sys/uio.h>
84#include <sys/kernel.h> 84#include <sys/kernel.h>
85#include <sys/stat.h> 85#include <sys/stat.h>
86#include <sys/kmem.h> 86#include <sys/kmem.h>
87#include <sys/poll.h> 87#include <sys/poll.h>
88#include <sys/vnode.h> 88#include <sys/vnode.h>
89#include <sys/mount.h> 89#include <sys/mount.h>
90#include <sys/syscallargs.h> 90#include <sys/syscallargs.h>
91#include <sys/ktrace.h> 91#include <sys/ktrace.h>
92#include <sys/atomic.h> 92#include <sys/atomic.h>
93#include <sys/disklabel.h> 93#include <sys/disklabel.h>
94 94
95#include <uvm/uvm_extern.h> 
96 
97/* 95/*
98 * Read system call. 96 * Read system call.
99 */ 97 */
100/* ARGSUSED */ 98/* ARGSUSED */
101int 99int
102sys_read(struct lwp *l, const struct sys_read_args *uap, register_t *retval) 100sys_read(struct lwp *l, const struct sys_read_args *uap, register_t *retval)
103{ 101{
104 /* { 102 /* {
105 syscallarg(int) fd; 103 syscallarg(int) fd;
106 syscallarg(void *) buf; 104 syscallarg(void *) buf;
107 syscallarg(size_t) nbyte; 105 syscallarg(size_t) nbyte;
108 } */ 106 } */
109 file_t *fp; 107 file_t *fp;
110 int fd; 108 int fd;
111 109
112 fd = SCARG(uap, fd); 110 fd = SCARG(uap, fd);
113 111
114 if ((fp = fd_getfile(fd)) == NULL) 112 if ((fp = fd_getfile(fd)) == NULL)
115 return (EBADF); 113 return (EBADF);
116 114
117 if ((fp->f_flag & FREAD) == 0) { 115 if ((fp->f_flag & FREAD) == 0) {
118 fd_putfile(fd); 116 fd_putfile(fd);
119 return (EBADF); 117 return (EBADF);
120 } 118 }
121 119
122 /* dofileread() will unuse the descriptor for us */ 120 /* dofileread() will unuse the descriptor for us */
123 return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte), 121 return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
124 &fp->f_offset, FOF_UPDATE_OFFSET, retval)); 122 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
125} 123}
126 124
127int 125int
128dofileread(int fd, struct file *fp, void *buf, size_t nbyte, 126dofileread(int fd, struct file *fp, void *buf, size_t nbyte,
129 off_t *offset, int flags, register_t *retval) 127 off_t *offset, int flags, register_t *retval)
130{ 128{
131 struct iovec aiov; 129 struct iovec aiov;
132 struct uio auio; 130 struct uio auio;
133 size_t cnt; 131 size_t cnt;
134 int error; 132 int error;
135 lwp_t *l; 133 lwp_t *l;
136 134
137 l = curlwp; 135 l = curlwp;
138 136
139 aiov.iov_base = (void *)buf; 137 aiov.iov_base = (void *)buf;
140 aiov.iov_len = nbyte; 138 aiov.iov_len = nbyte;
141 auio.uio_iov = &aiov; 139 auio.uio_iov = &aiov;
142 auio.uio_iovcnt = 1; 140 auio.uio_iovcnt = 1;
143 auio.uio_resid = nbyte; 141 auio.uio_resid = nbyte;
144 auio.uio_rw = UIO_READ; 142 auio.uio_rw = UIO_READ;
145 auio.uio_vmspace = l->l_proc->p_vmspace; 143 auio.uio_vmspace = l->l_proc->p_vmspace;
146 144
147 /* 145 /*
148 * Reads return ssize_t because -1 is returned on error. Therefore 146 * Reads return ssize_t because -1 is returned on error. Therefore
149 * we must restrict the length to SSIZE_MAX to avoid garbage return 147 * we must restrict the length to SSIZE_MAX to avoid garbage return
150 * values. 148 * values.
151 */ 149 */
152 if (auio.uio_resid > SSIZE_MAX) { 150 if (auio.uio_resid > SSIZE_MAX) {
153 error = EINVAL; 151 error = EINVAL;
154 goto out; 152 goto out;
155 } 153 }
156 154
157 cnt = auio.uio_resid; 155 cnt = auio.uio_resid;
158 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags); 156 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
159 if (error) 157 if (error)
160 if (auio.uio_resid != cnt && (error == ERESTART || 158 if (auio.uio_resid != cnt && (error == ERESTART ||
161 error == EINTR || error == EWOULDBLOCK)) 159 error == EINTR || error == EWOULDBLOCK))
162 error = 0; 160 error = 0;
163 cnt -= auio.uio_resid; 161 cnt -= auio.uio_resid;
164 ktrgenio(fd, UIO_READ, buf, cnt, error); 162 ktrgenio(fd, UIO_READ, buf, cnt, error);
165 *retval = cnt; 163 *retval = cnt;
166 out: 164 out:
167 fd_putfile(fd); 165 fd_putfile(fd);
168 return (error); 166 return (error);
169} 167}
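
The error handling above is what gives read(2) its short-read semantics: if the descriptor's fo_read method transferred some bytes before being interrupted (ERESTART, EINTR, or EWOULDBLOCK), the error is suppressed and the partial count is returned instead. A minimal userland sketch, hypothetical and not part of this diff, of a caller coping with those short reads:

    #include <unistd.h>
    #include <errno.h>

    /* Read exactly len bytes unless EOF or a hard error intervenes. */
    ssize_t
    read_fully(int fd, void *buf, size_t len)
    {
            size_t done = 0;

            while (done < len) {
                    ssize_t n = read(fd, (char *)buf + done, len - done);
                    if (n == -1) {
                            if (errno == EINTR)
                                    continue;       /* retry interrupted reads */
                            return -1;
                    }
                    if (n == 0)
                            break;                  /* EOF */
                    done += n;
            }
            return (ssize_t)done;
    }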
170 168
171/* 169/*
172 * Scatter read system call. 170 * Scatter read system call.
173 */ 171 */
174int 172int
175sys_readv(struct lwp *l, const struct sys_readv_args *uap, register_t *retval) 173sys_readv(struct lwp *l, const struct sys_readv_args *uap, register_t *retval)
176{ 174{
177 /* { 175 /* {
178 syscallarg(int) fd; 176 syscallarg(int) fd;
179 syscallarg(const struct iovec *) iovp; 177 syscallarg(const struct iovec *) iovp;
180 syscallarg(int) iovcnt; 178 syscallarg(int) iovcnt;
181 } */ 179 } */
182 180
183 return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp), 181 return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
184 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval); 182 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
185} 183}
186 184
187int 185int
188do_filereadv(int fd, const struct iovec *iovp, int iovcnt, 186do_filereadv(int fd, const struct iovec *iovp, int iovcnt,
189 off_t *offset, int flags, register_t *retval) 187 off_t *offset, int flags, register_t *retval)
190{ 188{
191 struct uio auio; 189 struct uio auio;
192 struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; 190 struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
193 int i, error; 191 int i, error;
194 size_t cnt; 192 size_t cnt;
195 u_int iovlen; 193 u_int iovlen;
196 struct file *fp; 194 struct file *fp;
197 struct iovec *ktriov = NULL; 195 struct iovec *ktriov = NULL;
198 196
199 if (iovcnt == 0) 197 if (iovcnt == 0)
200 return EINVAL; 198 return EINVAL;
201 199
202 if ((fp = fd_getfile(fd)) == NULL) 200 if ((fp = fd_getfile(fd)) == NULL)
203 return EBADF; 201 return EBADF;
204 202
205 if ((fp->f_flag & FREAD) == 0) { 203 if ((fp->f_flag & FREAD) == 0) {
206 fd_putfile(fd); 204 fd_putfile(fd);
207 return EBADF; 205 return EBADF;
208 } 206 }
209 207
210 if (offset == NULL) 208 if (offset == NULL)
211 offset = &fp->f_offset; 209 offset = &fp->f_offset;
212 else { 210 else {
213 struct vnode *vp = fp->f_data; 211 struct vnode *vp = fp->f_data;
214 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { 212 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
215 error = ESPIPE; 213 error = ESPIPE;
216 goto out; 214 goto out;
217 } 215 }
218 /* 216 /*
 219 * Test whether the device is seekable. 217 * Test whether the device is seekable.
220 * XXX This works because no file systems actually 218 * XXX This works because no file systems actually
221 * XXX take any action on the seek operation. 219 * XXX take any action on the seek operation.
222 */ 220 */
223 error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred); 221 error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
224 if (error != 0) 222 if (error != 0)
225 goto out; 223 goto out;
226 } 224 }
227 225
228 iovlen = iovcnt * sizeof(struct iovec); 226 iovlen = iovcnt * sizeof(struct iovec);
229 if (flags & FOF_IOV_SYSSPACE) 227 if (flags & FOF_IOV_SYSSPACE)
230 iov = __UNCONST(iovp); 228 iov = __UNCONST(iovp);
231 else { 229 else {
232 iov = aiov; 230 iov = aiov;
233 if ((u_int)iovcnt > UIO_SMALLIOV) { 231 if ((u_int)iovcnt > UIO_SMALLIOV) {
234 if ((u_int)iovcnt > IOV_MAX) { 232 if ((u_int)iovcnt > IOV_MAX) {
235 error = EINVAL; 233 error = EINVAL;
236 goto out; 234 goto out;
237 } 235 }
238 iov = kmem_alloc(iovlen, KM_SLEEP); 236 iov = kmem_alloc(iovlen, KM_SLEEP);
239 if (iov == NULL) { 237 if (iov == NULL) {
240 error = ENOMEM; 238 error = ENOMEM;
241 goto out; 239 goto out;
242 } 240 }
243 needfree = iov; 241 needfree = iov;
244 } 242 }
245 error = copyin(iovp, iov, iovlen); 243 error = copyin(iovp, iov, iovlen);
246 if (error) 244 if (error)
247 goto done; 245 goto done;
248 } 246 }
249 247
250 auio.uio_iov = iov; 248 auio.uio_iov = iov;
251 auio.uio_iovcnt = iovcnt; 249 auio.uio_iovcnt = iovcnt;
252 auio.uio_rw = UIO_READ; 250 auio.uio_rw = UIO_READ;
253 auio.uio_vmspace = curproc->p_vmspace; 251 auio.uio_vmspace = curproc->p_vmspace;
254 252
255 auio.uio_resid = 0; 253 auio.uio_resid = 0;
256 for (i = 0; i < iovcnt; i++, iov++) { 254 for (i = 0; i < iovcnt; i++, iov++) {
257 auio.uio_resid += iov->iov_len; 255 auio.uio_resid += iov->iov_len;
258 /* 256 /*
259 * Reads return ssize_t because -1 is returned on error. 257 * Reads return ssize_t because -1 is returned on error.
260 * Therefore we must restrict the length to SSIZE_MAX to 258 * Therefore we must restrict the length to SSIZE_MAX to
261 * avoid garbage return values. 259 * avoid garbage return values.
262 */ 260 */
263 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) { 261 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
264 error = EINVAL; 262 error = EINVAL;
265 goto done; 263 goto done;
266 } 264 }
267 } 265 }
268 266
269 /* 267 /*
270 * if tracing, save a copy of iovec 268 * if tracing, save a copy of iovec
271 */ 269 */
272 if (ktrpoint(KTR_GENIO)) { 270 if (ktrpoint(KTR_GENIO)) {
273 ktriov = kmem_alloc(iovlen, KM_SLEEP); 271 ktriov = kmem_alloc(iovlen, KM_SLEEP);
274 if (ktriov != NULL) 272 if (ktriov != NULL)
275 memcpy(ktriov, auio.uio_iov, iovlen); 273 memcpy(ktriov, auio.uio_iov, iovlen);
276 } 274 }
277 275
278 cnt = auio.uio_resid; 276 cnt = auio.uio_resid;
279 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags); 277 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
280 if (error) 278 if (error)
281 if (auio.uio_resid != cnt && (error == ERESTART || 279 if (auio.uio_resid != cnt && (error == ERESTART ||
282 error == EINTR || error == EWOULDBLOCK)) 280 error == EINTR || error == EWOULDBLOCK))
283 error = 0; 281 error = 0;
284 cnt -= auio.uio_resid; 282 cnt -= auio.uio_resid;
285 *retval = cnt; 283 *retval = cnt;
286 284
287 if (ktriov != NULL) { 285 if (ktriov != NULL) {
288 ktrgeniov(fd, UIO_READ, ktriov, cnt, error); 286 ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
289 kmem_free(ktriov, iovlen); 287 kmem_free(ktriov, iovlen);
290 } 288 }
291 289
292 done: 290 done:
293 if (needfree) 291 if (needfree)
294 kmem_free(needfree, iovlen); 292 kmem_free(needfree, iovlen);
295 out: 293 out:
296 fd_putfile(fd); 294 fd_putfile(fd);
297 return (error); 295 return (error);
298} 296}
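
do_filereadv() copies the iovec array in from user space (on the stack for up to UIO_SMALLIOV entries, via kmem_alloc() for up to IOV_MAX) and rejects totals above SSIZE_MAX. A hypothetical userland example of the scatter-read interface this implements:

    #include <sys/uio.h>

    /* Fill a fixed header and a separate body buffer in one call. */
    ssize_t
    read_header_and_body(int fd, void *hdr, size_t hdrlen,
        void *body, size_t bodylen)
    {
            struct iovec iov[2];

            iov[0].iov_base = hdr;
            iov[0].iov_len = hdrlen;
            iov[1].iov_base = body;
            iov[1].iov_len = bodylen;

            /* The kernel copies this array in, sums the lengths, and
             * returns EINVAL if the total would exceed SSIZE_MAX. */
            return readv(fd, iov, 2);
    }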
299 297
300/* 298/*
301 * Write system call 299 * Write system call
302 */ 300 */
303int 301int
304sys_write(struct lwp *l, const struct sys_write_args *uap, register_t *retval) 302sys_write(struct lwp *l, const struct sys_write_args *uap, register_t *retval)
305{ 303{
306 /* { 304 /* {
307 syscallarg(int) fd; 305 syscallarg(int) fd;
308 syscallarg(const void *) buf; 306 syscallarg(const void *) buf;
309 syscallarg(size_t) nbyte; 307 syscallarg(size_t) nbyte;
310 } */ 308 } */
311 file_t *fp; 309 file_t *fp;
312 int fd; 310 int fd;
313 311
314 fd = SCARG(uap, fd); 312 fd = SCARG(uap, fd);
315 313
316 if ((fp = fd_getfile(fd)) == NULL) 314 if ((fp = fd_getfile(fd)) == NULL)
317 return (EBADF); 315 return (EBADF);
318 316
319 if ((fp->f_flag & FWRITE) == 0) { 317 if ((fp->f_flag & FWRITE) == 0) {
320 fd_putfile(fd); 318 fd_putfile(fd);
321 return (EBADF); 319 return (EBADF);
322 } 320 }
323 321
324 /* dofilewrite() will unuse the descriptor for us */ 322 /* dofilewrite() will unuse the descriptor for us */
325 return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte), 323 return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
326 &fp->f_offset, FOF_UPDATE_OFFSET, retval)); 324 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
327} 325}
328 326
329int 327int
330dofilewrite(int fd, struct file *fp, const void *buf, 328dofilewrite(int fd, struct file *fp, const void *buf,
331 size_t nbyte, off_t *offset, int flags, register_t *retval) 329 size_t nbyte, off_t *offset, int flags, register_t *retval)
332{ 330{
333 struct iovec aiov; 331 struct iovec aiov;
334 struct uio auio; 332 struct uio auio;
335 size_t cnt; 333 size_t cnt;
336 int error; 334 int error;
337 335
338 aiov.iov_base = __UNCONST(buf); /* XXXUNCONST kills const */ 336 aiov.iov_base = __UNCONST(buf); /* XXXUNCONST kills const */
339 aiov.iov_len = nbyte; 337 aiov.iov_len = nbyte;
340 auio.uio_iov = &aiov; 338 auio.uio_iov = &aiov;
341 auio.uio_iovcnt = 1; 339 auio.uio_iovcnt = 1;
342 auio.uio_resid = nbyte; 340 auio.uio_resid = nbyte;
343 auio.uio_rw = UIO_WRITE; 341 auio.uio_rw = UIO_WRITE;
344 auio.uio_vmspace = curproc->p_vmspace; 342 auio.uio_vmspace = curproc->p_vmspace;
345 343
346 /* 344 /*
347 * Writes return ssize_t because -1 is returned on error. Therefore 345 * Writes return ssize_t because -1 is returned on error. Therefore
348 * we must restrict the length to SSIZE_MAX to avoid garbage return 346 * we must restrict the length to SSIZE_MAX to avoid garbage return
349 * values. 347 * values.
350 */ 348 */
351 if (auio.uio_resid > SSIZE_MAX) { 349 if (auio.uio_resid > SSIZE_MAX) {
352 error = EINVAL; 350 error = EINVAL;
353 goto out; 351 goto out;
354 } 352 }
355 353
356 cnt = auio.uio_resid; 354 cnt = auio.uio_resid;
357 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags); 355 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
358 if (error) { 356 if (error) {
359 if (auio.uio_resid != cnt && (error == ERESTART || 357 if (auio.uio_resid != cnt && (error == ERESTART ||
360 error == EINTR || error == EWOULDBLOCK)) 358 error == EINTR || error == EWOULDBLOCK))
361 error = 0; 359 error = 0;
362 if (error == EPIPE) { 360 if (error == EPIPE) {
363 mutex_enter(proc_lock); 361 mutex_enter(proc_lock);
364 psignal(curproc, SIGPIPE); 362 psignal(curproc, SIGPIPE);
365 mutex_exit(proc_lock); 363 mutex_exit(proc_lock);
366 } 364 }
367 } 365 }
368 cnt -= auio.uio_resid; 366 cnt -= auio.uio_resid;
369 ktrgenio(fd, UIO_WRITE, buf, cnt, error); 367 ktrgenio(fd, UIO_WRITE, buf, cnt, error);
370 *retval = cnt; 368 *retval = cnt;
371 out: 369 out:
372 fd_putfile(fd); 370 fd_putfile(fd);
373 return (error); 371 return (error);
374} 372}
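
Note the EPIPE case above: the kernel posts SIGPIPE to the writing process in addition to returning the error. A hypothetical userland sketch of a writer that prefers seeing EPIPE over the default signal-induced termination:

    #include <signal.h>
    #include <errno.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    write_checked(int fd, const void *buf, size_t len)
    {
            /* With SIGPIPE ignored, the psignal() above has no effect
             * and write() simply fails with EPIPE. */
            signal(SIGPIPE, SIG_IGN);
            if (write(fd, buf, len) == -1 && errno == EPIPE) {
                    fprintf(stderr, "peer closed the pipe\n");
                    return -1;
            }
            return 0;
    }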
375 373
376/* 374/*
377 * Gather write system call 375 * Gather write system call
378 */ 376 */
379int 377int
380sys_writev(struct lwp *l, const struct sys_writev_args *uap, register_t *retval) 378sys_writev(struct lwp *l, const struct sys_writev_args *uap, register_t *retval)
381{ 379{
382 /* { 380 /* {
383 syscallarg(int) fd; 381 syscallarg(int) fd;
384 syscallarg(const struct iovec *) iovp; 382 syscallarg(const struct iovec *) iovp;
385 syscallarg(int) iovcnt; 383 syscallarg(int) iovcnt;
386 } */ 384 } */
387 385
388 return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp), 386 return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
389 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval); 387 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
390} 388}
391 389
392int 390int
393do_filewritev(int fd, const struct iovec *iovp, int iovcnt, 391do_filewritev(int fd, const struct iovec *iovp, int iovcnt,
394 off_t *offset, int flags, register_t *retval) 392 off_t *offset, int flags, register_t *retval)
395{ 393{
396 struct uio auio; 394 struct uio auio;
397 struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; 395 struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
398 int i, error; 396 int i, error;
399 size_t cnt; 397 size_t cnt;
400 u_int iovlen; 398 u_int iovlen;
401 struct file *fp; 399 struct file *fp;
402 struct iovec *ktriov = NULL; 400 struct iovec *ktriov = NULL;
403 401
404 if (iovcnt == 0) 402 if (iovcnt == 0)
405 return EINVAL; 403 return EINVAL;
406 404
407 if ((fp = fd_getfile(fd)) == NULL) 405 if ((fp = fd_getfile(fd)) == NULL)
408 return EBADF; 406 return EBADF;
409 407
410 if ((fp->f_flag & FWRITE) == 0) { 408 if ((fp->f_flag & FWRITE) == 0) {
411 fd_putfile(fd); 409 fd_putfile(fd);
412 return EBADF; 410 return EBADF;
413 } 411 }
414 412
415 if (offset == NULL) 413 if (offset == NULL)
416 offset = &fp->f_offset; 414 offset = &fp->f_offset;
417 else { 415 else {
418 struct vnode *vp = fp->f_data; 416 struct vnode *vp = fp->f_data;
419 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { 417 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
420 error = ESPIPE; 418 error = ESPIPE;
421 goto out; 419 goto out;
422 } 420 }
423 /* 421 /*
 424 * Test whether the device is seekable. 422 * Test whether the device is seekable.
425 * XXX This works because no file systems actually 423 * XXX This works because no file systems actually
426 * XXX take any action on the seek operation. 424 * XXX take any action on the seek operation.
427 */ 425 */
428 error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred); 426 error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
429 if (error != 0) 427 if (error != 0)
430 goto out; 428 goto out;
431 } 429 }
432 430
433 iovlen = iovcnt * sizeof(struct iovec); 431 iovlen = iovcnt * sizeof(struct iovec);
434 if (flags & FOF_IOV_SYSSPACE) 432 if (flags & FOF_IOV_SYSSPACE)
435 iov = __UNCONST(iovp); 433 iov = __UNCONST(iovp);
436 else { 434 else {
437 iov = aiov; 435 iov = aiov;
438 if ((u_int)iovcnt > UIO_SMALLIOV) { 436 if ((u_int)iovcnt > UIO_SMALLIOV) {
439 if ((u_int)iovcnt > IOV_MAX) { 437 if ((u_int)iovcnt > IOV_MAX) {
440 error = EINVAL; 438 error = EINVAL;
441 goto out; 439 goto out;
442 } 440 }
443 iov = kmem_alloc(iovlen, KM_SLEEP); 441 iov = kmem_alloc(iovlen, KM_SLEEP);
444 if (iov == NULL) { 442 if (iov == NULL) {
445 error = ENOMEM; 443 error = ENOMEM;
446 goto out; 444 goto out;
447 } 445 }
448 needfree = iov; 446 needfree = iov;
449 } 447 }
450 error = copyin(iovp, iov, iovlen); 448 error = copyin(iovp, iov, iovlen);
451 if (error) 449 if (error)
452 goto done; 450 goto done;
453 } 451 }
454 452
455 auio.uio_iov = iov; 453 auio.uio_iov = iov;
456 auio.uio_iovcnt = iovcnt; 454 auio.uio_iovcnt = iovcnt;
457 auio.uio_rw = UIO_WRITE; 455 auio.uio_rw = UIO_WRITE;
458 auio.uio_vmspace = curproc->p_vmspace; 456 auio.uio_vmspace = curproc->p_vmspace;
459 457
460 auio.uio_resid = 0; 458 auio.uio_resid = 0;
461 for (i = 0; i < iovcnt; i++, iov++) { 459 for (i = 0; i < iovcnt; i++, iov++) {
462 auio.uio_resid += iov->iov_len; 460 auio.uio_resid += iov->iov_len;
463 /* 461 /*
464 * Writes return ssize_t because -1 is returned on error. 462 * Writes return ssize_t because -1 is returned on error.
465 * Therefore we must restrict the length to SSIZE_MAX to 463 * Therefore we must restrict the length to SSIZE_MAX to
466 * avoid garbage return values. 464 * avoid garbage return values.
467 */ 465 */
468 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) { 466 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
469 error = EINVAL; 467 error = EINVAL;
470 goto done; 468 goto done;
471 } 469 }
472 } 470 }
473 471
474 /* 472 /*
475 * if tracing, save a copy of iovec 473 * if tracing, save a copy of iovec
476 */ 474 */
477 if (ktrpoint(KTR_GENIO)) { 475 if (ktrpoint(KTR_GENIO)) {
478 ktriov = kmem_alloc(iovlen, KM_SLEEP); 476 ktriov = kmem_alloc(iovlen, KM_SLEEP);
479 if (ktriov != NULL) 477 if (ktriov != NULL)
480 memcpy(ktriov, auio.uio_iov, iovlen); 478 memcpy(ktriov, auio.uio_iov, iovlen);
481 } 479 }
482 480
483 cnt = auio.uio_resid; 481 cnt = auio.uio_resid;
484 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags); 482 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
485 if (error) { 483 if (error) {
486 if (auio.uio_resid != cnt && (error == ERESTART || 484 if (auio.uio_resid != cnt && (error == ERESTART ||
487 error == EINTR || error == EWOULDBLOCK)) 485 error == EINTR || error == EWOULDBLOCK))
488 error = 0; 486 error = 0;
489 if (error == EPIPE) { 487 if (error == EPIPE) {
490 mutex_enter(proc_lock); 488 mutex_enter(proc_lock);
491 psignal(curproc, SIGPIPE); 489 psignal(curproc, SIGPIPE);
492 mutex_exit(proc_lock); 490 mutex_exit(proc_lock);
493 } 491 }
494 } 492 }
495 cnt -= auio.uio_resid; 493 cnt -= auio.uio_resid;
496 *retval = cnt; 494 *retval = cnt;
497 495
498 if (ktriov != NULL) { 496 if (ktriov != NULL) {
499 ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error); 497 ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
500 kmem_free(ktriov, iovlen); 498 kmem_free(ktriov, iovlen);
501 } 499 }
502 500
503 done: 501 done:
504 if (needfree) 502 if (needfree)
505 kmem_free(needfree, iovlen); 503 kmem_free(needfree, iovlen);
506 out: 504 out:
507 fd_putfile(fd); 505 fd_putfile(fd);
508 return (error); 506 return (error);
509} 507}
510 508
511/* 509/*
512 * Ioctl system call 510 * Ioctl system call
513 */ 511 */
514/* ARGSUSED */ 512/* ARGSUSED */
515int 513int
516sys_ioctl(struct lwp *l, const struct sys_ioctl_args *uap, register_t *retval) 514sys_ioctl(struct lwp *l, const struct sys_ioctl_args *uap, register_t *retval)
517{ 515{
518 /* { 516 /* {
519 syscallarg(int) fd; 517 syscallarg(int) fd;
520 syscallarg(u_long) com; 518 syscallarg(u_long) com;
521 syscallarg(void *) data; 519 syscallarg(void *) data;
522 } */ 520 } */
523 struct file *fp; 521 struct file *fp;
524 proc_t *p; 522 proc_t *p;
525 u_long com; 523 u_long com;
526 int error; 524 int error;
527 size_t size, alloc_size; 525 size_t size, alloc_size;
528 void *data, *memp; 526 void *data, *memp;
529#define STK_PARAMS 128 527#define STK_PARAMS 128
530 u_long stkbuf[STK_PARAMS/sizeof(u_long)]; 528 u_long stkbuf[STK_PARAMS/sizeof(u_long)];
531 529
532 memp = NULL; 530 memp = NULL;
533 alloc_size = 0; 531 alloc_size = 0;
534 error = 0; 532 error = 0;
535 p = l->l_proc; 533 p = l->l_proc;
536 534
537 if ((fp = fd_getfile(SCARG(uap, fd))) == NULL) 535 if ((fp = fd_getfile(SCARG(uap, fd))) == NULL)
538 return (EBADF); 536 return (EBADF);
539 537
540 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 538 if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
541 error = EBADF; 539 error = EBADF;
542 com = 0; 540 com = 0;
543 goto out; 541 goto out;
544 } 542 }
545 543
546 switch (com = SCARG(uap, com)) { 544 switch (com = SCARG(uap, com)) {
547 case FIONCLEX: 545 case FIONCLEX:
548 case FIOCLEX: 546 case FIOCLEX:
549 fd_set_exclose(l, SCARG(uap, fd), com == FIOCLEX); 547 fd_set_exclose(l, SCARG(uap, fd), com == FIOCLEX);
550 goto out; 548 goto out;
551 } 549 }
552 550
553 /* 551 /*
554 * Interpret high order word to find amount of data to be 552 * Interpret high order word to find amount of data to be
555 * copied to/from the user's address space. 553 * copied to/from the user's address space.
556 */ 554 */
557 size = IOCPARM_LEN(com); 555 size = IOCPARM_LEN(com);
558 alloc_size = size; 556 alloc_size = size;
559 557
560 /* 558 /*
561 * The disklabel is now padded to a multiple of 8 bytes however the old 559 * The disklabel is now padded to a multiple of 8 bytes however the old
562 * disklabel on 32bit platforms wasn't. This leaves a difference in 560 * disklabel on 32bit platforms wasn't. This leaves a difference in
 563 * size of 4 bytes between the two, but they are otherwise identical. 561 * size of 4 bytes between the two, but they are otherwise identical.
564 * To deal with this, we allocate enough space for the new disklabel 562 * To deal with this, we allocate enough space for the new disklabel
565 * but only copyin/out the smaller amount. 563 * but only copyin/out the smaller amount.
566 */ 564 */
567 if (IOCGROUP(com) == 'd') { 565 if (IOCGROUP(com) == 'd') {
568 u_long ncom = com ^ (DIOCGDINFO ^ DIOCGDINFO32); 566 u_long ncom = com ^ (DIOCGDINFO ^ DIOCGDINFO32);
569 switch (ncom) { 567 switch (ncom) {
570 case DIOCGDINFO: 568 case DIOCGDINFO:
571 case DIOCWDINFO: 569 case DIOCWDINFO:
572 case DIOCSDINFO: 570 case DIOCSDINFO:
573 case DIOCGDEFLABEL: 571 case DIOCGDEFLABEL:
574 com = ncom; 572 com = ncom;
575 if (IOCPARM_LEN(DIOCGDINFO32) < IOCPARM_LEN(DIOCGDINFO)) 573 if (IOCPARM_LEN(DIOCGDINFO32) < IOCPARM_LEN(DIOCGDINFO))
576 alloc_size = IOCPARM_LEN(DIOCGDINFO); 574 alloc_size = IOCPARM_LEN(DIOCGDINFO);
577 break; 575 break;
578 } 576 }
579 } 577 }
580 if (size > IOCPARM_MAX) { 578 if (size > IOCPARM_MAX) {
581 error = ENOTTY; 579 error = ENOTTY;
582 goto out; 580 goto out;
583 } 581 }
584 memp = NULL; 582 memp = NULL;
585 if ((com >> IOCPARM_SHIFT) == 0) { 583 if ((com >> IOCPARM_SHIFT) == 0) {
586 /* UNIX-style ioctl. */ 584 /* UNIX-style ioctl. */
587 data = SCARG(uap, data); 585 data = SCARG(uap, data);
588 } else { 586 } else {
589 if (alloc_size > sizeof(stkbuf)) { 587 if (alloc_size > sizeof(stkbuf)) {
590 memp = kmem_alloc(alloc_size, KM_SLEEP); 588 memp = kmem_alloc(alloc_size, KM_SLEEP);
591 data = memp; 589 data = memp;
592 } else { 590 } else {
593 data = (void *)stkbuf; 591 data = (void *)stkbuf;
594 } 592 }
595 if (com&IOC_IN) { 593 if (com&IOC_IN) {
596 if (size) { 594 if (size) {
597 error = copyin(SCARG(uap, data), data, size); 595 error = copyin(SCARG(uap, data), data, size);
598 if (error) { 596 if (error) {
599 goto out; 597 goto out;
600 } 598 }
601 /* 599 /*
602 * The data between size and alloc_size has 600 * The data between size and alloc_size has
603 * not been overwritten. It shouldn't matter 601 * not been overwritten. It shouldn't matter
604 * but let's clear that anyway. 602 * but let's clear that anyway.
605 */ 603 */
606 if (__predict_false(size < alloc_size)) { 604 if (__predict_false(size < alloc_size)) {
607 memset((char *)data+size, 0, 605 memset((char *)data+size, 0,
608 alloc_size - size); 606 alloc_size - size);
609 } 607 }
610 ktrgenio(SCARG(uap, fd), UIO_WRITE, 608 ktrgenio(SCARG(uap, fd), UIO_WRITE,
611 SCARG(uap, data), size, 0); 609 SCARG(uap, data), size, 0);
612 } else { 610 } else {
613 *(void **)data = SCARG(uap, data); 611 *(void **)data = SCARG(uap, data);
614 } 612 }
615 } else if ((com&IOC_OUT) && size) { 613 } else if ((com&IOC_OUT) && size) {
616 /* 614 /*
617 * Zero the buffer so the user always 615 * Zero the buffer so the user always
618 * gets back something deterministic. 616 * gets back something deterministic.
619 */ 617 */
620 memset(data, 0, size); 618 memset(data, 0, size);
621 } else if (com&IOC_VOID) { 619 } else if (com&IOC_VOID) {
622 *(void **)data = SCARG(uap, data); 620 *(void **)data = SCARG(uap, data);
623 } 621 }
624 } 622 }
625 623
626 switch (com) { 624 switch (com) {
627 625
628 case FIONBIO: 626 case FIONBIO:
629 /* XXX Code block is not atomic */ 627 /* XXX Code block is not atomic */
630 if (*(int *)data != 0) 628 if (*(int *)data != 0)
631 atomic_or_uint(&fp->f_flag, FNONBLOCK); 629 atomic_or_uint(&fp->f_flag, FNONBLOCK);
632 else 630 else
633 atomic_and_uint(&fp->f_flag, ~FNONBLOCK); 631 atomic_and_uint(&fp->f_flag, ~FNONBLOCK);
634 error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data); 632 error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data);
635 break; 633 break;
636 634
637 case FIOASYNC: 635 case FIOASYNC:
638 /* XXX Code block is not atomic */ 636 /* XXX Code block is not atomic */
639 if (*(int *)data != 0) 637 if (*(int *)data != 0)
640 atomic_or_uint(&fp->f_flag, FASYNC); 638 atomic_or_uint(&fp->f_flag, FASYNC);
641 else 639 else
642 atomic_and_uint(&fp->f_flag, ~FASYNC); 640 atomic_and_uint(&fp->f_flag, ~FASYNC);
643 error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data); 641 error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data);
644 break; 642 break;
645 643
646 default: 644 default:
647 error = (*fp->f_ops->fo_ioctl)(fp, com, data); 645 error = (*fp->f_ops->fo_ioctl)(fp, com, data);
648 /* 646 /*
649 * Copy any data to user, size was 647 * Copy any data to user, size was
650 * already set and checked above. 648 * already set and checked above.
651 */ 649 */
652 if (error == 0 && (com&IOC_OUT) && size) { 650 if (error == 0 && (com&IOC_OUT) && size) {
653 error = copyout(data, SCARG(uap, data), size); 651 error = copyout(data, SCARG(uap, data), size);
654 ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data), 652 ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
655 size, error); 653 size, error);
656 } 654 }
657 break; 655 break;
658 } 656 }
659 out: 657 out:
660 if (memp) 658 if (memp)
661 kmem_free(memp, alloc_size); 659 kmem_free(memp, alloc_size);
662 fd_putfile(SCARG(uap, fd)); 660 fd_putfile(SCARG(uap, fd));
663 switch (error) { 661 switch (error) {
664 case -1: 662 case -1:
665 printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: " 663 printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
666 "pid=%d comm=%s\n", 664 "pid=%d comm=%s\n",
667 (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "", 665 (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
668 (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com), 666 (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
669 p->p_pid, p->p_comm); 667 p->p_pid, p->p_comm);
670 /* FALLTHROUGH */ 668 /* FALLTHROUGH */
671 case EPASSTHROUGH: 669 case EPASSTHROUGH:
672 error = ENOTTY; 670 error = ENOTTY;
673 /* FALLTHROUGH */ 671 /* FALLTHROUGH */
674 default: 672 default:
675 return (error); 673 return (error);
676 } 674 }
677} 675}
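
sys_ioctl() decodes the direction (IOC_IN/IOC_OUT/IOC_VOID) and parameter size (IOCPARM_LEN) from the command word and performs the copyin/copyout on the handler's behalf. A hypothetical userland example using FIONBIO, an IOC_IN command whose int argument is copied into the kernel buffer before fo_ioctl is called:

    #include <sys/ioctl.h>
    #include <stdio.h>

    int
    set_nonblocking(int fd, int on)
    {
            /* on != 0 sets FNONBLOCK on the open file, 0 clears it */
            if (ioctl(fd, FIONBIO, &on) == -1) {
                    perror("ioctl(FIONBIO)");
                    return -1;
            }
            return 0;
    }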

cvs diff -r1.140 -r1.141 src/sys/kern/uipc_mbuf.c (switch to unified diff)

--- src/sys/kern/uipc_mbuf.c 2011/04/24 18:46:23 1.140
+++ src/sys/kern/uipc_mbuf.c 2011/07/27 14:35:34 1.141
@@ -1,1089 +1,1087 @@ @@ -1,1089 +1,1087 @@
1/* $NetBSD: uipc_mbuf.c,v 1.140 2011/04/24 18:46:23 rmind Exp $ */ 1/* $NetBSD: uipc_mbuf.c,v 1.141 2011/07/27 14:35:34 uebayasi Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 1999, 2001 The NetBSD Foundation, Inc. 4 * Copyright (c) 1999, 2001 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center. 9 * NASA Ames Research Center.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions 12 * modification, are permitted provided that the following conditions
13 * are met: 13 * are met:
14 * 1. Redistributions of source code must retain the above copyright 14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer. 15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright 16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the 17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution. 18 * documentation and/or other materials provided with the distribution.
19 * 19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE. 30 * POSSIBILITY OF SUCH DAMAGE.
31 */ 31 */
32 32
33/* 33/*
34 * Copyright (c) 1982, 1986, 1988, 1991, 1993 34 * Copyright (c) 1982, 1986, 1988, 1991, 1993
35 * The Regents of the University of California. All rights reserved. 35 * The Regents of the University of California. All rights reserved.
36 * 36 *
37 * Redistribution and use in source and binary forms, with or without 37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions 38 * modification, are permitted provided that the following conditions
39 * are met: 39 * are met:
40 * 1. Redistributions of source code must retain the above copyright 40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer. 41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright 42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the 43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution. 44 * documentation and/or other materials provided with the distribution.
45 * 3. Neither the name of the University nor the names of its contributors 45 * 3. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software 46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission. 47 * without specific prior written permission.
48 * 48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE. 59 * SUCH DAMAGE.
60 * 60 *
61 * @(#)uipc_mbuf.c 8.4 (Berkeley) 2/14/95 61 * @(#)uipc_mbuf.c 8.4 (Berkeley) 2/14/95
62 */ 62 */
63 63
64#include <sys/cdefs.h> 64#include <sys/cdefs.h>
65__KERNEL_RCSID(0, "$NetBSD: uipc_mbuf.c,v 1.140 2011/04/24 18:46:23 rmind Exp $"); 65__KERNEL_RCSID(0, "$NetBSD: uipc_mbuf.c,v 1.141 2011/07/27 14:35:34 uebayasi Exp $");
66 66
67#include "opt_mbuftrace.h" 67#include "opt_mbuftrace.h"
68#include "opt_nmbclusters.h" 68#include "opt_nmbclusters.h"
69#include "opt_ddb.h" 69#include "opt_ddb.h"
70 70
71#include <sys/param.h> 71#include <sys/param.h>
72#include <sys/systm.h> 72#include <sys/systm.h>
73#include <sys/atomic.h> 73#include <sys/atomic.h>
74#include <sys/cpu.h> 74#include <sys/cpu.h>
75#include <sys/proc.h> 75#include <sys/proc.h>
76#define MBTYPES 76#define MBTYPES
77#include <sys/mbuf.h> 77#include <sys/mbuf.h>
78#include <sys/kernel.h> 78#include <sys/kernel.h>
79#include <sys/syslog.h> 79#include <sys/syslog.h>
80#include <sys/domain.h> 80#include <sys/domain.h>
81#include <sys/protosw.h> 81#include <sys/protosw.h>
82#include <sys/percpu.h> 82#include <sys/percpu.h>
83#include <sys/pool.h> 83#include <sys/pool.h>
84#include <sys/socket.h> 84#include <sys/socket.h>
85#include <sys/sysctl.h> 85#include <sys/sysctl.h>
86 86
87#include <net/if.h> 87#include <net/if.h>
88 88
89#include <uvm/uvm_extern.h> 
90 
91pool_cache_t mb_cache; /* mbuf cache */ 89pool_cache_t mb_cache; /* mbuf cache */
92pool_cache_t mcl_cache; /* mbuf cluster cache */ 90pool_cache_t mcl_cache; /* mbuf cluster cache */
93 91
94struct mbstat mbstat; 92struct mbstat mbstat;
95int max_linkhdr; 93int max_linkhdr;
96int max_protohdr; 94int max_protohdr;
97int max_hdr; 95int max_hdr;
98int max_datalen; 96int max_datalen;
99 97
100static int mb_ctor(void *, void *, int); 98static int mb_ctor(void *, void *, int);
101 99
102static void sysctl_kern_mbuf_setup(void); 100static void sysctl_kern_mbuf_setup(void);
103 101
104static struct sysctllog *mbuf_sysctllog; 102static struct sysctllog *mbuf_sysctllog;
105 103
106static struct mbuf *m_copym0(struct mbuf *, int, int, int, int); 104static struct mbuf *m_copym0(struct mbuf *, int, int, int, int);
107static struct mbuf *m_split0(struct mbuf *, int, int, int); 105static struct mbuf *m_split0(struct mbuf *, int, int, int);
108static int m_copyback0(struct mbuf **, int, int, const void *, int, int); 106static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
109 107
110/* flags for m_copyback0 */ 108/* flags for m_copyback0 */
111#define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */ 109#define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
112#define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */ 110#define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
113#define M_COPYBACK0_COW 0x0004 /* do copy-on-write */ 111#define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
114#define M_COPYBACK0_EXTEND 0x0008 /* extend chain */ 112#define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
115 113
116static const char mclpool_warnmsg[] = 114static const char mclpool_warnmsg[] =
117 "WARNING: mclpool limit reached; increase kern.mbuf.nmbclusters"; 115 "WARNING: mclpool limit reached; increase kern.mbuf.nmbclusters";
118 116
119MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf"); 117MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
120 118
121static percpu_t *mbstat_percpu; 119static percpu_t *mbstat_percpu;
122 120
123#ifdef MBUFTRACE 121#ifdef MBUFTRACE
124struct mownerhead mowners = LIST_HEAD_INITIALIZER(mowners); 122struct mownerhead mowners = LIST_HEAD_INITIALIZER(mowners);
125struct mowner unknown_mowners[] = { 123struct mowner unknown_mowners[] = {
126 MOWNER_INIT("unknown", "free"), 124 MOWNER_INIT("unknown", "free"),
127 MOWNER_INIT("unknown", "data"), 125 MOWNER_INIT("unknown", "data"),
128 MOWNER_INIT("unknown", "header"), 126 MOWNER_INIT("unknown", "header"),
129 MOWNER_INIT("unknown", "soname"), 127 MOWNER_INIT("unknown", "soname"),
130 MOWNER_INIT("unknown", "soopts"), 128 MOWNER_INIT("unknown", "soopts"),
131 MOWNER_INIT("unknown", "ftable"), 129 MOWNER_INIT("unknown", "ftable"),
132 MOWNER_INIT("unknown", "control"), 130 MOWNER_INIT("unknown", "control"),
133 MOWNER_INIT("unknown", "oobdata"), 131 MOWNER_INIT("unknown", "oobdata"),
134}; 132};
135struct mowner revoked_mowner = MOWNER_INIT("revoked", ""); 133struct mowner revoked_mowner = MOWNER_INIT("revoked", "");
136#endif 134#endif
137 135
138#define MEXT_ISEMBEDDED(m) ((m)->m_ext_ref == (m)) 136#define MEXT_ISEMBEDDED(m) ((m)->m_ext_ref == (m))
139 137
140#define MCLADDREFERENCE(o, n) \ 138#define MCLADDREFERENCE(o, n) \
141do { \ 139do { \
142 KASSERT(((o)->m_flags & M_EXT) != 0); \ 140 KASSERT(((o)->m_flags & M_EXT) != 0); \
143 KASSERT(((n)->m_flags & M_EXT) == 0); \ 141 KASSERT(((n)->m_flags & M_EXT) == 0); \
144 KASSERT((o)->m_ext.ext_refcnt >= 1); \ 142 KASSERT((o)->m_ext.ext_refcnt >= 1); \
145 (n)->m_flags |= ((o)->m_flags & M_EXTCOPYFLAGS); \ 143 (n)->m_flags |= ((o)->m_flags & M_EXTCOPYFLAGS); \
146 atomic_inc_uint(&(o)->m_ext.ext_refcnt); \ 144 atomic_inc_uint(&(o)->m_ext.ext_refcnt); \
147 (n)->m_ext_ref = (o)->m_ext_ref; \ 145 (n)->m_ext_ref = (o)->m_ext_ref; \
148 mowner_ref((n), (n)->m_flags); \ 146 mowner_ref((n), (n)->m_flags); \
149 MCLREFDEBUGN((n), __FILE__, __LINE__); \ 147 MCLREFDEBUGN((n), __FILE__, __LINE__); \
150} while (/* CONSTCOND */ 0) 148} while (/* CONSTCOND */ 0)
151 149
152static int 150static int
153nmbclusters_limit(void) 151nmbclusters_limit(void)
154{ 152{
155#if defined(PMAP_MAP_POOLPAGE) 153#if defined(PMAP_MAP_POOLPAGE)
156 /* direct mapping, doesn't use space in kmem_map */ 154 /* direct mapping, doesn't use space in kmem_map */
157 vsize_t max_size = physmem / 4; 155 vsize_t max_size = physmem / 4;
158#else 156#else
159 vsize_t max_size = MIN(physmem / 4, nkmempages / 2); 157 vsize_t max_size = MIN(physmem / 4, nkmempages / 2);
160#endif 158#endif
161 159
162 max_size = max_size * PAGE_SIZE / MCLBYTES; 160 max_size = max_size * PAGE_SIZE / MCLBYTES;
163#ifdef NMBCLUSTERS_MAX 161#ifdef NMBCLUSTERS_MAX
164 max_size = MIN(max_size, NMBCLUSTERS_MAX); 162 max_size = MIN(max_size, NMBCLUSTERS_MAX);
165#endif 163#endif
166 164
167#ifdef NMBCLUSTERS 165#ifdef NMBCLUSTERS
168 return MIN(max_size, NMBCLUSTERS); 166 return MIN(max_size, NMBCLUSTERS);
169#else 167#else
170 return max_size; 168 return max_size;
171#endif 169#endif
172} 170}
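
As a worked example with hypothetical figures: on a machine with 4 KiB pages, 2 KiB clusters (MCLBYTES), 1 GiB of RAM (physmem = 262144 pages), and a direct-mapped pool, max_size starts at 262144 / 4 = 65536 pages, which the PAGE_SIZE / MCLBYTES conversion turns into 65536 * 4096 / 2048 = 131072 clusters, before any NMBCLUSTERS_MAX or NMBCLUSTERS clamp is applied.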
173 171
174/* 172/*
175 * Initialize the mbuf allocator. 173 * Initialize the mbuf allocator.
176 */ 174 */
177void 175void
178mbinit(void) 176mbinit(void)
179{ 177{
180 178
181 CTASSERT(sizeof(struct _m_ext) <= MHLEN); 179 CTASSERT(sizeof(struct _m_ext) <= MHLEN);
182 CTASSERT(sizeof(struct mbuf) == MSIZE); 180 CTASSERT(sizeof(struct mbuf) == MSIZE);
183 181
184 sysctl_kern_mbuf_setup(); 182 sysctl_kern_mbuf_setup();
185 183
186 mb_cache = pool_cache_init(msize, 0, 0, 0, "mbpl", 184 mb_cache = pool_cache_init(msize, 0, 0, 0, "mbpl",
187 NULL, IPL_VM, mb_ctor, NULL, NULL); 185 NULL, IPL_VM, mb_ctor, NULL, NULL);
188 KASSERT(mb_cache != NULL); 186 KASSERT(mb_cache != NULL);
189 187
190 mcl_cache = pool_cache_init(mclbytes, 0, 0, 0, "mclpl", NULL, 188 mcl_cache = pool_cache_init(mclbytes, 0, 0, 0, "mclpl", NULL,
191 IPL_VM, NULL, NULL, NULL); 189 IPL_VM, NULL, NULL, NULL);
192 KASSERT(mcl_cache != NULL); 190 KASSERT(mcl_cache != NULL);
193 191
194 pool_cache_set_drain_hook(mb_cache, m_reclaim, NULL); 192 pool_cache_set_drain_hook(mb_cache, m_reclaim, NULL);
195 pool_cache_set_drain_hook(mcl_cache, m_reclaim, NULL); 193 pool_cache_set_drain_hook(mcl_cache, m_reclaim, NULL);
196 194
197 /* 195 /*
198 * Set an arbitrary default limit on the number of mbuf clusters. 196 * Set an arbitrary default limit on the number of mbuf clusters.
199 */ 197 */
200#ifdef NMBCLUSTERS 198#ifdef NMBCLUSTERS
201 nmbclusters = nmbclusters_limit(); 199 nmbclusters = nmbclusters_limit();
202#else 200#else
203 nmbclusters = MAX(1024, 201 nmbclusters = MAX(1024,
204 (vsize_t)physmem * PAGE_SIZE / MCLBYTES / 16); 202 (vsize_t)physmem * PAGE_SIZE / MCLBYTES / 16);
205 nmbclusters = MIN(nmbclusters, nmbclusters_limit()); 203 nmbclusters = MIN(nmbclusters, nmbclusters_limit());
206#endif 204#endif
207 205
208 /* 206 /*
209 * Set the hard limit on the mclpool to the number of 207 * Set the hard limit on the mclpool to the number of
210 * mbuf clusters the kernel is to support. Log the limit 208 * mbuf clusters the kernel is to support. Log the limit
211 * reached message max once a minute. 209 * reached message max once a minute.
212 */ 210 */
213 pool_cache_sethardlimit(mcl_cache, nmbclusters, mclpool_warnmsg, 60); 211 pool_cache_sethardlimit(mcl_cache, nmbclusters, mclpool_warnmsg, 60);
214 212
215 mbstat_percpu = percpu_alloc(sizeof(struct mbstat_cpu)); 213 mbstat_percpu = percpu_alloc(sizeof(struct mbstat_cpu));
216 214
217 /* 215 /*
218 * Set a low water mark for both mbufs and clusters. This should 216 * Set a low water mark for both mbufs and clusters. This should
219 * help ensure that they can be allocated in a memory starvation 217 * help ensure that they can be allocated in a memory starvation
220 * situation. This is important for e.g. diskless systems which 218 * situation. This is important for e.g. diskless systems which
221 * must allocate mbufs in order for the pagedaemon to clean pages. 219 * must allocate mbufs in order for the pagedaemon to clean pages.
222 */ 220 */
223 pool_cache_setlowat(mb_cache, mblowat); 221 pool_cache_setlowat(mb_cache, mblowat);
224 pool_cache_setlowat(mcl_cache, mcllowat); 222 pool_cache_setlowat(mcl_cache, mcllowat);
225 223
226#ifdef MBUFTRACE 224#ifdef MBUFTRACE
227 { 225 {
228 /* 226 /*
229 * Attach the unknown mowners. 227 * Attach the unknown mowners.
230 */ 228 */
231 int i; 229 int i;
232 MOWNER_ATTACH(&revoked_mowner); 230 MOWNER_ATTACH(&revoked_mowner);
233 for (i = sizeof(unknown_mowners)/sizeof(unknown_mowners[0]); 231 for (i = sizeof(unknown_mowners)/sizeof(unknown_mowners[0]);
234 i-- > 0; ) 232 i-- > 0; )
235 MOWNER_ATTACH(&unknown_mowners[i]); 233 MOWNER_ATTACH(&unknown_mowners[i]);
236 } 234 }
237#endif 235#endif
238} 236}
239 237
240/* 238/*
241 * sysctl helper routine for the kern.mbuf subtree. 239 * sysctl helper routine for the kern.mbuf subtree.
242 * nmbclusters, mblowat and mcllowat need range 240 * nmbclusters, mblowat and mcllowat need range
243 * checking and pool tweaking after being reset. 241 * checking and pool tweaking after being reset.
244 */ 242 */
245static int 243static int
246sysctl_kern_mbuf(SYSCTLFN_ARGS) 244sysctl_kern_mbuf(SYSCTLFN_ARGS)
247{ 245{
248 int error, newval; 246 int error, newval;
249 struct sysctlnode node; 247 struct sysctlnode node;
250 248
251 node = *rnode; 249 node = *rnode;
252 node.sysctl_data = &newval; 250 node.sysctl_data = &newval;
253 switch (rnode->sysctl_num) { 251 switch (rnode->sysctl_num) {
254 case MBUF_NMBCLUSTERS: 252 case MBUF_NMBCLUSTERS:
255 case MBUF_MBLOWAT: 253 case MBUF_MBLOWAT:
256 case MBUF_MCLLOWAT: 254 case MBUF_MCLLOWAT:
257 newval = *(int*)rnode->sysctl_data; 255 newval = *(int*)rnode->sysctl_data;
258 break; 256 break;
259 default: 257 default:
260 return (EOPNOTSUPP); 258 return (EOPNOTSUPP);
261 } 259 }
262 260
263 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 261 error = sysctl_lookup(SYSCTLFN_CALL(&node));
264 if (error || newp == NULL) 262 if (error || newp == NULL)
265 return (error); 263 return (error);
266 if (newval < 0) 264 if (newval < 0)
267 return (EINVAL); 265 return (EINVAL);
268 266
269 switch (node.sysctl_num) { 267 switch (node.sysctl_num) {
270 case MBUF_NMBCLUSTERS: 268 case MBUF_NMBCLUSTERS:
271 if (newval < nmbclusters) 269 if (newval < nmbclusters)
272 return (EINVAL); 270 return (EINVAL);
273 if (newval > nmbclusters_limit()) 271 if (newval > nmbclusters_limit())
274 return (EINVAL); 272 return (EINVAL);
275 nmbclusters = newval; 273 nmbclusters = newval;
276 pool_cache_sethardlimit(mcl_cache, nmbclusters, 274 pool_cache_sethardlimit(mcl_cache, nmbclusters,
277 mclpool_warnmsg, 60); 275 mclpool_warnmsg, 60);
278 break; 276 break;
279 case MBUF_MBLOWAT: 277 case MBUF_MBLOWAT:
280 mblowat = newval; 278 mblowat = newval;
281 pool_cache_setlowat(mb_cache, mblowat); 279 pool_cache_setlowat(mb_cache, mblowat);
282 break; 280 break;
283 case MBUF_MCLLOWAT: 281 case MBUF_MCLLOWAT:
284 mcllowat = newval; 282 mcllowat = newval;
285 pool_cache_setlowat(mcl_cache, mcllowat); 283 pool_cache_setlowat(mcl_cache, mcllowat);
286 break; 284 break;
287 } 285 }
288 286
289 return (0); 287 return (0);
290} 288}
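
The MBUF_NMBCLUSTERS case only allows the limit to grow, and never beyond nmbclusters_limit(). A hypothetical userland sketch of raising it through sysctl(3):

    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    raise_nmbclusters(int newval)
    {
            /* Values below the current nmbclusters or above
             * nmbclusters_limit() are rejected with EINVAL. */
            if (sysctlbyname("kern.mbuf.nmbclusters",
                NULL, NULL, &newval, sizeof(newval)) == -1) {
                    perror("sysctlbyname");
                    return -1;
            }
            return 0;
    }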
291 289
292#ifdef MBUFTRACE 290#ifdef MBUFTRACE
293static void 291static void
294mowner_conver_to_user_cb(void *v1, void *v2, struct cpu_info *ci) 292mowner_conver_to_user_cb(void *v1, void *v2, struct cpu_info *ci)
295{ 293{
296 struct mowner_counter *mc = v1; 294 struct mowner_counter *mc = v1;
297 struct mowner_user *mo_user = v2; 295 struct mowner_user *mo_user = v2;
298 int i; 296 int i;
299 297
300 for (i = 0; i < MOWNER_COUNTER_NCOUNTERS; i++) { 298 for (i = 0; i < MOWNER_COUNTER_NCOUNTERS; i++) {
301 mo_user->mo_counter[i] += mc->mc_counter[i]; 299 mo_user->mo_counter[i] += mc->mc_counter[i];
302 } 300 }
303} 301}
304 302
305static void 303static void
306mowner_convert_to_user(struct mowner *mo, struct mowner_user *mo_user) 304mowner_convert_to_user(struct mowner *mo, struct mowner_user *mo_user)
307{ 305{
308 306
309 memset(mo_user, 0, sizeof(*mo_user)); 307 memset(mo_user, 0, sizeof(*mo_user));
310 CTASSERT(sizeof(mo_user->mo_name) == sizeof(mo->mo_name)); 308 CTASSERT(sizeof(mo_user->mo_name) == sizeof(mo->mo_name));
311 CTASSERT(sizeof(mo_user->mo_descr) == sizeof(mo->mo_descr)); 309 CTASSERT(sizeof(mo_user->mo_descr) == sizeof(mo->mo_descr));
312 memcpy(mo_user->mo_name, mo->mo_name, sizeof(mo->mo_name)); 310 memcpy(mo_user->mo_name, mo->mo_name, sizeof(mo->mo_name));
313 memcpy(mo_user->mo_descr, mo->mo_descr, sizeof(mo->mo_descr)); 311 memcpy(mo_user->mo_descr, mo->mo_descr, sizeof(mo->mo_descr));
314 percpu_foreach(mo->mo_counters, mowner_conver_to_user_cb, mo_user); 312 percpu_foreach(mo->mo_counters, mowner_conver_to_user_cb, mo_user);
315} 313}
316 314
317static int 315static int
318sysctl_kern_mbuf_mowners(SYSCTLFN_ARGS) 316sysctl_kern_mbuf_mowners(SYSCTLFN_ARGS)
319{ 317{
320 struct mowner *mo; 318 struct mowner *mo;
321 size_t len = 0; 319 size_t len = 0;
322 int error = 0; 320 int error = 0;
323 321
324 if (namelen != 0) 322 if (namelen != 0)
325 return (EINVAL); 323 return (EINVAL);
326 if (newp != NULL) 324 if (newp != NULL)
327 return (EPERM); 325 return (EPERM);
328 326
329 LIST_FOREACH(mo, &mowners, mo_link) { 327 LIST_FOREACH(mo, &mowners, mo_link) {
330 struct mowner_user mo_user; 328 struct mowner_user mo_user;
331 329
332 mowner_convert_to_user(mo, &mo_user); 330 mowner_convert_to_user(mo, &mo_user);
333 331
334 if (oldp != NULL) { 332 if (oldp != NULL) {
335 if (*oldlenp - len < sizeof(mo_user)) { 333 if (*oldlenp - len < sizeof(mo_user)) {
336 error = ENOMEM; 334 error = ENOMEM;
337 break; 335 break;
338 } 336 }
339 error = copyout(&mo_user, (char *)oldp + len, 337 error = copyout(&mo_user, (char *)oldp + len,
340 sizeof(mo_user)); 338 sizeof(mo_user));
341 if (error) 339 if (error)
342 break; 340 break;
343 } 341 }
344 len += sizeof(mo_user); 342 len += sizeof(mo_user);
345 } 343 }
346 344
347 if (error == 0) 345 if (error == 0)
348 *oldlenp = len; 346 *oldlenp = len;
349 347
350 return (error); 348 return (error);
351} 349}
352#endif /* MBUFTRACE */ 350#endif /* MBUFTRACE */
353 351
354static void 352static void
355mbstat_conver_to_user_cb(void *v1, void *v2, struct cpu_info *ci) 353mbstat_conver_to_user_cb(void *v1, void *v2, struct cpu_info *ci)
356{ 354{
357 struct mbstat_cpu *mbsc = v1; 355 struct mbstat_cpu *mbsc = v1;
358 struct mbstat *mbs = v2; 356 struct mbstat *mbs = v2;
359 int i; 357 int i;
360 358
361 for (i = 0; i < __arraycount(mbs->m_mtypes); i++) { 359 for (i = 0; i < __arraycount(mbs->m_mtypes); i++) {
362 mbs->m_mtypes[i] += mbsc->m_mtypes[i]; 360 mbs->m_mtypes[i] += mbsc->m_mtypes[i];
363 } 361 }
364} 362}
365 363
366static void 364static void
367mbstat_convert_to_user(struct mbstat *mbs) 365mbstat_convert_to_user(struct mbstat *mbs)
368{ 366{
369 367
370 memset(mbs, 0, sizeof(*mbs)); 368 memset(mbs, 0, sizeof(*mbs));
371 mbs->m_drain = mbstat.m_drain; 369 mbs->m_drain = mbstat.m_drain;
372 percpu_foreach(mbstat_percpu, mbstat_conver_to_user_cb, mbs); 370 percpu_foreach(mbstat_percpu, mbstat_conver_to_user_cb, mbs);
373} 371}
374 372
375static int 373static int
376sysctl_kern_mbuf_stats(SYSCTLFN_ARGS) 374sysctl_kern_mbuf_stats(SYSCTLFN_ARGS)
377{ 375{
378 struct sysctlnode node; 376 struct sysctlnode node;
379 struct mbstat mbs; 377 struct mbstat mbs;
380 378
381 mbstat_convert_to_user(&mbs); 379 mbstat_convert_to_user(&mbs);
382 node = *rnode; 380 node = *rnode;
383 node.sysctl_data = &mbs; 381 node.sysctl_data = &mbs;
384 node.sysctl_size = sizeof(mbs); 382 node.sysctl_size = sizeof(mbs);
385 return sysctl_lookup(SYSCTLFN_CALL(&node)); 383 return sysctl_lookup(SYSCTLFN_CALL(&node));
386} 384}
387 385
388static void 386static void
389sysctl_kern_mbuf_setup(void) 387sysctl_kern_mbuf_setup(void)
390{ 388{
391 389
392 KASSERT(mbuf_sysctllog == NULL); 390 KASSERT(mbuf_sysctllog == NULL);
393 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, 391 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL,
394 CTLFLAG_PERMANENT, 392 CTLFLAG_PERMANENT,
395 CTLTYPE_NODE, "kern", NULL, 393 CTLTYPE_NODE, "kern", NULL,
396 NULL, 0, NULL, 0, 394 NULL, 0, NULL, 0,
397 CTL_KERN, CTL_EOL); 395 CTL_KERN, CTL_EOL);
398 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, 396 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL,
399 CTLFLAG_PERMANENT, 397 CTLFLAG_PERMANENT,
400 CTLTYPE_NODE, "mbuf", 398 CTLTYPE_NODE, "mbuf",
401 SYSCTL_DESCR("mbuf control variables"), 399 SYSCTL_DESCR("mbuf control variables"),
402 NULL, 0, NULL, 0, 400 NULL, 0, NULL, 0,
403 CTL_KERN, KERN_MBUF, CTL_EOL); 401 CTL_KERN, KERN_MBUF, CTL_EOL);
404 402
405 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, 403 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL,
406 CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, 404 CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
407 CTLTYPE_INT, "msize", 405 CTLTYPE_INT, "msize",
408 SYSCTL_DESCR("mbuf base size"), 406 SYSCTL_DESCR("mbuf base size"),
409 NULL, msize, NULL, 0, 407 NULL, msize, NULL, 0,
410 CTL_KERN, KERN_MBUF, MBUF_MSIZE, CTL_EOL); 408 CTL_KERN, KERN_MBUF, MBUF_MSIZE, CTL_EOL);
411 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, 409 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL,
412 CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, 410 CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
413 CTLTYPE_INT, "mclbytes", 411 CTLTYPE_INT, "mclbytes",
414 SYSCTL_DESCR("mbuf cluster size"), 412 SYSCTL_DESCR("mbuf cluster size"),
415 NULL, mclbytes, NULL, 0, 413 NULL, mclbytes, NULL, 0,
416 CTL_KERN, KERN_MBUF, MBUF_MCLBYTES, CTL_EOL); 414 CTL_KERN, KERN_MBUF, MBUF_MCLBYTES, CTL_EOL);
417 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, 415 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL,
418 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 416 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
419 CTLTYPE_INT, "nmbclusters", 417 CTLTYPE_INT, "nmbclusters",
420 SYSCTL_DESCR("Limit on the number of mbuf clusters"), 418 SYSCTL_DESCR("Limit on the number of mbuf clusters"),
421 sysctl_kern_mbuf, 0, &nmbclusters, 0, 419 sysctl_kern_mbuf, 0, &nmbclusters, 0,
422 CTL_KERN, KERN_MBUF, MBUF_NMBCLUSTERS, CTL_EOL); 420 CTL_KERN, KERN_MBUF, MBUF_NMBCLUSTERS, CTL_EOL);
423 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, 421 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL,
424 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 422 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
425 CTLTYPE_INT, "mblowat", 423 CTLTYPE_INT, "mblowat",
426 SYSCTL_DESCR("mbuf low water mark"), 424 SYSCTL_DESCR("mbuf low water mark"),
427 sysctl_kern_mbuf, 0, &mblowat, 0, 425 sysctl_kern_mbuf, 0, &mblowat, 0,
428 CTL_KERN, KERN_MBUF, MBUF_MBLOWAT, CTL_EOL); 426 CTL_KERN, KERN_MBUF, MBUF_MBLOWAT, CTL_EOL);
429 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, 427 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL,
430 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 428 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
431 CTLTYPE_INT, "mcllowat", 429 CTLTYPE_INT, "mcllowat",
432 SYSCTL_DESCR("mbuf cluster low water mark"), 430 SYSCTL_DESCR("mbuf cluster low water mark"),
433 sysctl_kern_mbuf, 0, &mcllowat, 0, 431 sysctl_kern_mbuf, 0, &mcllowat, 0,
434 CTL_KERN, KERN_MBUF, MBUF_MCLLOWAT, CTL_EOL); 432 CTL_KERN, KERN_MBUF, MBUF_MCLLOWAT, CTL_EOL);
435 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, 433 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL,
436 CTLFLAG_PERMANENT, 434 CTLFLAG_PERMANENT,
437 CTLTYPE_STRUCT, "stats", 435 CTLTYPE_STRUCT, "stats",
438 SYSCTL_DESCR("mbuf allocation statistics"), 436 SYSCTL_DESCR("mbuf allocation statistics"),
439 sysctl_kern_mbuf_stats, 0, NULL, 0, 437 sysctl_kern_mbuf_stats, 0, NULL, 0,
440 CTL_KERN, KERN_MBUF, MBUF_STATS, CTL_EOL); 438 CTL_KERN, KERN_MBUF, MBUF_STATS, CTL_EOL);
441#ifdef MBUFTRACE 439#ifdef MBUFTRACE
442 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL, 440 sysctl_createv(&mbuf_sysctllog, 0, NULL, NULL,
443 CTLFLAG_PERMANENT, 441 CTLFLAG_PERMANENT,
444 CTLTYPE_STRUCT, "mowners", 442 CTLTYPE_STRUCT, "mowners",
445 SYSCTL_DESCR("Information about mbuf owners"), 443 SYSCTL_DESCR("Information about mbuf owners"),
446 sysctl_kern_mbuf_mowners, 0, NULL, 0, 444 sysctl_kern_mbuf_mowners, 0, NULL, 0,
447 CTL_KERN, KERN_MBUF, MBUF_MOWNERS, CTL_EOL); 445 CTL_KERN, KERN_MBUF, MBUF_MOWNERS, CTL_EOL);
448#endif /* MBUFTRACE */ 446#endif /* MBUFTRACE */
449} 447}
450 448
451static int 449static int
452mb_ctor(void *arg, void *object, int flags) 450mb_ctor(void *arg, void *object, int flags)
453{ 451{
454 struct mbuf *m = object; 452 struct mbuf *m = object;
455 453
456#ifdef POOL_VTOPHYS 454#ifdef POOL_VTOPHYS
457 m->m_paddr = POOL_VTOPHYS(m); 455 m->m_paddr = POOL_VTOPHYS(m);
458#else 456#else
459 m->m_paddr = M_PADDR_INVALID; 457 m->m_paddr = M_PADDR_INVALID;
460#endif 458#endif
461 return (0); 459 return (0);
462} 460}
463 461
464void 462void
465m_reclaim(void *arg, int flags) 463m_reclaim(void *arg, int flags)
466{ 464{
467 struct domain *dp; 465 struct domain *dp;
468 const struct protosw *pr; 466 const struct protosw *pr;
469 struct ifnet *ifp; 467 struct ifnet *ifp;
470 int s; 468 int s;
471 469
472 KERNEL_LOCK(1, NULL); 470 KERNEL_LOCK(1, NULL);
473 s = splvm(); 471 s = splvm();
474 DOMAIN_FOREACH(dp) { 472 DOMAIN_FOREACH(dp) {
475 for (pr = dp->dom_protosw; 473 for (pr = dp->dom_protosw;
476 pr < dp->dom_protoswNPROTOSW; pr++) 474 pr < dp->dom_protoswNPROTOSW; pr++)
477 if (pr->pr_drain) 475 if (pr->pr_drain)
478 (*pr->pr_drain)(); 476 (*pr->pr_drain)();
479 } 477 }
480 IFNET_FOREACH(ifp) { 478 IFNET_FOREACH(ifp) {
481 if (ifp->if_drain) 479 if (ifp->if_drain)
482 (*ifp->if_drain)(ifp); 480 (*ifp->if_drain)(ifp);
483 } 481 }
484 splx(s); 482 splx(s);
485 mbstat.m_drain++; 483 mbstat.m_drain++;
486 KERNEL_UNLOCK_ONE(NULL); 484 KERNEL_UNLOCK_ONE(NULL);
487} 485}
488 486
489/* 487/*
490 * Space allocation routines. 488 * Space allocation routines.
491 * These are also available as macros 489 * These are also available as macros
492 * for critical paths. 490 * for critical paths.
493 */ 491 */
494struct mbuf * 492struct mbuf *
495m_get(int nowait, int type) 493m_get(int nowait, int type)
496{ 494{
497 struct mbuf *m; 495 struct mbuf *m;
498 496
499 m = pool_cache_get(mb_cache, 497 m = pool_cache_get(mb_cache,
500 nowait == M_WAIT ? PR_WAITOK|PR_LIMITFAIL : 0); 498 nowait == M_WAIT ? PR_WAITOK|PR_LIMITFAIL : 0);
501 if (m == NULL) 499 if (m == NULL)
502 return NULL; 500 return NULL;
503 501
504 mbstat_type_add(type, 1); 502 mbstat_type_add(type, 1);
505 mowner_init(m, type); 503 mowner_init(m, type);
506 m->m_ext_ref = m; 504 m->m_ext_ref = m;
507 m->m_type = type; 505 m->m_type = type;
508 m->m_next = NULL; 506 m->m_next = NULL;
509 m->m_nextpkt = NULL; 507 m->m_nextpkt = NULL;
510 m->m_data = m->m_dat; 508 m->m_data = m->m_dat;
511 m->m_flags = 0; 509 m->m_flags = 0;
512 510
513 return m; 511 return m;
514} 512}
515 513
516struct mbuf * 514struct mbuf *
517m_gethdr(int nowait, int type) 515m_gethdr(int nowait, int type)
518{ 516{
519 struct mbuf *m; 517 struct mbuf *m;
520 518
521 m = m_get(nowait, type); 519 m = m_get(nowait, type);
522 if (m == NULL) 520 if (m == NULL)
523 return NULL; 521 return NULL;
524 522
525 m->m_data = m->m_pktdat; 523 m->m_data = m->m_pktdat;
526 m->m_flags = M_PKTHDR; 524 m->m_flags = M_PKTHDR;
527 m->m_pkthdr.rcvif = NULL; 525 m->m_pkthdr.rcvif = NULL;
528 m->m_pkthdr.csum_flags = 0; 526 m->m_pkthdr.csum_flags = 0;
529 m->m_pkthdr.csum_data = 0; 527 m->m_pkthdr.csum_data = 0;
530 SLIST_INIT(&m->m_pkthdr.tags); 528 SLIST_INIT(&m->m_pkthdr.tags);
531 529
532 return m; 530 return m;
533} 531}
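
Neither allocator initializes m_len or m_pkthdr.len; that is left to the caller. A hedged sketch of the usual in-kernel pattern (MT_DATA and the fill step are illustrative only):

    struct mbuf *m;

    m = m_gethdr(M_DONTWAIT, MT_DATA);
    if (m == NULL)
            return ENOBUFS;         /* M_DONTWAIT allocations may fail */
    m->m_len = m->m_pkthdr.len = 0;
    /* ... write via mtod(m, void *), advancing m_len and m_pkthdr.len ... */
    m_freem(m);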
534 532
535struct mbuf * 533struct mbuf *
536m_getclr(int nowait, int type) 534m_getclr(int nowait, int type)
537{ 535{
538 struct mbuf *m; 536 struct mbuf *m;
539 537
540 MGET(m, nowait, type); 538 MGET(m, nowait, type);
541 if (m == 0) 539 if (m == 0)
542 return (NULL); 540 return (NULL);
543 memset(mtod(m, void *), 0, MLEN); 541 memset(mtod(m, void *), 0, MLEN);
544 return (m); 542 return (m);
545} 543}
546 544
547void 545void
548m_clget(struct mbuf *m, int nowait) 546m_clget(struct mbuf *m, int nowait)
549{ 547{
550 548
551 MCLGET(m, nowait); 549 MCLGET(m, nowait);
552} 550}
553 551
554struct mbuf * 552struct mbuf *
555m_free(struct mbuf *m) 553m_free(struct mbuf *m)
556{ 554{
557 struct mbuf *n; 555 struct mbuf *n;
558 556
559 MFREE(m, n); 557 MFREE(m, n);
560 return (n); 558 return (n);
561} 559}
562 560
563void 561void
564m_freem(struct mbuf *m) 562m_freem(struct mbuf *m)
565{ 563{
566 struct mbuf *n; 564 struct mbuf *n;
567 565
568 if (m == NULL) 566 if (m == NULL)
569 return; 567 return;
570 do { 568 do {
571 MFREE(m, n); 569 MFREE(m, n);
572 m = n; 570 m = n;
573 } while (m); 571 } while (m);
574} 572}
575 573
576#ifdef MBUFTRACE 574#ifdef MBUFTRACE
577/* 575/*
578 * Walk a chain of mbufs, claiming ownership of each mbuf in the chain. 576 * Walk a chain of mbufs, claiming ownership of each mbuf in the chain.
579 */ 577 */
580void 578void
581m_claimm(struct mbuf *m, struct mowner *mo) 579m_claimm(struct mbuf *m, struct mowner *mo)
582{ 580{
583 581
584 for (; m != NULL; m = m->m_next) 582 for (; m != NULL; m = m->m_next)
585 MCLAIM(m, mo); 583 MCLAIM(m, mo);
586} 584}
587#endif 585#endif
588 586
589/* 587/*
590 * Mbuf utility routines. 588 * Mbuf utility routines.
591 */ 589 */
592 590
593/* 591/*
594 * Lesser-used path for M_PREPEND: 592 * Lesser-used path for M_PREPEND:
595 * allocate new mbuf to prepend to chain, 593 * allocate new mbuf to prepend to chain,
596 * copy junk along. 594 * copy junk along.
597 */ 595 */
598struct mbuf * 596struct mbuf *
599m_prepend(struct mbuf *m, int len, int how) 597m_prepend(struct mbuf *m, int len, int how)
600{ 598{
601 struct mbuf *mn; 599 struct mbuf *mn;
602 600
603 MGET(mn, how, m->m_type); 601 MGET(mn, how, m->m_type);
604 if (mn == (struct mbuf *)NULL) { 602 if (mn == (struct mbuf *)NULL) {
605 m_freem(m); 603 m_freem(m);
606 return ((struct mbuf *)NULL); 604 return ((struct mbuf *)NULL);
607 } 605 }
608 if (m->m_flags & M_PKTHDR) { 606 if (m->m_flags & M_PKTHDR) {
609 M_MOVE_PKTHDR(mn, m); 607 M_MOVE_PKTHDR(mn, m);
610 } else { 608 } else {
611 MCLAIM(mn, m->m_owner); 609 MCLAIM(mn, m->m_owner);
612 } 610 }
613 mn->m_next = m; 611 mn->m_next = m;
614 m = mn; 612 m = mn;
615 if (len < MHLEN) 613 if (len < MHLEN)
616 MH_ALIGN(m, len); 614 MH_ALIGN(m, len);
617 m->m_len = len; 615 m->m_len = len;
618 return (m); 616 return (m);
619} 617}
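
Most callers reach this through the M_PREPEND() macro, which falls back to m_prepend() only when the first mbuf lacks leading space. A sketch, with struct hdr standing in for a real protocol header:

    struct hdr *hp;

    M_PREPEND(m, sizeof(struct hdr), M_DONTWAIT);
    if (m == NULL)
            return ENOBUFS;         /* chain already freed, as in m_prepend() */
    hp = mtod(m, struct hdr *);
    /* ... fill in *hp ... */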
620 618
621/* 619/*
622 * Make a copy of an mbuf chain starting "off0" bytes from the beginning, 620 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
623 * continuing for "len" bytes. If len is M_COPYALL, copy to the end of the chain. 621 * continuing for "len" bytes. If len is M_COPYALL, copy to the end of the chain.
624 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller. 622 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
625 */ 623 */
626int MCFail; 624int MCFail;
627 625
628struct mbuf * 626struct mbuf *
629m_copym(struct mbuf *m, int off0, int len, int wait) 627m_copym(struct mbuf *m, int off0, int len, int wait)
630{ 628{
631 629
632 return m_copym0(m, off0, len, wait, 0); /* shallow copy on M_EXT */ 630 return m_copym0(m, off0, len, wait, 0); /* shallow copy on M_EXT */
633} 631}
634 632
635struct mbuf * 633struct mbuf *
636m_dup(struct mbuf *m, int off0, int len, int wait) 634m_dup(struct mbuf *m, int off0, int len, int wait)
637{ 635{
638 636
639 return m_copym0(m, off0, len, wait, 1); /* deep copy */ 637 return m_copym0(m, off0, len, wait, 1); /* deep copy */
640} 638}
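
The distinction matters once the copy is written to: m_copym() shares cluster storage with the original via MCLADDREFERENCE(), so the result must be treated as read-only, while m_dup() pays for private clusters that may be modified freely. Sketch:

    struct mbuf *n;

    /* Shallow: cheap, cluster data still shared with m. */
    n = m_copym(m, 0, M_COPYALL, M_DONTWAIT);

    /* Deep: independent storage, safe to modify. */
    n = m_dup(m, 0, M_COPYALL, M_DONTWAIT);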
641 639
642static struct mbuf * 640static struct mbuf *
643m_copym0(struct mbuf *m, int off0, int len, int wait, int deep) 641m_copym0(struct mbuf *m, int off0, int len, int wait, int deep)
644{ 642{
645 struct mbuf *n, **np; 643 struct mbuf *n, **np;
646 int off = off0; 644 int off = off0;
647 struct mbuf *top; 645 struct mbuf *top;
648 int copyhdr = 0; 646 int copyhdr = 0;
649 647
650 if (off < 0 || len < 0) 648 if (off < 0 || len < 0)
651 panic("m_copym: off %d, len %d", off, len); 649 panic("m_copym: off %d, len %d", off, len);
652 if (off == 0 && m->m_flags & M_PKTHDR) 650 if (off == 0 && m->m_flags & M_PKTHDR)
653 copyhdr = 1; 651 copyhdr = 1;
654 while (off > 0) { 652 while (off > 0) {
655 if (m == 0) 653 if (m == 0)
656 panic("m_copym: m == 0, off %d", off); 654 panic("m_copym: m == 0, off %d", off);
657 if (off < m->m_len) 655 if (off < m->m_len)
658 break; 656 break;
659 off -= m->m_len; 657 off -= m->m_len;
660 m = m->m_next; 658 m = m->m_next;
661 } 659 }
662 np = &top; 660 np = &top;
663 top = 0; 661 top = 0;
664 while (len > 0) { 662 while (len > 0) {
665 if (m == 0) { 663 if (m == 0) {
666 if (len != M_COPYALL) 664 if (len != M_COPYALL)
667 panic("m_copym: m == 0, len %d [!COPYALL]", 665 panic("m_copym: m == 0, len %d [!COPYALL]",
668 len); 666 len);
669 break; 667 break;
670 } 668 }
671 MGET(n, wait, m->m_type); 669 MGET(n, wait, m->m_type);
672 *np = n; 670 *np = n;
673 if (n == 0) 671 if (n == 0)
674 goto nospace; 672 goto nospace;
675 MCLAIM(n, m->m_owner); 673 MCLAIM(n, m->m_owner);
676 if (copyhdr) { 674 if (copyhdr) {
677 M_COPY_PKTHDR(n, m); 675 M_COPY_PKTHDR(n, m);
678 if (len == M_COPYALL) 676 if (len == M_COPYALL)
679 n->m_pkthdr.len -= off0; 677 n->m_pkthdr.len -= off0;
680 else 678 else
681 n->m_pkthdr.len = len; 679 n->m_pkthdr.len = len;
682 copyhdr = 0; 680 copyhdr = 0;
683 } 681 }
684 n->m_len = min(len, m->m_len - off); 682 n->m_len = min(len, m->m_len - off);
685 if (m->m_flags & M_EXT) { 683 if (m->m_flags & M_EXT) {
686 if (!deep) { 684 if (!deep) {
687 n->m_data = m->m_data + off; 685 n->m_data = m->m_data + off;
688 MCLADDREFERENCE(m, n); 686 MCLADDREFERENCE(m, n);
689 } else { 687 } else {
690 /* 688 /*
691 * we are unsure about the way m was allocated. 689 * we are unsure about the way m was allocated.
692 * copy into multiple MCLBYTES cluster mbufs. 690 * copy into multiple MCLBYTES cluster mbufs.
693 */ 691 */
694 MCLGET(n, wait); 692 MCLGET(n, wait);
695 n->m_len = 0; 693 n->m_len = 0;
696 n->m_len = M_TRAILINGSPACE(n); 694 n->m_len = M_TRAILINGSPACE(n);
697 n->m_len = min(n->m_len, len); 695 n->m_len = min(n->m_len, len);
698 n->m_len = min(n->m_len, m->m_len - off); 696 n->m_len = min(n->m_len, m->m_len - off);
699 memcpy(mtod(n, void *), mtod(m, char *) + off, 697 memcpy(mtod(n, void *), mtod(m, char *) + off,
700 (unsigned)n->m_len); 698 (unsigned)n->m_len);
701 } 699 }
702 } else 700 } else
703 memcpy(mtod(n, void *), mtod(m, char *) + off, 701 memcpy(mtod(n, void *), mtod(m, char *) + off,
704 (unsigned)n->m_len); 702 (unsigned)n->m_len);
705 if (len != M_COPYALL) 703 if (len != M_COPYALL)
706 len -= n->m_len; 704 len -= n->m_len;
707 off += n->m_len; 705 off += n->m_len;
708#ifdef DIAGNOSTIC 706#ifdef DIAGNOSTIC
709 if (off > m->m_len) 707 if (off > m->m_len)
710 panic("m_copym0 overrun"); 708 panic("m_copym0 overrun");
711#endif 709#endif
712 if (off == m->m_len) { 710 if (off == m->m_len) {
713 m = m->m_next; 711 m = m->m_next;
714 off = 0; 712 off = 0;
715 } 713 }
716 np = &n->m_next; 714 np = &n->m_next;
717 } 715 }
718 if (top == 0) 716 if (top == 0)
719 MCFail++; 717 MCFail++;
720 return (top); 718 return (top);
721nospace: 719nospace:
722 m_freem(top); 720 m_freem(top);
723 MCFail++; 721 MCFail++;
724 return (NULL); 722 return (NULL);
725} 723}
726 724
727/* 725/*
728 * Copy an entire packet, including header (which must be present). 726 * Copy an entire packet, including header (which must be present).
729 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. 727 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
730 */ 728 */
731struct mbuf * 729struct mbuf *
732m_copypacket(struct mbuf *m, int how) 730m_copypacket(struct mbuf *m, int how)
733{ 731{
734 struct mbuf *top, *n, *o; 732 struct mbuf *top, *n, *o;
735 733
736 MGET(n, how, m->m_type); 734 MGET(n, how, m->m_type);
737 top = n; 735 top = n;
738 if (!n) 736 if (!n)
739 goto nospace; 737 goto nospace;
740 738
741 MCLAIM(n, m->m_owner); 739 MCLAIM(n, m->m_owner);
742 M_COPY_PKTHDR(n, m); 740 M_COPY_PKTHDR(n, m);
743 n->m_len = m->m_len; 741 n->m_len = m->m_len;
744 if (m->m_flags & M_EXT) { 742 if (m->m_flags & M_EXT) {
745 n->m_data = m->m_data; 743 n->m_data = m->m_data;
746 MCLADDREFERENCE(m, n); 744 MCLADDREFERENCE(m, n);
747 } else { 745 } else {
748 memcpy(mtod(n, char *), mtod(m, char *), n->m_len); 746 memcpy(mtod(n, char *), mtod(m, char *), n->m_len);
749 } 747 }
750 748
751 m = m->m_next; 749 m = m->m_next;
752 while (m) { 750 while (m) {
753 MGET(o, how, m->m_type); 751 MGET(o, how, m->m_type);
754 if (!o) 752 if (!o)
755 goto nospace; 753 goto nospace;
756 754
757 MCLAIM(o, m->m_owner); 755 MCLAIM(o, m->m_owner);
758 n->m_next = o; 756 n->m_next = o;
759 n = n->m_next; 757 n = n->m_next;
760 758
761 n->m_len = m->m_len; 759 n->m_len = m->m_len;
762 if (m->m_flags & M_EXT) { 760 if (m->m_flags & M_EXT) {
763 n->m_data = m->m_data; 761 n->m_data = m->m_data;
764 MCLADDREFERENCE(m, n); 762 MCLADDREFERENCE(m, n);
765 } else { 763 } else {
766 memcpy(mtod(n, char *), mtod(m, char *), n->m_len); 764 memcpy(mtod(n, char *), mtod(m, char *), n->m_len);
767 } 765 }
768 766
769 m = m->m_next; 767 m = m->m_next;
770 } 768 }
771 return top; 769 return top;
772nospace: 770nospace:
773 m_freem(top); 771 m_freem(top);
774 MCFail++; 772 MCFail++;
775 return NULL; 773 return NULL;
776} 774}
777 775
778/* 776/*
779 * Copy data from an mbuf chain starting "off" bytes from the beginning, 777 * Copy data from an mbuf chain starting "off" bytes from the beginning,
780 * continuing for "len" bytes, into the indicated buffer. 778 * continuing for "len" bytes, into the indicated buffer.
781 */ 779 */
782void 780void
783m_copydata(struct mbuf *m, int off, int len, void *vp) 781m_copydata(struct mbuf *m, int off, int len, void *vp)
784{ 782{
785 unsigned count; 783 unsigned count;
786 void * cp = vp; 784 void * cp = vp;
787 785
788 if (off < 0 || len < 0) 786 if (off < 0 || len < 0)
789 panic("m_copydata: off %d, len %d", off, len); 787 panic("m_copydata: off %d, len %d", off, len);
790 while (off > 0) { 788 while (off > 0) {
791 if (m == NULL) 789 if (m == NULL)
792 panic("m_copydata: m == NULL, off %d", off); 790 panic("m_copydata: m == NULL, off %d", off);
793 if (off < m->m_len) 791 if (off < m->m_len)
794 break; 792 break;
795 off -= m->m_len; 793 off -= m->m_len;
796 m = m->m_next; 794 m = m->m_next;
797 } 795 }
798 while (len > 0) { 796 while (len > 0) {
799 if (m == NULL) 797 if (m == NULL)
800 panic("m_copydata: m == NULL, len %d", len); 798 panic("m_copydata: m == NULL, len %d", len);
801 count = min(m->m_len - off, len); 799 count = min(m->m_len - off, len);
802 memcpy(cp, mtod(m, char *) + off, count); 800 memcpy(cp, mtod(m, char *) + off, count);
803 len -= count; 801 len -= count;
804 cp = (char *)cp + count; 802 cp = (char *)cp + count;
805 off = 0; 803 off = 0;
806 m = m->m_next; 804 m = m->m_next;
807 } 805 }
808} 806}
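
This is the usual way to read a region that may straddle mbuf boundaries without first making it contiguous; it panics on underrun rather than returning an error, so the bytes must be known to exist. Sketch, with hlen an assumed offset already validated against the chain length:

    struct tcphdr th;

    m_copydata(m, hlen, sizeof(th), &th);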
809 807
810/* 808/*
811 * Concatenate mbuf chain n to m. 809 * Concatenate mbuf chain n to m.
812 * n may be copied into m (when n->m_len is small), so the data portion of 810 * n may be copied into m (when n->m_len is small), so the data portion of
813 * n can end up in an mbuf of a different mbuf type. 811 * n can end up in an mbuf of a different mbuf type.
814 * The m_pkthdr of m, if any, is not updated. 812 * The m_pkthdr of m, if any, is not updated.
815 */ 813 */
816void 814void
817m_cat(struct mbuf *m, struct mbuf *n) 815m_cat(struct mbuf *m, struct mbuf *n)
818{ 816{
819 817
820 while (m->m_next) 818 while (m->m_next)
821 m = m->m_next; 819 m = m->m_next;
822 while (n) { 820 while (n) {
823 if (M_READONLY(m) || n->m_len > M_TRAILINGSPACE(m)) { 821 if (M_READONLY(m) || n->m_len > M_TRAILINGSPACE(m)) {
824 /* just join the two chains */ 822 /* just join the two chains */
825 m->m_next = n; 823 m->m_next = n;
826 return; 824 return;
827 } 825 }
828 /* splat the data from one into the other */ 826 /* splat the data from one into the other */
829 memcpy(mtod(m, char *) + m->m_len, mtod(n, void *), 827 memcpy(mtod(m, char *) + m->m_len, mtod(n, void *),
830 (u_int)n->m_len); 828 (u_int)n->m_len);
831 m->m_len += n->m_len; 829 m->m_len += n->m_len;
832 n = m_free(n); 830 n = m_free(n);
833 } 831 }
834} 832}
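
Because m_cat() leaves any packet header alone, callers that maintain packet semantics account for the appended bytes themselves. A sketch; n_len must be captured before the call, since n may already have been freed inside m_cat():

    int n_len = n->m_pkthdr.len;    /* or the summed m_len over n's chain */

    m_cat(m, n);
    if (m->m_flags & M_PKTHDR)
            m->m_pkthdr.len += n_len;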
835 833
836void 834void
837m_adj(struct mbuf *mp, int req_len) 835m_adj(struct mbuf *mp, int req_len)
838{ 836{
839 int len = req_len; 837 int len = req_len;
840 struct mbuf *m; 838 struct mbuf *m;
841 int count; 839 int count;
842 840
843 if ((m = mp) == NULL) 841 if ((m = mp) == NULL)
844 return; 842 return;
845 if (len >= 0) { 843 if (len >= 0) {
846 /* 844 /*
847 * Trim from head. 845 * Trim from head.
848 */ 846 */
849 while (m != NULL && len > 0) { 847 while (m != NULL && len > 0) {
850 if (m->m_len <= len) { 848 if (m->m_len <= len) {
851 len -= m->m_len; 849 len -= m->m_len;
852 m->m_len = 0; 850 m->m_len = 0;
853 m = m->m_next; 851 m = m->m_next;
854 } else { 852 } else {
855 m->m_len -= len; 853 m->m_len -= len;
856 m->m_data += len; 854 m->m_data += len;
857 len = 0; 855 len = 0;
858 } 856 }
859 } 857 }
860 m = mp; 858 m = mp;
861 if (mp->m_flags & M_PKTHDR) 859 if (mp->m_flags & M_PKTHDR)
862 m->m_pkthdr.len -= (req_len - len); 860 m->m_pkthdr.len -= (req_len - len);
863 } else { 861 } else {
864 /* 862 /*
865 * Trim from tail. Scan the mbuf chain, 863 * Trim from tail. Scan the mbuf chain,
866 * calculating its length and finding the last mbuf. 864 * calculating its length and finding the last mbuf.
867 * If the adjustment only affects this mbuf, then just 865 * If the adjustment only affects this mbuf, then just
868 * adjust and return. Otherwise, rescan and truncate 866 * adjust and return. Otherwise, rescan and truncate
869 * after the remaining size. 867 * after the remaining size.
870 */ 868 */
871 len = -len; 869 len = -len;
872 count = 0; 870 count = 0;
873 for (;;) { 871 for (;;) {
874 count += m->m_len; 872 count += m->m_len;
875 if (m->m_next == (struct mbuf *)0) 873 if (m->m_next == (struct mbuf *)0)
876 break; 874 break;
877 m = m->m_next; 875 m = m->m_next;
878 } 876 }
879 if (m->m_len >= len) { 877 if (m->m_len >= len) {
880 m->m_len -= len; 878 m->m_len -= len;
881 if (mp->m_flags & M_PKTHDR) 879 if (mp->m_flags & M_PKTHDR)
882 mp->m_pkthdr.len -= len; 880 mp->m_pkthdr.len -= len;
883 return; 881 return;
884 } 882 }
885 count -= len; 883 count -= len;
886 if (count < 0) 884 if (count < 0)
887 count = 0; 885 count = 0;
888 /* 886 /*
889 * Correct length for chain is "count". 887 * Correct length for chain is "count".
890 * Find the mbuf with last data, adjust its length, 888 * Find the mbuf with last data, adjust its length,
891 * and toss data from remaining mbufs on chain. 889 * and toss data from remaining mbufs on chain.
892 */ 890 */
893 m = mp; 891 m = mp;
894 if (m->m_flags & M_PKTHDR) 892 if (m->m_flags & M_PKTHDR)
895 m->m_pkthdr.len = count; 893 m->m_pkthdr.len = count;
896 for (; m; m = m->m_next) { 894 for (; m; m = m->m_next) {
897 if (m->m_len >= count) { 895 if (m->m_len >= count) {
898 m->m_len = count; 896 m->m_len = count;
899 break; 897 break;
900 } 898 }
901 count -= m->m_len; 899 count -= m->m_len;
902 } 900 }
903 if (m) 901 if (m)
904 while (m->m_next) 902 while (m->m_next)
905 (m = m->m_next)->m_len = 0; 903 (m = m->m_next)->m_len = 0;
906 } 904 }
907} 905}
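
A positive req_len trims from the head, a negative one from the tail. A driver-style sketch, assuming an Ethernet frame and ETHER_CRC_LEN from net/if_ether.h:

    m_adj(m, sizeof(struct ether_header));  /* strip the link header */
    m_adj(m, -ETHER_CRC_LEN);               /* strip the trailing CRC */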
908 906
909/* 907/*
910 * Rearrange an mbuf chain so that len bytes are contiguous 908 * Rearrange an mbuf chain so that len bytes are contiguous
911 * and in the data area of an mbuf (so that mtod and dtom 909 * and in the data area of an mbuf (so that mtod and dtom
912 * will work for a structure of size len). Returns the resulting 910 * will work for a structure of size len). Returns the resulting
913 * mbuf chain on success, frees it and returns null on failure. 911 * mbuf chain on success, frees it and returns null on failure.
914 * If there is room, it will add up to max_protohdr-len extra bytes to the 912 * If there is room, it will add up to max_protohdr-len extra bytes to the
915 * contiguous region in an attempt to avoid being called next time. 913 * contiguous region in an attempt to avoid being called next time.
916 */ 914 */
917int MPFail; 915int MPFail;
918 916
919struct mbuf * 917struct mbuf *
920m_pullup(struct mbuf *n, int len) 918m_pullup(struct mbuf *n, int len)
921{ 919{
922 struct mbuf *m; 920 struct mbuf *m;
923 int count; 921 int count;
924 int space; 922 int space;
925 923
926 /* 924 /*
927 * If first mbuf has no cluster, and has room for len bytes 925 * If first mbuf has no cluster, and has room for len bytes
928 * without shifting current data, pullup into it, 926 * without shifting current data, pullup into it,
929 * otherwise allocate a new mbuf to prepend to the chain. 927 * otherwise allocate a new mbuf to prepend to the chain.
930 */ 928 */
931 if ((n->m_flags & M_EXT) == 0 && 929 if ((n->m_flags & M_EXT) == 0 &&
932 n->m_data + len < &n->m_dat[MLEN] && n->m_next) { 930 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
933 if (n->m_len >= len) 931 if (n->m_len >= len)
934 return (n); 932 return (n);
935 m = n; 933 m = n;
936 n = n->m_next; 934 n = n->m_next;
937 len -= m->m_len; 935 len -= m->m_len;
938 } else { 936 } else {
939 if (len > MHLEN) 937 if (len > MHLEN)
940 goto bad; 938 goto bad;
941 MGET(m, M_DONTWAIT, n->m_type); 939 MGET(m, M_DONTWAIT, n->m_type);
942 if (m == 0) 940 if (m == 0)
943 goto bad; 941 goto bad;
944 MCLAIM(m, n->m_owner); 942 MCLAIM(m, n->m_owner);
945 m->m_len = 0; 943 m->m_len = 0;
946 if (n->m_flags & M_PKTHDR) { 944 if (n->m_flags & M_PKTHDR) {
947 M_MOVE_PKTHDR(m, n); 945 M_MOVE_PKTHDR(m, n);
948 } 946 }
949 } 947 }
950 space = &m->m_dat[MLEN] - (m->m_data + m->m_len); 948 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
951 do { 949 do {
952 count = min(min(max(len, max_protohdr), space), n->m_len); 950 count = min(min(max(len, max_protohdr), space), n->m_len);
953 memcpy(mtod(m, char *) + m->m_len, mtod(n, void *), 951 memcpy(mtod(m, char *) + m->m_len, mtod(n, void *),
954 (unsigned)count); 952 (unsigned)count);
955 len -= count; 953 len -= count;
956 m->m_len += count; 954 m->m_len += count;
957 n->m_len -= count; 955 n->m_len -= count;
958 space -= count; 956 space -= count;
959 if (n->m_len) 957 if (n->m_len)
960 n->m_data += count; 958 n->m_data += count;
961 else 959 else
962 n = m_free(n); 960 n = m_free(n);
963 } while (len > 0 && n); 961 } while (len > 0 && n);
964 if (len > 0) { 962 if (len > 0) {
965 (void) m_free(m); 963 (void) m_free(m);
966 goto bad; 964 goto bad;
967 } 965 }
968 m->m_next = n; 966 m->m_next = n;
969 return (m); 967 return (m);
970bad: 968bad:
971 m_freem(n); 969 m_freem(n);
972 MPFail++; 970 MPFail++;
973 return (NULL); 971 return (NULL);
974} 972}
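
The canonical use makes a protocol header contiguous before casting, checking m_len first to skip the call in the common case; on failure the chain has already been freed. Sketch:

    struct ip *ip;

    if (m->m_len < sizeof(struct ip) &&
        (m = m_pullup(m, sizeof(struct ip))) == NULL)
            return;                 /* m_pullup() freed the chain */
    ip = mtod(m, struct ip *);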
975 973
976/* 974/*
977 * Like m_pullup(), except a new mbuf is always allocated, and we allow 975 * Like m_pullup(), except a new mbuf is always allocated, and we allow
978 * the amount of empty space before the data in the new mbuf to be specified 976 * the amount of empty space before the data in the new mbuf to be specified
979 * (in the event that the caller expects to prepend later). 977 * (in the event that the caller expects to prepend later).
980 */ 978 */
981int MSFail; 979int MSFail;
982 980
983struct mbuf * 981struct mbuf *
984m_copyup(struct mbuf *n, int len, int dstoff) 982m_copyup(struct mbuf *n, int len, int dstoff)
985{ 983{
986 struct mbuf *m; 984 struct mbuf *m;
987 int count, space; 985 int count, space;
988 986
989 if (len > (MHLEN - dstoff)) 987 if (len > (MHLEN - dstoff))
990 goto bad; 988 goto bad;
991 MGET(m, M_DONTWAIT, n->m_type); 989 MGET(m, M_DONTWAIT, n->m_type);
992 if (m == NULL) 990 if (m == NULL)
993 goto bad; 991 goto bad;
994 MCLAIM(m, n->m_owner); 992 MCLAIM(m, n->m_owner);
995 m->m_len = 0; 993 m->m_len = 0;
996 if (n->m_flags & M_PKTHDR) { 994 if (n->m_flags & M_PKTHDR) {
997 M_MOVE_PKTHDR(m, n); 995 M_MOVE_PKTHDR(m, n);
998 } 996 }
999 m->m_data += dstoff; 997 m->m_data += dstoff;
1000 space = &m->m_dat[MLEN] - (m->m_data + m->m_len); 998 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
1001 do { 999 do {
1002 count = min(min(max(len, max_protohdr), space), n->m_len); 1000 count = min(min(max(len, max_protohdr), space), n->m_len);
1003 memcpy(mtod(m, char *) + m->m_len, mtod(n, void *), 1001 memcpy(mtod(m, char *) + m->m_len, mtod(n, void *),
1004 (unsigned)count); 1002 (unsigned)count);
1005 len -= count; 1003 len -= count;
1006 m->m_len += count; 1004 m->m_len += count;
1007 n->m_len -= count; 1005 n->m_len -= count;
1008 space -= count; 1006 space -= count;
1009 if (n->m_len) 1007 if (n->m_len)
1010 n->m_data += count; 1008 n->m_data += count;
1011 else 1009 else
1012 n = m_free(n); 1010 n = m_free(n);
1013 } while (len > 0 && n); 1011 } while (len > 0 && n);
1014 if (len > 0) { 1012 if (len > 0) {
1015 (void) m_free(m); 1013 (void) m_free(m);
1016 goto bad; 1014 goto bad;
1017 } 1015 }
1018 m->m_next = n; 1016 m->m_next = n;
1019 return (m); 1017 return (m);
1020 bad: 1018 bad:
1021 m_freem(n); 1019 m_freem(n);
1022 MSFail++; 1020 MSFail++;
1023 return (NULL); 1021 return (NULL);
1024} 1022}
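
The dstoff argument is what distinguishes this from m_pullup(): it reserves writable space in front of the data, e.g. for a link-layer header to be prepended later. Sketch with an assumed 16-byte reservation:

    m = m_copyup(m, sizeof(struct ip), 16);
    if (m == NULL)
            return;                 /* original chain was freed */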
1025 1023
1026/* 1024/*
1027 * Partition an mbuf chain in two pieces, returning the tail -- 1025 * Partition an mbuf chain in two pieces, returning the tail --
1028 * all but the first len0 bytes. In case of failure, it returns NULL and 1026 * all but the first len0 bytes. In case of failure, it returns NULL and
1029 * attempts to restore the chain to its original state. 1027 * attempts to restore the chain to its original state.
1030 */ 1028 */
1031struct mbuf * 1029struct mbuf *
1032m_split(struct mbuf *m0, int len0, int wait) 1030m_split(struct mbuf *m0, int len0, int wait)
1033{ 1031{
1034 1032
1035 return m_split0(m0, len0, wait, 1); 1033 return m_split0(m0, len0, wait, 1);
1036} 1034}
1037 1035
1038static struct mbuf * 1036static struct mbuf *
1039m_split0(struct mbuf *m0, int len0, int wait, int copyhdr) 1037m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
1040{ 1038{
1041 struct mbuf *m, *n; 1039 struct mbuf *m, *n;
1042 unsigned len = len0, remain, len_save; 1040 unsigned len = len0, remain, len_save;
1043 1041
1044 for (m = m0; m && len > m->m_len; m = m->m_next) 1042 for (m = m0; m && len > m->m_len; m = m->m_next)
1045 len -= m->m_len; 1043 len -= m->m_len;
1046 if (m == 0) 1044 if (m == 0)
1047 return (NULL); 1045 return (NULL);
1048 remain = m->m_len - len; 1046 remain = m->m_len - len;
1049 if (copyhdr && (m0->m_flags & M_PKTHDR)) { 1047 if (copyhdr && (m0->m_flags & M_PKTHDR)) {
1050 MGETHDR(n, wait, m0->m_type); 1048 MGETHDR(n, wait, m0->m_type);
1051 if (n == 0) 1049 if (n == 0)
1052 return (NULL); 1050 return (NULL);
1053 MCLAIM(n, m0->m_owner); 1051 MCLAIM(n, m0->m_owner);
1054 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; 1052 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1055 n->m_pkthdr.len = m0->m_pkthdr.len - len0; 1053 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1056 len_save = m0->m_pkthdr.len; 1054 len_save = m0->m_pkthdr.len;
1057 m0->m_pkthdr.len = len0; 1055 m0->m_pkthdr.len = len0;
1058 if (m->m_flags & M_EXT) 1056 if (m->m_flags & M_EXT)
1059 goto extpacket; 1057 goto extpacket;
1060 if (remain > MHLEN) { 1058 if (remain > MHLEN) {
1061 /* m can't be the lead packet */ 1059 /* m can't be the lead packet */
1062 MH_ALIGN(n, 0); 1060 MH_ALIGN(n, 0);
1063 n->m_len = 0; 1061 n->m_len = 0;
1064 n->m_next = m_split(m, len, wait); 1062 n->m_next = m_split(m, len, wait);
1065 if (n->m_next == 0) { 1063 if (n->m_next == 0) {
1066 (void) m_free(n); 1064 (void) m_free(n);
1067 m0->m_pkthdr.len = len_save; 1065 m0->m_pkthdr.len = len_save;
1068 return (NULL); 1066 return (NULL);
1069 } else 1067 } else
1070 return (n); 1068 return (n);
1071 } else 1069 } else
1072 MH_ALIGN(n, remain); 1070 MH_ALIGN(n, remain);
1073 } else if (remain == 0) { 1071 } else if (remain == 0) {
1074 n = m->m_next; 1072 n = m->m_next;
1075 m->m_next = 0; 1073 m->m_next = 0;
1076 return (n); 1074 return (n);
1077 } else { 1075 } else {
1078 MGET(n, wait, m->m_type); 1076 MGET(n, wait, m->m_type);
1079 if (n == 0) 1077 if (n == 0)
1080 return (NULL); 1078 return (NULL);
1081 MCLAIM(n, m->m_owner); 1079 MCLAIM(n, m->m_owner);
1082 M_ALIGN(n, remain); 1080 M_ALIGN(n, remain);
1083 } 1081 }
1084extpacket: 1082extpacket:
1085 if (m->m_flags & M_EXT) { 1083 if (m->m_flags & M_EXT) {
1086 n->m_data = m->m_data + len; 1084 n->m_data = m->m_data + len;
1087 MCLADDREFERENCE(m, n); 1085 MCLADDREFERENCE(m, n);
1088 } else { 1086 } else {
1089 memcpy(mtod(n, void *), mtod(m, char *) + len, remain); 1087 memcpy(mtod(n, void *), mtod(m, char *) + len, remain);

cvs diff -r1.145 -r1.146 src/sys/kern/uipc_syscalls.c

--- src/sys/kern/uipc_syscalls.c 2011/07/15 14:50:19 1.145
+++ src/sys/kern/uipc_syscalls.c 2011/07/27 14:35:34 1.146
@@ -1,1087 +1,1085 @@ @@ -1,1087 +1,1085 @@
1/* $NetBSD: uipc_syscalls.c,v 1.145 2011/07/15 14:50:19 christos Exp $ */ 1/* $NetBSD: uipc_syscalls.c,v 1.146 2011/07/27 14:35:34 uebayasi Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. 4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran. 8 * by Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright 15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the 16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution. 17 * documentation and/or other materials provided with the distribution.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32/* 32/*
33 * Copyright (c) 1982, 1986, 1989, 1990, 1993 33 * Copyright (c) 1982, 1986, 1989, 1990, 1993
34 * The Regents of the University of California. All rights reserved. 34 * The Regents of the University of California. All rights reserved.
35 * 35 *
36 * Redistribution and use in source and binary forms, with or without 36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions 37 * modification, are permitted provided that the following conditions
38 * are met: 38 * are met:
39 * 1. Redistributions of source code must retain the above copyright 39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer. 40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright 41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the 42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution. 43 * documentation and/or other materials provided with the distribution.
44 * 3. Neither the name of the University nor the names of its contributors 44 * 3. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software 45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission. 46 * without specific prior written permission.
47 * 47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE. 58 * SUCH DAMAGE.
59 * 59 *
60 * @(#)uipc_syscalls.c 8.6 (Berkeley) 2/14/95 60 * @(#)uipc_syscalls.c 8.6 (Berkeley) 2/14/95
61 */ 61 */
62 62
63#include <sys/cdefs.h> 63#include <sys/cdefs.h>
64__KERNEL_RCSID(0, "$NetBSD: uipc_syscalls.c,v 1.145 2011/07/15 14:50:19 christos Exp $"); 64__KERNEL_RCSID(0, "$NetBSD: uipc_syscalls.c,v 1.146 2011/07/27 14:35:34 uebayasi Exp $");
65 65
66#include "opt_pipe.h" 66#include "opt_pipe.h"
67 67
68#include <sys/param.h> 68#include <sys/param.h>
69#include <sys/systm.h> 69#include <sys/systm.h>
70#include <sys/filedesc.h> 70#include <sys/filedesc.h>
71#include <sys/proc.h> 71#include <sys/proc.h>
72#include <sys/file.h> 72#include <sys/file.h>
73#include <sys/buf.h> 73#include <sys/buf.h>
74#include <sys/mbuf.h> 74#include <sys/mbuf.h>
75#include <sys/protosw.h> 75#include <sys/protosw.h>
76#include <sys/socket.h> 76#include <sys/socket.h>
77#include <sys/socketvar.h> 77#include <sys/socketvar.h>
78#include <sys/signalvar.h> 78#include <sys/signalvar.h>
79#include <sys/un.h> 79#include <sys/un.h>
80#include <sys/ktrace.h> 80#include <sys/ktrace.h>
81#include <sys/event.h> 81#include <sys/event.h>
82#include <sys/kauth.h> 82#include <sys/kauth.h>
83 83
84#include <sys/mount.h> 84#include <sys/mount.h>
85#include <sys/syscallargs.h> 85#include <sys/syscallargs.h>
86 86
87#include <uvm/uvm_extern.h> 
88 
89/* 87/*
90 * System call interface to the socket abstraction. 88 * System call interface to the socket abstraction.
91 */ 89 */
92extern const struct fileops socketops; 90extern const struct fileops socketops;
93 91
94int 92int
95sys___socket30(struct lwp *l, const struct sys___socket30_args *uap, register_t *retval) 93sys___socket30(struct lwp *l, const struct sys___socket30_args *uap, register_t *retval)
96{ 94{
97 /* { 95 /* {
98 syscallarg(int) domain; 96 syscallarg(int) domain;
99 syscallarg(int) type; 97 syscallarg(int) type;
100 syscallarg(int) protocol; 98 syscallarg(int) protocol;
101 } */ 99 } */
102 int fd, error; 100 int fd, error;
103 101
104 error = fsocreate(SCARG(uap, domain), NULL, SCARG(uap, type), 102 error = fsocreate(SCARG(uap, domain), NULL, SCARG(uap, type),
105 SCARG(uap, protocol), l, &fd); 103 SCARG(uap, protocol), l, &fd);
106 if (error == 0) 104 if (error == 0)
107 *retval = fd; 105 *retval = fd;
108 return error; 106 return error;
109} 107}
110 108
111/* ARGSUSED */ 109/* ARGSUSED */
112int 110int
113sys_bind(struct lwp *l, const struct sys_bind_args *uap, register_t *retval) 111sys_bind(struct lwp *l, const struct sys_bind_args *uap, register_t *retval)
114{ 112{
115 /* { 113 /* {
116 syscallarg(int) s; 114 syscallarg(int) s;
117 syscallarg(const struct sockaddr *) name; 115 syscallarg(const struct sockaddr *) name;
118 syscallarg(unsigned int) namelen; 116 syscallarg(unsigned int) namelen;
119 } */ 117 } */
120 struct mbuf *nam; 118 struct mbuf *nam;
121 int error; 119 int error;
122 120
123 error = sockargs(&nam, SCARG(uap, name), SCARG(uap, namelen), 121 error = sockargs(&nam, SCARG(uap, name), SCARG(uap, namelen),
124 MT_SONAME); 122 MT_SONAME);
125 if (error) 123 if (error)
126 return error; 124 return error;
127 125
128 return do_sys_bind(l, SCARG(uap, s), nam); 126 return do_sys_bind(l, SCARG(uap, s), nam);
129} 127}
130 128
131int 129int
132do_sys_bind(struct lwp *l, int fd, struct mbuf *nam) 130do_sys_bind(struct lwp *l, int fd, struct mbuf *nam)
133{ 131{
134 struct socket *so; 132 struct socket *so;
135 int error; 133 int error;
136 134
137 if ((error = fd_getsock(fd, &so)) != 0) { 135 if ((error = fd_getsock(fd, &so)) != 0) {
138 m_freem(nam); 136 m_freem(nam);
139 return (error); 137 return (error);
140 } 138 }
141 MCLAIM(nam, so->so_mowner); 139 MCLAIM(nam, so->so_mowner);
142 error = sobind(so, nam, l); 140 error = sobind(so, nam, l);
143 m_freem(nam); 141 m_freem(nam);
144 fd_putfile(fd); 142 fd_putfile(fd);
145 return error; 143 return error;
146} 144}
147 145
148/* ARGSUSED */ 146/* ARGSUSED */
149int 147int
150sys_listen(struct lwp *l, const struct sys_listen_args *uap, register_t *retval) 148sys_listen(struct lwp *l, const struct sys_listen_args *uap, register_t *retval)
151{ 149{
152 /* { 150 /* {
153 syscallarg(int) s; 151 syscallarg(int) s;
154 syscallarg(int) backlog; 152 syscallarg(int) backlog;
155 } */ 153 } */
156 struct socket *so; 154 struct socket *so;
157 int error; 155 int error;
158 156
159 if ((error = fd_getsock(SCARG(uap, s), &so)) != 0) 157 if ((error = fd_getsock(SCARG(uap, s), &so)) != 0)
160 return (error); 158 return (error);
161 error = solisten(so, SCARG(uap, backlog), l); 159 error = solisten(so, SCARG(uap, backlog), l);
162 fd_putfile(SCARG(uap, s)); 160 fd_putfile(SCARG(uap, s));
163 return error; 161 return error;
164} 162}
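
From userland, the entry points above are the familiar server-side socket calls. A minimal sketch (IPv4; the port and backlog are made up, error handling trimmed):

    struct sockaddr_in sin = {
            .sin_len = sizeof(sin),
            .sin_family = AF_INET,
            .sin_port = htons(8080),
            .sin_addr = { .s_addr = htonl(INADDR_ANY) },
    };
    int s = socket(AF_INET, SOCK_STREAM, 0);

    bind(s, (struct sockaddr *)&sin, sizeof(sin));
    listen(s, 5);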
165 163
166int 164int
167do_sys_accept(struct lwp *l, int sock, struct mbuf **name, register_t *new_sock, 165do_sys_accept(struct lwp *l, int sock, struct mbuf **name, register_t *new_sock,
168 const sigset_t *mask, int flags, int clrflags) 166 const sigset_t *mask, int flags, int clrflags)
169{ 167{
170 file_t *fp, *fp2; 168 file_t *fp, *fp2;
171 struct mbuf *nam; 169 struct mbuf *nam;
172 int error, fd; 170 int error, fd;
173 struct socket *so, *so2; 171 struct socket *so, *so2;
174 short wakeup_state = 0; 172 short wakeup_state = 0;
175 173
176 if ((fp = fd_getfile(sock)) == NULL) 174 if ((fp = fd_getfile(sock)) == NULL)
177 return (EBADF); 175 return (EBADF);
178 if (fp->f_type != DTYPE_SOCKET) { 176 if (fp->f_type != DTYPE_SOCKET) {
179 fd_putfile(sock); 177 fd_putfile(sock);
180 return (ENOTSOCK); 178 return (ENOTSOCK);
181 } 179 }
182 if ((error = fd_allocfile(&fp2, &fd)) != 0) { 180 if ((error = fd_allocfile(&fp2, &fd)) != 0) {
183 fd_putfile(sock); 181 fd_putfile(sock);
184 return (error); 182 return (error);
185 } 183 }
186 nam = m_get(M_WAIT, MT_SONAME); 184 nam = m_get(M_WAIT, MT_SONAME);
187 *new_sock = fd; 185 *new_sock = fd;
188 so = fp->f_data; 186 so = fp->f_data;
189 solock(so); 187 solock(so);
190 188
191 if (__predict_false(mask)) 189 if (__predict_false(mask))
192 sigsuspendsetup(l, mask); 190 sigsuspendsetup(l, mask);
193 191
194 if (!(so->so_proto->pr_flags & PR_LISTEN)) { 192 if (!(so->so_proto->pr_flags & PR_LISTEN)) {
195 error = EOPNOTSUPP; 193 error = EOPNOTSUPP;
196 goto bad; 194 goto bad;
197 } 195 }
198 if ((so->so_options & SO_ACCEPTCONN) == 0) { 196 if ((so->so_options & SO_ACCEPTCONN) == 0) {
199 error = EINVAL; 197 error = EINVAL;
200 goto bad; 198 goto bad;
201 } 199 }
202 if (so->so_nbio && so->so_qlen == 0) { 200 if (so->so_nbio && so->so_qlen == 0) {
203 error = EWOULDBLOCK; 201 error = EWOULDBLOCK;
204 goto bad; 202 goto bad;
205 } 203 }
206 while (so->so_qlen == 0 && so->so_error == 0) { 204 while (so->so_qlen == 0 && so->so_error == 0) {
207 if (so->so_state & SS_CANTRCVMORE) { 205 if (so->so_state & SS_CANTRCVMORE) {
208 so->so_error = ECONNABORTED; 206 so->so_error = ECONNABORTED;
209 break; 207 break;
210 } 208 }
211 if (wakeup_state & SS_RESTARTSYS) { 209 if (wakeup_state & SS_RESTARTSYS) {
212 error = ERESTART; 210 error = ERESTART;
213 goto bad; 211 goto bad;
214 } 212 }
215 error = sowait(so, true, 0); 213 error = sowait(so, true, 0);
216 if (error) { 214 if (error) {
217 goto bad; 215 goto bad;
218 } 216 }
219 wakeup_state = so->so_state; 217 wakeup_state = so->so_state;
220 } 218 }
221 if (so->so_error) { 219 if (so->so_error) {
222 error = so->so_error; 220 error = so->so_error;
223 so->so_error = 0; 221 so->so_error = 0;
224 goto bad; 222 goto bad;
225 } 223 }
226 /* connection has been removed from the listen queue */ 224 /* connection has been removed from the listen queue */
227 KNOTE(&so->so_rcv.sb_sel.sel_klist, NOTE_SUBMIT); 225 KNOTE(&so->so_rcv.sb_sel.sel_klist, NOTE_SUBMIT);
228 so2 = TAILQ_FIRST(&so->so_q); 226 so2 = TAILQ_FIRST(&so->so_q);
229 if (soqremque(so2, 1) == 0) 227 if (soqremque(so2, 1) == 0)
230 panic("accept"); 228 panic("accept");
231 fp2->f_type = DTYPE_SOCKET; 229 fp2->f_type = DTYPE_SOCKET;
232 fp2->f_flag = (fp->f_flag & ~clrflags) | 230 fp2->f_flag = (fp->f_flag & ~clrflags) |
233 ((flags & SOCK_NONBLOCK) ? FNONBLOCK : 0);  231 ((flags & SOCK_NONBLOCK) ? FNONBLOCK : 0);
234 fp2->f_ops = &socketops; 232 fp2->f_ops = &socketops;
235 fp2->f_data = so2; 233 fp2->f_data = so2;
236 error = soaccept(so2, nam); 234 error = soaccept(so2, nam);
237 so2->so_cred = kauth_cred_dup(so->so_cred); 235 so2->so_cred = kauth_cred_dup(so->so_cred);
238 sounlock(so); 236 sounlock(so);
239 if (error) { 237 if (error) {
240 /* an error occurred, free the file descriptor and mbuf */ 238 /* an error occurred, free the file descriptor and mbuf */
241 m_freem(nam); 239 m_freem(nam);
242 mutex_enter(&fp2->f_lock); 240 mutex_enter(&fp2->f_lock);
243 fp2->f_count++; 241 fp2->f_count++;
244 mutex_exit(&fp2->f_lock); 242 mutex_exit(&fp2->f_lock);
245 closef(fp2); 243 closef(fp2);
246 fd_abort(curproc, NULL, fd); 244 fd_abort(curproc, NULL, fd);
247 } else { 245 } else {
248 fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0); 246 fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0);
249 fd_affix(curproc, fp2, fd); 247 fd_affix(curproc, fp2, fd);
250 *name = nam; 248 *name = nam;
251 } 249 }
252 fd_putfile(sock); 250 fd_putfile(sock);
253 if (__predict_false(mask)) 251 if (__predict_false(mask))
254 sigsuspendteardown(l); 252 sigsuspendteardown(l);
255 return (error); 253 return (error);
256 bad: 254 bad:
257 sounlock(so); 255 sounlock(so);
258 m_freem(nam); 256 m_freem(nam);
259 fd_putfile(sock); 257 fd_putfile(sock);
260 fd_abort(curproc, fp2, fd); 258 fd_abort(curproc, fp2, fd);
261 if (__predict_false(mask)) 259 if (__predict_false(mask))
262 sigsuspendteardown(l); 260 sigsuspendteardown(l);
263 return (error); 261 return (error);
264} 262}
265 263
266int 264int
267sys_accept(struct lwp *l, const struct sys_accept_args *uap, register_t *retval) 265sys_accept(struct lwp *l, const struct sys_accept_args *uap, register_t *retval)
268{ 266{
269 /* { 267 /* {
270 syscallarg(int) s; 268 syscallarg(int) s;
271 syscallarg(struct sockaddr *) name; 269 syscallarg(struct sockaddr *) name;
272 syscallarg(unsigned int *) anamelen; 270 syscallarg(unsigned int *) anamelen;
273 } */ 271 } */
274 int error, fd; 272 int error, fd;
275 struct mbuf *name; 273 struct mbuf *name;
276 274
277 error = do_sys_accept(l, SCARG(uap, s), &name, retval, NULL, 0, 0); 275 error = do_sys_accept(l, SCARG(uap, s), &name, retval, NULL, 0, 0);
278 if (error != 0) 276 if (error != 0)
279 return error; 277 return error;
280 error = copyout_sockname(SCARG(uap, name), SCARG(uap, anamelen), 278 error = copyout_sockname(SCARG(uap, name), SCARG(uap, anamelen),
281 MSG_LENUSRSPACE, name); 279 MSG_LENUSRSPACE, name);
282 if (name != NULL) 280 if (name != NULL)
283 m_free(name); 281 m_free(name);
284 if (error != 0) { 282 if (error != 0) {
285 fd = (int)*retval; 283 fd = (int)*retval;
286 if (fd_getfile(fd) != NULL) 284 if (fd_getfile(fd) != NULL)
287 (void)fd_close(fd); 285 (void)fd_close(fd);
288 } 286 }
289 return error; 287 return error;
290} 288}
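
The userland counterpart; the cleanup path above is why a failed sockname copyout also closes the freshly allocated descriptor. Sketch:

    struct sockaddr_in peer;
    socklen_t plen = sizeof(peer);
    int c = accept(s, (struct sockaddr *)&peer, &plen);

    if (c == -1)
            err(1, "accept");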
291 289
292int 290int
293sys_paccept(struct lwp *l, const struct sys_paccept_args *uap, 291sys_paccept(struct lwp *l, const struct sys_paccept_args *uap,
294 register_t *retval) 292 register_t *retval)
295{ 293{
296 /* { 294 /* {
297 syscallarg(int) s; 295 syscallarg(int) s;
298 syscallarg(struct sockaddr *) name; 296 syscallarg(struct sockaddr *) name;
299 syscallarg(unsigned int *) anamelen; 297 syscallarg(unsigned int *) anamelen;
300 syscallarg(const sigset_t *) mask; 298 syscallarg(const sigset_t *) mask;
301 syscallarg(int) flags; 299 syscallarg(int) flags;
302 } */ 300 } */
303 int error, fd; 301 int error, fd;
304 struct mbuf *name; 302 struct mbuf *name;
305 sigset_t *mask, amask; 303 sigset_t *mask, amask;
306 304
307 if (SCARG(uap, mask) != NULL) { 305 if (SCARG(uap, mask) != NULL) {
308 error = copyin(SCARG(uap, mask), &amask, sizeof(amask)); 306 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
309 if (error) 307 if (error)
310 return error; 308 return error;
311 mask = &amask; 309 mask = &amask;
312 } else 310 } else
313 mask = NULL; 311 mask = NULL;
314 312
315 error = do_sys_accept(l, SCARG(uap, s), &name, retval, mask, 313 error = do_sys_accept(l, SCARG(uap, s), &name, retval, mask,
316 SCARG(uap, flags), FNONBLOCK); 314 SCARG(uap, flags), FNONBLOCK);
317 if (error != 0) 315 if (error != 0)
318 return error; 316 return error;
319 error = copyout_sockname(SCARG(uap, name), SCARG(uap, anamelen), 317 error = copyout_sockname(SCARG(uap, name), SCARG(uap, anamelen),
320 MSG_LENUSRSPACE, name); 318 MSG_LENUSRSPACE, name);
321 if (name != NULL) 319 if (name != NULL)
322 m_free(name); 320 m_free(name);
323 if (error != 0) { 321 if (error != 0) {
324 fd = (int)*retval; 322 fd = (int)*retval;
325 if (fd_getfile(fd) != NULL) 323 if (fd_getfile(fd) != NULL)
326 (void)fd_close(fd); 324 (void)fd_close(fd);
327 } 325 }
328 return error; 326 return error;
329} 327}
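
paccept(2) is the NetBSD extension served here: it installs a signal mask for the duration of the wait, in the spirit of pselect(2), and can apply SOCK_CLOEXEC/SOCK_NONBLOCK to the new descriptor. A sketch, assuming the prototype from <sys/socket.h>:

    sigset_t mask;

    sigemptyset(&mask);             /* signal mask to hold while blocked */
    int c = paccept(s, NULL, NULL, &mask, SOCK_CLOEXEC | SOCK_NONBLOCK);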
330 328
331/* ARGSUSED */ 329/* ARGSUSED */
332int 330int
333sys_connect(struct lwp *l, const struct sys_connect_args *uap, register_t *retval) 331sys_connect(struct lwp *l, const struct sys_connect_args *uap, register_t *retval)
334{ 332{
335 /* { 333 /* {
336 syscallarg(int) s; 334 syscallarg(int) s;
337 syscallarg(const struct sockaddr *) name; 335 syscallarg(const struct sockaddr *) name;
338 syscallarg(unsigned int) namelen; 336 syscallarg(unsigned int) namelen;
339 } */ 337 } */
340 int error; 338 int error;
341 struct mbuf *nam; 339 struct mbuf *nam;
342 340
343 error = sockargs(&nam, SCARG(uap, name), SCARG(uap, namelen), 341 error = sockargs(&nam, SCARG(uap, name), SCARG(uap, namelen),
344 MT_SONAME); 342 MT_SONAME);
345 if (error) 343 if (error)
346 return error; 344 return error;
347 return do_sys_connect(l, SCARG(uap, s), nam); 345 return do_sys_connect(l, SCARG(uap, s), nam);
348} 346}
349 347
350int 348int
351do_sys_connect(struct lwp *l, int fd, struct mbuf *nam) 349do_sys_connect(struct lwp *l, int fd, struct mbuf *nam)
352{ 350{
353 struct socket *so; 351 struct socket *so;
354 int error; 352 int error;
355 int interrupted = 0; 353 int interrupted = 0;
356 354
357 if ((error = fd_getsock(fd, &so)) != 0) { 355 if ((error = fd_getsock(fd, &so)) != 0) {
358 m_freem(nam); 356 m_freem(nam);
359 return (error); 357 return (error);
360 } 358 }
361 solock(so); 359 solock(so);
362 MCLAIM(nam, so->so_mowner); 360 MCLAIM(nam, so->so_mowner);
363 if ((so->so_state & SS_ISCONNECTING) != 0) { 361 if ((so->so_state & SS_ISCONNECTING) != 0) {
364 error = EALREADY; 362 error = EALREADY;
365 goto out; 363 goto out;
366 } 364 }
367 365
368 error = soconnect(so, nam, l); 366 error = soconnect(so, nam, l);
369 if (error) 367 if (error)
370 goto bad; 368 goto bad;
371 if (so->so_nbio && (so->so_state & SS_ISCONNECTING) != 0) { 369 if (so->so_nbio && (so->so_state & SS_ISCONNECTING) != 0) {
372 error = EINPROGRESS; 370 error = EINPROGRESS;
373 goto out; 371 goto out;
374 } 372 }
375 while ((so->so_state & SS_ISCONNECTING) != 0 && so->so_error == 0) { 373 while ((so->so_state & SS_ISCONNECTING) != 0 && so->so_error == 0) {
376 error = sowait(so, true, 0); 374 error = sowait(so, true, 0);
377 if (__predict_false((so->so_state & SS_ISABORTING) != 0)) { 375 if (__predict_false((so->so_state & SS_ISABORTING) != 0)) {
378 error = EPIPE; 376 error = EPIPE;
379 interrupted = 1; 377 interrupted = 1;
380 break; 378 break;
381 } 379 }
382 if (error) { 380 if (error) {
383 if (error == EINTR || error == ERESTART) 381 if (error == EINTR || error == ERESTART)
384 interrupted = 1; 382 interrupted = 1;
385 break; 383 break;
386 } 384 }
387 } 385 }
388 if (error == 0) { 386 if (error == 0) {
389 error = so->so_error; 387 error = so->so_error;
390 so->so_error = 0; 388 so->so_error = 0;
391 } 389 }
392 bad: 390 bad:
393 if (!interrupted) 391 if (!interrupted)
394 so->so_state &= ~SS_ISCONNECTING; 392 so->so_state &= ~SS_ISCONNECTING;
395 if (error == ERESTART) 393 if (error == ERESTART)
396 error = EINTR; 394 error = EINTR;
397 out: 395 out:
398 sounlock(so); 396 sounlock(so);
399 fd_putfile(fd); 397 fd_putfile(fd);
400 m_freem(nam); 398 m_freem(nam);
401 return (error); 399 return (error);
402} 400}
403 401
404int 402int
405sys_socketpair(struct lwp *l, const struct sys_socketpair_args *uap, register_t *retval) 403sys_socketpair(struct lwp *l, const struct sys_socketpair_args *uap, register_t *retval)
406{ 404{
407 /* { 405 /* {
408 syscallarg(int) domain; 406 syscallarg(int) domain;
409 syscallarg(int) type; 407 syscallarg(int) type;
410 syscallarg(int) protocol; 408 syscallarg(int) protocol;
411 syscallarg(int *) rsv; 409 syscallarg(int *) rsv;
412 } */ 410 } */
413 file_t *fp1, *fp2; 411 file_t *fp1, *fp2;
414 struct socket *so1, *so2; 412 struct socket *so1, *so2;
415 int fd, error, sv[2]; 413 int fd, error, sv[2];
416 proc_t *p; 414 proc_t *p;
417 int flags = SCARG(uap, type) & SOCK_FLAGS_MASK; 415 int flags = SCARG(uap, type) & SOCK_FLAGS_MASK;
418 int type = SCARG(uap, type) & ~SOCK_FLAGS_MASK; 416 int type = SCARG(uap, type) & ~SOCK_FLAGS_MASK;
419 int fnonblock = (flags & SOCK_NONBLOCK) ? FNONBLOCK : 0;  417 int fnonblock = (flags & SOCK_NONBLOCK) ? FNONBLOCK : 0;
420 418
421 p = curproc; 419 p = curproc;
422 error = socreate(SCARG(uap, domain), &so1, type, 420 error = socreate(SCARG(uap, domain), &so1, type,
423 SCARG(uap, protocol), l, NULL); 421 SCARG(uap, protocol), l, NULL);
424 if (error) 422 if (error)
425 return (error); 423 return (error);
426 error = socreate(SCARG(uap, domain), &so2, type, 424 error = socreate(SCARG(uap, domain), &so2, type,
427 SCARG(uap, protocol), l, so1); 425 SCARG(uap, protocol), l, so1);
428 if (error) 426 if (error)
429 goto free1; 427 goto free1;
430 if ((error = fd_allocfile(&fp1, &fd)) != 0) 428 if ((error = fd_allocfile(&fp1, &fd)) != 0)
431 goto free2; 429 goto free2;
432 fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0); 430 fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0);
433 sv[0] = fd; 431 sv[0] = fd;
434 fp1->f_flag = FREAD|FWRITE|fnonblock; 432 fp1->f_flag = FREAD|FWRITE|fnonblock;
435 fp1->f_type = DTYPE_SOCKET; 433 fp1->f_type = DTYPE_SOCKET;
436 fp1->f_ops = &socketops; 434 fp1->f_ops = &socketops;
437 fp1->f_data = so1; 435 fp1->f_data = so1;
438 if ((error = fd_allocfile(&fp2, &fd)) != 0) 436 if ((error = fd_allocfile(&fp2, &fd)) != 0)
439 goto free3; 437 goto free3;
440 fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0); 438 fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0);
441 fp2->f_flag = FREAD|FWRITE|fnonblock; 439 fp2->f_flag = FREAD|FWRITE|fnonblock;
442 fp2->f_type = DTYPE_SOCKET; 440 fp2->f_type = DTYPE_SOCKET;
443 fp2->f_ops = &socketops; 441 fp2->f_ops = &socketops;
444 fp2->f_data = so2; 442 fp2->f_data = so2;
445 sv[1] = fd; 443 sv[1] = fd;
446 solock(so1); 444 solock(so1);
447 error = soconnect2(so1, so2); 445 error = soconnect2(so1, so2);
448 if (error == 0 && SCARG(uap, type) == SOCK_DGRAM) { 446 if (error == 0 && SCARG(uap, type) == SOCK_DGRAM) {
449 /* 447 /*
450 * Datagram socket connection is asymmetric. 448 * Datagram socket connection is asymmetric.
451 */ 449 */
452 error = soconnect2(so2, so1); 450 error = soconnect2(so2, so1);
453 } 451 }
454 sounlock(so1); 452 sounlock(so1);
455 if (error == 0) 453 if (error == 0)
456 error = copyout(sv, SCARG(uap, rsv), 2 * sizeof(int)); 454 error = copyout(sv, SCARG(uap, rsv), 2 * sizeof(int));
457 if (error == 0) { 455 if (error == 0) {
458 fd_affix(p, fp2, sv[1]); 456 fd_affix(p, fp2, sv[1]);
459 fd_affix(p, fp1, sv[0]); 457 fd_affix(p, fp1, sv[0]);
460 return (0); 458 return (0);
461 } 459 }
462 fd_abort(p, fp2, sv[1]); 460 fd_abort(p, fp2, sv[1]);
463 free3: 461 free3:
464 fd_abort(p, fp1, sv[0]); 462 fd_abort(p, fp1, sv[0]);
465 free2: 463 free2:
466 (void)soclose(so2); 464 (void)soclose(so2);
467 free1: 465 free1:
468 (void)soclose(so1); 466 (void)soclose(so1);
469 return (error); 467 return (error);
470} 468}
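
The second, asymmetric soconnect2() above is what makes a datagram pair usable in both directions. Userland sketch:

    int sv[2];

    if (socketpair(AF_LOCAL, SOCK_DGRAM, 0, sv) == -1)
            err(1, "socketpair");
    write(sv[0], "ping", 4);        /* readable on sv[1], and vice versa */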
471 469
472int 470int
473sys_sendto(struct lwp *l, const struct sys_sendto_args *uap, register_t *retval) 471sys_sendto(struct lwp *l, const struct sys_sendto_args *uap, register_t *retval)
474{ 472{
475 /* { 473 /* {
476 syscallarg(int) s; 474 syscallarg(int) s;
477 syscallarg(const void *) buf; 475 syscallarg(const void *) buf;
478 syscallarg(size_t) len; 476 syscallarg(size_t) len;
479 syscallarg(int) flags; 477 syscallarg(int) flags;
480 syscallarg(const struct sockaddr *) to; 478 syscallarg(const struct sockaddr *) to;
481 syscallarg(unsigned int) tolen; 479 syscallarg(unsigned int) tolen;
482 } */ 480 } */
483 struct msghdr msg; 481 struct msghdr msg;
484 struct iovec aiov; 482 struct iovec aiov;
485 483
486 msg.msg_name = __UNCONST(SCARG(uap, to)); /* XXXUNCONST kills const */ 484 msg.msg_name = __UNCONST(SCARG(uap, to)); /* XXXUNCONST kills const */
487 msg.msg_namelen = SCARG(uap, tolen); 485 msg.msg_namelen = SCARG(uap, tolen);
488 msg.msg_iov = &aiov; 486 msg.msg_iov = &aiov;
489 msg.msg_iovlen = 1; 487 msg.msg_iovlen = 1;
490 msg.msg_control = NULL; 488 msg.msg_control = NULL;
491 msg.msg_flags = 0; 489 msg.msg_flags = 0;
492 aiov.iov_base = __UNCONST(SCARG(uap, buf)); /* XXXUNCONST kills const */ 490 aiov.iov_base = __UNCONST(SCARG(uap, buf)); /* XXXUNCONST kills const */
493 aiov.iov_len = SCARG(uap, len); 491 aiov.iov_len = SCARG(uap, len);
494 return do_sys_sendmsg(l, SCARG(uap, s), &msg, SCARG(uap, flags), retval); 492 return do_sys_sendmsg(l, SCARG(uap, s), &msg, SCARG(uap, flags), retval);
495} 493}
496 494
497int 495int
498sys_sendmsg(struct lwp *l, const struct sys_sendmsg_args *uap, register_t *retval) 496sys_sendmsg(struct lwp *l, const struct sys_sendmsg_args *uap, register_t *retval)
499{ 497{
500 /* { 498 /* {
501 syscallarg(int) s; 499 syscallarg(int) s;
502 syscallarg(const struct msghdr *) msg; 500 syscallarg(const struct msghdr *) msg;
503 syscallarg(int) flags; 501 syscallarg(int) flags;
504 } */ 502 } */
505 struct msghdr msg; 503 struct msghdr msg;
506 int error; 504 int error;
507 505
508 error = copyin(SCARG(uap, msg), &msg, sizeof(msg)); 506 error = copyin(SCARG(uap, msg), &msg, sizeof(msg));
509 if (error) 507 if (error)
510 return (error); 508 return (error);
511 509
512 msg.msg_flags = MSG_IOVUSRSPACE; 510 msg.msg_flags = MSG_IOVUSRSPACE;
513 return do_sys_sendmsg(l, SCARG(uap, s), &msg, SCARG(uap, flags), retval); 511 return do_sys_sendmsg(l, SCARG(uap, s), &msg, SCARG(uap, flags), retval);
514} 512}
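
A userland sketch of the msghdr that do_sys_sendmsg() below unpacks (iovecs are copied in, and msg_name is converted to an MT_SONAME mbuf by sockargs()); buf, buflen and sin are assumed to be set up by the caller:

    struct iovec iov = { .iov_base = buf, .iov_len = buflen };
    struct msghdr msg = {
            .msg_name = &sin,               /* may be NULL if connected */
            .msg_namelen = sizeof(sin),
            .msg_iov = &iov,
            .msg_iovlen = 1,
    };

    if (sendmsg(s, &msg, 0) == -1)
            err(1, "sendmsg");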
515 513
516int 514int
517do_sys_sendmsg(struct lwp *l, int s, struct msghdr *mp, int flags, 515do_sys_sendmsg(struct lwp *l, int s, struct msghdr *mp, int flags,
518 register_t *retsize) 516 register_t *retsize)
519{ 517{
520 struct iovec aiov[UIO_SMALLIOV], *iov = aiov, *tiov, *ktriov = NULL; 518 struct iovec aiov[UIO_SMALLIOV], *iov = aiov, *tiov, *ktriov = NULL;
521 struct mbuf *to, *control; 519 struct mbuf *to, *control;
522 struct socket *so; 520 struct socket *so;
523 struct uio auio; 521 struct uio auio;
524 size_t len, iovsz; 522 size_t len, iovsz;
525 int i, error; 523 int i, error;
526 524
527 ktrkuser("msghdr", mp, sizeof *mp); 525 ktrkuser("msghdr", mp, sizeof *mp);
528 526
529 /* If the caller passed us stuff in mbufs, we must free them. */ 527 /* If the caller passed us stuff in mbufs, we must free them. */
530 to = (mp->msg_flags & MSG_NAMEMBUF) ? mp->msg_name : NULL; 528 to = (mp->msg_flags & MSG_NAMEMBUF) ? mp->msg_name : NULL;
531 control = (mp->msg_flags & MSG_CONTROLMBUF) ? mp->msg_control : NULL; 529 control = (mp->msg_flags & MSG_CONTROLMBUF) ? mp->msg_control : NULL;
532 iovsz = mp->msg_iovlen * sizeof(struct iovec); 530 iovsz = mp->msg_iovlen * sizeof(struct iovec);
533 531
534 if (mp->msg_flags & MSG_IOVUSRSPACE) { 532 if (mp->msg_flags & MSG_IOVUSRSPACE) {
535 if ((unsigned int)mp->msg_iovlen > UIO_SMALLIOV) { 533 if ((unsigned int)mp->msg_iovlen > UIO_SMALLIOV) {
536 if ((unsigned int)mp->msg_iovlen > IOV_MAX) { 534 if ((unsigned int)mp->msg_iovlen > IOV_MAX) {
537 error = EMSGSIZE; 535 error = EMSGSIZE;
538 goto bad; 536 goto bad;
539 } 537 }
540 iov = kmem_alloc(iovsz, KM_SLEEP); 538 iov = kmem_alloc(iovsz, KM_SLEEP);
541 } 539 }
542 if (mp->msg_iovlen != 0) { 540 if (mp->msg_iovlen != 0) {
543 error = copyin(mp->msg_iov, iov, iovsz); 541 error = copyin(mp->msg_iov, iov, iovsz);
544 if (error) 542 if (error)
545 goto bad; 543 goto bad;
546 } 544 }
547 mp->msg_iov = iov; 545 mp->msg_iov = iov;
548 } 546 }
549 547
550 auio.uio_iov = mp->msg_iov; 548 auio.uio_iov = mp->msg_iov;
551 auio.uio_iovcnt = mp->msg_iovlen; 549 auio.uio_iovcnt = mp->msg_iovlen;
552 auio.uio_rw = UIO_WRITE; 550 auio.uio_rw = UIO_WRITE;
553 auio.uio_offset = 0; /* XXX */ 551 auio.uio_offset = 0; /* XXX */
554 auio.uio_resid = 0; 552 auio.uio_resid = 0;
555 KASSERT(l == curlwp); 553 KASSERT(l == curlwp);
556 auio.uio_vmspace = l->l_proc->p_vmspace; 554 auio.uio_vmspace = l->l_proc->p_vmspace;
557 555
558 for (i = 0, tiov = mp->msg_iov; i < mp->msg_iovlen; i++, tiov++) { 556 for (i = 0, tiov = mp->msg_iov; i < mp->msg_iovlen; i++, tiov++) {
559 /* 557 /*
560 * Writes return ssize_t because -1 is returned on error. 558 * Writes return ssize_t because -1 is returned on error.
561 * Therefore, we must restrict the length to SSIZE_MAX to 559 * Therefore, we must restrict the length to SSIZE_MAX to
562 * avoid garbage return values. 560 * avoid garbage return values.
563 */ 561 */
564 auio.uio_resid += tiov->iov_len; 562 auio.uio_resid += tiov->iov_len;
565 if (tiov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) { 563 if (tiov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
566 error = EINVAL; 564 error = EINVAL;
567 goto bad; 565 goto bad;
568 } 566 }
569 } 567 }
570 568
571 if (mp->msg_name && to == NULL) { 569 if (mp->msg_name && to == NULL) {
572 error = sockargs(&to, mp->msg_name, mp->msg_namelen, 570 error = sockargs(&to, mp->msg_name, mp->msg_namelen,
573 MT_SONAME); 571 MT_SONAME);
574 if (error) 572 if (error)
575 goto bad; 573 goto bad;
576 } 574 }
577 575
578 if (mp->msg_control) { 576 if (mp->msg_control) {
579 if (mp->msg_controllen < CMSG_ALIGN(sizeof(struct cmsghdr))) { 577 if (mp->msg_controllen < CMSG_ALIGN(sizeof(struct cmsghdr))) {
580 error = EINVAL; 578 error = EINVAL;
581 goto bad; 579 goto bad;
582 } 580 }
583 if (control == NULL) { 581 if (control == NULL) {
584 error = sockargs(&control, mp->msg_control, 582 error = sockargs(&control, mp->msg_control,
585 mp->msg_controllen, MT_CONTROL); 583 mp->msg_controllen, MT_CONTROL);
586 if (error) 584 if (error)
587 goto bad; 585 goto bad;
588 } 586 }
589 } 587 }
590 588
591 if (ktrpoint(KTR_GENIO)) { 589 if (ktrpoint(KTR_GENIO)) {
592 ktriov = kmem_alloc(iovsz, KM_SLEEP); 590 ktriov = kmem_alloc(iovsz, KM_SLEEP);
593 memcpy(ktriov, auio.uio_iov, iovsz); 591 memcpy(ktriov, auio.uio_iov, iovsz);
594 } 592 }
595 593
596 if ((error = fd_getsock(s, &so)) != 0) 594 if ((error = fd_getsock(s, &so)) != 0)
597 goto bad; 595 goto bad;
598 596
599 if (mp->msg_name) 597 if (mp->msg_name)
600 MCLAIM(to, so->so_mowner); 598 MCLAIM(to, so->so_mowner);
601 if (mp->msg_control) 599 if (mp->msg_control)
602 MCLAIM(control, so->so_mowner); 600 MCLAIM(control, so->so_mowner);
603 601
604 len = auio.uio_resid; 602 len = auio.uio_resid;
605 error = (*so->so_send)(so, to, &auio, NULL, control, flags, l); 603 error = (*so->so_send)(so, to, &auio, NULL, control, flags, l);
606 /* Protocol is responsible for freeing 'control' */ 604 /* Protocol is responsible for freeing 'control' */
607 control = NULL; 605 control = NULL;
608 606
609 fd_putfile(s); 607 fd_putfile(s);
610 608
611 if (error) { 609 if (error) {
612 if (auio.uio_resid != len && (error == ERESTART || 610 if (auio.uio_resid != len && (error == ERESTART ||
613 error == EINTR || error == EWOULDBLOCK)) 611 error == EINTR || error == EWOULDBLOCK))
614 error = 0; 612 error = 0;
615 if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0) { 613 if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0) {
616 mutex_enter(proc_lock); 614 mutex_enter(proc_lock);
617 psignal(l->l_proc, SIGPIPE); 615 psignal(l->l_proc, SIGPIPE);
618 mutex_exit(proc_lock); 616 mutex_exit(proc_lock);
619 } 617 }
620 } 618 }
621 if (error == 0) 619 if (error == 0)
622 *retsize = len - auio.uio_resid; 620 *retsize = len - auio.uio_resid;
623 621
624bad: 622bad:
625 if (ktriov != NULL) { 623 if (ktriov != NULL) {
626 ktrgeniov(s, UIO_WRITE, ktriov, *retsize, error); 624 ktrgeniov(s, UIO_WRITE, ktriov, *retsize, error);
627 kmem_free(ktriov, iovsz); 625 kmem_free(ktriov, iovsz);
628 } 626 }
629 627
630 if (iov != aiov) 628 if (iov != aiov)
631 kmem_free(iov, iovsz); 629 kmem_free(iov, iovsz);
632 if (to) 630 if (to)
633 m_freem(to); 631 m_freem(to);
634 if (control) 632 if (control)
635 m_freem(control); 633 m_freem(control);
636 634
637 return (error); 635 return (error);
638} 636}
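
[Editor's sketch.] The SSIZE_MAX loop above deserves a second look: testing both the segment length and the running total keeps the result representable as ssize_t and, as a side effect, rules out size_t wraparound. A userland-style restatement of the same invariant (assumed names, a mirror of the kernel loop, not kernel code):

#include <limits.h>
#include <stddef.h>
#include <sys/uio.h>

/* Returns 1 if the iovec total is representable as ssize_t,
 * 0 where the kernel loop would return EINVAL. */
int
iov_total_ok(const struct iovec *iov, int iovcnt, size_t *totalp)
{
        size_t total = 0;
        int i;

        for (i = 0; i < iovcnt; i++) {
                total += iov[i].iov_len;
                /*
                 * If this segment alone exceeds SSIZE_MAX, the first
                 * test rejects it even when the addition above wrapped;
                 * otherwise both operands were <= SSIZE_MAX, so no
                 * size_t wraparound is possible.
                 */
                if (iov[i].iov_len > SSIZE_MAX || total > SSIZE_MAX)
                        return 0;
        }
        *totalp = total;
        return 1;
}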
639 637
640int 638int
641sys_recvfrom(struct lwp *l, const struct sys_recvfrom_args *uap, register_t *retval) 639sys_recvfrom(struct lwp *l, const struct sys_recvfrom_args *uap, register_t *retval)
642{ 640{
643 /* { 641 /* {
644 syscallarg(int) s; 642 syscallarg(int) s;
645 syscallarg(void *) buf; 643 syscallarg(void *) buf;
646 syscallarg(size_t) len; 644 syscallarg(size_t) len;
647 syscallarg(int) flags; 645 syscallarg(int) flags;
648 syscallarg(struct sockaddr *) from; 646 syscallarg(struct sockaddr *) from;
649 syscallarg(unsigned int *) fromlenaddr; 647 syscallarg(unsigned int *) fromlenaddr;
650 } */ 648 } */
651 struct msghdr msg; 649 struct msghdr msg;
652 struct iovec aiov; 650 struct iovec aiov;
653 int error; 651 int error;
654 struct mbuf *from; 652 struct mbuf *from;
655 653
656 msg.msg_name = NULL; 654 msg.msg_name = NULL;
657 msg.msg_iov = &aiov; 655 msg.msg_iov = &aiov;
658 msg.msg_iovlen = 1; 656 msg.msg_iovlen = 1;
659 aiov.iov_base = SCARG(uap, buf); 657 aiov.iov_base = SCARG(uap, buf);
660 aiov.iov_len = SCARG(uap, len); 658 aiov.iov_len = SCARG(uap, len);
661 msg.msg_control = NULL; 659 msg.msg_control = NULL;
662 msg.msg_flags = SCARG(uap, flags) & MSG_USERFLAGS; 660 msg.msg_flags = SCARG(uap, flags) & MSG_USERFLAGS;
663 661
664 error = do_sys_recvmsg(l, SCARG(uap, s), &msg, &from, NULL, retval); 662 error = do_sys_recvmsg(l, SCARG(uap, s), &msg, &from, NULL, retval);
665 if (error != 0) 663 if (error != 0)
666 return error; 664 return error;
667 665
668 error = copyout_sockname(SCARG(uap, from), SCARG(uap, fromlenaddr), 666 error = copyout_sockname(SCARG(uap, from), SCARG(uap, fromlenaddr),
669 MSG_LENUSRSPACE, from); 667 MSG_LENUSRSPACE, from);
670 if (from != NULL) 668 if (from != NULL)
671 m_free(from); 669 m_free(from);
672 return error; 670 return error;
673} 671}
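
[Editor's sketch.] As with sendto, sys_recvfrom() is a msghdr-building wrapper around do_sys_recvmsg(), with copyout_sockname() delivering the peer address. Illustrative userland counterpart:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>

ssize_t
recv_one_datagram(int s, char *buf, size_t len)
{
        struct sockaddr_in from;
        socklen_t fromlen = sizeof(from);

        /* from/fromlen correspond to SCARG(uap, from) and
         * SCARG(uap, fromlenaddr) above. */
        return recvfrom(s, buf, len, 0,
            (struct sockaddr *)&from, &fromlen);
}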
674 672
675int 673int
676sys_recvmsg(struct lwp *l, const struct sys_recvmsg_args *uap, register_t *retval) 674sys_recvmsg(struct lwp *l, const struct sys_recvmsg_args *uap, register_t *retval)
677{ 675{
678 /* { 676 /* {
679 syscallarg(int) s; 677 syscallarg(int) s;
680 syscallarg(struct msghdr *) msg; 678 syscallarg(struct msghdr *) msg;
681 syscallarg(int) flags; 679 syscallarg(int) flags;
682 } */ 680 } */
683 struct msghdr msg; 681 struct msghdr msg;
684 int error; 682 int error;
685 struct mbuf *from, *control; 683 struct mbuf *from, *control;
686 684
687 error = copyin(SCARG(uap, msg), &msg, sizeof(msg)); 685 error = copyin(SCARG(uap, msg), &msg, sizeof(msg));
688 if (error) 686 if (error)
689 return (error); 687 return (error);
690 688
691 msg.msg_flags = (SCARG(uap, flags) & MSG_USERFLAGS) | MSG_IOVUSRSPACE; 689 msg.msg_flags = (SCARG(uap, flags) & MSG_USERFLAGS) | MSG_IOVUSRSPACE;
692 690
693 error = do_sys_recvmsg(l, SCARG(uap, s), &msg, &from, 691 error = do_sys_recvmsg(l, SCARG(uap, s), &msg, &from,
694 msg.msg_control != NULL ? &control : NULL, retval); 692 msg.msg_control != NULL ? &control : NULL, retval);
695 if (error != 0) 693 if (error != 0)
696 return error; 694 return error;
697 695
698 if (msg.msg_control != NULL) 696 if (msg.msg_control != NULL)
699 error = copyout_msg_control(l, &msg, control); 697 error = copyout_msg_control(l, &msg, control);
700 698
701 if (error == 0) 699 if (error == 0)
702 error = copyout_sockname(msg.msg_name, &msg.msg_namelen, 0, 700 error = copyout_sockname(msg.msg_name, &msg.msg_namelen, 0,
703 from); 701 from);
704 if (from != NULL) 702 if (from != NULL)
705 m_free(from); 703 m_free(from);
706 if (error == 0) { 704 if (error == 0) {
707 ktrkuser("msghdr", &msg, sizeof msg); 705 ktrkuser("msghdr", &msg, sizeof msg);
708 error = copyout(&msg, SCARG(uap, msg), sizeof(msg)); 706 error = copyout(&msg, SCARG(uap, msg), sizeof(msg));
709 } 707 }
710 708
711 return (error); 709 return (error);
712} 710}
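
[Editor's sketch.] Passing a non-NULL msg_control is what makes do_sys_recvmsg() collect ancillary data for copyout_msg_control() below. A sketch with room for one SCM_RIGHTS descriptor (buffer size chosen only for the example):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>

ssize_t
recv_with_control(int s, char *buf, size_t len)
{
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        char cbuf[CMSG_SPACE(sizeof(int))];     /* one fd's worth */
        struct msghdr msg = {
                .msg_iov = &iov,
                .msg_iovlen = 1,
                .msg_control = cbuf,
                .msg_controllen = sizeof(cbuf),
        };

        return recvmsg(s, &msg, 0);
}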
713 711
714/* 712/*
715 * Adjust for a truncated SCM_RIGHTS control message. 713 * Adjust for a truncated SCM_RIGHTS control message.
716 * This means closing any file descriptors that aren't present 714 * This means closing any file descriptors that aren't present
717 * in the returned buffer. 715 * in the returned buffer.
718 * m is the mbuf holding the (already externalized) SCM_RIGHTS message. 716 * m is the mbuf holding the (already externalized) SCM_RIGHTS message.
719 */ 717 */
720static void 718static void
721free_rights(struct mbuf *m) 719free_rights(struct mbuf *m)
722{ 720{
723 int nfd; 721 int nfd;
724 int i; 722 int i;
725 int *fdv; 723 int *fdv;
726 724
727 nfd = m->m_len < CMSG_SPACE(sizeof(int)) ? 0 725 nfd = m->m_len < CMSG_SPACE(sizeof(int)) ? 0
728 : (m->m_len - CMSG_SPACE(sizeof(int))) / sizeof(int) + 1; 726 : (m->m_len - CMSG_SPACE(sizeof(int))) / sizeof(int) + 1;
729 fdv = (int *) CMSG_DATA(mtod(m,struct cmsghdr *)); 727 fdv = (int *) CMSG_DATA(mtod(m,struct cmsghdr *));
730 for (i = 0; i < nfd; i++) { 728 for (i = 0; i < nfd; i++) {
731 if (fd_getfile(fdv[i]) != NULL) 729 if (fd_getfile(fdv[i]) != NULL)
732 (void)fd_close(fdv[i]); 730 (void)fd_close(fdv[i]);
733 } 731 }
734} 732}
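
[Editor's sketch.] free_rights() exists because the SCM_RIGHTS message has already been externalized, i.e. the descriptors are already installed in the receiving process; any that did not fit in the returned buffer must be closed again. For context, the sender-side message it cleans up after looks like this (AF_LOCAL socket s assumed):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

ssize_t
send_fd(int s, int fd_to_pass)
{
        char cbuf[CMSG_SPACE(sizeof(int))];
        char dummy = 0;
        struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
        };
        struct cmsghdr *cmsg;

        memset(cbuf, 0, sizeof(cbuf));
        cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
        return sendmsg(s, &msg, 0);
}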
735 733
736void 734void
737free_control_mbuf(struct lwp *l, struct mbuf *control, struct mbuf *uncopied) 735free_control_mbuf(struct lwp *l, struct mbuf *control, struct mbuf *uncopied)
738{ 736{
739 struct mbuf *next; 737 struct mbuf *next;
740 struct cmsghdr *cmsg; 738 struct cmsghdr *cmsg;
741 bool do_free_rights = false; 739 bool do_free_rights = false;
742 740
743 while (control != NULL) { 741 while (control != NULL) {
744 cmsg = mtod(control, struct cmsghdr *); 742 cmsg = mtod(control, struct cmsghdr *);
745 if (control == uncopied) 743 if (control == uncopied)
746 do_free_rights = true; 744 do_free_rights = true;
747 if (do_free_rights && cmsg->cmsg_level == SOL_SOCKET 745 if (do_free_rights && cmsg->cmsg_level == SOL_SOCKET
748 && cmsg->cmsg_type == SCM_RIGHTS) 746 && cmsg->cmsg_type == SCM_RIGHTS)
749 free_rights(control); 747 free_rights(control);
750 next = control->m_next; 748 next = control->m_next;
751 m_free(control); 749 m_free(control);
752 control = next; 750 control = next;
753 } 751 }
754} 752}
755 753
756/* Copy socket control/CMSG data to user buffer, frees the mbuf */ 754/* Copy socket control/CMSG data to user buffer, frees the mbuf */
757int 755int
758copyout_msg_control(struct lwp *l, struct msghdr *mp, struct mbuf *control) 756copyout_msg_control(struct lwp *l, struct msghdr *mp, struct mbuf *control)
759{ 757{
760 int i, len, error = 0; 758 int i, len, error = 0;
761 struct cmsghdr *cmsg; 759 struct cmsghdr *cmsg;
762 struct mbuf *m; 760 struct mbuf *m;
763 char *q; 761 char *q;
764 762
765 len = mp->msg_controllen; 763 len = mp->msg_controllen;
766 if (len <= 0 || control == 0) { 764 if (len <= 0 || control == 0) {
767 mp->msg_controllen = 0; 765 mp->msg_controllen = 0;
768 free_control_mbuf(l, control, control); 766 free_control_mbuf(l, control, control);
769 return 0; 767 return 0;
770 } 768 }
771 769
772 q = (char *)mp->msg_control; 770 q = (char *)mp->msg_control;
773 771
774 for (m = control; m != NULL; ) { 772 for (m = control; m != NULL; ) {
775 cmsg = mtod(m, struct cmsghdr *); 773 cmsg = mtod(m, struct cmsghdr *);
776 i = m->m_len; 774 i = m->m_len;
777 if (len < i) { 775 if (len < i) {
778 mp->msg_flags |= MSG_CTRUNC; 776 mp->msg_flags |= MSG_CTRUNC;
779 if (cmsg->cmsg_level == SOL_SOCKET 777 if (cmsg->cmsg_level == SOL_SOCKET
780 && cmsg->cmsg_type == SCM_RIGHTS) 778 && cmsg->cmsg_type == SCM_RIGHTS)
781 /* Do not truncate me ... */ 779 /* Do not truncate me ... */
782 break; 780 break;
783 i = len; 781 i = len;
784 } 782 }
785 error = copyout(mtod(m, void *), q, i); 783 error = copyout(mtod(m, void *), q, i);
786 ktrkuser("msgcontrol", mtod(m, void *), i); 784 ktrkuser("msgcontrol", mtod(m, void *), i);
787 if (error != 0) { 785 if (error != 0) {
788 /* We must free all the SCM_RIGHTS */ 786 /* We must free all the SCM_RIGHTS */
789 m = control; 787 m = control;
790 break; 788 break;
791 } 789 }
792 m = m->m_next; 790 m = m->m_next;
793 if (m) 791 if (m)
794 i = ALIGN(i); 792 i = ALIGN(i);
795 q += i; 793 q += i;
796 len -= i; 794 len -= i;
797 if (len <= 0) 795 if (len <= 0)
798 break; 796 break;
799 } 797 }
800 798
801 free_control_mbuf(l, control, m); 799 free_control_mbuf(l, control, m);
802 800
803 mp->msg_controllen = q - (char *)mp->msg_control; 801 mp->msg_controllen = q - (char *)mp->msg_control;
804 return error; 802 return error;
805} 803}
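
[Editor's sketch.] On the receiving side, the buffer that copyout_msg_control() fills is walked with the CMSG macros, and MSG_CTRUNC is the flag set above when the user buffer ran out. Assumes msg was filled by a successful recvmsg():

#include <sys/socket.h>
#include <string.h>

/* Returns the first passed descriptor, or -1 if none was found or
 * the control data was truncated (the MSG_CTRUNC case above). */
int
extract_fd(struct msghdr *msg)
{
        struct cmsghdr *cmsg;
        int fd = -1;

        if (msg->msg_flags & MSG_CTRUNC)
                return -1;
        for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL;
            cmsg = CMSG_NXTHDR(msg, cmsg)) {
                if (cmsg->cmsg_level == SOL_SOCKET &&
                    cmsg->cmsg_type == SCM_RIGHTS) {
                        memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
                        break;
                }
        }
        return fd;
}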
806 804
807int 805int
808do_sys_recvmsg(struct lwp *l, int s, struct msghdr *mp, struct mbuf **from, 806do_sys_recvmsg(struct lwp *l, int s, struct msghdr *mp, struct mbuf **from,
809 struct mbuf **control, register_t *retsize) 807 struct mbuf **control, register_t *retsize)
810{ 808{
811 struct iovec aiov[UIO_SMALLIOV], *iov = aiov, *tiov, *ktriov; 809 struct iovec aiov[UIO_SMALLIOV], *iov = aiov, *tiov, *ktriov;
812 struct socket *so; 810 struct socket *so;
813 struct uio auio; 811 struct uio auio;
814 size_t len, iovsz; 812 size_t len, iovsz;
815 int i, error; 813 int i, error;
816 814
817 ktrkuser("msghdr", mp, sizeof *mp); 815 ktrkuser("msghdr", mp, sizeof *mp);
818 816
819 *from = NULL; 817 *from = NULL;
820 if (control != NULL) 818 if (control != NULL)
821 *control = NULL; 819 *control = NULL;
822 820
823 if ((error = fd_getsock(s, &so)) != 0) 821 if ((error = fd_getsock(s, &so)) != 0)
824 return (error); 822 return (error);
825 823
826 iovsz = mp->msg_iovlen * sizeof(struct iovec); 824 iovsz = mp->msg_iovlen * sizeof(struct iovec);
827 825
828 if (mp->msg_flags & MSG_IOVUSRSPACE) { 826 if (mp->msg_flags & MSG_IOVUSRSPACE) {
829 if ((unsigned int)mp->msg_iovlen > UIO_SMALLIOV) { 827 if ((unsigned int)mp->msg_iovlen > UIO_SMALLIOV) {
830 if ((unsigned int)mp->msg_iovlen > IOV_MAX) { 828 if ((unsigned int)mp->msg_iovlen > IOV_MAX) {
831 error = EMSGSIZE; 829 error = EMSGSIZE;
832 goto out; 830 goto out;
833 } 831 }
834 iov = kmem_alloc(iovsz, KM_SLEEP); 832 iov = kmem_alloc(iovsz, KM_SLEEP);
835 } 833 }
836 if (mp->msg_iovlen != 0) { 834 if (mp->msg_iovlen != 0) {
837 error = copyin(mp->msg_iov, iov, iovsz); 835 error = copyin(mp->msg_iov, iov, iovsz);
838 if (error) 836 if (error)
839 goto out; 837 goto out;
840 } 838 }
841 auio.uio_iov = iov; 839 auio.uio_iov = iov;
842 } else 840 } else
843 auio.uio_iov = mp->msg_iov; 841 auio.uio_iov = mp->msg_iov;
844 auio.uio_iovcnt = mp->msg_iovlen; 842 auio.uio_iovcnt = mp->msg_iovlen;
845 auio.uio_rw = UIO_READ; 843 auio.uio_rw = UIO_READ;
846 auio.uio_offset = 0; /* XXX */ 844 auio.uio_offset = 0; /* XXX */
847 auio.uio_resid = 0; 845 auio.uio_resid = 0;
848 KASSERT(l == curlwp); 846 KASSERT(l == curlwp);
849 auio.uio_vmspace = l->l_proc->p_vmspace; 847 auio.uio_vmspace = l->l_proc->p_vmspace;
850 848
851 tiov = auio.uio_iov; 849 tiov = auio.uio_iov;
852 for (i = 0; i < mp->msg_iovlen; i++, tiov++) { 850 for (i = 0; i < mp->msg_iovlen; i++, tiov++) {
853 /* 851 /*
854 * Reads return ssize_t because -1 is returned on error. 852 * Reads return ssize_t because -1 is returned on error.
855 * Therefore we must restrict the length to SSIZE_MAX to 853 * Therefore we must restrict the length to SSIZE_MAX to
856 * avoid garbage return values. 854 * avoid garbage return values.
857 */ 855 */
858 auio.uio_resid += tiov->iov_len; 856 auio.uio_resid += tiov->iov_len;
859 if (tiov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) { 857 if (tiov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
860 error = EINVAL; 858 error = EINVAL;
861 goto out; 859 goto out;
862 } 860 }
863 } 861 }
864 862
865 ktriov = NULL; 863 ktriov = NULL;
866 if (ktrpoint(KTR_GENIO)) { 864 if (ktrpoint(KTR_GENIO)) {
867 ktriov = kmem_alloc(iovsz, KM_SLEEP); 865 ktriov = kmem_alloc(iovsz, KM_SLEEP);
868 memcpy(ktriov, auio.uio_iov, iovsz); 866 memcpy(ktriov, auio.uio_iov, iovsz);
869 } 867 }
870 868
871 len = auio.uio_resid; 869 len = auio.uio_resid;
872 mp->msg_flags &= MSG_USERFLAGS; 870 mp->msg_flags &= MSG_USERFLAGS;
873 error = (*so->so_receive)(so, from, &auio, NULL, control, 871 error = (*so->so_receive)(so, from, &auio, NULL, control,
874 &mp->msg_flags); 872 &mp->msg_flags);
875 len -= auio.uio_resid; 873 len -= auio.uio_resid;
876 *retsize = len; 874 *retsize = len;
877 if (error != 0 && len != 0 875 if (error != 0 && len != 0
878 && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) 876 && (error == ERESTART || error == EINTR || error == EWOULDBLOCK))
879 /* Some data transferred */ 877 /* Some data transferred */
880 error = 0; 878 error = 0;
881 879
882 if (ktriov != NULL) { 880 if (ktriov != NULL) {
883 ktrgeniov(s, UIO_READ, ktriov, len, error); 881 ktrgeniov(s, UIO_READ, ktriov, len, error);
884 kmem_free(ktriov, iovsz); 882 kmem_free(ktriov, iovsz);
885 } 883 }
886 884
887 if (error != 0) { 885 if (error != 0) {
888 m_freem(*from); 886 m_freem(*from);
889 *from = NULL; 887 *from = NULL;
890 if (control != NULL) { 888 if (control != NULL) {
891 free_control_mbuf(l, *control, *control); 889 free_control_mbuf(l, *control, *control);
892 *control = NULL; 890 *control = NULL;
893 } 891 }
894 } 892 }
895 out: 893 out:
896 if (iov != aiov) 894 if (iov != aiov)
897 kmem_free(iov, iovsz); 895 kmem_free(iov, iovsz);
898 fd_putfile(s); 896 fd_putfile(s);
899 return (error); 897 return (error);
900} 898}
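
[Editor's sketch.] One consequence of the ERESTART/EINTR/EWOULDBLOCK handling above ("Some data transferred") is that an interrupted receive which already moved data reports the short count rather than failing, so a userland loop only needs to retry on a clean EINTR. Assumed helper name:

#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>
#include <stddef.h>

ssize_t
recv_all(int s, char *buf, size_t len)
{
        size_t done = 0;

        while (done < len) {
                ssize_t n = recv(s, buf + done, len - done, 0);
                if (n > 0) {
                        done += (size_t)n;      /* partial counts succeed */
                        continue;
                }
                if (n == 0)
                        break;                  /* peer closed */
                if (errno == EINTR)
                        continue;               /* nothing moved; retry */
                return -1;
        }
        return (ssize_t)done;
}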
901 899
902 900
903/* ARGSUSED */ 901/* ARGSUSED */
904int 902int
905sys_shutdown(struct lwp *l, const struct sys_shutdown_args *uap, register_t *retval) 903sys_shutdown(struct lwp *l, const struct sys_shutdown_args *uap, register_t *retval)
906{ 904{
907 /* { 905 /* {
908 syscallarg(int) s; 906 syscallarg(int) s;
909 syscallarg(int) how; 907 syscallarg(int) how;
910 } */ 908 } */
911 struct socket *so; 909 struct socket *so;
912 int error; 910 int error;
913 911
914 if ((error = fd_getsock(SCARG(uap, s), &so)) != 0) 912 if ((error = fd_getsock(SCARG(uap, s), &so)) != 0)
915 return (error); 913 return (error);
916 solock(so); 914 solock(so);
917 error = soshutdown(so, SCARG(uap, how)); 915 error = soshutdown(so, SCARG(uap, how));
918 sounlock(so); 916 sounlock(so);
919 fd_putfile(SCARG(uap, s)); 917 fd_putfile(SCARG(uap, s));
920 return (error); 918 return (error);
921} 919}
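
[Editor's sketch.] sys_shutdown() simply locks the socket and calls soshutdown(). The classic userland use is a half-close, e.g. for request/response over a stream socket:

#include <sys/socket.h>

int
half_close(int s)
{
        /* SHUT_WR: peer sees EOF; we can still read the reply. */
        return shutdown(s, SHUT_WR);
}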
922 920
923/* ARGSUSED */ 921/* ARGSUSED */
924int 922int
925sys_setsockopt(struct lwp *l, const struct sys_setsockopt_args *uap, register_t *retval) 923sys_setsockopt(struct lwp *l, const struct sys_setsockopt_args *uap, register_t *retval)
926{ 924{
927 /* { 925 /* {
928 syscallarg(int) s; 926 syscallarg(int) s;
929 syscallarg(int) level; 927 syscallarg(int) level;
930 syscallarg(int) name; 928 syscallarg(int) name;
931 syscallarg(const void *) val; 929 syscallarg(const void *) val;
932 syscallarg(unsigned int) valsize; 930 syscallarg(unsigned int) valsize;
933 } */ 931 } */
934 struct sockopt sopt; 932 struct sockopt sopt;
935 struct socket *so; 933 struct socket *so;
936 int error; 934 int error;
937 unsigned int len; 935 unsigned int len;
938 936
939 len = SCARG(uap, valsize); 937 len = SCARG(uap, valsize);
940 if (len > 0 && SCARG(uap, val) == NULL) 938 if (len > 0 && SCARG(uap, val) == NULL)
941 return (EINVAL); 939 return (EINVAL);
942 940
943 if (len > MCLBYTES) 941 if (len > MCLBYTES)
944 return (EINVAL); 942 return (EINVAL);
945 943
946 if ((error = fd_getsock(SCARG(uap, s), &so)) != 0) 944 if ((error = fd_getsock(SCARG(uap, s), &so)) != 0)
947 return (error); 945 return (error);
948 946
949 sockopt_init(&sopt, SCARG(uap, level), SCARG(uap, name), len); 947 sockopt_init(&sopt, SCARG(uap, level), SCARG(uap, name), len);
950 948
951 if (len > 0) { 949 if (len > 0) {
952 error = copyin(SCARG(uap, val), sopt.sopt_data, len); 950 error = copyin(SCARG(uap, val), sopt.sopt_data, len);
953 if (error) 951 if (error)
954 goto out; 952 goto out;
955 } 953 }
956 954
957 error = sosetopt(so, &sopt); 955 error = sosetopt(so, &sopt);
958 956
959 out: 957 out:
960 sockopt_destroy(&sopt); 958 sockopt_destroy(&sopt);
961 fd_putfile(SCARG(uap, s)); 959 fd_putfile(SCARG(uap, s));
962 return (error); 960 return (error);
963} 961}
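
[Editor's sketch.] Note the two early EINVAL checks above: a non-zero length with a NULL value, or a value larger than MCLBYTES, never reaches sosetopt(). A minimal userland counterpart (SO_REUSEADDR chosen only as a familiar example):

#include <sys/socket.h>

int
allow_addr_reuse(int s)
{
        int on = 1;

        /* val/valsize are copied into sopt.sopt_data above. */
        return setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
}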
964 962
965/* ARGSUSED */ 963/* ARGSUSED */
966int 964int
967sys_getsockopt(struct lwp *l, const struct sys_getsockopt_args *uap, register_t *retval) 965sys_getsockopt(struct lwp *l, const struct sys_getsockopt_args *uap, register_t *retval)
968{ 966{
969 /* { 967 /* {
970 syscallarg(int) s; 968 syscallarg(int) s;
971 syscallarg(int) level; 969 syscallarg(int) level;
972 syscallarg(int) name; 970 syscallarg(int) name;
973 syscallarg(void *) val; 971 syscallarg(void *) val;
974 syscallarg(unsigned int *) avalsize; 972 syscallarg(unsigned int *) avalsize;
975 } */ 973 } */
976 struct sockopt sopt; 974 struct sockopt sopt;
977 struct socket *so; 975 struct socket *so;
978 unsigned int valsize, len; 976 unsigned int valsize, len;
979 int error; 977 int error;
980 978
981 if (SCARG(uap, val) != NULL) { 979 if (SCARG(uap, val) != NULL) {
982 error = copyin(SCARG(uap, avalsize), &valsize, sizeof(valsize)); 980 error = copyin(SCARG(uap, avalsize), &valsize, sizeof(valsize));
983 if (error) 981 if (error)
984 return (error); 982 return (error);
985 } else 983 } else
986 valsize = 0; 984 valsize = 0;
987 985
988 if ((error = fd_getsock(SCARG(uap, s), &so)) != 0) 986 if ((error = fd_getsock(SCARG(uap, s), &so)) != 0)
989 return (error); 987 return (error);
990 988
991 sockopt_init(&sopt, SCARG(uap, level), SCARG(uap, name), 0); 989 sockopt_init(&sopt, SCARG(uap, level), SCARG(uap, name), 0);
992 990
993 error = sogetopt(so, &sopt); 991 error = sogetopt(so, &sopt);
994 if (error) 992 if (error)
995 goto out; 993 goto out;
996 994
997 if (valsize > 0) { 995 if (valsize > 0) {
998 len = min(valsize, sopt.sopt_size); 996 len = min(valsize, sopt.sopt_size);
999 error = copyout(sopt.sopt_data, SCARG(uap, val), len); 997 error = copyout(sopt.sopt_data, SCARG(uap, val), len);
1000 if (error) 998 if (error)
1001 goto out; 999 goto out;
1002 1000
1003 error = copyout(&len, SCARG(uap, avalsize), sizeof(len)); 1001 error = copyout(&len, SCARG(uap, avalsize), sizeof(len));
1004 if (error) 1002 if (error)
1005 goto out; 1003 goto out;
1006 } 1004 }
1007 1005
1008 out: 1006 out:
1009 sockopt_destroy(&sopt); 1007 sockopt_destroy(&sopt);
1010 fd_putfile(SCARG(uap, s)); 1008 fd_putfile(SCARG(uap, s));
1011 return (error); 1009 return (error);
1012} 1010}
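
[Editor's sketch.] The value-result length is visible in the kernel path above: valsize is copied in, clamped with min(valsize, sopt.sopt_size), and the clamped len is copied back out. SO_SNDBUF serves as an example option:

#include <sys/socket.h>

int
query_sndbuf(int s, int *sizep)
{
        socklen_t len = sizeof(*sizep);

        /* On return, len holds the clamped size from the kernel. */
        return getsockopt(s, SOL_SOCKET, SO_SNDBUF, sizep, &len);
}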
1013 1011
1014#ifdef PIPE_SOCKETPAIR 1012#ifdef PIPE_SOCKETPAIR
1015/* ARGSUSED */ 1013/* ARGSUSED */
1016int 1014int
1017pipe1(struct lwp *l, register_t *retval, int flags) 1015pipe1(struct lwp *l, register_t *retval, int flags)
1018{ 1016{
1019 file_t *rf, *wf; 1017 file_t *rf, *wf;
1020 struct socket *rso, *wso; 1018 struct socket *rso, *wso;
1021 int fd, error; 1019 int fd, error;
1022 proc_t *p; 1020 proc_t *p;
1023 1021
1024 if (flags & ~(O_CLOEXEC|O_NONBLOCK)) 1022 if (flags & ~(O_CLOEXEC|O_NONBLOCK))
1025 return EINVAL; 1023 return EINVAL;
1026 p = curproc; 1024 p = curproc;
1027 if ((error = socreate(AF_LOCAL, &rso, SOCK_STREAM, 0, l, NULL)) != 0) 1025 if ((error = socreate(AF_LOCAL, &rso, SOCK_STREAM, 0, l, NULL)) != 0)
1028 return (error); 1026 return (error);
1029 if ((error = socreate(AF_LOCAL, &wso, SOCK_STREAM, 0, l, rso)) != 0) 1027 if ((error = socreate(AF_LOCAL, &wso, SOCK_STREAM, 0, l, rso)) != 0)
1030 goto free1; 1028 goto free1;
1031 /* remember this socket pair implements a pipe */ 1029 /* remember this socket pair implements a pipe */
1032 wso->so_state |= SS_ISAPIPE; 1030 wso->so_state |= SS_ISAPIPE;
1033 rso->so_state |= SS_ISAPIPE; 1031 rso->so_state |= SS_ISAPIPE;
1034 if ((error = fd_allocfile(&rf, &fd)) != 0) 1032 if ((error = fd_allocfile(&rf, &fd)) != 0)
1035 goto free2; 1033 goto free2;
1036 retval[0] = fd; 1034 retval[0] = fd;
1037 rf->f_flag = FREAD | flags; 1035 rf->f_flag = FREAD | flags;
1038 rf->f_type = DTYPE_SOCKET; 1036 rf->f_type = DTYPE_SOCKET;
1039 rf->f_ops = &socketops; 1037 rf->f_ops = &socketops;
1040 rf->f_data = rso; 1038 rf->f_data = rso;
1041 if ((error = fd_allocfile(&wf, &fd)) != 0) 1039 if ((error = fd_allocfile(&wf, &fd)) != 0)
1042 goto free3; 1040 goto free3;
1043 wf->f_flag = FWRITE | flags; 1041 wf->f_flag = FWRITE | flags;
1044 wf->f_type = DTYPE_SOCKET; 1042 wf->f_type = DTYPE_SOCKET;
1045 wf->f_ops = &socketops; 1043 wf->f_ops = &socketops;
1046 wf->f_data = wso; 1044 wf->f_data = wso;
1047 retval[1] = fd; 1045 retval[1] = fd;
1048 solock(wso); 1046 solock(wso);
1049 error = unp_connect2(wso, rso, PRU_CONNECT2); 1047 error = unp_connect2(wso, rso, PRU_CONNECT2);
1050 sounlock(wso); 1048 sounlock(wso);
1051 if (error != 0) 1049 if (error != 0)
1052 goto free4; 1050 goto free4;
1053 fd_affix(p, wf, (int)retval[1]); 1051 fd_affix(p, wf, (int)retval[1]);
1054 fd_affix(p, rf, (int)retval[0]); 1052 fd_affix(p, rf, (int)retval[0]);
1055 return (0); 1053 return (0);
1056 free4: 1054 free4:
1057 fd_abort(p, wf, (int)retval[1]); 1055 fd_abort(p, wf, (int)retval[1]);
1058 free3: 1056 free3:
1059 fd_abort(p, rf, (int)retval[0]); 1057 fd_abort(p, rf, (int)retval[0]);
1060 free2: 1058 free2:
1061 (void)soclose(wso); 1059 (void)soclose(wso);
1062 free1: 1060 free1:
1063 (void)soclose(rso); 1061 (void)soclose(rso);
1064 return (error); 1062 return (error);
1065} 1063}
1066#endif /* PIPE_SOCKETPAIR */ 1064#endif /* PIPE_SOCKETPAIR */
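
[Editor's sketch.] On kernels configured with PIPE_SOCKETPAIR, pipe1() above backs pipe(2) with two connected AF_LOCAL stream sockets flagged SS_ISAPIPE ("this socket pair implements a pipe"); the userland contract is unchanged:

#include <unistd.h>

int
make_pipe(int fds[2])
{
        /* fds[0] is the read end, fds[1] the write end, exactly as
         * with a native pipe implementation. */
        return pipe(fds);
}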
1067 1065
1068/* 1066/*
1069 * Get socket name. 1067 * Get socket name.
1070 */ 1068 */
1071/* ARGSUSED */ 1069/* ARGSUSED */
1072int 1070int
1073do_sys_getsockname(struct lwp *l, int fd, int which, struct mbuf **nam) 1071do_sys_getsockname(struct lwp *l, int fd, int which, struct mbuf **nam)
1074{ 1072{
1075 struct socket *so; 1073 struct socket *so;
1076 struct mbuf *m; 1074 struct mbuf *m;
1077 int error; 1075 int error;
1078 1076
1079 if ((error = fd_getsock(fd, &so)) != 0) 1077 if ((error = fd_getsock(fd, &so)) != 0)
1080 return error; 1078 return error;
1081 1079
1082 m = m_getclr(M_WAIT, MT_SONAME); 1080 m = m_getclr(M_WAIT, MT_SONAME);
1083 MCLAIM(m, so->so_mowner); 1081 MCLAIM(m, so->so_mowner);
1084 1082
1085 solock(so); 1083 solock(so);
1086 if (which == PRU_PEERADDR 1084 if (which == PRU_PEERADDR
1087 && (so->so_state & (SS_ISCONNECTED | SS_ISCONFIRMING)) == 0) { 1085 && (so->so_state & (SS_ISCONNECTED | SS_ISCONFIRMING)) == 0) {