Sat Jan 3 03:31:23 2009 UTC
remove extra semicolons.


(yamt)
diff -r1.8 -r1.9 src/sys/kern/kern_cctr.c
diff -r1.20 -r1.21 src/sys/kern/kern_drvctl.c
diff -r1.363 -r1.364 src/sys/kern/vfs_subr.c
diff -r1.16 -r1.17 src/sys/kern/vfs_wapbl.c
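
For reference, the class of defect removed here is the doubled semicolon, visible in the kern_cctr.c and kern_drvctl.c hunks below. A minimal, hypothetical illustration (not code from the tree):

/*
 * Hypothetical example of the defect this commit removes.  The doubled
 * semicolon is an empty statement inside a function and an empty
 * declaration at file scope; it is usually harmless, but strict ISO C
 * does not allow the file-scope form, and tools such as lint or
 * gcc -pedantic warn about it.
 */
int x = 0;;			/* extra ';': empty declaration */

void
f(void)
{
	int y;;			/* extra ';': empty statement */

	y = x + 1;
	(void)y;
}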

cvs diff -r1.8 -r1.9 src/sys/kern/kern_cctr.c

--- src/sys/kern/kern_cctr.c 2008/05/19 17:06:02 1.8
+++ src/sys/kern/kern_cctr.c 2009/01/03 03:31:23 1.9
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_cctr.c,v 1.8 2008/05/19 17:06:02 ad Exp $	*/
+/*	$NetBSD: kern_cctr.c,v 1.9 2009/01/03 03:31:23 yamt Exp $	*/
 
 /*-
  * Copyright (c) 2006, 2008 The NetBSD Foundation, Inc.
@@ -75,7 +75,7 @@
 
 #include <sys/cdefs.h>
 /* __FBSDID("$FreeBSD: src/sys/i386/i386/tsc.c,v 1.204 2003/10/21 18:28:34 silby Exp $"); */
-__KERNEL_RCSID(0, "$NetBSD: kern_cctr.c,v 1.8 2008/05/19 17:06:02 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_cctr.c,v 1.9 2009/01/03 03:31:23 yamt Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -261,7 +261,7 @@
 		ci->ci_cc.cc_val = val;
 		ci->ci_cc.cc_denom = cpu_frequency(ci);
 		if (ci->ci_cc.cc_denom == 0)
-			ci->ci_cc.cc_denom = cc_timecounter.tc_frequency;;
+			ci->ci_cc.cc_denom = cc_timecounter.tc_frequency;
 		ci->ci_cc.cc_delta = ci->ci_cc.cc_denom;
 		ci->ci_cc.cc_gen = gen;
 		splx(s);
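
Worth noting for readers of the hunk above: kern_cctr.c coordinates cc_get_timecount() with cc_calibrate_cpu() through the cc_gen generation number rather than a lock. The calibrator zeroes cc_gen while it rewrites cc_val/cc_delta/cc_denom and publishes a new non-zero value when done; readers recompute whenever they observe a zero or changed generation, and start over if preempted. A stripped-down userland sketch of that protocol, with hypothetical names and without the splhigh()/__insn_barrier() ordering the kernel relies on:

#include <stdint.h>

struct calib {
	volatile unsigned gen;	/* 0 means "update in progress" */
	int64_t val;		/* reference ticks at calibration */
	int64_t delta;		/* reference ticks per period */
	int64_t denom;		/* local ticks per period */
};

/* Reader: scale a local cycle count to the reference timebase. */
static int64_t
read_scaled(struct calib *c, int64_t cc)
{
	unsigned gen;
	int64_t rcc;

	do {
		gen = c->gen;
		rcc = (cc * c->delta) / c->denom + c->val;
	} while (gen == 0 || gen != c->gen);	/* retry on concurrent update */
	return rcc;
}

/* Writer: republish the scaling parameters under a new generation. */
static void
update(struct calib *c, int64_t val, int64_t delta, int64_t denom)
{
	unsigned gen = c->gen + 1;

	if (gen == 0)
		gen++;			/* 0 is reserved for "in progress" */
	c->gen = 0;			/* park readers in their retry loop */
	c->val = val;
	c->delta = delta;
	c->denom = denom;
	c->gen = gen;			/* publish */
}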

cvs diff -r1.20 -r1.21 src/sys/kern/kern_drvctl.c

--- src/sys/kern/kern_drvctl.c 2008/11/23 23:59:41 1.20
+++ src/sys/kern/kern_drvctl.c 2009/01/03 03:31:23 1.21
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_drvctl.c,v 1.20 2008/11/23 23:59:41 jmcneill Exp $	*/
+/*	$NetBSD: kern_drvctl.c,v 1.21 2009/01/03 03:31:23 yamt Exp $	*/
 
 /*
  * Copyright (c) 2004
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_drvctl.c,v 1.20 2008/11/23 23:59:41 jmcneill Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_drvctl.c,v 1.21 2009/01/03 03:31:23 yamt Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -106,7 +106,7 @@
 void
 devmon_insert(const char *event, prop_dictionary_t ev)
 {
-	struct drvctl_event *dce, *odce;;
+	struct drvctl_event *dce, *odce;
 
 	mutex_enter(&drvctl_lock);
 
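
The devmon_insert() path above implements a bounded drop-oldest FIFO: once drvctl_eventcnt reaches DRVCTL_EVENTQ_DEPTH (64), the oldest queued event is released before the new one is appended. A self-contained sketch of that shape, with hypothetical names; the kernel version additionally holds drvctl_lock and wakes consumers via cv_broadcast() and selnotify():

#include <sys/queue.h>
#include <stdlib.h>

#define QUEUE_DEPTH 64			/* mirrors DRVCTL_EVENTQ_DEPTH */

struct event {
	TAILQ_ENTRY(event) link;
	void *payload;
};

static TAILQ_HEAD(, event) eventq = TAILQ_HEAD_INITIALIZER(eventq);
static int eventcnt;

static void
event_insert(struct event *ev)
{
	struct event *oev;

	if (eventcnt == QUEUE_DEPTH) {
		/* Queue full: drop the oldest event (payload release elided). */
		oev = TAILQ_FIRST(&eventq);
		TAILQ_REMOVE(&eventq, oev, link);
		free(oev);
		--eventcnt;
	}
	TAILQ_INSERT_TAIL(&eventq, ev, link);
	++eventcnt;
}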

cvs diff -r1.363 -r1.364 src/sys/kern/vfs_subr.c

--- src/sys/kern/vfs_subr.c 2008/12/29 17:41:18 1.363
+++ src/sys/kern/vfs_subr.c 2009/01/03 03:31:23 1.364
@@ -1,1180 +1,1180 @@ @@ -1,1180 +1,1180 @@
1/* $NetBSD: vfs_subr.c,v 1.363 2008/12/29 17:41:18 pooka Exp $ */ 1/* $NetBSD: vfs_subr.c,v 1.364 2009/01/03 03:31:23 yamt Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc. 4 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran. 9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions 12 * modification, are permitted provided that the following conditions
13 * are met: 13 * are met:
14 * 1. Redistributions of source code must retain the above copyright 14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer. 15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright 16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the 17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution. 18 * documentation and/or other materials provided with the distribution.
19 * 19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE. 30 * POSSIBILITY OF SUCH DAMAGE.
31 */ 31 */
32 32
33/* 33/*
34 * Copyright (c) 1989, 1993 34 * Copyright (c) 1989, 1993
35 * The Regents of the University of California. All rights reserved. 35 * The Regents of the University of California. All rights reserved.
36 * (c) UNIX System Laboratories, Inc. 36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed 37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph 38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc. 40 * the permission of UNIX System Laboratories, Inc.
41 * 41 *
42 * Redistribution and use in source and binary forms, with or without 42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions 43 * modification, are permitted provided that the following conditions
44 * are met: 44 * are met:
45 * 1. Redistributions of source code must retain the above copyright 45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer. 46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright 47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the 48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution. 49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors 50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software 51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission. 52 * without specific prior written permission.
53 * 53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE. 64 * SUCH DAMAGE.
65 * 65 *
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */ 67 */
68 68
69/* 69/*
70 * Note on v_usecount and locking: 70 * Note on v_usecount and locking:
71 * 71 *
 72 * At nearly all points where it is known that v_usecount could be 72 * At nearly all points where it is known that v_usecount could be
 73 * zero, the vnode interlock will be held. 73 * zero, the vnode interlock will be held.
74 * 74 *
75 * To change v_usecount away from zero, the interlock must be held. To 75 * To change v_usecount away from zero, the interlock must be held. To
76 * change from a non-zero value to zero, again the interlock must be 76 * change from a non-zero value to zero, again the interlock must be
77 * held. 77 * held.
78 * 78 *
79 * Changing the usecount from a non-zero value to a non-zero value can 79 * Changing the usecount from a non-zero value to a non-zero value can
80 * safely be done using atomic operations, without the interlock held. 80 * safely be done using atomic operations, without the interlock held.
81 */ 81 */
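
[Illustrative sketch, not part of this change: a helper that obeys the
rule above.  The name vtryget_sketch is an assumption for illustration;
atomic_cas_uint is the standard <sys/atomic.h> compare-and-swap.]

static bool
vtryget_sketch(vnode_t *vp)
{
	u_int use, next;

	for (use = vp->v_usecount; use != 0; use = next) {
		/* non-zero -> non-zero: atomics suffice, no interlock */
		next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
		if (next == use)
			return true;
	}
	/* 0 -> 1 would need vp->v_interlock held; caller must fall back */
	return false;
}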
82 82
83#include <sys/cdefs.h> 83#include <sys/cdefs.h>
84__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.363 2008/12/29 17:41:18 pooka Exp $"); 84__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.364 2009/01/03 03:31:23 yamt Exp $");
85 85
86#include "opt_ddb.h" 86#include "opt_ddb.h"
87#include "opt_compat_netbsd.h" 87#include "opt_compat_netbsd.h"
88#include "opt_compat_43.h" 88#include "opt_compat_43.h"
89 89
90#include <sys/param.h> 90#include <sys/param.h>
91#include <sys/systm.h> 91#include <sys/systm.h>
92#include <sys/conf.h> 92#include <sys/conf.h>
93#include <sys/proc.h> 93#include <sys/proc.h>
94#include <sys/kernel.h> 94#include <sys/kernel.h>
95#include <sys/mount.h> 95#include <sys/mount.h>
96#include <sys/fcntl.h> 96#include <sys/fcntl.h>
97#include <sys/vnode.h> 97#include <sys/vnode.h>
98#include <sys/stat.h> 98#include <sys/stat.h>
99#include <sys/namei.h> 99#include <sys/namei.h>
100#include <sys/ucred.h> 100#include <sys/ucred.h>
101#include <sys/buf.h> 101#include <sys/buf.h>
102#include <sys/errno.h> 102#include <sys/errno.h>
103#include <sys/malloc.h> 103#include <sys/malloc.h>
104#include <sys/syscallargs.h> 104#include <sys/syscallargs.h>
105#include <sys/device.h> 105#include <sys/device.h>
106#include <sys/filedesc.h> 106#include <sys/filedesc.h>
107#include <sys/kauth.h> 107#include <sys/kauth.h>
108#include <sys/atomic.h> 108#include <sys/atomic.h>
109#include <sys/kthread.h> 109#include <sys/kthread.h>
110#include <sys/wapbl.h> 110#include <sys/wapbl.h>
111 111
112#include <miscfs/specfs/specdev.h> 112#include <miscfs/specfs/specdev.h>
113#include <miscfs/syncfs/syncfs.h> 113#include <miscfs/syncfs/syncfs.h>
114 114
115#include <uvm/uvm.h> 115#include <uvm/uvm.h>
116#include <uvm/uvm_readahead.h> 116#include <uvm/uvm_readahead.h>
117#include <uvm/uvm_ddb.h> 117#include <uvm/uvm_ddb.h>
118 118
119#include <sys/sysctl.h> 119#include <sys/sysctl.h>
120 120
121const enum vtype iftovt_tab[16] = { 121const enum vtype iftovt_tab[16] = {
122 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 122 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
123 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, 123 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
124}; 124};
125const int vttoif_tab[9] = { 125const int vttoif_tab[9] = {
126 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 126 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
127 S_IFSOCK, S_IFIFO, S_IFMT, 127 S_IFSOCK, S_IFIFO, S_IFMT,
128}; 128};
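
[Context note, not part of this change: these tables back the IFTOVT()
and VTTOIF() macros in <sys/vnode.h>; IFTOVT() indexes iftovt_tab by
the S_IFMT bits of a mode shifted right by 12, and VTTOIF() maps a
vtype back to S_IF* bits.  ip and vap below are hypothetical
file-system-private variables, shown only for illustration.]

	vp->v_type = IFTOVT(ip->i_mode);	/* S_IFDIR bits -> VDIR */
	mode = MAKEIMODE(vap->va_type, vap->va_mode); /* VTTOIF() | perms */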
129 129
130/* 130/*
131 * Insq/Remq for the vnode usage lists. 131 * Insq/Remq for the vnode usage lists.
132 */ 132 */
133#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) 133#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
134#define bufremvn(bp) { \ 134#define bufremvn(bp) { \
135 LIST_REMOVE(bp, b_vnbufs); \ 135 LIST_REMOVE(bp, b_vnbufs); \
136 (bp)->b_vnbufs.le_next = NOLIST; \ 136 (bp)->b_vnbufs.le_next = NOLIST; \
137} 137}
138 138
139int doforce = 1; /* 1 => permit forcible unmounting */ 139int doforce = 1; /* 1 => permit forcible unmounting */
140int prtactive = 0; /* 1 => print out reclaim of active vnodes */ 140int prtactive = 0; /* 1 => print out reclaim of active vnodes */
141 141
142static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list); 142static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
143static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list); 143static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
144static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list); 144static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list);
145 145
146struct mntlist mountlist = /* mounted filesystem list */ 146struct mntlist mountlist = /* mounted filesystem list */
147 CIRCLEQ_HEAD_INITIALIZER(mountlist); 147 CIRCLEQ_HEAD_INITIALIZER(mountlist);
148 148
149u_int numvnodes; 149u_int numvnodes;
150static specificdata_domain_t mount_specificdata_domain; 150static specificdata_domain_t mount_specificdata_domain;
151 151
152static int vrele_pending; 152static int vrele_pending;
153static int vrele_gen; 153static int vrele_gen;
154static kmutex_t vrele_lock; 154static kmutex_t vrele_lock;
155static kcondvar_t vrele_cv; 155static kcondvar_t vrele_cv;
156static lwp_t *vrele_lwp; 156static lwp_t *vrele_lwp;
157 157
158kmutex_t mountlist_lock; 158kmutex_t mountlist_lock;
159kmutex_t mntid_lock; 159kmutex_t mntid_lock;
160kmutex_t mntvnode_lock; 160kmutex_t mntvnode_lock;
161kmutex_t vnode_free_list_lock; 161kmutex_t vnode_free_list_lock;
162kmutex_t vfs_list_lock; 162kmutex_t vfs_list_lock;
163 163
164static pool_cache_t vnode_cache; 164static pool_cache_t vnode_cache;
165 165
166MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes"); 166MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");
167 167
168/* 168/*
169 * These define the root filesystem and device. 169 * These define the root filesystem and device.
170 */ 170 */
171struct vnode *rootvnode; 171struct vnode *rootvnode;
172struct device *root_device; /* root device */ 172struct device *root_device; /* root device */
173 173
174/* 174/*
175 * Local declarations. 175 * Local declarations.
176 */ 176 */
177 177
178static void vrele_thread(void *); 178static void vrele_thread(void *);
179static void insmntque(vnode_t *, struct mount *); 179static void insmntque(vnode_t *, struct mount *);
180static int getdevvp(dev_t, vnode_t **, enum vtype); 180static int getdevvp(dev_t, vnode_t **, enum vtype);
181static vnode_t *getcleanvnode(void);; 181static vnode_t *getcleanvnode(void);
182void vpanic(vnode_t *, const char *); 182void vpanic(vnode_t *, const char *);
183 183
184#ifdef DEBUG  184#ifdef DEBUG
185void printlockedvnodes(void); 185void printlockedvnodes(void);
186#endif 186#endif
187 187
188#ifdef DIAGNOSTIC 188#ifdef DIAGNOSTIC
189void 189void
190vpanic(vnode_t *vp, const char *msg) 190vpanic(vnode_t *vp, const char *msg)
191{ 191{
192 192
193 vprint(NULL, vp); 193 vprint(NULL, vp);
194 panic("%s\n", msg); 194 panic("%s\n", msg);
195} 195}
196#else 196#else
197#define vpanic(vp, msg) /* nothing */ 197#define vpanic(vp, msg) /* nothing */
198#endif 198#endif
199 199
200void 200void
201vn_init1(void) 201vn_init1(void)
202{ 202{
203 203
204 vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl", 204 vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl",
205 NULL, IPL_NONE, NULL, NULL, NULL); 205 NULL, IPL_NONE, NULL, NULL, NULL);
206 KASSERT(vnode_cache != NULL); 206 KASSERT(vnode_cache != NULL);
207 207
208 /* Create deferred release thread. */ 208 /* Create deferred release thread. */
209 mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE); 209 mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
210 cv_init(&vrele_cv, "vrele"); 210 cv_init(&vrele_cv, "vrele");
211 if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread, 211 if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
212 NULL, &vrele_lwp, "vrele")) 212 NULL, &vrele_lwp, "vrele"))
213 panic("fork vrele"); 213 panic("fork vrele");
214} 214}
215 215
216/* 216/*
217 * Initialize the vnode management data structures. 217 * Initialize the vnode management data structures.
218 */ 218 */
219void 219void
220vntblinit(void) 220vntblinit(void)
221{ 221{
222 222
223 mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE); 223 mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
224 mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE); 224 mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
225 mutex_init(&mntvnode_lock, MUTEX_DEFAULT, IPL_NONE); 225 mutex_init(&mntvnode_lock, MUTEX_DEFAULT, IPL_NONE);
226 mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE); 226 mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
227 mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE); 227 mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);
228 228
229 mount_specificdata_domain = specificdata_domain_create(); 229 mount_specificdata_domain = specificdata_domain_create();
230 230
231 /* Initialize the filesystem syncer. */ 231 /* Initialize the filesystem syncer. */
232 vn_initialize_syncerd(); 232 vn_initialize_syncerd();
233 vn_init1(); 233 vn_init1();
234} 234}
235 235
236int 236int
237vfs_drainvnodes(long target, struct lwp *l) 237vfs_drainvnodes(long target, struct lwp *l)
238{ 238{
239 239
240 while (numvnodes > target) { 240 while (numvnodes > target) {
241 vnode_t *vp; 241 vnode_t *vp;
242 242
243 mutex_enter(&vnode_free_list_lock); 243 mutex_enter(&vnode_free_list_lock);
244 vp = getcleanvnode(); 244 vp = getcleanvnode();
245 if (vp == NULL) 245 if (vp == NULL)
246 return EBUSY; /* give up */ 246 return EBUSY; /* give up */
247 ungetnewvnode(vp); 247 ungetnewvnode(vp);
248 } 248 }
249 249
250 return 0; 250 return 0;
251} 251}
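
[Illustrative usage, an assumption for context rather than part of
this change: the kern.maxvnodes sysctl handler shrinks the pool
roughly like this, restoring the old limit if the drain fails.]

	old = desiredvnodes;
	desiredvnodes = newmax;
	error = vfs_drainvnodes(newmax, l);
	if (error != 0) {
		desiredvnodes = old;
		return error;
	}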
252 252
253/* 253/*
254 * Look up a mount point by filesystem identifier. 254 * Look up a mount point by filesystem identifier.
255 * 255 *
256 * XXX Needs to add a reference to the mount point. 256 * XXX Needs to add a reference to the mount point.
257 */ 257 */
258struct mount * 258struct mount *
259vfs_getvfs(fsid_t *fsid) 259vfs_getvfs(fsid_t *fsid)
260{ 260{
261 struct mount *mp; 261 struct mount *mp;
262 262
263 mutex_enter(&mountlist_lock); 263 mutex_enter(&mountlist_lock);
264 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) { 264 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
265 if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] && 265 if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
266 mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) { 266 mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
267 mutex_exit(&mountlist_lock); 267 mutex_exit(&mountlist_lock);
268 return (mp); 268 return (mp);
269 } 269 }
270 } 270 }
271 mutex_exit(&mountlist_lock); 271 mutex_exit(&mountlist_lock);
272 return ((struct mount *)0); 272 return ((struct mount *)0);
273} 273}
274 274
275/* 275/*
276 * Drop a reference to a mount structure, freeing if the last reference. 276 * Drop a reference to a mount structure, freeing if the last reference.
277 */ 277 */
278void 278void
279vfs_destroy(struct mount *mp) 279vfs_destroy(struct mount *mp)
280{ 280{
281 281
282 if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) { 282 if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
283 return; 283 return;
284 } 284 }
285 285
286 /* 286 /*
287 * Nothing else has visibility of the mount: we can now 287 * Nothing else has visibility of the mount: we can now
288 * free the data structures. 288 * free the data structures.
289 */ 289 */
290 KASSERT(mp->mnt_refcnt == 0); 290 KASSERT(mp->mnt_refcnt == 0);
291 specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref); 291 specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
292 rw_destroy(&mp->mnt_unmounting); 292 rw_destroy(&mp->mnt_unmounting);
293 mutex_destroy(&mp->mnt_updating); 293 mutex_destroy(&mp->mnt_updating);
294 mutex_destroy(&mp->mnt_renamelock); 294 mutex_destroy(&mp->mnt_renamelock);
295 if (mp->mnt_op != NULL) { 295 if (mp->mnt_op != NULL) {
296 vfs_delref(mp->mnt_op); 296 vfs_delref(mp->mnt_op);
297 } 297 }
298 kmem_free(mp, sizeof(*mp)); 298 kmem_free(mp, sizeof(*mp));
299} 299}
300 300
301/* 301/*
302 * Grab a vnode from the freelist and clean it. 302 * Grab a vnode from the freelist and clean it.
303 */ 303 */
304vnode_t * 304vnode_t *
305getcleanvnode(void) 305getcleanvnode(void)
306{ 306{
307 vnode_t *vp; 307 vnode_t *vp;
308 vnodelst_t *listhd; 308 vnodelst_t *listhd;
309 309
310 KASSERT(mutex_owned(&vnode_free_list_lock)); 310 KASSERT(mutex_owned(&vnode_free_list_lock));
311 311
312retry: 312retry:
313 listhd = &vnode_free_list; 313 listhd = &vnode_free_list;
314try_nextlist: 314try_nextlist:
315 TAILQ_FOREACH(vp, listhd, v_freelist) { 315 TAILQ_FOREACH(vp, listhd, v_freelist) {
316 /* 316 /*
317 * It's safe to test v_usecount and v_iflag 317 * It's safe to test v_usecount and v_iflag
318 * without holding the interlock here: vnodes 318 * without holding the interlock here: vnodes
319 * in those states (in use, or already clean) 319 * in those states (in use, or already clean)
320 * should never appear on the free lists. 320 * should never appear on the free lists.
321 */ 321 */
322 if (vp->v_usecount != 0) { 322 if (vp->v_usecount != 0) {
323 vpanic(vp, "free vnode isn't"); 323 vpanic(vp, "free vnode isn't");
324 } 324 }
325 if ((vp->v_iflag & VI_CLEAN) != 0) { 325 if ((vp->v_iflag & VI_CLEAN) != 0) {
326 vpanic(vp, "clean vnode on freelist"); 326 vpanic(vp, "clean vnode on freelist");
327 } 327 }
328 if (vp->v_freelisthd != listhd) { 328 if (vp->v_freelisthd != listhd) {
329 printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd); 329 printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd);
330 vpanic(vp, "list head mismatch"); 330 vpanic(vp, "list head mismatch");
331 } 331 }
332 if (!mutex_tryenter(&vp->v_interlock)) 332 if (!mutex_tryenter(&vp->v_interlock))
333 continue; 333 continue;
334 /* 334 /*
335 * Our lwp might hold the underlying vnode 335 * Our lwp might hold the underlying vnode
336 * locked, so don't try to reclaim a VI_LAYER 336 * locked, so don't try to reclaim a VI_LAYER
337 * node if it's locked. 337 * node if it's locked.
338 */ 338 */
339 if ((vp->v_iflag & VI_XLOCK) == 0 && 339 if ((vp->v_iflag & VI_XLOCK) == 0 &&
340 ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) { 340 ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
341 break; 341 break;
342 } 342 }
343 mutex_exit(&vp->v_interlock); 343 mutex_exit(&vp->v_interlock);
344 } 344 }
345 345
346 if (vp == NULL) { 346 if (vp == NULL) {
347 if (listhd == &vnode_free_list) { 347 if (listhd == &vnode_free_list) {
348 listhd = &vnode_hold_list; 348 listhd = &vnode_hold_list;
349 goto try_nextlist; 349 goto try_nextlist;
350 } 350 }
351 mutex_exit(&vnode_free_list_lock); 351 mutex_exit(&vnode_free_list_lock);
352 return NULL; 352 return NULL;
353 } 353 }
354 354
355 /* Remove it from the freelist. */ 355 /* Remove it from the freelist. */
356 TAILQ_REMOVE(listhd, vp, v_freelist); 356 TAILQ_REMOVE(listhd, vp, v_freelist);
357 vp->v_freelisthd = NULL; 357 vp->v_freelisthd = NULL;
358 mutex_exit(&vnode_free_list_lock); 358 mutex_exit(&vnode_free_list_lock);
359 359
360 /* 360 /*
361 * The vnode is still associated with a file system, so we must 361 * The vnode is still associated with a file system, so we must
362 * clean it out before reusing it. We need to add a reference 362 * clean it out before reusing it. We need to add a reference
363 * before doing this. If the vnode gains another reference while 363 * before doing this. If the vnode gains another reference while
364 * being cleaned out then we lose - retry. 364 * being cleaned out then we lose - retry.
365 */ 365 */
366 atomic_inc_uint(&vp->v_usecount); 366 atomic_inc_uint(&vp->v_usecount);
367 vclean(vp, DOCLOSE); 367 vclean(vp, DOCLOSE);
368 if (vp->v_usecount == 1) { 368 if (vp->v_usecount == 1) {
369 /* We're about to dirty it. */ 369 /* We're about to dirty it. */
370 vp->v_iflag &= ~VI_CLEAN; 370 vp->v_iflag &= ~VI_CLEAN;
371 mutex_exit(&vp->v_interlock); 371 mutex_exit(&vp->v_interlock);
372 if (vp->v_type == VBLK || vp->v_type == VCHR) { 372 if (vp->v_type == VBLK || vp->v_type == VCHR) {
373 spec_node_destroy(vp); 373 spec_node_destroy(vp);
374 } 374 }
375 vp->v_type = VNON; 375 vp->v_type = VNON;
376 } else { 376 } else {
377 /* 377 /*
378 * Don't return to freelist - the holder of the last 378 * Don't return to freelist - the holder of the last
379 * reference will destroy it. 379 * reference will destroy it.
380 */ 380 */
381 vrelel(vp, 0); /* releases vp->v_interlock */ 381 vrelel(vp, 0); /* releases vp->v_interlock */
382 mutex_enter(&vnode_free_list_lock); 382 mutex_enter(&vnode_free_list_lock);
383 goto retry; 383 goto retry;
384 } 384 }
385 385
386 if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 || 386 if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 ||
387 !TAILQ_EMPTY(&vp->v_uobj.memq)) { 387 !TAILQ_EMPTY(&vp->v_uobj.memq)) {
388 vpanic(vp, "cleaned vnode isn't"); 388 vpanic(vp, "cleaned vnode isn't");
389 } 389 }
390 if (vp->v_numoutput != 0) { 390 if (vp->v_numoutput != 0) {
391 vpanic(vp, "clean vnode has pending I/O's"); 391 vpanic(vp, "clean vnode has pending I/O's");
392 } 392 }
393 if ((vp->v_iflag & VI_ONWORKLST) != 0) { 393 if ((vp->v_iflag & VI_ONWORKLST) != 0) {
394 vpanic(vp, "clean vnode on syncer list"); 394 vpanic(vp, "clean vnode on syncer list");
395 } 395 }
396 396
397 return vp; 397 return vp;
398} 398}
399 399
400/* 400/*
401 * Mark a mount point as busy, and gain a new reference to it. Used to 401 * Mark a mount point as busy, and gain a new reference to it. Used to
402 * prevent the file system from being unmounted during critical sections. 402 * prevent the file system from being unmounted during critical sections.
403 * 403 *
404 * => The caller must hold a pre-existing reference to the mount. 404 * => The caller must hold a pre-existing reference to the mount.
405 * => Will fail if the file system is being unmounted, or is unmounted. 405 * => Will fail if the file system is being unmounted, or is unmounted.
406 */ 406 */
407int 407int
408vfs_busy(struct mount *mp, struct mount **nextp) 408vfs_busy(struct mount *mp, struct mount **nextp)
409{ 409{
410 410
411 KASSERT(mp->mnt_refcnt > 0); 411 KASSERT(mp->mnt_refcnt > 0);
412 412
413 if (__predict_false(!rw_tryenter(&mp->mnt_unmounting, RW_READER))) { 413 if (__predict_false(!rw_tryenter(&mp->mnt_unmounting, RW_READER))) {
414 if (nextp != NULL) { 414 if (nextp != NULL) {
415 KASSERT(mutex_owned(&mountlist_lock)); 415 KASSERT(mutex_owned(&mountlist_lock));
416 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 416 *nextp = CIRCLEQ_NEXT(mp, mnt_list);
417 } 417 }
418 return EBUSY; 418 return EBUSY;
419 } 419 }
420 if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) { 420 if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
421 rw_exit(&mp->mnt_unmounting); 421 rw_exit(&mp->mnt_unmounting);
422 if (nextp != NULL) { 422 if (nextp != NULL) {
423 KASSERT(mutex_owned(&mountlist_lock)); 423 KASSERT(mutex_owned(&mountlist_lock));
424 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 424 *nextp = CIRCLEQ_NEXT(mp, mnt_list);
425 } 425 }
426 return ENOENT; 426 return ENOENT;
427 } 427 }
428 if (nextp != NULL) { 428 if (nextp != NULL) {
429 mutex_exit(&mountlist_lock); 429 mutex_exit(&mountlist_lock);
430 } 430 }
431 atomic_inc_uint(&mp->mnt_refcnt); 431 atomic_inc_uint(&mp->mnt_refcnt);
432 return 0; 432 return 0;
433} 433}
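
[Illustrative usage, not part of this change: the caller pattern for
walking the mount list, where the nextp argument keeps the iteration
valid even if a mount disappears mid-scan.]

	struct mount *mp, *nmp;

	mutex_enter(&mountlist_lock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	    mp = nmp) {
		if (vfs_busy(mp, &nmp) != 0)
			continue;	/* being unmounted; nmp already set */
		/* ... operate on the busied file system ... */
		vfs_unbusy(mp, false, &nmp);
	}
	mutex_exit(&mountlist_lock);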
434 434
435/* 435/*
436 * Unbusy a busy filesystem. 436 * Unbusy a busy filesystem.
437 * 437 *
438 * => If keepref is true, preserve reference added by vfs_busy(). 438 * => If keepref is true, preserve reference added by vfs_busy().
439 * => If nextp != NULL, acquire mountlist_lock. 439 * => If nextp != NULL, acquire mountlist_lock.
440 */ 440 */
441void 441void
442vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp) 442vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp)
443{ 443{
444 444
445 KASSERT(mp->mnt_refcnt > 0); 445 KASSERT(mp->mnt_refcnt > 0);
446 446
447 if (nextp != NULL) { 447 if (nextp != NULL) {
448 mutex_enter(&mountlist_lock); 448 mutex_enter(&mountlist_lock);
449 } 449 }
450 rw_exit(&mp->mnt_unmounting); 450 rw_exit(&mp->mnt_unmounting);
451 if (!keepref) { 451 if (!keepref) {
452 vfs_destroy(mp); 452 vfs_destroy(mp);
453 } 453 }
454 if (nextp != NULL) { 454 if (nextp != NULL) {
455 KASSERT(mutex_owned(&mountlist_lock)); 455 KASSERT(mutex_owned(&mountlist_lock));
456 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 456 *nextp = CIRCLEQ_NEXT(mp, mnt_list);
457 } 457 }
458} 458}
459 459
460/* 460/*
461 * Look up a filesystem type, and if found allocate and initialize 461 * Look up a filesystem type, and if found allocate and initialize
462 * a mount structure for it. 462 * a mount structure for it.
463 * 463 *
464 * Devname is usually updated by mount(8) after booting. 464 * Devname is usually updated by mount(8) after booting.
465 */ 465 */
466int 466int
467vfs_rootmountalloc(const char *fstypename, const char *devname, 467vfs_rootmountalloc(const char *fstypename, const char *devname,
468 struct mount **mpp) 468 struct mount **mpp)
469{ 469{
470 struct vfsops *vfsp = NULL; 470 struct vfsops *vfsp = NULL;
471 struct mount *mp; 471 struct mount *mp;
472 472
473 mutex_enter(&vfs_list_lock); 473 mutex_enter(&vfs_list_lock);
474 LIST_FOREACH(vfsp, &vfs_list, vfs_list) 474 LIST_FOREACH(vfsp, &vfs_list, vfs_list)
475 if (!strncmp(vfsp->vfs_name, fstypename,  475 if (!strncmp(vfsp->vfs_name, fstypename,
476 sizeof(mp->mnt_stat.f_fstypename))) 476 sizeof(mp->mnt_stat.f_fstypename)))
477 break; 477 break;
478 if (vfsp == NULL) { 478 if (vfsp == NULL) {
479 mutex_exit(&vfs_list_lock); 479 mutex_exit(&vfs_list_lock);
480 return (ENODEV); 480 return (ENODEV);
481 } 481 }
482 vfsp->vfs_refcount++; 482 vfsp->vfs_refcount++;
483 mutex_exit(&vfs_list_lock); 483 mutex_exit(&vfs_list_lock);
484 484
485 mp = kmem_zalloc(sizeof(*mp), KM_SLEEP); 485 mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
486 if (mp == NULL) 486 if (mp == NULL)
487 return ENOMEM; 487 return ENOMEM;
488 mp->mnt_refcnt = 1; 488 mp->mnt_refcnt = 1;
489 rw_init(&mp->mnt_unmounting); 489 rw_init(&mp->mnt_unmounting);
490 mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE); 490 mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE);
491 mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE); 491 mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
492 (void)vfs_busy(mp, NULL); 492 (void)vfs_busy(mp, NULL);
493 TAILQ_INIT(&mp->mnt_vnodelist); 493 TAILQ_INIT(&mp->mnt_vnodelist);
494 mp->mnt_op = vfsp; 494 mp->mnt_op = vfsp;
495 mp->mnt_flag = MNT_RDONLY; 495 mp->mnt_flag = MNT_RDONLY;
496 mp->mnt_vnodecovered = NULL; 496 mp->mnt_vnodecovered = NULL;
497 (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, 497 (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
498 sizeof(mp->mnt_stat.f_fstypename)); 498 sizeof(mp->mnt_stat.f_fstypename));
499 mp->mnt_stat.f_mntonname[0] = '/'; 499 mp->mnt_stat.f_mntonname[0] = '/';
500 mp->mnt_stat.f_mntonname[1] = '\0'; 500 mp->mnt_stat.f_mntonname[1] = '\0';
501 mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] = 501 mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
502 '\0'; 502 '\0';
503 (void)copystr(devname, mp->mnt_stat.f_mntfromname, 503 (void)copystr(devname, mp->mnt_stat.f_mntfromname,
504 sizeof(mp->mnt_stat.f_mntfromname) - 1, 0); 504 sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
505 mount_initspecific(mp); 505 mount_initspecific(mp);
506 *mpp = mp; 506 *mpp = mp;
507 return (0); 507 return (0);
508} 508}
509 509
510/* 510/*
511 * Routines having to do with the management of the vnode table. 511 * Routines having to do with the management of the vnode table.
512 */ 512 */
513extern int (**dead_vnodeop_p)(void *); 513extern int (**dead_vnodeop_p)(void *);
514 514
515/* 515/*
516 * Return the next vnode from the free list. 516 * Return the next vnode from the free list.
517 */ 517 */
518int 518int
519getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *), 519getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
520 vnode_t **vpp) 520 vnode_t **vpp)
521{ 521{
522 struct uvm_object *uobj; 522 struct uvm_object *uobj;
523 static int toggle; 523 static int toggle;
524 vnode_t *vp; 524 vnode_t *vp;
525 int error = 0, tryalloc; 525 int error = 0, tryalloc;
526 526
527 try_again: 527 try_again:
528 if (mp != NULL) { 528 if (mp != NULL) {
529 /* 529 /*
530 * Mark filesystem busy while we're creating a 530 * Mark filesystem busy while we're creating a
531 * vnode. If unmount is in progress, this will 531 * vnode. If unmount is in progress, this will
532 * fail. 532 * fail.
533 */ 533 */
534 error = vfs_busy(mp, NULL); 534 error = vfs_busy(mp, NULL);
535 if (error) 535 if (error)
536 return error; 536 return error;
537 } 537 }
538 538
539 /* 539 /*
540 * We must choose whether to allocate a new vnode or recycle an 540 * We must choose whether to allocate a new vnode or recycle an
541 * existing one. The criterion for allocating a new one is that 541 * existing one. The criterion for allocating a new one is that
542 * the total number of vnodes is less than the number desired or 542 * the total number of vnodes is less than the number desired or
543 * there are no vnodes on either free list. Generally we only 543 * there are no vnodes on either free list. Generally we only
544 * want to recycle vnodes that have no buffers associated with 544 * want to recycle vnodes that have no buffers associated with
545 * them, so we look first on the vnode_free_list. If it is empty, 545 * them, so we look first on the vnode_free_list. If it is empty,
546 * we next consider vnodes with referencing buffers on the 546 * we next consider vnodes with referencing buffers on the
547 * vnode_hold_list. The toggle ensures that half the time we 547 * vnode_hold_list. The toggle ensures that half the time we
548 * will recycle a vnode from the vnode_hold_list, and half the time 548 * will recycle a vnode from the vnode_hold_list, and half the time
549 * we will allocate a new one unless the list has grown to twice 549 * we will allocate a new one unless the list has grown to twice
550 * the desired size. We are reluctant to recycle vnodes from the 550 * the desired size. We are reluctant to recycle vnodes from the
551 * vnode_hold_list because we would lose the identity of all their 551 * vnode_hold_list because we would lose the identity of all their
552 * referencing buffers. 552 * referencing buffers.
553 */ 553 */
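
[Worked example of the policy above, added for illustration: with
desiredvnodes == 1000, numvnodes == 800 always allocates fresh;
numvnodes == 1500 with vnode_free_list empty and vnode_hold_list
populated alternates, via the toggle, between allocating and recycling
a held vnode; once numvnodes exceeds 2000 the toggle is pinned to 0
and we only recycle, unless both free lists are empty.]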
554 554
555 vp = NULL; 555 vp = NULL;
556 556
557 mutex_enter(&vnode_free_list_lock); 557 mutex_enter(&vnode_free_list_lock);
558 558
559 toggle ^= 1; 559 toggle ^= 1;
560 if (numvnodes > 2 * desiredvnodes) 560 if (numvnodes > 2 * desiredvnodes)
561 toggle = 0; 561 toggle = 0;
562 562
563 tryalloc = numvnodes < desiredvnodes || 563 tryalloc = numvnodes < desiredvnodes ||
564 (TAILQ_FIRST(&vnode_free_list) == NULL && 564 (TAILQ_FIRST(&vnode_free_list) == NULL &&
565 (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle)); 565 (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));
566 566
567 if (tryalloc) { 567 if (tryalloc) {
568 numvnodes++; 568 numvnodes++;
569 mutex_exit(&vnode_free_list_lock); 569 mutex_exit(&vnode_free_list_lock);
570 if ((vp = vnalloc(NULL)) == NULL) { 570 if ((vp = vnalloc(NULL)) == NULL) {
571 mutex_enter(&vnode_free_list_lock); 571 mutex_enter(&vnode_free_list_lock);
572 numvnodes--; 572 numvnodes--;
573 } else 573 } else
574 vp->v_usecount = 1; 574 vp->v_usecount = 1;
575 } 575 }
576 576
577 if (vp == NULL) { 577 if (vp == NULL) {
578 vp = getcleanvnode(); 578 vp = getcleanvnode();
579 if (vp == NULL) { 579 if (vp == NULL) {
580 if (mp != NULL) { 580 if (mp != NULL) {
581 vfs_unbusy(mp, false, NULL); 581 vfs_unbusy(mp, false, NULL);
582 } 582 }
583 if (tryalloc) { 583 if (tryalloc) {
584 printf("WARNING: unable to allocate new " 584 printf("WARNING: unable to allocate new "
585 "vnode, retrying...\n"); 585 "vnode, retrying...\n");
586 kpause("newvn", false, hz, NULL); 586 kpause("newvn", false, hz, NULL);
587 goto try_again; 587 goto try_again;
588 } 588 }
589 tablefull("vnode", "increase kern.maxvnodes or NVNODE"); 589 tablefull("vnode", "increase kern.maxvnodes or NVNODE");
590 *vpp = 0; 590 *vpp = 0;
591 return (ENFILE); 591 return (ENFILE);
592 } 592 }
593 vp->v_iflag = 0; 593 vp->v_iflag = 0;
594 vp->v_vflag = 0; 594 vp->v_vflag = 0;
595 vp->v_uflag = 0; 595 vp->v_uflag = 0;
596 vp->v_socket = NULL; 596 vp->v_socket = NULL;
597 } 597 }
598 598
599 KASSERT(vp->v_usecount == 1); 599 KASSERT(vp->v_usecount == 1);
600 KASSERT(vp->v_freelisthd == NULL); 600 KASSERT(vp->v_freelisthd == NULL);
601 KASSERT(LIST_EMPTY(&vp->v_nclist)); 601 KASSERT(LIST_EMPTY(&vp->v_nclist));
602 KASSERT(LIST_EMPTY(&vp->v_dnclist)); 602 KASSERT(LIST_EMPTY(&vp->v_dnclist));
603 603
604 vp->v_type = VNON; 604 vp->v_type = VNON;
605 vp->v_vnlock = &vp->v_lock; 605 vp->v_vnlock = &vp->v_lock;
606 vp->v_tag = tag; 606 vp->v_tag = tag;
607 vp->v_op = vops; 607 vp->v_op = vops;
608 insmntque(vp, mp); 608 insmntque(vp, mp);
609 *vpp = vp; 609 *vpp = vp;
610 vp->v_data = 0; 610 vp->v_data = 0;
611 611
612 /* 612 /*
613 * initialize uvm_object within vnode. 613 * initialize uvm_object within vnode.
614 */ 614 */
615 615
616 uobj = &vp->v_uobj; 616 uobj = &vp->v_uobj;
617 KASSERT(uobj->pgops == &uvm_vnodeops); 617 KASSERT(uobj->pgops == &uvm_vnodeops);
618 KASSERT(uobj->uo_npages == 0); 618 KASSERT(uobj->uo_npages == 0);
619 KASSERT(TAILQ_FIRST(&uobj->memq) == NULL); 619 KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
620 vp->v_size = vp->v_writesize = VSIZENOTSET; 620 vp->v_size = vp->v_writesize = VSIZENOTSET;
621 621
622 if (mp != NULL) { 622 if (mp != NULL) {
623 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 623 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
624 vp->v_vflag |= VV_MPSAFE; 624 vp->v_vflag |= VV_MPSAFE;
625 vfs_unbusy(mp, true, NULL); 625 vfs_unbusy(mp, true, NULL);
626 } 626 }
627 627
628 return (0); 628 return (0);
629} 629}
630 630
631/* 631/*
632 * This is really just the reverse of getnewvnode(). Needed for 632 * This is really just the reverse of getnewvnode(). Needed for
633 * VFS_VGET functions that may need to push back a vnode in case 633 * VFS_VGET functions that may need to push back a vnode in case
634 * of a locking race. 634 * of a locking race.
635 */ 635 */
636void 636void
637ungetnewvnode(vnode_t *vp) 637ungetnewvnode(vnode_t *vp)
638{ 638{
639 639
640 KASSERT(vp->v_usecount == 1); 640 KASSERT(vp->v_usecount == 1);
641 KASSERT(vp->v_data == NULL); 641 KASSERT(vp->v_data == NULL);
642 KASSERT(vp->v_freelisthd == NULL); 642 KASSERT(vp->v_freelisthd == NULL);
643 643
644 mutex_enter(&vp->v_interlock); 644 mutex_enter(&vp->v_interlock);
645 vp->v_iflag |= VI_CLEAN; 645 vp->v_iflag |= VI_CLEAN;
646 vrelel(vp, 0); 646 vrelel(vp, 0);
647} 647}
648 648
649/* 649/*
650 * Allocate a new, uninitialized vnode. If 'mp' is non-NULL, this is a 650 * Allocate a new, uninitialized vnode. If 'mp' is non-NULL, this is a
651 * marker vnode and we are prepared to wait for the allocation. 651 * marker vnode and we are prepared to wait for the allocation.
652 */ 652 */
653vnode_t * 653vnode_t *
654vnalloc(struct mount *mp) 654vnalloc(struct mount *mp)
655{ 655{
656 vnode_t *vp; 656 vnode_t *vp;
657 657
658 vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT)); 658 vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT));
659 if (vp == NULL) { 659 if (vp == NULL) {
660 return NULL; 660 return NULL;
661 } 661 }
662 662
663 memset(vp, 0, sizeof(*vp)); 663 memset(vp, 0, sizeof(*vp));
664 UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0); 664 UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0);
665 cv_init(&vp->v_cv, "vnode"); 665 cv_init(&vp->v_cv, "vnode");
666 /* 666 /*
667 * done by memset() above. 667 * done by memset() above.
668 * LIST_INIT(&vp->v_nclist); 668 * LIST_INIT(&vp->v_nclist);
669 * LIST_INIT(&vp->v_dnclist); 669 * LIST_INIT(&vp->v_dnclist);
670 */ 670 */
671 671
672 if (mp != NULL) { 672 if (mp != NULL) {
673 vp->v_mount = mp; 673 vp->v_mount = mp;
674 vp->v_type = VBAD; 674 vp->v_type = VBAD;
675 vp->v_iflag = VI_MARKER; 675 vp->v_iflag = VI_MARKER;
676 } else { 676 } else {
677 rw_init(&vp->v_lock.vl_lock); 677 rw_init(&vp->v_lock.vl_lock);
678 } 678 }
679 679
680 return vp; 680 return vp;
681} 681}
682 682
683/* 683/*
684 * Free an unused, unreferenced vnode. 684 * Free an unused, unreferenced vnode.
685 */ 685 */
686void 686void
687vnfree(vnode_t *vp) 687vnfree(vnode_t *vp)
688{ 688{
689 689
690 KASSERT(vp->v_usecount == 0); 690 KASSERT(vp->v_usecount == 0);
691 691
692 if ((vp->v_iflag & VI_MARKER) == 0) { 692 if ((vp->v_iflag & VI_MARKER) == 0) {
693 rw_destroy(&vp->v_lock.vl_lock); 693 rw_destroy(&vp->v_lock.vl_lock);
694 mutex_enter(&vnode_free_list_lock); 694 mutex_enter(&vnode_free_list_lock);
695 numvnodes--; 695 numvnodes--;
696 mutex_exit(&vnode_free_list_lock); 696 mutex_exit(&vnode_free_list_lock);
697 } 697 }
698 698
699 UVM_OBJ_DESTROY(&vp->v_uobj); 699 UVM_OBJ_DESTROY(&vp->v_uobj);
700 cv_destroy(&vp->v_cv); 700 cv_destroy(&vp->v_cv);
701 pool_cache_put(vnode_cache, vp); 701 pool_cache_put(vnode_cache, vp);
702} 702}
703 703
704/* 704/*
705 * Remove a vnode from its freelist. 705 * Remove a vnode from its freelist.
706 */ 706 */
707static inline void 707static inline void
708vremfree(vnode_t *vp) 708vremfree(vnode_t *vp)
709{ 709{
710 710
711 KASSERT(mutex_owned(&vp->v_interlock)); 711 KASSERT(mutex_owned(&vp->v_interlock));
712 KASSERT(vp->v_usecount == 0); 712 KASSERT(vp->v_usecount == 0);
713 713
714 /* 714 /*
715 * Note that the reference count must not change until 715 * Note that the reference count must not change until
716 * the vnode is removed. 716 * the vnode is removed.
717 */ 717 */
718 mutex_enter(&vnode_free_list_lock); 718 mutex_enter(&vnode_free_list_lock);
719 if (vp->v_holdcnt > 0) { 719 if (vp->v_holdcnt > 0) {
720 KASSERT(vp->v_freelisthd == &vnode_hold_list); 720 KASSERT(vp->v_freelisthd == &vnode_hold_list);
721 } else { 721 } else {
722 KASSERT(vp->v_freelisthd == &vnode_free_list); 722 KASSERT(vp->v_freelisthd == &vnode_free_list);
723 } 723 }
724 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 724 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
725 vp->v_freelisthd = NULL; 725 vp->v_freelisthd = NULL;
726 mutex_exit(&vnode_free_list_lock); 726 mutex_exit(&vnode_free_list_lock);
727} 727}
728 728
729/* 729/*
730 * Move a vnode from one mount queue to another. 730 * Move a vnode from one mount queue to another.
731 */ 731 */
732static void 732static void
733insmntque(vnode_t *vp, struct mount *mp) 733insmntque(vnode_t *vp, struct mount *mp)
734{ 734{
735 struct mount *omp; 735 struct mount *omp;
736 736
737#ifdef DIAGNOSTIC 737#ifdef DIAGNOSTIC
738 if ((mp != NULL) && 738 if ((mp != NULL) &&
739 (mp->mnt_iflag & IMNT_UNMOUNT) && 739 (mp->mnt_iflag & IMNT_UNMOUNT) &&
740 !(mp->mnt_flag & MNT_SOFTDEP) && 740 !(mp->mnt_flag & MNT_SOFTDEP) &&
741 vp->v_tag != VT_VFS) { 741 vp->v_tag != VT_VFS) {
742 panic("insmntque into dying filesystem"); 742 panic("insmntque into dying filesystem");
743 } 743 }
744#endif 744#endif
745 745
746 mutex_enter(&mntvnode_lock); 746 mutex_enter(&mntvnode_lock);
747 /* 747 /*
748 * Delete from old mount point vnode list, if on one. 748 * Delete from old mount point vnode list, if on one.
749 */ 749 */
750 if ((omp = vp->v_mount) != NULL) 750 if ((omp = vp->v_mount) != NULL)
751 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes); 751 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
752 /* 752 /*
753 * Insert into list of vnodes for the new mount point, if 753 * Insert into list of vnodes for the new mount point, if
754 * available. The caller must take a reference on the mount 754 * available. The caller must take a reference on the mount
755 * structure and donate to the vnode. 755 * structure and donate to the vnode.
756 */ 756 */
757 if ((vp->v_mount = mp) != NULL) 757 if ((vp->v_mount = mp) != NULL)
758 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); 758 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
759 mutex_exit(&mntvnode_lock); 759 mutex_exit(&mntvnode_lock);
760 760
761 if (omp != NULL) { 761 if (omp != NULL) {
762 /* Release reference to old mount. */ 762 /* Release reference to old mount. */
763 vfs_destroy(omp); 763 vfs_destroy(omp);
764 } 764 }
765} 765}
766 766
767/* 767/*
768 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or 768 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
769 * recycled. 769 * recycled.
770 */ 770 */
771void 771void
772vwait(vnode_t *vp, int flags) 772vwait(vnode_t *vp, int flags)
773{ 773{
774 774
775 KASSERT(mutex_owned(&vp->v_interlock)); 775 KASSERT(mutex_owned(&vp->v_interlock));
776 KASSERT(vp->v_usecount != 0); 776 KASSERT(vp->v_usecount != 0);
777 777
778 while ((vp->v_iflag & flags) != 0) 778 while ((vp->v_iflag & flags) != 0)
779 cv_wait(&vp->v_cv, &vp->v_interlock); 779 cv_wait(&vp->v_cv, &vp->v_interlock);
780} 780}
781 781
782/* 782/*
783 * Insert a marker vnode into a mount's vnode list, after the 783 * Insert a marker vnode into a mount's vnode list, after the
784 * specified vnode. mntvnode_lock must be held. 784 * specified vnode. mntvnode_lock must be held.
785 */ 785 */
786void 786void
787vmark(vnode_t *mvp, vnode_t *vp) 787vmark(vnode_t *mvp, vnode_t *vp)
788{ 788{
789 struct mount *mp; 789 struct mount *mp;
790 790
791 mp = mvp->v_mount; 791 mp = mvp->v_mount;
792 792
793 KASSERT(mutex_owned(&mntvnode_lock)); 793 KASSERT(mutex_owned(&mntvnode_lock));
794 KASSERT((mvp->v_iflag & VI_MARKER) != 0); 794 KASSERT((mvp->v_iflag & VI_MARKER) != 0);
795 KASSERT(vp->v_mount == mp); 795 KASSERT(vp->v_mount == mp);
796 796
797 TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes); 797 TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes);
798} 798}
799 799
800/* 800/*
801 * Remove a marker vnode from a mount's vnode list, and return 801 * Remove a marker vnode from a mount's vnode list, and return
802 * a pointer to the next vnode in the list. mntvnode_lock must 802 * a pointer to the next vnode in the list. mntvnode_lock must
803 * be held. 803 * be held.
804 */ 804 */
805vnode_t * 805vnode_t *
806vunmark(vnode_t *mvp) 806vunmark(vnode_t *mvp)
807{ 807{
808 vnode_t *vp; 808 vnode_t *vp;
809 struct mount *mp; 809 struct mount *mp;
810 810
811 mp = mvp->v_mount; 811 mp = mvp->v_mount;
812 812
813 KASSERT(mutex_owned(&mntvnode_lock)); 813 KASSERT(mutex_owned(&mntvnode_lock));
814 KASSERT((mvp->v_iflag & VI_MARKER) != 0); 814 KASSERT((mvp->v_iflag & VI_MARKER) != 0);
815 815
816 vp = TAILQ_NEXT(mvp, v_mntvnodes); 816 vp = TAILQ_NEXT(mvp, v_mntvnodes);
817 TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes);  817 TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes);
818 818
819 KASSERT(vp == NULL || vp->v_mount == mp); 819 KASSERT(vp == NULL || vp->v_mount == mp);
820 820
821 return vp; 821 return vp;
822} 822}
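
[Illustrative usage, not part of this change: scanning a mount's vnode
list with a marker so that mntvnode_lock can be dropped mid-scan
without losing the iteration position.]

	vnode_t *vp, *mvp;

	mvp = vnalloc(mp);		/* marker; mp != NULL -> PR_WAITOK */
	mutex_enter(&mntvnode_lock);
	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
	    vp = vunmark(mvp)) {
		vmark(mvp, vp);
		if ((vp->v_iflag & VI_MARKER) != 0)
			continue;	/* someone else's marker */
		/* ... may drop mntvnode_lock, use vp, then retake it ... */
	}
	mutex_exit(&mntvnode_lock);
	vnfree(mvp);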
823 823
824/* 824/*
825 * Update outstanding I/O count and do wakeup if requested. 825 * Update outstanding I/O count and do wakeup if requested.
826 */ 826 */
827void 827void
828vwakeup(struct buf *bp) 828vwakeup(struct buf *bp)
829{ 829{
830 struct vnode *vp; 830 struct vnode *vp;
831 831
832 if ((vp = bp->b_vp) == NULL) 832 if ((vp = bp->b_vp) == NULL)
833 return; 833 return;
834 834
835 KASSERT(bp->b_objlock == &vp->v_interlock); 835 KASSERT(bp->b_objlock == &vp->v_interlock);
836 KASSERT(mutex_owned(bp->b_objlock)); 836 KASSERT(mutex_owned(bp->b_objlock));
837 837
838 if (--vp->v_numoutput < 0) 838 if (--vp->v_numoutput < 0)
839 panic("vwakeup: neg numoutput, vp %p", vp); 839 panic("vwakeup: neg numoutput, vp %p", vp);
840 if (vp->v_numoutput == 0) 840 if (vp->v_numoutput == 0)
841 cv_broadcast(&vp->v_cv); 841 cv_broadcast(&vp->v_cv);
842} 842}
843 843
844/* 844/*
845 * Flush out and invalidate all buffers associated with a vnode. 845 * Flush out and invalidate all buffers associated with a vnode.
846 * Called with the underlying vnode locked, which should prevent new dirty 846 * Called with the underlying vnode locked, which should prevent new dirty
847 * buffers from being queued. 847 * buffers from being queued.
848 */ 848 */
849int 849int
850vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l, 850vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l,
851 bool catch, int slptimeo) 851 bool catch, int slptimeo)
852{ 852{
853 struct buf *bp, *nbp; 853 struct buf *bp, *nbp;
854 int error; 854 int error;
855 int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO | 855 int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
856 (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0); 856 (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0);
857 857
858 /* XXXUBC this doesn't look at flags or slp* */ 858 /* XXXUBC this doesn't look at flags or slp* */
859 mutex_enter(&vp->v_interlock); 859 mutex_enter(&vp->v_interlock);
860 error = VOP_PUTPAGES(vp, 0, 0, flushflags); 860 error = VOP_PUTPAGES(vp, 0, 0, flushflags);
861 if (error) { 861 if (error) {
862 return error; 862 return error;
863 } 863 }
864 864
865 if (flags & V_SAVE) { 865 if (flags & V_SAVE) {
866 error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0); 866 error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0);
867 if (error) 867 if (error)
868 return (error); 868 return (error);
869 KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd)); 869 KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd));
870 } 870 }
871 871
872 mutex_enter(&bufcache_lock); 872 mutex_enter(&bufcache_lock);
873restart: 873restart:
874 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 874 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
875 nbp = LIST_NEXT(bp, b_vnbufs); 875 nbp = LIST_NEXT(bp, b_vnbufs);
876 error = bbusy(bp, catch, slptimeo, NULL); 876 error = bbusy(bp, catch, slptimeo, NULL);
877 if (error != 0) { 877 if (error != 0) {
878 if (error == EPASSTHROUGH) 878 if (error == EPASSTHROUGH)
879 goto restart; 879 goto restart;
880 mutex_exit(&bufcache_lock); 880 mutex_exit(&bufcache_lock);
881 return (error); 881 return (error);
882 } 882 }
883 brelsel(bp, BC_INVAL | BC_VFLUSH); 883 brelsel(bp, BC_INVAL | BC_VFLUSH);
884 } 884 }
885 885
886 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 886 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
887 nbp = LIST_NEXT(bp, b_vnbufs); 887 nbp = LIST_NEXT(bp, b_vnbufs);
888 error = bbusy(bp, catch, slptimeo, NULL); 888 error = bbusy(bp, catch, slptimeo, NULL);
889 if (error != 0) { 889 if (error != 0) {
890 if (error == EPASSTHROUGH) 890 if (error == EPASSTHROUGH)
891 goto restart; 891 goto restart;
892 mutex_exit(&bufcache_lock); 892 mutex_exit(&bufcache_lock);
893 return (error); 893 return (error);
894 } 894 }
895 /* 895 /*
896 * XXX Since there are no node locks for NFS, I believe 896 * XXX Since there are no node locks for NFS, I believe
897 * there is a slight chance that a delayed write will 897 * there is a slight chance that a delayed write will
898 * occur while sleeping just above, so check for it. 898 * occur while sleeping just above, so check for it.
899 */ 899 */
900 if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) { 900 if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) {
901#ifdef DEBUG 901#ifdef DEBUG
902 printf("buffer still DELWRI\n"); 902 printf("buffer still DELWRI\n");
903#endif 903#endif
904 bp->b_cflags |= BC_BUSY | BC_VFLUSH; 904 bp->b_cflags |= BC_BUSY | BC_VFLUSH;
905 mutex_exit(&bufcache_lock); 905 mutex_exit(&bufcache_lock);
906 VOP_BWRITE(bp); 906 VOP_BWRITE(bp);
907 mutex_enter(&bufcache_lock); 907 mutex_enter(&bufcache_lock);
908 goto restart; 908 goto restart;
909 } 909 }
910 brelsel(bp, BC_INVAL | BC_VFLUSH); 910 brelsel(bp, BC_INVAL | BC_VFLUSH);
911 } 911 }
912 912
913#ifdef DIAGNOSTIC 913#ifdef DIAGNOSTIC
914 if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd)) 914 if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
915 panic("vinvalbuf: flush failed, vp %p", vp); 915 panic("vinvalbuf: flush failed, vp %p", vp);
916#endif 916#endif
917 917
918 mutex_exit(&bufcache_lock); 918 mutex_exit(&bufcache_lock);
919 919
920 return (0); 920 return (0);
921} 921}
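
[Illustrative usage, an assumption for context: the reclaim path first
tries to save dirty data and, on failure, falls back to discarding
everything, as in the vclean()/DOCLOSE case.]

	error = vinvalbuf(vp, V_SAVE, NOCRED, l, false, 0);
	if (error != 0)
		error = vinvalbuf(vp, 0, NOCRED, l, false, 0);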
922 922
923/* 923/*
924 * Destroy any in-core blocks past the truncation length. 924 * Destroy any in-core blocks past the truncation length.
925 * Called with the underlying vnode locked, which should prevent new dirty 925 * Called with the underlying vnode locked, which should prevent new dirty
926 * buffers from being queued. 926 * buffers from being queued.
927 */ 927 */
928int 928int
929vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch, int slptimeo) 929vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch, int slptimeo)
930{ 930{
931 struct buf *bp, *nbp; 931 struct buf *bp, *nbp;
932 int error; 932 int error;
933 voff_t off; 933 voff_t off;
934 934
935 off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift); 935 off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
936 mutex_enter(&vp->v_interlock); 936 mutex_enter(&vp->v_interlock);
937 error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO); 937 error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
938 if (error) { 938 if (error) {
939 return error; 939 return error;
940 } 940 }
941 941
942 mutex_enter(&bufcache_lock); 942 mutex_enter(&bufcache_lock);
943restart: 943restart:
944 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 944 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
945 nbp = LIST_NEXT(bp, b_vnbufs); 945 nbp = LIST_NEXT(bp, b_vnbufs);
946 if (bp->b_lblkno < lbn) 946 if (bp->b_lblkno < lbn)
947 continue; 947 continue;
948 error = bbusy(bp, catch, slptimeo, NULL); 948 error = bbusy(bp, catch, slptimeo, NULL);
949 if (error != 0) { 949 if (error != 0) {
950 if (error == EPASSTHROUGH) 950 if (error == EPASSTHROUGH)
951 goto restart; 951 goto restart;
952 mutex_exit(&bufcache_lock); 952 mutex_exit(&bufcache_lock);
953 return (error); 953 return (error);
954 } 954 }
955 brelsel(bp, BC_INVAL | BC_VFLUSH); 955 brelsel(bp, BC_INVAL | BC_VFLUSH);
956 } 956 }
957 957
958 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 958 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
959 nbp = LIST_NEXT(bp, b_vnbufs); 959 nbp = LIST_NEXT(bp, b_vnbufs);
960 if (bp->b_lblkno < lbn) 960 if (bp->b_lblkno < lbn)
961 continue; 961 continue;
962 error = bbusy(bp, catch, slptimeo, NULL); 962 error = bbusy(bp, catch, slptimeo, NULL);
963 if (error != 0) { 963 if (error != 0) {
964 if (error == EPASSTHROUGH) 964 if (error == EPASSTHROUGH)
965 goto restart; 965 goto restart;
966 mutex_exit(&bufcache_lock); 966 mutex_exit(&bufcache_lock);
967 return (error); 967 return (error);
968 } 968 }
969 brelsel(bp, BC_INVAL | BC_VFLUSH); 969 brelsel(bp, BC_INVAL | BC_VFLUSH);
970 } 970 }
971 mutex_exit(&bufcache_lock); 971 mutex_exit(&bufcache_lock);
972 972
973 return (0); 973 return (0);
974} 974}
975 975
976/* 976/*
977 * Flush all dirty buffers from a vnode. 977 * Flush all dirty buffers from a vnode.
978 * Called with the underlying vnode locked, which should prevent new dirty 978 * Called with the underlying vnode locked, which should prevent new dirty
979 * buffers from being queued. 979 * buffers from being queued.
980 */ 980 */
981void 981void
982vflushbuf(struct vnode *vp, int sync) 982vflushbuf(struct vnode *vp, int sync)
983{ 983{
984 struct buf *bp, *nbp; 984 struct buf *bp, *nbp;
985 int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0); 985 int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0);
986 bool dirty; 986 bool dirty;
987 987
988 mutex_enter(&vp->v_interlock); 988 mutex_enter(&vp->v_interlock);
989 (void) VOP_PUTPAGES(vp, 0, 0, flags); 989 (void) VOP_PUTPAGES(vp, 0, 0, flags);
990 990
991loop: 991loop:
992 mutex_enter(&bufcache_lock); 992 mutex_enter(&bufcache_lock);
993 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 993 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
994 nbp = LIST_NEXT(bp, b_vnbufs); 994 nbp = LIST_NEXT(bp, b_vnbufs);
995 if ((bp->b_cflags & BC_BUSY)) 995 if ((bp->b_cflags & BC_BUSY))
996 continue; 996 continue;
997 if ((bp->b_oflags & BO_DELWRI) == 0) 997 if ((bp->b_oflags & BO_DELWRI) == 0)
998 panic("vflushbuf: not dirty, bp %p", bp); 998 panic("vflushbuf: not dirty, bp %p", bp);
999 bp->b_cflags |= BC_BUSY | BC_VFLUSH; 999 bp->b_cflags |= BC_BUSY | BC_VFLUSH;
1000 mutex_exit(&bufcache_lock); 1000 mutex_exit(&bufcache_lock);
1001 /* 1001 /*
1002 * Wait for I/O associated with indirect blocks to complete, 1002 * Wait for I/O associated with indirect blocks to complete,
1003 * since there is no way to quickly wait for them below. 1003 * since there is no way to quickly wait for them below.
1004 */ 1004 */
1005 if (bp->b_vp == vp || sync == 0) 1005 if (bp->b_vp == vp || sync == 0)
1006 (void) bawrite(bp); 1006 (void) bawrite(bp);
1007 else 1007 else
1008 (void) bwrite(bp); 1008 (void) bwrite(bp);
1009 goto loop; 1009 goto loop;
1010 } 1010 }
1011 mutex_exit(&bufcache_lock); 1011 mutex_exit(&bufcache_lock);
1012 1012
1013 if (sync == 0) 1013 if (sync == 0)
1014 return; 1014 return;
1015 1015
1016 mutex_enter(&vp->v_interlock); 1016 mutex_enter(&vp->v_interlock);
1017 while (vp->v_numoutput != 0) 1017 while (vp->v_numoutput != 0)
1018 cv_wait(&vp->v_cv, &vp->v_interlock); 1018 cv_wait(&vp->v_cv, &vp->v_interlock);
1019 dirty = !LIST_EMPTY(&vp->v_dirtyblkhd); 1019 dirty = !LIST_EMPTY(&vp->v_dirtyblkhd);
1020 mutex_exit(&vp->v_interlock); 1020 mutex_exit(&vp->v_interlock);
1021 1021
1022 if (dirty) { 1022 if (dirty) {
1023 vprint("vflushbuf: dirty", vp); 1023 vprint("vflushbuf: dirty", vp);
1024 goto loop; 1024 goto loop;
1025 } 1025 }
1026} 1026}
1027 1027
1028/* 1028/*
1029 * Create a vnode for a block device. 1029 * Create a vnode for a block device.
1030 * Used for root filesystem and swap areas. 1030 * Used for root filesystem and swap areas.
1031 * Also used for memory file system special devices. 1031 * Also used for memory file system special devices.
1032 */ 1032 */
1033int 1033int
1034bdevvp(dev_t dev, vnode_t **vpp) 1034bdevvp(dev_t dev, vnode_t **vpp)
1035{ 1035{
1036 1036
1037 return (getdevvp(dev, vpp, VBLK)); 1037 return (getdevvp(dev, vpp, VBLK));
1038} 1038}
1039 1039
1040/* 1040/*
1041 * Create a vnode for a character device. 1041 * Create a vnode for a character device.
1042 * Used for kernfs and some console handling. 1042 * Used for kernfs and some console handling.
1043 */ 1043 */
1044int 1044int
1045cdevvp(dev_t dev, vnode_t **vpp) 1045cdevvp(dev_t dev, vnode_t **vpp)
1046{ 1046{
1047 1047
1048 return (getdevvp(dev, vpp, VCHR)); 1048 return (getdevvp(dev, vpp, VCHR));
1049} 1049}
1050 1050
1051/* 1051/*
1052 * Associate a buffer with a vnode. There must already be a hold on 1052 * Associate a buffer with a vnode. There must already be a hold on
1053 * the vnode. 1053 * the vnode.
1054 */ 1054 */
1055void 1055void
1056bgetvp(struct vnode *vp, struct buf *bp) 1056bgetvp(struct vnode *vp, struct buf *bp)
1057{ 1057{
1058 1058
1059 KASSERT(bp->b_vp == NULL); 1059 KASSERT(bp->b_vp == NULL);
1060 KASSERT(bp->b_objlock == &buffer_lock); 1060 KASSERT(bp->b_objlock == &buffer_lock);
1061 KASSERT(mutex_owned(&vp->v_interlock)); 1061 KASSERT(mutex_owned(&vp->v_interlock));
1062 KASSERT(mutex_owned(&bufcache_lock)); 1062 KASSERT(mutex_owned(&bufcache_lock));
1063 KASSERT((bp->b_cflags & BC_BUSY) != 0); 1063 KASSERT((bp->b_cflags & BC_BUSY) != 0);
1064 KASSERT(!cv_has_waiters(&bp->b_done)); 1064 KASSERT(!cv_has_waiters(&bp->b_done));
1065 1065
1066 vholdl(vp); 1066 vholdl(vp);
1067 bp->b_vp = vp; 1067 bp->b_vp = vp;
1068 if (vp->v_type == VBLK || vp->v_type == VCHR) 1068 if (vp->v_type == VBLK || vp->v_type == VCHR)
1069 bp->b_dev = vp->v_rdev; 1069 bp->b_dev = vp->v_rdev;
1070 else 1070 else
1071 bp->b_dev = NODEV; 1071 bp->b_dev = NODEV;
1072 1072
1073 /* 1073 /*
1074 * Insert onto list for new vnode. 1074 * Insert onto list for new vnode.
1075 */ 1075 */
1076 bufinsvn(bp, &vp->v_cleanblkhd); 1076 bufinsvn(bp, &vp->v_cleanblkhd);
1077 bp->b_objlock = &vp->v_interlock; 1077 bp->b_objlock = &vp->v_interlock;
1078} 1078}
1079 1079
1080/* 1080/*
1081 * Disassociate a buffer from a vnode. 1081 * Disassociate a buffer from a vnode.
1082 */ 1082 */
1083void 1083void
1084brelvp(struct buf *bp) 1084brelvp(struct buf *bp)
1085{ 1085{
1086 struct vnode *vp = bp->b_vp; 1086 struct vnode *vp = bp->b_vp;
1087 1087
1088 KASSERT(vp != NULL); 1088 KASSERT(vp != NULL);
1089 KASSERT(bp->b_objlock == &vp->v_interlock); 1089 KASSERT(bp->b_objlock == &vp->v_interlock);
1090 KASSERT(mutex_owned(&vp->v_interlock)); 1090 KASSERT(mutex_owned(&vp->v_interlock));
1091 KASSERT(mutex_owned(&bufcache_lock)); 1091 KASSERT(mutex_owned(&bufcache_lock));
1092 KASSERT((bp->b_cflags & BC_BUSY) != 0); 1092 KASSERT((bp->b_cflags & BC_BUSY) != 0);
1093 KASSERT(!cv_has_waiters(&bp->b_done)); 1093 KASSERT(!cv_has_waiters(&bp->b_done));
1094 1094
1095 /* 1095 /*
1096 * Delete from old vnode list, if on one. 1096 * Delete from old vnode list, if on one.
1097 */ 1097 */
1098 if (LIST_NEXT(bp, b_vnbufs) != NOLIST) 1098 if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
1099 bufremvn(bp); 1099 bufremvn(bp);
1100 1100
1101 if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_iflag & VI_ONWORKLST) && 1101 if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_iflag & VI_ONWORKLST) &&
1102 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { 1102 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
1103 vp->v_iflag &= ~VI_WRMAPDIRTY; 1103 vp->v_iflag &= ~VI_WRMAPDIRTY;
1104 vn_syncer_remove_from_worklist(vp); 1104 vn_syncer_remove_from_worklist(vp);
1105 } 1105 }
1106 1106
1107 bp->b_objlock = &buffer_lock; 1107 bp->b_objlock = &buffer_lock;
1108 bp->b_vp = NULL; 1108 bp->b_vp = NULL;
1109 holdrelel(vp); 1109 holdrelel(vp);
1110} 1110}
1111 1111
1112/* 1112/*
1113 * Reassign a buffer from one vnode list to another. 1113 * Reassign a buffer from one vnode list to another.
1114 * The list reassignment must be within the same vnode. 1114 * The list reassignment must be within the same vnode.
1115 * Used to assign file specific control information 1115 * Used to assign file specific control information
1116 * (indirect blocks) to the list to which they belong. 1116 * (indirect blocks) to the list to which they belong.
1117 */ 1117 */
1118void 1118void
1119reassignbuf(struct buf *bp, struct vnode *vp) 1119reassignbuf(struct buf *bp, struct vnode *vp)
1120{ 1120{
1121 struct buflists *listheadp; 1121 struct buflists *listheadp;
1122 int delayx; 1122 int delayx;
1123 1123
1124 KASSERT(mutex_owned(&bufcache_lock)); 1124 KASSERT(mutex_owned(&bufcache_lock));
1125 KASSERT(bp->b_objlock == &vp->v_interlock); 1125 KASSERT(bp->b_objlock == &vp->v_interlock);
1126 KASSERT(mutex_owned(&vp->v_interlock)); 1126 KASSERT(mutex_owned(&vp->v_interlock));
1127 KASSERT((bp->b_cflags & BC_BUSY) != 0); 1127 KASSERT((bp->b_cflags & BC_BUSY) != 0);
1128 1128
1129 /* 1129 /*
1130 * Delete from old vnode list, if on one. 1130 * Delete from old vnode list, if on one.
1131 */ 1131 */
1132 if (LIST_NEXT(bp, b_vnbufs) != NOLIST) 1132 if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
1133 bufremvn(bp); 1133 bufremvn(bp);
1134 1134
1135 /* 1135 /*
1136 * If dirty, put on list of dirty buffers; 1136 * If dirty, put on list of dirty buffers;
1137 * otherwise insert onto list of clean buffers. 1137 * otherwise insert onto list of clean buffers.
1138 */ 1138 */
1139 if ((bp->b_oflags & BO_DELWRI) == 0) { 1139 if ((bp->b_oflags & BO_DELWRI) == 0) {
1140 listheadp = &vp->v_cleanblkhd; 1140 listheadp = &vp->v_cleanblkhd;
1141 if (TAILQ_EMPTY(&vp->v_uobj.memq) && 1141 if (TAILQ_EMPTY(&vp->v_uobj.memq) &&
1142 (vp->v_iflag & VI_ONWORKLST) && 1142 (vp->v_iflag & VI_ONWORKLST) &&
1143 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { 1143 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
1144 vp->v_iflag &= ~VI_WRMAPDIRTY; 1144 vp->v_iflag &= ~VI_WRMAPDIRTY;
1145 vn_syncer_remove_from_worklist(vp); 1145 vn_syncer_remove_from_worklist(vp);
1146 } 1146 }
1147 } else { 1147 } else {
1148 listheadp = &vp->v_dirtyblkhd; 1148 listheadp = &vp->v_dirtyblkhd;
1149 if ((vp->v_iflag & VI_ONWORKLST) == 0) { 1149 if ((vp->v_iflag & VI_ONWORKLST) == 0) {
1150 switch (vp->v_type) { 1150 switch (vp->v_type) {
1151 case VDIR: 1151 case VDIR:
1152 delayx = dirdelay; 1152 delayx = dirdelay;
1153 break; 1153 break;
1154 case VBLK: 1154 case VBLK:
1155 if (vp->v_specmountpoint != NULL) { 1155 if (vp->v_specmountpoint != NULL) {
1156 delayx = metadelay; 1156 delayx = metadelay;
1157 break; 1157 break;
1158 } 1158 }
1159 /* fall through */ 1159 /* fall through */
1160 default: 1160 default:
1161 delayx = filedelay; 1161 delayx = filedelay;
1162 break; 1162 break;
1163 } 1163 }
1164 if (!vp->v_mount || 1164 if (!vp->v_mount ||
1165 (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) 1165 (vp->v_mount->mnt_flag & MNT_ASYNC) == 0)
1166 vn_syncer_add_to_worklist(vp, delayx); 1166 vn_syncer_add_to_worklist(vp, delayx);
1167 } 1167 }
1168 } 1168 }
1169 bufinsvn(bp, listheadp); 1169 bufinsvn(bp, listheadp);
1170} 1170}
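
As a side note on the delay classes chosen in reassignbuf() above: the selection can be restated compactly. A minimal userland sketch, assuming the stock tunable defaults from vfs_subr.c (filedelay 30, dirdelay 15, metadelay 10 seconds) and hypothetical stand-in names; it is an illustration, not the kernel code:

    #include <stdio.h>

    /* Trimmed stand-in for enum vtype; only the cases reassignbuf() distinguishes. */
    enum vtype_sketch { V_REG, V_DIR, V_BLK };

    /*
     * Sketch of the syncer-delay selection: directories and the metadata
     * of mounted block devices are flushed sooner than ordinary file
     * data.  The delay values are assumptions taken from the vfs_subr.c
     * tunable defaults.
     */
    static int
    syncer_delay(enum vtype_sketch type, int mounted_blkdev)
    {
        const int filedelay = 30, dirdelay = 15, metadelay = 10;

        switch (type) {
        case V_DIR:
            return dirdelay;
        case V_BLK:
            if (mounted_blkdev)
                return metadelay;
            /* FALLTHROUGH: an unmounted block device counts as file data */
        default:
            return filedelay;
        }
    }

    int
    main(void)
    {
        printf("%d %d %d\n", syncer_delay(V_DIR, 0),
            syncer_delay(V_BLK, 1), syncer_delay(V_REG, 0));   /* 15 10 30 */
        return 0;
    }
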
1171 1171
1172/* 1172/*
1173 * Create a vnode for a device. 1173 * Create a vnode for a device.
1174 * Used by bdevvp (block device) for root file system etc., 1174 * Used by bdevvp (block device) for root file system etc.,
1175 * and by cdevvp (character device) for console and kernfs. 1175 * and by cdevvp (character device) for console and kernfs.
1176 */ 1176 */
1177static int 1177static int
1178getdevvp(dev_t dev, vnode_t **vpp, enum vtype type) 1178getdevvp(dev_t dev, vnode_t **vpp, enum vtype type)
1179{ 1179{
1180 vnode_t *vp; 1180 vnode_t *vp;

cvs diff -r1.16 -r1.17 src/sys/kern/vfs_wapbl.c

--- src/sys/kern/vfs_wapbl.c 2008/11/24 16:05:21 1.16
+++ src/sys/kern/vfs_wapbl.c 2009/01/03 03:31:23 1.17
@@ -1,2738 +1,2738 @@ @@ -1,2738 +1,2738 @@
1/* $NetBSD: vfs_wapbl.c,v 1.16 2008/11/24 16:05:21 joerg Exp $ */ 1/* $NetBSD: vfs_wapbl.c,v 1.17 2009/01/03 03:31:23 yamt Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2003,2008 The NetBSD Foundation, Inc. 4 * Copyright (c) 2003,2008 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Wasabi Systems, Inc. 8 * by Wasabi Systems, Inc.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright 15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the 16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution. 17 * documentation and/or other materials provided with the distribution.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32/* 32/*
33 * This implements file-system-independent write-ahead logging. 33 * This implements file-system-independent write-ahead logging.
34 */ 34 */
35 35
36#define WAPBL_INTERNAL 36#define WAPBL_INTERNAL
37 37
38#include <sys/cdefs.h> 38#include <sys/cdefs.h>
39__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.16 2008/11/24 16:05:21 joerg Exp $"); 39__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.17 2009/01/03 03:31:23 yamt Exp $");
40 40
41#include <sys/param.h> 41#include <sys/param.h>
42 42
43#ifdef _KERNEL 43#ifdef _KERNEL
44#include <sys/param.h> 44#include <sys/param.h>
45#include <sys/namei.h> 45#include <sys/namei.h>
46#include <sys/proc.h> 46#include <sys/proc.h>
47#include <sys/uio.h> 47#include <sys/uio.h>
48#include <sys/vnode.h> 48#include <sys/vnode.h>
49#include <sys/file.h> 49#include <sys/file.h>
50#include <sys/malloc.h> 50#include <sys/malloc.h>
51#include <sys/resourcevar.h> 51#include <sys/resourcevar.h>
52#include <sys/conf.h> 52#include <sys/conf.h>
53#include <sys/mount.h> 53#include <sys/mount.h>
54#include <sys/kernel.h> 54#include <sys/kernel.h>
55#include <sys/kauth.h> 55#include <sys/kauth.h>
56#include <sys/mutex.h> 56#include <sys/mutex.h>
57#include <sys/atomic.h> 57#include <sys/atomic.h>
58#include <sys/wapbl.h> 58#include <sys/wapbl.h>
59#include <sys/wapbl_replay.h> 59#include <sys/wapbl_replay.h>
60 60
61#if WAPBL_UVM_ALLOC 61#if WAPBL_UVM_ALLOC
62#include <uvm/uvm.h> 62#include <uvm/uvm.h>
63#endif 63#endif
64 64
65#include <miscfs/specfs/specdev.h> 65#include <miscfs/specfs/specdev.h>
66 66
67MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging"); 67MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging");
68#define wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK) 68#define wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK)
69#define wapbl_free(a) free((a), M_WAPBL) 69#define wapbl_free(a) free((a), M_WAPBL)
70#define wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO) 70#define wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO)
71#define wapbl_realloc(ptr, s) realloc((ptr), (s), M_WAPBL, M_WAITOK | M_ZERO) 71#define wapbl_realloc(ptr, s) realloc((ptr), (s), M_WAPBL, M_WAITOK | M_ZERO)
72 72
73#else /* !_KERNEL */ 73#else /* !_KERNEL */
74#include <assert.h> 74#include <assert.h>
75#include <errno.h> 75#include <errno.h>
76#include <stdio.h> 76#include <stdio.h>
77#include <stdbool.h> 77#include <stdbool.h>
78#include <stdlib.h> 78#include <stdlib.h>
79#include <string.h> 79#include <string.h>
80 80
81#include <sys/time.h> 81#include <sys/time.h>
82#include <sys/wapbl.h> 82#include <sys/wapbl.h>
83#include <sys/wapbl_replay.h> 83#include <sys/wapbl_replay.h>
84 84
85#define KDASSERT(x) assert(x) 85#define KDASSERT(x) assert(x)
86#define KASSERT(x) assert(x) 86#define KASSERT(x) assert(x)
87#define wapbl_malloc(s) malloc(s) 87#define wapbl_malloc(s) malloc(s)
88#define wapbl_free(a) free(a) 88#define wapbl_free(a) free(a)
89#define wapbl_calloc(n, s) calloc((n), (s)) 89#define wapbl_calloc(n, s) calloc((n), (s))
90#define wapbl_realloc(ptr, s) realloc((ptr), (s)) 90#define wapbl_realloc(ptr, s) realloc((ptr), (s))
91 91
92#endif /* !_KERNEL */ 92#endif /* !_KERNEL */
93 93
94/* 94/*
95 * INTERNAL DATA STRUCTURES 95 * INTERNAL DATA STRUCTURES
96 */ 96 */
97 97
98/*  98/*
99 * This structure holds per-mount log information. 99 * This structure holds per-mount log information.
100 * 100 *
101 * Legend: a = atomic access only 101 * Legend: a = atomic access only
102 * r = read-only after init 102 * r = read-only after init
103 * l = rwlock held 103 * l = rwlock held
104 * m = mutex held 104 * m = mutex held
105 * u = unlocked access ok 105 * u = unlocked access ok
106 * b = bufcache_lock held 106 * b = bufcache_lock held
107 */ 107 */
108struct wapbl { 108struct wapbl {
109 struct vnode *wl_logvp; /* r: log here */ 109 struct vnode *wl_logvp; /* r: log here */
110 struct vnode *wl_devvp; /* r: log on this device */ 110 struct vnode *wl_devvp; /* r: log on this device */
111 struct mount *wl_mount; /* r: mountpoint wl is associated with */ 111 struct mount *wl_mount; /* r: mountpoint wl is associated with */
112 daddr_t wl_logpbn; /* r: Physical block number of start of log */ 112 daddr_t wl_logpbn; /* r: Physical block number of start of log */
113 int wl_log_dev_bshift; /* r: logarithm of device block size of log 113 int wl_log_dev_bshift; /* r: logarithm of device block size of log
114 device */ 114 device */
115 int wl_fs_dev_bshift; /* r: logarithm of device block size of 115 int wl_fs_dev_bshift; /* r: logarithm of device block size of
116 filesystem device */ 116 filesystem device */
117 117
118 unsigned wl_lock_count; /* m: Count of transactions in progress */ 118 unsigned wl_lock_count; /* m: Count of transactions in progress */
119 119
120 size_t wl_circ_size; /* r: Number of bytes in buffer of log */ 120 size_t wl_circ_size; /* r: Number of bytes in buffer of log */
121 size_t wl_circ_off; /* r: Number of bytes reserved at start */ 121 size_t wl_circ_off; /* r: Number of bytes reserved at start */
122 122
123 size_t wl_bufcount_max; /* r: Number of buffers reserved for log */ 123 size_t wl_bufcount_max; /* r: Number of buffers reserved for log */
124 size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */ 124 size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */
125 125
126 off_t wl_head; /* l: Byte offset of log head */ 126 off_t wl_head; /* l: Byte offset of log head */
127 off_t wl_tail; /* l: Byte offset of log tail */ 127 off_t wl_tail; /* l: Byte offset of log tail */
128 /* 128 /*
129 * head == tail == 0 means log is empty 129 * head == tail == 0 means log is empty
130 * head == tail != 0 means log is full 130 * head == tail != 0 means log is full
131 * see assertions in wapbl_advance() for other boundary conditions. 131 * see assertions in wapbl_advance() for other boundary conditions.
132 * only truncate moves the tail, except when flush sets it to 132 * only truncate moves the tail, except when flush sets it to
133 * wl_header_size; only flush moves the head, except when truncate 133 * wl_header_size; only flush moves the head, except when truncate
134 * sets it to 0. 134 * sets it to 0.
135 */ 135 */
136 136
137 struct wapbl_wc_header *wl_wc_header; /* l */ 137 struct wapbl_wc_header *wl_wc_header; /* l */
138 void *wl_wc_scratch; /* l: scratch space (XXX: why?!?) */ 138 void *wl_wc_scratch; /* l: scratch space (XXX: why?!?) */
139 139
140 kmutex_t wl_mtx; /* u: short-term lock */ 140 kmutex_t wl_mtx; /* u: short-term lock */
141 krwlock_t wl_rwlock; /* u: File system transaction lock */ 141 krwlock_t wl_rwlock; /* u: File system transaction lock */
142 142
143 /* 143 /*
144 * Must be held while accessing 144 * Must be held while accessing
145 * wl_count or wl_bufs or head or tail 145 * wl_count or wl_bufs or head or tail
146 */ 146 */
147 147
148 /* 148 /*
149 * Callback called from within the flush routine to flush any extra 149 * Callback called from within the flush routine to flush any extra
150 * bits. Note that flush may be skipped without calling this if 150 * bits. Note that flush may be skipped without calling this if
151 * there are no outstanding buffers in the transaction. 151 * there are no outstanding buffers in the transaction.
152 */ 152 */
153#if _KERNEL 153#if _KERNEL
154 wapbl_flush_fn_t wl_flush; /* r */ 154 wapbl_flush_fn_t wl_flush; /* r */
155 wapbl_flush_fn_t wl_flush_abort;/* r */ 155 wapbl_flush_fn_t wl_flush_abort;/* r */
156#endif 156#endif
157 157
158 size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */ 158 size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */
159 size_t wl_bufcount; /* m: Count of buffers in wl_bufs */ 159 size_t wl_bufcount; /* m: Count of buffers in wl_bufs */
160 size_t wl_bcount; /* m: Total bcount of wl_bufs */ 160 size_t wl_bcount; /* m: Total bcount of wl_bufs */
161 161
162 LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */ 162 LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
163 163
164 kcondvar_t wl_reclaimable_cv; /* m (obviously) */ 164 kcondvar_t wl_reclaimable_cv; /* m (obviously) */
165 size_t wl_reclaimable_bytes; /* m: Amount of space available for 165 size_t wl_reclaimable_bytes; /* m: Amount of space available for
166 reclamation by truncate */ 166 reclamation by truncate */
167 int wl_error_count; /* m: # of wl_entries with errors */ 167 int wl_error_count; /* m: # of wl_entries with errors */
168 size_t wl_reserved_bytes; /* never truncate log smaller than this */ 168 size_t wl_reserved_bytes; /* never truncate log smaller than this */
169 169
170#ifdef WAPBL_DEBUG_BUFBYTES 170#ifdef WAPBL_DEBUG_BUFBYTES
171 size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */ 171 size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
172#endif 172#endif
173 173
174 daddr_t *wl_deallocblks;/* l: address of block */ 174 daddr_t *wl_deallocblks;/* l: address of block */
175 int *wl_dealloclens; /* l: size of block (fragments, remember) */ 175 int *wl_dealloclens; /* l: size of block (fragments, remember) */
176 int wl_dealloccnt; /* l: total count */ 176 int wl_dealloccnt; /* l: total count */
177 int wl_dealloclim; /* l: max count */ 177 int wl_dealloclim; /* l: max count */
178 178
179 /* hashtable of inode numbers for allocated but unlinked inodes */ 179 /* hashtable of inode numbers for allocated but unlinked inodes */
180 /* synch ??? */ 180 /* synch ??? */
181 LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash; 181 LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
182 u_long wl_inohashmask; 182 u_long wl_inohashmask;
183 int wl_inohashcnt; 183 int wl_inohashcnt;
184 184
185 SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction 185 SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
186 accounting */ 186 accounting */
187}; 187};
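
The head/tail convention documented in struct wapbl above (head == tail == 0 means empty, head == tail != 0 means full) pins down the space arithmetic for the circular log. A minimal sketch of used/free accounting consistent with that convention follows; it is an illustration under simplified offsets, not the kernel's wapbl_space_used()/wapbl_space_free() verbatim:

    #include <assert.h>
    #include <stdio.h>

    /* Bytes used in a circular log with "avail" usable bytes. */
    static size_t
    space_used(size_t avail, size_t head, size_t tail)
    {
        if (tail == 0) {
            assert(head == 0);    /* 0/0 is the empty log */
            return 0;
        }
        /* head == tail (non-zero) yields avail, i.e. a full log */
        return (head + avail - 1 - tail) % avail + 1;
    }

    static size_t
    space_free(size_t avail, size_t head, size_t tail)
    {
        return avail - space_used(avail, head, tail);
    }

    int
    main(void)
    {
        printf("%zu\n", space_used(1024, 0, 0));      /* 0: empty */
        printf("%zu\n", space_used(1024, 500, 500));  /* 1024: full */
        printf("%zu\n", space_used(1024, 100, 900));  /* 224: head wrapped */
        printf("%zu\n", space_free(1024, 100, 900));  /* 800 */
        return 0;
    }
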
188 188
189#ifdef WAPBL_DEBUG_PRINT 189#ifdef WAPBL_DEBUG_PRINT
190int wapbl_debug_print = WAPBL_DEBUG_PRINT; 190int wapbl_debug_print = WAPBL_DEBUG_PRINT;
191#endif 191#endif
192 192
193/****************************************************************/ 193/****************************************************************/
194#ifdef _KERNEL 194#ifdef _KERNEL
195 195
196#ifdef WAPBL_DEBUG 196#ifdef WAPBL_DEBUG
197struct wapbl *wapbl_debug_wl; 197struct wapbl *wapbl_debug_wl;
198#endif 198#endif
199 199
200static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail); 200static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
201static int wapbl_write_blocks(struct wapbl *wl, off_t *offp); 201static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
202static int wapbl_write_revocations(struct wapbl *wl, off_t *offp); 202static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
203static int wapbl_write_inodes(struct wapbl *wl, off_t *offp); 203static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
204#endif /* _KERNEL */ 204#endif /* _KERNEL */
205 205
206static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t); 206static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
207 207
208static __inline size_t wapbl_space_free(size_t avail, off_t head, 208static __inline size_t wapbl_space_free(size_t avail, off_t head,
209 off_t tail); 209 off_t tail);
210static __inline size_t wapbl_space_used(size_t avail, off_t head, 210static __inline size_t wapbl_space_used(size_t avail, off_t head,
211 off_t tail); 211 off_t tail);
212 212
213#ifdef _KERNEL 213#ifdef _KERNEL
214 214
215#define WAPBL_INODETRK_SIZE 83 215#define WAPBL_INODETRK_SIZE 83
216static int wapbl_ino_pool_refcount; 216static int wapbl_ino_pool_refcount;
217static struct pool wapbl_ino_pool; 217static struct pool wapbl_ino_pool;
218struct wapbl_ino { 218struct wapbl_ino {
219 LIST_ENTRY(wapbl_ino) wi_hash; 219 LIST_ENTRY(wapbl_ino) wi_hash;
220 ino_t wi_ino; 220 ino_t wi_ino;
221 mode_t wi_mode; 221 mode_t wi_mode;
222}; 222};
223 223
224static void wapbl_inodetrk_init(struct wapbl *wl, u_int size); 224static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
225static void wapbl_inodetrk_free(struct wapbl *wl); 225static void wapbl_inodetrk_free(struct wapbl *wl);
226static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino); 226static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
227 227
228static size_t wapbl_transaction_len(struct wapbl *wl); 228static size_t wapbl_transaction_len(struct wapbl *wl);
229static __inline size_t wapbl_transaction_inodes_len(struct wapbl *wl); 229static __inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
230 230
231#if 0 231#if 0
232int wapbl_replay_verify(struct wapbl_replay *, struct vnode *); 232int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
233#endif 233#endif
234 234
235static int wapbl_replay_isopen1(struct wapbl_replay *); 235static int wapbl_replay_isopen1(struct wapbl_replay *);
236 236
237/* 237/*
238 * This is useful for debugging. If set, the log will 238 * This is useful for debugging. If set, the log will
239 * only be truncated when necessary. 239 * only be truncated when necessary.
240 */ 240 */
241int wapbl_lazy_truncate = 0; 241int wapbl_lazy_truncate = 0;
242 242
243struct wapbl_ops wapbl_ops = { 243struct wapbl_ops wapbl_ops = {
244 .wo_wapbl_discard = wapbl_discard, 244 .wo_wapbl_discard = wapbl_discard,
245 .wo_wapbl_replay_isopen = wapbl_replay_isopen1, 245 .wo_wapbl_replay_isopen = wapbl_replay_isopen1,
246 .wo_wapbl_replay_can_read = wapbl_replay_can_read, 246 .wo_wapbl_replay_can_read = wapbl_replay_can_read,
247 .wo_wapbl_replay_read = wapbl_replay_read, 247 .wo_wapbl_replay_read = wapbl_replay_read,
248 .wo_wapbl_add_buf = wapbl_add_buf, 248 .wo_wapbl_add_buf = wapbl_add_buf,
249 .wo_wapbl_remove_buf = wapbl_remove_buf, 249 .wo_wapbl_remove_buf = wapbl_remove_buf,
250 .wo_wapbl_resize_buf = wapbl_resize_buf, 250 .wo_wapbl_resize_buf = wapbl_resize_buf,
251 .wo_wapbl_begin = wapbl_begin, 251 .wo_wapbl_begin = wapbl_begin,
252 .wo_wapbl_end = wapbl_end, 252 .wo_wapbl_end = wapbl_end,
253 .wo_wapbl_junlock_assert= wapbl_junlock_assert, 253 .wo_wapbl_junlock_assert= wapbl_junlock_assert,
254 254
255 /* XXX: the following is only used to say "this is a wapbl buf" */ 255 /* XXX: the following is only used to say "this is a wapbl buf" */
256 .wo_wapbl_biodone = wapbl_biodone, 256 .wo_wapbl_biodone = wapbl_biodone,
257}; 257};
258 258
259void 259void
260wapbl_init() 260wapbl_init()
261{ 261{
262 262
263 malloc_type_attach(M_WAPBL); 263 malloc_type_attach(M_WAPBL);
264} 264}
265 265
266static int 266static int
267wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr) 267wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
268{ 268{
269 int error, i; 269 int error, i;
270 270
271 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 271 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
272 ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt)); 272 ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));
273 273
274 /* 274 /*
275 * It's only valid to reuse the replay log if it's 275 * It's only valid to reuse the replay log if it's
276 * the same as the new log we just opened. 276 * the same as the new log we just opened.
277 */ 277 */
278 KDASSERT(!wapbl_replay_isopen(wr)); 278 KDASSERT(!wapbl_replay_isopen(wr));
279 KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev); 279 KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
280 KASSERT(wl->wl_logpbn == wr->wr_logpbn); 280 KASSERT(wl->wl_logpbn == wr->wr_logpbn);
281 KASSERT(wl->wl_circ_size == wr->wr_circ_size); 281 KASSERT(wl->wl_circ_size == wr->wr_circ_size);
282 KASSERT(wl->wl_circ_off == wr->wr_circ_off); 282 KASSERT(wl->wl_circ_off == wr->wr_circ_off);
283 KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift); 283 KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
284 KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift); 284 KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);
285 285
286 wl->wl_wc_header->wc_generation = wr->wr_generation + 1; 286 wl->wl_wc_header->wc_generation = wr->wr_generation + 1;
287 287
288 for (i = 0; i < wr->wr_inodescnt; i++) 288 for (i = 0; i < wr->wr_inodescnt; i++)
289 wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber, 289 wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
290 wr->wr_inodes[i].wr_imode); 290 wr->wr_inodes[i].wr_imode);
291 291
292 /* Make sure new transaction won't overwrite old inodes list */ 292 /* Make sure new transaction won't overwrite old inodes list */
293 KDASSERT(wapbl_transaction_len(wl) <=  293 KDASSERT(wapbl_transaction_len(wl) <=
294 wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead, 294 wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
295 wr->wr_inodestail)); 295 wr->wr_inodestail));
296 296
297 wl->wl_head = wl->wl_tail = wr->wr_inodeshead; 297 wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
298 wl->wl_reclaimable_bytes = wl->wl_reserved_bytes = 298 wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
299 wapbl_transaction_len(wl); 299 wapbl_transaction_len(wl);
300 300
301 error = wapbl_write_inodes(wl, &wl->wl_head); 301 error = wapbl_write_inodes(wl, &wl->wl_head);
302 if (error) 302 if (error)
303 return error; 303 return error;
304 304
305 KASSERT(wl->wl_head != wl->wl_tail); 305 KASSERT(wl->wl_head != wl->wl_tail);
306 KASSERT(wl->wl_head != 0); 306 KASSERT(wl->wl_head != 0);
307 307
308 return 0; 308 return 0;
309} 309}
310 310
311int 311int
312wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp, 312wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
313 daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr, 313 daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
314 wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn) 314 wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
315{ 315{
316 struct wapbl *wl; 316 struct wapbl *wl;
317 struct vnode *devvp; 317 struct vnode *devvp;
318 daddr_t logpbn; 318 daddr_t logpbn;
319 int error; 319 int error;
320 int log_dev_bshift = DEV_BSHIFT; 320 int log_dev_bshift = DEV_BSHIFT;
321 int fs_dev_bshift = DEV_BSHIFT; 321 int fs_dev_bshift = DEV_BSHIFT;
322 int run; 322 int run;
323 323
324 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64 324 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
325 " count=%zu blksize=%zu\n", vp, off, count, blksize)); 325 " count=%zu blksize=%zu\n", vp, off, count, blksize));
326 326
327 if (log_dev_bshift > fs_dev_bshift) { 327 if (log_dev_bshift > fs_dev_bshift) {
328 WAPBL_PRINTF(WAPBL_PRINT_OPEN, 328 WAPBL_PRINTF(WAPBL_PRINT_OPEN,
329 ("wapbl: log device's block size cannot be larger " 329 ("wapbl: log device's block size cannot be larger "
330 "than filesystem's\n")); 330 "than filesystem's\n"));
331 /* 331 /*
332 * Not currently implemented, although it could be if 332 * Not currently implemented, although it could be if
333 * needed someday. 333 * needed someday.
334 */ 334 */
335 return ENOSYS; 335 return ENOSYS;
336 } 336 }
337 337
338 if (off < 0) 338 if (off < 0)
339 return EINVAL; 339 return EINVAL;
340 340
341 if (blksize < DEV_BSIZE) 341 if (blksize < DEV_BSIZE)
342 return EINVAL; 342 return EINVAL;
343 if (blksize % DEV_BSIZE) 343 if (blksize % DEV_BSIZE)
344 return EINVAL; 344 return EINVAL;
345 345
346 /* XXXTODO: verify that the full load is writable */ 346 /* XXXTODO: verify that the full load is writable */
347 347
348 /* 348 /*
349 * XXX check for minimum log size 349 * XXX check for minimum log size
350 * minimum is governed by minimum amount of space 350 * minimum is governed by minimum amount of space
351 * to complete a transaction. (probably truncate) 351 * to complete a transaction. (probably truncate)
352 */ 352 */
353 /* XXX for now pick something minimal */ 353 /* XXX for now pick something minimal */
354 if ((count * blksize) < MAXPHYS) { 354 if ((count * blksize) < MAXPHYS) {
355 return ENOSPC; 355 return ENOSPC;
356 } 356 }
357 357
358 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) { 358 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
359 return error; 359 return error;
360 } 360 }
361 361
362 wl = wapbl_calloc(1, sizeof(*wl)); 362 wl = wapbl_calloc(1, sizeof(*wl));
363 rw_init(&wl->wl_rwlock); 363 rw_init(&wl->wl_rwlock);
364 mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE); 364 mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
365 cv_init(&wl->wl_reclaimable_cv, "wapblrec"); 365 cv_init(&wl->wl_reclaimable_cv, "wapblrec");
366 LIST_INIT(&wl->wl_bufs); 366 LIST_INIT(&wl->wl_bufs);
367 SIMPLEQ_INIT(&wl->wl_entries); 367 SIMPLEQ_INIT(&wl->wl_entries);
368 368
369 wl->wl_logvp = vp; 369 wl->wl_logvp = vp;
370 wl->wl_devvp = devvp; 370 wl->wl_devvp = devvp;
371 wl->wl_mount = mp; 371 wl->wl_mount = mp;
372 wl->wl_logpbn = logpbn; 372 wl->wl_logpbn = logpbn;
373 wl->wl_log_dev_bshift = log_dev_bshift; 373 wl->wl_log_dev_bshift = log_dev_bshift;
374 wl->wl_fs_dev_bshift = fs_dev_bshift; 374 wl->wl_fs_dev_bshift = fs_dev_bshift;
375 375
376 wl->wl_flush = flushfn; 376 wl->wl_flush = flushfn;
377 wl->wl_flush_abort = flushabortfn; 377 wl->wl_flush_abort = flushabortfn;
378 378
379 /* Reserve two log device blocks for the commit headers */ 379 /* Reserve two log device blocks for the commit headers */
380 wl->wl_circ_off = 2<<wl->wl_log_dev_bshift; 380 wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
381 wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off); 381 wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
382 /* truncate the log usage to a multiple of log_dev_bshift */ 382 /* truncate the log usage to a multiple of log_dev_bshift */
383 wl->wl_circ_size >>= wl->wl_log_dev_bshift; 383 wl->wl_circ_size >>= wl->wl_log_dev_bshift;
384 wl->wl_circ_size <<= wl->wl_log_dev_bshift; 384 wl->wl_circ_size <<= wl->wl_log_dev_bshift;
385 385
386 /* 386 /*
387 * wl_bufbytes_max limits the size of the in memory transaction space. 387 * wl_bufbytes_max limits the size of the in memory transaction space.
388 * - Since buffers are allocated and accounted for in units of 388 * - Since buffers are allocated and accounted for in units of
389 * PAGE_SIZE it is required to be a multiple of PAGE_SIZE 389 * PAGE_SIZE it is required to be a multiple of PAGE_SIZE
390 * (i.e. 1<<PAGE_SHIFT) 390 * (i.e. 1<<PAGE_SHIFT)
391 * - Since the log device has to be written in units of 391 * - Since the log device has to be written in units of
392 * 1<<wl_log_dev_bshift it is required to be a multiple of 392 * 1<<wl_log_dev_bshift it is required to be a multiple of
393 * 1<<wl_log_dev_bshift. 393 * 1<<wl_log_dev_bshift.
394 * - Since the filesystem will provide data in units of 1<<wl_fs_dev_bshift, 394 * - Since the filesystem will provide data in units of 1<<wl_fs_dev_bshift,
395 * it is convenient for it to be a multiple of 1<<wl_fs_dev_bshift. 395 * it is convenient for it to be a multiple of 1<<wl_fs_dev_bshift.
396 * Therefore it must be a multiple of the least common multiple of those 396 * Therefore it must be a multiple of the least common multiple of those
397 * three quantities. Fortunately, all of those quantities are 397 * three quantities. Fortunately, all of those quantities are
398 * guaranteed to be a power of two, and the least common multiple of 398 * guaranteed to be a power of two, and the least common multiple of
399 * a set of numbers which are all powers of two is simply the maximum 399 * a set of numbers which are all powers of two is simply the maximum
400 * of those numbers. Finally, the maximum logarithm of a power of two 400 * of those numbers. Finally, the maximum logarithm of a power of two
401 * is the same as the log of the maximum power of two. So we can do 401 * is the same as the log of the maximum power of two. So we can do
402 * the following operations to size wl_bufbytes_max: 402 * the following operations to size wl_bufbytes_max:
403 */ 403 */
404 404
405 /* XXX fix actual number of pages reserved per filesystem. */ 405 /* XXX fix actual number of pages reserved per filesystem. */
406 wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2); 406 wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);
407 407
408 /* Round wl_bufbytes_max to the largest power of two constraint */ 408 /* Round wl_bufbytes_max to the largest power of two constraint */
409 wl->wl_bufbytes_max >>= PAGE_SHIFT; 409 wl->wl_bufbytes_max >>= PAGE_SHIFT;
410 wl->wl_bufbytes_max <<= PAGE_SHIFT; 410 wl->wl_bufbytes_max <<= PAGE_SHIFT;
411 wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift; 411 wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
412 wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift; 412 wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
413 wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift; 413 wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
414 wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift; 414 wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
415 415
416 /* XXX maybe use filesystem fragment size instead of 1024 */ 416 /* XXX maybe use filesystem fragment size instead of 1024 */
417 /* XXX fix actual number of buffers reserved per filesystem. */ 417 /* XXX fix actual number of buffers reserved per filesystem. */
418 wl->wl_bufcount_max = (nbuf / 2) * 1024; 418 wl->wl_bufcount_max = (nbuf / 2) * 1024;
419 419
420 /* XXX tie this into resource estimation */ 420 /* XXX tie this into resource estimation */
421 wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max); 421 wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max);
422  422
423#if WAPBL_UVM_ALLOC 423#if WAPBL_UVM_ALLOC
424 wl->wl_deallocblks = (void *) uvm_km_zalloc(kernel_map, 424 wl->wl_deallocblks = (void *) uvm_km_zalloc(kernel_map,
425 round_page(sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim)); 425 round_page(sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim));
426 KASSERT(wl->wl_deallocblks != NULL); 426 KASSERT(wl->wl_deallocblks != NULL);
427 wl->wl_dealloclens = (void *) uvm_km_zalloc(kernel_map, 427 wl->wl_dealloclens = (void *) uvm_km_zalloc(kernel_map,
428 round_page(sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim)); 428 round_page(sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim));
429 KASSERT(wl->wl_dealloclens != NULL); 429 KASSERT(wl->wl_dealloclens != NULL);
430#else 430#else
431 wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) * 431 wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) *
432 wl->wl_dealloclim); 432 wl->wl_dealloclim);
433 wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) * 433 wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) *
434 wl->wl_dealloclim); 434 wl->wl_dealloclim);
435#endif 435#endif
436 436
437 wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE); 437 wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
438 438
439 /* Initialize the commit header */ 439 /* Initialize the commit header */
440 { 440 {
441 struct wapbl_wc_header *wc; 441 struct wapbl_wc_header *wc;
442 size_t len = 1 << wl->wl_log_dev_bshift; 442 size_t len = 1 << wl->wl_log_dev_bshift;
443 wc = wapbl_calloc(1, len); 443 wc = wapbl_calloc(1, len);
444 wc->wc_type = WAPBL_WC_HEADER; 444 wc->wc_type = WAPBL_WC_HEADER;
445 wc->wc_len = len; 445 wc->wc_len = len;
446 wc->wc_circ_off = wl->wl_circ_off; 446 wc->wc_circ_off = wl->wl_circ_off;
447 wc->wc_circ_size = wl->wl_circ_size; 447 wc->wc_circ_size = wl->wl_circ_size;
448 /* XXX wc->wc_fsid */ 448 /* XXX wc->wc_fsid */
449 wc->wc_log_dev_bshift = wl->wl_log_dev_bshift; 449 wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
450 wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift; 450 wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
451 wl->wl_wc_header = wc; 451 wl->wl_wc_header = wc;
452 wl->wl_wc_scratch = wapbl_malloc(len); 452 wl->wl_wc_scratch = wapbl_malloc(len);
453 } 453 }
454 454
455 /* 455 /*
456 * if there was an existing set of unlinked but 456 * if there was an existing set of unlinked but
457 * allocated inodes, preserve it in the new 457 * allocated inodes, preserve it in the new
458 * log. 458 * log.
459 */ 459 */
460 if (wr && wr->wr_inodescnt) { 460 if (wr && wr->wr_inodescnt) {
461 error = wapbl_start_flush_inodes(wl, wr); 461 error = wapbl_start_flush_inodes(wl, wr);
462 if (error) 462 if (error)
463 goto errout; 463 goto errout;
464 } 464 }
465 465
466 error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail); 466 error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
467 if (error) { 467 if (error) {
468 goto errout; 468 goto errout;
469 } 469 }
470 470
471 *wlp = wl; 471 *wlp = wl;
472#if defined(WAPBL_DEBUG) 472#if defined(WAPBL_DEBUG)
473 wapbl_debug_wl = wl; 473 wapbl_debug_wl = wl;
474#endif 474#endif
475 475
476 return 0; 476 return 0;
477 errout: 477 errout:
478 wapbl_discard(wl); 478 wapbl_discard(wl);
479 wapbl_free(wl->wl_wc_scratch); 479 wapbl_free(wl->wl_wc_scratch);
480 wapbl_free(wl->wl_wc_header); 480 wapbl_free(wl->wl_wc_header);
481#if WAPBL_UVM_ALLOC 481#if WAPBL_UVM_ALLOC
482 uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks, 482 uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks,
483 round_page(sizeof(*wl->wl_deallocblks) * 483 round_page(sizeof(*wl->wl_deallocblks) *
484 wl->wl_dealloclim)); 484 wl->wl_dealloclim));
485 uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens, 485 uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens,
486 round_page(sizeof(*wl->wl_dealloclens) * 486 round_page(sizeof(*wl->wl_dealloclens) *
487 wl->wl_dealloclim)); 487 wl->wl_dealloclim));
488#else 488#else
489 wapbl_free(wl->wl_deallocblks); 489 wapbl_free(wl->wl_deallocblks);
490 wapbl_free(wl->wl_dealloclens); 490 wapbl_free(wl->wl_dealloclens);
491#endif 491#endif
492 wapbl_inodetrk_free(wl); 492 wapbl_inodetrk_free(wl);
493 wapbl_free(wl); 493 wapbl_free(wl);
494 494
495 return error; 495 return error;
496} 496}
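
The comment block in wapbl_start() about sizing wl_bufbytes_max leans on the fact that rounding down to each power-of-two alignment in turn is the same as rounding down to the largest one. A standalone sketch of that shift trick; the shift values are assumptions for illustration only:

    #include <stdio.h>

    /* Round x down to a multiple of 1 << shift. */
    static unsigned long
    round_down(unsigned long x, int shift)
    {
        return (x >> shift) << shift;
    }

    int
    main(void)
    {
        unsigned long x = 123456;
        const int page_shift = 12;     /* assumed PAGE_SIZE of 4096 */
        const int log_dev_bshift = 9;  /* assumed DEV_BSIZE of 512 */
        const int fs_dev_bshift = 9;

        /*
         * Each alignment is a power of two, so any multiple of the
         * largest is already a multiple of the smaller ones; the
         * sequence below is therefore equivalent to a single rounding
         * by the maximum shift (here, page_shift).
         */
        x = round_down(x, page_shift);
        x = round_down(x, log_dev_bshift);
        x = round_down(x, fs_dev_bshift);
        printf("%lu\n", x);    /* 122880 == 30 * 4096 */
        return 0;
    }
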
497 497
498/* 498/*
499 * Like wapbl_flush, only discards the transaction 499 * Like wapbl_flush, only discards the transaction
500 * completely 500 * completely
501 */ 501 */
502 502
503void 503void
504wapbl_discard(struct wapbl *wl) 504wapbl_discard(struct wapbl *wl)
505{ 505{
506 struct wapbl_entry *we; 506 struct wapbl_entry *we;
507 struct buf *bp; 507 struct buf *bp;
508 int i; 508 int i;
509 509
510 /* 510 /*
511 * XXX we may consider using upgrade here 511 * XXX we may consider using upgrade here
512 * if we want to call flush from inside a transaction 512 * if we want to call flush from inside a transaction
513 */ 513 */
514 rw_enter(&wl->wl_rwlock, RW_WRITER); 514 rw_enter(&wl->wl_rwlock, RW_WRITER);
515 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, 515 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
516 wl->wl_dealloccnt); 516 wl->wl_dealloccnt);
517 517
518#ifdef WAPBL_DEBUG_PRINT 518#ifdef WAPBL_DEBUG_PRINT
519 { 519 {
520 struct wapbl_entry *we; 520 struct wapbl_entry *we;
521 pid_t pid = -1; 521 pid_t pid = -1;
522 lwpid_t lid = -1; 522 lwpid_t lid = -1;
523 if (curproc) 523 if (curproc)
524 pid = curproc->p_pid; 524 pid = curproc->p_pid;
525 if (curlwp) 525 if (curlwp)
526 lid = curlwp->l_lid; 526 lid = curlwp->l_lid;
527#ifdef WAPBL_DEBUG_BUFBYTES 527#ifdef WAPBL_DEBUG_BUFBYTES
528 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 528 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
529 ("wapbl_discard: thread %d.%d discarding " 529 ("wapbl_discard: thread %d.%d discarding "
530 "transaction\n" 530 "transaction\n"
531 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 531 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
532 "deallocs=%d inodes=%d\n" 532 "deallocs=%d inodes=%d\n"
533 "\terrcnt = %u, reclaimable=%zu reserved=%zu " 533 "\terrcnt = %u, reclaimable=%zu reserved=%zu "
534 "unsynced=%zu\n", 534 "unsynced=%zu\n",
535 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 535 pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
536 wl->wl_bcount, wl->wl_dealloccnt, 536 wl->wl_bcount, wl->wl_dealloccnt,
537 wl->wl_inohashcnt, wl->wl_error_count, 537 wl->wl_inohashcnt, wl->wl_error_count,
538 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 538 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
539 wl->wl_unsynced_bufbytes)); 539 wl->wl_unsynced_bufbytes));
540 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 540 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
541 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 541 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
542 ("\tentry: bufcount = %zu, reclaimable = %zu, " 542 ("\tentry: bufcount = %zu, reclaimable = %zu, "
543 "error = %d, unsynced = %zu\n", 543 "error = %d, unsynced = %zu\n",
544 we->we_bufcount, we->we_reclaimable_bytes, 544 we->we_bufcount, we->we_reclaimable_bytes,
545 we->we_error, we->we_unsynced_bufbytes)); 545 we->we_error, we->we_unsynced_bufbytes));
546 } 546 }
547#else /* !WAPBL_DEBUG_BUFBYTES */ 547#else /* !WAPBL_DEBUG_BUFBYTES */
548 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 548 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
549 ("wapbl_discard: thread %d.%d discarding transaction\n" 549 ("wapbl_discard: thread %d.%d discarding transaction\n"
550 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 550 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
551 "deallocs=%d inodes=%d\n" 551 "deallocs=%d inodes=%d\n"
552 "\terrcnt = %u, reclaimable=%zu reserved=%zu\n", 552 "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
553 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 553 pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
554 wl->wl_bcount, wl->wl_dealloccnt, 554 wl->wl_bcount, wl->wl_dealloccnt,
555 wl->wl_inohashcnt, wl->wl_error_count, 555 wl->wl_inohashcnt, wl->wl_error_count,
556 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes)); 556 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
557 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 557 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
558 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 558 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
559 ("\tentry: bufcount = %zu, reclaimable = %zu, " 559 ("\tentry: bufcount = %zu, reclaimable = %zu, "
560 "error = %d\n", 560 "error = %d\n",
561 we->we_bufcount, we->we_reclaimable_bytes, 561 we->we_bufcount, we->we_reclaimable_bytes,
562 we->we_error)); 562 we->we_error));
563 } 563 }
564#endif /* !WAPBL_DEBUG_BUFBYTES */ 564#endif /* !WAPBL_DEBUG_BUFBYTES */
565 } 565 }
566#endif /* WAPBL_DEBUG_PRINT */ 566#endif /* WAPBL_DEBUG_PRINT */
567 567
568 for (i = 0; i <= wl->wl_inohashmask; i++) { 568 for (i = 0; i <= wl->wl_inohashmask; i++) {
569 struct wapbl_ino_head *wih; 569 struct wapbl_ino_head *wih;
570 struct wapbl_ino *wi; 570 struct wapbl_ino *wi;
571 571
572 wih = &wl->wl_inohash[i]; 572 wih = &wl->wl_inohash[i];
573 while ((wi = LIST_FIRST(wih)) != NULL) { 573 while ((wi = LIST_FIRST(wih)) != NULL) {
574 LIST_REMOVE(wi, wi_hash); 574 LIST_REMOVE(wi, wi_hash);
575 pool_put(&wapbl_ino_pool, wi); 575 pool_put(&wapbl_ino_pool, wi);
576 KASSERT(wl->wl_inohashcnt > 0); 576 KASSERT(wl->wl_inohashcnt > 0);
577 wl->wl_inohashcnt--; 577 wl->wl_inohashcnt--;
578 } 578 }
579 } 579 }
580 580
581 /* 581 /*
582 * clean buffer list 582 * clean buffer list
583 */ 583 */
584 mutex_enter(&bufcache_lock); 584 mutex_enter(&bufcache_lock);
585 mutex_enter(&wl->wl_mtx); 585 mutex_enter(&wl->wl_mtx);
586 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 586 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
587 if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) { 587 if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
588 /* 588 /*
589 * The buffer will be unlocked and 589 * The buffer will be unlocked and
590 * removed from the transaction in brelse 590 * removed from the transaction in brelse
591 */ 591 */
592 mutex_exit(&wl->wl_mtx); 592 mutex_exit(&wl->wl_mtx);
593 brelsel(bp, 0); 593 brelsel(bp, 0);
594 mutex_enter(&wl->wl_mtx); 594 mutex_enter(&wl->wl_mtx);
595 } 595 }
596 } 596 }
597 mutex_exit(&wl->wl_mtx); 597 mutex_exit(&wl->wl_mtx);
598 mutex_exit(&bufcache_lock); 598 mutex_exit(&bufcache_lock);
599 599
600 /* 600 /*
601 * Remove references to this wl from wl_entries, free any which 601 * Remove references to this wl from wl_entries, free any which
602 * no longer have buffers, others will be freed in wapbl_biodone 602 * no longer have buffers, others will be freed in wapbl_biodone
603 * when they no longer have any buffers. 603 * when they no longer have any buffers.
604 */ 604 */
605 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) { 605 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
606 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 606 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
607 /* XXX should we be accumulating wl_error_count 607 /* XXX should we be accumulating wl_error_count
608 * and increasing reclaimable bytes ? */ 608 * and increasing reclaimable bytes ? */
609 we->we_wapbl = NULL; 609 we->we_wapbl = NULL;
610 if (we->we_bufcount == 0) { 610 if (we->we_bufcount == 0) {
611#ifdef WAPBL_DEBUG_BUFBYTES 611#ifdef WAPBL_DEBUG_BUFBYTES
612 KASSERT(we->we_unsynced_bufbytes == 0); 612 KASSERT(we->we_unsynced_bufbytes == 0);
613#endif 613#endif
614 wapbl_free(we); 614 wapbl_free(we);
615 } 615 }
616 } 616 }
617 617
618 /* Discard list of deallocs */ 618 /* Discard list of deallocs */
619 wl->wl_dealloccnt = 0; 619 wl->wl_dealloccnt = 0;
620 /* XXX should we clear wl_reserved_bytes? */ 620 /* XXX should we clear wl_reserved_bytes? */
621 621
622 KASSERT(wl->wl_bufbytes == 0); 622 KASSERT(wl->wl_bufbytes == 0);
623 KASSERT(wl->wl_bcount == 0); 623 KASSERT(wl->wl_bcount == 0);
624 KASSERT(wl->wl_bufcount == 0); 624 KASSERT(wl->wl_bufcount == 0);
625 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 625 KASSERT(LIST_EMPTY(&wl->wl_bufs));
626 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 626 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
627 KASSERT(wl->wl_inohashcnt == 0); 627 KASSERT(wl->wl_inohashcnt == 0);
628 628
629 rw_exit(&wl->wl_rwlock); 629 rw_exit(&wl->wl_rwlock);
630} 630}
631 631
632int 632int
633wapbl_stop(struct wapbl *wl, int force) 633wapbl_stop(struct wapbl *wl, int force)
634{ 634{
635 struct vnode *vp; 635 struct vnode *vp;
636 int error; 636 int error;
637 637
638 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n")); 638 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
639 error = wapbl_flush(wl, 1); 639 error = wapbl_flush(wl, 1);
640 if (error) { 640 if (error) {
641 if (force) 641 if (force)
642 wapbl_discard(wl); 642 wapbl_discard(wl);
643 else 643 else
644 return error; 644 return error;
645 } 645 }
646 646
647 /* Unlinked inodes persist after a flush */ 647 /* Unlinked inodes persist after a flush */
648 if (wl->wl_inohashcnt) { 648 if (wl->wl_inohashcnt) {
649 if (force) { 649 if (force) {
650 wapbl_discard(wl); 650 wapbl_discard(wl);
651 } else { 651 } else {
652 return EBUSY; 652 return EBUSY;
653 } 653 }
654 } 654 }
655 655
656 KASSERT(wl->wl_bufbytes == 0); 656 KASSERT(wl->wl_bufbytes == 0);
657 KASSERT(wl->wl_bcount == 0); 657 KASSERT(wl->wl_bcount == 0);
658 KASSERT(wl->wl_bufcount == 0); 658 KASSERT(wl->wl_bufcount == 0);
659 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 659 KASSERT(LIST_EMPTY(&wl->wl_bufs));
660 KASSERT(wl->wl_dealloccnt == 0); 660 KASSERT(wl->wl_dealloccnt == 0);
661 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 661 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
662 KASSERT(wl->wl_inohashcnt == 0); 662 KASSERT(wl->wl_inohashcnt == 0);
663 663
664 vp = wl->wl_logvp; 664 vp = wl->wl_logvp;
665 665
666 wapbl_free(wl->wl_wc_scratch); 666 wapbl_free(wl->wl_wc_scratch);
667 wapbl_free(wl->wl_wc_header); 667 wapbl_free(wl->wl_wc_header);
668#if WAPBL_UVM_ALLOC 668#if WAPBL_UVM_ALLOC
669 uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks, 669 uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks,
670 round_page(sizeof(*wl->wl_deallocblks) * 670 round_page(sizeof(*wl->wl_deallocblks) *
671 wl->wl_dealloclim)); 671 wl->wl_dealloclim));
672 uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens, 672 uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens,
673 round_page(sizeof(*wl->wl_dealloclens) * 673 round_page(sizeof(*wl->wl_dealloclens) *
674 wl->wl_dealloclim)); 674 wl->wl_dealloclim));
675#else 675#else
676 wapbl_free(wl->wl_deallocblks); 676 wapbl_free(wl->wl_deallocblks);
677 wapbl_free(wl->wl_dealloclens); 677 wapbl_free(wl->wl_dealloclens);
678#endif 678#endif
679 wapbl_inodetrk_free(wl); 679 wapbl_inodetrk_free(wl);
680 680
681 cv_destroy(&wl->wl_reclaimable_cv); 681 cv_destroy(&wl->wl_reclaimable_cv);
682 mutex_destroy(&wl->wl_mtx); 682 mutex_destroy(&wl->wl_mtx);
683 rw_destroy(&wl->wl_rwlock); 683 rw_destroy(&wl->wl_rwlock);
684 wapbl_free(wl); 684 wapbl_free(wl);
685 685
686 return 0; 686 return 0;
687} 687}
688 688
689static int 689static int
690wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags) 690wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
691{ 691{
692 struct pstats *pstats = curlwp->l_proc->p_stats; 692 struct pstats *pstats = curlwp->l_proc->p_stats;
693 struct buf *bp; 693 struct buf *bp;
694 int error; 694 int error;
695 695
696 KASSERT((flags & ~(B_WRITE | B_READ)) == 0); 696 KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
697 KASSERT(devvp->v_type == VBLK); 697 KASSERT(devvp->v_type == VBLK);
698 698
699 if ((flags & (B_WRITE | B_READ)) == B_WRITE) { 699 if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
700 mutex_enter(&devvp->v_interlock); 700 mutex_enter(&devvp->v_interlock);
701 devvp->v_numoutput++; 701 devvp->v_numoutput++;
702 mutex_exit(&devvp->v_interlock); 702 mutex_exit(&devvp->v_interlock);
703 pstats->p_ru.ru_oublock++; 703 pstats->p_ru.ru_oublock++;
704 } else { 704 } else {
705 pstats->p_ru.ru_inblock++; 705 pstats->p_ru.ru_inblock++;
706 } 706 }
707 707
708 bp = getiobuf(devvp, true); 708 bp = getiobuf(devvp, true);
709 bp->b_flags = flags; 709 bp->b_flags = flags;
710 bp->b_cflags = BC_BUSY; /* silly & dubious */ 710 bp->b_cflags = BC_BUSY; /* silly & dubious */
711 bp->b_dev = devvp->v_rdev; 711 bp->b_dev = devvp->v_rdev;
712 bp->b_data = data; 712 bp->b_data = data;
713 bp->b_bufsize = bp->b_resid = bp->b_bcount = len; 713 bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
714 bp->b_blkno = pbn; 714 bp->b_blkno = pbn;
715 715
716 WAPBL_PRINTF(WAPBL_PRINT_IO, 716 WAPBL_PRINTF(WAPBL_PRINT_IO,
717 ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%x\n", 717 ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%x\n",
718 BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount, 718 BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
719 bp->b_blkno, bp->b_dev)); 719 bp->b_blkno, bp->b_dev));
720 720
721 VOP_STRATEGY(devvp, bp); 721 VOP_STRATEGY(devvp, bp);
722 722
723 error = biowait(bp); 723 error = biowait(bp);
724 putiobuf(bp); 724 putiobuf(bp);
725 725
726 if (error) { 726 if (error) {
727 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 727 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
728 ("wapbl_doio: %s %zu bytes at block %" PRId64 728 ("wapbl_doio: %s %zu bytes at block %" PRId64
729 " on dev 0x%x failed with error %d\n", 729 " on dev 0x%x failed with error %d\n",
730 (((flags & (B_WRITE | B_READ)) == B_WRITE) ? 730 (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
731 "write" : "read"), 731 "write" : "read"),
732 len, pbn, devvp->v_rdev, error)); 732 len, pbn, devvp->v_rdev, error));
733 } 733 }
734 734
735 return error; 735 return error;
736} 736}
737 737
738int 738int
739wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn) 739wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
740{ 740{
741 741
742 return wapbl_doio(data, len, devvp, pbn, B_WRITE); 742 return wapbl_doio(data, len, devvp, pbn, B_WRITE);
743} 743}
744 744
745int 745int
746wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn) 746wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
747{ 747{
748 748
749 return wapbl_doio(data, len, devvp, pbn, B_READ); 749 return wapbl_doio(data, len, devvp, pbn, B_READ);
750} 750}
751 751
752/* 752/*
753 * Off is a byte offset; returns the new offset for the next write. 753 * Off is a byte offset; returns the new offset for the next write.
754 * Handles log wraparound. 754 * Handles log wraparound.
755 */ 755 */
756static int 756static int
757wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp) 757wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
758{ 758{
759 size_t slen; 759 size_t slen;
760 off_t off = *offp; 760 off_t off = *offp;
761 int error; 761 int error;
762 762
763 KDASSERT(((len >> wl->wl_log_dev_bshift) << 763 KDASSERT(((len >> wl->wl_log_dev_bshift) <<
764 wl->wl_log_dev_bshift) == len); 764 wl->wl_log_dev_bshift) == len);
765 765
766 if (off < wl->wl_circ_off) 766 if (off < wl->wl_circ_off)
767 off = wl->wl_circ_off; 767 off = wl->wl_circ_off;
768 slen = wl->wl_circ_off + wl->wl_circ_size - off; 768 slen = wl->wl_circ_off + wl->wl_circ_size - off;
769 if (slen < len) { 769 if (slen < len) {
770 error = wapbl_write(data, slen, wl->wl_devvp, 770 error = wapbl_write(data, slen, wl->wl_devvp,
771 wl->wl_logpbn + (off >> wl->wl_log_dev_bshift)); 771 wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
772 if (error) 772 if (error)
773 return error; 773 return error;
774 data = (uint8_t *)data + slen; 774 data = (uint8_t *)data + slen;
775 len -= slen; 775 len -= slen;
776 off = wl->wl_circ_off; 776 off = wl->wl_circ_off;
777 } 777 }
778 error = wapbl_write(data, len, wl->wl_devvp, 778 error = wapbl_write(data, len, wl->wl_devvp,
779 wl->wl_logpbn + (off >> wl->wl_log_dev_bshift)); 779 wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
780 if (error) 780 if (error)
781 return error; 781 return error;
782 off += len; 782 off += len;
783 if (off >= wl->wl_circ_off + wl->wl_circ_size) 783 if (off >= wl->wl_circ_off + wl->wl_circ_size)
784 off = wl->wl_circ_off; 784 off = wl->wl_circ_off;
785 *offp = off; 785 *offp = off;
786 return 0; 786 return 0;
787} 787}
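
wapbl_circ_write() splits a record into at most two device writes when it crosses the end of the circular region. The same split, restated as a runnable userland analogue with memcpy standing in for wapbl_write(); the harness values are hypothetical:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /*
     * Userland analogue of the wraparound split in wapbl_circ_write():
     * the circular region is [circ_off, circ_off + circ_size) inside
     * "log"; memcpy stands in for the device write.
     */
    static void
    circ_copy(uint8_t *log, size_t circ_off, size_t circ_size,
        const uint8_t *data, size_t len, size_t *offp)
    {
        size_t off = *offp;
        size_t slen;

        if (off < circ_off)
            off = circ_off;
        slen = circ_off + circ_size - off;     /* room before the end */
        if (slen < len) {
            memcpy(log + off, data, slen);     /* first fragment */
            data += slen;
            len -= slen;
            off = circ_off;                    /* wrap to the start */
        }
        memcpy(log + off, data, len);          /* remainder (or whole record) */
        off += len;
        if (off >= circ_off + circ_size)
            off = circ_off;
        *offp = off;
    }

    int
    main(void)
    {
        uint8_t log[64] = { 0 };
        uint8_t rec[16];
        size_t off = 56;    /* 8 bytes of room left before the end */

        memset(rec, 0xab, sizeof(rec));
        circ_copy(log, 8, 56, rec, sizeof(rec), &off);
        printf("new off = %zu\n", off);    /* 16: the record wrapped */
        return 0;
    }
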
788 788
789/****************************************************************/ 789/****************************************************************/
790 790
791int 791int
792wapbl_begin(struct wapbl *wl, const char *file, int line) 792wapbl_begin(struct wapbl *wl, const char *file, int line)
793{ 793{
794 int doflush; 794 int doflush;
795 unsigned lockcount; 795 unsigned lockcount;
796 krw_t op; 796 krw_t op;
797 797
798 KDASSERT(wl); 798 KDASSERT(wl);
799 799
800/* 800/*
801 * XXX: The original code calls for the use of a RW_READER lock  801 * XXX: The original code calls for the use of a RW_READER lock
802 * here, but it turns out there are performance issues with high  802 * here, but it turns out there are performance issues with high
803 * metadata-rate workloads (e.g. multiple simultaneous tar 803 * metadata-rate workloads (e.g. multiple simultaneous tar
804 * extractions). For now, we force the lock to be RW_WRITER,  804 * extractions). For now, we force the lock to be RW_WRITER,
805 * since that currently has the best performance characteristics  805 * since that currently has the best performance characteristics
806 * (even for a single tar-file extraction).  806 * (even for a single tar-file extraction).
807 *  807 *
808 */ 808 */
809#define WAPBL_DEBUG_SERIALIZE 1 809#define WAPBL_DEBUG_SERIALIZE 1
810 810
811#ifdef WAPBL_DEBUG_SERIALIZE 811#ifdef WAPBL_DEBUG_SERIALIZE
812 op = RW_WRITER; 812 op = RW_WRITER;
813#else 813#else
814 op = RW_READER; 814 op = RW_READER;
815#endif 815#endif
816 816
817 /* 817 /*
818 * XXX this needs to be made much more sophisticated. 818 * XXX this needs to be made much more sophisticated.
819 * perhaps each wapbl_begin could reserve a specified 819 * perhaps each wapbl_begin could reserve a specified
820 * number of buffers and bytes. 820 * number of buffers and bytes.
821 */ 821 */
822 mutex_enter(&wl->wl_mtx); 822 mutex_enter(&wl->wl_mtx);
823 lockcount = wl->wl_lock_count; 823 lockcount = wl->wl_lock_count;
824 doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) > 824 doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
825 wl->wl_bufbytes_max / 2) || 825 wl->wl_bufbytes_max / 2) ||
826 ((wl->wl_bufcount + (lockcount * 10)) > 826 ((wl->wl_bufcount + (lockcount * 10)) >
827 wl->wl_bufcount_max / 2) || 827 wl->wl_bufcount_max / 2) ||
828 (wapbl_transaction_len(wl) > wl->wl_circ_size / 2); 828 (wapbl_transaction_len(wl) > wl->wl_circ_size / 2);
829 mutex_exit(&wl->wl_mtx); 829 mutex_exit(&wl->wl_mtx);
830 830
831 if (doflush) { 831 if (doflush) {
832 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 832 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
833 ("force flush lockcnt=%d bufbytes=%zu " 833 ("force flush lockcnt=%d bufbytes=%zu "
834 "(max=%zu) bufcount=%zu (max=%zu)\n", 834 "(max=%zu) bufcount=%zu (max=%zu)\n",
835 lockcount, wl->wl_bufbytes, 835 lockcount, wl->wl_bufbytes,
836 wl->wl_bufbytes_max, wl->wl_bufcount, 836 wl->wl_bufbytes_max, wl->wl_bufcount,
837 wl->wl_bufcount_max)); 837 wl->wl_bufcount_max));
838 } 838 }
839 839
840 if (doflush) { 840 if (doflush) {
841 int error = wapbl_flush(wl, 0); 841 int error = wapbl_flush(wl, 0);
842 if (error) 842 if (error)
843 return error; 843 return error;
844 } 844 }
845 845
846 rw_enter(&wl->wl_rwlock, op); 846 rw_enter(&wl->wl_rwlock, op);
847 mutex_enter(&wl->wl_mtx); 847 mutex_enter(&wl->wl_mtx);
848 wl->wl_lock_count++; 848 wl->wl_lock_count++;
849 mutex_exit(&wl->wl_mtx); 849 mutex_exit(&wl->wl_mtx);
850 850
851#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE) 851#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
852 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 852 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
853 ("wapbl_begin thread %d.%d with bufcount=%zu " 853 ("wapbl_begin thread %d.%d with bufcount=%zu "
854 "bufbytes=%zu bcount=%zu at %s:%d\n", 854 "bufbytes=%zu bcount=%zu at %s:%d\n",
855 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 855 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
856 wl->wl_bufbytes, wl->wl_bcount, file, line)); 856 wl->wl_bufbytes, wl->wl_bcount, file, line));
857#endif 857#endif
858 858
859 return 0; 859 return 0;
860} 860}
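
The doflush heuristic in wapbl_begin() is worth seeing in isolation: it pessimistically charges every in-flight transaction its worst case before comparing against half of each budget. A self-contained restatement; the 10-buffer estimate comes from the code above, while the MAXPHYS value and the sample numbers are assumptions:

    #include <stdbool.h>
    #include <stdio.h>

    #define MAXPHYS_SKETCH  (64 * 1024)  /* assumed; MAXPHYS is machine-dependent */

    /*
     * Flush when the in-memory transaction could exceed half of the
     * byte budget, half of the buffer-count budget, or half of the
     * on-disk circular log, once each of "lockcount" in-flight
     * transactions contributes its worst case.
     */
    static bool
    should_flush(size_t bufbytes, size_t bufcount, unsigned lockcount,
        size_t bufbytes_max, size_t bufcount_max,
        size_t txn_len, size_t circ_size)
    {
        return bufbytes + lockcount * MAXPHYS_SKETCH > bufbytes_max / 2 ||
            bufcount + lockcount * 10 > bufcount_max / 2 ||
            txn_len > circ_size / 2;
    }

    int
    main(void)
    {
        /* idle log: no flush */
        printf("%d\n", should_flush(0, 0, 0, 1 << 20, 1024, 0, 1 << 20));
        /* 400 KiB buffered plus three in-flight transactions: flush */
        printf("%d\n", should_flush(400 << 10, 8, 3, 1 << 20, 1024,
            0, 1 << 20));
        return 0;
    }
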
861 861
862void 862void
863wapbl_end(struct wapbl *wl) 863wapbl_end(struct wapbl *wl)
864{ 864{
865 865
866#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE) 866#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
867 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 867 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
868 ("wapbl_end thread %d.%d with bufcount=%zu " 868 ("wapbl_end thread %d.%d with bufcount=%zu "
869 "bufbytes=%zu bcount=%zu\n", 869 "bufbytes=%zu bcount=%zu\n",
870 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 870 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
871 wl->wl_bufbytes, wl->wl_bcount)); 871 wl->wl_bufbytes, wl->wl_bcount));
872#endif 872#endif
873 873
874 mutex_enter(&wl->wl_mtx); 874 mutex_enter(&wl->wl_mtx);
875 KASSERT(wl->wl_lock_count > 0); 875 KASSERT(wl->wl_lock_count > 0);
876 wl->wl_lock_count--; 876 wl->wl_lock_count--;
877 mutex_exit(&wl->wl_mtx); 877 mutex_exit(&wl->wl_mtx);
878 878
879 rw_exit(&wl->wl_rwlock); 879 rw_exit(&wl->wl_rwlock);
880} 880}
881 881
882void 882void
883wapbl_add_buf(struct wapbl *wl, struct buf * bp) 883wapbl_add_buf(struct wapbl *wl, struct buf * bp)
884{ 884{
885 885
886 KASSERT(bp->b_cflags & BC_BUSY); 886 KASSERT(bp->b_cflags & BC_BUSY);
887 KASSERT(bp->b_vp); 887 KASSERT(bp->b_vp);
888 888
889 wapbl_jlock_assert(wl); 889 wapbl_jlock_assert(wl);
890 890
891#if 0 891#if 0
892 /* 892 /*
893 * XXX this might be an issue for swapfiles. 893 * XXX this might be an issue for swapfiles.
894 * see uvm_swap.c:1702 894 * see uvm_swap.c:1702
895 * 895 *
896 * XXX2 why require it then? leap of semantics? 896 * XXX2 why require it then? leap of semantics?
897 */ 897 */
898 KASSERT((bp->b_cflags & BC_NOCACHE) == 0); 898 KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
899#endif 899#endif
900 900
901 mutex_enter(&wl->wl_mtx); 901 mutex_enter(&wl->wl_mtx);
902 if (bp->b_flags & B_LOCKED) { 902 if (bp->b_flags & B_LOCKED) {
903 LIST_REMOVE(bp, b_wapbllist); 903 LIST_REMOVE(bp, b_wapbllist);
904 WAPBL_PRINTF(WAPBL_PRINT_BUFFER2, 904 WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
905 ("wapbl_add_buf thread %d.%d re-adding buf %p " 905 ("wapbl_add_buf thread %d.%d re-adding buf %p "
906 "with %d bytes %d bcount\n", 906 "with %d bytes %d bcount\n",
907 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, 907 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
908 bp->b_bcount)); 908 bp->b_bcount));
909 } else { 909 } else {
910 /* unlocked but dirty buffers shouldn't exist */ 910 /* unlocked but dirty buffers shouldn't exist */
911 KASSERT(!(bp->b_oflags & BO_DELWRI)); 911 KASSERT(!(bp->b_oflags & BO_DELWRI));
912 wl->wl_bufbytes += bp->b_bufsize; 912 wl->wl_bufbytes += bp->b_bufsize;
913 wl->wl_bcount += bp->b_bcount; 913 wl->wl_bcount += bp->b_bcount;
914 wl->wl_bufcount++; 914 wl->wl_bufcount++;
915 WAPBL_PRINTF(WAPBL_PRINT_BUFFER, 915 WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
916 ("wapbl_add_buf thread %d.%d adding buf %p " 916 ("wapbl_add_buf thread %d.%d adding buf %p "
917 "with %d bytes %d bcount\n", 917 "with %d bytes %d bcount\n",
918 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, 918 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
919 bp->b_bcount)); 919 bp->b_bcount));
920 } 920 }
921 LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist); 921 LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
922 mutex_exit(&wl->wl_mtx); 922 mutex_exit(&wl->wl_mtx);
923 923
924 bp->b_flags |= B_LOCKED; 924 bp->b_flags |= B_LOCKED;
925} 925}
926 926
927static void 927static void
928wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp) 928wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
929{ 929{
930 930
931 KASSERT(mutex_owned(&wl->wl_mtx)); 931 KASSERT(mutex_owned(&wl->wl_mtx));
932 KASSERT(bp->b_cflags & BC_BUSY); 932 KASSERT(bp->b_cflags & BC_BUSY);
933 wapbl_jlock_assert(wl); 933 wapbl_jlock_assert(wl);
934 934
935#if 0 935#if 0
936 /* 936 /*
937 * XXX this might be an issue for swapfiles. 937 * XXX this might be an issue for swapfiles.
938 * see uvm_swap.c:1725 938 * see uvm_swap.c:1725
939 * 939 *
940 * XXXdeux: see above 940 * XXXdeux: see above
941 */ 941 */
942 KASSERT((bp->b_cflags & BC_NOCACHE) == 0); 942 KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
943#endif 943#endif
944 KASSERT(bp->b_flags & B_LOCKED); 944 KASSERT(bp->b_flags & B_LOCKED);
945 945
946 WAPBL_PRINTF(WAPBL_PRINT_BUFFER, 946 WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
947 ("wapbl_remove_buf thread %d.%d removing buf %p with " 947 ("wapbl_remove_buf thread %d.%d removing buf %p with "
948 "%d bytes %d bcount\n", 948 "%d bytes %d bcount\n",
949 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount)); 949 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));
950 950
951 KASSERT(wl->wl_bufbytes >= bp->b_bufsize); 951 KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
952 wl->wl_bufbytes -= bp->b_bufsize; 952 wl->wl_bufbytes -= bp->b_bufsize;
953 KASSERT(wl->wl_bcount >= bp->b_bcount); 953 KASSERT(wl->wl_bcount >= bp->b_bcount);
954 wl->wl_bcount -= bp->b_bcount; 954 wl->wl_bcount -= bp->b_bcount;
955 KASSERT(wl->wl_bufcount > 0); 955 KASSERT(wl->wl_bufcount > 0);
956 wl->wl_bufcount--; 956 wl->wl_bufcount--;
957 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 957 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
958 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 958 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
959 LIST_REMOVE(bp, b_wapbllist); 959 LIST_REMOVE(bp, b_wapbllist);
960 960
961 bp->b_flags &= ~B_LOCKED; 961 bp->b_flags &= ~B_LOCKED;
962} 962}
963 963
964/* called from brelsel() in vfs_bio among other places */ 964/* called from brelsel() in vfs_bio among other places */
965void 965void
966wapbl_remove_buf(struct wapbl * wl, struct buf *bp) 966wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
967{ 967{
968 968
969 mutex_enter(&wl->wl_mtx); 969 mutex_enter(&wl->wl_mtx);
970 wapbl_remove_buf_locked(wl, bp); 970 wapbl_remove_buf_locked(wl, bp);
971 mutex_exit(&wl->wl_mtx); 971 mutex_exit(&wl->wl_mtx);
972} 972}
973 973
974void 974void
975wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt) 975wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
976{ 976{
977 977
978 KASSERT(bp->b_cflags & BC_BUSY); 978 KASSERT(bp->b_cflags & BC_BUSY);
979 979
980 /* 980 /*
981 * XXX: why does this depend on B_LOCKED? otherwise the buf 981 * XXX: why does this depend on B_LOCKED? otherwise the buf
982 * is not for a transaction? if so, why is this called in the 982 * is not for a transaction? if so, why is this called in the
983 * first place? 983 * first place?
984 */ 984 */
985 if (bp->b_flags & B_LOCKED) { 985 if (bp->b_flags & B_LOCKED) {
986 mutex_enter(&wl->wl_mtx); 986 mutex_enter(&wl->wl_mtx);
987 wl->wl_bufbytes += bp->b_bufsize - oldsz; 987 wl->wl_bufbytes += bp->b_bufsize - oldsz;
988 wl->wl_bcount += bp->b_bcount - oldcnt; 988 wl->wl_bcount += bp->b_bcount - oldcnt;
989 mutex_exit(&wl->wl_mtx); 989 mutex_exit(&wl->wl_mtx);
990 } 990 }
991} 991}
992 992
993#endif /* _KERNEL */ 993#endif /* _KERNEL */
994 994
995/****************************************************************/ 995/****************************************************************/
996/* Some utility inlines */ 996/* Some utility inlines */
997 997
998/* Advance the circular-log pointer old by delta, wrapping within [off, off+size) */ 998/* Advance the circular-log pointer old by delta, wrapping within [off, off+size) */
999static __inline off_t 999static __inline off_t
1000wapbl_advance(size_t size, size_t off, off_t old, size_t delta) 1000wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
1001{ 1001{
1002 off_t new; 1002 off_t new;
1003 1003
1004 /* Define acceptable ranges for inputs. */ 1004 /* Define acceptable ranges for inputs. */
1005 KASSERT(delta <= size); 1005 KASSERT(delta <= size);
1006 KASSERT((old == 0) || (old >= off)); 1006 KASSERT((old == 0) || (old >= off));
1007 KASSERT(old < (size + off)); 1007 KASSERT(old < (size + off));
1008 1008
1009 if ((old == 0) && (delta != 0)) 1009 if ((old == 0) && (delta != 0))
1010 new = off + delta; 1010 new = off + delta;
1011 else if ((old + delta) < (size + off)) 1011 else if ((old + delta) < (size + off))
1012 new = old + delta; 1012 new = old + delta;
1013 else 1013 else
1014 new = (old + delta) - size; 1014 new = (old + delta) - size;
1015 1015
1016 /* Note some interesting axioms */ 1016 /* Note some interesting axioms */
1017 KASSERT((delta != 0) || (new == old)); 1017 KASSERT((delta != 0) || (new == old));
1018 KASSERT((delta == 0) || (new != 0)); 1018 KASSERT((delta == 0) || (new != 0));
1019 KASSERT((delta != (size)) || (new == old)); 1019 KASSERT((delta != (size)) || (new == old));
1020 1020
1021 /* Define acceptable ranges for output. */ 1021 /* Define acceptable ranges for output. */
1022 KASSERT((new == 0) || (new >= off)); 1022 KASSERT((new == 0) || (new >= off));
1023 KASSERT(new < (size + off)); 1023 KASSERT(new < (size + off));
1024 return new; 1024 return new;
1025} 1025}
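
As a concrete illustration of the wrap-around rule, here is a minimal user-space sketch; all values are hypothetical (a circular region of size 16 starting at offset 4, with 0 kept as the "empty" sentinel):

#include <assert.h>
#include <stddef.h>

/* Mirrors the three cases of wapbl_advance() above. */
static size_t
advance(size_t size, size_t off, size_t old, size_t delta)
{

	if (old == 0 && delta != 0)
		return off + delta;
	if (old + delta < size + off)
		return old + delta;
	return old + delta - size;
}

int
main(void)
{

	assert(advance(16, 4, 0, 4) == 8);	/* from the empty sentinel */
	assert(advance(16, 4, 18, 4) == 6);	/* wraps past off + size = 20 */
	assert(advance(16, 4, 4, 16) == 4);	/* delta == size is a no-op */
	return 0;
}
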
1026 1026
1027static __inline size_t 1027static __inline size_t
1028wapbl_space_used(size_t avail, off_t head, off_t tail) 1028wapbl_space_used(size_t avail, off_t head, off_t tail)
1029{ 1029{
1030 1030
1031 if (tail == 0) { 1031 if (tail == 0) {
1032 KASSERT(head == 0); 1032 KASSERT(head == 0);
1033 return 0; 1033 return 0;
1034 } 1034 }
1035 return ((head + (avail - 1) - tail) % avail) + 1; 1035 return ((head + (avail - 1) - tail) % avail) + 1;
1036} 1036}
1037 1037
1038static __inline size_t 1038static __inline size_t
1039wapbl_space_free(size_t avail, off_t head, off_t tail) 1039wapbl_space_free(size_t avail, off_t head, off_t tail)
1040{ 1040{
1041 1041
1042 return avail - wapbl_space_used(avail, head, tail); 1042 return avail - wapbl_space_used(avail, head, tail);
1043} 1043}
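
The used/free accounting follows the same convention: head == tail == 0 means the log is empty, while a nonzero head == tail means it is completely full. A small sketch checking the formula, assuming a hypothetical avail of 16:

#include <assert.h>
#include <stddef.h>

/* Same formula as wapbl_space_used() above. */
static size_t
space_used(size_t avail, size_t head, size_t tail)
{

	if (tail == 0)
		return 0;
	return ((head + (avail - 1) - tail) % avail) + 1;
}

int
main(void)
{

	assert(space_used(16, 0, 0) == 0);	/* empty log */
	assert(space_used(16, 10, 4) == 6);	/* head ahead of tail */
	assert(space_used(16, 4, 10) == 10);	/* head wrapped past tail */
	assert(space_used(16, 7, 7) == 16);	/* head == tail: full */
	return 0;
}
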
1044 1044
1045static __inline void 1045static __inline void
1046wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp, 1046wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
1047 off_t *tailp) 1047 off_t *tailp)
1048{ 1048{
1049 off_t head = *headp; 1049 off_t head = *headp;
1050 off_t tail = *tailp; 1050 off_t tail = *tailp;
1051 1051
1052 KASSERT(delta <= wapbl_space_free(size, head, tail)); 1052 KASSERT(delta <= wapbl_space_free(size, head, tail));
1053 head = wapbl_advance(size, off, head, delta); 1053 head = wapbl_advance(size, off, head, delta);
1054 if ((tail == 0) && (head != 0)) 1054 if ((tail == 0) && (head != 0))
1055 tail = off; 1055 tail = off;
1056 *headp = head; 1056 *headp = head;
1057 *tailp = tail; 1057 *tailp = tail;
1058} 1058}
1059 1059
1060static __inline void 1060static __inline void
1061wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp, 1061wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
1062 off_t *tailp) 1062 off_t *tailp)
1063{ 1063{
1064 off_t head = *headp; 1064 off_t head = *headp;
1065 off_t tail = *tailp; 1065 off_t tail = *tailp;
1066 1066
1067 KASSERT(delta <= wapbl_space_used(size, head, tail)); 1067 KASSERT(delta <= wapbl_space_used(size, head, tail));
1068 tail = wapbl_advance(size, off, tail, delta); 1068 tail = wapbl_advance(size, off, tail, delta);
1069 if (head == tail) { 1069 if (head == tail) {
1070 head = tail = 0; 1070 head = tail = 0;
1071 } 1071 }
1072 *headp = head; 1072 *headp = head;
1073 *tailp = tail; 1073 *tailp = tail;
1074} 1074}
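
Tying the two helpers together, the sentinel transitions can be seen in a stand-alone sketch (again with a hypothetical size of 16 at offset 4): the first head advance snaps the tail from 0 to off, and reclaiming everything collapses head and tail back to the empty sentinel.

#include <assert.h>
#include <stddef.h>

static size_t
advance(size_t size, size_t off, size_t old, size_t delta)
{

	if (old == 0 && delta != 0)
		return off + delta;
	if (old + delta < size + off)
		return old + delta;
	return old + delta - size;
}

int
main(void)
{
	size_t head = 0, tail = 0;	/* empty log */

	/* advance_head: writing 8 bytes moves head and snaps tail
	 * from the 0 sentinel to off. */
	head = advance(16, 4, head, 8);
	if (tail == 0 && head != 0)
		tail = 4;
	assert(head == 12 && tail == 4);

	/* advance_tail: reclaiming all 8 bytes makes head == tail,
	 * which resets both to the empty sentinel. */
	tail = advance(16, 4, tail, 8);
	if (head == tail)
		head = tail = 0;
	assert(head == 0 && tail == 0);
	return 0;
}
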
1075 1075
1076#ifdef _KERNEL 1076#ifdef _KERNEL
1077 1077
1078/****************************************************************/ 1078/****************************************************************/
1079 1079
1080/* 1080/*
1081 * Remove transactions whose buffers are completely flushed to disk. 1081 * Remove transactions whose buffers are completely flushed to disk.
1082 * Will block until at least minfree space is available. 1082 * Will block until at least minfree space is available.
1083 * Only intended to be called from inside wapbl_flush, and therefore 1083 * Only intended to be called from inside wapbl_flush, and therefore
1084 * does not protect against commit races with itself or with flush. 1084 * does not protect against commit races with itself or with flush.
1085 */ 1085 */
1086static int 1086static int
1087wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly) 1087wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
1088{ 1088{
1089 size_t delta; 1089 size_t delta;
1090 size_t avail; 1090 size_t avail;
1091 off_t head; 1091 off_t head;
1092 off_t tail; 1092 off_t tail;
1093 int error = 0; 1093 int error = 0;
1094 1094
1095 KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes)); 1095 KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
1096 KASSERT(rw_write_held(&wl->wl_rwlock)); 1096 KASSERT(rw_write_held(&wl->wl_rwlock));
1097 1097
1098 mutex_enter(&wl->wl_mtx); 1098 mutex_enter(&wl->wl_mtx);
1099 1099
1100 /* 1100 /*
1101 * First check to see if we have to do a commit 1101 * First check to see if we have to do a commit
1102 * at all. 1102 * at all.
1103 */ 1103 */
1104 avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail); 1104 avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
1105 if (minfree < avail) { 1105 if (minfree < avail) {
1106 mutex_exit(&wl->wl_mtx); 1106 mutex_exit(&wl->wl_mtx);
1107 return 0; 1107 return 0;
1108 } 1108 }
1109 minfree -= avail; 1109 minfree -= avail;
1110 while ((wl->wl_error_count == 0) && 1110 while ((wl->wl_error_count == 0) &&
1111 (wl->wl_reclaimable_bytes < minfree)) { 1111 (wl->wl_reclaimable_bytes < minfree)) {
1112 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, 1112 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1113 ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd " 1113 ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
1114 "minfree=%zd\n", 1114 "minfree=%zd\n",
1115 &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes, 1115 &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
1116 minfree)); 1116 minfree));
1117 1117
1118 cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx); 1118 cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
1119 } 1119 }
1120 if (wl->wl_reclaimable_bytes < minfree) { 1120 if (wl->wl_reclaimable_bytes < minfree) {
1121 KASSERT(wl->wl_error_count); 1121 KASSERT(wl->wl_error_count);
1122 /* XXX maybe get actual error from buffer instead someday? */ 1122 /* XXX maybe get actual error from buffer instead someday? */
1123 error = EIO; 1123 error = EIO;
1124 } 1124 }
1125 head = wl->wl_head; 1125 head = wl->wl_head;
1126 tail = wl->wl_tail; 1126 tail = wl->wl_tail;
1127 delta = wl->wl_reclaimable_bytes; 1127 delta = wl->wl_reclaimable_bytes;
1128 1128
1129 /* If all of the entries are flushed, then be sure to keep 1129 /* If all of the entries are flushed, then be sure to keep
1130 * the reserved bytes reserved. Watch out for discarded transactions, 1130 * the reserved bytes reserved. Watch out for discarded transactions,
1131 * which could leave more bytes reserved than are reclaimable. 1131 * which could leave more bytes reserved than are reclaimable.
1132 */ 1132 */
1133 if (SIMPLEQ_EMPTY(&wl->wl_entries) &&  1133 if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
1134 (delta >= wl->wl_reserved_bytes)) { 1134 (delta >= wl->wl_reserved_bytes)) {
1135 delta -= wl->wl_reserved_bytes; 1135 delta -= wl->wl_reserved_bytes;
1136 } 1136 }
1137 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head, 1137 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
1138 &tail); 1138 &tail);
1139 KDASSERT(wl->wl_reserved_bytes <= 1139 KDASSERT(wl->wl_reserved_bytes <=
1140 wapbl_space_used(wl->wl_circ_size, head, tail)); 1140 wapbl_space_used(wl->wl_circ_size, head, tail));
1141 mutex_exit(&wl->wl_mtx); 1141 mutex_exit(&wl->wl_mtx);
1142 1142
1143 if (error) 1143 if (error)
1144 return error; 1144 return error;
1145 1145
1146 if (waitonly) 1146 if (waitonly)
1147 return 0; 1147 return 0;
1148 1148
1149 /* 1149 /*
1150 * This is where head, tail and delta are unprotected 1150 * This is where head, tail and delta are unprotected
1151 * from races with itself or with flush. This is ok since 1151 * from races with itself or with flush. This is ok since
1152 * we only call this routine from inside flush itself. 1152 * we only call this routine from inside flush itself.
1153 * 1153 *
1154 * XXX: how can it race against itself when accessed only 1154 * XXX: how can it race against itself when accessed only
1155 * from behind the write-locked rwlock? 1155 * from behind the write-locked rwlock?
1156 */ 1156 */
1157 error = wapbl_write_commit(wl, head, tail); 1157 error = wapbl_write_commit(wl, head, tail);
1158 if (error) 1158 if (error)
1159 return error; 1159 return error;
1160 1160
1161 wl->wl_head = head; 1161 wl->wl_head = head;
1162 wl->wl_tail = tail; 1162 wl->wl_tail = tail;
1163 1163
1164 mutex_enter(&wl->wl_mtx); 1164 mutex_enter(&wl->wl_mtx);
1165 KASSERT(wl->wl_reclaimable_bytes >= delta); 1165 KASSERT(wl->wl_reclaimable_bytes >= delta);
1166 wl->wl_reclaimable_bytes -= delta; 1166 wl->wl_reclaimable_bytes -= delta;
1167 mutex_exit(&wl->wl_mtx); 1167 mutex_exit(&wl->wl_mtx);
1168 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, 1168 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1169 ("wapbl_truncate thread %d.%d truncating %zu bytes\n", 1169 ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
1170 curproc->p_pid, curlwp->l_lid, delta)); 1170 curproc->p_pid, curlwp->l_lid, delta));
1171 1171
1172 return 0; 1172 return 0;
1173} 1173}
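
Worked numbers for the tail advance above, all hypothetical: if 6144 bytes are reclaimable, every entry has been flushed, and 1024 bytes are reserved, the tail only moves by 5120 bytes so the reserve stays in the log.

#include <assert.h>
#include <stddef.h>

int
main(void)
{
	/* Hypothetical counters mirroring wapbl_truncate(). */
	size_t reclaimable = 6144;	/* wl_reclaimable_bytes */
	size_t reserved = 1024;		/* wl_reserved_bytes */
	int entries_empty = 1;		/* SIMPLEQ_EMPTY(&wl->wl_entries) */
	size_t delta = reclaimable;

	if (entries_empty && delta >= reserved)
		delta -= reserved;	/* keep the reserve on disk */
	assert(delta == 5120);
	return 0;
}
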
1174 1174
1175/****************************************************************/ 1175/****************************************************************/
1176 1176
1177void 1177void
1178wapbl_biodone(struct buf *bp) 1178wapbl_biodone(struct buf *bp)
1179{ 1179{
1180 struct wapbl_entry *we = bp->b_private; 1180 struct wapbl_entry *we = bp->b_private;
1181 struct wapbl *wl = we->we_wapbl; 1181 struct wapbl *wl = we->we_wapbl;
1182 1182
1183 /* 1183 /*
1184 * Handle possible flushing of buffers after log has been 1184 * Handle possible flushing of buffers after log has been
1185 * decommissioned. 1185 * decommissioned.
1186 */ 1186 */
1187 if (!wl) { 1187 if (!wl) {
1188 KASSERT(we->we_bufcount > 0); 1188 KASSERT(we->we_bufcount > 0);
1189 we->we_bufcount--; 1189 we->we_bufcount--;
1190#ifdef WAPBL_DEBUG_BUFBYTES 1190#ifdef WAPBL_DEBUG_BUFBYTES
1191 KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize); 1191 KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
1192 we->we_unsynced_bufbytes -= bp->b_bufsize; 1192 we->we_unsynced_bufbytes -= bp->b_bufsize;
1193#endif 1193#endif
1194 1194
1195 if (we->we_bufcount == 0) { 1195 if (we->we_bufcount == 0) {
1196#ifdef WAPBL_DEBUG_BUFBYTES 1196#ifdef WAPBL_DEBUG_BUFBYTES
1197 KASSERT(we->we_unsynced_bufbytes == 0); 1197 KASSERT(we->we_unsynced_bufbytes == 0);
1198#endif 1198#endif
1199 wapbl_free(we); 1199 wapbl_free(we);
1200 } 1200 }
1201 1201
1202 brelse(bp, 0); 1202 brelse(bp, 0);
1203 return; 1203 return;
1204 } 1204 }
1205 1205
1206#ifdef ohbother 1206#ifdef ohbother
1207 KDASSERT(bp->b_flags & B_DONE); 1207 KDASSERT(bp->b_flags & B_DONE);
1208 KDASSERT(!(bp->b_flags & B_DELWRI)); 1208 KDASSERT(!(bp->b_flags & B_DELWRI));
1209 KDASSERT(bp->b_flags & B_ASYNC); 1209 KDASSERT(bp->b_flags & B_ASYNC);
1210 KDASSERT(bp->b_flags & B_BUSY); 1210 KDASSERT(bp->b_flags & B_BUSY);
1211 KDASSERT(!(bp->b_flags & B_LOCKED)); 1211 KDASSERT(!(bp->b_flags & B_LOCKED));
1212 KDASSERT(!(bp->b_flags & B_READ)); 1212 KDASSERT(!(bp->b_flags & B_READ));
1213 KDASSERT(!(bp->b_flags & B_INVAL)); 1213 KDASSERT(!(bp->b_flags & B_INVAL));
1214 KDASSERT(!(bp->b_flags & B_NOCACHE)); 1214 KDASSERT(!(bp->b_flags & B_NOCACHE));
1215#endif 1215#endif
1216 1216
1217 if (bp->b_error) { 1217 if (bp->b_error) {
1218#ifdef notyet /* Can't currently handle possible dirty buffer reuse */ 1218#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
1219 XXXpooka: interfaces not fully updated 1219 XXXpooka: interfaces not fully updated
1220 Note: this was not enabled in the original patch 1220 Note: this was not enabled in the original patch
1221 against netbsd4 either. I don't know if comment 1221 against netbsd4 either. I don't know if comment
1222 above is true or not. 1222 above is true or not.
1223 1223
1224 /* 1224 /*
1225 * If an error occurs, report the error and leave the 1225 * If an error occurs, report the error and leave the
1226 * buffer as a delayed write on the LRU queue. 1226 * buffer as a delayed write on the LRU queue.
1227 * Restarting the write would likely result in 1227 * Restarting the write would likely result in
1228 * an error spinloop, so let it be done harmlessly 1228 * an error spinloop, so let it be done harmlessly
1229 * by the syncer. 1229 * by the syncer.
1230 */ 1230 */
1231 bp->b_flags &= ~(B_DONE); 1231 bp->b_flags &= ~(B_DONE);
1232 simple_unlock(&bp->b_interlock); 1232 simple_unlock(&bp->b_interlock);
1233 1233
1234 if (we->we_error == 0) { 1234 if (we->we_error == 0) {
1235 mutex_enter(&wl->wl_mtx); 1235 mutex_enter(&wl->wl_mtx);
1236 wl->wl_error_count++; 1236 wl->wl_error_count++;
1237 mutex_exit(&wl->wl_mtx); 1237 mutex_exit(&wl->wl_mtx);
1238 cv_broadcast(&wl->wl_reclaimable_cv); 1238 cv_broadcast(&wl->wl_reclaimable_cv);
1239 } 1239 }
1240 we->we_error = bp->b_error; 1240 we->we_error = bp->b_error;
1241 bp->b_error = 0; 1241 bp->b_error = 0;
1242 brelse(bp); 1242 brelse(bp);
1243 return; 1243 return;
1244#else 1244#else
1245 /* For now, just mark the log permanently errored out */ 1245 /* For now, just mark the log permanently errored out */
1246 1246
1247 mutex_enter(&wl->wl_mtx); 1247 mutex_enter(&wl->wl_mtx);
1248 if (wl->wl_error_count == 0) { 1248 if (wl->wl_error_count == 0) {
1249 wl->wl_error_count++; 1249 wl->wl_error_count++;
1250 cv_broadcast(&wl->wl_reclaimable_cv); 1250 cv_broadcast(&wl->wl_reclaimable_cv);
1251 } 1251 }
1252 mutex_exit(&wl->wl_mtx); 1252 mutex_exit(&wl->wl_mtx);
1253#endif 1253#endif
1254 } 1254 }
1255 1255
1256 mutex_enter(&wl->wl_mtx); 1256 mutex_enter(&wl->wl_mtx);
1257 1257
1258 KASSERT(we->we_bufcount > 0); 1258 KASSERT(we->we_bufcount > 0);
1259 we->we_bufcount--; 1259 we->we_bufcount--;
1260#ifdef WAPBL_DEBUG_BUFBYTES 1260#ifdef WAPBL_DEBUG_BUFBYTES
1261 KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize); 1261 KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
1262 we->we_unsynced_bufbytes -= bp->b_bufsize; 1262 we->we_unsynced_bufbytes -= bp->b_bufsize;
1263 KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize); 1263 KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
1264 wl->wl_unsynced_bufbytes -= bp->b_bufsize; 1264 wl->wl_unsynced_bufbytes -= bp->b_bufsize;
1265#endif 1265#endif
1266 1266
1267 /* 1267 /*
1268 * If the current transaction can be reclaimed, start 1268 * If the current transaction can be reclaimed, start
1269 * at the beginning and reclaim any consecutive reclaimable 1269 * at the beginning and reclaim any consecutive reclaimable
1270 * transactions. If we successfully reclaim anything, 1270 * transactions. If we successfully reclaim anything,
1271 * then wake up anyone waiting for the reclaim. 1271 * then wake up anyone waiting for the reclaim.
1272 */ 1272 */
1273 if (we->we_bufcount == 0) { 1273 if (we->we_bufcount == 0) {
1274 size_t delta = 0; 1274 size_t delta = 0;
1275 int errcnt = 0; 1275 int errcnt = 0;
1276#ifdef WAPBL_DEBUG_BUFBYTES 1276#ifdef WAPBL_DEBUG_BUFBYTES
1277 KDASSERT(we->we_unsynced_bufbytes == 0); 1277 KDASSERT(we->we_unsynced_bufbytes == 0);
1278#endif 1278#endif
1279 /* 1279 /*
1280 * Clear any posted error, since the buffer it came from 1280 * Clear any posted error, since the buffer it came from
1281 * has been successfully flushed by now. 1281 * has been successfully flushed by now.
1282 */ 1282 */
1283 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) && 1283 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
1284 (we->we_bufcount == 0)) { 1284 (we->we_bufcount == 0)) {
1285 delta += we->we_reclaimable_bytes; 1285 delta += we->we_reclaimable_bytes;
1286 if (we->we_error) 1286 if (we->we_error)
1287 errcnt++; 1287 errcnt++;
1288 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 1288 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
1289 wapbl_free(we); 1289 wapbl_free(we);
1290 } 1290 }
1291 1291
1292 if (delta) { 1292 if (delta) {
1293 wl->wl_reclaimable_bytes += delta; 1293 wl->wl_reclaimable_bytes += delta;
1294 KASSERT(wl->wl_error_count >= errcnt); 1294 KASSERT(wl->wl_error_count >= errcnt);
1295 wl->wl_error_count -= errcnt; 1295 wl->wl_error_count -= errcnt;
1296 cv_broadcast(&wl->wl_reclaimable_cv); 1296 cv_broadcast(&wl->wl_reclaimable_cv);
1297 } 1297 }
1298 } 1298 }
1299 1299
1300 mutex_exit(&wl->wl_mtx); 1300 mutex_exit(&wl->wl_mtx);
1301 brelse(bp, 0); 1301 brelse(bp, 0);
1302} 1302}
1303 1303
1304/* 1304/*
1305 * Write transactions to disk + start I/O for contents 1305 * Write transactions to disk + start I/O for contents
1306 */ 1306 */
1307int 1307int
1308wapbl_flush(struct wapbl *wl, int waitfor) 1308wapbl_flush(struct wapbl *wl, int waitfor)
1309{ 1309{
1310 struct buf *bp; 1310 struct buf *bp;
1311 struct wapbl_entry *we; 1311 struct wapbl_entry *we;
1312 off_t off; 1312 off_t off;
1313 off_t head; 1313 off_t head;
1314 off_t tail; 1314 off_t tail;
1315 size_t delta = 0; 1315 size_t delta = 0;
1316 size_t flushsize; 1316 size_t flushsize;
1317 size_t reserved; 1317 size_t reserved;
1318 int error = 0; 1318 int error = 0;
1319 1319
1320 /* 1320 /*
1321 * Do a quick check to see if a full flush can be skipped 1321 * Do a quick check to see if a full flush can be skipped
1322 * This assumes that the flush callback does not need to be called 1322 * This assumes that the flush callback does not need to be called
1323 * unless there are other outstanding bufs. 1323 * unless there are other outstanding bufs.
1324 */ 1324 */
1325 if (!waitfor) { 1325 if (!waitfor) {
1326 size_t nbufs; 1326 size_t nbufs;
1327 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to 1327 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to
1328 protect the KASSERTS */ 1328 protect the KASSERTS */
1329 nbufs = wl->wl_bufcount; 1329 nbufs = wl->wl_bufcount;
1330 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 1330 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1331 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 1331 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1332 mutex_exit(&wl->wl_mtx); 1332 mutex_exit(&wl->wl_mtx);
1333 if (nbufs == 0) 1333 if (nbufs == 0)
1334 return 0; 1334 return 0;
1335 } 1335 }
1336 1336
1337 /* 1337 /*
1338 * XXX we may consider using LK_UPGRADE here 1338 * XXX we may consider using LK_UPGRADE here
1339 * if we want to call flush from inside a transaction 1339 * if we want to call flush from inside a transaction
1340 */ 1340 */
1341 rw_enter(&wl->wl_rwlock, RW_WRITER); 1341 rw_enter(&wl->wl_rwlock, RW_WRITER);
1342 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, 1342 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
1343 wl->wl_dealloccnt); 1343 wl->wl_dealloccnt);
1344 1344
1345 /* 1345 /*
1346 * Now that we are fully locked and flushed, 1346 * Now that we are fully locked and flushed,
1347 * do another check for nothing to do. 1347 * do another check for nothing to do.
1348 */ 1348 */
1349 if (wl->wl_bufcount == 0) { 1349 if (wl->wl_bufcount == 0) {
1350 goto out; 1350 goto out;
1351 } 1351 }
1352 1352
1353#if 0 1353#if 0
1354 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1354 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1355 ("wapbl_flush thread %d.%d flushing entries with " 1355 ("wapbl_flush thread %d.%d flushing entries with "
1356 "bufcount=%zu bufbytes=%zu\n", 1356 "bufcount=%zu bufbytes=%zu\n",
1357 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 1357 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1358 wl->wl_bufbytes)); 1358 wl->wl_bufbytes));
1359#endif 1359#endif
1360 1360
1361 /* Calculate amount of space needed to flush */ 1361 /* Calculate amount of space needed to flush */
1362 flushsize = wapbl_transaction_len(wl); 1362 flushsize = wapbl_transaction_len(wl);
1363 1363
1364 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { 1364 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
1365 /* 1365 /*
1366 * XXX this could be handled more gracefully, perhaps place 1366 * XXX this could be handled more gracefully, perhaps place
1367 * only a partial transaction in the log and allow the 1367 * only a partial transaction in the log and allow the
1368 * remainder to flush without the protection of the journal. 1368 * remainder to flush without the protection of the journal.
1369 */ 1369 */
1370 panic("wapbl_flush: current transaction too big to flush\n"); 1370 panic("wapbl_flush: current transaction too big to flush\n");
1371 } 1371 }
1372 1372
1373 error = wapbl_truncate(wl, flushsize, 0); 1373 error = wapbl_truncate(wl, flushsize, 0);
1374 if (error) 1374 if (error)
1375 goto out2; 1375 goto out2;
1376 1376
1377 off = wl->wl_head; 1377 off = wl->wl_head;
1378 KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&  1378 KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
1379 (off < wl->wl_circ_off + wl->wl_circ_size))); 1379 (off < wl->wl_circ_off + wl->wl_circ_size)));
1380 error = wapbl_write_blocks(wl, &off); 1380 error = wapbl_write_blocks(wl, &off);
1381 if (error) 1381 if (error)
1382 goto out2; 1382 goto out2;
1383 error = wapbl_write_revocations(wl, &off); 1383 error = wapbl_write_revocations(wl, &off);
1384 if (error) 1384 if (error)
1385 goto out2; 1385 goto out2;
1386 error = wapbl_write_inodes(wl, &off); 1386 error = wapbl_write_inodes(wl, &off);
1387 if (error) 1387 if (error)
1388 goto out2; 1388 goto out2;
1389 1389
1390 reserved = 0; 1390 reserved = 0;
1391 if (wl->wl_inohashcnt) 1391 if (wl->wl_inohashcnt)
1392 reserved = wapbl_transaction_inodes_len(wl); 1392 reserved = wapbl_transaction_inodes_len(wl);
1393 1393
1394 head = wl->wl_head; 1394 head = wl->wl_head;
1395 tail = wl->wl_tail; 1395 tail = wl->wl_tail;
1396 1396
1397 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize, 1397 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
1398 &head, &tail); 1398 &head, &tail);
1399#ifdef WAPBL_DEBUG 1399#ifdef WAPBL_DEBUG
1400 if (head != off) { 1400 if (head != off) {
1401 panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX 1401 panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
1402 " off=%"PRIdMAX" flush=%zu\n", 1402 " off=%"PRIdMAX" flush=%zu\n",
1403 (intmax_t)head, (intmax_t)tail, (intmax_t)off, 1403 (intmax_t)head, (intmax_t)tail, (intmax_t)off,
1404 flushsize); 1404 flushsize);
1405 } 1405 }
1406#else 1406#else
1407 KASSERT(head == off); 1407 KASSERT(head == off);
1408#endif 1408#endif
1409 1409
1410 /* Opportunistically move the tail forward if we can */ 1410 /* Opportunistically move the tail forward if we can */
1411 if (!wapbl_lazy_truncate) { 1411 if (!wapbl_lazy_truncate) {
1412 mutex_enter(&wl->wl_mtx); 1412 mutex_enter(&wl->wl_mtx);
1413 delta = wl->wl_reclaimable_bytes; 1413 delta = wl->wl_reclaimable_bytes;
1414 mutex_exit(&wl->wl_mtx); 1414 mutex_exit(&wl->wl_mtx);
1415 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, 1415 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
1416 &head, &tail); 1416 &head, &tail);
1417 } 1417 }
1418 1418
1419 error = wapbl_write_commit(wl, head, tail); 1419 error = wapbl_write_commit(wl, head, tail);
1420 if (error) 1420 if (error)
1421 goto out2; 1421 goto out2;
1422 1422
1423 /* poolme? or kmemme? */ 1423 /* poolme? or kmemme? */
1424 we = wapbl_calloc(1, sizeof(*we)); 1424 we = wapbl_calloc(1, sizeof(*we));
1425 1425
1426#ifdef WAPBL_DEBUG_BUFBYTES 1426#ifdef WAPBL_DEBUG_BUFBYTES
1427 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1427 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1428 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" 1428 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1429 " unsynced=%zu" 1429 " unsynced=%zu"
1430 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " 1430 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1431 "inodes=%d\n", 1431 "inodes=%d\n",
1432 curproc->p_pid, curlwp->l_lid, flushsize, delta, 1432 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1433 wapbl_space_used(wl->wl_circ_size, head, tail), 1433 wapbl_space_used(wl->wl_circ_size, head, tail),
1434 wl->wl_unsynced_bufbytes, wl->wl_bufcount, 1434 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
1435 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, 1435 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
1436 wl->wl_inohashcnt)); 1436 wl->wl_inohashcnt));
1437#else 1437#else
1438 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1438 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1439 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" 1439 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1440 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " 1440 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1441 "inodes=%d\n", 1441 "inodes=%d\n",
1442 curproc->p_pid, curlwp->l_lid, flushsize, delta, 1442 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1443 wapbl_space_used(wl->wl_circ_size, head, tail), 1443 wapbl_space_used(wl->wl_circ_size, head, tail),
1444 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, 1444 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1445 wl->wl_dealloccnt, wl->wl_inohashcnt)); 1445 wl->wl_dealloccnt, wl->wl_inohashcnt));
1446#endif 1446#endif
1447 1447
1448 1448
1449 mutex_enter(&bufcache_lock); 1449 mutex_enter(&bufcache_lock);
1450 mutex_enter(&wl->wl_mtx); 1450 mutex_enter(&wl->wl_mtx);
1451 1451
1452 wl->wl_reserved_bytes = reserved; 1452 wl->wl_reserved_bytes = reserved;
1453 wl->wl_head = head; 1453 wl->wl_head = head;
1454 wl->wl_tail = tail; 1454 wl->wl_tail = tail;
1455 KASSERT(wl->wl_reclaimable_bytes >= delta); 1455 KASSERT(wl->wl_reclaimable_bytes >= delta);
1456 wl->wl_reclaimable_bytes -= delta; 1456 wl->wl_reclaimable_bytes -= delta;
1457 wl->wl_dealloccnt = 0; 1457 wl->wl_dealloccnt = 0;
1458#ifdef WAPBL_DEBUG_BUFBYTES 1458#ifdef WAPBL_DEBUG_BUFBYTES
1459 wl->wl_unsynced_bufbytes += wl->wl_bufbytes; 1459 wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
1460#endif 1460#endif
1461 1461
1462 we->we_wapbl = wl; 1462 we->we_wapbl = wl;
1463 we->we_bufcount = wl->wl_bufcount; 1463 we->we_bufcount = wl->wl_bufcount;
1464#ifdef WAPBL_DEBUG_BUFBYTES 1464#ifdef WAPBL_DEBUG_BUFBYTES
1465 we->we_unsynced_bufbytes = wl->wl_bufbytes; 1465 we->we_unsynced_bufbytes = wl->wl_bufbytes;
1466#endif 1466#endif
1467 we->we_reclaimable_bytes = flushsize; 1467 we->we_reclaimable_bytes = flushsize;
1468 we->we_error = 0; 1468 we->we_error = 0;
1469 SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries); 1469 SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
1470 1470
1471 /* 1471 /*
1472 * This flushes bufs in the reverse order from which they were queued. 1472 * This flushes bufs in the reverse order from which they were queued.
1473 * It shouldn't matter, but if we care we could use a TAILQ instead. 1473 * It shouldn't matter, but if we care we could use a TAILQ instead.
1474 * XXX Note they will get put on the lru queue when they flush 1474 * XXX Note they will get put on the lru queue when they flush
1475 * so we might actually want to change this to preserve order. 1475 * so we might actually want to change this to preserve order.
1476 */ 1476 */
1477 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 1477 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
1478 if (bbusy(bp, 0, 0, &wl->wl_mtx)) { 1478 if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
1479 continue; 1479 continue;
1480 } 1480 }
1481 bp->b_iodone = wapbl_biodone; 1481 bp->b_iodone = wapbl_biodone;
1482 bp->b_private = we; 1482 bp->b_private = we;
1483 bremfree(bp); 1483 bremfree(bp);
1484 wapbl_remove_buf_locked(wl, bp); 1484 wapbl_remove_buf_locked(wl, bp);
1485 mutex_exit(&wl->wl_mtx); 1485 mutex_exit(&wl->wl_mtx);
1486 mutex_exit(&bufcache_lock); 1486 mutex_exit(&bufcache_lock);
1487 bawrite(bp); 1487 bawrite(bp);
1488 mutex_enter(&bufcache_lock); 1488 mutex_enter(&bufcache_lock);
1489 mutex_enter(&wl->wl_mtx); 1489 mutex_enter(&wl->wl_mtx);
1490 } 1490 }
1491 mutex_exit(&wl->wl_mtx); 1491 mutex_exit(&wl->wl_mtx);
1492 mutex_exit(&bufcache_lock); 1492 mutex_exit(&bufcache_lock);
1493 1493
1494#if 0 1494#if 0
1495 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1495 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1496 ("wapbl_flush thread %d.%d done flushing entries...\n", 1496 ("wapbl_flush thread %d.%d done flushing entries...\n",
1497 curproc->p_pid, curlwp->l_lid)); 1497 curproc->p_pid, curlwp->l_lid));
1498#endif 1498#endif
1499 1499
1500 out: 1500 out:
1501 1501
1502 /* 1502 /*
1503 * If the waitfor flag is set, don't return until everything is 1503 * If the waitfor flag is set, don't return until everything is
1504 * fully flushed and the on-disk log is empty. 1504 * fully flushed and the on-disk log is empty.
1505 */ 1505 */
1506 if (waitfor) { 1506 if (waitfor) {
1507 error = wapbl_truncate(wl, wl->wl_circ_size -  1507 error = wapbl_truncate(wl, wl->wl_circ_size -
1508 wl->wl_reserved_bytes, wapbl_lazy_truncate); 1508 wl->wl_reserved_bytes, wapbl_lazy_truncate);
1509 } 1509 }
1510 1510
1511 out2: 1511 out2:
1512 if (error) { 1512 if (error) {
1513 wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks, 1513 wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
1514 wl->wl_dealloclens, wl->wl_dealloccnt); 1514 wl->wl_dealloclens, wl->wl_dealloccnt);
1515 } 1515 }
1516 1516
1517#ifdef WAPBL_DEBUG_PRINT 1517#ifdef WAPBL_DEBUG_PRINT
1518 if (error) { 1518 if (error) {
1519 pid_t pid = -1; 1519 pid_t pid = -1;
1520 lwpid_t lid = -1; 1520 lwpid_t lid = -1;
1521 if (curproc) 1521 if (curproc)
1522 pid = curproc->p_pid; 1522 pid = curproc->p_pid;
1523 if (curlwp) 1523 if (curlwp)
1524 lid = curlwp->l_lid; 1524 lid = curlwp->l_lid;
1525 mutex_enter(&wl->wl_mtx); 1525 mutex_enter(&wl->wl_mtx);
1526#ifdef WAPBL_DEBUG_BUFBYTES 1526#ifdef WAPBL_DEBUG_BUFBYTES
1527 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1527 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1528 ("wapbl_flush: thread %d.%d aborted flush: " 1528 ("wapbl_flush: thread %d.%d aborted flush: "
1529 "error = %d\n" 1529 "error = %d\n"
1530 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1530 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1531 "deallocs=%d inodes=%d\n" 1531 "deallocs=%d inodes=%d\n"
1532 "\terrcnt = %d, reclaimable=%zu reserved=%zu " 1532 "\terrcnt = %d, reclaimable=%zu reserved=%zu "
1533 "unsynced=%zu\n", 1533 "unsynced=%zu\n",
1534 pid, lid, error, wl->wl_bufcount, 1534 pid, lid, error, wl->wl_bufcount,
1535 wl->wl_bufbytes, wl->wl_bcount, 1535 wl->wl_bufbytes, wl->wl_bcount,
1536 wl->wl_dealloccnt, wl->wl_inohashcnt, 1536 wl->wl_dealloccnt, wl->wl_inohashcnt,
1537 wl->wl_error_count, wl->wl_reclaimable_bytes, 1537 wl->wl_error_count, wl->wl_reclaimable_bytes,
1538 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes)); 1538 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
1539 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1539 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1540 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1540 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1541 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1541 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1542 "error = %d, unsynced = %zu\n", 1542 "error = %d, unsynced = %zu\n",
1543 we->we_bufcount, we->we_reclaimable_bytes, 1543 we->we_bufcount, we->we_reclaimable_bytes,
1544 we->we_error, we->we_unsynced_bufbytes)); 1544 we->we_error, we->we_unsynced_bufbytes));
1545 } 1545 }
1546#else 1546#else
1547 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1547 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1548 ("wapbl_flush: thread %d.%d aborted flush: " 1548 ("wapbl_flush: thread %d.%d aborted flush: "
1549 "error = %d\n" 1549 "error = %d\n"
1550 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1550 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1551 "deallocs=%d inodes=%d\n" 1551 "deallocs=%d inodes=%d\n"
1552 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n", 1552 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
1553 pid, lid, error, wl->wl_bufcount, 1553 pid, lid, error, wl->wl_bufcount,
1554 wl->wl_bufbytes, wl->wl_bcount, 1554 wl->wl_bufbytes, wl->wl_bcount,
1555 wl->wl_dealloccnt, wl->wl_inohashcnt, 1555 wl->wl_dealloccnt, wl->wl_inohashcnt,
1556 wl->wl_error_count, wl->wl_reclaimable_bytes, 1556 wl->wl_error_count, wl->wl_reclaimable_bytes,
1557 wl->wl_reserved_bytes)); 1557 wl->wl_reserved_bytes));
1558 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1558 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1559 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1559 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1560 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1560 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1561 "error = %d\n", we->we_bufcount, 1561 "error = %d\n", we->we_bufcount,
1562 we->we_reclaimable_bytes, we->we_error)); 1562 we->we_reclaimable_bytes, we->we_error));
1563 } 1563 }
1564#endif 1564#endif
1565 mutex_exit(&wl->wl_mtx); 1565 mutex_exit(&wl->wl_mtx);
1566 } 1566 }
1567#endif 1567#endif
1568 1568
1569 rw_exit(&wl->wl_rwlock); 1569 rw_exit(&wl->wl_rwlock);
1570 return error; 1570 return error;
1571} 1571}
1572 1572
1573/****************************************************************/ 1573/****************************************************************/
1574 1574
1575void 1575void
1576wapbl_jlock_assert(struct wapbl *wl) 1576wapbl_jlock_assert(struct wapbl *wl)
1577{ 1577{
1578 1578
1579#ifdef WAPBL_DEBUG_SERIALIZE 1579#ifdef WAPBL_DEBUG_SERIALIZE
1580 KASSERT(rw_write_held(&wl->wl_rwlock)); 1580 KASSERT(rw_write_held(&wl->wl_rwlock));
1581#else 1581#else
1582 KASSERT(rw_read_held(&wl->wl_rwlock) || rw_write_held(&wl->wl_rwlock)); 1582 KASSERT(rw_read_held(&wl->wl_rwlock) || rw_write_held(&wl->wl_rwlock));
1583#endif 1583#endif
1584} 1584}
1585 1585
1586void 1586void
1587wapbl_junlock_assert(struct wapbl *wl) 1587wapbl_junlock_assert(struct wapbl *wl)
1588{ 1588{
1589 1589
1590#ifdef WAPBL_DEBUG_SERIALIZE 1590#ifdef WAPBL_DEBUG_SERIALIZE
1591 KASSERT(!rw_write_held(&wl->wl_rwlock)); 1591 KASSERT(!rw_write_held(&wl->wl_rwlock));
1592#endif 1592#endif
1593} 1593}
1594 1594
1595/****************************************************************/ 1595/****************************************************************/
1596 1596
1597/* locks missing */ 1597/* locks missing */
1598void 1598void
1599wapbl_print(struct wapbl *wl, 1599wapbl_print(struct wapbl *wl,
1600 int full, 1600 int full,
1601 void (*pr)(const char *, ...)) 1601 void (*pr)(const char *, ...))
1602{ 1602{
1603 struct buf *bp; 1603 struct buf *bp;
1604 struct wapbl_entry *we; 1604 struct wapbl_entry *we;
1605 (*pr)("wapbl %p", wl); 1605 (*pr)("wapbl %p", wl);
1606 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n", 1606 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
1607 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn); 1607 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
1608 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n", 1608 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
1609 wl->wl_circ_size, wl->wl_circ_off, 1609 wl->wl_circ_size, wl->wl_circ_off,
1610 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail); 1610 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
1611 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n", 1611 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
1612 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift); 1612 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
1613#ifdef WAPBL_DEBUG_BUFBYTES 1613#ifdef WAPBL_DEBUG_BUFBYTES
1614 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1614 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1615 "reserved = %zu errcnt = %d unsynced = %zu\n", 1615 "reserved = %zu errcnt = %d unsynced = %zu\n",
1616 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, 1616 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1617 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 1617 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1618 wl->wl_error_count, wl->wl_unsynced_bufbytes); 1618 wl->wl_error_count, wl->wl_unsynced_bufbytes);
1619#else 1619#else
1620 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1620 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1621 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes, 1621 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
1622 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 1622 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1623 wl->wl_error_count); 1623 wl->wl_error_count);
1624#endif 1624#endif
1625 (*pr)("\tdealloccnt = %d, dealloclim = %d\n", 1625 (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
1626 wl->wl_dealloccnt, wl->wl_dealloclim); 1626 wl->wl_dealloccnt, wl->wl_dealloclim);
1627 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n", 1627 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
1628 wl->wl_inohashcnt, wl->wl_inohashmask); 1628 wl->wl_inohashcnt, wl->wl_inohashmask);
1629 (*pr)("entries:\n"); 1629 (*pr)("entries:\n");
1630 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1630 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1631#ifdef WAPBL_DEBUG_BUFBYTES 1631#ifdef WAPBL_DEBUG_BUFBYTES
1632 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, " 1632 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
1633 "unsynced = %zu\n", 1633 "unsynced = %zu\n",
1634 we->we_bufcount, we->we_reclaimable_bytes, 1634 we->we_bufcount, we->we_reclaimable_bytes,
1635 we->we_error, we->we_unsynced_bufbytes); 1635 we->we_error, we->we_unsynced_bufbytes);
1636#else 1636#else
1637 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n", 1637 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
1638 we->we_bufcount, we->we_reclaimable_bytes, we->we_error); 1638 we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
1639#endif 1639#endif
1640 } 1640 }
1641 if (full) { 1641 if (full) {
1642 int cnt = 0; 1642 int cnt = 0;
1643 (*pr)("bufs ="); 1643 (*pr)("bufs =");
1644 LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) { 1644 LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
1645 if (!LIST_NEXT(bp, b_wapbllist)) { 1645 if (!LIST_NEXT(bp, b_wapbllist)) {
1646 (*pr)(" %p", bp); 1646 (*pr)(" %p", bp);
1647 } else if ((++cnt % 6) == 0) { 1647 } else if ((++cnt % 6) == 0) {
1648 (*pr)(" %p,\n\t", bp); 1648 (*pr)(" %p,\n\t", bp);
1649 } else { 1649 } else {
1650 (*pr)(" %p,", bp); 1650 (*pr)(" %p,", bp);
1651 } 1651 }
1652 } 1652 }
1653 (*pr)("\n"); 1653 (*pr)("\n");
1654 1654
1655 (*pr)("dealloced blks = "); 1655 (*pr)("dealloced blks = ");
1656 { 1656 {
1657 int i; 1657 int i;
1658 cnt = 0; 1658 cnt = 0;
1659 for (i = 0; i < wl->wl_dealloccnt; i++) { 1659 for (i = 0; i < wl->wl_dealloccnt; i++) {
1660 (*pr)(" %"PRId64":%d,", 1660 (*pr)(" %"PRId64":%d,",
1661 wl->wl_deallocblks[i], 1661 wl->wl_deallocblks[i],
1662 wl->wl_dealloclens[i]); 1662 wl->wl_dealloclens[i]);
1663 if ((++cnt % 4) == 0) { 1663 if ((++cnt % 4) == 0) {
1664 (*pr)("\n\t"); 1664 (*pr)("\n\t");
1665 } 1665 }
1666 } 1666 }
1667 } 1667 }
1668 (*pr)("\n"); 1668 (*pr)("\n");
1669 1669
1670 (*pr)("registered inodes = "); 1670 (*pr)("registered inodes = ");
1671 { 1671 {
1672 int i; 1672 int i;
1673 cnt = 0; 1673 cnt = 0;
1674 for (i = 0; i <= wl->wl_inohashmask; i++) { 1674 for (i = 0; i <= wl->wl_inohashmask; i++) {
1675 struct wapbl_ino_head *wih; 1675 struct wapbl_ino_head *wih;
1676 struct wapbl_ino *wi; 1676 struct wapbl_ino *wi;
1677 1677
1678 wih = &wl->wl_inohash[i]; 1678 wih = &wl->wl_inohash[i];
1679 LIST_FOREACH(wi, wih, wi_hash) { 1679 LIST_FOREACH(wi, wih, wi_hash) {
1680 if (wi->wi_ino == 0) 1680 if (wi->wi_ino == 0)
1681 continue; 1681 continue;
1682 (*pr)(" %"PRId32"/0%06"PRIo32",", 1682 (*pr)(" %"PRId32"/0%06"PRIo32",",
1683 wi->wi_ino, wi->wi_mode); 1683 wi->wi_ino, wi->wi_mode);
1684 if ((++cnt % 4) == 0) { 1684 if ((++cnt % 4) == 0) {
1685 (*pr)("\n\t"); 1685 (*pr)("\n\t");
1686 } 1686 }
1687 } 1687 }
1688 } 1688 }
1689 (*pr)("\n"); 1689 (*pr)("\n");
1690 } 1690 }
1691 } 1691 }
1692} 1692}
1693 1693
1694#if defined(WAPBL_DEBUG) || defined(DDB) 1694#if defined(WAPBL_DEBUG) || defined(DDB)
1695void 1695void
1696wapbl_dump(struct wapbl *wl) 1696wapbl_dump(struct wapbl *wl)
1697{ 1697{
1698#if defined(WAPBL_DEBUG) 1698#if defined(WAPBL_DEBUG)
1699 if (!wl) 1699 if (!wl)
1700 wl = wapbl_debug_wl; 1700 wl = wapbl_debug_wl;
1701#endif 1701#endif
1702 if (!wl) 1702 if (!wl)
1703 return; 1703 return;
1704 wapbl_print(wl, 1, printf); 1704 wapbl_print(wl, 1, printf);
1705} 1705}
1706#endif 1706#endif
1707 1707
1708/****************************************************************/ 1708/****************************************************************/
1709 1709
1710void 1710void
1711wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len) 1711wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
1712{ 1712{
1713 1713
1714 wapbl_jlock_assert(wl); 1714 wapbl_jlock_assert(wl);
1715 1715
1716 /* XXX should eventually instead tie this into resource estimation */ 1716 /* XXX should eventually instead tie this into resource estimation */
1717 /* XXX this KASSERT needs locking/mutex analysis */ 1717 /* XXX this KASSERT needs locking/mutex analysis */
1718 KASSERT(wl->wl_dealloccnt < wl->wl_dealloclim); 1718 KASSERT(wl->wl_dealloccnt < wl->wl_dealloclim);
1719 wl->wl_deallocblks[wl->wl_dealloccnt] = blk; 1719 wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
1720 wl->wl_dealloclens[wl->wl_dealloccnt] = len; 1720 wl->wl_dealloclens[wl->wl_dealloccnt] = len;
1721 wl->wl_dealloccnt++; 1721 wl->wl_dealloccnt++;
1722 WAPBL_PRINTF(WAPBL_PRINT_ALLOC, 1722 WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
1723 ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len)); 1723 ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
1724} 1724}
1725 1725
1726/****************************************************************/ 1726/****************************************************************/
1727 1727
1728static void 1728static void
1729wapbl_inodetrk_init(struct wapbl *wl, u_int size) 1729wapbl_inodetrk_init(struct wapbl *wl, u_int size)
1730{ 1730{
1731 1731
1732 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask); 1732 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
1733 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) { 1733 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
1734 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0, 1734 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
1735 "wapblinopl", &pool_allocator_nointr, IPL_NONE); 1735 "wapblinopl", &pool_allocator_nointr, IPL_NONE);
1736 } 1736 }
1737} 1737}
1738 1738
1739static void 1739static void
1740wapbl_inodetrk_free(struct wapbl *wl) 1740wapbl_inodetrk_free(struct wapbl *wl)
1741{ 1741{
1742 1742
1743 /* XXX this KASSERT needs locking/mutex analysis */ 1743 /* XXX this KASSERT needs locking/mutex analysis */
1744 KASSERT(wl->wl_inohashcnt == 0); 1744 KASSERT(wl->wl_inohashcnt == 0);
1745 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask); 1745 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
1746 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) { 1746 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
1747 pool_destroy(&wapbl_ino_pool); 1747 pool_destroy(&wapbl_ino_pool);
1748 } 1748 }
1749} 1749}
1750 1750
1751static struct wapbl_ino * 1751static struct wapbl_ino *
1752wapbl_inodetrk_get(struct wapbl *wl, ino_t ino) 1752wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
1753{ 1753{
1754 struct wapbl_ino_head *wih; 1754 struct wapbl_ino_head *wih;
1755 struct wapbl_ino *wi; 1755 struct wapbl_ino *wi;
1756 1756
1757 KASSERT(mutex_owned(&wl->wl_mtx)); 1757 KASSERT(mutex_owned(&wl->wl_mtx));
1758 1758
1759 wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; 1759 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1760 LIST_FOREACH(wi, wih, wi_hash) { 1760 LIST_FOREACH(wi, wih, wi_hash) {
1761 if (ino == wi->wi_ino) 1761 if (ino == wi->wi_ino)
1762 return wi; 1762 return wi;
1763 } 1763 }
1764 return NULL; 1764 return NULL;
1765} 1765}
1766 1766
1767void 1767void
1768wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode) 1768wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1769{ 1769{
1770 struct wapbl_ino_head *wih; 1770 struct wapbl_ino_head *wih;
1771 struct wapbl_ino *wi; 1771 struct wapbl_ino *wi;
1772 1772
1773 wi = pool_get(&wapbl_ino_pool, PR_WAITOK); 1773 wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
1774 1774
1775 mutex_enter(&wl->wl_mtx); 1775 mutex_enter(&wl->wl_mtx);
1776 if (wapbl_inodetrk_get(wl, ino) == NULL) { 1776 if (wapbl_inodetrk_get(wl, ino) == NULL) {
1777 wi->wi_ino = ino; 1777 wi->wi_ino = ino;
1778 wi->wi_mode = mode; 1778 wi->wi_mode = mode;
1779 wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; 1779 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1780 LIST_INSERT_HEAD(wih, wi, wi_hash); 1780 LIST_INSERT_HEAD(wih, wi, wi_hash);
1781 wl->wl_inohashcnt++; 1781 wl->wl_inohashcnt++;
1782 WAPBL_PRINTF(WAPBL_PRINT_INODE, 1782 WAPBL_PRINTF(WAPBL_PRINT_INODE,
1783 ("wapbl_register_inode: ino=%"PRId64"\n", ino)); 1783 ("wapbl_register_inode: ino=%"PRId64"\n", ino));
1784 mutex_exit(&wl->wl_mtx); 1784 mutex_exit(&wl->wl_mtx);
1785 } else { 1785 } else {
1786 mutex_exit(&wl->wl_mtx); 1786 mutex_exit(&wl->wl_mtx);
1787 pool_put(&wapbl_ino_pool, wi); 1787 pool_put(&wapbl_ino_pool, wi);
1788 } 1788 }
1789} 1789}
1790 1790
1791void 1791void
1792wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode) 1792wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1793{ 1793{
1794 struct wapbl_ino *wi; 1794 struct wapbl_ino *wi;
1795 1795
1796 mutex_enter(&wl->wl_mtx); 1796 mutex_enter(&wl->wl_mtx);
1797 wi = wapbl_inodetrk_get(wl, ino); 1797 wi = wapbl_inodetrk_get(wl, ino);
1798 if (wi) { 1798 if (wi) {
1799 WAPBL_PRINTF(WAPBL_PRINT_INODE, 1799 WAPBL_PRINTF(WAPBL_PRINT_INODE,
1800 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino)); 1800 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
1801 KASSERT(wl->wl_inohashcnt > 0); 1801 KASSERT(wl->wl_inohashcnt > 0);
1802 wl->wl_inohashcnt--; 1802 wl->wl_inohashcnt--;
1803 LIST_REMOVE(wi, wi_hash); 1803 LIST_REMOVE(wi, wi_hash);
1804 mutex_exit(&wl->wl_mtx); 1804 mutex_exit(&wl->wl_mtx);
1805 1805
1806 pool_put(&wapbl_ino_pool, wi); 1806 pool_put(&wapbl_ino_pool, wi);
1807 } else { 1807 } else {
1808 mutex_exit(&wl->wl_mtx); 1808 mutex_exit(&wl->wl_mtx);
1809 } 1809 }
1810} 1810}
1811 1811
1812/****************************************************************/ 1812/****************************************************************/
1813 1813
1814static __inline size_t 1814static __inline size_t
1815wapbl_transaction_inodes_len(struct wapbl *wl) 1815wapbl_transaction_inodes_len(struct wapbl *wl)
1816{ 1816{
1817 int blocklen = 1<<wl->wl_log_dev_bshift; 1817 int blocklen = 1<<wl->wl_log_dev_bshift;
1818 int iph; 1818 int iph;
1819 1819
1820 /* Calculate the number of inodes described in an inodelist header */ 1820 /* Calculate the number of inodes described in an inodelist header */
1821 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / 1821 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
1822 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); 1822 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
1823 1823
1824 KASSERT(iph > 0); 1824 KASSERT(iph > 0);
1825 1825
1826 return MAX(1, howmany(wl->wl_inohashcnt, iph))*blocklen; 1826 return MAX(1, howmany(wl->wl_inohashcnt, iph))*blocklen;
1827} 1827}
1828 1828
1829 1829
1830/* Calculate amount of space a transaction will take on disk */ 1830/* Calculate amount of space a transaction will take on disk */
1831static size_t 1831static size_t
1832wapbl_transaction_len(struct wapbl *wl) 1832wapbl_transaction_len(struct wapbl *wl)
1833{ 1833{
1834 int blocklen = 1<<wl->wl_log_dev_bshift; 1834 int blocklen = 1<<wl->wl_log_dev_bshift;
1835 size_t len; 1835 size_t len;
1836 int bph; 1836 int bph;
1837 1837
1838 /* Calculate number of blocks described in a blocklist header */ 1838 /* Calculate number of blocks described in a blocklist header */
1839 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 1839 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1840 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 1840 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1841 1841
1842 KASSERT(bph > 0); 1842 KASSERT(bph > 0);
1843 1843
1844 len = wl->wl_bcount; 1844 len = wl->wl_bcount;
1845 len += howmany(wl->wl_bufcount, bph)*blocklen; 1845 len += howmany(wl->wl_bufcount, bph)*blocklen;
1846 len += howmany(wl->wl_dealloccnt, bph)*blocklen; 1846 len += howmany(wl->wl_dealloccnt, bph)*blocklen;
1847 len += wapbl_transaction_inodes_len(wl); 1847 len += wapbl_transaction_inodes_len(wl);
1848 1848
1849 return len; 1849 return len;
1850} 1850}
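
To get a feel for the sizes, a sketch with all layout numbers hypothetical (the real header sizes come from struct wapbl_wc_blocklist): with a 512-byte log block, a 16-byte blocklist header, and 16-byte per-block records, one blocklist block describes 31 buffers.

#include <assert.h>
#include <stddef.h>

#define howmany(x, y)	(((x) + ((y) - 1)) / (y))

int
main(void)
{
	/* Hypothetical sizes only. */
	size_t blocklen = 512;
	size_t bph = (blocklen - 16) / 16;	/* blocks per header: 31 */

	/* 100 buffers carrying 50000 bytes of data need
	 * ceil(100 / 31) == 4 blocklist header blocks. */
	size_t len = 50000 + howmany(100, bph) * blocklen;

	assert(bph == 31);
	assert(len == 50000 + 4 * 512);
	return 0;
}
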
1851 1851
1852/* 1852/*
1853 * Perform commit operation 1853 * Perform commit operation
1854 * 1854 *
1855 * Note that incrementing the generation number needs to 1855 * Note that incrementing the generation number needs to
1856 * be protected against racing with other invocations 1856 * be protected against racing with other invocations
1857 * of wapbl_commit. This is ok since this routine 1857 * of wapbl_commit. This is ok since this routine
1858 * is only invoked from wapbl_flush. 1858 * is only invoked from wapbl_flush.
1859 */ 1859 */
1860static int 1860static int
1861wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail) 1861wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
1862{ 1862{
1863 struct wapbl_wc_header *wc = wl->wl_wc_header; 1863 struct wapbl_wc_header *wc = wl->wl_wc_header;
1864 struct timespec ts; 1864 struct timespec ts;
1865 int error; 1865 int error;
1866 int force = 1; 1866 int force = 1;
1867 1867
1868 /* XXX Calculate the checksum here; instead we do this for now */ 1868 /* XXX Calculate the checksum here; instead we do this for now */
1869 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED); 1869 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
1870 if (error) { 1870 if (error) {
1871 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1871 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1872 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x " 1872 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
1873 "returned %d\n", wl->wl_devvp->v_rdev, error)); 1873 "returned %d\n", wl->wl_devvp->v_rdev, error));
1874 } 1874 }
1875 1875
1876 wc->wc_head = head; 1876 wc->wc_head = head;
1877 wc->wc_tail = tail; 1877 wc->wc_tail = tail;
1878 wc->wc_checksum = 0; 1878 wc->wc_checksum = 0;
1879 wc->wc_version = 1; 1879 wc->wc_version = 1;
1880 getnanotime(&ts); 1880 getnanotime(&ts);
1881 wc->wc_time = ts.tv_sec;; 1881 wc->wc_time = ts.tv_sec;
1882 wc->wc_timensec = ts.tv_nsec; 1882 wc->wc_timensec = ts.tv_nsec;
1883 1883
1884 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 1884 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1885 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n", 1885 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
1886 (intmax_t)head, (intmax_t)tail)); 1886 (intmax_t)head, (intmax_t)tail));
1887 1887
1888 /* 1888 /*
1889 * XXX if generation will rollover, then first zero 1889 * XXX if generation will rollover, then first zero
1890 * over second commit header before trying to write both headers. 1890 * over second commit header before trying to write both headers.
1891 */ 1891 */
1892 1892
1893 error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, 1893 error = wapbl_write(wc, wc->wc_len, wl->wl_devvp,
1894 wl->wl_logpbn + wc->wc_generation % 2); 1894 wl->wl_logpbn + wc->wc_generation % 2);
1895 if (error) 1895 if (error)
1896 return error; 1896 return error;
1897 1897
1898 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED); 1898 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
1899 if (error) { 1899 if (error) {
1900 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1900 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1901 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x " 1901 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
1902 "returned %d\n", wl->wl_devvp->v_rdev, error)); 1902 "returned %d\n", wl->wl_devvp->v_rdev, error));
1903 } 1903 }
1904 1904
1905 /* 1905 /*
1906 * If the generation number was zero, write it out a second time. 1906 * If the generation number was zero, write it out a second time.
1907 * This handles initialization and generation number rollover 1907 * This handles initialization and generation number rollover
1908 */ 1908 */
1909 if (wc->wc_generation++ == 0) { 1909 if (wc->wc_generation++ == 0) {
1910 error = wapbl_write_commit(wl, head, tail); 1910 error = wapbl_write_commit(wl, head, tail);
1911 /* 1911 /*
1912 * This panic should be able to be removed if we do the 1912 * This panic should be able to be removed if we do the
1913 * zero'ing mentioned above, and we are certain to roll 1913 * zero'ing mentioned above, and we are certain to roll
1914 * back generation number on failure. 1914 * back generation number on failure.
1915 */ 1915 */
1916 if (error) 1916 if (error)
1917 panic("wapbl_write_commit: error writing duplicate " 1917 panic("wapbl_write_commit: error writing duplicate "
1918 "log header: %d\n", error); 1918 "log header: %d\n", error);
1919 } 1919 }
1920 return 0; 1920 return 0;
1921} 1921}
1922 1922
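The commit-header ping-pong above is easy to misread, so here is a minimal standalone sketch of just the slot arithmetic; the helper names and the bare uint32_t generation are illustrative, not the wapbl structures. wapbl_write_commit stores generation N in slot N % 2, and wapbl_replay_start further down trusts whichever of the two headers carries the larger generation, so a torn commit write can only clobber the older header.

    #include <stdint.h>

    /*
     * Illustrative sketch (not part of the diff): commit header N goes
     * to slot N % 2, so the previous header survives a torn write.
     */
    static inline int
    commit_slot(uint32_t generation)
    {
            return generation % 2;
    }

    /*
     * Recovery side: read both slots and keep the newer header.  A
     * brand-new log writes generation 0 twice (see above) so that both
     * slots hold a valid header before this choice is ever made.
     */
    static inline int
    newer_slot(uint32_t gen0, uint32_t gen1)
    {
            return (gen1 > gen0) ? 1 : 0;
    }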
1923/* Returns new offset value */ 1923/* Returns new offset value */
1924static int 1924static int
1925wapbl_write_blocks(struct wapbl *wl, off_t *offp) 1925wapbl_write_blocks(struct wapbl *wl, off_t *offp)
1926{ 1926{
1927 struct wapbl_wc_blocklist *wc = 1927 struct wapbl_wc_blocklist *wc =
1928 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; 1928 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
1929 int blocklen = 1<<wl->wl_log_dev_bshift; 1929 int blocklen = 1<<wl->wl_log_dev_bshift;
1930 int bph; 1930 int bph;
1931 struct buf *bp; 1931 struct buf *bp;
1932 off_t off = *offp; 1932 off_t off = *offp;
1933 int error; 1933 int error;
1934 size_t padding; 1934 size_t padding;
1935 1935
1936 KASSERT(rw_write_held(&wl->wl_rwlock)); 1936 KASSERT(rw_write_held(&wl->wl_rwlock));
1937 1937
1938 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 1938 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1939 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 1939 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1940 1940
1941 bp = LIST_FIRST(&wl->wl_bufs); 1941 bp = LIST_FIRST(&wl->wl_bufs);
1942 1942
1943 while (bp) { 1943 while (bp) {
1944 int cnt; 1944 int cnt;
1945 struct buf *obp = bp; 1945 struct buf *obp = bp;
1946 1946
1947 KASSERT(bp->b_flags & B_LOCKED); 1947 KASSERT(bp->b_flags & B_LOCKED);
1948 1948
1949 wc->wc_type = WAPBL_WC_BLOCKS; 1949 wc->wc_type = WAPBL_WC_BLOCKS;
1950 wc->wc_len = blocklen; 1950 wc->wc_len = blocklen;
1951 wc->wc_blkcount = 0; 1951 wc->wc_blkcount = 0;
1952 while (bp && (wc->wc_blkcount < bph)) { 1952 while (bp && (wc->wc_blkcount < bph)) {
1953 /* 1953 /*
1954 * Make sure all the physical block numbers are up to 1954 * Make sure all the physical block numbers are up to
1955 * date. If this is not always true on a given 1955 * date. If this is not always true on a given
1956 * filesystem, then VOP_BMAP must be called. We 1956 * filesystem, then VOP_BMAP must be called. We
1957 * could call VOP_BMAP here, or else in the filesystem 1957 * could call VOP_BMAP here, or else in the filesystem
1958 * specific flush callback, although neither of those 1958 * specific flush callback, although neither of those
1959 * solutions allow us to take the vnode lock. If a 1959 * solutions allow us to take the vnode lock. If a
1960 * filesystem requires that we must take the vnode lock 1960 * filesystem requires that we must take the vnode lock
1961 * to call VOP_BMAP, then we can probably do it in 1961 * to call VOP_BMAP, then we can probably do it in
1962 * bwrite when the vnode lock should already be held 1962 * bwrite when the vnode lock should already be held
1963 * by the invoking code. 1963 * by the invoking code.
1964 */ 1964 */
1965 KASSERT((bp->b_vp->v_type == VBLK) || 1965 KASSERT((bp->b_vp->v_type == VBLK) ||
1966 (bp->b_blkno != bp->b_lblkno)); 1966 (bp->b_blkno != bp->b_lblkno));
1967 KASSERT(bp->b_blkno > 0); 1967 KASSERT(bp->b_blkno > 0);
1968 1968
1969 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno; 1969 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
1970 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount; 1970 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
1971 wc->wc_len += bp->b_bcount; 1971 wc->wc_len += bp->b_bcount;
1972 wc->wc_blkcount++; 1972 wc->wc_blkcount++;
1973 bp = LIST_NEXT(bp, b_wapbllist); 1973 bp = LIST_NEXT(bp, b_wapbllist);
1974 } 1974 }
1975 if (wc->wc_len % blocklen != 0) { 1975 if (wc->wc_len % blocklen != 0) {
1976 padding = blocklen - wc->wc_len % blocklen; 1976 padding = blocklen - wc->wc_len % blocklen;
1977 wc->wc_len += padding; 1977 wc->wc_len += padding;
1978 } else { 1978 } else {
1979 padding = 0; 1979 padding = 0;
1980 } 1980 }
1981 1981
1982 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 1982 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1983 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n", 1983 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
1984 wc->wc_len, padding, (intmax_t)off)); 1984 wc->wc_len, padding, (intmax_t)off));
1985 1985
1986 error = wapbl_circ_write(wl, wc, blocklen, &off); 1986 error = wapbl_circ_write(wl, wc, blocklen, &off);
1987 if (error) 1987 if (error)
1988 return error; 1988 return error;
1989 bp = obp; 1989 bp = obp;
1990 cnt = 0; 1990 cnt = 0;
1991 while (bp && (cnt++ < bph)) { 1991 while (bp && (cnt++ < bph)) {
1992 error = wapbl_circ_write(wl, bp->b_data, 1992 error = wapbl_circ_write(wl, bp->b_data,
1993 bp->b_bcount, &off); 1993 bp->b_bcount, &off);
1994 if (error) 1994 if (error)
1995 return error; 1995 return error;
1996 bp = LIST_NEXT(bp, b_wapbllist); 1996 bp = LIST_NEXT(bp, b_wapbllist);
1997 } 1997 }
1998 if (padding) { 1998 if (padding) {
1999 void *zero; 1999 void *zero;
2000  2000
2001 zero = wapbl_malloc(padding); 2001 zero = wapbl_malloc(padding);
2002 memset(zero, 0, padding); 2002 memset(zero, 0, padding);
2003 error = wapbl_circ_write(wl, zero, padding, &off); 2003 error = wapbl_circ_write(wl, zero, padding, &off);
2004 wapbl_free(zero); 2004 wapbl_free(zero);
2005 if (error) 2005 if (error)
2006 return error; 2006 return error;
2007 } 2007 }
2008 } 2008 }
2009 *offp = off; 2009 *offp = off;
2010 return 0; 2010 return 0;
2011} 2011}
2012 2012
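The length bookkeeping in wapbl_write_blocks rounds every record up to a whole number of log-device blocks; the sketch below isolates that rule in a hypothetical helper (I/O and the wapbl types omitted).

    #include <stddef.h>

    /*
     * Sketch of the padding rule above: wc_len counts the descriptor
     * block plus all buffer payloads, and the trailing partial block
     * is zero-filled.  E.g. len = 1000, blocklen = 512 -> padding 24.
     */
    static size_t
    pad_to_blocklen(size_t len, size_t blocklen)
    {
            if (len % blocklen != 0)
                    return blocklen - len % blocklen;
            return 0;
    }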
2013static int 2013static int
2014wapbl_write_revocations(struct wapbl *wl, off_t *offp) 2014wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2015{ 2015{
2016 struct wapbl_wc_blocklist *wc = 2016 struct wapbl_wc_blocklist *wc =
2017 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; 2017 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2018 int i; 2018 int i;
2019 int blocklen = 1<<wl->wl_log_dev_bshift; 2019 int blocklen = 1<<wl->wl_log_dev_bshift;
2020 int bph; 2020 int bph;
2021 off_t off = *offp; 2021 off_t off = *offp;
2022 int error; 2022 int error;
2023 2023
2024 if (wl->wl_dealloccnt == 0) 2024 if (wl->wl_dealloccnt == 0)
2025 return 0; 2025 return 0;
2026 2026
2027 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 2027 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2028 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 2028 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2029 2029
2030 i = 0; 2030 i = 0;
2031 while (i < wl->wl_dealloccnt) { 2031 while (i < wl->wl_dealloccnt) {
2032 wc->wc_type = WAPBL_WC_REVOCATIONS; 2032 wc->wc_type = WAPBL_WC_REVOCATIONS;
2033 wc->wc_len = blocklen; 2033 wc->wc_len = blocklen;
2034 wc->wc_blkcount = 0; 2034 wc->wc_blkcount = 0;
2035 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) { 2035 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
2036 wc->wc_blocks[wc->wc_blkcount].wc_daddr = 2036 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2037 wl->wl_deallocblks[i]; 2037 wl->wl_deallocblks[i];
2038 wc->wc_blocks[wc->wc_blkcount].wc_dlen = 2038 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2039 wl->wl_dealloclens[i]; 2039 wl->wl_dealloclens[i];
2040 wc->wc_blkcount++; 2040 wc->wc_blkcount++;
2041 i++; 2041 i++;
2042 } 2042 }
2043 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2043 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2044 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n", 2044 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2045 wc->wc_len, (intmax_t)off)); 2045 wc->wc_len, (intmax_t)off));
2046 error = wapbl_circ_write(wl, wc, blocklen, &off); 2046 error = wapbl_circ_write(wl, wc, blocklen, &off);
2047 if (error) 2047 if (error)
2048 return error; 2048 return error;
2049 } 2049 }
2050 *offp = off; 2050 *offp = off;
2051 return 0; 2051 return 0;
2052} 2052}
2053 2053
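Both record writers above size their batches with the same offsetof/sizeof idiom ("bph", blocks per header). Here is that computation on an illustrative layout; the real struct wapbl_wc_blocklist is defined elsewhere and may differ.

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative stand-in for the real on-disk record layout. */
    struct demo_blocklist {
            uint32_t wc_type;
            uint32_t wc_len;
            uint32_t wc_blkcount;
            struct {
                    int64_t wc_daddr;
                    int32_t wc_dlen;
            } wc_blocks[];          /* flexible array member */
    };

    /*
     * Entries that fit in one blocklen-sized record after the fixed
     * header: the same arithmetic as the bph/iph lines above.
     */
    static int
    entries_per_block(int blocklen)
    {
            return (blocklen - offsetof(struct demo_blocklist, wc_blocks)) /
                sizeof(((struct demo_blocklist *)0)->wc_blocks[0]);
    }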
2054static int 2054static int
2055wapbl_write_inodes(struct wapbl *wl, off_t *offp) 2055wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2056{ 2056{
2057 struct wapbl_wc_inodelist *wc = 2057 struct wapbl_wc_inodelist *wc =
2058 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch; 2058 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2059 int i; 2059 int i;
2060 int blocklen = 1 << wl->wl_log_dev_bshift; 2060 int blocklen = 1 << wl->wl_log_dev_bshift;
2061 off_t off = *offp; 2061 off_t off = *offp;
2062 int error; 2062 int error;
2063 2063
2064 struct wapbl_ino_head *wih; 2064 struct wapbl_ino_head *wih;
2065 struct wapbl_ino *wi; 2065 struct wapbl_ino *wi;
2066 int iph; 2066 int iph;
2067 2067
2068 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / 2068 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2069 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); 2069 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2070 2070
2071 i = 0; 2071 i = 0;
2072 wih = &wl->wl_inohash[0]; 2072 wih = &wl->wl_inohash[0];
2073 wi = 0; 2073 wi = 0;
2074 do { 2074 do {
2075 wc->wc_type = WAPBL_WC_INODES; 2075 wc->wc_type = WAPBL_WC_INODES;
2076 wc->wc_len = blocklen; 2076 wc->wc_len = blocklen;
2077 wc->wc_inocnt = 0; 2077 wc->wc_inocnt = 0;
2078 wc->wc_clear = (i == 0); 2078 wc->wc_clear = (i == 0);
2079 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) { 2079 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2080 while (!wi) { 2080 while (!wi) {
2081 KASSERT((wih - &wl->wl_inohash[0]) 2081 KASSERT((wih - &wl->wl_inohash[0])
2082 <= wl->wl_inohashmask); 2082 <= wl->wl_inohashmask);
2083 wi = LIST_FIRST(wih++); 2083 wi = LIST_FIRST(wih++);
2084 } 2084 }
2085 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino; 2085 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2086 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode; 2086 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2087 wc->wc_inocnt++; 2087 wc->wc_inocnt++;
2088 i++; 2088 i++;
2089 wi = LIST_NEXT(wi, wi_hash); 2089 wi = LIST_NEXT(wi, wi_hash);
2090 } 2090 }
2091 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2091 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2092 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n", 2092 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2093 wc->wc_len, (intmax_t)off)); 2093 wc->wc_len, (intmax_t)off));
2094 error = wapbl_circ_write(wl, wc, blocklen, &off); 2094 error = wapbl_circ_write(wl, wc, blocklen, &off);
2095 if (error) 2095 if (error)
2096 return error; 2096 return error;
2097 } while (i < wl->wl_inohashcnt); 2097 } while (i < wl->wl_inohashcnt);
2098  2098
2099 *offp = off; 2099 *offp = off;
2100 return 0; 2100 return 0;
2101} 2101}
2102 2102
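wapbl_write_inodes may need several records for one snapshot of the open-inode table, so only the first record of a dump sets wc_clear. The consumer-side pattern (see wapbl_replay_process_inodes further down) looks roughly like this sketch, with hypothetical types and realloc failure handling omitted.

    #include <stdlib.h>
    #include <string.h>

    struct ino_rec { unsigned ino, mode; };    /* illustrative */

    /*
     * A record with 'clear' set restarts the accumulated table; later
     * records of the same snapshot append.  (The kernel code grows the
     * table with wapbl_realloc instead of realloc.)
     */
    static void
    accumulate(struct ino_rec **tab, size_t *cnt, int clear,
        const struct ino_rec *recs, size_t n)
    {
            if (clear)
                    *cnt = 0;
            *tab = realloc(*tab, (*cnt + n) * sizeof(**tab));
            memcpy(*tab + *cnt, recs, n * sizeof(**tab));
            *cnt += n;
    }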
2103#endif /* _KERNEL */ 2103#endif /* _KERNEL */
2104 2104
2105/****************************************************************/ 2105/****************************************************************/
2106 2106
2107#ifdef _KERNEL 2107#ifdef _KERNEL
2108static struct pool wapbl_blk_pool; 2108static struct pool wapbl_blk_pool;
2109static int wapbl_blk_pool_refcount; 2109static int wapbl_blk_pool_refcount;
2110#endif 2110#endif
2111struct wapbl_blk { 2111struct wapbl_blk {
2112 LIST_ENTRY(wapbl_blk) wb_hash; 2112 LIST_ENTRY(wapbl_blk) wb_hash;
2113 daddr_t wb_blk; 2113 daddr_t wb_blk;
2114 off_t wb_off; /* Offset of this block in the log */ 2114 off_t wb_off; /* Offset of this block in the log */
2115}; 2115};
2116#define WAPBL_BLKPOOL_MIN 83 2116#define WAPBL_BLKPOOL_MIN 83
2117 2117
2118static void 2118static void
2119wapbl_blkhash_init(struct wapbl_replay *wr, u_int size) 2119wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2120{ 2120{
2121 if (size < WAPBL_BLKPOOL_MIN) 2121 if (size < WAPBL_BLKPOOL_MIN)
2122 size = WAPBL_BLKPOOL_MIN; 2122 size = WAPBL_BLKPOOL_MIN;
2123 KASSERT(wr->wr_blkhash == 0); 2123 KASSERT(wr->wr_blkhash == 0);
2124#ifdef _KERNEL 2124#ifdef _KERNEL
2125 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask); 2125 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2126 if (atomic_inc_uint_nv(&wapbl_blk_pool_refcount) == 1) { 2126 if (atomic_inc_uint_nv(&wapbl_blk_pool_refcount) == 1) {
2127 pool_init(&wapbl_blk_pool, sizeof(struct wapbl_blk), 0, 0, 0, 2127 pool_init(&wapbl_blk_pool, sizeof(struct wapbl_blk), 0, 0, 0,
2128 "wapblblkpl", &pool_allocator_nointr, IPL_NONE); 2128 "wapblblkpl", &pool_allocator_nointr, IPL_NONE);
2129 } 2129 }
2130#else /* ! _KERNEL */ 2130#else /* ! _KERNEL */
2131 /* Manually implement hashinit */ 2131 /* Manually implement hashinit */
2132 { 2132 {
2133 int i; 2133 int i;
2134 unsigned long hashsize; 2134 unsigned long hashsize;
2135 for (hashsize = 1; hashsize < size; hashsize <<= 1) 2135 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2136 continue; 2136 continue;
2137 wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash)); 2137 wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash));
2138 for (i = 0; i < wr->wr_blkhashmask; i++) 2138 for (i = 0; i < wr->wr_blkhashmask; i++)
2139 LIST_INIT(&wr->wr_blkhash[i]); 2139 LIST_INIT(&wr->wr_blkhash[i]);
2140 wr->wr_blkhashmask = hashsize - 1; 2140 wr->wr_blkhashmask = hashsize - 1;
2141 } 2141 }
2142#endif /* ! _KERNEL */ 2142#endif /* ! _KERNEL */
2143} 2143}
2144 2144
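The userland branch of wapbl_blkhash_init substitutes for hashinit(9) by rounding the requested size up to a power of two, so that hashsize - 1 can serve as a bucket mask. A standalone sketch of the sizing rule:

    /*
     * Round up to a power of two, as in the !_KERNEL branch above.
     * size 83 (WAPBL_BLKPOOL_MIN) -> hashsize 128, mask 0x7f, and
     * bucket selection becomes "blk & mask" (see wapbl_blkhash_get).
     */
    static unsigned long
    roundup_pow2(unsigned long size)
    {
            unsigned long hashsize;

            for (hashsize = 1; hashsize < size; hashsize <<= 1)
                    continue;
            return hashsize;
    }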
2145static void 2145static void
2146wapbl_blkhash_free(struct wapbl_replay *wr) 2146wapbl_blkhash_free(struct wapbl_replay *wr)
2147{ 2147{
2148 KASSERT(wr->wr_blkhashcnt == 0); 2148 KASSERT(wr->wr_blkhashcnt == 0);
2149#ifdef _KERNEL 2149#ifdef _KERNEL
2150 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask); 2150 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2151 if (atomic_dec_uint_nv(&wapbl_blk_pool_refcount) == 0) { 2151 if (atomic_dec_uint_nv(&wapbl_blk_pool_refcount) == 0) {
2152 pool_destroy(&wapbl_blk_pool); 2152 pool_destroy(&wapbl_blk_pool);
2153 } 2153 }
2154#else /* ! _KERNEL */ 2154#else /* ! _KERNEL */
2155 wapbl_free(wr->wr_blkhash); 2155 wapbl_free(wr->wr_blkhash);
2156#endif /* ! _KERNEL */ 2156#endif /* ! _KERNEL */
2157} 2157}
2158 2158
2159static struct wapbl_blk * 2159static struct wapbl_blk *
2160wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk) 2160wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2161{ 2161{
2162 struct wapbl_blk_head *wbh; 2162 struct wapbl_blk_head *wbh;
2163 struct wapbl_blk *wb; 2163 struct wapbl_blk *wb;
2164 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; 2164 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2165 LIST_FOREACH(wb, wbh, wb_hash) { 2165 LIST_FOREACH(wb, wbh, wb_hash) {
2166 if (blk == wb->wb_blk) 2166 if (blk == wb->wb_blk)
2167 return wb; 2167 return wb;
2168 } 2168 }
2169 return 0; 2169 return 0;
2170} 2170}
2171 2171
2172static void 2172static void
2173wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off) 2173wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2174{ 2174{
2175 struct wapbl_blk_head *wbh; 2175 struct wapbl_blk_head *wbh;
2176 struct wapbl_blk *wb; 2176 struct wapbl_blk *wb;
2177 wb = wapbl_blkhash_get(wr, blk); 2177 wb = wapbl_blkhash_get(wr, blk);
2178 if (wb) { 2178 if (wb) {
2179 KASSERT(wb->wb_blk == blk); 2179 KASSERT(wb->wb_blk == blk);
2180 wb->wb_off = off; 2180 wb->wb_off = off;
2181 } else { 2181 } else {
2182#ifdef _KERNEL 2182#ifdef _KERNEL
2183 wb = pool_get(&wapbl_blk_pool, PR_WAITOK); 2183 wb = pool_get(&wapbl_blk_pool, PR_WAITOK);
2184#else /* ! _KERNEL */ 2184#else /* ! _KERNEL */
2185 wb = wapbl_malloc(sizeof(*wb)); 2185 wb = wapbl_malloc(sizeof(*wb));
2186#endif /* ! _KERNEL */ 2186#endif /* ! _KERNEL */
2187 wb->wb_blk = blk; 2187 wb->wb_blk = blk;
2188 wb->wb_off = off; 2188 wb->wb_off = off;
2189 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; 2189 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2190 LIST_INSERT_HEAD(wbh, wb, wb_hash); 2190 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2191 wr->wr_blkhashcnt++; 2191 wr->wr_blkhashcnt++;
2192 } 2192 }
2193} 2193}
2194 2194
2195static void 2195static void
2196wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk) 2196wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2197{ 2197{
2198 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2198 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2199 if (wb) { 2199 if (wb) {
2200 KASSERT(wr->wr_blkhashcnt > 0); 2200 KASSERT(wr->wr_blkhashcnt > 0);
2201 wr->wr_blkhashcnt--; 2201 wr->wr_blkhashcnt--;
2202 LIST_REMOVE(wb, wb_hash); 2202 LIST_REMOVE(wb, wb_hash);
2203#ifdef _KERNEL 2203#ifdef _KERNEL
2204 pool_put(&wapbl_blk_pool, wb); 2204 pool_put(&wapbl_blk_pool, wb);
2205#else /* ! _KERNEL */ 2205#else /* ! _KERNEL */
2206 wapbl_free(wb); 2206 wapbl_free(wb);
2207#endif /* ! _KERNEL */ 2207#endif /* ! _KERNEL */
2208 } 2208 }
2209} 2209}
2210 2210
2211static void 2211static void
2212wapbl_blkhash_clear(struct wapbl_replay *wr) 2212wapbl_blkhash_clear(struct wapbl_replay *wr)
2213{ 2213{
2214 int i; 2214 int i;
2215 for (i = 0; i <= wr->wr_blkhashmask; i++) { 2215 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2216 struct wapbl_blk *wb; 2216 struct wapbl_blk *wb;
2217 2217
2218 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) { 2218 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2219 KASSERT(wr->wr_blkhashcnt > 0); 2219 KASSERT(wr->wr_blkhashcnt > 0);
2220 wr->wr_blkhashcnt--; 2220 wr->wr_blkhashcnt--;
2221 LIST_REMOVE(wb, wb_hash); 2221 LIST_REMOVE(wb, wb_hash);
2222#ifdef _KERNEL 2222#ifdef _KERNEL
2223 pool_put(&wapbl_blk_pool, wb); 2223 pool_put(&wapbl_blk_pool, wb);
2224#else /* ! _KERNEL */ 2224#else /* ! _KERNEL */
2225 wapbl_free(wb); 2225 wapbl_free(wb);
2226#endif /* ! _KERNEL */ 2226#endif /* ! _KERNEL */
2227 } 2227 }
2228 } 2228 }
2229 KASSERT(wr->wr_blkhashcnt == 0); 2229 KASSERT(wr->wr_blkhashcnt == 0);
2230} 2230}
2231 2231
2232/****************************************************************/ 2232/****************************************************************/
2233 2233
2234static int 2234static int
2235wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp) 2235wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2236{ 2236{
2237 size_t slen; 2237 size_t slen;
2238 off_t off = *offp; 2238 off_t off = *offp;
2239 int error; 2239 int error;
2240 2240
2241 KASSERT(((len >> wr->wr_log_dev_bshift) << 2241 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2242 wr->wr_log_dev_bshift) == len); 2242 wr->wr_log_dev_bshift) == len);
2243 if (off < wr->wr_circ_off) 2243 if (off < wr->wr_circ_off)
2244 off = wr->wr_circ_off; 2244 off = wr->wr_circ_off;
2245 slen = wr->wr_circ_off + wr->wr_circ_size - off; 2245 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2246 if (slen < len) { 2246 if (slen < len) {
2247 error = wapbl_read(data, slen, wr->wr_devvp, 2247 error = wapbl_read(data, slen, wr->wr_devvp,
2248 wr->wr_logpbn + (off >> wr->wr_log_dev_bshift)); 2248 wr->wr_logpbn + (off >> wr->wr_log_dev_bshift));
2249 if (error) 2249 if (error)
2250 return error; 2250 return error;
2251 data = (uint8_t *)data + slen; 2251 data = (uint8_t *)data + slen;
2252 len -= slen; 2252 len -= slen;
2253 off = wr->wr_circ_off; 2253 off = wr->wr_circ_off;
2254 } 2254 }
2255 error = wapbl_read(data, len, wr->wr_devvp, 2255 error = wapbl_read(data, len, wr->wr_devvp,
2256 wr->wr_logpbn + (off >> wr->wr_log_dev_bshift)); 2256 wr->wr_logpbn + (off >> wr->wr_log_dev_bshift));
2257 if (error) 2257 if (error)
2258 return error; 2258 return error;
2259 off += len; 2259 off += len;
2260 if (off >= wr->wr_circ_off + wr->wr_circ_size) 2260 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2261 off = wr->wr_circ_off; 2261 off = wr->wr_circ_off;
2262 *offp = off; 2262 *offp = off;
2263 return 0; 2263 return 0;
2264} 2264}
2265 2265
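wapbl_circ_read and wapbl_circ_advance share the same wrap arithmetic over the ring [wr_circ_off, wr_circ_off + wr_circ_size). The hypothetical helper below isolates the split of a span that crosses the end of the ring; no I/O, just the arithmetic.

    #include <sys/types.h>          /* off_t */
    #include <stddef.h>

    /*
     * Split a span at the ring boundary: 'first' bytes finish out the
     * ring, 'second' bytes continue at circ_off.
     */
    static void
    circ_split(off_t off, size_t len, off_t circ_off, size_t circ_size,
        size_t *first, size_t *second)
    {
            size_t slen = circ_off + circ_size - off;   /* bytes to end */

            if (slen < len) {
                    *first = slen;
                    *second = len - slen;
            } else {
                    *first = len;
                    *second = 0;
            }
    }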
2266static void 2266static void
2267wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp) 2267wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2268{ 2268{
2269 size_t slen; 2269 size_t slen;
2270 off_t off = *offp; 2270 off_t off = *offp;
2271 2271
2272 KASSERT(((len >> wr->wr_log_dev_bshift) << 2272 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2273 wr->wr_log_dev_bshift) == len); 2273 wr->wr_log_dev_bshift) == len);
2274 2274
2275 if (off < wr->wr_circ_off) 2275 if (off < wr->wr_circ_off)
2276 off = wr->wr_circ_off; 2276 off = wr->wr_circ_off;
2277 slen = wr->wr_circ_off + wr->wr_circ_size - off; 2277 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2278 if (slen < len) { 2278 if (slen < len) {
2279 len -= slen; 2279 len -= slen;
2280 off = wr->wr_circ_off; 2280 off = wr->wr_circ_off;
2281 } 2281 }
2282 off += len; 2282 off += len;
2283 if (off >= wr->wr_circ_off + wr->wr_circ_size) 2283 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2284 off = wr->wr_circ_off; 2284 off = wr->wr_circ_off;
2285 *offp = off; 2285 *offp = off;
2286} 2286}
2287 2287
2288/****************************************************************/ 2288/****************************************************************/
2289 2289
2290int 2290int
2291wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp, 2291wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2292 daddr_t off, size_t count, size_t blksize) 2292 daddr_t off, size_t count, size_t blksize)
2293{ 2293{
2294 struct wapbl_replay *wr; 2294 struct wapbl_replay *wr;
2295 int error; 2295 int error;
2296 struct vnode *devvp; 2296 struct vnode *devvp;
2297 daddr_t logpbn; 2297 daddr_t logpbn;
2298 uint8_t *scratch; 2298 uint8_t *scratch;
2299 struct wapbl_wc_header *wch; 2299 struct wapbl_wc_header *wch;
2300 struct wapbl_wc_header *wch2; 2300 struct wapbl_wc_header *wch2;
2301 /* Use this until we read the actual log header */ 2301 /* Use this until we read the actual log header */
2302 int log_dev_bshift = DEV_BSHIFT; 2302 int log_dev_bshift = DEV_BSHIFT;
2303 size_t used; 2303 size_t used;
2304 2304
2305 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 2305 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2306 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n", 2306 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2307 vp, off, count, blksize)); 2307 vp, off, count, blksize));
2308 2308
2309 if (off < 0) 2309 if (off < 0)
2310 return EINVAL; 2310 return EINVAL;
2311 2311
2312 if (blksize < DEV_BSIZE) 2312 if (blksize < DEV_BSIZE)
2313 return EINVAL; 2313 return EINVAL;
2314 if (blksize % DEV_BSIZE) 2314 if (blksize % DEV_BSIZE)
2315 return EINVAL; 2315 return EINVAL;
2316 2316
2317#ifdef _KERNEL 2317#ifdef _KERNEL
2318#if 0 2318#if 0
2319 /* XXX vp->v_size isn't reliably set for VBLK devices, 2319 /* XXX vp->v_size isn't reliably set for VBLK devices,
2320 * especially root. However, we might still want to verify 2320 * especially root. However, we might still want to verify
2321 * that the full load is readable */ 2321 * that the full load is readable */
2322 if ((off + count) * blksize > vp->v_size) 2322 if ((off + count) * blksize > vp->v_size)
2323 return EINVAL; 2323 return EINVAL;
2324#endif 2324#endif
2325 2325
2326 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) { 2326 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2327 return error; 2327 return error;
2328 } 2328 }
2329#else /* ! _KERNEL */ 2329#else /* ! _KERNEL */
2330 devvp = vp; 2330 devvp = vp;
2331 logpbn = off; 2331 logpbn = off;
2332#endif /* ! _KERNEL */ 2332#endif /* ! _KERNEL */
2333 2333
2334 scratch = wapbl_malloc(MAXBSIZE); 2334 scratch = wapbl_malloc(MAXBSIZE);
2335 2335
2336 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, logpbn); 2336 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, logpbn);
2337 if (error) 2337 if (error)
2338 goto errout; 2338 goto errout;
2339 2339
2340 wch = (struct wapbl_wc_header *)scratch; 2340 wch = (struct wapbl_wc_header *)scratch;
2341 wch2 = 2341 wch2 =
2342 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift)); 2342 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2343 /* XXX verify checksums and magic numbers */ 2343 /* XXX verify checksums and magic numbers */
2344 if (wch->wc_type != WAPBL_WC_HEADER) { 2344 if (wch->wc_type != WAPBL_WC_HEADER) {
2345 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type); 2345 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2346 error = EFTYPE; 2346 error = EFTYPE;
2347 goto errout; 2347 goto errout;
2348 } 2348 }
2349 2349
2350 if (wch2->wc_generation > wch->wc_generation) 2350 if (wch2->wc_generation > wch->wc_generation)
2351 wch = wch2; 2351 wch = wch2;
2352 2352
2353 wr = wapbl_calloc(1, sizeof(*wr)); 2353 wr = wapbl_calloc(1, sizeof(*wr));
2354 2354
2355 wr->wr_logvp = vp; 2355 wr->wr_logvp = vp;
2356 wr->wr_devvp = devvp; 2356 wr->wr_devvp = devvp;
2357 wr->wr_logpbn = logpbn; 2357 wr->wr_logpbn = logpbn;
2358 2358
2359 wr->wr_scratch = scratch; 2359 wr->wr_scratch = scratch;
2360 2360
2361 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift; 2361 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2362 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift; 2362 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2363 wr->wr_circ_off = wch->wc_circ_off; 2363 wr->wr_circ_off = wch->wc_circ_off;
2364 wr->wr_circ_size = wch->wc_circ_size; 2364 wr->wr_circ_size = wch->wc_circ_size;
2365 wr->wr_generation = wch->wc_generation; 2365 wr->wr_generation = wch->wc_generation;
2366 2366
2367 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail); 2367 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2368 2368
2369 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 2369 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2370 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64 2370 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2371 " len=%"PRId64" used=%zu\n", 2371 " len=%"PRId64" used=%zu\n",
2372 wch->wc_head, wch->wc_tail, wch->wc_circ_off, 2372 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2373 wch->wc_circ_size, used)); 2373 wch->wc_circ_size, used));
2374 2374
2375 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift)); 2375 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2376 2376
2377 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail); 2377 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
2378 if (error) { 2378 if (error) {
2379 wapbl_replay_stop(wr); 2379 wapbl_replay_stop(wr);
2380 wapbl_replay_free(wr); 2380 wapbl_replay_free(wr);
2381 return error; 2381 return error;
2382 } 2382 }
2383 2383
2384 *wrp = wr; 2384 *wrp = wr;
2385 return 0; 2385 return 0;
2386 2386
2387 errout: 2387 errout:
2388 wapbl_free(scratch); 2388 wapbl_free(scratch);
2389 return error; 2389 return error;
2390} 2390}
2391 2391
2392void 2392void
2393wapbl_replay_stop(struct wapbl_replay *wr) 2393wapbl_replay_stop(struct wapbl_replay *wr)
2394{ 2394{
2395 2395
2396 if (!wapbl_replay_isopen(wr)) 2396 if (!wapbl_replay_isopen(wr))
2397 return; 2397 return;
2398 2398
2399 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n")); 2399 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2400 2400
2401 wapbl_free(wr->wr_scratch); 2401 wapbl_free(wr->wr_scratch);
2402 wr->wr_scratch = 0; 2402 wr->wr_scratch = 0;
2403 2403
2404 wr->wr_logvp = 0; 2404 wr->wr_logvp = 0;
2405 2405
2406 wapbl_blkhash_clear(wr); 2406 wapbl_blkhash_clear(wr);
2407 wapbl_blkhash_free(wr); 2407 wapbl_blkhash_free(wr);
2408} 2408}
2409 2409
2410void 2410void
2411wapbl_replay_free(struct wapbl_replay *wr) 2411wapbl_replay_free(struct wapbl_replay *wr)
2412{ 2412{
2413 2413
2414 KDASSERT(!wapbl_replay_isopen(wr)); 2414 KDASSERT(!wapbl_replay_isopen(wr));
2415 2415
2416 if (wr->wr_inodes) 2416 if (wr->wr_inodes)
2417 wapbl_free(wr->wr_inodes); 2417 wapbl_free(wr->wr_inodes);
2418 wapbl_free(wr); 2418 wapbl_free(wr);
2419} 2419}
2420 2420
2421#ifdef _KERNEL 2421#ifdef _KERNEL
2422int 2422int
2423wapbl_replay_isopen1(struct wapbl_replay *wr) 2423wapbl_replay_isopen1(struct wapbl_replay *wr)
2424{ 2424{
2425 2425
2426 return wapbl_replay_isopen(wr); 2426 return wapbl_replay_isopen(wr);
2427} 2427}
2428#endif 2428#endif
2429 2429
2430static void 2430static void
2431wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp) 2431wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2432{ 2432{
2433 struct wapbl_wc_blocklist *wc = 2433 struct wapbl_wc_blocklist *wc =
2434 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2434 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2435 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2435 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2436 int i, j, n; 2436 int i, j, n;
2437 2437
2438 for (i = 0; i < wc->wc_blkcount; i++) { 2438 for (i = 0; i < wc->wc_blkcount; i++) {
2439 /* 2439 /*
2440 * Enter each physical block into the hashtable independently. 2440 * Enter each physical block into the hashtable independently.
2441 */ 2441 */
2442 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift; 2442 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2443 for (j = 0; j < n; j++) { 2443 for (j = 0; j < n; j++) {
2444 wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + j, 2444 wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + j,
2445 *offp); 2445 *offp);
2446 wapbl_circ_advance(wr, fsblklen, offp); 2446 wapbl_circ_advance(wr, fsblklen, offp);
2447 } 2447 }
2448 } 2448 }
2449} 2449}
2450 2450
2451static void 2451static void
2452wapbl_replay_process_revocations(struct wapbl_replay *wr) 2452wapbl_replay_process_revocations(struct wapbl_replay *wr)
2453{ 2453{
2454 struct wapbl_wc_blocklist *wc = 2454 struct wapbl_wc_blocklist *wc =
2455 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2455 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2456 int i, j, n; 2456 int i, j, n;
2457 2457
2458 for (i = 0; i < wc->wc_blkcount; i++) { 2458 for (i = 0; i < wc->wc_blkcount; i++) {
2459 /* 2459 /*
2460 * Remove any blocks found from the hashtable. 2460 * Remove any blocks found from the hashtable.
2461 */ 2461 */
2462 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift; 2462 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2463 for (j = 0; j < n; j++) 2463 for (j = 0; j < n; j++)
2464 wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + j); 2464 wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + j);
2465 } 2465 }
2466} 2466}
2467 2467
2468static void 2468static void
2469wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff) 2469wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
2470{ 2470{
2471 struct wapbl_wc_inodelist *wc = 2471 struct wapbl_wc_inodelist *wc =
2472 (struct wapbl_wc_inodelist *)wr->wr_scratch; 2472 (struct wapbl_wc_inodelist *)wr->wr_scratch;
2473 /* 2473 /*
2474 * Keep track of where we found this so location won't be 2474 * Keep track of where we found this so location won't be
2475 * overwritten. 2475 * overwritten.
2476 */ 2476 */
2477 if (wc->wc_clear) { 2477 if (wc->wc_clear) {
2478 wr->wr_inodestail = oldoff; 2478 wr->wr_inodestail = oldoff;
2479 wr->wr_inodescnt = 0; 2479 wr->wr_inodescnt = 0;
2480 if (wr->wr_inodes != NULL) { 2480 if (wr->wr_inodes != NULL) {
2481 wapbl_free(wr->wr_inodes); 2481 wapbl_free(wr->wr_inodes);
2482 wr->wr_inodes = NULL; 2482 wr->wr_inodes = NULL;
2483 } 2483 }
2484 } 2484 }
2485 wr->wr_inodeshead = newoff; 2485 wr->wr_inodeshead = newoff;
2486 if (wc->wc_inocnt == 0) 2486 if (wc->wc_inocnt == 0)
2487 return; 2487 return;
2488 2488
2489 wr->wr_inodes = wapbl_realloc(wr->wr_inodes, 2489 wr->wr_inodes = wapbl_realloc(wr->wr_inodes,
2490 (wr->wr_inodescnt + wc->wc_inocnt) * sizeof(wc->wc_inodes[0])); 2490 (wr->wr_inodescnt + wc->wc_inocnt) * sizeof(wc->wc_inodes[0]));
2491 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes, 2491 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
2492 wc->wc_inocnt * sizeof(wc->wc_inodes[0])); 2492 wc->wc_inocnt * sizeof(wc->wc_inodes[0]));
2493 wr->wr_inodescnt += wc->wc_inocnt; 2493 wr->wr_inodescnt += wc->wc_inocnt;
2494} 2494}
2495 2495
2496static int 2496static int
2497wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail) 2497wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
2498{ 2498{
2499 off_t off; 2499 off_t off;
2500 int error; 2500 int error;
2501 2501
2502 int logblklen = 1 << wr->wr_log_dev_bshift; 2502 int logblklen = 1 << wr->wr_log_dev_bshift;
2503 2503
2504 wapbl_blkhash_clear(wr); 2504 wapbl_blkhash_clear(wr);
2505 2505
2506 off = tail; 2506 off = tail;
2507 while (off != head) { 2507 while (off != head) {
2508 struct wapbl_wc_null *wcn; 2508 struct wapbl_wc_null *wcn;
2509 off_t saveoff = off; 2509 off_t saveoff = off;
2510 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); 2510 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2511 if (error) 2511 if (error)
2512 goto errout; 2512 goto errout;
2513 wcn = (struct wapbl_wc_null *)wr->wr_scratch; 2513 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2514 switch (wcn->wc_type) { 2514 switch (wcn->wc_type) {
2515 case WAPBL_WC_BLOCKS: 2515 case WAPBL_WC_BLOCKS:
2516 wapbl_replay_process_blocks(wr, &off); 2516 wapbl_replay_process_blocks(wr, &off);
2517 break; 2517 break;
2518 2518
2519 case WAPBL_WC_REVOCATIONS: 2519 case WAPBL_WC_REVOCATIONS:
2520 wapbl_replay_process_revocations(wr); 2520 wapbl_replay_process_revocations(wr);
2521 break; 2521 break;
2522 2522
2523 case WAPBL_WC_INODES: 2523 case WAPBL_WC_INODES:
2524 wapbl_replay_process_inodes(wr, saveoff, off); 2524 wapbl_replay_process_inodes(wr, saveoff, off);
2525 break; 2525 break;
2526 2526
2527 default: 2527 default:
2528 printf("Unrecognized wapbl type: 0x%08x\n", 2528 printf("Unrecognized wapbl type: 0x%08x\n",
2529 wcn->wc_type); 2529 wcn->wc_type);
2530 error = EFTYPE; 2530 error = EFTYPE;
2531 goto errout; 2531 goto errout;
2532 } 2532 }
2533 wapbl_circ_advance(wr, wcn->wc_len, &saveoff); 2533 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2534 if (off != saveoff) { 2534 if (off != saveoff) {
2535 printf("wapbl_replay: corrupted records\n"); 2535 printf("wapbl_replay: corrupted records\n");
2536 error = EFTYPE; 2536 error = EFTYPE;
2537 goto errout; 2537 goto errout;
2538 } 2538 }
2539 } 2539 }
2540 return 0; 2540 return 0;
2541 2541
2542 errout: 2542 errout:
2543 wapbl_blkhash_clear(wr); 2543 wapbl_blkhash_clear(wr);
2544 return error; 2544 return error;
2545} 2545}
2546 2546
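The scan in wapbl_replay_process carries a second cursor, saveoff, advanced by the declared record length; comparing it with the cursor produced by actually parsing the record body is what catches truncated or corrupt records. A sketch of just that invariant, with a minimal stand-in for the common header:

    #include <stdint.h>
    #include <sys/types.h>

    struct demo_wc_null {            /* stand-in for wapbl_wc_null */
            uint32_t wc_type;
            int32_t  wc_len;         /* block-aligned record length */
    };

    /*
     * After dispatching one record, the offset reached by parsing it
     * must match saveoff advanced by wc_len.  (The real code advances
     * with wapbl_circ_advance so both cursors wrap identically.)
     */
    static int
    record_consistent(off_t off, off_t saveoff, const struct demo_wc_null *wcn)
    {
            return off == saveoff + wcn->wc_len;
    }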
2547#if 0 2547#if 0
2548int 2548int
2549wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp) 2549wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2550{ 2550{
2551 off_t off; 2551 off_t off;
2552 int mismatchcnt = 0; 2552 int mismatchcnt = 0;
2553 int logblklen = 1 << wr->wr_log_dev_bshift; 2553 int logblklen = 1 << wr->wr_log_dev_bshift;
2554 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2554 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2555 void *scratch1 = wapbl_malloc(MAXBSIZE); 2555 void *scratch1 = wapbl_malloc(MAXBSIZE);
2556 void *scratch2 = wapbl_malloc(MAXBSIZE); 2556 void *scratch2 = wapbl_malloc(MAXBSIZE);
2557 int error = 0; 2557 int error = 0;
2558 2558
2559 KDASSERT(wapbl_replay_isopen(wr)); 2559 KDASSERT(wapbl_replay_isopen(wr));
2560 2560
2561 off = wch->wc_tail; 2561 off = wch->wc_tail;
2562 while (off != wch->wc_head) { 2562 while (off != wch->wc_head) {
2563 struct wapbl_wc_null *wcn; 2563 struct wapbl_wc_null *wcn;
2564#ifdef DEBUG 2564#ifdef DEBUG
2565 off_t saveoff = off; 2565 off_t saveoff = off;
2566#endif 2566#endif
2567 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); 2567 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2568 if (error) 2568 if (error)
2569 goto out; 2569 goto out;
2570 wcn = (struct wapbl_wc_null *)wr->wr_scratch; 2570 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2571 switch (wcn->wc_type) { 2571 switch (wcn->wc_type) {
2572 case WAPBL_WC_BLOCKS: 2572 case WAPBL_WC_BLOCKS:
2573 { 2573 {
2574 struct wapbl_wc_blocklist *wc = 2574 struct wapbl_wc_blocklist *wc =
2575 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2575 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2576 int i; 2576 int i;
2577 for (i = 0; i < wc->wc_blkcount; i++) { 2577 for (i = 0; i < wc->wc_blkcount; i++) {
2578 int foundcnt = 0; 2578 int foundcnt = 0;
2579 int dirtycnt = 0; 2579 int dirtycnt = 0;
2580 int j, n; 2580 int j, n;
2581 /* 2581 /*
2582 * Check each physical block into the 2582 * Check each physical block into the
2583 * hashtable independently 2583 * hashtable independently
2584 */ 2584 */
2585 n = wc->wc_blocks[i].wc_dlen >> 2585 n = wc->wc_blocks[i].wc_dlen >>
2586 wch->wc_fs_dev_bshift; 2586 wch->wc_fs_dev_bshift;
2587 for (j = 0; j < n; j++) { 2587 for (j = 0; j < n; j++) {
2588 struct wapbl_blk *wb = 2588 struct wapbl_blk *wb =
2589 wapbl_blkhash_get(wr, 2589 wapbl_blkhash_get(wr,
2590 wc->wc_blocks[i].wc_daddr + j); 2590 wc->wc_blocks[i].wc_daddr + j);
2591 if (wb && (wb->wb_off == off)) { 2591 if (wb && (wb->wb_off == off)) {
2592 foundcnt++; 2592 foundcnt++;
2593 error = 2593 error =
2594 wapbl_circ_read(wr, 2594 wapbl_circ_read(wr,
2595 scratch1, fsblklen, 2595 scratch1, fsblklen,
2596 &off); 2596 &off);
2597 if (error) 2597 if (error)
2598 goto out; 2598 goto out;
2599 error = 2599 error =
2600 wapbl_read(scratch2, 2600 wapbl_read(scratch2,
2601 fsblklen, fsdevvp, 2601 fsblklen, fsdevvp,
2602 wb->wb_blk); 2602 wb->wb_blk);
2603 if (error) 2603 if (error)
2604 goto out; 2604 goto out;
2605 if (memcmp(scratch1, 2605 if (memcmp(scratch1,
2606 scratch2, 2606 scratch2,
2607 fsblklen)) { 2607 fsblklen)) {
2608 printf( 2608 printf(
2609 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n", 2609 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
2610 wb->wb_blk, (intmax_t)off); 2610 wb->wb_blk, (intmax_t)off);
2611 dirtycnt++; 2611 dirtycnt++;
2612 mismatchcnt++; 2612 mismatchcnt++;
2613 } 2613 }
2614 } else { 2614 } else {
2615 wapbl_circ_advance(wr, 2615 wapbl_circ_advance(wr,
2616 fsblklen, &off); 2616 fsblklen, &off);
2617 } 2617 }
2618 } 2618 }
2619#if 0 2619#if 0
2620 /* 2620 /*
2621 * If all of the blocks in an entry 2621 * If all of the blocks in an entry
2622 * are clean, then remove all of its 2622 * are clean, then remove all of its
2623 * blocks from the hashtable since they 2623 * blocks from the hashtable since they
2624 * never will need replay. 2624 * never will need replay.
2625 */ 2625 */
2626 if ((foundcnt != 0) && 2626 if ((foundcnt != 0) &&
2627 (dirtycnt == 0)) { 2627 (dirtycnt == 0)) {
2628 off = saveoff; 2628 off = saveoff;
2629 wapbl_circ_advance(wr, 2629 wapbl_circ_advance(wr,
2630 logblklen, &off); 2630 logblklen, &off);
2631 for (j = 0; j < n; j++) { 2631 for (j = 0; j < n; j++) {
2632 struct wapbl_blk *wb = 2632 struct wapbl_blk *wb =
2633 wapbl_blkhash_get(wr, 2633 wapbl_blkhash_get(wr,
2634 wc->wc_blocks[i].wc_daddr + j); 2634 wc->wc_blocks[i].wc_daddr + j);
2635 if (wb && 2635 if (wb &&
2636 (wb->wb_off == off)) { 2636 (wb->wb_off == off)) {
2637 wapbl_blkhash_rem(wr, wb->wb_blk); 2637 wapbl_blkhash_rem(wr, wb->wb_blk);
2638 } 2638 }
2639 wapbl_circ_advance(wr, 2639 wapbl_circ_advance(wr,
2640 fsblklen, &off); 2640 fsblklen, &off);
2641 } 2641 }
2642 } 2642 }
2643#endif 2643#endif
2644 } 2644 }
2645 } 2645 }
2646 break; 2646 break;
2647 case WAPBL_WC_REVOCATIONS: 2647 case WAPBL_WC_REVOCATIONS:
2648 case WAPBL_WC_INODES: 2648 case WAPBL_WC_INODES:
2649 break; 2649 break;
2650 default: 2650 default:
2651 KASSERT(0); 2651 KASSERT(0);
2652 } 2652 }
2653#ifdef DEBUG 2653#ifdef DEBUG
2654 wapbl_circ_advance(wr, wcn->wc_len, &saveoff); 2654 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2655 KASSERT(off == saveoff); 2655 KASSERT(off == saveoff);
2656#endif 2656#endif
2657 } 2657 }
2658 out: 2658 out:
2659 wapbl_free(scratch1); 2659 wapbl_free(scratch1);
2660 wapbl_free(scratch2); 2660 wapbl_free(scratch2);
2661 if (!error && mismatchcnt) 2661 if (!error && mismatchcnt)
2662 error = EFTYPE; 2662 error = EFTYPE;
2663 return error; 2663 return error;
2664} 2664}
2665#endif 2665#endif
2666 2666
2667int 2667int
2668wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp) 2668wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2669{ 2669{
2670 struct wapbl_blk *wb; 2670 struct wapbl_blk *wb;
2671 size_t i; 2671 size_t i;
2672 off_t off; 2672 off_t off;
2673 void *scratch; 2673 void *scratch;
2674 int error = 0; 2674 int error = 0;
2675 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2675 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2676 2676
2677 KDASSERT(wapbl_replay_isopen(wr)); 2677 KDASSERT(wapbl_replay_isopen(wr));
2678 2678
2679 scratch = wapbl_malloc(MAXBSIZE); 2679 scratch = wapbl_malloc(MAXBSIZE);
2680 2680
2681 for (i = 0; i < wr->wr_blkhashmask; ++i) { 2681 for (i = 0; i < wr->wr_blkhashmask; ++i) {
2682 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) { 2682 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
2683 off = wb->wb_off; 2683 off = wb->wb_off;
2684 error = wapbl_circ_read(wr, scratch, fsblklen, &off); 2684 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
2685 if (error) 2685 if (error)
2686 break; 2686 break;
2687 error = wapbl_write(scratch, fsblklen, fsdevvp, 2687 error = wapbl_write(scratch, fsblklen, fsdevvp,
2688 wb->wb_blk); 2688 wb->wb_blk);
2689 if (error) 2689 if (error)
2690 break; 2690 break;
2691 } 2691 }
2692 } 2692 }
2693 2693
2694 wapbl_free(scratch); 2694 wapbl_free(scratch);
2695 return error; 2695 return error;
2696} 2696}
2697 2697
2698int 2698int
2699wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len) 2699wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
2700{ 2700{
2701 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2701 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2702 2702
2703 KDASSERT(wapbl_replay_isopen(wr)); 2703 KDASSERT(wapbl_replay_isopen(wr));
2704 KASSERT((len % fsblklen) == 0); 2704 KASSERT((len % fsblklen) == 0);
2705 2705
2706 while (len != 0) { 2706 while (len != 0) {
2707 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2707 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2708 if (wb) 2708 if (wb)
2709 return 1; 2709 return 1;
 2710 len -= fsblklen; blk++; 2710 len -= fsblklen; blk++;
2711 } 2711 }
2712 return 0; 2712 return 0;
2713} 2713}
2714 2714
2715int 2715int
2716wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len) 2716wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
2717{ 2717{
2718 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2718 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2719 2719
2720 KDASSERT(wapbl_replay_isopen(wr)); 2720 KDASSERT(wapbl_replay_isopen(wr));
2721 2721
2722 KASSERT((len % fsblklen) == 0); 2722 KASSERT((len % fsblklen) == 0);
2723 2723
2724 while (len != 0) { 2724 while (len != 0) {
2725 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2725 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2726 if (wb) { 2726 if (wb) {
2727 off_t off = wb->wb_off; 2727 off_t off = wb->wb_off;
2728 int error; 2728 int error;
2729 error = wapbl_circ_read(wr, data, fsblklen, &off); 2729 error = wapbl_circ_read(wr, data, fsblklen, &off);
2730 if (error) 2730 if (error)
2731 return error; 2731 return error;
2732 } 2732 }
2733 data = (uint8_t *)data + fsblklen; 2733 data = (uint8_t *)data + fsblklen;
2734 len -= fsblklen; 2734 len -= fsblklen;
2735 blk++; 2735 blk++;
2736 } 2736 }
2737 return 0; 2737 return 0;
2738} 2738}
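wapbl_replay_can_read and wapbl_replay_read together implement a read overlay: data already read from the filesystem device is patched, one fs block at a time, with any newer copy still sitting in the log. A self-contained sketch of that pattern, with lookup() standing in for wapbl_blkhash_get plus wapbl_circ_read:

    #include <stdint.h>
    #include <string.h>

    /*
     * Patch log copies over device data, block by block.  'lookup'
     * returns the journaled copy of a block or NULL; hypothetical
     * signature, since the real code reads the copy out of the
     * circular log rather than returning a pointer.
     */
    static void
    overlay(uint8_t *data, int64_t blk, long len, int fsblklen,
        const void *(*lookup)(int64_t))
    {
            while (len != 0) {
                    const void *logcopy = lookup(blk);

                    if (logcopy != NULL)
                            memcpy(data, logcopy, fsblklen);
                    data += fsblklen;
                    len -= fsblklen;
                    blk++;
            }
    }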