Thu Apr 1 06:26:00 2021 UTC
Add a sysctl hashstat collector for bufhash.


(simonb)
diff -r1.297 -r1.298 src/sys/kern/vfs_bio.c

cvs diff -r1.297 -r1.298 src/sys/kern/vfs_bio.c

--- src/sys/kern/vfs_bio.c 2020/07/31 04:07:30 1.297
+++ src/sys/kern/vfs_bio.c 2021/04/01 06:25:59 1.298
@@ -1,2186 +1,2223 @@
1/* $NetBSD: vfs_bio.c,v 1.297 2020/07/31 04:07:30 chs Exp $ */ 1/* $NetBSD: vfs_bio.c,v 1.298 2021/04/01 06:25:59 simonb Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2007, 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2007, 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran, and by Wasabi Systems, Inc. 8 * by Andrew Doran, and by Wasabi Systems, Inc.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright 15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the 16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution. 17 * documentation and/or other materials provided with the distribution.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32/*- 32/*-
33 * Copyright (c) 1982, 1986, 1989, 1993 33 * Copyright (c) 1982, 1986, 1989, 1993
34 * The Regents of the University of California. All rights reserved. 34 * The Regents of the University of California. All rights reserved.
35 * (c) UNIX System Laboratories, Inc. 35 * (c) UNIX System Laboratories, Inc.
36 * All or some portions of this file are derived from material licensed 36 * All or some portions of this file are derived from material licensed
37 * to the University of California by American Telephone and Telegraph 37 * to the University of California by American Telephone and Telegraph
38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39 * the permission of UNIX System Laboratories, Inc. 39 * the permission of UNIX System Laboratories, Inc.
40 * 40 *
41 * Redistribution and use in source and binary forms, with or without 41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions 42 * modification, are permitted provided that the following conditions
43 * are met: 43 * are met:
44 * 1. Redistributions of source code must retain the above copyright 44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer. 45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright 46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the 47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution. 48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors 49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software 50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission. 51 * without specific prior written permission.
52 * 52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE. 63 * SUCH DAMAGE.
64 * 64 *
65 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 65 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
66 */ 66 */
67 67
68/*- 68/*-
69 * Copyright (c) 1994 Christopher G. Demetriou 69 * Copyright (c) 1994 Christopher G. Demetriou
70 * 70 *
71 * Redistribution and use in source and binary forms, with or without 71 * Redistribution and use in source and binary forms, with or without
72 * modification, are permitted provided that the following conditions 72 * modification, are permitted provided that the following conditions
73 * are met: 73 * are met:
74 * 1. Redistributions of source code must retain the above copyright 74 * 1. Redistributions of source code must retain the above copyright
75 * notice, this list of conditions and the following disclaimer. 75 * notice, this list of conditions and the following disclaimer.
76 * 2. Redistributions in binary form must reproduce the above copyright 76 * 2. Redistributions in binary form must reproduce the above copyright
77 * notice, this list of conditions and the following disclaimer in the 77 * notice, this list of conditions and the following disclaimer in the
78 * documentation and/or other materials provided with the distribution. 78 * documentation and/or other materials provided with the distribution.
79 * 3. All advertising materials mentioning features or use of this software 79 * 3. All advertising materials mentioning features or use of this software
80 * must display the following acknowledgement: 80 * must display the following acknowledgement:
81 * This product includes software developed by the University of 81 * This product includes software developed by the University of
82 * California, Berkeley and its contributors. 82 * California, Berkeley and its contributors.
83 * 4. Neither the name of the University nor the names of its contributors 83 * 4. Neither the name of the University nor the names of its contributors
84 * may be used to endorse or promote products derived from this software 84 * may be used to endorse or promote products derived from this software
85 * without specific prior written permission. 85 * without specific prior written permission.
86 * 86 *
87 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 87 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
88 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 88 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
89 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 89 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
90 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 90 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
91 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 91 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
92 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 92 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
93 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 93 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
94 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 94 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
95 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 95 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
96 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 96 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
97 * SUCH DAMAGE. 97 * SUCH DAMAGE.
98 * 98 *
99 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 99 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
100 */ 100 */
101 101
102/* 102/*
103 * The buffer cache subsystem. 103 * The buffer cache subsystem.
104 * 104 *
105 * Some references: 105 * Some references:
106 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986) 106 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
107 * Leffler, et al.: The Design and Implementation of the 4.3BSD 107 * Leffler, et al.: The Design and Implementation of the 4.3BSD
 108 * UNIX Operating System (Addison Wesley, 1989) 108 * UNIX Operating System (Addison Wesley, 1989)
109 * 109 *
110 * Locking 110 * Locking
111 * 111 *
112 * There are three locks: 112 * There are three locks:
113 * - bufcache_lock: protects global buffer cache state. 113 * - bufcache_lock: protects global buffer cache state.
114 * - BC_BUSY: a long term per-buffer lock. 114 * - BC_BUSY: a long term per-buffer lock.
115 * - buf_t::b_objlock: lock on completion (biowait vs biodone). 115 * - buf_t::b_objlock: lock on completion (biowait vs biodone).
116 * 116 *
117 * For buffers associated with vnodes (a most common case) b_objlock points 117 * For buffers associated with vnodes (a most common case) b_objlock points
118 * to the vnode_t::v_interlock. Otherwise, it points to generic buffer_lock. 118 * to the vnode_t::v_interlock. Otherwise, it points to generic buffer_lock.
119 * 119 *
120 * Lock order: 120 * Lock order:
121 * bufcache_lock -> 121 * bufcache_lock ->
122 * buf_t::b_objlock 122 * buf_t::b_objlock
123 */ 123 */
124 124
125#include <sys/cdefs.h> 125#include <sys/cdefs.h>
126__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.297 2020/07/31 04:07:30 chs Exp $"); 126__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.298 2021/04/01 06:25:59 simonb Exp $");
127 127
128#ifdef _KERNEL_OPT 128#ifdef _KERNEL_OPT
129#include "opt_bufcache.h" 129#include "opt_bufcache.h"
130#include "opt_dtrace.h" 130#include "opt_dtrace.h"
131#include "opt_biohist.h" 131#include "opt_biohist.h"
132#endif 132#endif
133 133
134#include <sys/param.h> 134#include <sys/param.h>
135#include <sys/systm.h> 135#include <sys/systm.h>
136#include <sys/kernel.h> 136#include <sys/kernel.h>
137#include <sys/proc.h> 137#include <sys/proc.h>
138#include <sys/buf.h> 138#include <sys/buf.h>
139#include <sys/vnode.h> 139#include <sys/vnode.h>
140#include <sys/mount.h> 140#include <sys/mount.h>
141#include <sys/resourcevar.h> 141#include <sys/resourcevar.h>
142#include <sys/sysctl.h> 142#include <sys/sysctl.h>
143#include <sys/conf.h> 143#include <sys/conf.h>
144#include <sys/kauth.h> 144#include <sys/kauth.h>
145#include <sys/fstrans.h> 145#include <sys/fstrans.h>
146#include <sys/intr.h> 146#include <sys/intr.h>
147#include <sys/cpu.h> 147#include <sys/cpu.h>
148#include <sys/wapbl.h> 148#include <sys/wapbl.h>
149#include <sys/bitops.h> 149#include <sys/bitops.h>
150#include <sys/cprng.h> 150#include <sys/cprng.h>
151#include <sys/sdt.h> 151#include <sys/sdt.h>
152 152
153#include <uvm/uvm.h> /* extern struct uvm uvm */ 153#include <uvm/uvm.h> /* extern struct uvm uvm */
154 154
155#include <miscfs/specfs/specdev.h> 155#include <miscfs/specfs/specdev.h>
156 156
157SDT_PROVIDER_DEFINE(io); 157SDT_PROVIDER_DEFINE(io);
158 158
159SDT_PROBE_DEFINE4(io, kernel, , bbusy__start, 159SDT_PROBE_DEFINE4(io, kernel, , bbusy__start,
160 "struct buf *"/*bp*/, 160 "struct buf *"/*bp*/,
161 "bool"/*intr*/, "int"/*timo*/, "kmutex_t *"/*interlock*/); 161 "bool"/*intr*/, "int"/*timo*/, "kmutex_t *"/*interlock*/);
162SDT_PROBE_DEFINE5(io, kernel, , bbusy__done, 162SDT_PROBE_DEFINE5(io, kernel, , bbusy__done,
163 "struct buf *"/*bp*/, 163 "struct buf *"/*bp*/,
164 "bool"/*intr*/, 164 "bool"/*intr*/,
165 "int"/*timo*/, 165 "int"/*timo*/,
166 "kmutex_t *"/*interlock*/, 166 "kmutex_t *"/*interlock*/,
167 "int"/*error*/); 167 "int"/*error*/);
168SDT_PROBE_DEFINE0(io, kernel, , getnewbuf__start); 168SDT_PROBE_DEFINE0(io, kernel, , getnewbuf__start);
169SDT_PROBE_DEFINE1(io, kernel, , getnewbuf__done, "struct buf *"/*bp*/); 169SDT_PROBE_DEFINE1(io, kernel, , getnewbuf__done, "struct buf *"/*bp*/);
170SDT_PROBE_DEFINE3(io, kernel, , getblk__start, 170SDT_PROBE_DEFINE3(io, kernel, , getblk__start,
171 "struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/); 171 "struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/);
172SDT_PROBE_DEFINE4(io, kernel, , getblk__done, 172SDT_PROBE_DEFINE4(io, kernel, , getblk__done,
173 "struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/, 173 "struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/,
174 "struct buf *"/*bp*/); 174 "struct buf *"/*bp*/);
175SDT_PROBE_DEFINE2(io, kernel, , brelse, "struct buf *"/*bp*/, "int"/*set*/); 175SDT_PROBE_DEFINE2(io, kernel, , brelse, "struct buf *"/*bp*/, "int"/*set*/);
176SDT_PROBE_DEFINE1(io, kernel, , wait__start, "struct buf *"/*bp*/); 176SDT_PROBE_DEFINE1(io, kernel, , wait__start, "struct buf *"/*bp*/);
177SDT_PROBE_DEFINE1(io, kernel, , wait__done, "struct buf *"/*bp*/); 177SDT_PROBE_DEFINE1(io, kernel, , wait__done, "struct buf *"/*bp*/);
178 178
179#ifndef BUFPAGES 179#ifndef BUFPAGES
180# define BUFPAGES 0 180# define BUFPAGES 0
181#endif 181#endif
182 182
183#ifdef BUFCACHE 183#ifdef BUFCACHE
184# if (BUFCACHE < 5) || (BUFCACHE > 95) 184# if (BUFCACHE < 5) || (BUFCACHE > 95)
185# error BUFCACHE is not between 5 and 95 185# error BUFCACHE is not between 5 and 95
186# endif 186# endif
187#else 187#else
188# define BUFCACHE 15 188# define BUFCACHE 15
189#endif 189#endif
190 190
191u_int nbuf; /* desired number of buffer headers */ 191u_int nbuf; /* desired number of buffer headers */
192u_int bufpages = BUFPAGES; /* optional hardwired count */ 192u_int bufpages = BUFPAGES; /* optional hardwired count */
193u_int bufcache = BUFCACHE; /* max % of RAM to use for buffer cache */ 193u_int bufcache = BUFCACHE; /* max % of RAM to use for buffer cache */
194 194
195/* 195/*
196 * Definitions for the buffer free lists. 196 * Definitions for the buffer free lists.
197 */ 197 */
198#define BQUEUES 3 /* number of free buffer queues */ 198#define BQUEUES 3 /* number of free buffer queues */
199 199
200#define BQ_LOCKED 0 /* super-blocks &c */ 200#define BQ_LOCKED 0 /* super-blocks &c */
201#define BQ_LRU 1 /* lru, useful buffers */ 201#define BQ_LRU 1 /* lru, useful buffers */
202#define BQ_AGE 2 /* rubbish */ 202#define BQ_AGE 2 /* rubbish */
203 203
204struct bqueue { 204struct bqueue {
205 TAILQ_HEAD(, buf) bq_queue; 205 TAILQ_HEAD(, buf) bq_queue;
206 uint64_t bq_bytes; 206 uint64_t bq_bytes;
207 buf_t *bq_marker; 207 buf_t *bq_marker;
208}; 208};
209static struct bqueue bufqueues[BQUEUES] __cacheline_aligned; 209static struct bqueue bufqueues[BQUEUES] __cacheline_aligned;
210 210
211/* Function prototypes */ 211/* Function prototypes */
212static void buf_setwm(void); 212static void buf_setwm(void);
213static int buf_trim(void); 213static int buf_trim(void);
214static void *bufpool_page_alloc(struct pool *, int); 214static void *bufpool_page_alloc(struct pool *, int);
215static void bufpool_page_free(struct pool *, void *); 215static void bufpool_page_free(struct pool *, void *);
216static buf_t *bio_doread(struct vnode *, daddr_t, int, int); 216static buf_t *bio_doread(struct vnode *, daddr_t, int, int);
217static buf_t *getnewbuf(int, int, int); 217static buf_t *getnewbuf(int, int, int);
218static int buf_lotsfree(void); 218static int buf_lotsfree(void);
219static int buf_canrelease(void); 219static int buf_canrelease(void);
220static u_long buf_mempoolidx(u_long); 220static u_long buf_mempoolidx(u_long);
221static u_long buf_roundsize(u_long); 221static u_long buf_roundsize(u_long);
222static void *buf_alloc(size_t); 222static void *buf_alloc(size_t);
223static void buf_mrelease(void *, size_t); 223static void buf_mrelease(void *, size_t);
224static void binsheadfree(buf_t *, struct bqueue *); 224static void binsheadfree(buf_t *, struct bqueue *);
225static void binstailfree(buf_t *, struct bqueue *); 225static void binstailfree(buf_t *, struct bqueue *);
226#ifdef DEBUG 226#ifdef DEBUG
227static int checkfreelist(buf_t *, struct bqueue *, int); 227static int checkfreelist(buf_t *, struct bqueue *, int);
228#endif 228#endif
229static void biointr(void *); 229static void biointr(void *);
230static void biodone2(buf_t *); 230static void biodone2(buf_t *);
231static void sysctl_kern_buf_setup(void); 231static void sysctl_kern_buf_setup(void);
232static void sysctl_vm_buf_setup(void); 232static void sysctl_vm_buf_setup(void);
233 233
234/* Initialization for biohist */ 234/* Initialization for biohist */
235 235
236#include <sys/biohist.h> 236#include <sys/biohist.h>
237 237
238BIOHIST_DEFINE(biohist); 238BIOHIST_DEFINE(biohist);
239 239
240void 240void
241biohist_init(void) 241biohist_init(void)
242{ 242{
243 243
244 BIOHIST_INIT(biohist, BIOHIST_SIZE); 244 BIOHIST_INIT(biohist, BIOHIST_SIZE);
245} 245}
246 246
247/* 247/*
248 * Definitions for the buffer hash lists. 248 * Definitions for the buffer hash lists.
249 */ 249 */
250#define BUFHASH(dvp, lbn) \ 250#define BUFHASH(dvp, lbn) \
251 (&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash]) 251 (&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
252LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; 252LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
253u_long bufhash; 253u_long bufhash;
254 254
 255static int bufhash_stats(struct hashstat_sysctl *, bool);
 256
255static kcondvar_t needbuffer_cv; 257static kcondvar_t needbuffer_cv;
256 258
257/* 259/*
258 * Buffer queue lock. 260 * Buffer queue lock.
259 */ 261 */
260kmutex_t bufcache_lock __cacheline_aligned; 262kmutex_t bufcache_lock __cacheline_aligned;
261kmutex_t buffer_lock __cacheline_aligned; 263kmutex_t buffer_lock __cacheline_aligned;
262 264
263/* Software ISR for completed transfers. */ 265/* Software ISR for completed transfers. */
264static void *biodone_sih; 266static void *biodone_sih;
265 267
266/* Buffer pool for I/O buffers. */ 268/* Buffer pool for I/O buffers. */
267static pool_cache_t buf_cache; 269static pool_cache_t buf_cache;
268static pool_cache_t bufio_cache; 270static pool_cache_t bufio_cache;
269 271
270#define MEMPOOL_INDEX_OFFSET (ilog2(DEV_BSIZE)) /* smallest pool is 512 bytes */ 272#define MEMPOOL_INDEX_OFFSET (ilog2(DEV_BSIZE)) /* smallest pool is 512 bytes */
271#define NMEMPOOLS (ilog2(MAXBSIZE) - MEMPOOL_INDEX_OFFSET + 1) 273#define NMEMPOOLS (ilog2(MAXBSIZE) - MEMPOOL_INDEX_OFFSET + 1)
272__CTASSERT((1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) == MAXBSIZE); 274__CTASSERT((1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) == MAXBSIZE);
273 275
274/* Buffer memory pools */ 276/* Buffer memory pools */
275static struct pool bmempools[NMEMPOOLS]; 277static struct pool bmempools[NMEMPOOLS];
276 278
277static struct vm_map *buf_map; 279static struct vm_map *buf_map;
278 280
279/* 281/*
280 * Buffer memory pool allocator. 282 * Buffer memory pool allocator.
281 */ 283 */
282static void * 284static void *
283bufpool_page_alloc(struct pool *pp, int flags) 285bufpool_page_alloc(struct pool *pp, int flags)
284{ 286{
285 287
286 return (void *)uvm_km_alloc(buf_map, 288 return (void *)uvm_km_alloc(buf_map,
287 MAXBSIZE, MAXBSIZE, 289 MAXBSIZE, MAXBSIZE,
288 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT|UVM_KMF_TRYLOCK) 290 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT|UVM_KMF_TRYLOCK)
289 | UVM_KMF_WIRED); 291 | UVM_KMF_WIRED);
290} 292}
291 293
292static void 294static void
293bufpool_page_free(struct pool *pp, void *v) 295bufpool_page_free(struct pool *pp, void *v)
294{ 296{
295 297
296 uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED); 298 uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
297} 299}
298 300
299static struct pool_allocator bufmempool_allocator = { 301static struct pool_allocator bufmempool_allocator = {
300 .pa_alloc = bufpool_page_alloc, 302 .pa_alloc = bufpool_page_alloc,
301 .pa_free = bufpool_page_free, 303 .pa_free = bufpool_page_free,
302 .pa_pagesz = MAXBSIZE, 304 .pa_pagesz = MAXBSIZE,
303}; 305};
304 306
305/* Buffer memory management variables */ 307/* Buffer memory management variables */
306u_long bufmem_valimit; 308u_long bufmem_valimit;
307u_long bufmem_hiwater; 309u_long bufmem_hiwater;
308u_long bufmem_lowater; 310u_long bufmem_lowater;
309u_long bufmem; 311u_long bufmem;
310 312
311/* 313/*
312 * MD code can call this to set a hard limit on the amount 314 * MD code can call this to set a hard limit on the amount
313 * of virtual memory used by the buffer cache. 315 * of virtual memory used by the buffer cache.
314 */ 316 */
315int 317int
316buf_setvalimit(vsize_t sz) 318buf_setvalimit(vsize_t sz)
317{ 319{
318 320
319 /* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */ 321 /* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */
320 if (sz < NMEMPOOLS * MAXBSIZE) 322 if (sz < NMEMPOOLS * MAXBSIZE)
321 return EINVAL; 323 return EINVAL;
322 324
323 bufmem_valimit = sz; 325 bufmem_valimit = sz;
324 return 0; 326 return 0;
325} 327}
326 328
327static void 329static void
328buf_setwm(void) 330buf_setwm(void)
329{ 331{
330 332
331 bufmem_hiwater = buf_memcalc(); 333 bufmem_hiwater = buf_memcalc();
332 /* lowater is approx. 2% of memory (with bufcache = 15) */ 334 /* lowater is approx. 2% of memory (with bufcache = 15) */
333#define BUFMEM_WMSHIFT 3 335#define BUFMEM_WMSHIFT 3
334#define BUFMEM_HIWMMIN (64 * 1024 << BUFMEM_WMSHIFT) 336#define BUFMEM_HIWMMIN (64 * 1024 << BUFMEM_WMSHIFT)
335 if (bufmem_hiwater < BUFMEM_HIWMMIN) 337 if (bufmem_hiwater < BUFMEM_HIWMMIN)
336 /* Ensure a reasonable minimum value */ 338 /* Ensure a reasonable minimum value */
337 bufmem_hiwater = BUFMEM_HIWMMIN; 339 bufmem_hiwater = BUFMEM_HIWMMIN;
338 bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT; 340 bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT;
339} 341}
340 342
341#ifdef DEBUG 343#ifdef DEBUG
342int debug_verify_freelist = 0; 344int debug_verify_freelist = 0;
343static int 345static int
344checkfreelist(buf_t *bp, struct bqueue *dp, int ison) 346checkfreelist(buf_t *bp, struct bqueue *dp, int ison)
345{ 347{
346 buf_t *b; 348 buf_t *b;
347 349
348 if (!debug_verify_freelist) 350 if (!debug_verify_freelist)
349 return 1; 351 return 1;
350 352
351 TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) { 353 TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) {
352 if (b == bp) 354 if (b == bp)
353 return ison ? 1 : 0; 355 return ison ? 1 : 0;
354 } 356 }
355 357
356 return ison ? 0 : 1; 358 return ison ? 0 : 1;
357} 359}
358#endif 360#endif
359 361
360/* 362/*
361 * Insq/Remq for the buffer hash lists. 363 * Insq/Remq for the buffer hash lists.
362 * Call with buffer queue locked. 364 * Call with buffer queue locked.
363 */ 365 */
364static void 366static void
365binsheadfree(buf_t *bp, struct bqueue *dp) 367binsheadfree(buf_t *bp, struct bqueue *dp)
366{ 368{
367 369
368 KASSERT(mutex_owned(&bufcache_lock)); 370 KASSERT(mutex_owned(&bufcache_lock));
369 KASSERT(bp->b_freelistindex == -1); 371 KASSERT(bp->b_freelistindex == -1);
370 TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist); 372 TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist);
371 dp->bq_bytes += bp->b_bufsize; 373 dp->bq_bytes += bp->b_bufsize;
372 bp->b_freelistindex = dp - bufqueues; 374 bp->b_freelistindex = dp - bufqueues;
373} 375}
374 376
375static void 377static void
376binstailfree(buf_t *bp, struct bqueue *dp) 378binstailfree(buf_t *bp, struct bqueue *dp)
377{ 379{
378 380
379 KASSERT(mutex_owned(&bufcache_lock)); 381 KASSERT(mutex_owned(&bufcache_lock));
380 KASSERTMSG(bp->b_freelistindex == -1, "double free of buffer? " 382 KASSERTMSG(bp->b_freelistindex == -1, "double free of buffer? "
381 "bp=%p, b_freelistindex=%d\n", bp, bp->b_freelistindex); 383 "bp=%p, b_freelistindex=%d\n", bp, bp->b_freelistindex);
382 TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist); 384 TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist);
383 dp->bq_bytes += bp->b_bufsize; 385 dp->bq_bytes += bp->b_bufsize;
384 bp->b_freelistindex = dp - bufqueues; 386 bp->b_freelistindex = dp - bufqueues;
385} 387}
386 388
387void 389void
388bremfree(buf_t *bp) 390bremfree(buf_t *bp)
389{ 391{
390 struct bqueue *dp; 392 struct bqueue *dp;
391 int bqidx = bp->b_freelistindex; 393 int bqidx = bp->b_freelistindex;
392 394
393 KASSERT(mutex_owned(&bufcache_lock)); 395 KASSERT(mutex_owned(&bufcache_lock));
394 396
395 KASSERT(bqidx != -1); 397 KASSERT(bqidx != -1);
396 dp = &bufqueues[bqidx]; 398 dp = &bufqueues[bqidx];
397 KDASSERT(checkfreelist(bp, dp, 1)); 399 KDASSERT(checkfreelist(bp, dp, 1));
398 KASSERT(dp->bq_bytes >= bp->b_bufsize); 400 KASSERT(dp->bq_bytes >= bp->b_bufsize);
399 TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist); 401 TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist);
400 dp->bq_bytes -= bp->b_bufsize; 402 dp->bq_bytes -= bp->b_bufsize;
401 403
402 /* For the sysctl helper. */ 404 /* For the sysctl helper. */
403 if (bp == dp->bq_marker) 405 if (bp == dp->bq_marker)
404 dp->bq_marker = NULL; 406 dp->bq_marker = NULL;
405 407
406#if defined(DIAGNOSTIC) 408#if defined(DIAGNOSTIC)
407 bp->b_freelistindex = -1; 409 bp->b_freelistindex = -1;
408#endif /* defined(DIAGNOSTIC) */ 410#endif /* defined(DIAGNOSTIC) */
409} 411}
410 412
411/* 413/*
412 * note that for some ports this is used by pmap bootstrap code to 414 * note that for some ports this is used by pmap bootstrap code to
413 * determine kva size. 415 * determine kva size.
414 */ 416 */
415u_long 417u_long
416buf_memcalc(void) 418buf_memcalc(void)
417{ 419{
418 u_long n; 420 u_long n;
419 vsize_t mapsz = 0; 421 vsize_t mapsz = 0;
420 422
421 /* 423 /*
422 * Determine the upper bound of memory to use for buffers. 424 * Determine the upper bound of memory to use for buffers.
423 * 425 *
424 * - If bufpages is specified, use that as the number 426 * - If bufpages is specified, use that as the number
425 * pages. 427 * pages.
426 * 428 *
427 * - Otherwise, use bufcache as the percentage of 429 * - Otherwise, use bufcache as the percentage of
428 * physical memory. 430 * physical memory.
429 */ 431 */
430 if (bufpages != 0) { 432 if (bufpages != 0) {
431 n = bufpages; 433 n = bufpages;
432 } else { 434 } else {
433 if (bufcache < 5) { 435 if (bufcache < 5) {
434 printf("forcing bufcache %d -> 5", bufcache); 436 printf("forcing bufcache %d -> 5", bufcache);
435 bufcache = 5; 437 bufcache = 5;
436 } 438 }
437 if (bufcache > 95) { 439 if (bufcache > 95) {
438 printf("forcing bufcache %d -> 95", bufcache); 440 printf("forcing bufcache %d -> 95", bufcache);
439 bufcache = 95; 441 bufcache = 95;
440 } 442 }
441 if (buf_map != NULL) 443 if (buf_map != NULL)
442 mapsz = vm_map_max(buf_map) - vm_map_min(buf_map); 444 mapsz = vm_map_max(buf_map) - vm_map_min(buf_map);
443 n = calc_cache_size(mapsz, bufcache, 445 n = calc_cache_size(mapsz, bufcache,
444 (buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT) 446 (buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT)
445 / PAGE_SIZE; 447 / PAGE_SIZE;
446 } 448 }
447 449
448 n <<= PAGE_SHIFT; 450 n <<= PAGE_SHIFT;
449 if (bufmem_valimit != 0 && n > bufmem_valimit) 451 if (bufmem_valimit != 0 && n > bufmem_valimit)
450 n = bufmem_valimit; 452 n = bufmem_valimit;
451 453
452 return (n); 454 return (n);
453} 455}
454 456
455/* 457/*
456 * Initialize buffers and hash links for buffers. 458 * Initialize buffers and hash links for buffers.
457 */ 459 */
458void 460void
459bufinit(void) 461bufinit(void)
460{ 462{
461 struct bqueue *dp; 463 struct bqueue *dp;
462 int use_std; 464 int use_std;
463 u_int i; 465 u_int i;
464 466
465 biodone_vfs = biodone; 467 biodone_vfs = biodone;
466 468
467 mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE); 469 mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE);
468 mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE); 470 mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE);
469 cv_init(&needbuffer_cv, "needbuf"); 471 cv_init(&needbuffer_cv, "needbuf");
470 472
471 if (bufmem_valimit != 0) { 473 if (bufmem_valimit != 0) {
472 vaddr_t minaddr = 0, maxaddr; 474 vaddr_t minaddr = 0, maxaddr;
473 buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr, 475 buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
474 bufmem_valimit, 0, false, 0); 476 bufmem_valimit, 0, false, 0);
475 if (buf_map == NULL) 477 if (buf_map == NULL)
476 panic("bufinit: cannot allocate submap"); 478 panic("bufinit: cannot allocate submap");
477 } else 479 } else
478 buf_map = kernel_map; 480 buf_map = kernel_map;
479 481
480 /* 482 /*
481 * Initialize buffer cache memory parameters. 483 * Initialize buffer cache memory parameters.
482 */ 484 */
483 bufmem = 0; 485 bufmem = 0;
484 buf_setwm(); 486 buf_setwm();
485 487
486 /* On "small" machines use small pool page sizes where possible */ 488 /* On "small" machines use small pool page sizes where possible */
487 use_std = (physmem < atop(16*1024*1024)); 489 use_std = (physmem < atop(16*1024*1024));
488 490
489 /* 491 /*
490 * Also use them on systems that can map the pool pages using 492 * Also use them on systems that can map the pool pages using
491 * a direct-mapped segment. 493 * a direct-mapped segment.
492 */ 494 */
493#ifdef PMAP_MAP_POOLPAGE 495#ifdef PMAP_MAP_POOLPAGE
494 use_std = 1; 496 use_std = 1;
495#endif 497#endif
496 498
497 buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0, 499 buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
498 "bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL); 500 "bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL);
499 bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0, 501 bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
500 "biopl", NULL, IPL_BIO, NULL, NULL, NULL); 502 "biopl", NULL, IPL_BIO, NULL, NULL, NULL);
501 503
502 for (i = 0; i < NMEMPOOLS; i++) { 504 for (i = 0; i < NMEMPOOLS; i++) {
503 struct pool_allocator *pa; 505 struct pool_allocator *pa;
504 struct pool *pp = &bmempools[i]; 506 struct pool *pp = &bmempools[i];
505 u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET); 507 u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET);
506 char *name = kmem_alloc(8, KM_SLEEP); /* XXX: never freed */ 508 char *name = kmem_alloc(8, KM_SLEEP); /* XXX: never freed */
507 if (__predict_false(size >= 1048576)) 509 if (__predict_false(size >= 1048576))
508 (void)snprintf(name, 8, "buf%um", size / 1048576); 510 (void)snprintf(name, 8, "buf%um", size / 1048576);
509 else if (__predict_true(size >= 1024)) 511 else if (__predict_true(size >= 1024))
510 (void)snprintf(name, 8, "buf%uk", size / 1024); 512 (void)snprintf(name, 8, "buf%uk", size / 1024);
511 else 513 else
512 (void)snprintf(name, 8, "buf%ub", size); 514 (void)snprintf(name, 8, "buf%ub", size);
513 pa = (size <= PAGE_SIZE && use_std) 515 pa = (size <= PAGE_SIZE && use_std)
514 ? &pool_allocator_nointr 516 ? &pool_allocator_nointr
515 : &bufmempool_allocator; 517 : &bufmempool_allocator;
516 pool_init(pp, size, DEV_BSIZE, 0, 0, name, pa, IPL_NONE); 518 pool_init(pp, size, DEV_BSIZE, 0, 0, name, pa, IPL_NONE);
517 pool_setlowat(pp, 1); 519 pool_setlowat(pp, 1);
518 pool_sethiwat(pp, 1); 520 pool_sethiwat(pp, 1);
519 } 521 }
520 522
521 /* Initialize the buffer queues */ 523 /* Initialize the buffer queues */
522 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) { 524 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
523 TAILQ_INIT(&dp->bq_queue); 525 TAILQ_INIT(&dp->bq_queue);
524 dp->bq_bytes = 0; 526 dp->bq_bytes = 0;
525 } 527 }
526 528
527 /* 529 /*
528 * Estimate hash table size based on the amount of memory we 530 * Estimate hash table size based on the amount of memory we
529 * intend to use for the buffer cache. The average buffer 531 * intend to use for the buffer cache. The average buffer
530 * size is dependent on our clients (i.e. filesystems). 532 * size is dependent on our clients (i.e. filesystems).
531 * 533 *
532 * For now, use an empirical 3K per buffer. 534 * For now, use an empirical 3K per buffer.
533 */ 535 */
534 nbuf = (bufmem_hiwater / 1024) / 3; 536 nbuf = (bufmem_hiwater / 1024) / 3;
535 bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash); 537 bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash);
536 538
537 sysctl_kern_buf_setup(); 539 sysctl_kern_buf_setup();
538 sysctl_vm_buf_setup(); 540 sysctl_vm_buf_setup();
 541 hashstat_register("bufhash", bufhash_stats);
539} 542}
540 543
541void 544void
542bufinit2(void) 545bufinit2(void)
543{ 546{
544 547
545 biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr, 548 biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr,
546 NULL); 549 NULL);
547 if (biodone_sih == NULL) 550 if (biodone_sih == NULL)
548 panic("bufinit2: can't establish soft interrupt"); 551 panic("bufinit2: can't establish soft interrupt");
549} 552}
550 553
551static int 554static int
552buf_lotsfree(void) 555buf_lotsfree(void)
553{ 556{
554 u_long guess; 557 u_long guess;
555 558
556 /* Always allocate if less than the low water mark. */ 559 /* Always allocate if less than the low water mark. */
557 if (bufmem < bufmem_lowater) 560 if (bufmem < bufmem_lowater)
558 return 1; 561 return 1;
559 562
560 /* Never allocate if greater than the high water mark. */ 563 /* Never allocate if greater than the high water mark. */
561 if (bufmem > bufmem_hiwater) 564 if (bufmem > bufmem_hiwater)
562 return 0; 565 return 0;
563 566
564 /* If there's anything on the AGE list, it should be eaten. */ 567 /* If there's anything on the AGE list, it should be eaten. */
565 if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL) 568 if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL)
566 return 0; 569 return 0;
567 570
568 /* 571 /*
 569 * The probability of getting a new allocation is inversely 572 * The probability of getting a new allocation is inversely
570 * proportional to the current size of the cache above 573 * proportional to the current size of the cache above
571 * the low water mark. Divide the total first to avoid overflows 574 * the low water mark. Divide the total first to avoid overflows
572 * in the product. 575 * in the product.
573 */ 576 */
574 guess = cprng_fast32() % 16; 577 guess = cprng_fast32() % 16;
575 578
576 if ((bufmem_hiwater - bufmem_lowater) / 16 * guess >= 579 if ((bufmem_hiwater - bufmem_lowater) / 16 * guess >=
577 (bufmem - bufmem_lowater)) 580 (bufmem - bufmem_lowater))
578 return 1; 581 return 1;
579 582
580 /* Otherwise don't allocate. */ 583 /* Otherwise don't allocate. */
581 return 0; 584 return 0;
582} 585}
583 586
584/* 587/*
585 * Return estimate of bytes we think need to be 588 * Return estimate of bytes we think need to be
586 * released to help resolve low memory conditions. 589 * released to help resolve low memory conditions.
587 * 590 *
588 * => called with bufcache_lock held. 591 * => called with bufcache_lock held.
589 */ 592 */
590static int 593static int
591buf_canrelease(void) 594buf_canrelease(void)
592{ 595{
593 int pagedemand, ninvalid = 0; 596 int pagedemand, ninvalid = 0;
594 597
595 KASSERT(mutex_owned(&bufcache_lock)); 598 KASSERT(mutex_owned(&bufcache_lock));
596 599
597 if (bufmem < bufmem_lowater) 600 if (bufmem < bufmem_lowater)
598 return 0; 601 return 0;
599 602
600 if (bufmem > bufmem_hiwater) 603 if (bufmem > bufmem_hiwater)
601 return bufmem - bufmem_hiwater; 604 return bufmem - bufmem_hiwater;
602 605
603 ninvalid += bufqueues[BQ_AGE].bq_bytes; 606 ninvalid += bufqueues[BQ_AGE].bq_bytes;
604 607
605 pagedemand = uvmexp.freetarg - uvm_availmem(false); 608 pagedemand = uvmexp.freetarg - uvm_availmem(false);
606 if (pagedemand < 0) 609 if (pagedemand < 0)
607 return ninvalid; 610 return ninvalid;
608 return MAX(ninvalid, MIN(2 * MAXBSIZE, 611 return MAX(ninvalid, MIN(2 * MAXBSIZE,
609 MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE))); 612 MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE)));
610} 613}
611 614
612/* 615/*
613 * Buffer memory allocation helper functions 616 * Buffer memory allocation helper functions
614 */ 617 */
615static u_long 618static u_long
616buf_mempoolidx(u_long size) 619buf_mempoolidx(u_long size)
617{ 620{
618 u_int n = 0; 621 u_int n = 0;
619 622
620 size -= 1; 623 size -= 1;
621 size >>= MEMPOOL_INDEX_OFFSET; 624 size >>= MEMPOOL_INDEX_OFFSET;
622 while (size) { 625 while (size) {
623 size >>= 1; 626 size >>= 1;
624 n += 1; 627 n += 1;
625 } 628 }
626 if (n >= NMEMPOOLS) 629 if (n >= NMEMPOOLS)
627 panic("buf mem pool index %d", n); 630 panic("buf mem pool index %d", n);
628 return n; 631 return n;
629} 632}
630 633
631static u_long 634static u_long
632buf_roundsize(u_long size) 635buf_roundsize(u_long size)
633{ 636{
634 /* Round up to nearest power of 2 */ 637 /* Round up to nearest power of 2 */
635 return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET)); 638 return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET));
636} 639}
637 640
638static void * 641static void *
639buf_alloc(size_t size) 642buf_alloc(size_t size)
640{ 643{
641 u_int n = buf_mempoolidx(size); 644 u_int n = buf_mempoolidx(size);
642 void *addr; 645 void *addr;
643 646
644 while (1) { 647 while (1) {
645 addr = pool_get(&bmempools[n], PR_NOWAIT); 648 addr = pool_get(&bmempools[n], PR_NOWAIT);
646 if (addr != NULL) 649 if (addr != NULL)
647 break; 650 break;
648 651
649 /* No memory, see if we can free some. If so, try again */ 652 /* No memory, see if we can free some. If so, try again */
650 mutex_enter(&bufcache_lock); 653 mutex_enter(&bufcache_lock);
651 if (buf_drain(1) > 0) { 654 if (buf_drain(1) > 0) {
652 mutex_exit(&bufcache_lock); 655 mutex_exit(&bufcache_lock);
653 continue; 656 continue;
654 } 657 }
655 658
656 if (curlwp == uvm.pagedaemon_lwp) { 659 if (curlwp == uvm.pagedaemon_lwp) {
657 mutex_exit(&bufcache_lock); 660 mutex_exit(&bufcache_lock);
658 return NULL; 661 return NULL;
659 } 662 }
660 663
661 /* Wait for buffers to arrive on the LRU queue */ 664 /* Wait for buffers to arrive on the LRU queue */
662 cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4); 665 cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4);
663 mutex_exit(&bufcache_lock); 666 mutex_exit(&bufcache_lock);
664 } 667 }
665 668
666 return addr; 669 return addr;
667} 670}
668 671
669static void 672static void
670buf_mrelease(void *addr, size_t size) 673buf_mrelease(void *addr, size_t size)
671{ 674{
672 675
673 pool_put(&bmempools[buf_mempoolidx(size)], addr); 676 pool_put(&bmempools[buf_mempoolidx(size)], addr);
674} 677}
675 678
676/* 679/*
677 * bread()/breadn() helper. 680 * bread()/breadn() helper.
678 */ 681 */
679static buf_t * 682static buf_t *
680bio_doread(struct vnode *vp, daddr_t blkno, int size, int async) 683bio_doread(struct vnode *vp, daddr_t blkno, int size, int async)
681{ 684{
682 buf_t *bp; 685 buf_t *bp;
683 struct mount *mp; 686 struct mount *mp;
684 687
685 bp = getblk(vp, blkno, size, 0, 0); 688 bp = getblk(vp, blkno, size, 0, 0);
686 689
687 /* 690 /*
688 * getblk() may return NULL if we are the pagedaemon. 691 * getblk() may return NULL if we are the pagedaemon.
689 */ 692 */
690 if (bp == NULL) { 693 if (bp == NULL) {
691 KASSERT(curlwp == uvm.pagedaemon_lwp); 694 KASSERT(curlwp == uvm.pagedaemon_lwp);
692 return NULL; 695 return NULL;
693 } 696 }
694 697
695 /* 698 /*
696 * If buffer does not have data valid, start a read. 699 * If buffer does not have data valid, start a read.
697 * Note that if buffer is BC_INVAL, getblk() won't return it. 700 * Note that if buffer is BC_INVAL, getblk() won't return it.
698 * Therefore, it's valid if its I/O has completed or been delayed. 701 * Therefore, it's valid if its I/O has completed or been delayed.
699 */ 702 */
700 if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) { 703 if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) {
701 /* Start I/O for the buffer. */ 704 /* Start I/O for the buffer. */
702 SET(bp->b_flags, B_READ | async); 705 SET(bp->b_flags, B_READ | async);
703 if (async) 706 if (async)
704 BIO_SETPRIO(bp, BPRIO_TIMELIMITED); 707 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
705 else 708 else
706 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 709 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
707 VOP_STRATEGY(vp, bp); 710 VOP_STRATEGY(vp, bp);
708 711
709 /* Pay for the read. */ 712 /* Pay for the read. */
710 curlwp->l_ru.ru_inblock++; 713 curlwp->l_ru.ru_inblock++;
711 } else if (async) 714 } else if (async)
712 brelse(bp, 0); 715 brelse(bp, 0);
713 716
714 if (vp->v_type == VBLK) 717 if (vp->v_type == VBLK)
715 mp = spec_node_getmountedfs(vp); 718 mp = spec_node_getmountedfs(vp);
716 else 719 else
717 mp = vp->v_mount; 720 mp = vp->v_mount;
718 721
719 /* 722 /*
720 * Collect statistics on synchronous and asynchronous reads. 723 * Collect statistics on synchronous and asynchronous reads.
721 * Reads from block devices are charged to their associated 724 * Reads from block devices are charged to their associated
722 * filesystem (if any). 725 * filesystem (if any).
723 */ 726 */
724 if (mp != NULL) { 727 if (mp != NULL) {
725 if (async == 0) 728 if (async == 0)
726 mp->mnt_stat.f_syncreads++; 729 mp->mnt_stat.f_syncreads++;
727 else 730 else
728 mp->mnt_stat.f_asyncreads++; 731 mp->mnt_stat.f_asyncreads++;
729 } 732 }
730 733
731 return (bp); 734 return (bp);
732} 735}
733 736
734/* 737/*
735 * Read a disk block. 738 * Read a disk block.
736 * This algorithm described in Bach (p.54). 739 * This algorithm described in Bach (p.54).
737 */ 740 */
738int 741int
739bread(struct vnode *vp, daddr_t blkno, int size, int flags, buf_t **bpp) 742bread(struct vnode *vp, daddr_t blkno, int size, int flags, buf_t **bpp)
740{ 743{
741 buf_t *bp; 744 buf_t *bp;
742 int error; 745 int error;
743 746
744 BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist); 747 BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
745 748
746 /* Get buffer for block. */ 749 /* Get buffer for block. */
747 bp = *bpp = bio_doread(vp, blkno, size, 0); 750 bp = *bpp = bio_doread(vp, blkno, size, 0);
748 if (bp == NULL) 751 if (bp == NULL)
749 return ENOMEM; 752 return ENOMEM;
750 753
751 /* Wait for the read to complete, and return result. */ 754 /* Wait for the read to complete, and return result. */
752 error = biowait(bp); 755 error = biowait(bp);
753 if (error == 0 && (flags & B_MODIFY) != 0) 756 if (error == 0 && (flags & B_MODIFY) != 0)
754 error = fscow_run(bp, true); 757 error = fscow_run(bp, true);
755 if (error) { 758 if (error) {
756 brelse(bp, 0); 759 brelse(bp, 0);
757 *bpp = NULL; 760 *bpp = NULL;
758 } 761 }
759 762
760 return error; 763 return error;
761} 764}
762 765
763/* 766/*
764 * Read-ahead multiple disk blocks. The first is sync, the rest async. 767 * Read-ahead multiple disk blocks. The first is sync, the rest async.
765 * Trivial modification to the breada algorithm presented in Bach (p.55). 768 * Trivial modification to the breada algorithm presented in Bach (p.55).
766 */ 769 */
767int 770int
768breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks, 771breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
769 int *rasizes, int nrablks, int flags, buf_t **bpp) 772 int *rasizes, int nrablks, int flags, buf_t **bpp)
770{ 773{
771 buf_t *bp; 774 buf_t *bp;
772 int error, i; 775 int error, i;
773 776
774 BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist); 777 BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
775 778
776 bp = *bpp = bio_doread(vp, blkno, size, 0); 779 bp = *bpp = bio_doread(vp, blkno, size, 0);
777 if (bp == NULL) 780 if (bp == NULL)
778 return ENOMEM; 781 return ENOMEM;
779 782
780 /* 783 /*
781 * For each of the read-ahead blocks, start a read, if necessary. 784 * For each of the read-ahead blocks, start a read, if necessary.
782 */ 785 */
783 mutex_enter(&bufcache_lock); 786 mutex_enter(&bufcache_lock);
784 for (i = 0; i < nrablks; i++) { 787 for (i = 0; i < nrablks; i++) {
785 /* If it's in the cache, just go on to next one. */ 788 /* If it's in the cache, just go on to next one. */
786 if (incore(vp, rablks[i])) 789 if (incore(vp, rablks[i]))
787 continue; 790 continue;
788 791
789 /* Get a buffer for the read-ahead block */ 792 /* Get a buffer for the read-ahead block */
790 mutex_exit(&bufcache_lock); 793 mutex_exit(&bufcache_lock);
791 (void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC); 794 (void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
792 mutex_enter(&bufcache_lock); 795 mutex_enter(&bufcache_lock);
793 } 796 }
794 mutex_exit(&bufcache_lock); 797 mutex_exit(&bufcache_lock);
795 798
796 /* Otherwise, we had to start a read for it; wait until it's valid. */ 799 /* Otherwise, we had to start a read for it; wait until it's valid. */
797 error = biowait(bp); 800 error = biowait(bp);
798 if (error == 0 && (flags & B_MODIFY) != 0) 801 if (error == 0 && (flags & B_MODIFY) != 0)
799 error = fscow_run(bp, true); 802 error = fscow_run(bp, true);
800 if (error) { 803 if (error) {
801 brelse(bp, 0); 804 brelse(bp, 0);
802 *bpp = NULL; 805 *bpp = NULL;
803 } 806 }
804 807
805 return error; 808 return error;
806} 809}
807 810
808/* 811/*
809 * Block write. Described in Bach (p.56) 812 * Block write. Described in Bach (p.56)
810 */ 813 */
811int 814int
812bwrite(buf_t *bp) 815bwrite(buf_t *bp)
813{ 816{
814 int rv, sync, wasdelayed; 817 int rv, sync, wasdelayed;
815 struct vnode *vp; 818 struct vnode *vp;
816 struct mount *mp; 819 struct mount *mp;
817 820
818 BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx", 821 BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx",
819 (uintptr_t)bp, 0, 0, 0); 822 (uintptr_t)bp, 0, 0, 0);
820 823
821 KASSERT(ISSET(bp->b_cflags, BC_BUSY)); 824 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
822 KASSERT(!cv_has_waiters(&bp->b_done)); 825 KASSERT(!cv_has_waiters(&bp->b_done));
823 826
824 vp = bp->b_vp; 827 vp = bp->b_vp;
825 828
826 /* 829 /*
827 * dholland 20160728 AFAICT vp==NULL must be impossible as it 830 * dholland 20160728 AFAICT vp==NULL must be impossible as it
828 * will crash upon reaching VOP_STRATEGY below... see further 831 * will crash upon reaching VOP_STRATEGY below... see further
829 * analysis on tech-kern. 832 * analysis on tech-kern.
830 */ 833 */
831 KASSERTMSG(vp != NULL, "bwrite given buffer with null vnode"); 834 KASSERTMSG(vp != NULL, "bwrite given buffer with null vnode");
832 835
833 if (vp != NULL) { 836 if (vp != NULL) {
834 KASSERT(bp->b_objlock == vp->v_interlock); 837 KASSERT(bp->b_objlock == vp->v_interlock);
835 if (vp->v_type == VBLK) 838 if (vp->v_type == VBLK)
836 mp = spec_node_getmountedfs(vp); 839 mp = spec_node_getmountedfs(vp);
837 else 840 else
838 mp = vp->v_mount; 841 mp = vp->v_mount;
839 } else { 842 } else {
840 mp = NULL; 843 mp = NULL;
841 } 844 }
842 845
843 if (mp && mp->mnt_wapbl) { 846 if (mp && mp->mnt_wapbl) {
844 if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { 847 if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
845 bdwrite(bp); 848 bdwrite(bp);
846 return 0; 849 return 0;
847 } 850 }
848 } 851 }
849 852
850 /* 853 /*
851 * Remember buffer type, to switch on it later. If the write was 854 * Remember buffer type, to switch on it later. If the write was
852 * synchronous, but the file system was mounted with MNT_ASYNC, 855 * synchronous, but the file system was mounted with MNT_ASYNC,
853 * convert it to a delayed write. 856 * convert it to a delayed write.
854 * XXX note that this relies on delayed tape writes being converted 857 * XXX note that this relies on delayed tape writes being converted
855 * to async, not sync writes (which is safe, but ugly). 858 * to async, not sync writes (which is safe, but ugly).
856 */ 859 */
857 sync = !ISSET(bp->b_flags, B_ASYNC); 860 sync = !ISSET(bp->b_flags, B_ASYNC);
858 if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) { 861 if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) {
859 bdwrite(bp); 862 bdwrite(bp);
860 return (0); 863 return (0);
861 } 864 }
862 865
863 /* 866 /*
864 * Collect statistics on synchronous and asynchronous writes. 867 * Collect statistics on synchronous and asynchronous writes.
865 * Writes to block devices are charged to their associated 868 * Writes to block devices are charged to their associated
866 * filesystem (if any). 869 * filesystem (if any).
867 */ 870 */
868 if (mp != NULL) { 871 if (mp != NULL) {
869 if (sync) 872 if (sync)
870 mp->mnt_stat.f_syncwrites++; 873 mp->mnt_stat.f_syncwrites++;
871 else 874 else
872 mp->mnt_stat.f_asyncwrites++; 875 mp->mnt_stat.f_asyncwrites++;
873 } 876 }
874 877
875 /* 878 /*
876 * Pay for the I/O operation and make sure the buf is on the correct 879 * Pay for the I/O operation and make sure the buf is on the correct
877 * vnode queue. 880 * vnode queue.
878 */ 881 */
879 bp->b_error = 0; 882 bp->b_error = 0;
880 wasdelayed = ISSET(bp->b_oflags, BO_DELWRI); 883 wasdelayed = ISSET(bp->b_oflags, BO_DELWRI);
881 CLR(bp->b_flags, B_READ); 884 CLR(bp->b_flags, B_READ);
882 if (wasdelayed) { 885 if (wasdelayed) {
883 mutex_enter(&bufcache_lock); 886 mutex_enter(&bufcache_lock);
884 mutex_enter(bp->b_objlock); 887 mutex_enter(bp->b_objlock);
885 CLR(bp->b_oflags, BO_DONE | BO_DELWRI); 888 CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
886 reassignbuf(bp, bp->b_vp); 889 reassignbuf(bp, bp->b_vp);
887 /* Wake anyone trying to busy the buffer via vnode's lists. */ 890 /* Wake anyone trying to busy the buffer via vnode's lists. */
888 cv_broadcast(&bp->b_busy); 891 cv_broadcast(&bp->b_busy);
889 mutex_exit(&bufcache_lock); 892 mutex_exit(&bufcache_lock);
890 } else { 893 } else {
891 curlwp->l_ru.ru_oublock++; 894 curlwp->l_ru.ru_oublock++;
892 mutex_enter(bp->b_objlock); 895 mutex_enter(bp->b_objlock);
893 CLR(bp->b_oflags, BO_DONE | BO_DELWRI); 896 CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
894 } 897 }
895 if (vp != NULL) 898 if (vp != NULL)
896 vp->v_numoutput++; 899 vp->v_numoutput++;
897 mutex_exit(bp->b_objlock); 900 mutex_exit(bp->b_objlock);
898 901
899 /* Initiate disk write. */ 902 /* Initiate disk write. */
900 if (sync) 903 if (sync)
901 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 904 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
902 else 905 else
903 BIO_SETPRIO(bp, BPRIO_TIMELIMITED); 906 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
904 907
905 VOP_STRATEGY(vp, bp); 908 VOP_STRATEGY(vp, bp);
906 909
907 if (sync) { 910 if (sync) {
908 /* If I/O was synchronous, wait for it to complete. */ 911 /* If I/O was synchronous, wait for it to complete. */
909 rv = biowait(bp); 912 rv = biowait(bp);
910 913
911 /* Release the buffer. */ 914 /* Release the buffer. */
912 brelse(bp, 0); 915 brelse(bp, 0);
913 916
914 return (rv); 917 return (rv);
915 } else { 918 } else {
916 return (0); 919 return (0);
917 } 920 }
918} 921}
919 922
920int 923int
921vn_bwrite(void *v) 924vn_bwrite(void *v)
922{ 925{
923 struct vop_bwrite_args *ap = v; 926 struct vop_bwrite_args *ap = v;
924 927
925 return (bwrite(ap->a_bp)); 928 return (bwrite(ap->a_bp));
926} 929}
927 930
928/* 931/*
929 * Delayed write. 932 * Delayed write.
930 * 933 *
931 * The buffer is marked dirty, but is not queued for I/O. 934 * The buffer is marked dirty, but is not queued for I/O.
932 * This routine should be used when the buffer is expected 935 * This routine should be used when the buffer is expected
933 * to be modified again soon, typically a small write that 936 * to be modified again soon, typically a small write that
934 * partially fills a buffer. 937 * partially fills a buffer.
935 * 938 *
936 * NB: magnetic tapes cannot be delayed; they must be 939 * NB: magnetic tapes cannot be delayed; they must be
937 * written in the order that the writes are requested. 940 * written in the order that the writes are requested.
938 * 941 *
939 * Described in Leffler, et al. (pp. 208-213). 942 * Described in Leffler, et al. (pp. 208-213).
940 */ 943 */
941void 944void
942bdwrite(buf_t *bp) 945bdwrite(buf_t *bp)
943{ 946{
944 947
945 BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx", 948 BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx",
946 (uintptr_t)bp, 0, 0, 0); 949 (uintptr_t)bp, 0, 0, 0);
947 950
948 KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS || 951 KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS ||
949 bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE)); 952 bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE));
950 KASSERT(ISSET(bp->b_cflags, BC_BUSY)); 953 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
951 KASSERT(!cv_has_waiters(&bp->b_done)); 954 KASSERT(!cv_has_waiters(&bp->b_done));
952 955
953 /* If this is a tape block, write the block now. */ 956 /* If this is a tape block, write the block now. */
954 if (bdev_type(bp->b_dev) == D_TAPE) { 957 if (bdev_type(bp->b_dev) == D_TAPE) {
955 bawrite(bp); 958 bawrite(bp);
956 return; 959 return;
957 } 960 }
958 961
959 if (wapbl_vphaswapbl(bp->b_vp)) { 962 if (wapbl_vphaswapbl(bp->b_vp)) {
960 struct mount *mp = wapbl_vptomp(bp->b_vp); 963 struct mount *mp = wapbl_vptomp(bp->b_vp);
961 964
962 if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { 965 if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
963 WAPBL_ADD_BUF(mp, bp); 966 WAPBL_ADD_BUF(mp, bp);
964 } 967 }
965 } 968 }
966 969
967 /* 970 /*
968 * If the block hasn't been seen before: 971 * If the block hasn't been seen before:
969 * (1) Mark it as having been seen, 972 * (1) Mark it as having been seen,
970 * (2) Charge for the write, 973 * (2) Charge for the write,
971 * (3) Make sure it's on its vnode's correct block list. 974 * (3) Make sure it's on its vnode's correct block list.
972 */ 975 */
973 KASSERT(bp->b_vp == NULL || bp->b_objlock == bp->b_vp->v_interlock); 976 KASSERT(bp->b_vp == NULL || bp->b_objlock == bp->b_vp->v_interlock);
974 977
975 if (!ISSET(bp->b_oflags, BO_DELWRI)) { 978 if (!ISSET(bp->b_oflags, BO_DELWRI)) {
976 mutex_enter(&bufcache_lock); 979 mutex_enter(&bufcache_lock);
977 mutex_enter(bp->b_objlock); 980 mutex_enter(bp->b_objlock);
978 SET(bp->b_oflags, BO_DELWRI); 981 SET(bp->b_oflags, BO_DELWRI);
979 curlwp->l_ru.ru_oublock++; 982 curlwp->l_ru.ru_oublock++;
980 reassignbuf(bp, bp->b_vp); 983 reassignbuf(bp, bp->b_vp);
981 /* Wake anyone trying to busy the buffer via vnode's lists. */ 984 /* Wake anyone trying to busy the buffer via vnode's lists. */
982 cv_broadcast(&bp->b_busy); 985 cv_broadcast(&bp->b_busy);
983 mutex_exit(&bufcache_lock); 986 mutex_exit(&bufcache_lock);
984 } else { 987 } else {
985 mutex_enter(bp->b_objlock); 988 mutex_enter(bp->b_objlock);
986 } 989 }
987 /* Otherwise, the "write" is done, so mark and release the buffer. */ 990 /* Otherwise, the "write" is done, so mark and release the buffer. */
988 CLR(bp->b_oflags, BO_DONE); 991 CLR(bp->b_oflags, BO_DONE);
989 mutex_exit(bp->b_objlock); 992 mutex_exit(bp->b_objlock);
990 993
991 brelse(bp, 0); 994 brelse(bp, 0);
992} 995}
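[Editor's aside: bdwrite() above only marks the buffer dirty (BO_DELWRI), charges the write to the current LWP, and hands the buffer back through brelse(); the data reaches disk later, when the buffer is recycled or synced. The fragment below is a minimal userland sketch of that delayed-write idea only; struct toybuf and the function names are illustrative and are not part of this file.]

#include <stdbool.h>
#include <string.h>

struct toybuf {
	char data[4096];
	bool dirty;			/* analogue of BO_DELWRI */
};

/* "bdwrite": record the modification, defer the disk write. */
static void
toy_bdwrite(struct toybuf *bp, const void *src, size_t len)
{

	if (len > sizeof(bp->data))
		len = sizeof(bp->data);
	memcpy(bp->data, src, len);
	bp->dirty = true;		/* written back later, not now */
}

/* A later sync pass pushes the dirty data out and marks the buffer clean. */
static void
toy_sync(struct toybuf *bp, void (*writeout)(const void *, size_t))
{

	if (bp->dirty) {
		writeout(bp->data, sizeof(bp->data));
		bp->dirty = false;
	}
}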
993 996
994/* 997/*
995 * Asynchronous block write; just an asynchronous bwrite(). 998 * Asynchronous block write; just an asynchronous bwrite().
996 */ 999 */
997void 1000void
998bawrite(buf_t *bp) 1001bawrite(buf_t *bp)
999{ 1002{
1000 1003
1001 KASSERT(ISSET(bp->b_cflags, BC_BUSY)); 1004 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1002 KASSERT(bp->b_vp != NULL); 1005 KASSERT(bp->b_vp != NULL);
1003 1006
1004 SET(bp->b_flags, B_ASYNC); 1007 SET(bp->b_flags, B_ASYNC);
1005 VOP_BWRITE(bp->b_vp, bp); 1008 VOP_BWRITE(bp->b_vp, bp);
1006} 1009}
1007 1010
1008/* 1011/*
1009 * Release a buffer on to the free lists. 1012 * Release a buffer on to the free lists.
1010 * Described in Bach (p. 46). 1013 * Described in Bach (p. 46).
1011 */ 1014 */
1012void 1015void
1013brelsel(buf_t *bp, int set) 1016brelsel(buf_t *bp, int set)
1014{ 1017{
1015 struct bqueue *bufq; 1018 struct bqueue *bufq;
1016 struct vnode *vp; 1019 struct vnode *vp;
1017 1020
1018 SDT_PROBE2(io, kernel, , brelse, bp, set); 1021 SDT_PROBE2(io, kernel, , brelse, bp, set);
1019 1022
1020 KASSERT(bp != NULL); 1023 KASSERT(bp != NULL);
1021 KASSERT(mutex_owned(&bufcache_lock)); 1024 KASSERT(mutex_owned(&bufcache_lock));
1022 KASSERT(!cv_has_waiters(&bp->b_done)); 1025 KASSERT(!cv_has_waiters(&bp->b_done));
1023 1026
1024 SET(bp->b_cflags, set); 1027 SET(bp->b_cflags, set);
1025 1028
1026 KASSERT(ISSET(bp->b_cflags, BC_BUSY)); 1029 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1027 KASSERT(bp->b_iodone == NULL); 1030 KASSERT(bp->b_iodone == NULL);
1028 1031
1029 /* Wake up any processes waiting for any buffer to become free. */ 1032 /* Wake up any processes waiting for any buffer to become free. */
1030 cv_signal(&needbuffer_cv); 1033 cv_signal(&needbuffer_cv);
1031 1034
 1032 /* Wake up any processes waiting for _this_ buffer to become free */ 1035 /* Wake up any processes waiting for _this_ buffer to become free */
1033 if (ISSET(bp->b_cflags, BC_WANTED)) 1036 if (ISSET(bp->b_cflags, BC_WANTED))
1034 CLR(bp->b_cflags, BC_WANTED|BC_AGE); 1037 CLR(bp->b_cflags, BC_WANTED|BC_AGE);
1035 1038
1036 /* If it's clean clear the copy-on-write flag. */ 1039 /* If it's clean clear the copy-on-write flag. */
1037 if (ISSET(bp->b_flags, B_COWDONE)) { 1040 if (ISSET(bp->b_flags, B_COWDONE)) {
1038 mutex_enter(bp->b_objlock); 1041 mutex_enter(bp->b_objlock);
1039 if (!ISSET(bp->b_oflags, BO_DELWRI)) 1042 if (!ISSET(bp->b_oflags, BO_DELWRI))
1040 CLR(bp->b_flags, B_COWDONE); 1043 CLR(bp->b_flags, B_COWDONE);
1041 mutex_exit(bp->b_objlock); 1044 mutex_exit(bp->b_objlock);
1042 } 1045 }
1043 1046
1044 /* 1047 /*
1045 * Determine which queue the buffer should be on, then put it there. 1048 * Determine which queue the buffer should be on, then put it there.
1046 */ 1049 */
1047 1050
1048 /* If it's locked, don't report an error; try again later. */ 1051 /* If it's locked, don't report an error; try again later. */
1049 if (ISSET(bp->b_flags, B_LOCKED)) 1052 if (ISSET(bp->b_flags, B_LOCKED))
1050 bp->b_error = 0; 1053 bp->b_error = 0;
1051 1054
1052 /* If it's not cacheable, or an error, mark it invalid. */ 1055 /* If it's not cacheable, or an error, mark it invalid. */
1053 if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0) 1056 if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0)
1054 SET(bp->b_cflags, BC_INVAL); 1057 SET(bp->b_cflags, BC_INVAL);
1055 1058
1056 if (ISSET(bp->b_cflags, BC_VFLUSH)) { 1059 if (ISSET(bp->b_cflags, BC_VFLUSH)) {
1057 /* 1060 /*
1058 * This is a delayed write buffer that was just flushed to 1061 * This is a delayed write buffer that was just flushed to
1059 * disk. It is still on the LRU queue. If it's become 1062 * disk. It is still on the LRU queue. If it's become
1060 * invalid, then we need to move it to a different queue; 1063 * invalid, then we need to move it to a different queue;
1061 * otherwise leave it in its current position. 1064 * otherwise leave it in its current position.
1062 */ 1065 */
1063 CLR(bp->b_cflags, BC_VFLUSH); 1066 CLR(bp->b_cflags, BC_VFLUSH);
1064 if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) && 1067 if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) &&
1065 !ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) { 1068 !ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) {
1066 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1)); 1069 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1));
1067 goto already_queued; 1070 goto already_queued;
1068 } else { 1071 } else {
1069 bremfree(bp); 1072 bremfree(bp);
1070 } 1073 }
1071 } 1074 }
1072 1075
1073 KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0)); 1076 KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0));
1074 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0)); 1077 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0));
1075 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0)); 1078 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0));
1076 1079
1077 if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) { 1080 if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) {
1078 /* 1081 /*
1079 * If it's invalid or empty, dissociate it from its vnode 1082 * If it's invalid or empty, dissociate it from its vnode
1080 * and put on the head of the appropriate queue. 1083 * and put on the head of the appropriate queue.
1081 */ 1084 */
1082 if (ISSET(bp->b_flags, B_LOCKED)) { 1085 if (ISSET(bp->b_flags, B_LOCKED)) {
1083 if (wapbl_vphaswapbl(vp = bp->b_vp)) { 1086 if (wapbl_vphaswapbl(vp = bp->b_vp)) {
1084 struct mount *mp = wapbl_vptomp(vp); 1087 struct mount *mp = wapbl_vptomp(vp);
1085 1088
1086 KASSERT(bp->b_iodone 1089 KASSERT(bp->b_iodone
1087 != mp->mnt_wapbl_op->wo_wapbl_biodone); 1090 != mp->mnt_wapbl_op->wo_wapbl_biodone);
1088 WAPBL_REMOVE_BUF(mp, bp); 1091 WAPBL_REMOVE_BUF(mp, bp);
1089 } 1092 }
1090 } 1093 }
1091 1094
1092 mutex_enter(bp->b_objlock); 1095 mutex_enter(bp->b_objlock);
1093 CLR(bp->b_oflags, BO_DONE|BO_DELWRI); 1096 CLR(bp->b_oflags, BO_DONE|BO_DELWRI);
1094 if ((vp = bp->b_vp) != NULL) { 1097 if ((vp = bp->b_vp) != NULL) {
1095 KASSERT(bp->b_objlock == vp->v_interlock); 1098 KASSERT(bp->b_objlock == vp->v_interlock);
1096 reassignbuf(bp, bp->b_vp); 1099 reassignbuf(bp, bp->b_vp);
1097 brelvp(bp); 1100 brelvp(bp);
1098 mutex_exit(vp->v_interlock); 1101 mutex_exit(vp->v_interlock);
1099 } else { 1102 } else {
1100 KASSERT(bp->b_objlock == &buffer_lock); 1103 KASSERT(bp->b_objlock == &buffer_lock);
1101 mutex_exit(bp->b_objlock); 1104 mutex_exit(bp->b_objlock);
1102 } 1105 }
1103 /* We want to dispose of the buffer, so wake everybody. */ 1106 /* We want to dispose of the buffer, so wake everybody. */
1104 cv_broadcast(&bp->b_busy); 1107 cv_broadcast(&bp->b_busy);
1105 if (bp->b_bufsize <= 0) 1108 if (bp->b_bufsize <= 0)
1106 /* no data */ 1109 /* no data */
1107 goto already_queued; 1110 goto already_queued;
1108 else 1111 else
1109 /* invalid data */ 1112 /* invalid data */
1110 bufq = &bufqueues[BQ_AGE]; 1113 bufq = &bufqueues[BQ_AGE];
1111 binsheadfree(bp, bufq); 1114 binsheadfree(bp, bufq);
1112 } else { 1115 } else {
1113 /* 1116 /*
1114 * It has valid data. Put it on the end of the appropriate 1117 * It has valid data. Put it on the end of the appropriate
1115 * queue, so that it'll stick around for as long as possible. 1118 * queue, so that it'll stick around for as long as possible.
1116 * If buf is AGE, but has dependencies, must put it on last 1119 * If buf is AGE, but has dependencies, must put it on last
1117 * bufqueue to be scanned, ie LRU. This protects against the 1120 * bufqueue to be scanned, ie LRU. This protects against the
1118 * livelock where BQ_AGE only has buffers with dependencies, 1121 * livelock where BQ_AGE only has buffers with dependencies,
1119 * and we thus never get to the dependent buffers in BQ_LRU. 1122 * and we thus never get to the dependent buffers in BQ_LRU.
1120 */ 1123 */
1121 if (ISSET(bp->b_flags, B_LOCKED)) { 1124 if (ISSET(bp->b_flags, B_LOCKED)) {
1122 /* locked in core */ 1125 /* locked in core */
1123 bufq = &bufqueues[BQ_LOCKED]; 1126 bufq = &bufqueues[BQ_LOCKED];
1124 } else if (!ISSET(bp->b_cflags, BC_AGE)) { 1127 } else if (!ISSET(bp->b_cflags, BC_AGE)) {
1125 /* valid data */ 1128 /* valid data */
1126 bufq = &bufqueues[BQ_LRU]; 1129 bufq = &bufqueues[BQ_LRU];
1127 } else { 1130 } else {
1128 /* stale but valid data */ 1131 /* stale but valid data */
1129 bufq = &bufqueues[BQ_AGE]; 1132 bufq = &bufqueues[BQ_AGE];
1130 } 1133 }
1131 binstailfree(bp, bufq); 1134 binstailfree(bp, bufq);
1132 } 1135 }
1133already_queued: 1136already_queued:
1134 /* Unlock the buffer. */ 1137 /* Unlock the buffer. */
1135 CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE); 1138 CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE);
1136 CLR(bp->b_flags, B_ASYNC); 1139 CLR(bp->b_flags, B_ASYNC);
1137 1140
1138 /* 1141 /*
1139 * Wake only the highest priority waiter on the lock, in order to 1142 * Wake only the highest priority waiter on the lock, in order to
1140 * prevent a thundering herd: many LWPs simultaneously awakening and 1143 * prevent a thundering herd: many LWPs simultaneously awakening and
1141 * competing for the buffer's lock. Testing in 2019 revealed this 1144 * competing for the buffer's lock. Testing in 2019 revealed this
1142 * to reduce contention on bufcache_lock tenfold during a kernel 1145 * to reduce contention on bufcache_lock tenfold during a kernel
1143 * compile. Here and elsewhere, when the buffer is changing 1146 * compile. Here and elsewhere, when the buffer is changing
1144 * identity, being disposed of, or moving from one list to another, 1147 * identity, being disposed of, or moving from one list to another,
1145 * we wake all lock requestors. 1148 * we wake all lock requestors.
1146 */ 1149 */
1147 if (bp->b_bufsize <= 0) { 1150 if (bp->b_bufsize <= 0) {
1148 cv_broadcast(&bp->b_busy); 1151 cv_broadcast(&bp->b_busy);
1149 buf_destroy(bp); 1152 buf_destroy(bp);
1150#ifdef DEBUG 1153#ifdef DEBUG
1151 memset((char *)bp, 0, sizeof(*bp)); 1154 memset((char *)bp, 0, sizeof(*bp));
1152#endif 1155#endif
1153 pool_cache_put(buf_cache, bp); 1156 pool_cache_put(buf_cache, bp);
1154 } else 1157 } else
1155 cv_signal(&bp->b_busy); 1158 cv_signal(&bp->b_busy);
1156} 1159}
1157 1160
1158void 1161void
1159brelse(buf_t *bp, int set) 1162brelse(buf_t *bp, int set)
1160{ 1163{
1161 1164
1162 mutex_enter(&bufcache_lock); 1165 mutex_enter(&bufcache_lock);
1163 brelsel(bp, set); 1166 brelsel(bp, set);
1164 mutex_exit(&bufcache_lock); 1167 mutex_exit(&bufcache_lock);
1165} 1168}
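[Editor's aside: for a buffer that still holds valid data, brelsel()'s queue choice reduces to three cases: locked buffers stay on BQ_LOCKED, ordinary valid buffers go to the tail of BQ_LRU, and aged-but-valid buffers go to the tail of BQ_AGE. A compressed sketch follows; the boolean flags and names are illustrative, and the invalid/empty path (head of BQ_AGE, or destroying the buffer outright) is omitted.]

#include <stdbool.h>

enum toy_queue { Q_LOCKED, Q_LRU, Q_AGE };

static enum toy_queue
pick_queue_valid(bool locked, bool aged)
{

	if (locked)
		return Q_LOCKED;	/* pinned in core; not reclaimed by the scan */
	if (!aged)
		return Q_LRU;		/* fresh valid data: keep it the longest */
	return Q_AGE;			/* stale but valid: first in line for reuse */
}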
1166 1169
1167/* 1170/*
1168 * Determine if a block is in the cache. 1171 * Determine if a block is in the cache.
1169 * Just look on what would be its hash chain. If it's there, return 1172 * Just look on what would be its hash chain. If it's there, return
1170 * a pointer to it, unless it's marked invalid. If it's marked invalid, 1173 * a pointer to it, unless it's marked invalid. If it's marked invalid,
1171 * we normally don't return the buffer, unless the caller explicitly 1174 * we normally don't return the buffer, unless the caller explicitly
1172 * wants us to. 1175 * wants us to.
1173 */ 1176 */
1174buf_t * 1177buf_t *
1175incore(struct vnode *vp, daddr_t blkno) 1178incore(struct vnode *vp, daddr_t blkno)
1176{ 1179{
1177 buf_t *bp; 1180 buf_t *bp;
1178 1181
1179 KASSERT(mutex_owned(&bufcache_lock)); 1182 KASSERT(mutex_owned(&bufcache_lock));
1180 1183
1181 /* Search hash chain */ 1184 /* Search hash chain */
1182 LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) { 1185 LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
1183 if (bp->b_lblkno == blkno && bp->b_vp == vp && 1186 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
1184 !ISSET(bp->b_cflags, BC_INVAL)) { 1187 !ISSET(bp->b_cflags, BC_INVAL)) {
1185 KASSERT(bp->b_objlock == vp->v_interlock); 1188 KASSERT(bp->b_objlock == vp->v_interlock);
1186 return (bp); 1189 return (bp);
1187 } 1190 }
1188 } 1191 }
1189 1192
1190 return (NULL); 1193 return (NULL);
1191} 1194}
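[Editor's aside: incore() is a straight chained-hash lookup keyed on the (vnode, logical block) pair, skipping entries marked BC_INVAL. Below is a self-contained userland sketch of the same lookup shape, assuming an illustrative power-of-two table and hash function; none of the names are the kernel's.]

#include <stddef.h>
#include <stdint.h>

struct tbuf {
	struct tbuf *next;		/* hash chain link, like b_hash */
	const void *obj;		/* like b_vp */
	int64_t blkno;			/* like b_lblkno */
	int invalid;			/* like BC_INVAL */
};

#define	NHASH	256			/* power of two so a mask suffices */

static size_t
thash(const void *obj, int64_t blkno)
{

	return (((uintptr_t)obj >> 4) + (uint64_t)blkno) & (NHASH - 1);
}

static struct tbuf *
tincore(struct tbuf *tbl[NHASH], const void *obj, int64_t blkno)
{

	for (struct tbuf *bp = tbl[thash(obj, blkno)]; bp != NULL;
	    bp = bp->next)
		if (bp->blkno == blkno && bp->obj == obj && !bp->invalid)
			return bp;	/* valid cached block found */
	return NULL;			/* not in the cache */
}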
1192 1195
1193/* 1196/*
1194 * Get a block of requested size that is associated with 1197 * Get a block of requested size that is associated with
1195 * a given vnode and block offset. If it is found in the 1198 * a given vnode and block offset. If it is found in the
1196 * block cache, mark it as having been found, make it busy 1199 * block cache, mark it as having been found, make it busy
1197 * and return it. Otherwise, return an empty block of the 1200 * and return it. Otherwise, return an empty block of the
1198 * correct size. It is up to the caller to insure that the 1201 * correct size. It is up to the caller to insure that the
1199 * cached blocks be of the correct size. 1202 * cached blocks be of the correct size.
1200 */ 1203 */
1201buf_t * 1204buf_t *
1202getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo) 1205getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1203{ 1206{
1204 int err, preserve; 1207 int err, preserve;
1205 buf_t *bp; 1208 buf_t *bp;
1206 1209
1207 mutex_enter(&bufcache_lock); 1210 mutex_enter(&bufcache_lock);
1208 SDT_PROBE3(io, kernel, , getblk__start, vp, blkno, size); 1211 SDT_PROBE3(io, kernel, , getblk__start, vp, blkno, size);
1209 loop: 1212 loop:
1210 bp = incore(vp, blkno); 1213 bp = incore(vp, blkno);
1211 if (bp != NULL) { 1214 if (bp != NULL) {
1212 err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL); 1215 err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL);
1213 if (err != 0) { 1216 if (err != 0) {
1214 if (err == EPASSTHROUGH) 1217 if (err == EPASSTHROUGH)
1215 goto loop; 1218 goto loop;
1216 mutex_exit(&bufcache_lock); 1219 mutex_exit(&bufcache_lock);
1217 SDT_PROBE4(io, kernel, , getblk__done, 1220 SDT_PROBE4(io, kernel, , getblk__done,
1218 vp, blkno, size, NULL); 1221 vp, blkno, size, NULL);
1219 return (NULL); 1222 return (NULL);
1220 } 1223 }
1221 KASSERT(!cv_has_waiters(&bp->b_done)); 1224 KASSERT(!cv_has_waiters(&bp->b_done));
1222#ifdef DIAGNOSTIC 1225#ifdef DIAGNOSTIC
1223 if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) && 1226 if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) &&
1224 bp->b_bcount < size && vp->v_type != VBLK) 1227 bp->b_bcount < size && vp->v_type != VBLK)
1225 panic("getblk: block size invariant failed"); 1228 panic("getblk: block size invariant failed");
1226#endif 1229#endif
1227 bremfree(bp); 1230 bremfree(bp);
1228 preserve = 1; 1231 preserve = 1;
1229 } else { 1232 } else {
1230 if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL) 1233 if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL)
1231 goto loop; 1234 goto loop;
1232 1235
1233 if (incore(vp, blkno) != NULL) { 1236 if (incore(vp, blkno) != NULL) {
1234 /* The block has come into memory in the meantime. */ 1237 /* The block has come into memory in the meantime. */
1235 brelsel(bp, 0); 1238 brelsel(bp, 0);
1236 goto loop; 1239 goto loop;
1237 } 1240 }
1238 1241
1239 LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash); 1242 LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash);
1240 bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno; 1243 bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
1241 mutex_enter(vp->v_interlock); 1244 mutex_enter(vp->v_interlock);
1242 bgetvp(vp, bp); 1245 bgetvp(vp, bp);
1243 mutex_exit(vp->v_interlock); 1246 mutex_exit(vp->v_interlock);
1244 preserve = 0; 1247 preserve = 0;
1245 } 1248 }
1246 mutex_exit(&bufcache_lock); 1249 mutex_exit(&bufcache_lock);
1247 1250
1248 /* 1251 /*
1249 * LFS can't track total size of B_LOCKED buffer (locked_queue_bytes) 1252 * LFS can't track total size of B_LOCKED buffer (locked_queue_bytes)
1250 * if we re-size buffers here. 1253 * if we re-size buffers here.
1251 */ 1254 */
1252 if (ISSET(bp->b_flags, B_LOCKED)) { 1255 if (ISSET(bp->b_flags, B_LOCKED)) {
1253 KASSERT(bp->b_bufsize >= size); 1256 KASSERT(bp->b_bufsize >= size);
1254 } else { 1257 } else {
1255 if (allocbuf(bp, size, preserve)) { 1258 if (allocbuf(bp, size, preserve)) {
1256 mutex_enter(&bufcache_lock); 1259 mutex_enter(&bufcache_lock);
1257 LIST_REMOVE(bp, b_hash); 1260 LIST_REMOVE(bp, b_hash);
1258 brelsel(bp, BC_INVAL); 1261 brelsel(bp, BC_INVAL);
1259 mutex_exit(&bufcache_lock); 1262 mutex_exit(&bufcache_lock);
1260 SDT_PROBE4(io, kernel, , getblk__done, 1263 SDT_PROBE4(io, kernel, , getblk__done,
1261 vp, blkno, size, NULL); 1264 vp, blkno, size, NULL);
1262 return NULL; 1265 return NULL;
1263 } 1266 }
1264 } 1267 }
1265 BIO_SETPRIO(bp, BPRIO_DEFAULT); 1268 BIO_SETPRIO(bp, BPRIO_DEFAULT);
1266 SDT_PROBE4(io, kernel, , getblk__done, vp, blkno, size, bp); 1269 SDT_PROBE4(io, kernel, , getblk__done, vp, blkno, size, bp);
1267 return (bp); 1270 return (bp);
1268} 1271}
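[Editor's aside: getblk() follows the classic check / allocate / re-check pattern: look up under bufcache_lock, drop the lock to obtain a new buffer (which may sleep), then re-check before inserting in case another thread brought the block in meanwhile. The sketch below shows that pattern in userland, with a pthread mutex and a toy slot array standing in for bufcache_lock and the hash table; all names are illustrative.]

#include <pthread.h>
#include <stdlib.h>

#define	NSLOT	64

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct { long key; void *val; } slot[NSLOT];

static void *
lookup_locked(long key)
{

	for (int i = 0; i < NSLOT; i++)
		if (slot[i].val != NULL && slot[i].key == key)
			return slot[i].val;
	return NULL;
}

static void
insert_locked(long key, void *val)
{

	for (int i = 0; i < NSLOT; i++)
		if (slot[i].val == NULL) {
			slot[i].key = key;
			slot[i].val = val;
			return;
		}
}

void *
get_or_create(long key, size_t size)
{
	void *v;

	pthread_mutex_lock(&cache_lock);
	while ((v = lookup_locked(key)) == NULL) {
		pthread_mutex_unlock(&cache_lock);
		v = malloc(size);		/* may block; lock is dropped */
		pthread_mutex_lock(&cache_lock);
		if (v == NULL)
			break;			/* allocation failed */
		if (lookup_locked(key) == NULL) {
			insert_locked(key, v);	/* still absent: we win */
			break;
		}
		free(v);			/* lost the race; retry lookup */
	}
	pthread_mutex_unlock(&cache_lock);
	return v;
}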
1269 1272
1270/* 1273/*
1271 * Get an empty, disassociated buffer of given size. 1274 * Get an empty, disassociated buffer of given size.
1272 */ 1275 */
1273buf_t * 1276buf_t *
1274geteblk(int size) 1277geteblk(int size)
1275{ 1278{
1276 buf_t *bp; 1279 buf_t *bp;
1277 int error __diagused; 1280 int error __diagused;
1278 1281
1279 mutex_enter(&bufcache_lock); 1282 mutex_enter(&bufcache_lock);
1280 while ((bp = getnewbuf(0, 0, 0)) == NULL) 1283 while ((bp = getnewbuf(0, 0, 0)) == NULL)
1281 ; 1284 ;
1282 1285
1283 SET(bp->b_cflags, BC_INVAL); 1286 SET(bp->b_cflags, BC_INVAL);
1284 LIST_INSERT_HEAD(&invalhash, bp, b_hash); 1287 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1285 mutex_exit(&bufcache_lock); 1288 mutex_exit(&bufcache_lock);
1286 BIO_SETPRIO(bp, BPRIO_DEFAULT); 1289 BIO_SETPRIO(bp, BPRIO_DEFAULT);
1287 error = allocbuf(bp, size, 0); 1290 error = allocbuf(bp, size, 0);
1288 KASSERT(error == 0); 1291 KASSERT(error == 0);
1289 return (bp); 1292 return (bp);
1290} 1293}
1291 1294
1292/* 1295/*
1293 * Expand or contract the actual memory allocated to a buffer. 1296 * Expand or contract the actual memory allocated to a buffer.
1294 * 1297 *
1295 * If the buffer shrinks, data is lost, so it's up to the 1298 * If the buffer shrinks, data is lost, so it's up to the
1296 * caller to have written it out *first*; this routine will not 1299 * caller to have written it out *first*; this routine will not
 1297 * start a write. If the buffer grows, it's the caller's 1300 * start a write. If the buffer grows, it's the caller's
1298 * responsibility to fill out the buffer's additional contents. 1301 * responsibility to fill out the buffer's additional contents.
1299 */ 1302 */
1300int 1303int
1301allocbuf(buf_t *bp, int size, int preserve) 1304allocbuf(buf_t *bp, int size, int preserve)
1302{ 1305{
1303 void *addr; 1306 void *addr;
1304 vsize_t oldsize, desired_size; 1307 vsize_t oldsize, desired_size;
1305 int oldcount; 1308 int oldcount;
1306 int delta; 1309 int delta;
1307 1310
1308 desired_size = buf_roundsize(size); 1311 desired_size = buf_roundsize(size);
1309 if (desired_size > MAXBSIZE) 1312 if (desired_size > MAXBSIZE)
1310 printf("allocbuf: buffer larger than MAXBSIZE requested"); 1313 printf("allocbuf: buffer larger than MAXBSIZE requested");
1311 1314
1312 oldcount = bp->b_bcount; 1315 oldcount = bp->b_bcount;
1313 1316
1314 bp->b_bcount = size; 1317 bp->b_bcount = size;
1315 1318
1316 oldsize = bp->b_bufsize; 1319 oldsize = bp->b_bufsize;
1317 if (oldsize == desired_size) { 1320 if (oldsize == desired_size) {
1318 /* 1321 /*
1319 * Do not short cut the WAPBL resize, as the buffer length 1322 * Do not short cut the WAPBL resize, as the buffer length
1320 * could still have changed and this would corrupt the 1323 * could still have changed and this would corrupt the
1321 * tracking of the transaction length. 1324 * tracking of the transaction length.
1322 */ 1325 */
1323 goto out; 1326 goto out;
1324 } 1327 }
1325 1328
1326 /* 1329 /*
1327 * If we want a buffer of a different size, re-allocate the 1330 * If we want a buffer of a different size, re-allocate the
1328 * buffer's memory; copy old content only if needed. 1331 * buffer's memory; copy old content only if needed.
1329 */ 1332 */
1330 addr = buf_alloc(desired_size); 1333 addr = buf_alloc(desired_size);
1331 if (addr == NULL) 1334 if (addr == NULL)
1332 return ENOMEM; 1335 return ENOMEM;
1333 if (preserve) 1336 if (preserve)
1334 memcpy(addr, bp->b_data, MIN(oldsize,desired_size)); 1337 memcpy(addr, bp->b_data, MIN(oldsize,desired_size));
1335 if (bp->b_data != NULL) 1338 if (bp->b_data != NULL)
1336 buf_mrelease(bp->b_data, oldsize); 1339 buf_mrelease(bp->b_data, oldsize);
1337 bp->b_data = addr; 1340 bp->b_data = addr;
1338 bp->b_bufsize = desired_size; 1341 bp->b_bufsize = desired_size;
1339 1342
1340 /* 1343 /*
1341 * Update overall buffer memory counter (protected by bufcache_lock) 1344 * Update overall buffer memory counter (protected by bufcache_lock)
1342 */ 1345 */
1343 delta = (long)desired_size - (long)oldsize; 1346 delta = (long)desired_size - (long)oldsize;
1344 1347
1345 mutex_enter(&bufcache_lock); 1348 mutex_enter(&bufcache_lock);
1346 if ((bufmem += delta) > bufmem_hiwater) { 1349 if ((bufmem += delta) > bufmem_hiwater) {
1347 /* 1350 /*
1348 * Need to trim overall memory usage. 1351 * Need to trim overall memory usage.
1349 */ 1352 */
1350 while (buf_canrelease()) { 1353 while (buf_canrelease()) {
1351 if (preempt_needed()) { 1354 if (preempt_needed()) {
1352 mutex_exit(&bufcache_lock); 1355 mutex_exit(&bufcache_lock);
1353 preempt(); 1356 preempt();
1354 mutex_enter(&bufcache_lock); 1357 mutex_enter(&bufcache_lock);
1355 } 1358 }
1356 if (buf_trim() == 0) 1359 if (buf_trim() == 0)
1357 break; 1360 break;
1358 } 1361 }
1359 } 1362 }
1360 mutex_exit(&bufcache_lock); 1363 mutex_exit(&bufcache_lock);
1361 1364
1362 out: 1365 out:
1363 if (wapbl_vphaswapbl(bp->b_vp)) 1366 if (wapbl_vphaswapbl(bp->b_vp))
1364 WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount); 1367 WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount);
1365 1368
1366 return 0; 1369 return 0;
1367} 1370}
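[Editor's aside: stripped of the WAPBL hook and the bufmem accounting, allocbuf()'s core is "allocate new backing store, optionally copy the old contents, release the old store". A plain-C sketch of just that step, with illustrative names; the real routine also rounds the size via buf_roundsize() and trims the cache when bufmem exceeds the high-water mark.]

#include <stdlib.h>
#include <string.h>

struct resbuf {
	void *data;
	size_t bufsize;
};

static int
toy_allocbuf(struct resbuf *bp, size_t desired, int preserve)
{
	void *addr;

	if (bp->bufsize == desired)
		return 0;			/* already the right size */

	if ((addr = malloc(desired)) == NULL)
		return -1;			/* ENOMEM analogue */
	if (preserve && bp->data != NULL)	/* keep old contents if asked */
		memcpy(addr, bp->data,
		    bp->bufsize < desired ? bp->bufsize : desired);
	free(bp->data);				/* release old backing store */
	bp->data = addr;
	bp->bufsize = desired;
	return 0;
}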
1368 1371
1369/* 1372/*
1370 * Find a buffer which is available for use. 1373 * Find a buffer which is available for use.
1371 * Select something from a free list. 1374 * Select something from a free list.
1372 * Preference is to AGE list, then LRU list. 1375 * Preference is to AGE list, then LRU list.
1373 * 1376 *
1374 * Called with the buffer queues locked. 1377 * Called with the buffer queues locked.
1375 * Return buffer locked. 1378 * Return buffer locked.
1376 */ 1379 */
1377static buf_t * 1380static buf_t *
1378getnewbuf(int slpflag, int slptimeo, int from_bufq) 1381getnewbuf(int slpflag, int slptimeo, int from_bufq)
1379{ 1382{
1380 buf_t *bp; 1383 buf_t *bp;
1381 struct vnode *vp; 1384 struct vnode *vp;
1382 struct mount *transmp = NULL; 1385 struct mount *transmp = NULL;
1383 1386
1384 SDT_PROBE0(io, kernel, , getnewbuf__start); 1387 SDT_PROBE0(io, kernel, , getnewbuf__start);
1385 1388
1386 start: 1389 start:
1387 KASSERT(mutex_owned(&bufcache_lock)); 1390 KASSERT(mutex_owned(&bufcache_lock));
1388 1391
1389 /* 1392 /*
1390 * Get a new buffer from the pool. 1393 * Get a new buffer from the pool.
1391 */ 1394 */
1392 if (!from_bufq && buf_lotsfree()) { 1395 if (!from_bufq && buf_lotsfree()) {
1393 mutex_exit(&bufcache_lock); 1396 mutex_exit(&bufcache_lock);
1394 bp = pool_cache_get(buf_cache, PR_NOWAIT); 1397 bp = pool_cache_get(buf_cache, PR_NOWAIT);
1395 if (bp != NULL) { 1398 if (bp != NULL) {
1396 memset((char *)bp, 0, sizeof(*bp)); 1399 memset((char *)bp, 0, sizeof(*bp));
1397 buf_init(bp); 1400 buf_init(bp);
1398 SET(bp->b_cflags, BC_BUSY); /* mark buffer busy */ 1401 SET(bp->b_cflags, BC_BUSY); /* mark buffer busy */
1399 mutex_enter(&bufcache_lock); 1402 mutex_enter(&bufcache_lock);
1400#if defined(DIAGNOSTIC) 1403#if defined(DIAGNOSTIC)
1401 bp->b_freelistindex = -1; 1404 bp->b_freelistindex = -1;
1402#endif /* defined(DIAGNOSTIC) */ 1405#endif /* defined(DIAGNOSTIC) */
1403 SDT_PROBE1(io, kernel, , getnewbuf__done, bp); 1406 SDT_PROBE1(io, kernel, , getnewbuf__done, bp);
1404 return (bp); 1407 return (bp);
1405 } 1408 }
1406 mutex_enter(&bufcache_lock); 1409 mutex_enter(&bufcache_lock);
1407 } 1410 }
1408 1411
1409 KASSERT(mutex_owned(&bufcache_lock)); 1412 KASSERT(mutex_owned(&bufcache_lock));
1410 if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL) { 1413 if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL) {
1411 KASSERT(!ISSET(bp->b_oflags, BO_DELWRI)); 1414 KASSERT(!ISSET(bp->b_oflags, BO_DELWRI));
1412 } else { 1415 } else {
1413 TAILQ_FOREACH(bp, &bufqueues[BQ_LRU].bq_queue, b_freelist) { 1416 TAILQ_FOREACH(bp, &bufqueues[BQ_LRU].bq_queue, b_freelist) {
1414 if (ISSET(bp->b_cflags, BC_VFLUSH) || 1417 if (ISSET(bp->b_cflags, BC_VFLUSH) ||
1415 !ISSET(bp->b_oflags, BO_DELWRI)) 1418 !ISSET(bp->b_oflags, BO_DELWRI))
1416 break; 1419 break;
1417 if (fstrans_start_nowait(bp->b_vp->v_mount) == 0) { 1420 if (fstrans_start_nowait(bp->b_vp->v_mount) == 0) {
1418 KASSERT(transmp == NULL); 1421 KASSERT(transmp == NULL);
1419 transmp = bp->b_vp->v_mount; 1422 transmp = bp->b_vp->v_mount;
1420 break; 1423 break;
1421 } 1424 }
1422 } 1425 }
1423 } 1426 }
1424 if (bp != NULL) { 1427 if (bp != NULL) {
1425 KASSERT(!ISSET(bp->b_cflags, BC_BUSY) || ISSET(bp->b_cflags, BC_VFLUSH)); 1428 KASSERT(!ISSET(bp->b_cflags, BC_BUSY) || ISSET(bp->b_cflags, BC_VFLUSH));
1426 bremfree(bp); 1429 bremfree(bp);
1427 1430
1428 /* Buffer is no longer on free lists. */ 1431 /* Buffer is no longer on free lists. */
1429 SET(bp->b_cflags, BC_BUSY); 1432 SET(bp->b_cflags, BC_BUSY);
1430 1433
1431 /* Wake anyone trying to lock the old identity. */ 1434 /* Wake anyone trying to lock the old identity. */
1432 cv_broadcast(&bp->b_busy); 1435 cv_broadcast(&bp->b_busy);
1433 } else { 1436 } else {
1434 /* 1437 /*
1435 * XXX: !from_bufq should be removed. 1438 * XXX: !from_bufq should be removed.
1436 */ 1439 */
1437 if (!from_bufq || curlwp != uvm.pagedaemon_lwp) { 1440 if (!from_bufq || curlwp != uvm.pagedaemon_lwp) {
1438 /* wait for a free buffer of any kind */ 1441 /* wait for a free buffer of any kind */
1439 if ((slpflag & PCATCH) != 0) 1442 if ((slpflag & PCATCH) != 0)
1440 (void)cv_timedwait_sig(&needbuffer_cv, 1443 (void)cv_timedwait_sig(&needbuffer_cv,
1441 &bufcache_lock, slptimeo); 1444 &bufcache_lock, slptimeo);
1442 else 1445 else
1443 (void)cv_timedwait(&needbuffer_cv, 1446 (void)cv_timedwait(&needbuffer_cv,
1444 &bufcache_lock, slptimeo); 1447 &bufcache_lock, slptimeo);
1445 } 1448 }
1446 SDT_PROBE1(io, kernel, , getnewbuf__done, NULL); 1449 SDT_PROBE1(io, kernel, , getnewbuf__done, NULL);
1447 return (NULL); 1450 return (NULL);
1448 } 1451 }
1449 1452
1450#ifdef DIAGNOSTIC 1453#ifdef DIAGNOSTIC
1451 if (bp->b_bufsize <= 0) 1454 if (bp->b_bufsize <= 0)
1452 panic("buffer %p: on queue but empty", bp); 1455 panic("buffer %p: on queue but empty", bp);
1453#endif 1456#endif
1454 1457
1455 if (ISSET(bp->b_cflags, BC_VFLUSH)) { 1458 if (ISSET(bp->b_cflags, BC_VFLUSH)) {
1456 /* 1459 /*
1457 * This is a delayed write buffer being flushed to disk. Make 1460 * This is a delayed write buffer being flushed to disk. Make
1458 * sure it gets aged out of the queue when it's finished, and 1461 * sure it gets aged out of the queue when it's finished, and
1459 * leave it off the LRU queue. 1462 * leave it off the LRU queue.
1460 */ 1463 */
1461 CLR(bp->b_cflags, BC_VFLUSH); 1464 CLR(bp->b_cflags, BC_VFLUSH);
1462 SET(bp->b_cflags, BC_AGE); 1465 SET(bp->b_cflags, BC_AGE);
1463 goto start; 1466 goto start;
1464 } 1467 }
1465 1468
1466 KASSERT(ISSET(bp->b_cflags, BC_BUSY)); 1469 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1467 KASSERT(!cv_has_waiters(&bp->b_done)); 1470 KASSERT(!cv_has_waiters(&bp->b_done));
1468 1471
1469 /* 1472 /*
1470 * If buffer was a delayed write, start it and return NULL 1473 * If buffer was a delayed write, start it and return NULL
1471 * (since we might sleep while starting the write). 1474 * (since we might sleep while starting the write).
1472 */ 1475 */
1473 if (ISSET(bp->b_oflags, BO_DELWRI)) { 1476 if (ISSET(bp->b_oflags, BO_DELWRI)) {
1474 /* 1477 /*
1475 * This buffer has gone through the LRU, so make sure it gets 1478 * This buffer has gone through the LRU, so make sure it gets
1476 * reused ASAP. 1479 * reused ASAP.
1477 */ 1480 */
1478 SET(bp->b_cflags, BC_AGE); 1481 SET(bp->b_cflags, BC_AGE);
1479 mutex_exit(&bufcache_lock); 1482 mutex_exit(&bufcache_lock);
1480 bawrite(bp); 1483 bawrite(bp);
1481 KASSERT(transmp != NULL); 1484 KASSERT(transmp != NULL);
1482 fstrans_done(transmp); 1485 fstrans_done(transmp);
1483 mutex_enter(&bufcache_lock); 1486 mutex_enter(&bufcache_lock);
1484 SDT_PROBE1(io, kernel, , getnewbuf__done, NULL); 1487 SDT_PROBE1(io, kernel, , getnewbuf__done, NULL);
1485 return (NULL); 1488 return (NULL);
1486 } 1489 }
1487 1490
1488 KASSERT(transmp == NULL); 1491 KASSERT(transmp == NULL);
1489 1492
1490 vp = bp->b_vp; 1493 vp = bp->b_vp;
1491 1494
1492 /* clear out various other fields */ 1495 /* clear out various other fields */
1493 bp->b_cflags = BC_BUSY; 1496 bp->b_cflags = BC_BUSY;
1494 bp->b_oflags = 0; 1497 bp->b_oflags = 0;
1495 bp->b_flags = 0; 1498 bp->b_flags = 0;
1496 bp->b_dev = NODEV; 1499 bp->b_dev = NODEV;
1497 bp->b_blkno = 0; 1500 bp->b_blkno = 0;
1498 bp->b_lblkno = 0; 1501 bp->b_lblkno = 0;
1499 bp->b_rawblkno = 0; 1502 bp->b_rawblkno = 0;
1500 bp->b_iodone = 0; 1503 bp->b_iodone = 0;
1501 bp->b_error = 0; 1504 bp->b_error = 0;
1502 bp->b_resid = 0; 1505 bp->b_resid = 0;
1503 bp->b_bcount = 0; 1506 bp->b_bcount = 0;
1504 1507
1505 LIST_REMOVE(bp, b_hash); 1508 LIST_REMOVE(bp, b_hash);
1506 1509
1507 /* Disassociate us from our vnode, if we had one... */ 1510 /* Disassociate us from our vnode, if we had one... */
1508 if (vp != NULL) { 1511 if (vp != NULL) {
1509 mutex_enter(vp->v_interlock); 1512 mutex_enter(vp->v_interlock);
1510 brelvp(bp); 1513 brelvp(bp);
1511 mutex_exit(vp->v_interlock); 1514 mutex_exit(vp->v_interlock);
1512 } 1515 }
1513 1516
1514 SDT_PROBE1(io, kernel, , getnewbuf__done, bp); 1517 SDT_PROBE1(io, kernel, , getnewbuf__done, bp);
1515 return (bp); 1518 return (bp);
1516} 1519}
1517 1520
1518/* 1521/*
1519 * Invalidate the specified buffer if it exists. 1522 * Invalidate the specified buffer if it exists.
1520 */ 1523 */
1521void 1524void
1522binvalbuf(struct vnode *vp, daddr_t blkno) 1525binvalbuf(struct vnode *vp, daddr_t blkno)
1523{ 1526{
1524 buf_t *bp; 1527 buf_t *bp;
1525 int err; 1528 int err;
1526 1529
1527 mutex_enter(&bufcache_lock); 1530 mutex_enter(&bufcache_lock);
1528 1531
1529 loop: 1532 loop:
1530 bp = incore(vp, blkno); 1533 bp = incore(vp, blkno);
1531 if (bp != NULL) { 1534 if (bp != NULL) {
1532 err = bbusy(bp, 0, 0, NULL); 1535 err = bbusy(bp, 0, 0, NULL);
1533 if (err == EPASSTHROUGH) 1536 if (err == EPASSTHROUGH)
1534 goto loop; 1537 goto loop;
1535 bremfree(bp); 1538 bremfree(bp);
1536 if (ISSET(bp->b_oflags, BO_DELWRI)) { 1539 if (ISSET(bp->b_oflags, BO_DELWRI)) {
1537 SET(bp->b_cflags, BC_NOCACHE); 1540 SET(bp->b_cflags, BC_NOCACHE);
1538 mutex_exit(&bufcache_lock); 1541 mutex_exit(&bufcache_lock);
1539 bwrite(bp); 1542 bwrite(bp);
1540 } else { 1543 } else {
1541 brelsel(bp, BC_INVAL); 1544 brelsel(bp, BC_INVAL);
1542 mutex_exit(&bufcache_lock); 1545 mutex_exit(&bufcache_lock);
1543 } 1546 }
1544 } else 1547 } else
1545 mutex_exit(&bufcache_lock); 1548 mutex_exit(&bufcache_lock);
1546} 1549}
1547 1550
1548/* 1551/*
1549 * Attempt to free an aged buffer off the queues. 1552 * Attempt to free an aged buffer off the queues.
1550 * Called with queue lock held. 1553 * Called with queue lock held.
1551 * Returns the amount of buffer memory freed. 1554 * Returns the amount of buffer memory freed.
1552 */ 1555 */
1553static int 1556static int
1554buf_trim(void) 1557buf_trim(void)
1555{ 1558{
1556 buf_t *bp; 1559 buf_t *bp;
1557 long size; 1560 long size;
1558 1561
1559 KASSERT(mutex_owned(&bufcache_lock)); 1562 KASSERT(mutex_owned(&bufcache_lock));
1560 1563
1561 /* Instruct getnewbuf() to get buffers off the queues */ 1564 /* Instruct getnewbuf() to get buffers off the queues */
1562 if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL) 1565 if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL)
1563 return 0; 1566 return 0;
1564 1567
1565 KASSERT((bp->b_cflags & BC_WANTED) == 0); 1568 KASSERT((bp->b_cflags & BC_WANTED) == 0);
1566 size = bp->b_bufsize; 1569 size = bp->b_bufsize;
1567 bufmem -= size; 1570 bufmem -= size;
1568 if (size > 0) { 1571 if (size > 0) {
1569 buf_mrelease(bp->b_data, size); 1572 buf_mrelease(bp->b_data, size);
1570 bp->b_bcount = bp->b_bufsize = 0; 1573 bp->b_bcount = bp->b_bufsize = 0;
1571 } 1574 }
1572 /* brelse() will return the buffer to the global buffer pool */ 1575 /* brelse() will return the buffer to the global buffer pool */
1573 brelsel(bp, 0); 1576 brelsel(bp, 0);
1574 return size; 1577 return size;
1575} 1578}
1576 1579
1577int 1580int
1578buf_drain(int n) 1581buf_drain(int n)
1579{ 1582{
1580 int size = 0, sz; 1583 int size = 0, sz;
1581 1584
1582 KASSERT(mutex_owned(&bufcache_lock)); 1585 KASSERT(mutex_owned(&bufcache_lock));
1583 1586
1584 while (size < n && bufmem > bufmem_lowater) { 1587 while (size < n && bufmem > bufmem_lowater) {
1585 sz = buf_trim(); 1588 sz = buf_trim();
1586 if (sz <= 0) 1589 if (sz <= 0)
1587 break; 1590 break;
1588 size += sz; 1591 size += sz;
1589 } 1592 }
1590 1593
1591 return size; 1594 return size;
1592} 1595}
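[Editor's aside: buf_drain() simply keeps calling buf_trim() until it has freed the requested number of bytes, bufmem drops to the low-water mark, or a trim pass frees nothing. The helper below sketches that loop shape only; the function and parameter names are illustrative.]

static long
toy_drain(long wanted, long low_water, long *pool_bytes,
    long (*reclaim_one)(void))
{
	long freed = 0, got;

	while (freed < wanted && *pool_bytes > low_water) {
		if ((got = reclaim_one()) <= 0)
			break;			/* nothing reclaimable right now */
		*pool_bytes -= got;
		freed += got;
	}
	return freed;
}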
1593 1596
1594/* 1597/*
1595 * Wait for operations on the buffer to complete. 1598 * Wait for operations on the buffer to complete.
1596 * When they do, extract and return the I/O's error value. 1599 * When they do, extract and return the I/O's error value.
1597 */ 1600 */
1598int 1601int
1599biowait(buf_t *bp) 1602biowait(buf_t *bp)
1600{ 1603{
1601 1604
1602 BIOHIST_FUNC(__func__); 1605 BIOHIST_FUNC(__func__);
1603 1606
1604 KASSERT(ISSET(bp->b_cflags, BC_BUSY)); 1607 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1605 1608
1606 SDT_PROBE1(io, kernel, , wait__start, bp); 1609 SDT_PROBE1(io, kernel, , wait__start, bp);
1607 1610
1608 mutex_enter(bp->b_objlock); 1611 mutex_enter(bp->b_objlock);
1609 1612
1610 BIOHIST_CALLARGS(biohist, "bp=%#jx, oflags=0x%jx, ret_addr=%#jx", 1613 BIOHIST_CALLARGS(biohist, "bp=%#jx, oflags=0x%jx, ret_addr=%#jx",
1611 (uintptr_t)bp, bp->b_oflags,  1614 (uintptr_t)bp, bp->b_oflags,
1612 (uintptr_t)__builtin_return_address(0), 0); 1615 (uintptr_t)__builtin_return_address(0), 0);
1613 1616
1614 while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI)) { 1617 while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI)) {
1615 BIOHIST_LOG(biohist, "waiting bp=%#jx", (uintptr_t)bp, 0, 0, 0); 1618 BIOHIST_LOG(biohist, "waiting bp=%#jx", (uintptr_t)bp, 0, 0, 0);
1616 cv_wait(&bp->b_done, bp->b_objlock); 1619 cv_wait(&bp->b_done, bp->b_objlock);
1617 } 1620 }
1618 mutex_exit(bp->b_objlock); 1621 mutex_exit(bp->b_objlock);
1619 1622
1620 SDT_PROBE1(io, kernel, , wait__done, bp); 1623 SDT_PROBE1(io, kernel, , wait__done, bp);
1621 1624
1622 BIOHIST_LOG(biohist, "return %jd", bp->b_error, 0, 0, 0); 1625 BIOHIST_LOG(biohist, "return %jd", bp->b_error, 0, 0, 0);
1623 1626
1624 return bp->b_error; 1627 return bp->b_error;
1625} 1628}
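[Editor's aside: biowait() is a textbook condition-variable wait: hold the object lock, sleep on b_done until BO_DONE (or BO_DELWRI) is set, then return b_error. Below is a userland sketch of the same handshake with pthreads standing in for b_objlock/b_done; the struct must be initialized with the usual pthread initializers before use, and all names are illustrative.]

#include <pthread.h>

struct toyio {
	pthread_mutex_t lock;		/* stands in for b_objlock */
	pthread_cond_t done_cv;		/* stands in for b_done */
	int done;			/* analogue of BO_DONE */
	int error;			/* analogue of b_error */
};

/* Caller side: block until the completion path signals us, return error. */
int
toy_iowait(struct toyio *io)
{
	int error;

	pthread_mutex_lock(&io->lock);
	while (!io->done)
		pthread_cond_wait(&io->done_cv, &io->lock);
	error = io->error;
	pthread_mutex_unlock(&io->lock);
	return error;
}

/* Completion side: the moral equivalent of biodone2()'s wake-up branch. */
void
toy_iodone(struct toyio *io, int error)
{

	pthread_mutex_lock(&io->lock);
	io->error = error;
	io->done = 1;
	pthread_cond_broadcast(&io->done_cv);
	pthread_mutex_unlock(&io->lock);
}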
1626 1629
1627/* 1630/*
1628 * Mark I/O complete on a buffer. 1631 * Mark I/O complete on a buffer.
1629 * 1632 *
1630 * If a callback has been requested, e.g. the pageout 1633 * If a callback has been requested, e.g. the pageout
1631 * daemon, do so. Otherwise, awaken waiting processes. 1634 * daemon, do so. Otherwise, awaken waiting processes.
1632 * 1635 *
1633 * [ Leffler, et al., says on p.247: 1636 * [ Leffler, et al., says on p.247:
1634 * "This routine wakes up the blocked process, frees the buffer 1637 * "This routine wakes up the blocked process, frees the buffer
1635 * for an asynchronous write, or, for a request by the pagedaemon 1638 * for an asynchronous write, or, for a request by the pagedaemon
1636 * process, invokes a procedure specified in the buffer structure" ] 1639 * process, invokes a procedure specified in the buffer structure" ]
1637 * 1640 *
1638 * In real life, the pagedaemon (or other system processes) wants 1641 * In real life, the pagedaemon (or other system processes) wants
1639 * to do async stuff too, and doesn't want the buffer brelse()'d. 1642 * to do async stuff too, and doesn't want the buffer brelse()'d.
1640 * (for swap pager, that puts swap buffers on the free lists (!!!), 1643 * (for swap pager, that puts swap buffers on the free lists (!!!),
1641 * for the vn device, that puts allocated buffers on the free lists!) 1644 * for the vn device, that puts allocated buffers on the free lists!)
1642 */ 1645 */
1643void 1646void
1644biodone(buf_t *bp) 1647biodone(buf_t *bp)
1645{ 1648{
1646 int s; 1649 int s;
1647 1650
1648 BIOHIST_FUNC(__func__); 1651 BIOHIST_FUNC(__func__);
1649 1652
1650 KASSERT(!ISSET(bp->b_oflags, BO_DONE)); 1653 KASSERT(!ISSET(bp->b_oflags, BO_DONE));
1651 1654
1652 if (cpu_intr_p()) { 1655 if (cpu_intr_p()) {
1653 /* From interrupt mode: defer to a soft interrupt. */ 1656 /* From interrupt mode: defer to a soft interrupt. */
1654 s = splvm(); 1657 s = splvm();
1655 TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq); 1658 TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq);
1656 1659
1657 BIOHIST_CALLARGS(biohist, "bp=%#jx, softint scheduled", 1660 BIOHIST_CALLARGS(biohist, "bp=%#jx, softint scheduled",
1658 (uintptr_t)bp, 0, 0, 0); 1661 (uintptr_t)bp, 0, 0, 0);
1659 softint_schedule(biodone_sih); 1662 softint_schedule(biodone_sih);
1660 splx(s); 1663 splx(s);
1661 } else { 1664 } else {
1662 /* Process now - the buffer may be freed soon. */ 1665 /* Process now - the buffer may be freed soon. */
1663 biodone2(bp); 1666 biodone2(bp);
1664 } 1667 }
1665} 1668}
1666 1669
1667SDT_PROBE_DEFINE1(io, kernel, , done, "struct buf *"/*bp*/); 1670SDT_PROBE_DEFINE1(io, kernel, , done, "struct buf *"/*bp*/);
1668 1671
1669static void 1672static void
1670biodone2(buf_t *bp) 1673biodone2(buf_t *bp)
1671{ 1674{
1672 void (*callout)(buf_t *); 1675 void (*callout)(buf_t *);
1673 1676
1674 SDT_PROBE1(io, kernel, ,done, bp); 1677 SDT_PROBE1(io, kernel, ,done, bp);
1675 1678
1676 BIOHIST_FUNC(__func__); 1679 BIOHIST_FUNC(__func__);
1677 BIOHIST_CALLARGS(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0); 1680 BIOHIST_CALLARGS(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0);
1678 1681
1679 mutex_enter(bp->b_objlock); 1682 mutex_enter(bp->b_objlock);
1680 /* Note that the transfer is done. */ 1683 /* Note that the transfer is done. */
1681 if (ISSET(bp->b_oflags, BO_DONE)) 1684 if (ISSET(bp->b_oflags, BO_DONE))
1682 panic("biodone2 already"); 1685 panic("biodone2 already");
1683 CLR(bp->b_flags, B_COWDONE); 1686 CLR(bp->b_flags, B_COWDONE);
1684 SET(bp->b_oflags, BO_DONE); 1687 SET(bp->b_oflags, BO_DONE);
1685 BIO_SETPRIO(bp, BPRIO_DEFAULT); 1688 BIO_SETPRIO(bp, BPRIO_DEFAULT);
1686 1689
1687 /* Wake up waiting writers. */ 1690 /* Wake up waiting writers. */
1688 if (!ISSET(bp->b_flags, B_READ)) 1691 if (!ISSET(bp->b_flags, B_READ))
1689 vwakeup(bp); 1692 vwakeup(bp);
1690 1693
1691 if ((callout = bp->b_iodone) != NULL) { 1694 if ((callout = bp->b_iodone) != NULL) {
1692 BIOHIST_LOG(biohist, "callout %#jx", (uintptr_t)callout, 1695 BIOHIST_LOG(biohist, "callout %#jx", (uintptr_t)callout,
1693 0, 0, 0); 1696 0, 0, 0);
1694 1697
1695 /* Note callout done, then call out. */ 1698 /* Note callout done, then call out. */
1696 KASSERT(!cv_has_waiters(&bp->b_done)); 1699 KASSERT(!cv_has_waiters(&bp->b_done));
1697 bp->b_iodone = NULL; 1700 bp->b_iodone = NULL;
1698 mutex_exit(bp->b_objlock); 1701 mutex_exit(bp->b_objlock);
1699 (*callout)(bp); 1702 (*callout)(bp);
1700 } else if (ISSET(bp->b_flags, B_ASYNC)) { 1703 } else if (ISSET(bp->b_flags, B_ASYNC)) {
1701 /* If async, release. */ 1704 /* If async, release. */
1702 BIOHIST_LOG(biohist, "async", 0, 0, 0, 0); 1705 BIOHIST_LOG(biohist, "async", 0, 0, 0, 0);
1703 KASSERT(!cv_has_waiters(&bp->b_done)); 1706 KASSERT(!cv_has_waiters(&bp->b_done));
1704 mutex_exit(bp->b_objlock); 1707 mutex_exit(bp->b_objlock);
1705 brelse(bp, 0); 1708 brelse(bp, 0);
1706 } else { 1709 } else {
1707 /* Otherwise just wake up waiters in biowait(). */ 1710 /* Otherwise just wake up waiters in biowait(). */
1708 BIOHIST_LOG(biohist, "wake-up", 0, 0, 0, 0); 1711 BIOHIST_LOG(biohist, "wake-up", 0, 0, 0, 0);
1709 cv_broadcast(&bp->b_done); 1712 cv_broadcast(&bp->b_done);
1710 mutex_exit(bp->b_objlock); 1713 mutex_exit(bp->b_objlock);
1711 } 1714 }
1712} 1715}
1713 1716
1714static void 1717static void
1715biointr(void *cookie) 1718biointr(void *cookie)
1716{ 1719{
1717 struct cpu_info *ci; 1720 struct cpu_info *ci;
1718 buf_t *bp; 1721 buf_t *bp;
1719 int s; 1722 int s;
1720 1723
1721 BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist); 1724 BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
1722 1725
1723 ci = curcpu(); 1726 ci = curcpu();
1724 1727
1725 s = splvm(); 1728 s = splvm();
1726 while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) { 1729 while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) {
1727 KASSERT(curcpu() == ci); 1730 KASSERT(curcpu() == ci);
1728 1731
1729 bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone); 1732 bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone);
1730 TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq); 1733 TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq);
1731 splx(s); 1734 splx(s);
1732 1735
1733 BIOHIST_LOG(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0); 1736 BIOHIST_LOG(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0);
1734 biodone2(bp); 1737 biodone2(bp);
1735 1738
1736 s = splvm(); 1739 s = splvm();
1737 } 1740 }
1738 splx(s); 1741 splx(s);
1739} 1742}
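[Editor's aside: biodone() defers completion work when called from interrupt context: it appends the buffer to a per-CPU queue at splvm() and schedules a soft interrupt, and biointr() later drains that queue, running biodone2() on each entry. The sketch below reduces that defer-then-drain pattern to a single-threaded TAILQ; the per-CPU queue, spl protection and softint scheduling are not modelled.]

#include <sys/queue.h>
#include <stdio.h>

struct done_item {
	TAILQ_ENTRY(done_item) link;
	int id;
};
static TAILQ_HEAD(, done_item) done_q = TAILQ_HEAD_INITIALIZER(done_q);

/* "biodone" from interrupt context: just queue the buffer, no heavy work. */
static void
post_done(struct done_item *it)
{

	TAILQ_INSERT_TAIL(&done_q, it, link);
}

/* "biointr": the soft-interrupt handler drains everything queued so far. */
static void
drain_done(void)
{
	struct done_item *it;

	while ((it = TAILQ_FIRST(&done_q)) != NULL) {
		TAILQ_REMOVE(&done_q, it, link);
		printf("completing buffer %d\n", it->id);  /* biodone2() here */
	}
}

int
main(void)
{
	struct done_item a = { .id = 1 }, b = { .id = 2 };

	post_done(&a);
	post_done(&b);
	drain_done();
	return 0;
}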
1740 1743
1741static void 1744static void
1742sysctl_fillbuf(const buf_t *i, struct buf_sysctl *o) 1745sysctl_fillbuf(const buf_t *i, struct buf_sysctl *o)
1743{ 1746{
1744 const bool allowaddr = get_expose_address(curproc); 1747 const bool allowaddr = get_expose_address(curproc);
1745 1748
1746 memset(o, 0, sizeof(*o)); 1749 memset(o, 0, sizeof(*o));
1747 1750
1748 o->b_flags = i->b_flags | i->b_cflags | i->b_oflags; 1751 o->b_flags = i->b_flags | i->b_cflags | i->b_oflags;
1749 o->b_error = i->b_error; 1752 o->b_error = i->b_error;
1750 o->b_prio = i->b_prio; 1753 o->b_prio = i->b_prio;
1751 o->b_dev = i->b_dev; 1754 o->b_dev = i->b_dev;
1752 o->b_bufsize = i->b_bufsize; 1755 o->b_bufsize = i->b_bufsize;
1753 o->b_bcount = i->b_bcount; 1756 o->b_bcount = i->b_bcount;
1754 o->b_resid = i->b_resid; 1757 o->b_resid = i->b_resid;
1755 COND_SET_VALUE(o->b_addr, PTRTOUINT64(i->b_data), allowaddr); 1758 COND_SET_VALUE(o->b_addr, PTRTOUINT64(i->b_data), allowaddr);
1756 o->b_blkno = i->b_blkno; 1759 o->b_blkno = i->b_blkno;
1757 o->b_rawblkno = i->b_rawblkno; 1760 o->b_rawblkno = i->b_rawblkno;
1758 COND_SET_VALUE(o->b_iodone, PTRTOUINT64(i->b_iodone), allowaddr); 1761 COND_SET_VALUE(o->b_iodone, PTRTOUINT64(i->b_iodone), allowaddr);
1759 COND_SET_VALUE(o->b_proc, PTRTOUINT64(i->b_proc), allowaddr); 1762 COND_SET_VALUE(o->b_proc, PTRTOUINT64(i->b_proc), allowaddr);
1760 COND_SET_VALUE(o->b_vp, PTRTOUINT64(i->b_vp), allowaddr); 1763 COND_SET_VALUE(o->b_vp, PTRTOUINT64(i->b_vp), allowaddr);
1761 COND_SET_VALUE(o->b_saveaddr, PTRTOUINT64(i->b_saveaddr), allowaddr); 1764 COND_SET_VALUE(o->b_saveaddr, PTRTOUINT64(i->b_saveaddr), allowaddr);
1762 o->b_lblkno = i->b_lblkno; 1765 o->b_lblkno = i->b_lblkno;
1763} 1766}
1764 1767
1765#define KERN_BUFSLOP 20 1768#define KERN_BUFSLOP 20
1766static int 1769static int
1767sysctl_dobuf(SYSCTLFN_ARGS) 1770sysctl_dobuf(SYSCTLFN_ARGS)
1768{ 1771{
1769 buf_t *bp; 1772 buf_t *bp;
1770 struct buf_sysctl bs; 1773 struct buf_sysctl bs;
1771 struct bqueue *bq; 1774 struct bqueue *bq;
1772 char *dp; 1775 char *dp;
1773 u_int i, op, arg; 1776 u_int i, op, arg;
1774 size_t len, needed, elem_size, out_size; 1777 size_t len, needed, elem_size, out_size;
1775 int error, elem_count, retries; 1778 int error, elem_count, retries;
1776 1779
1777 if (namelen == 1 && name[0] == CTL_QUERY) 1780 if (namelen == 1 && name[0] == CTL_QUERY)
1778 return (sysctl_query(SYSCTLFN_CALL(rnode))); 1781 return (sysctl_query(SYSCTLFN_CALL(rnode)));
1779 1782
1780 if (namelen != 4) 1783 if (namelen != 4)
1781 return (EINVAL); 1784 return (EINVAL);
1782 1785
1783 retries = 100; 1786 retries = 100;
1784 retry: 1787 retry:
1785 dp = oldp; 1788 dp = oldp;
1786 len = (oldp != NULL) ? *oldlenp : 0; 1789 len = (oldp != NULL) ? *oldlenp : 0;
1787 op = name[0]; 1790 op = name[0];
1788 arg = name[1]; 1791 arg = name[1];
1789 elem_size = name[2]; 1792 elem_size = name[2];
1790 elem_count = name[3]; 1793 elem_count = name[3];
1791 out_size = MIN(sizeof(bs), elem_size); 1794 out_size = MIN(sizeof(bs), elem_size);
1792 1795
1793 /* 1796 /*
1794 * at the moment, these are just "placeholders" to make the 1797 * at the moment, these are just "placeholders" to make the
1795 * API for retrieving kern.buf data more extensible in the 1798 * API for retrieving kern.buf data more extensible in the
1796 * future. 1799 * future.
1797 * 1800 *
1798 * XXX kern.buf currently has "netbsd32" issues. hopefully 1801 * XXX kern.buf currently has "netbsd32" issues. hopefully
1799 * these will be resolved at a later point. 1802 * these will be resolved at a later point.
1800 */ 1803 */
1801 if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL || 1804 if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL ||
1802 elem_size < 1 || elem_count < 0) 1805 elem_size < 1 || elem_count < 0)
1803 return (EINVAL); 1806 return (EINVAL);
1804 1807
1805 error = 0; 1808 error = 0;
1806 needed = 0; 1809 needed = 0;
1807 sysctl_unlock(); 1810 sysctl_unlock();
1808 mutex_enter(&bufcache_lock); 1811 mutex_enter(&bufcache_lock);
1809 for (i = 0; i < BQUEUES; i++) { 1812 for (i = 0; i < BQUEUES; i++) {
1810 bq = &bufqueues[i]; 1813 bq = &bufqueues[i];
1811 TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) { 1814 TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) {
1812 bq->bq_marker = bp; 1815 bq->bq_marker = bp;
1813 if (len >= elem_size && elem_count > 0) { 1816 if (len >= elem_size && elem_count > 0) {
1814 sysctl_fillbuf(bp, &bs); 1817 sysctl_fillbuf(bp, &bs);
1815 mutex_exit(&bufcache_lock); 1818 mutex_exit(&bufcache_lock);
1816 error = copyout(&bs, dp, out_size); 1819 error = copyout(&bs, dp, out_size);
1817 mutex_enter(&bufcache_lock); 1820 mutex_enter(&bufcache_lock);
1818 if (error) 1821 if (error)
1819 break; 1822 break;
1820 if (bq->bq_marker != bp) { 1823 if (bq->bq_marker != bp) {
1821 /* 1824 /*
1822 * This sysctl node is only for 1825 * This sysctl node is only for
1823 * statistics. Retry; if the 1826 * statistics. Retry; if the
1824 * queue keeps changing, then 1827 * queue keeps changing, then
1825 * bail out. 1828 * bail out.
1826 */ 1829 */
1827 if (retries-- == 0) { 1830 if (retries-- == 0) {
1828 error = EAGAIN; 1831 error = EAGAIN;
1829 break; 1832 break;
1830 } 1833 }
1831 mutex_exit(&bufcache_lock); 1834 mutex_exit(&bufcache_lock);
1832 sysctl_relock(); 1835 sysctl_relock();
1833 goto retry; 1836 goto retry;
1834 } 1837 }
1835 dp += elem_size; 1838 dp += elem_size;
1836 len -= elem_size; 1839 len -= elem_size;
1837 } 1840 }
1838 needed += elem_size; 1841 needed += elem_size;
1839 if (elem_count > 0 && elem_count != INT_MAX) 1842 if (elem_count > 0 && elem_count != INT_MAX)
1840 elem_count--; 1843 elem_count--;
1841 } 1844 }
1842 if (error != 0) 1845 if (error != 0)
1843 break; 1846 break;
1844 } 1847 }
1845 mutex_exit(&bufcache_lock); 1848 mutex_exit(&bufcache_lock);
1846 sysctl_relock(); 1849 sysctl_relock();
1847 1850
1848 *oldlenp = needed; 1851 *oldlenp = needed;
1849 if (oldp == NULL) 1852 if (oldp == NULL)
1850 *oldlenp += KERN_BUFSLOP * sizeof(buf_t); 1853 *oldlenp += KERN_BUFSLOP * sizeof(buf_t);
1851 1854
1852 return (error); 1855 return (error);
1853} 1856}
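[Editor's aside: sysctl_dobuf() has to drop bufcache_lock around every copyout(), so each queue carries a bq_marker; if another thread moved the marker while the lock was down, the whole walk restarts, and after 100 restarts it gives up with EAGAIN. The self-contained sketch below shows only that marker-and-retry shape; it is single-threaded, so the restart branch never actually fires, locking is indicated by comments, and all names are illustrative.]

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;
	int value;
};

static struct node n3 = { NULL, 3 }, n2 = { &n3, 2 }, n1 = { &n2, 1 };
static struct node *head = &n1;
static struct node *marker;		/* plays the role of bq->bq_marker */

static int
emit(const struct node *n)		/* stands in for copyout() */
{

	return printf("%d\n", n->value) < 0 ? EIO : 0;
}

int
walk_with_marker(void)
{
	int retries = 100, error;

retry:
	for (struct node *n = head; n != NULL; n = n->next) {
		marker = n;		/* remember where we stand */
		/* the lock would be dropped here; the list may change */
		error = emit(n);
		/* ... and re-taken here */
		if (error)
			return error;
		if (marker != n) {	/* somebody moved the marker */
			if (retries-- == 0)
				return EAGAIN;
			goto retry;
		}
	}
	return 0;
}

int
main(void)
{

	return walk_with_marker();
}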
1854 1857
1855static int 1858static int
1856sysctl_bufvm_update(SYSCTLFN_ARGS) 1859sysctl_bufvm_update(SYSCTLFN_ARGS)
1857{ 1860{
1858 int error, rv; 1861 int error, rv;
1859 struct sysctlnode node; 1862 struct sysctlnode node;
1860 unsigned int temp_bufcache; 1863 unsigned int temp_bufcache;
1861 unsigned long temp_water; 1864 unsigned long temp_water;
1862 1865
1863 /* Take a copy of the supplied node and its data */ 1866 /* Take a copy of the supplied node and its data */
1864 node = *rnode; 1867 node = *rnode;
1865 if (node.sysctl_data == &bufcache) { 1868 if (node.sysctl_data == &bufcache) {
1866 node.sysctl_data = &temp_bufcache; 1869 node.sysctl_data = &temp_bufcache;
1867 temp_bufcache = *(unsigned int *)rnode->sysctl_data; 1870 temp_bufcache = *(unsigned int *)rnode->sysctl_data;
1868 } else { 1871 } else {
1869 node.sysctl_data = &temp_water; 1872 node.sysctl_data = &temp_water;
1870 temp_water = *(unsigned long *)rnode->sysctl_data; 1873 temp_water = *(unsigned long *)rnode->sysctl_data;
1871 } 1874 }
1872 1875
1873 /* Update the copy */ 1876 /* Update the copy */
1874 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 1877 error = sysctl_lookup(SYSCTLFN_CALL(&node));
1875 if (error || newp == NULL) 1878 if (error || newp == NULL)
1876 return (error); 1879 return (error);
1877 1880
1878 if (rnode->sysctl_data == &bufcache) { 1881 if (rnode->sysctl_data == &bufcache) {
1879 if (temp_bufcache > 100) 1882 if (temp_bufcache > 100)
1880 return (EINVAL); 1883 return (EINVAL);
1881 bufcache = temp_bufcache; 1884 bufcache = temp_bufcache;
1882 buf_setwm(); 1885 buf_setwm();
1883 } else if (rnode->sysctl_data == &bufmem_lowater) { 1886 } else if (rnode->sysctl_data == &bufmem_lowater) {
1884 if (bufmem_hiwater - temp_water < 16) 1887 if (bufmem_hiwater - temp_water < 16)
1885 return (EINVAL); 1888 return (EINVAL);
1886 bufmem_lowater = temp_water; 1889 bufmem_lowater = temp_water;
1887 } else if (rnode->sysctl_data == &bufmem_hiwater) { 1890 } else if (rnode->sysctl_data == &bufmem_hiwater) {
1888 if (temp_water - bufmem_lowater < 16) 1891 if (temp_water - bufmem_lowater < 16)
1889 return (EINVAL); 1892 return (EINVAL);
1890 bufmem_hiwater = temp_water; 1893 bufmem_hiwater = temp_water;
1891 } else 1894 } else
1892 return (EINVAL); 1895 return (EINVAL);
1893 1896
1894 /* Drain until below new high water mark */ 1897 /* Drain until below new high water mark */
1895 sysctl_unlock(); 1898 sysctl_unlock();
1896 mutex_enter(&bufcache_lock); 1899 mutex_enter(&bufcache_lock);
1897 while (bufmem > bufmem_hiwater) { 1900 while (bufmem > bufmem_hiwater) {
1898 rv = buf_drain((bufmem - bufmem_hiwater) / (2 * 1024)); 1901 rv = buf_drain((bufmem - bufmem_hiwater) / (2 * 1024));
1899 if (rv <= 0) 1902 if (rv <= 0)
1900 break; 1903 break;
1901 } 1904 }
1902 mutex_exit(&bufcache_lock); 1905 mutex_exit(&bufcache_lock);
1903 sysctl_relock(); 1906 sysctl_relock();
1904 1907
1905 return 0; 1908 return 0;
1906} 1909}
1907 1910
1908static struct sysctllog *vfsbio_sysctllog; 1911static struct sysctllog *vfsbio_sysctllog;
1909 1912
1910static void 1913static void
1911sysctl_kern_buf_setup(void) 1914sysctl_kern_buf_setup(void)
1912{ 1915{
1913 1916
1914 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, 1917 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1915 CTLFLAG_PERMANENT, 1918 CTLFLAG_PERMANENT,
1916 CTLTYPE_NODE, "buf", 1919 CTLTYPE_NODE, "buf",
1917 SYSCTL_DESCR("Kernel buffer cache information"), 1920 SYSCTL_DESCR("Kernel buffer cache information"),
1918 sysctl_dobuf, 0, NULL, 0, 1921 sysctl_dobuf, 0, NULL, 0,
1919 CTL_KERN, KERN_BUF, CTL_EOL); 1922 CTL_KERN, KERN_BUF, CTL_EOL);
1920} 1923}
1921 1924
1922static void 1925static void
1923sysctl_vm_buf_setup(void) 1926sysctl_vm_buf_setup(void)
1924{ 1927{
1925 1928
1926 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, 1929 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1927 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1930 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1928 CTLTYPE_INT, "bufcache", 1931 CTLTYPE_INT, "bufcache",
1929 SYSCTL_DESCR("Percentage of physical memory to use for " 1932 SYSCTL_DESCR("Percentage of physical memory to use for "
1930 "buffer cache"), 1933 "buffer cache"),
1931 sysctl_bufvm_update, 0, &bufcache, 0, 1934 sysctl_bufvm_update, 0, &bufcache, 0,
1932 CTL_VM, CTL_CREATE, CTL_EOL); 1935 CTL_VM, CTL_CREATE, CTL_EOL);
1933 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, 1936 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1934 CTLFLAG_PERMANENT|CTLFLAG_READONLY, 1937 CTLFLAG_PERMANENT|CTLFLAG_READONLY,
1935 CTLTYPE_LONG, "bufmem", 1938 CTLTYPE_LONG, "bufmem",
1936 SYSCTL_DESCR("Amount of kernel memory used by buffer " 1939 SYSCTL_DESCR("Amount of kernel memory used by buffer "
1937 "cache"), 1940 "cache"),
1938 NULL, 0, &bufmem, 0, 1941 NULL, 0, &bufmem, 0,
1939 CTL_VM, CTL_CREATE, CTL_EOL); 1942 CTL_VM, CTL_CREATE, CTL_EOL);
1940 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, 1943 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1941 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1944 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1942 CTLTYPE_LONG, "bufmem_lowater", 1945 CTLTYPE_LONG, "bufmem_lowater",
1943 SYSCTL_DESCR("Minimum amount of kernel memory to " 1946 SYSCTL_DESCR("Minimum amount of kernel memory to "
1944 "reserve for buffer cache"), 1947 "reserve for buffer cache"),
1945 sysctl_bufvm_update, 0, &bufmem_lowater, 0, 1948 sysctl_bufvm_update, 0, &bufmem_lowater, 0,
1946 CTL_VM, CTL_CREATE, CTL_EOL); 1949 CTL_VM, CTL_CREATE, CTL_EOL);
1947 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, 1950 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1948 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1951 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1949 CTLTYPE_LONG, "bufmem_hiwater", 1952 CTLTYPE_LONG, "bufmem_hiwater",
1950 SYSCTL_DESCR("Maximum amount of kernel memory to use " 1953 SYSCTL_DESCR("Maximum amount of kernel memory to use "
1951 "for buffer cache"), 1954 "for buffer cache"),
1952 sysctl_bufvm_update, 0, &bufmem_hiwater, 0, 1955 sysctl_bufvm_update, 0, &bufmem_hiwater, 0,
1953 CTL_VM, CTL_CREATE, CTL_EOL); 1956 CTL_VM, CTL_CREATE, CTL_EOL);
1954} 1957}
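[Editor's aside: the nodes created above are meant to be read (and, for the water marks, written) from userland with sysctl. A small reader follows, assuming the dynamically created nodes surface as "vm.bufmem" and "vm.bufmem_hiwater"; those MIB names follow the node strings above but are an assumption here, not something this hunk states.]

#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	long bufmem, hiwater;
	size_t len;

	len = sizeof(bufmem);
	if (sysctlbyname("vm.bufmem", &bufmem, &len, NULL, 0) == -1) {
		perror("vm.bufmem");
		return 1;
	}
	len = sizeof(hiwater);
	if (sysctlbyname("vm.bufmem_hiwater", &hiwater, &len, NULL, 0) == -1) {
		perror("vm.bufmem_hiwater");
		return 1;
	}
	printf("buffer cache: %ld of %ld bytes in use\n", bufmem, hiwater);
	return 0;
}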
1955 1958
 1959static int
 1960bufhash_stats(struct hashstat_sysctl *hs, bool fill)
 1961{
 1962 buf_t *bp;
 1963 uint64_t chain;
 1964
 1965 strlcpy(hs->hash_name, "bufhash", sizeof(hs->hash_name));
 1966 strlcpy(hs->hash_desc, "buffer hash", sizeof(hs->hash_desc));
 1967 if (!fill)
 1968 return 0;
 1969
 1970 hs->hash_size = bufhash + 1;
 1971
 1972 for (size_t i = 0; i < hs->hash_size; i++) {
 1973 chain = 0;
 1974
 1975 mutex_enter(&bufcache_lock);
 1976 LIST_FOREACH(bp, &bufhashtbl[i], b_hash) {
 1977 chain++;
 1978 }
 1979 mutex_exit(&bufcache_lock);
 1980
 1981 if (chain > 0) {
 1982 hs->hash_used++;
 1983 hs->hash_items += chain;
 1984 if (chain > hs->hash_maxchain)
 1985 hs->hash_maxchain = chain;
 1986 }
 1987 preempt_point();
 1988 }
 1989
 1990 return 0;
 1991}
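[Editor's aside: bufhash_stats() is the new hashstat collector this revision adds: for each bufhashtbl bucket it counts the chain length under bufcache_lock, accumulating buckets used, total items and the longest chain, with a preemption point per bucket. The standalone sketch below computes the same four statistics for an ordinary userland chained hash table; struct entry and the other names are illustrative, not the kernel types.]

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct entry {
	struct entry *next;
};

struct hstats {
	uint64_t size;		/* number of buckets */
	uint64_t used;		/* buckets with at least one entry */
	uint64_t items;		/* total entries over all chains */
	uint64_t maxchain;	/* longest chain seen */
};

static struct hstats
collect(struct entry **table, size_t nbuckets)
{
	struct hstats hs = { .size = nbuckets };

	for (size_t i = 0; i < nbuckets; i++) {
		uint64_t chain = 0;

		for (struct entry *e = table[i]; e != NULL; e = e->next)
			chain++;
		if (chain > 0) {
			hs.used++;
			hs.items += chain;
			if (chain > hs.maxchain)
				hs.maxchain = chain;
		}
	}
	return hs;
}

int
main(void)
{
	struct entry a = { NULL }, b = { &a }, c = { NULL };
	struct entry *table[4] = { &b, NULL, &c, NULL };
	struct hstats hs = collect(table, 4);

	printf("size %ju used %ju items %ju maxchain %ju\n",
	    (uintmax_t)hs.size, (uintmax_t)hs.used,
	    (uintmax_t)hs.items, (uintmax_t)hs.maxchain);
	return 0;
}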
 1992
1956#ifdef DEBUG 1993#ifdef DEBUG
1957/* 1994/*
1958 * Print out statistics on the current allocation of the buffer pool. 1995 * Print out statistics on the current allocation of the buffer pool.
1959 * Can be enabled to print out on every ``sync'' by setting "syncprt" 1996 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1960 * in vfs_syscalls.c using sysctl. 1997 * in vfs_syscalls.c using sysctl.
1961 */ 1998 */
1962void 1999void
1963vfs_bufstats(void) 2000vfs_bufstats(void)
1964{ 2001{
1965 int i, j, count; 2002 int i, j, count;
1966 buf_t *bp; 2003 buf_t *bp;
1967 struct bqueue *dp; 2004 struct bqueue *dp;
1968 int counts[MAXBSIZE / MIN_PAGE_SIZE + 1]; 2005 int counts[MAXBSIZE / MIN_PAGE_SIZE + 1];
1969 static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" }; 2006 static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" };
1970 2007
1971 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) { 2008 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1972 count = 0; 2009 count = 0;
1973 memset(counts, 0, sizeof(counts)); 2010 memset(counts, 0, sizeof(counts));
1974 TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) { 2011 TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) {
1975 counts[bp->b_bufsize / PAGE_SIZE]++; 2012 counts[bp->b_bufsize / PAGE_SIZE]++;
1976 count++; 2013 count++;
1977 } 2014 }
1978 printf("%s: total-%d", bname[i], count); 2015 printf("%s: total-%d", bname[i], count);
1979 for (j = 0; j <= MAXBSIZE / PAGE_SIZE; j++) 2016 for (j = 0; j <= MAXBSIZE / PAGE_SIZE; j++)
1980 if (counts[j] != 0) 2017 if (counts[j] != 0)
1981 printf(", %d-%d", j * PAGE_SIZE, counts[j]); 2018 printf(", %d-%d", j * PAGE_SIZE, counts[j]);
1982 printf("\n"); 2019 printf("\n");
1983 } 2020 }
1984} 2021}
1985#endif /* DEBUG */ 2022#endif /* DEBUG */
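
[Editor's note] The format of the DEBUG-only report above follows directly from its printf() calls: one line per free queue, a running total, then a "bytes-count" pair for each buffer size present on that queue. With made-up numbers purely for illustration (not real output), a line could read:

	LRU: total-42, 4096-10, 8192-32

i.e. 42 buffers on the LRU queue, of which 10 are 4 KiB and 32 are 8 KiB.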
1986 2023
1987/* ------------------------------ */ 2024/* ------------------------------ */
1988 2025
1989buf_t * 2026buf_t *
1990getiobuf(struct vnode *vp, bool waitok) 2027getiobuf(struct vnode *vp, bool waitok)
1991{ 2028{
1992 buf_t *bp; 2029 buf_t *bp;
1993 2030
1994 bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT)); 2031 bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
1995 if (bp == NULL) 2032 if (bp == NULL)
1996 return bp; 2033 return bp;
1997 2034
1998 buf_init(bp); 2035 buf_init(bp);
1999 2036
2000 if ((bp->b_vp = vp) != NULL) { 2037 if ((bp->b_vp = vp) != NULL) {
2001 bp->b_objlock = vp->v_interlock; 2038 bp->b_objlock = vp->v_interlock;
2002 } else { 2039 } else {
2003 KASSERT(bp->b_objlock == &buffer_lock); 2040 KASSERT(bp->b_objlock == &buffer_lock);
2004 } 2041 }
2005 2042
2006 return bp; 2043 return bp;
2007} 2044}
2008 2045
2009void 2046void
2010putiobuf(buf_t *bp) 2047putiobuf(buf_t *bp)
2011{ 2048{
2012 2049
2013 buf_destroy(bp); 2050 buf_destroy(bp);
2014 pool_cache_put(bufio_cache, bp); 2051 pool_cache_put(bufio_cache, bp);
2015} 2052}
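
[Editor's note] getiobuf() and putiobuf() bracket the lifetime of an I/O buffer that lives outside the buffer cache; the caller fills in the transfer fields itself, much as nestiobuf_setup() below does for the nested case. A hedged sketch of the common synchronous pattern (hypothetical helper written as if it lived in kernel code; VOP_STRATEGY() is assumed to be the dispatch path and no partial-transfer handling is shown):

	static int
	example_sync_read(struct vnode *vp, daddr_t blkno, void *data, int size)
	{
		buf_t *bp = getiobuf(vp, true);	/* PR_WAITOK: never NULL */
		int error;

		bp->b_flags = B_READ;
		bp->b_cflags = BC_BUSY;
		bp->b_data = data;
		bp->b_blkno = blkno;
		bp->b_bcount = bp->b_resid = size;
		bp->b_bufsize = size;

		VOP_STRATEGY(vp, bp);		/* assumed dispatch path */
		error = biowait(bp);
		putiobuf(bp);
		return error;
	}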
2016 2053
2017/* 2054/*
2018 * nestiobuf_iodone: b_iodone callback for nested buffers. 2055 * nestiobuf_iodone: b_iodone callback for nested buffers.
2019 */ 2056 */
2020 2057
2021void 2058void
2022nestiobuf_iodone(buf_t *bp) 2059nestiobuf_iodone(buf_t *bp)
2023{ 2060{
2024 buf_t *mbp = bp->b_private; 2061 buf_t *mbp = bp->b_private;
2025 int error; 2062 int error;
2026 int donebytes; 2063 int donebytes;
2027 2064
2028 KASSERT(bp->b_bcount <= bp->b_bufsize); 2065 KASSERT(bp->b_bcount <= bp->b_bufsize);
2029 KASSERT(mbp != bp); 2066 KASSERT(mbp != bp);
2030 2067
2031 error = bp->b_error; 2068 error = bp->b_error;
2032 if (bp->b_error == 0 && 2069 if (bp->b_error == 0 &&
2033 (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) { 2070 (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) {
2034 /* 2071 /*
2035 * Not all of the data was transferred; raise an error. We have 2072 * Not all of the data was transferred; raise an error. We have
2036 * no way to propagate these conditions to mbp. 2073 * no way to propagate these conditions to mbp.
2037 */ 2074 */
2038 error = EIO; 2075 error = EIO;
2039 } 2076 }
2040 2077
2041 donebytes = bp->b_bufsize; 2078 donebytes = bp->b_bufsize;
2042 2079
2043 putiobuf(bp); 2080 putiobuf(bp);
2044 nestiobuf_done(mbp, donebytes, error); 2081 nestiobuf_done(mbp, donebytes, error);
2045} 2082}
2046 2083
2047/* 2084/*
2048 * nestiobuf_setup: setup a "nested" buffer. 2085 * nestiobuf_setup: setup a "nested" buffer.
2049 * 2086 *
2050 * => 'mbp' is a "master" buffer which is being divided into sub pieces. 2087 * => 'mbp' is a "master" buffer which is being divided into sub pieces.
2051 * => 'bp' should be a buffer allocated by getiobuf. 2088 * => 'bp' should be a buffer allocated by getiobuf.
2052 * => 'offset' is a byte offset in the master buffer. 2089 * => 'offset' is a byte offset in the master buffer.
2053 * => 'size' is a size in bytes of this nested buffer. 2090 * => 'size' is a size in bytes of this nested buffer.
2054 */ 2091 */
2055 2092
2056void 2093void
2057nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size) 2094nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size)
2058{ 2095{
2059 const int b_pass = mbp->b_flags & (B_READ|B_PHYS|B_RAW|B_MEDIA_FLAGS); 2096 const int b_pass = mbp->b_flags & (B_READ|B_PHYS|B_RAW|B_MEDIA_FLAGS);
2060 struct vnode *vp = mbp->b_vp; 2097 struct vnode *vp = mbp->b_vp;
2061 2098
2062 KASSERT(mbp->b_bcount >= offset + size); 2099 KASSERT(mbp->b_bcount >= offset + size);
2063 bp->b_vp = vp; 2100 bp->b_vp = vp;
2064 bp->b_dev = mbp->b_dev; 2101 bp->b_dev = mbp->b_dev;
2065 bp->b_objlock = mbp->b_objlock; 2102 bp->b_objlock = mbp->b_objlock;
2066 bp->b_cflags = BC_BUSY; 2103 bp->b_cflags = BC_BUSY;
2067 bp->b_flags = B_ASYNC | b_pass; 2104 bp->b_flags = B_ASYNC | b_pass;
2068 bp->b_iodone = nestiobuf_iodone; 2105 bp->b_iodone = nestiobuf_iodone;
2069 bp->b_data = (char *)mbp->b_data + offset; 2106 bp->b_data = (char *)mbp->b_data + offset;
2070 bp->b_resid = bp->b_bcount = size; 2107 bp->b_resid = bp->b_bcount = size;
2071 bp->b_bufsize = bp->b_bcount; 2108 bp->b_bufsize = bp->b_bcount;
2072 bp->b_private = mbp; 2109 bp->b_private = mbp;
2073 BIO_COPYPRIO(bp, mbp); 2110 BIO_COPYPRIO(bp, mbp);
2074 if (BUF_ISWRITE(bp) && vp != NULL) { 2111 if (BUF_ISWRITE(bp) && vp != NULL) {
2075 mutex_enter(vp->v_interlock); 2112 mutex_enter(vp->v_interlock);
2076 vp->v_numoutput++; 2113 vp->v_numoutput++;
2077 mutex_exit(vp->v_interlock); 2114 mutex_exit(vp->v_interlock);
2078 } 2115 }
2079} 2116}
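
[Editor's note] Taken together, nestiobuf_setup() and nestiobuf_iodone()/nestiobuf_done() scatter one logical transfer across several child buffers and gather the completions back onto the master. A hedged sketch of a caller (hypothetical helper; it assumes the master buffer is already BC_BUSY with b_resid primed to b_bcount, that the child sizes sum to b_bcount, and that VOP_STRATEGY() dispatches the children):

	static void
	example_split_io(buf_t *mbp, int maxsz)
	{
		for (int offset = 0; offset < mbp->b_bcount; offset += maxsz) {
			int sz = MIN(maxsz, mbp->b_bcount - offset);
			buf_t *bp = getiobuf(mbp->b_vp, true);

			/* child inherits flags, objlock and data window */
			nestiobuf_setup(mbp, bp, offset, sz);
			bp->b_blkno = mbp->b_blkno + btodb(offset);
			VOP_STRATEGY(mbp->b_vp, bp);
		}
	}

Each child completes through nestiobuf_iodone(), which frees the child and credits its b_bufsize against the master via nestiobuf_done().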
2080 2117
2081/* 2118/*
2082 * nestiobuf_done: propagate completion to the master buffer. 2119 * nestiobuf_done: propagate completion to the master buffer.
2083 * 2120 *
2084 * => 'donebytes' specifies how many bytes of 'mbp' have been completed. 2121 * => 'donebytes' specifies how many bytes of 'mbp' have been completed.
2085 * => 'error' is the errno(2) with which those bytes completed. 2122 * => 'error' is the errno(2) with which those bytes completed.
2086 */ 2123 */
2087 2124
2088void 2125void
2089nestiobuf_done(buf_t *mbp, int donebytes, int error) 2126nestiobuf_done(buf_t *mbp, int donebytes, int error)
2090{ 2127{
2091 2128
2092 if (donebytes == 0) { 2129 if (donebytes == 0) {
2093 return; 2130 return;
2094 } 2131 }
2095 mutex_enter(mbp->b_objlock); 2132 mutex_enter(mbp->b_objlock);
2096 KASSERT(mbp->b_resid >= donebytes); 2133 KASSERT(mbp->b_resid >= donebytes);
2097 mbp->b_resid -= donebytes; 2134 mbp->b_resid -= donebytes;
2098 if (error) 2135 if (error)
2099 mbp->b_error = error; 2136 mbp->b_error = error;
2100 if (mbp->b_resid == 0) { 2137 if (mbp->b_resid == 0) {
2101 if (mbp->b_error) 2138 if (mbp->b_error)
2102 mbp->b_resid = mbp->b_bcount; 2139 mbp->b_resid = mbp->b_bcount;
2103 mutex_exit(mbp->b_objlock); 2140 mutex_exit(mbp->b_objlock);
2104 biodone(mbp); 2141 biodone(mbp);
2105 } else 2142 } else
2106 mutex_exit(mbp->b_objlock); 2143 mutex_exit(mbp->b_objlock);
2107} 2144}
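
[Editor's note] As a worked example of the accounting above: a 64 KiB master split into four 16 KiB children starts with mbp->b_resid == 65536; each successful nestiobuf_done(mbp, 16384, 0) subtracts 16384, and the fourth call drives b_resid to 0 and issues biodone(mbp). If any child reported an error, that final call still ends in biodone(mbp), but first resets b_resid to b_bcount so the master reads as not transferred.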
2108 2145
2109void 2146void
2110buf_init(buf_t *bp) 2147buf_init(buf_t *bp)
2111{ 2148{
2112 2149
2113 cv_init(&bp->b_busy, "biolock"); 2150 cv_init(&bp->b_busy, "biolock");
2114 cv_init(&bp->b_done, "biowait"); 2151 cv_init(&bp->b_done, "biowait");
2115 bp->b_dev = NODEV; 2152 bp->b_dev = NODEV;
2116 bp->b_error = 0; 2153 bp->b_error = 0;
2117 bp->b_flags = 0; 2154 bp->b_flags = 0;
2118 bp->b_cflags = 0; 2155 bp->b_cflags = 0;
2119 bp->b_oflags = 0; 2156 bp->b_oflags = 0;
2120 bp->b_objlock = &buffer_lock; 2157 bp->b_objlock = &buffer_lock;
2121 bp->b_iodone = NULL; 2158 bp->b_iodone = NULL;
2122 bp->b_dev = NODEV; 2159 bp->b_dev = NODEV;
2123 bp->b_vnbufs.le_next = NOLIST; 2160 bp->b_vnbufs.le_next = NOLIST;
2124 BIO_SETPRIO(bp, BPRIO_DEFAULT); 2161 BIO_SETPRIO(bp, BPRIO_DEFAULT);
2125} 2162}
2126 2163
2127void 2164void
2128buf_destroy(buf_t *bp) 2165buf_destroy(buf_t *bp)
2129{ 2166{
2130 2167
2131 cv_destroy(&bp->b_done); 2168 cv_destroy(&bp->b_done);
2132 cv_destroy(&bp->b_busy); 2169 cv_destroy(&bp->b_busy);
2133} 2170}
2134 2171
2135int 2172int
2136bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock) 2173bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock)
2137{ 2174{
2138 int error; 2175 int error;
2139 2176
2140 KASSERT(mutex_owned(&bufcache_lock)); 2177 KASSERT(mutex_owned(&bufcache_lock));
2141 2178
2142 SDT_PROBE4(io, kernel, , bbusy__start, bp, intr, timo, interlock); 2179 SDT_PROBE4(io, kernel, , bbusy__start, bp, intr, timo, interlock);
2143 2180
2144 if ((bp->b_cflags & BC_BUSY) != 0) { 2181 if ((bp->b_cflags & BC_BUSY) != 0) {
2145 if (curlwp == uvm.pagedaemon_lwp) { 2182 if (curlwp == uvm.pagedaemon_lwp) {
2146 error = EDEADLK; 2183 error = EDEADLK;
2147 goto out; 2184 goto out;
2148 } 2185 }
2149 bp->b_cflags |= BC_WANTED; 2186 bp->b_cflags |= BC_WANTED;
2150 if (interlock != NULL) 2187 if (interlock != NULL)
2151 mutex_exit(interlock); 2188 mutex_exit(interlock);
2152 if (intr) { 2189 if (intr) {
2153 error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock, 2190 error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock,
2154 timo); 2191 timo);
2155 } else { 2192 } else {
2156 error = cv_timedwait(&bp->b_busy, &bufcache_lock, 2193 error = cv_timedwait(&bp->b_busy, &bufcache_lock,
2157 timo); 2194 timo);
2158 } 2195 }
2159 /* 2196 /*
2160 * At this point the buffer may be gone: don't touch it 2197 * At this point the buffer may be gone: don't touch it
2161 * again. The caller needs to find it again and retry. 2198 * again. The caller needs to find it again and retry.
2162 */ 2199 */
2163 if (interlock != NULL) 2200 if (interlock != NULL)
2164 mutex_enter(interlock); 2201 mutex_enter(interlock);
2165 if (error == 0) 2202 if (error == 0)
2166 error = EPASSTHROUGH; 2203 error = EPASSTHROUGH;
2167 } else { 2204 } else {
2168 bp->b_cflags |= BC_BUSY; 2205 bp->b_cflags |= BC_BUSY;
2169 error = 0; 2206 error = 0;
2170 } 2207 }
2171 2208
2172out: SDT_PROBE5(io, kernel, , bbusy__done, 2209out: SDT_PROBE5(io, kernel, , bbusy__done,
2173 bp, intr, timo, interlock, error); 2210 bp, intr, timo, interlock, error);
2174 return error; 2211 return error;
2175} 2212}
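
[Editor's note] bbusy() is the primitive for claiming ownership (BC_BUSY) of a cached buffer under bufcache_lock; a return of EPASSTHROUGH means the caller slept and the buffer may have been recycled, so it must be looked up again, which is the pattern getblk() follows in this file. A hedged sketch of that retry loop (hypothetical helper; example_find_buf() stands in for whatever lookup the real caller performs):

	static buf_t *
	example_acquire(struct vnode *vp, daddr_t blkno)
	{
		buf_t *bp;
		int error;

		mutex_enter(&bufcache_lock);
		for (;;) {
			bp = example_find_buf(vp, blkno); /* hypothetical */
			if (bp == NULL)
				break;		/* no longer cached */
			error = bbusy(bp, false, 0, NULL);
			if (error == 0)
				break;		/* we now own BC_BUSY */
			if (error != EPASSTHROUGH) {
				bp = NULL;	/* EDEADLK, EINTR, timeout */
				break;
			}
			/* slept: bp may be stale, so look it up again */
		}
		mutex_exit(&bufcache_lock);
		return bp;
	}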
2176 2213
2177/* 2214/*
2178 * Nothing outside this file should really need to know about nbuf, 2215 * Nothing outside this file should really need to know about nbuf,
2179 * but a few things still want to read it, so give them a way to do that. 2216 * but a few things still want to read it, so give them a way to do that.
2180 */ 2217 */
2181u_int 2218u_int
2182buf_nbuf(void) 2219buf_nbuf(void)
2183{ 2220{
2184 2221
2185 return nbuf; 2222 return nbuf;
2186} 2223}