| @@ -1,2186 +1,2223 @@ | | | @@ -1,2186 +1,2223 @@ |
1 | /* $NetBSD: vfs_bio.c,v 1.297 2020/07/31 04:07:30 chs Exp $ */ | | 1 | /* $NetBSD: vfs_bio.c,v 1.298 2021/04/01 06:25:59 simonb Exp $ */ |
2 | | | 2 | |
3 | /*- | | 3 | /*- |
4 | * Copyright (c) 2007, 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc. | | 4 | * Copyright (c) 2007, 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc. |
5 | * All rights reserved. | | 5 | * All rights reserved. |
6 | * | | 6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation | | 7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Andrew Doran, and by Wasabi Systems, Inc. | | 8 | * by Andrew Doran, and by Wasabi Systems, Inc. |
9 | * | | 9 | * |
10 | * Redistribution and use in source and binary forms, with or without | | 10 | * Redistribution and use in source and binary forms, with or without |
11 | * modification, are permitted provided that the following conditions | | 11 | * modification, are permitted provided that the following conditions |
12 | * are met: | | 12 | * are met: |
13 | * 1. Redistributions of source code must retain the above copyright | | 13 | * 1. Redistributions of source code must retain the above copyright |
14 | * notice, this list of conditions and the following disclaimer. | | 14 | * notice, this list of conditions and the following disclaimer. |
15 | * 2. Redistributions in binary form must reproduce the above copyright | | 15 | * 2. Redistributions in binary form must reproduce the above copyright |
16 | * notice, this list of conditions and the following disclaimer in the | | 16 | * notice, this list of conditions and the following disclaimer in the |
17 | * documentation and/or other materials provided with the distribution. | | 17 | * documentation and/or other materials provided with the distribution. |
18 | * | | 18 | * |
19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | | 19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | | 20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | | 21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | | 22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | | 23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | | 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | | 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | | 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | | 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | | 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
29 | * POSSIBILITY OF SUCH DAMAGE. | | 29 | * POSSIBILITY OF SUCH DAMAGE. |
30 | */ | | 30 | */ |
31 | | | 31 | |
32 | /*- | | 32 | /*- |
33 | * Copyright (c) 1982, 1986, 1989, 1993 | | 33 | * Copyright (c) 1982, 1986, 1989, 1993 |
34 | * The Regents of the University of California. All rights reserved. | | 34 | * The Regents of the University of California. All rights reserved. |
35 | * (c) UNIX System Laboratories, Inc. | | 35 | * (c) UNIX System Laboratories, Inc. |
36 | * All or some portions of this file are derived from material licensed | | 36 | * All or some portions of this file are derived from material licensed |
37 | * to the University of California by American Telephone and Telegraph | | 37 | * to the University of California by American Telephone and Telegraph |
38 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with | | 38 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with |
39 | * the permission of UNIX System Laboratories, Inc. | | 39 | * the permission of UNIX System Laboratories, Inc. |
40 | * | | 40 | * |
41 | * Redistribution and use in source and binary forms, with or without | | 41 | * Redistribution and use in source and binary forms, with or without |
42 | * modification, are permitted provided that the following conditions | | 42 | * modification, are permitted provided that the following conditions |
43 | * are met: | | 43 | * are met: |
44 | * 1. Redistributions of source code must retain the above copyright | | 44 | * 1. Redistributions of source code must retain the above copyright |
45 | * notice, this list of conditions and the following disclaimer. | | 45 | * notice, this list of conditions and the following disclaimer. |
46 | * 2. Redistributions in binary form must reproduce the above copyright | | 46 | * 2. Redistributions in binary form must reproduce the above copyright |
47 | * notice, this list of conditions and the following disclaimer in the | | 47 | * notice, this list of conditions and the following disclaimer in the |
48 | * documentation and/or other materials provided with the distribution. | | 48 | * documentation and/or other materials provided with the distribution. |
49 | * 3. Neither the name of the University nor the names of its contributors | | 49 | * 3. Neither the name of the University nor the names of its contributors |
50 | * may be used to endorse or promote products derived from this software | | 50 | * may be used to endorse or promote products derived from this software |
51 | * without specific prior written permission. | | 51 | * without specific prior written permission. |
52 | * | | 52 | * |
53 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | | 53 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
54 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | | 54 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
55 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | | 55 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
56 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | | 56 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
57 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | | 57 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
58 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | | 58 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
59 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | | 59 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
60 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | | 60 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
61 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | | 61 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
62 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | | 62 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
63 | * SUCH DAMAGE. | | 63 | * SUCH DAMAGE. |
64 | * | | 64 | * |
65 | * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 | | 65 | * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 |
66 | */ | | 66 | */ |
67 | | | 67 | |
68 | /*- | | 68 | /*- |
69 | * Copyright (c) 1994 Christopher G. Demetriou | | 69 | * Copyright (c) 1994 Christopher G. Demetriou |
70 | * | | 70 | * |
71 | * Redistribution and use in source and binary forms, with or without | | 71 | * Redistribution and use in source and binary forms, with or without |
72 | * modification, are permitted provided that the following conditions | | 72 | * modification, are permitted provided that the following conditions |
73 | * are met: | | 73 | * are met: |
74 | * 1. Redistributions of source code must retain the above copyright | | 74 | * 1. Redistributions of source code must retain the above copyright |
75 | * notice, this list of conditions and the following disclaimer. | | 75 | * notice, this list of conditions and the following disclaimer. |
76 | * 2. Redistributions in binary form must reproduce the above copyright | | 76 | * 2. Redistributions in binary form must reproduce the above copyright |
77 | * notice, this list of conditions and the following disclaimer in the | | 77 | * notice, this list of conditions and the following disclaimer in the |
78 | * documentation and/or other materials provided with the distribution. | | 78 | * documentation and/or other materials provided with the distribution. |
79 | * 3. All advertising materials mentioning features or use of this software | | 79 | * 3. All advertising materials mentioning features or use of this software |
80 | * must display the following acknowledgement: | | 80 | * must display the following acknowledgement: |
81 | * This product includes software developed by the University of | | 81 | * This product includes software developed by the University of |
82 | * California, Berkeley and its contributors. | | 82 | * California, Berkeley and its contributors. |
83 | * 4. Neither the name of the University nor the names of its contributors | | 83 | * 4. Neither the name of the University nor the names of its contributors |
84 | * may be used to endorse or promote products derived from this software | | 84 | * may be used to endorse or promote products derived from this software |
85 | * without specific prior written permission. | | 85 | * without specific prior written permission. |
86 | * | | 86 | * |
87 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | | 87 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
88 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | | 88 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
89 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | | 89 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
90 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | | 90 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
91 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | | 91 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
92 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | | 92 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
93 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | | 93 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
94 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | | 94 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
95 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | | 95 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
96 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | | 96 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
97 | * SUCH DAMAGE. | | 97 | * SUCH DAMAGE. |
98 | * | | 98 | * |
99 | * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 | | 99 | * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 |
100 | */ | | 100 | */ |
101 | | | 101 | |
102 | /* | | 102 | /* |
103 | * The buffer cache subsystem. | | 103 | * The buffer cache subsystem. |
104 | * | | 104 | * |
105 | * Some references: | | 105 | * Some references: |
106 | * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986) | | 106 | * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986) |
107 | * Leffler, et al.: The Design and Implementation of the 4.3BSD | | 107 | * Leffler, et al.: The Design and Implementation of the 4.3BSD |
108 | * UNIX Operating System (Addison-Wesley, 1989) | | 108 | * UNIX Operating System (Addison-Wesley, 1989) |
109 | * | | 109 | * |
110 | * Locking | | 110 | * Locking |
111 | * | | 111 | * |
112 | * There are three locks: | | 112 | * There are three locks: |
113 | * - bufcache_lock: protects global buffer cache state. | | 113 | * - bufcache_lock: protects global buffer cache state. |
114 | * - BC_BUSY: a long term per-buffer lock. | | 114 | * - BC_BUSY: a long term per-buffer lock. |
115 | * - buf_t::b_objlock: lock on completion (biowait vs biodone). | | 115 | * - buf_t::b_objlock: lock on completion (biowait vs biodone). |
116 | * | | 116 | * |
117 | * For buffers associated with vnodes (the most common case) b_objlock points | | 117 | * For buffers associated with vnodes (the most common case) b_objlock points |
118 | * to the vnode_t::v_interlock. Otherwise, it points to the generic buffer_lock. | | 118 | * to the vnode_t::v_interlock. Otherwise, it points to the generic buffer_lock. |
119 | * | | 119 | * |
120 | * Lock order: | | 120 | * Lock order: |
121 | * bufcache_lock -> | | 121 | * bufcache_lock -> |
122 | * buf_t::b_objlock | | 122 | * buf_t::b_objlock |
123 | */ | | 123 | */ |
124 | | | 124 | |
125 | #include <sys/cdefs.h> | | 125 | #include <sys/cdefs.h> |
126 | __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.297 2020/07/31 04:07:30 chs Exp $"); | | 126 | __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.298 2021/04/01 06:25:59 simonb Exp $"); |
127 | | | 127 | |
128 | #ifdef _KERNEL_OPT | | 128 | #ifdef _KERNEL_OPT |
129 | #include "opt_bufcache.h" | | 129 | #include "opt_bufcache.h" |
130 | #include "opt_dtrace.h" | | 130 | #include "opt_dtrace.h" |
131 | #include "opt_biohist.h" | | 131 | #include "opt_biohist.h" |
132 | #endif | | 132 | #endif |
133 | | | 133 | |
134 | #include <sys/param.h> | | 134 | #include <sys/param.h> |
135 | #include <sys/systm.h> | | 135 | #include <sys/systm.h> |
136 | #include <sys/kernel.h> | | 136 | #include <sys/kernel.h> |
137 | #include <sys/proc.h> | | 137 | #include <sys/proc.h> |
138 | #include <sys/buf.h> | | 138 | #include <sys/buf.h> |
139 | #include <sys/vnode.h> | | 139 | #include <sys/vnode.h> |
140 | #include <sys/mount.h> | | 140 | #include <sys/mount.h> |
141 | #include <sys/resourcevar.h> | | 141 | #include <sys/resourcevar.h> |
142 | #include <sys/sysctl.h> | | 142 | #include <sys/sysctl.h> |
143 | #include <sys/conf.h> | | 143 | #include <sys/conf.h> |
144 | #include <sys/kauth.h> | | 144 | #include <sys/kauth.h> |
145 | #include <sys/fstrans.h> | | 145 | #include <sys/fstrans.h> |
146 | #include <sys/intr.h> | | 146 | #include <sys/intr.h> |
147 | #include <sys/cpu.h> | | 147 | #include <sys/cpu.h> |
148 | #include <sys/wapbl.h> | | 148 | #include <sys/wapbl.h> |
149 | #include <sys/bitops.h> | | 149 | #include <sys/bitops.h> |
150 | #include <sys/cprng.h> | | 150 | #include <sys/cprng.h> |
151 | #include <sys/sdt.h> | | 151 | #include <sys/sdt.h> |
152 | | | 152 | |
153 | #include <uvm/uvm.h> /* extern struct uvm uvm */ | | 153 | #include <uvm/uvm.h> /* extern struct uvm uvm */ |
154 | | | 154 | |
155 | #include <miscfs/specfs/specdev.h> | | 155 | #include <miscfs/specfs/specdev.h> |
156 | | | 156 | |
157 | SDT_PROVIDER_DEFINE(io); | | 157 | SDT_PROVIDER_DEFINE(io); |
158 | | | 158 | |
159 | SDT_PROBE_DEFINE4(io, kernel, , bbusy__start, | | 159 | SDT_PROBE_DEFINE4(io, kernel, , bbusy__start, |
160 | "struct buf *"/*bp*/, | | 160 | "struct buf *"/*bp*/, |
161 | "bool"/*intr*/, "int"/*timo*/, "kmutex_t *"/*interlock*/); | | 161 | "bool"/*intr*/, "int"/*timo*/, "kmutex_t *"/*interlock*/); |
162 | SDT_PROBE_DEFINE5(io, kernel, , bbusy__done, | | 162 | SDT_PROBE_DEFINE5(io, kernel, , bbusy__done, |
163 | "struct buf *"/*bp*/, | | 163 | "struct buf *"/*bp*/, |
164 | "bool"/*intr*/, | | 164 | "bool"/*intr*/, |
165 | "int"/*timo*/, | | 165 | "int"/*timo*/, |
166 | "kmutex_t *"/*interlock*/, | | 166 | "kmutex_t *"/*interlock*/, |
167 | "int"/*error*/); | | 167 | "int"/*error*/); |
168 | SDT_PROBE_DEFINE0(io, kernel, , getnewbuf__start); | | 168 | SDT_PROBE_DEFINE0(io, kernel, , getnewbuf__start); |
169 | SDT_PROBE_DEFINE1(io, kernel, , getnewbuf__done, "struct buf *"/*bp*/); | | 169 | SDT_PROBE_DEFINE1(io, kernel, , getnewbuf__done, "struct buf *"/*bp*/); |
170 | SDT_PROBE_DEFINE3(io, kernel, , getblk__start, | | 170 | SDT_PROBE_DEFINE3(io, kernel, , getblk__start, |
171 | "struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/); | | 171 | "struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/); |
172 | SDT_PROBE_DEFINE4(io, kernel, , getblk__done, | | 172 | SDT_PROBE_DEFINE4(io, kernel, , getblk__done, |
173 | "struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/, | | 173 | "struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/, |
174 | "struct buf *"/*bp*/); | | 174 | "struct buf *"/*bp*/); |
175 | SDT_PROBE_DEFINE2(io, kernel, , brelse, "struct buf *"/*bp*/, "int"/*set*/); | | 175 | SDT_PROBE_DEFINE2(io, kernel, , brelse, "struct buf *"/*bp*/, "int"/*set*/); |
176 | SDT_PROBE_DEFINE1(io, kernel, , wait__start, "struct buf *"/*bp*/); | | 176 | SDT_PROBE_DEFINE1(io, kernel, , wait__start, "struct buf *"/*bp*/); |
177 | SDT_PROBE_DEFINE1(io, kernel, , wait__done, "struct buf *"/*bp*/); | | 177 | SDT_PROBE_DEFINE1(io, kernel, , wait__done, "struct buf *"/*bp*/); |
178 | | | 178 | |
179 | #ifndef BUFPAGES | | 179 | #ifndef BUFPAGES |
180 | # define BUFPAGES 0 | | 180 | # define BUFPAGES 0 |
181 | #endif | | 181 | #endif |
182 | | | 182 | |
183 | #ifdef BUFCACHE | | 183 | #ifdef BUFCACHE |
184 | # if (BUFCACHE < 5) || (BUFCACHE > 95) | | 184 | # if (BUFCACHE < 5) || (BUFCACHE > 95) |
185 | # error BUFCACHE is not between 5 and 95 | | 185 | # error BUFCACHE is not between 5 and 95 |
186 | # endif | | 186 | # endif |
187 | #else | | 187 | #else |
188 | # define BUFCACHE 15 | | 188 | # define BUFCACHE 15 |
189 | #endif | | 189 | #endif |
190 | | | 190 | |
191 | u_int nbuf; /* desired number of buffer headers */ | | 191 | u_int nbuf; /* desired number of buffer headers */ |
192 | u_int bufpages = BUFPAGES; /* optional hardwired count */ | | 192 | u_int bufpages = BUFPAGES; /* optional hardwired count */ |
193 | u_int bufcache = BUFCACHE; /* max % of RAM to use for buffer cache */ | | 193 | u_int bufcache = BUFCACHE; /* max % of RAM to use for buffer cache */ |
194 | | | 194 | |
195 | /* | | 195 | /* |
196 | * Definitions for the buffer free lists. | | 196 | * Definitions for the buffer free lists. |
197 | */ | | 197 | */ |
198 | #define BQUEUES 3 /* number of free buffer queues */ | | 198 | #define BQUEUES 3 /* number of free buffer queues */ |
199 | | | 199 | |
200 | #define BQ_LOCKED 0 /* super-blocks &c */ | | 200 | #define BQ_LOCKED 0 /* super-blocks &c */ |
201 | #define BQ_LRU 1 /* lru, useful buffers */ | | 201 | #define BQ_LRU 1 /* lru, useful buffers */ |
202 | #define BQ_AGE 2 /* rubbish */ | | 202 | #define BQ_AGE 2 /* rubbish */ |
203 | | | 203 | |
204 | struct bqueue { | | 204 | struct bqueue { |
205 | TAILQ_HEAD(, buf) bq_queue; | | 205 | TAILQ_HEAD(, buf) bq_queue; |
206 | uint64_t bq_bytes; | | 206 | uint64_t bq_bytes; |
207 | buf_t *bq_marker; | | 207 | buf_t *bq_marker; |
208 | }; | | 208 | }; |
209 | static struct bqueue bufqueues[BQUEUES] __cacheline_aligned; | | 209 | static struct bqueue bufqueues[BQUEUES] __cacheline_aligned; |
210 | | | 210 | |
211 | /* Function prototypes */ | | 211 | /* Function prototypes */ |
212 | static void buf_setwm(void); | | 212 | static void buf_setwm(void); |
213 | static int buf_trim(void); | | 213 | static int buf_trim(void); |
214 | static void *bufpool_page_alloc(struct pool *, int); | | 214 | static void *bufpool_page_alloc(struct pool *, int); |
215 | static void bufpool_page_free(struct pool *, void *); | | 215 | static void bufpool_page_free(struct pool *, void *); |
216 | static buf_t *bio_doread(struct vnode *, daddr_t, int, int); | | 216 | static buf_t *bio_doread(struct vnode *, daddr_t, int, int); |
217 | static buf_t *getnewbuf(int, int, int); | | 217 | static buf_t *getnewbuf(int, int, int); |
218 | static int buf_lotsfree(void); | | 218 | static int buf_lotsfree(void); |
219 | static int buf_canrelease(void); | | 219 | static int buf_canrelease(void); |
220 | static u_long buf_mempoolidx(u_long); | | 220 | static u_long buf_mempoolidx(u_long); |
221 | static u_long buf_roundsize(u_long); | | 221 | static u_long buf_roundsize(u_long); |
222 | static void *buf_alloc(size_t); | | 222 | static void *buf_alloc(size_t); |
223 | static void buf_mrelease(void *, size_t); | | 223 | static void buf_mrelease(void *, size_t); |
224 | static void binsheadfree(buf_t *, struct bqueue *); | | 224 | static void binsheadfree(buf_t *, struct bqueue *); |
225 | static void binstailfree(buf_t *, struct bqueue *); | | 225 | static void binstailfree(buf_t *, struct bqueue *); |
226 | #ifdef DEBUG | | 226 | #ifdef DEBUG |
227 | static int checkfreelist(buf_t *, struct bqueue *, int); | | 227 | static int checkfreelist(buf_t *, struct bqueue *, int); |
228 | #endif | | 228 | #endif |
229 | static void biointr(void *); | | 229 | static void biointr(void *); |
230 | static void biodone2(buf_t *); | | 230 | static void biodone2(buf_t *); |
231 | static void sysctl_kern_buf_setup(void); | | 231 | static void sysctl_kern_buf_setup(void); |
232 | static void sysctl_vm_buf_setup(void); | | 232 | static void sysctl_vm_buf_setup(void); |
233 | | | 233 | |
234 | /* Initialization for biohist */ | | 234 | /* Initialization for biohist */ |
235 | | | 235 | |
236 | #include <sys/biohist.h> | | 236 | #include <sys/biohist.h> |
237 | | | 237 | |
238 | BIOHIST_DEFINE(biohist); | | 238 | BIOHIST_DEFINE(biohist); |
239 | | | 239 | |
240 | void | | 240 | void |
241 | biohist_init(void) | | 241 | biohist_init(void) |
242 | { | | 242 | { |
243 | | | 243 | |
244 | BIOHIST_INIT(biohist, BIOHIST_SIZE); | | 244 | BIOHIST_INIT(biohist, BIOHIST_SIZE); |
245 | } | | 245 | } |
246 | | | 246 | |
247 | /* | | 247 | /* |
248 | * Definitions for the buffer hash lists. | | 248 | * Definitions for the buffer hash lists. |
249 | */ | | 249 | */ |
250 | #define BUFHASH(dvp, lbn) \ | | 250 | #define BUFHASH(dvp, lbn) \ |
251 | (&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash]) | | 251 | (&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash]) |
252 | LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; | | 252 | LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; |
253 | u_long bufhash; | | 253 | u_long bufhash; |
254 | | | 254 | |
| | | 255 | static int bufhash_stats(struct hashstat_sysctl *, bool); |
| | | 256 | |
255 | static kcondvar_t needbuffer_cv; | | 257 | static kcondvar_t needbuffer_cv; |
256 | | | 258 | |
257 | /* | | 259 | /* |
258 | * Buffer queue lock. | | 260 | * Buffer queue lock. |
259 | */ | | 261 | */ |
260 | kmutex_t bufcache_lock __cacheline_aligned; | | 262 | kmutex_t bufcache_lock __cacheline_aligned; |
261 | kmutex_t buffer_lock __cacheline_aligned; | | 263 | kmutex_t buffer_lock __cacheline_aligned; |
262 | | | 264 | |
263 | /* Software ISR for completed transfers. */ | | 265 | /* Software ISR for completed transfers. */ |
264 | static void *biodone_sih; | | 266 | static void *biodone_sih; |
265 | | | 267 | |
266 | /* Buffer pool for I/O buffers. */ | | 268 | /* Buffer pool for I/O buffers. */ |
267 | static pool_cache_t buf_cache; | | 269 | static pool_cache_t buf_cache; |
268 | static pool_cache_t bufio_cache; | | 270 | static pool_cache_t bufio_cache; |
269 | | | 271 | |
270 | #define MEMPOOL_INDEX_OFFSET (ilog2(DEV_BSIZE)) /* smallest pool is 512 bytes */ | | 272 | #define MEMPOOL_INDEX_OFFSET (ilog2(DEV_BSIZE)) /* smallest pool is 512 bytes */ |
271 | #define NMEMPOOLS (ilog2(MAXBSIZE) - MEMPOOL_INDEX_OFFSET + 1) | | 273 | #define NMEMPOOLS (ilog2(MAXBSIZE) - MEMPOOL_INDEX_OFFSET + 1) |
272 | __CTASSERT((1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) == MAXBSIZE); | | 274 | __CTASSERT((1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) == MAXBSIZE); |
273 | | | 275 | |
274 | /* Buffer memory pools */ | | 276 | /* Buffer memory pools */ |
275 | static struct pool bmempools[NMEMPOOLS]; | | 277 | static struct pool bmempools[NMEMPOOLS]; |
276 | | | 278 | |
277 | static struct vm_map *buf_map; | | 279 | static struct vm_map *buf_map; |
278 | | | 280 | |
279 | /* | | 281 | /* |
280 | * Buffer memory pool allocator. | | 282 | * Buffer memory pool allocator. |
281 | */ | | 283 | */ |
282 | static void * | | 284 | static void * |
283 | bufpool_page_alloc(struct pool *pp, int flags) | | 285 | bufpool_page_alloc(struct pool *pp, int flags) |
284 | { | | 286 | { |
285 | | | 287 | |
286 | return (void *)uvm_km_alloc(buf_map, | | 288 | return (void *)uvm_km_alloc(buf_map, |
287 | MAXBSIZE, MAXBSIZE, | | 289 | MAXBSIZE, MAXBSIZE, |
288 | ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT|UVM_KMF_TRYLOCK) | | 290 | ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT|UVM_KMF_TRYLOCK) |
289 | | UVM_KMF_WIRED); | | 291 | | UVM_KMF_WIRED); |
290 | } | | 292 | } |
291 | | | 293 | |
292 | static void | | 294 | static void |
293 | bufpool_page_free(struct pool *pp, void *v) | | 295 | bufpool_page_free(struct pool *pp, void *v) |
294 | { | | 296 | { |
295 | | | 297 | |
296 | uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED); | | 298 | uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED); |
297 | } | | 299 | } |
298 | | | 300 | |
299 | static struct pool_allocator bufmempool_allocator = { | | 301 | static struct pool_allocator bufmempool_allocator = { |
300 | .pa_alloc = bufpool_page_alloc, | | 302 | .pa_alloc = bufpool_page_alloc, |
301 | .pa_free = bufpool_page_free, | | 303 | .pa_free = bufpool_page_free, |
302 | .pa_pagesz = MAXBSIZE, | | 304 | .pa_pagesz = MAXBSIZE, |
303 | }; | | 305 | }; |
304 | | | 306 | |
305 | /* Buffer memory management variables */ | | 307 | /* Buffer memory management variables */ |
306 | u_long bufmem_valimit; | | 308 | u_long bufmem_valimit; |
307 | u_long bufmem_hiwater; | | 309 | u_long bufmem_hiwater; |
308 | u_long bufmem_lowater; | | 310 | u_long bufmem_lowater; |
309 | u_long bufmem; | | 311 | u_long bufmem; |
310 | | | 312 | |
311 | /* | | 313 | /* |
312 | * MD code can call this to set a hard limit on the amount | | 314 | * MD code can call this to set a hard limit on the amount |
313 | * of virtual memory used by the buffer cache. | | 315 | * of virtual memory used by the buffer cache. |
314 | */ | | 316 | */ |
315 | int | | 317 | int |
316 | buf_setvalimit(vsize_t sz) | | 318 | buf_setvalimit(vsize_t sz) |
317 | { | | 319 | { |
318 | | | 320 | |
319 | /* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */ | | 321 | /* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */ |
320 | if (sz < NMEMPOOLS * MAXBSIZE) | | 322 | if (sz < NMEMPOOLS * MAXBSIZE) |
321 | return EINVAL; | | 323 | return EINVAL; |
322 | | | 324 | |
323 | bufmem_valimit = sz; | | 325 | bufmem_valimit = sz; |
324 | return 0; | | 326 | return 0; |
325 | } | | 327 | } |
326 | | | 328 | |
327 | static void | | 329 | static void |
328 | buf_setwm(void) | | 330 | buf_setwm(void) |
329 | { | | 331 | { |
330 | | | 332 | |
331 | bufmem_hiwater = buf_memcalc(); | | 333 | bufmem_hiwater = buf_memcalc(); |
332 | /* lowater is approx. 2% of memory (with bufcache = 15) */ | | 334 | /* lowater is approx. 2% of memory (with bufcache = 15) */ |
333 | #define BUFMEM_WMSHIFT 3 | | 335 | #define BUFMEM_WMSHIFT 3 |
334 | #define BUFMEM_HIWMMIN (64 * 1024 << BUFMEM_WMSHIFT) | | 336 | #define BUFMEM_HIWMMIN (64 * 1024 << BUFMEM_WMSHIFT) |
335 | if (bufmem_hiwater < BUFMEM_HIWMMIN) | | 337 | if (bufmem_hiwater < BUFMEM_HIWMMIN) |
336 | /* Ensure a reasonable minimum value */ | | 338 | /* Ensure a reasonable minimum value */ |
337 | bufmem_hiwater = BUFMEM_HIWMMIN; | | 339 | bufmem_hiwater = BUFMEM_HIWMMIN; |
338 | bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT; | | 340 | bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT; |
339 | } | | 341 | } |
340 | | | 342 | |
341 | #ifdef DEBUG | | 343 | #ifdef DEBUG |
342 | int debug_verify_freelist = 0; | | 344 | int debug_verify_freelist = 0; |
343 | static int | | 345 | static int |
344 | checkfreelist(buf_t *bp, struct bqueue *dp, int ison) | | 346 | checkfreelist(buf_t *bp, struct bqueue *dp, int ison) |
345 | { | | 347 | { |
346 | buf_t *b; | | 348 | buf_t *b; |
347 | | | 349 | |
348 | if (!debug_verify_freelist) | | 350 | if (!debug_verify_freelist) |
349 | return 1; | | 351 | return 1; |
350 | | | 352 | |
351 | TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) { | | 353 | TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) { |
352 | if (b == bp) | | 354 | if (b == bp) |
353 | return ison ? 1 : 0; | | 355 | return ison ? 1 : 0; |
354 | } | | 356 | } |
355 | | | 357 | |
356 | return ison ? 0 : 1; | | 358 | return ison ? 0 : 1; |
357 | } | | 359 | } |
358 | #endif | | 360 | #endif |
359 | | | 361 | |
360 | /* | | 362 | /* |
361 | * Insq/Remq for the buffer free lists. | | 363 | * Insq/Remq for the buffer free lists. |
362 | * Call with buffer queue locked. | | 364 | * Call with buffer queue locked. |
363 | */ | | 365 | */ |
364 | static void | | 366 | static void |
365 | binsheadfree(buf_t *bp, struct bqueue *dp) | | 367 | binsheadfree(buf_t *bp, struct bqueue *dp) |
366 | { | | 368 | { |
367 | | | 369 | |
368 | KASSERT(mutex_owned(&bufcache_lock)); | | 370 | KASSERT(mutex_owned(&bufcache_lock)); |
369 | KASSERT(bp->b_freelistindex == -1); | | 371 | KASSERT(bp->b_freelistindex == -1); |
370 | TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist); | | 372 | TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist); |
371 | dp->bq_bytes += bp->b_bufsize; | | 373 | dp->bq_bytes += bp->b_bufsize; |
372 | bp->b_freelistindex = dp - bufqueues; | | 374 | bp->b_freelistindex = dp - bufqueues; |
373 | } | | 375 | } |
374 | | | 376 | |
375 | static void | | 377 | static void |
376 | binstailfree(buf_t *bp, struct bqueue *dp) | | 378 | binstailfree(buf_t *bp, struct bqueue *dp) |
377 | { | | 379 | { |
378 | | | 380 | |
379 | KASSERT(mutex_owned(&bufcache_lock)); | | 381 | KASSERT(mutex_owned(&bufcache_lock)); |
380 | KASSERTMSG(bp->b_freelistindex == -1, "double free of buffer? " | | 382 | KASSERTMSG(bp->b_freelistindex == -1, "double free of buffer? " |
381 | "bp=%p, b_freelistindex=%d\n", bp, bp->b_freelistindex); | | 383 | "bp=%p, b_freelistindex=%d\n", bp, bp->b_freelistindex); |
382 | TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist); | | 384 | TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist); |
383 | dp->bq_bytes += bp->b_bufsize; | | 385 | dp->bq_bytes += bp->b_bufsize; |
384 | bp->b_freelistindex = dp - bufqueues; | | 386 | bp->b_freelistindex = dp - bufqueues; |
385 | } | | 387 | } |
386 | | | 388 | |
387 | void | | 389 | void |
388 | bremfree(buf_t *bp) | | 390 | bremfree(buf_t *bp) |
389 | { | | 391 | { |
390 | struct bqueue *dp; | | 392 | struct bqueue *dp; |
391 | int bqidx = bp->b_freelistindex; | | 393 | int bqidx = bp->b_freelistindex; |
392 | | | 394 | |
393 | KASSERT(mutex_owned(&bufcache_lock)); | | 395 | KASSERT(mutex_owned(&bufcache_lock)); |
394 | | | 396 | |
395 | KASSERT(bqidx != -1); | | 397 | KASSERT(bqidx != -1); |
396 | dp = &bufqueues[bqidx]; | | 398 | dp = &bufqueues[bqidx]; |
397 | KDASSERT(checkfreelist(bp, dp, 1)); | | 399 | KDASSERT(checkfreelist(bp, dp, 1)); |
398 | KASSERT(dp->bq_bytes >= bp->b_bufsize); | | 400 | KASSERT(dp->bq_bytes >= bp->b_bufsize); |
399 | TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist); | | 401 | TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist); |
400 | dp->bq_bytes -= bp->b_bufsize; | | 402 | dp->bq_bytes -= bp->b_bufsize; |
401 | | | 403 | |
402 | /* For the sysctl helper. */ | | 404 | /* For the sysctl helper. */ |
403 | if (bp == dp->bq_marker) | | 405 | if (bp == dp->bq_marker) |
404 | dp->bq_marker = NULL; | | 406 | dp->bq_marker = NULL; |
405 | | | 407 | |
406 | #if defined(DIAGNOSTIC) | | 408 | #if defined(DIAGNOSTIC) |
407 | bp->b_freelistindex = -1; | | 409 | bp->b_freelistindex = -1; |
408 | #endif /* defined(DIAGNOSTIC) */ | | 410 | #endif /* defined(DIAGNOSTIC) */ |
409 | } | | 411 | } |
410 | | | 412 | |
411 | /* | | 413 | /* |
412 | * note that for some ports this is used by pmap bootstrap code to | | 414 | * note that for some ports this is used by pmap bootstrap code to |
413 | * determine kva size. | | 415 | * determine kva size. |
414 | */ | | 416 | */ |
415 | u_long | | 417 | u_long |
416 | buf_memcalc(void) | | 418 | buf_memcalc(void) |
417 | { | | 419 | { |
418 | u_long n; | | 420 | u_long n; |
419 | vsize_t mapsz = 0; | | 421 | vsize_t mapsz = 0; |
420 | | | 422 | |
421 | /* | | 423 | /* |
422 | * Determine the upper bound of memory to use for buffers. | | 424 | * Determine the upper bound of memory to use for buffers. |
423 | * | | 425 | * |
424 | * - If bufpages is specified, use that as the number | | 426 | * - If bufpages is specified, use that as the number |
425 | * pages. | | 427 | * pages. |
426 | * | | 428 | * |
427 | * - Otherwise, use bufcache as the percentage of | | 429 | * - Otherwise, use bufcache as the percentage of |
428 | * physical memory. | | 430 | * physical memory. |
429 | */ | | 431 | */ |
430 | if (bufpages != 0) { | | 432 | if (bufpages != 0) { |
431 | n = bufpages; | | 433 | n = bufpages; |
432 | } else { | | 434 | } else { |
433 | if (bufcache < 5) { | | 435 | if (bufcache < 5) { |
434 | printf("forcing bufcache %d -> 5", bufcache); | | 436 | printf("forcing bufcache %d -> 5", bufcache); |
435 | bufcache = 5; | | 437 | bufcache = 5; |
436 | } | | 438 | } |
437 | if (bufcache > 95) { | | 439 | if (bufcache > 95) { |
438 | printf("forcing bufcache %d -> 95", bufcache); | | 440 | printf("forcing bufcache %d -> 95", bufcache); |
439 | bufcache = 95; | | 441 | bufcache = 95; |
440 | } | | 442 | } |
441 | if (buf_map != NULL) | | 443 | if (buf_map != NULL) |
442 | mapsz = vm_map_max(buf_map) - vm_map_min(buf_map); | | 444 | mapsz = vm_map_max(buf_map) - vm_map_min(buf_map); |
443 | n = calc_cache_size(mapsz, bufcache, | | 445 | n = calc_cache_size(mapsz, bufcache, |
444 | (buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT) | | 446 | (buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT) |
445 | / PAGE_SIZE; | | 447 | / PAGE_SIZE; |
446 | } | | 448 | } |
447 | | | 449 | |
448 | n <<= PAGE_SHIFT; | | 450 | n <<= PAGE_SHIFT; |
449 | if (bufmem_valimit != 0 && n > bufmem_valimit) | | 451 | if (bufmem_valimit != 0 && n > bufmem_valimit) |
450 | n = bufmem_valimit; | | 452 | n = bufmem_valimit; |
451 | | | 453 | |
452 | return (n); | | 454 | return (n); |
453 | } | | 455 | } |
454 | | | 456 | |
455 | /* | | 457 | /* |
456 | * Initialize buffers and hash links for buffers. | | 458 | * Initialize buffers and hash links for buffers. |
457 | */ | | 459 | */ |
458 | void | | 460 | void |
459 | bufinit(void) | | 461 | bufinit(void) |
460 | { | | 462 | { |
461 | struct bqueue *dp; | | 463 | struct bqueue *dp; |
462 | int use_std; | | 464 | int use_std; |
463 | u_int i; | | 465 | u_int i; |
464 | | | 466 | |
465 | biodone_vfs = biodone; | | 467 | biodone_vfs = biodone; |
466 | | | 468 | |
467 | mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE); | | 469 | mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE); |
468 | mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE); | | 470 | mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE); |
469 | cv_init(&needbuffer_cv, "needbuf"); | | 471 | cv_init(&needbuffer_cv, "needbuf"); |
470 | | | 472 | |
471 | if (bufmem_valimit != 0) { | | 473 | if (bufmem_valimit != 0) { |
472 | vaddr_t minaddr = 0, maxaddr; | | 474 | vaddr_t minaddr = 0, maxaddr; |
473 | buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr, | | 475 | buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr, |
474 | bufmem_valimit, 0, false, 0); | | 476 | bufmem_valimit, 0, false, 0); |
475 | if (buf_map == NULL) | | 477 | if (buf_map == NULL) |
476 | panic("bufinit: cannot allocate submap"); | | 478 | panic("bufinit: cannot allocate submap"); |
477 | } else | | 479 | } else |
478 | buf_map = kernel_map; | | 480 | buf_map = kernel_map; |
479 | | | 481 | |
480 | /* | | 482 | /* |
481 | * Initialize buffer cache memory parameters. | | 483 | * Initialize buffer cache memory parameters. |
482 | */ | | 484 | */ |
483 | bufmem = 0; | | 485 | bufmem = 0; |
484 | buf_setwm(); | | 486 | buf_setwm(); |
485 | | | 487 | |
486 | /* On "small" machines use small pool page sizes where possible */ | | 488 | /* On "small" machines use small pool page sizes where possible */ |
487 | use_std = (physmem < atop(16*1024*1024)); | | 489 | use_std = (physmem < atop(16*1024*1024)); |
488 | | | 490 | |
489 | /* | | 491 | /* |
490 | * Also use them on systems that can map the pool pages using | | 492 | * Also use them on systems that can map the pool pages using |
491 | * a direct-mapped segment. | | 493 | * a direct-mapped segment. |
492 | */ | | 494 | */ |
493 | #ifdef PMAP_MAP_POOLPAGE | | 495 | #ifdef PMAP_MAP_POOLPAGE |
494 | use_std = 1; | | 496 | use_std = 1; |
495 | #endif | | 497 | #endif |
496 | | | 498 | |
497 | buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0, | | 499 | buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0, |
498 | "bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL); | | 500 | "bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL); |
499 | bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0, | | 501 | bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0, |
500 | "biopl", NULL, IPL_BIO, NULL, NULL, NULL); | | 502 | "biopl", NULL, IPL_BIO, NULL, NULL, NULL); |
501 | | | 503 | |
502 | for (i = 0; i < NMEMPOOLS; i++) { | | 504 | for (i = 0; i < NMEMPOOLS; i++) { |
503 | struct pool_allocator *pa; | | 505 | struct pool_allocator *pa; |
504 | struct pool *pp = &bmempools[i]; | | 506 | struct pool *pp = &bmempools[i]; |
505 | u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET); | | 507 | u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET); |
506 | char *name = kmem_alloc(8, KM_SLEEP); /* XXX: never freed */ | | 508 | char *name = kmem_alloc(8, KM_SLEEP); /* XXX: never freed */ |
507 | if (__predict_false(size >= 1048576)) | | 509 | if (__predict_false(size >= 1048576)) |
508 | (void)snprintf(name, 8, "buf%um", size / 1048576); | | 510 | (void)snprintf(name, 8, "buf%um", size / 1048576); |
509 | else if (__predict_true(size >= 1024)) | | 511 | else if (__predict_true(size >= 1024)) |
510 | (void)snprintf(name, 8, "buf%uk", size / 1024); | | 512 | (void)snprintf(name, 8, "buf%uk", size / 1024); |
511 | else | | 513 | else |
512 | (void)snprintf(name, 8, "buf%ub", size); | | 514 | (void)snprintf(name, 8, "buf%ub", size); |
513 | pa = (size <= PAGE_SIZE && use_std) | | 515 | pa = (size <= PAGE_SIZE && use_std) |
514 | ? &pool_allocator_nointr | | 516 | ? &pool_allocator_nointr |
515 | : &bufmempool_allocator; | | 517 | : &bufmempool_allocator; |
516 | pool_init(pp, size, DEV_BSIZE, 0, 0, name, pa, IPL_NONE); | | 518 | pool_init(pp, size, DEV_BSIZE, 0, 0, name, pa, IPL_NONE); |
517 | pool_setlowat(pp, 1); | | 519 | pool_setlowat(pp, 1); |
518 | pool_sethiwat(pp, 1); | | 520 | pool_sethiwat(pp, 1); |
519 | } | | 521 | } |
520 | | | 522 | |
521 | /* Initialize the buffer queues */ | | 523 | /* Initialize the buffer queues */ |
522 | for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) { | | 524 | for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) { |
523 | TAILQ_INIT(&dp->bq_queue); | | 525 | TAILQ_INIT(&dp->bq_queue); |
524 | dp->bq_bytes = 0; | | 526 | dp->bq_bytes = 0; |
525 | } | | 527 | } |
526 | | | 528 | |
527 | /* | | 529 | /* |
528 | * Estimate hash table size based on the amount of memory we | | 530 | * Estimate hash table size based on the amount of memory we |
529 | * intend to use for the buffer cache. The average buffer | | 531 | * intend to use for the buffer cache. The average buffer |
530 | * size is dependent on our clients (i.e. filesystems). | | 532 | * size is dependent on our clients (i.e. filesystems). |
531 | * | | 533 | * |
532 | * For now, use an empirical 3K per buffer. | | 534 | * For now, use an empirical 3K per buffer. |
533 | */ | | 535 | */ |
534 | nbuf = (bufmem_hiwater / 1024) / 3; | | 536 | nbuf = (bufmem_hiwater / 1024) / 3; |
535 | bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash); | | 537 | bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash); |
536 | | | 538 | |
537 | sysctl_kern_buf_setup(); | | 539 | sysctl_kern_buf_setup(); |
538 | sysctl_vm_buf_setup(); | | 540 | sysctl_vm_buf_setup(); |
| | | 541 | hashstat_register("bufhash", bufhash_stats); |
539 | } | | 542 | } |
540 | | | 543 | |
541 | void | | 544 | void |
542 | bufinit2(void) | | 545 | bufinit2(void) |
543 | { | | 546 | { |
544 | | | 547 | |
545 | biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr, | | 548 | biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr, |
546 | NULL); | | 549 | NULL); |
547 | if (biodone_sih == NULL) | | 550 | if (biodone_sih == NULL) |
548 | panic("bufinit2: can't establish soft interrupt"); | | 551 | panic("bufinit2: can't establish soft interrupt"); |
549 | } | | 552 | } |
550 | | | 553 | |
551 | static int | | 554 | static int |
552 | buf_lotsfree(void) | | 555 | buf_lotsfree(void) |
553 | { | | 556 | { |
554 | u_long guess; | | 557 | u_long guess; |
555 | | | 558 | |
556 | /* Always allocate if less than the low water mark. */ | | 559 | /* Always allocate if less than the low water mark. */ |
557 | if (bufmem < bufmem_lowater) | | 560 | if (bufmem < bufmem_lowater) |
558 | return 1; | | 561 | return 1; |
559 | | | 562 | |
560 | /* Never allocate if greater than the high water mark. */ | | 563 | /* Never allocate if greater than the high water mark. */ |
561 | if (bufmem > bufmem_hiwater) | | 564 | if (bufmem > bufmem_hiwater) |
562 | return 0; | | 565 | return 0; |
563 | | | 566 | |
564 | /* If there's anything on the AGE list, it should be eaten. */ | | 567 | /* If there's anything on the AGE list, it should be eaten. */ |
565 | if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL) | | 568 | if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL) |
566 | return 0; | | 569 | return 0; |
567 | | | 570 | |
568 | /* | | 571 | /* |
569 | * The probabily of getting a new allocation is inversely | | 572 | * The probabily of getting a new allocation is inversely |
570 | * proportional to the current size of the cache above | | 573 | * proportional to the current size of the cache above |
571 | * the low water mark. Divide the total first to avoid overflows | | 574 | * the low water mark. Divide the total first to avoid overflows |
572 | * in the product. | | 575 | * in the product. |
573 | */ | | 576 | */ |
574 | guess = cprng_fast32() % 16; | | 577 | guess = cprng_fast32() % 16; |
575 | | | 578 | |
576 | if ((bufmem_hiwater - bufmem_lowater) / 16 * guess >= | | 579 | if ((bufmem_hiwater - bufmem_lowater) / 16 * guess >= |
577 | (bufmem - bufmem_lowater)) | | 580 | (bufmem - bufmem_lowater)) |
578 | return 1; | | 581 | return 1; |
579 | | | 582 | |
580 | /* Otherwise don't allocate. */ | | 583 | /* Otherwise don't allocate. */ |
581 | return 0; | | 584 | return 0; |
582 | } | | 585 | } |
583 | | | 586 | |
584 | /* | | 587 | /* |
585 | * Return estimate of bytes we think need to be | | 588 | * Return estimate of bytes we think need to be |
586 | * released to help resolve low memory conditions. | | 589 | * released to help resolve low memory conditions. |
587 | * | | 590 | * |
588 | * => called with bufcache_lock held. | | 591 | * => called with bufcache_lock held. |
589 | */ | | 592 | */ |
590 | static int | | 593 | static int |
591 | buf_canrelease(void) | | 594 | buf_canrelease(void) |
592 | { | | 595 | { |
593 | int pagedemand, ninvalid = 0; | | 596 | int pagedemand, ninvalid = 0; |
594 | | | 597 | |
595 | KASSERT(mutex_owned(&bufcache_lock)); | | 598 | KASSERT(mutex_owned(&bufcache_lock)); |
596 | | | 599 | |
597 | if (bufmem < bufmem_lowater) | | 600 | if (bufmem < bufmem_lowater) |
598 | return 0; | | 601 | return 0; |
599 | | | 602 | |
600 | if (bufmem > bufmem_hiwater) | | 603 | if (bufmem > bufmem_hiwater) |
601 | return bufmem - bufmem_hiwater; | | 604 | return bufmem - bufmem_hiwater; |
602 | | | 605 | |
603 | ninvalid += bufqueues[BQ_AGE].bq_bytes; | | 606 | ninvalid += bufqueues[BQ_AGE].bq_bytes; |
604 | | | 607 | |
605 | pagedemand = uvmexp.freetarg - uvm_availmem(false); | | 608 | pagedemand = uvmexp.freetarg - uvm_availmem(false); |
606 | if (pagedemand < 0) | | 609 | if (pagedemand < 0) |
607 | return ninvalid; | | 610 | return ninvalid; |
608 | return MAX(ninvalid, MIN(2 * MAXBSIZE, | | 611 | return MAX(ninvalid, MIN(2 * MAXBSIZE, |
609 | MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE))); | | 612 | MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE))); |
610 | } | | 613 | } |
611 | | | 614 | |
612 | /* | | 615 | /* |
613 | * Buffer memory allocation helper functions | | 616 | * Buffer memory allocation helper functions |
614 | */ | | 617 | */ |
615 | static u_long | | 618 | static u_long |
616 | buf_mempoolidx(u_long size) | | 619 | buf_mempoolidx(u_long size) |
617 | { | | 620 | { |
618 | u_int n = 0; | | 621 | u_int n = 0; |
619 | | | 622 | |
620 | size -= 1; | | 623 | size -= 1; |
621 | size >>= MEMPOOL_INDEX_OFFSET; | | 624 | size >>= MEMPOOL_INDEX_OFFSET; |
622 | while (size) { | | 625 | while (size) { |
623 | size >>= 1; | | 626 | size >>= 1; |
624 | n += 1; | | 627 | n += 1; |
625 | } | | 628 | } |
626 | if (n >= NMEMPOOLS) | | 629 | if (n >= NMEMPOOLS) |
627 | panic("buf mem pool index %d", n); | | 630 | panic("buf mem pool index %d", n); |
628 | return n; | | 631 | return n; |
629 | } | | 632 | } |
630 | | | 633 | |
631 | static u_long | | 634 | static u_long |
632 | buf_roundsize(u_long size) | | 635 | buf_roundsize(u_long size) |
633 | { | | 636 | { |
634 | /* Round up to nearest power of 2 */ | | 637 | /* Round up to nearest power of 2 */ |
635 | return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET)); | | 638 | return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET)); |
636 | } | | 639 | } |
637 | | | 640 | |
638 | static void * | | 641 | static void * |
639 | buf_alloc(size_t size) | | 642 | buf_alloc(size_t size) |
640 | { | | 643 | { |
641 | u_int n = buf_mempoolidx(size); | | 644 | u_int n = buf_mempoolidx(size); |
642 | void *addr; | | 645 | void *addr; |
643 | | | 646 | |
644 | while (1) { | | 647 | while (1) { |
645 | addr = pool_get(&bmempools[n], PR_NOWAIT); | | 648 | addr = pool_get(&bmempools[n], PR_NOWAIT); |
646 | if (addr != NULL) | | 649 | if (addr != NULL) |
647 | break; | | 650 | break; |
648 | | | 651 | |
649 | /* No memory, see if we can free some. If so, try again */ | | 652 | /* No memory, see if we can free some. If so, try again */ |
650 | mutex_enter(&bufcache_lock); | | 653 | mutex_enter(&bufcache_lock); |
651 | if (buf_drain(1) > 0) { | | 654 | if (buf_drain(1) > 0) { |
652 | mutex_exit(&bufcache_lock); | | 655 | mutex_exit(&bufcache_lock); |
653 | continue; | | 656 | continue; |
654 | } | | 657 | } |
655 | | | 658 | |
656 | if (curlwp == uvm.pagedaemon_lwp) { | | 659 | if (curlwp == uvm.pagedaemon_lwp) { |
657 | mutex_exit(&bufcache_lock); | | 660 | mutex_exit(&bufcache_lock); |
658 | return NULL; | | 661 | return NULL; |
659 | } | | 662 | } |
660 | | | 663 | |
661 | /* Wait for buffers to arrive on the LRU queue */ | | 664 | /* Wait for buffers to arrive on the LRU queue */ |
662 | cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4); | | 665 | cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4); |
663 | mutex_exit(&bufcache_lock); | | 666 | mutex_exit(&bufcache_lock); |
664 | } | | 667 | } |
665 | | | 668 | |
666 | return addr; | | 669 | return addr; |
667 | } | | 670 | } |
668 | | | 671 | |
669 | static void | | 672 | static void |
670 | buf_mrelease(void *addr, size_t size) | | 673 | buf_mrelease(void *addr, size_t size) |
671 | { | | 674 | { |
672 | | | 675 | |
673 | pool_put(&bmempools[buf_mempoolidx(size)], addr); | | 676 | pool_put(&bmempools[buf_mempoolidx(size)], addr); |
674 | } | | 677 | } |
675 | | | 678 | |
676 | /* | | 679 | /* |
677 | * bread()/breadn() helper. | | 680 | * bread()/breadn() helper. |
678 | */ | | 681 | */ |
679 | static buf_t * | | 682 | static buf_t * |
680 | bio_doread(struct vnode *vp, daddr_t blkno, int size, int async) | | 683 | bio_doread(struct vnode *vp, daddr_t blkno, int size, int async) |
681 | { | | 684 | { |
682 | buf_t *bp; | | 685 | buf_t *bp; |
683 | struct mount *mp; | | 686 | struct mount *mp; |
684 | | | 687 | |
685 | bp = getblk(vp, blkno, size, 0, 0); | | 688 | bp = getblk(vp, blkno, size, 0, 0); |
686 | | | 689 | |
687 | /* | | 690 | /* |
688 | * getblk() may return NULL if we are the pagedaemon. | | 691 | * getblk() may return NULL if we are the pagedaemon. |
689 | */ | | 692 | */ |
690 | if (bp == NULL) { | | 693 | if (bp == NULL) { |
691 | KASSERT(curlwp == uvm.pagedaemon_lwp); | | 694 | KASSERT(curlwp == uvm.pagedaemon_lwp); |
692 | return NULL; | | 695 | return NULL; |
693 | } | | 696 | } |
694 | | | 697 | |
695 | /* | | 698 | /* |
696 | * If buffer does not have data valid, start a read. | | 699 | * If buffer does not have data valid, start a read. |
697 | * Note that if buffer is BC_INVAL, getblk() won't return it. | | 700 | * Note that if buffer is BC_INVAL, getblk() won't return it. |
698 | * Therefore, it's valid if its I/O has completed or been delayed. | | 701 | * Therefore, it's valid if its I/O has completed or been delayed. |
699 | */ | | 702 | */ |
700 | if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) { | | 703 | if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) { |
701 | /* Start I/O for the buffer. */ | | 704 | /* Start I/O for the buffer. */ |
702 | SET(bp->b_flags, B_READ | async); | | 705 | SET(bp->b_flags, B_READ | async); |
703 | if (async) | | 706 | if (async) |
704 | BIO_SETPRIO(bp, BPRIO_TIMELIMITED); | | 707 | BIO_SETPRIO(bp, BPRIO_TIMELIMITED); |
705 | else | | 708 | else |
706 | BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); | | 709 | BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); |
707 | VOP_STRATEGY(vp, bp); | | 710 | VOP_STRATEGY(vp, bp); |
708 | | | 711 | |
709 | /* Pay for the read. */ | | 712 | /* Pay for the read. */ |
710 | curlwp->l_ru.ru_inblock++; | | 713 | curlwp->l_ru.ru_inblock++; |
711 | } else if (async) | | 714 | } else if (async) |
712 | brelse(bp, 0); | | 715 | brelse(bp, 0); |
713 | | | 716 | |
714 | if (vp->v_type == VBLK) | | 717 | if (vp->v_type == VBLK) |
715 | mp = spec_node_getmountedfs(vp); | | 718 | mp = spec_node_getmountedfs(vp); |
716 | else | | 719 | else |
717 | mp = vp->v_mount; | | 720 | mp = vp->v_mount; |
718 | | | 721 | |
719 | /* | | 722 | /* |
720 | * Collect statistics on synchronous and asynchronous reads. | | 723 | * Collect statistics on synchronous and asynchronous reads. |
721 | * Reads from block devices are charged to their associated | | 724 | * Reads from block devices are charged to their associated |
722 | * filesystem (if any). | | 725 | * filesystem (if any). |
723 | */ | | 726 | */ |
724 | if (mp != NULL) { | | 727 | if (mp != NULL) { |
725 | if (async == 0) | | 728 | if (async == 0) |
726 | mp->mnt_stat.f_syncreads++; | | 729 | mp->mnt_stat.f_syncreads++; |
727 | else | | 730 | else |
728 | mp->mnt_stat.f_asyncreads++; | | 731 | mp->mnt_stat.f_asyncreads++; |
729 | } | | 732 | } |
730 | | | 733 | |
731 | return (bp); | | 734 | return (bp); |
732 | } | | 735 | } |
733 | | | 736 | |
734 | /* | | 737 | /* |
735 | * Read a disk block. | | 738 | * Read a disk block. |
736 | * This algorithm described in Bach (p.54). | | 739 | * This algorithm described in Bach (p.54). |
737 | */ | | 740 | */ |
738 | int | | 741 | int |
739 | bread(struct vnode *vp, daddr_t blkno, int size, int flags, buf_t **bpp) | | 742 | bread(struct vnode *vp, daddr_t blkno, int size, int flags, buf_t **bpp) |
740 | { | | 743 | { |
741 | buf_t *bp; | | 744 | buf_t *bp; |
742 | int error; | | 745 | int error; |
743 | | | 746 | |
744 | BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist); | | 747 | BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist); |
745 | | | 748 | |
746 | /* Get buffer for block. */ | | 749 | /* Get buffer for block. */ |
747 | bp = *bpp = bio_doread(vp, blkno, size, 0); | | 750 | bp = *bpp = bio_doread(vp, blkno, size, 0); |
748 | if (bp == NULL) | | 751 | if (bp == NULL) |
749 | return ENOMEM; | | 752 | return ENOMEM; |
750 | | | 753 | |
751 | /* Wait for the read to complete, and return result. */ | | 754 | /* Wait for the read to complete, and return result. */ |
752 | error = biowait(bp); | | 755 | error = biowait(bp); |
753 | if (error == 0 && (flags & B_MODIFY) != 0) | | 756 | if (error == 0 && (flags & B_MODIFY) != 0) |
754 | error = fscow_run(bp, true); | | 757 | error = fscow_run(bp, true); |
755 | if (error) { | | 758 | if (error) { |
756 | brelse(bp, 0); | | 759 | brelse(bp, 0); |
757 | *bpp = NULL; | | 760 | *bpp = NULL; |
758 | } | | 761 | } |
759 | | | 762 | |
760 | return error; | | 763 | return error; |
761 | } | | 764 | } |
762 | | | 765 | |
763 | /* | | 766 | /* |
764 | * Read-ahead multiple disk blocks. The first is sync, the rest async. | | 767 | * Read-ahead multiple disk blocks. The first is sync, the rest async. |
765 | * Trivial modification to the breada algorithm presented in Bach (p.55). | | 768 | * Trivial modification to the breada algorithm presented in Bach (p.55). |
766 | */ | | 769 | */ |
767 | int | | 770 | int |
768 | breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks, | | 771 | breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks, |
769 | int *rasizes, int nrablks, int flags, buf_t **bpp) | | 772 | int *rasizes, int nrablks, int flags, buf_t **bpp) |
770 | { | | 773 | { |
771 | buf_t *bp; | | 774 | buf_t *bp; |
772 | int error, i; | | 775 | int error, i; |
773 | | | 776 | |
774 | BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist); | | 777 | BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist); |
775 | | | 778 | |
776 | bp = *bpp = bio_doread(vp, blkno, size, 0); | | 779 | bp = *bpp = bio_doread(vp, blkno, size, 0); |
777 | if (bp == NULL) | | 780 | if (bp == NULL) |
778 | return ENOMEM; | | 781 | return ENOMEM; |
779 | | | 782 | |
780 | /* | | 783 | /* |
781 | * For each of the read-ahead blocks, start a read, if necessary. | | 784 | * For each of the read-ahead blocks, start a read, if necessary. |
782 | */ | | 785 | */ |
783 | mutex_enter(&bufcache_lock); | | 786 | mutex_enter(&bufcache_lock); |
784 | for (i = 0; i < nrablks; i++) { | | 787 | for (i = 0; i < nrablks; i++) { |
785 | /* If it's in the cache, just go on to next one. */ | | 788 | /* If it's in the cache, just go on to next one. */ |
786 | if (incore(vp, rablks[i])) | | 789 | if (incore(vp, rablks[i])) |
787 | continue; | | 790 | continue; |
788 | | | 791 | |
789 | /* Get a buffer for the read-ahead block */ | | 792 | /* Get a buffer for the read-ahead block */ |
790 | mutex_exit(&bufcache_lock); | | 793 | mutex_exit(&bufcache_lock); |
791 | (void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC); | | 794 | (void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC); |
792 | mutex_enter(&bufcache_lock); | | 795 | mutex_enter(&bufcache_lock); |
793 | } | | 796 | } |
794 | mutex_exit(&bufcache_lock); | | 797 | mutex_exit(&bufcache_lock); |
795 | | | 798 | |
796 | /* Otherwise, we had to start a read for it; wait until it's valid. */ | | 799 | /* Otherwise, we had to start a read for it; wait until it's valid. */ |
797 | error = biowait(bp); | | 800 | error = biowait(bp); |
798 | if (error == 0 && (flags & B_MODIFY) != 0) | | 801 | if (error == 0 && (flags & B_MODIFY) != 0) |
799 | error = fscow_run(bp, true); | | 802 | error = fscow_run(bp, true); |
800 | if (error) { | | 803 | if (error) { |
801 | brelse(bp, 0); | | 804 | brelse(bp, 0); |
802 | *bpp = NULL; | | 805 | *bpp = NULL; |
803 | } | | 806 | } |
804 | | | 807 | |
805 | return error; | | 808 | return error; |
806 | } | | 809 | } |
807 | | | 810 | |
808 | /* | | 811 | /* |
809 | * Block write. Described in Bach (p.56) | | 812 | * Block write. Described in Bach (p.56) |
810 | */ | | 813 | */ |
811 | int | | 814 | int |
812 | bwrite(buf_t *bp) | | 815 | bwrite(buf_t *bp) |
813 | { | | 816 | { |
814 | int rv, sync, wasdelayed; | | 817 | int rv, sync, wasdelayed; |
815 | struct vnode *vp; | | 818 | struct vnode *vp; |
816 | struct mount *mp; | | 819 | struct mount *mp; |
817 | | | 820 | |
818 | BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx", | | 821 | BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx", |
819 | (uintptr_t)bp, 0, 0, 0); | | 822 | (uintptr_t)bp, 0, 0, 0); |
820 | | | 823 | |
821 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); | | 824 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); |
822 | KASSERT(!cv_has_waiters(&bp->b_done)); | | 825 | KASSERT(!cv_has_waiters(&bp->b_done)); |
823 | | | 826 | |
824 | vp = bp->b_vp; | | 827 | vp = bp->b_vp; |
825 | | | 828 | |
826 | /* | | 829 | /* |
827 | * dholland 20160728 AFAICT vp==NULL must be impossible as it | | 830 | * dholland 20160728 AFAICT vp==NULL must be impossible as it |
828 | * will crash upon reaching VOP_STRATEGY below... see further | | 831 | * will crash upon reaching VOP_STRATEGY below... see further |
829 | * analysis on tech-kern. | | 832 | * analysis on tech-kern. |
830 | */ | | 833 | */ |
831 | KASSERTMSG(vp != NULL, "bwrite given buffer with null vnode"); | | 834 | KASSERTMSG(vp != NULL, "bwrite given buffer with null vnode"); |
832 | | | 835 | |
833 | if (vp != NULL) { | | 836 | if (vp != NULL) { |
834 | KASSERT(bp->b_objlock == vp->v_interlock); | | 837 | KASSERT(bp->b_objlock == vp->v_interlock); |
835 | if (vp->v_type == VBLK) | | 838 | if (vp->v_type == VBLK) |
836 | mp = spec_node_getmountedfs(vp); | | 839 | mp = spec_node_getmountedfs(vp); |
837 | else | | 840 | else |
838 | mp = vp->v_mount; | | 841 | mp = vp->v_mount; |
839 | } else { | | 842 | } else { |
840 | mp = NULL; | | 843 | mp = NULL; |
841 | } | | 844 | } |
842 | | | 845 | |
843 | if (mp && mp->mnt_wapbl) { | | 846 | if (mp && mp->mnt_wapbl) { |
844 | if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { | | 847 | if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { |
845 | bdwrite(bp); | | 848 | bdwrite(bp); |
846 | return 0; | | 849 | return 0; |
847 | } | | 850 | } |
848 | } | | 851 | } |
849 | | | 852 | |
850 | /* | | 853 | /* |
851 | * Remember buffer type, to switch on it later. If the write was | | 854 | * Remember buffer type, to switch on it later. If the write was |
852 | * synchronous, but the file system was mounted with MNT_ASYNC, | | 855 | * synchronous, but the file system was mounted with MNT_ASYNC, |
853 | * convert it to a delayed write. | | 856 | * convert it to a delayed write. |
854 | * XXX note that this relies on delayed tape writes being converted | | 857 | * XXX note that this relies on delayed tape writes being converted |
855 | * to async, not sync writes (which is safe, but ugly). | | 858 | * to async, not sync writes (which is safe, but ugly). |
856 | */ | | 859 | */ |
857 | sync = !ISSET(bp->b_flags, B_ASYNC); | | 860 | sync = !ISSET(bp->b_flags, B_ASYNC); |
858 | if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) { | | 861 | if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) { |
859 | bdwrite(bp); | | 862 | bdwrite(bp); |
860 | return (0); | | 863 | return (0); |
861 | } | | 864 | } |
862 | | | 865 | |
863 | /* | | 866 | /* |
864 | * Collect statistics on synchronous and asynchronous writes. | | 867 | * Collect statistics on synchronous and asynchronous writes. |
865 | * Writes to block devices are charged to their associated | | 868 | * Writes to block devices are charged to their associated |
866 | * filesystem (if any). | | 869 | * filesystem (if any). |
867 | */ | | 870 | */ |
868 | if (mp != NULL) { | | 871 | if (mp != NULL) { |
869 | if (sync) | | 872 | if (sync) |
870 | mp->mnt_stat.f_syncwrites++; | | 873 | mp->mnt_stat.f_syncwrites++; |
871 | else | | 874 | else |
872 | mp->mnt_stat.f_asyncwrites++; | | 875 | mp->mnt_stat.f_asyncwrites++; |
873 | } | | 876 | } |
874 | | | 877 | |
875 | /* | | 878 | /* |
876 | * Pay for the I/O operation and make sure the buf is on the correct | | 879 | * Pay for the I/O operation and make sure the buf is on the correct |
877 | * vnode queue. | | 880 | * vnode queue. |
878 | */ | | 881 | */ |
879 | bp->b_error = 0; | | 882 | bp->b_error = 0; |
880 | wasdelayed = ISSET(bp->b_oflags, BO_DELWRI); | | 883 | wasdelayed = ISSET(bp->b_oflags, BO_DELWRI); |
881 | CLR(bp->b_flags, B_READ); | | 884 | CLR(bp->b_flags, B_READ); |
882 | if (wasdelayed) { | | 885 | if (wasdelayed) { |
883 | mutex_enter(&bufcache_lock); | | 886 | mutex_enter(&bufcache_lock); |
884 | mutex_enter(bp->b_objlock); | | 887 | mutex_enter(bp->b_objlock); |
885 | CLR(bp->b_oflags, BO_DONE | BO_DELWRI); | | 888 | CLR(bp->b_oflags, BO_DONE | BO_DELWRI); |
886 | reassignbuf(bp, bp->b_vp); | | 889 | reassignbuf(bp, bp->b_vp); |
887 | /* Wake anyone trying to busy the buffer via vnode's lists. */ | | 890 | /* Wake anyone trying to busy the buffer via vnode's lists. */ |
888 | cv_broadcast(&bp->b_busy); | | 891 | cv_broadcast(&bp->b_busy); |
889 | mutex_exit(&bufcache_lock); | | 892 | mutex_exit(&bufcache_lock); |
890 | } else { | | 893 | } else { |
891 | curlwp->l_ru.ru_oublock++; | | 894 | curlwp->l_ru.ru_oublock++; |
892 | mutex_enter(bp->b_objlock); | | 895 | mutex_enter(bp->b_objlock); |
893 | CLR(bp->b_oflags, BO_DONE | BO_DELWRI); | | 896 | CLR(bp->b_oflags, BO_DONE | BO_DELWRI); |
894 | } | | 897 | } |
895 | if (vp != NULL) | | 898 | if (vp != NULL) |
896 | vp->v_numoutput++; | | 899 | vp->v_numoutput++; |
897 | mutex_exit(bp->b_objlock); | | 900 | mutex_exit(bp->b_objlock); |
898 | | | 901 | |
899 | /* Initiate disk write. */ | | 902 | /* Initiate disk write. */ |
900 | if (sync) | | 903 | if (sync) |
901 | BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); | | 904 | BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); |
902 | else | | 905 | else |
903 | BIO_SETPRIO(bp, BPRIO_TIMELIMITED); | | 906 | BIO_SETPRIO(bp, BPRIO_TIMELIMITED); |
904 | | | 907 | |
905 | VOP_STRATEGY(vp, bp); | | 908 | VOP_STRATEGY(vp, bp); |
906 | | | 909 | |
907 | if (sync) { | | 910 | if (sync) { |
908 | /* If I/O was synchronous, wait for it to complete. */ | | 911 | /* If I/O was synchronous, wait for it to complete. */ |
909 | rv = biowait(bp); | | 912 | rv = biowait(bp); |
910 | | | 913 | |
911 | /* Release the buffer. */ | | 914 | /* Release the buffer. */ |
912 | brelse(bp, 0); | | 915 | brelse(bp, 0); |
913 | | | 916 | |
914 | return (rv); | | 917 | return (rv); |
915 | } else { | | 918 | } else { |
916 | return (0); | | 919 | return (0); |
917 | } | | 920 | } |
918 | } | | 921 | } |
919 | | | 922 | |
920 | int | | 923 | int |
921 | vn_bwrite(void *v) | | 924 | vn_bwrite(void *v) |
922 | { | | 925 | { |
923 | struct vop_bwrite_args *ap = v; | | 926 | struct vop_bwrite_args *ap = v; |
924 | | | 927 | |
925 | return (bwrite(ap->a_bp)); | | 928 | return (bwrite(ap->a_bp)); |
926 | } | | 929 | } |
927 | | | 930 | |
928 | /* | | 931 | /* |
929 | * Delayed write. | | 932 | * Delayed write. |
930 | * | | 933 | * |
931 | * The buffer is marked dirty, but is not queued for I/O. | | 934 | * The buffer is marked dirty, but is not queued for I/O. |
932 | * This routine should be used when the buffer is expected | | 935 | * This routine should be used when the buffer is expected |
933 | * to be modified again soon, typically a small write that | | 936 | * to be modified again soon, typically a small write that |
934 | * partially fills a buffer. | | 937 | * partially fills a buffer. |
935 | * | | 938 | * |
936 | * NB: magnetic tapes cannot be delayed; they must be | | 939 | * NB: magnetic tapes cannot be delayed; they must be |
937 | * written in the order that the writes are requested. | | 940 | * written in the order that the writes are requested. |
938 | * | | 941 | * |
939 | * Described in Leffler, et al. (pp. 208-213). | | 942 | * Described in Leffler, et al. (pp. 208-213). |
940 | */ | | 943 | */ |
941 | void | | 944 | void |
942 | bdwrite(buf_t *bp) | | 945 | bdwrite(buf_t *bp) |
943 | { | | 946 | { |
944 | | | 947 | |
945 | BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx", | | 948 | BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx", |
946 | (uintptr_t)bp, 0, 0, 0); | | 949 | (uintptr_t)bp, 0, 0, 0); |
947 | | | 950 | |
948 | KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS || | | 951 | KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS || |
949 | bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE)); | | 952 | bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE)); |
950 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); | | 953 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); |
951 | KASSERT(!cv_has_waiters(&bp->b_done)); | | 954 | KASSERT(!cv_has_waiters(&bp->b_done)); |
952 | | | 955 | |
953 | /* If this is a tape block, write the block now. */ | | 956 | /* If this is a tape block, write the block now. */ |
954 | if (bdev_type(bp->b_dev) == D_TAPE) { | | 957 | if (bdev_type(bp->b_dev) == D_TAPE) { |
955 | bawrite(bp); | | 958 | bawrite(bp); |
956 | return; | | 959 | return; |
957 | } | | 960 | } |
958 | | | 961 | |
959 | if (wapbl_vphaswapbl(bp->b_vp)) { | | 962 | if (wapbl_vphaswapbl(bp->b_vp)) { |
960 | struct mount *mp = wapbl_vptomp(bp->b_vp); | | 963 | struct mount *mp = wapbl_vptomp(bp->b_vp); |
961 | | | 964 | |
962 | if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { | | 965 | if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { |
963 | WAPBL_ADD_BUF(mp, bp); | | 966 | WAPBL_ADD_BUF(mp, bp); |
964 | } | | 967 | } |
965 | } | | 968 | } |
966 | | | 969 | |
967 | /* | | 970 | /* |
968 | * If the block hasn't been seen before: | | 971 | * If the block hasn't been seen before: |
969 | * (1) Mark it as having been seen, | | 972 | * (1) Mark it as having been seen, |
970 | * (2) Charge for the write, | | 973 | * (2) Charge for the write, |
971 | * (3) Make sure it's on its vnode's correct block list. | | 974 | * (3) Make sure it's on its vnode's correct block list. |
972 | */ | | 975 | */ |
973 | KASSERT(bp->b_vp == NULL || bp->b_objlock == bp->b_vp->v_interlock); | | 976 | KASSERT(bp->b_vp == NULL || bp->b_objlock == bp->b_vp->v_interlock); |
974 | | | 977 | |
975 | if (!ISSET(bp->b_oflags, BO_DELWRI)) { | | 978 | if (!ISSET(bp->b_oflags, BO_DELWRI)) { |
976 | mutex_enter(&bufcache_lock); | | 979 | mutex_enter(&bufcache_lock); |
977 | mutex_enter(bp->b_objlock); | | 980 | mutex_enter(bp->b_objlock); |
978 | SET(bp->b_oflags, BO_DELWRI); | | 981 | SET(bp->b_oflags, BO_DELWRI); |
979 | curlwp->l_ru.ru_oublock++; | | 982 | curlwp->l_ru.ru_oublock++; |
980 | reassignbuf(bp, bp->b_vp); | | 983 | reassignbuf(bp, bp->b_vp); |
981 | /* Wake anyone trying to busy the buffer via vnode's lists. */ | | 984 | /* Wake anyone trying to busy the buffer via vnode's lists. */ |
982 | cv_broadcast(&bp->b_busy); | | 985 | cv_broadcast(&bp->b_busy); |
983 | mutex_exit(&bufcache_lock); | | 986 | mutex_exit(&bufcache_lock); |
984 | } else { | | 987 | } else { |
985 | mutex_enter(bp->b_objlock); | | 988 | mutex_enter(bp->b_objlock); |
986 | } | | 989 | } |
987 | /* Otherwise, the "write" is done, so mark and release the buffer. */ | | 990 | /* Otherwise, the "write" is done, so mark and release the buffer. */ |
988 | CLR(bp->b_oflags, BO_DONE); | | 991 | CLR(bp->b_oflags, BO_DONE); |
989 | mutex_exit(bp->b_objlock); | | 992 | mutex_exit(bp->b_objlock); |
990 | | | 993 | |
991 | brelse(bp, 0); | | 994 | brelse(bp, 0); |
992 | } | | 995 | } |
993 | | | 996 | |
994 | /* | | 997 | /* |
995 | * Asynchronous block write; just an asynchronous bwrite(). | | 998 | * Asynchronous block write; just an asynchronous bwrite(). |
996 | */ | | 999 | */ |
997 | void | | 1000 | void |
998 | bawrite(buf_t *bp) | | 1001 | bawrite(buf_t *bp) |
999 | { | | 1002 | { |
1000 | | | 1003 | |
1001 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); | | 1004 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); |
1002 | KASSERT(bp->b_vp != NULL); | | 1005 | KASSERT(bp->b_vp != NULL); |
1003 | | | 1006 | |
1004 | SET(bp->b_flags, B_ASYNC); | | 1007 | SET(bp->b_flags, B_ASYNC); |
1005 | VOP_BWRITE(bp->b_vp, bp); | | 1008 | VOP_BWRITE(bp->b_vp, bp); |
1006 | } | | 1009 | } |
1007 | | | 1010 | |
1008 | /* | | 1011 | /* |
1009 | * Release a buffer on to the free lists. | | 1012 | * Release a buffer on to the free lists. |
1010 | * Described in Bach (p. 46). | | 1013 | * Described in Bach (p. 46). |
1011 | */ | | 1014 | */ |
1012 | void | | 1015 | void |
1013 | brelsel(buf_t *bp, int set) | | 1016 | brelsel(buf_t *bp, int set) |
1014 | { | | 1017 | { |
1015 | struct bqueue *bufq; | | 1018 | struct bqueue *bufq; |
1016 | struct vnode *vp; | | 1019 | struct vnode *vp; |
1017 | | | 1020 | |
1018 | SDT_PROBE2(io, kernel, , brelse, bp, set); | | 1021 | SDT_PROBE2(io, kernel, , brelse, bp, set); |
1019 | | | 1022 | |
1020 | KASSERT(bp != NULL); | | 1023 | KASSERT(bp != NULL); |
1021 | KASSERT(mutex_owned(&bufcache_lock)); | | 1024 | KASSERT(mutex_owned(&bufcache_lock)); |
1022 | KASSERT(!cv_has_waiters(&bp->b_done)); | | 1025 | KASSERT(!cv_has_waiters(&bp->b_done)); |
1023 | | | 1026 | |
1024 | SET(bp->b_cflags, set); | | 1027 | SET(bp->b_cflags, set); |
1025 | | | 1028 | |
1026 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); | | 1029 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); |
1027 | KASSERT(bp->b_iodone == NULL); | | 1030 | KASSERT(bp->b_iodone == NULL); |
1028 | | | 1031 | |
1029 | /* Wake up any processes waiting for any buffer to become free. */ | | 1032 | /* Wake up any processes waiting for any buffer to become free. */ |
1030 | cv_signal(&needbuffer_cv); | | 1033 | cv_signal(&needbuffer_cv); |
1031 | | | 1034 | |
1032 | /* Wake up any proceeses waiting for _this_ buffer to become free */ | | 1035 | /* Wake up any proceeses waiting for _this_ buffer to become free */ |
1033 | if (ISSET(bp->b_cflags, BC_WANTED)) | | 1036 | if (ISSET(bp->b_cflags, BC_WANTED)) |
1034 | CLR(bp->b_cflags, BC_WANTED|BC_AGE); | | 1037 | CLR(bp->b_cflags, BC_WANTED|BC_AGE); |
1035 | | | 1038 | |
1036 | /* If it's clean clear the copy-on-write flag. */ | | 1039 | /* If it's clean clear the copy-on-write flag. */ |
1037 | if (ISSET(bp->b_flags, B_COWDONE)) { | | 1040 | if (ISSET(bp->b_flags, B_COWDONE)) { |
1038 | mutex_enter(bp->b_objlock); | | 1041 | mutex_enter(bp->b_objlock); |
1039 | if (!ISSET(bp->b_oflags, BO_DELWRI)) | | 1042 | if (!ISSET(bp->b_oflags, BO_DELWRI)) |
1040 | CLR(bp->b_flags, B_COWDONE); | | 1043 | CLR(bp->b_flags, B_COWDONE); |
1041 | mutex_exit(bp->b_objlock); | | 1044 | mutex_exit(bp->b_objlock); |
1042 | } | | 1045 | } |
1043 | | | 1046 | |
1044 | /* | | 1047 | /* |
1045 | * Determine which queue the buffer should be on, then put it there. | | 1048 | * Determine which queue the buffer should be on, then put it there. |
1046 | */ | | 1049 | */ |
1047 | | | 1050 | |
1048 | /* If it's locked, don't report an error; try again later. */ | | 1051 | /* If it's locked, don't report an error; try again later. */ |
1049 | if (ISSET(bp->b_flags, B_LOCKED)) | | 1052 | if (ISSET(bp->b_flags, B_LOCKED)) |
1050 | bp->b_error = 0; | | 1053 | bp->b_error = 0; |
1051 | | | 1054 | |
1052 | /* If it's not cacheable, or an error, mark it invalid. */ | | 1055 | /* If it's not cacheable, or an error, mark it invalid. */ |
1053 | if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0) | | 1056 | if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0) |
1054 | SET(bp->b_cflags, BC_INVAL); | | 1057 | SET(bp->b_cflags, BC_INVAL); |
1055 | | | 1058 | |
1056 | if (ISSET(bp->b_cflags, BC_VFLUSH)) { | | 1059 | if (ISSET(bp->b_cflags, BC_VFLUSH)) { |
1057 | /* | | 1060 | /* |
1058 | * This is a delayed write buffer that was just flushed to | | 1061 | * This is a delayed write buffer that was just flushed to |
1059 | * disk. It is still on the LRU queue. If it's become | | 1062 | * disk. It is still on the LRU queue. If it's become |
1060 | * invalid, then we need to move it to a different queue; | | 1063 | * invalid, then we need to move it to a different queue; |
1061 | * otherwise leave it in its current position. | | 1064 | * otherwise leave it in its current position. |
1062 | */ | | 1065 | */ |
1063 | CLR(bp->b_cflags, BC_VFLUSH); | | 1066 | CLR(bp->b_cflags, BC_VFLUSH); |
1064 | if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) && | | 1067 | if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) && |
1065 | !ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) { | | 1068 | !ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) { |
1066 | KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1)); | | 1069 | KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1)); |
1067 | goto already_queued; | | 1070 | goto already_queued; |
1068 | } else { | | 1071 | } else { |
1069 | bremfree(bp); | | 1072 | bremfree(bp); |
1070 | } | | 1073 | } |
1071 | } | | 1074 | } |
1072 | | | 1075 | |
1073 | KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0)); | | 1076 | KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0)); |
1074 | KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0)); | | 1077 | KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0)); |
1075 | KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0)); | | 1078 | KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0)); |
1076 | | | 1079 | |
1077 | if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) { | | 1080 | if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) { |
1078 | /* | | 1081 | /* |
1079 | * If it's invalid or empty, dissociate it from its vnode | | 1082 | * If it's invalid or empty, dissociate it from its vnode |
1080 | * and put on the head of the appropriate queue. | | 1083 | * and put on the head of the appropriate queue. |
1081 | */ | | 1084 | */ |
1082 | if (ISSET(bp->b_flags, B_LOCKED)) { | | 1085 | if (ISSET(bp->b_flags, B_LOCKED)) { |
1083 | if (wapbl_vphaswapbl(vp = bp->b_vp)) { | | 1086 | if (wapbl_vphaswapbl(vp = bp->b_vp)) { |
1084 | struct mount *mp = wapbl_vptomp(vp); | | 1087 | struct mount *mp = wapbl_vptomp(vp); |
1085 | | | 1088 | |
1086 | KASSERT(bp->b_iodone | | 1089 | KASSERT(bp->b_iodone |
1087 | != mp->mnt_wapbl_op->wo_wapbl_biodone); | | 1090 | != mp->mnt_wapbl_op->wo_wapbl_biodone); |
1088 | WAPBL_REMOVE_BUF(mp, bp); | | 1091 | WAPBL_REMOVE_BUF(mp, bp); |
1089 | } | | 1092 | } |
1090 | } | | 1093 | } |
1091 | | | 1094 | |
1092 | mutex_enter(bp->b_objlock); | | 1095 | mutex_enter(bp->b_objlock); |
1093 | CLR(bp->b_oflags, BO_DONE|BO_DELWRI); | | 1096 | CLR(bp->b_oflags, BO_DONE|BO_DELWRI); |
1094 | if ((vp = bp->b_vp) != NULL) { | | 1097 | if ((vp = bp->b_vp) != NULL) { |
1095 | KASSERT(bp->b_objlock == vp->v_interlock); | | 1098 | KASSERT(bp->b_objlock == vp->v_interlock); |
1096 | reassignbuf(bp, bp->b_vp); | | 1099 | reassignbuf(bp, bp->b_vp); |
1097 | brelvp(bp); | | 1100 | brelvp(bp); |
1098 | mutex_exit(vp->v_interlock); | | 1101 | mutex_exit(vp->v_interlock); |
1099 | } else { | | 1102 | } else { |
1100 | KASSERT(bp->b_objlock == &buffer_lock); | | 1103 | KASSERT(bp->b_objlock == &buffer_lock); |
1101 | mutex_exit(bp->b_objlock); | | 1104 | mutex_exit(bp->b_objlock); |
1102 | } | | 1105 | } |
1103 | /* We want to dispose of the buffer, so wake everybody. */ | | 1106 | /* We want to dispose of the buffer, so wake everybody. */ |
1104 | cv_broadcast(&bp->b_busy); | | 1107 | cv_broadcast(&bp->b_busy); |
1105 | if (bp->b_bufsize <= 0) | | 1108 | if (bp->b_bufsize <= 0) |
1106 | /* no data */ | | 1109 | /* no data */ |
1107 | goto already_queued; | | 1110 | goto already_queued; |
1108 | else | | 1111 | else |
1109 | /* invalid data */ | | 1112 | /* invalid data */ |
1110 | bufq = &bufqueues[BQ_AGE]; | | 1113 | bufq = &bufqueues[BQ_AGE]; |
1111 | binsheadfree(bp, bufq); | | 1114 | binsheadfree(bp, bufq); |
1112 | } else { | | 1115 | } else { |
1113 | /* | | 1116 | /* |
1114 | * It has valid data. Put it on the end of the appropriate | | 1117 | * It has valid data. Put it on the end of the appropriate |
1115 | * queue, so that it'll stick around for as long as possible. | | 1118 | * queue, so that it'll stick around for as long as possible. |
1116 | * If buf is AGE, but has dependencies, must put it on last | | 1119 | * If buf is AGE, but has dependencies, must put it on last |
1117 | * bufqueue to be scanned, ie LRU. This protects against the | | 1120 | * bufqueue to be scanned, ie LRU. This protects against the |
1118 | * livelock where BQ_AGE only has buffers with dependencies, | | 1121 | * livelock where BQ_AGE only has buffers with dependencies, |
1119 | * and we thus never get to the dependent buffers in BQ_LRU. | | 1122 | * and we thus never get to the dependent buffers in BQ_LRU. |
1120 | */ | | 1123 | */ |
1121 | if (ISSET(bp->b_flags, B_LOCKED)) { | | 1124 | if (ISSET(bp->b_flags, B_LOCKED)) { |
1122 | /* locked in core */ | | 1125 | /* locked in core */ |
1123 | bufq = &bufqueues[BQ_LOCKED]; | | 1126 | bufq = &bufqueues[BQ_LOCKED]; |
1124 | } else if (!ISSET(bp->b_cflags, BC_AGE)) { | | 1127 | } else if (!ISSET(bp->b_cflags, BC_AGE)) { |
1125 | /* valid data */ | | 1128 | /* valid data */ |
1126 | bufq = &bufqueues[BQ_LRU]; | | 1129 | bufq = &bufqueues[BQ_LRU]; |
1127 | } else { | | 1130 | } else { |
1128 | /* stale but valid data */ | | 1131 | /* stale but valid data */ |
1129 | bufq = &bufqueues[BQ_AGE]; | | 1132 | bufq = &bufqueues[BQ_AGE]; |
1130 | } | | 1133 | } |
1131 | binstailfree(bp, bufq); | | 1134 | binstailfree(bp, bufq); |
1132 | } | | 1135 | } |
1133 | already_queued: | | 1136 | already_queued: |
1134 | /* Unlock the buffer. */ | | 1137 | /* Unlock the buffer. */ |
1135 | CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE); | | 1138 | CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE); |
1136 | CLR(bp->b_flags, B_ASYNC); | | 1139 | CLR(bp->b_flags, B_ASYNC); |
1137 | | | 1140 | |
1138 | /* | | 1141 | /* |
1139 | * Wake only the highest priority waiter on the lock, in order to | | 1142 | * Wake only the highest priority waiter on the lock, in order to |
1140 | * prevent a thundering herd: many LWPs simultaneously awakening and | | 1143 | * prevent a thundering herd: many LWPs simultaneously awakening and |
1141 | * competing for the buffer's lock. Testing in 2019 revealed this | | 1144 | * competing for the buffer's lock. Testing in 2019 revealed this |
1142 | * to reduce contention on bufcache_lock tenfold during a kernel | | 1145 | * to reduce contention on bufcache_lock tenfold during a kernel |
1143 | * compile. Here and elsewhere, when the buffer is changing | | 1146 | * compile. Here and elsewhere, when the buffer is changing |
1144 | * identity, being disposed of, or moving from one list to another, | | 1147 | * identity, being disposed of, or moving from one list to another, |
1145 | * we wake all lock requestors. | | 1148 | * we wake all lock requestors. |
1146 | */ | | 1149 | */ |
1147 | if (bp->b_bufsize <= 0) { | | 1150 | if (bp->b_bufsize <= 0) { |
1148 | cv_broadcast(&bp->b_busy); | | 1151 | cv_broadcast(&bp->b_busy); |
1149 | buf_destroy(bp); | | 1152 | buf_destroy(bp); |
1150 | #ifdef DEBUG | | 1153 | #ifdef DEBUG |
1151 | memset((char *)bp, 0, sizeof(*bp)); | | 1154 | memset((char *)bp, 0, sizeof(*bp)); |
1152 | #endif | | 1155 | #endif |
1153 | pool_cache_put(buf_cache, bp); | | 1156 | pool_cache_put(buf_cache, bp); |
1154 | } else | | 1157 | } else |
1155 | cv_signal(&bp->b_busy); | | 1158 | cv_signal(&bp->b_busy); |
1156 | } | | 1159 | } |
1157 | | | 1160 | |
1158 | void | | 1161 | void |
1159 | brelse(buf_t *bp, int set) | | 1162 | brelse(buf_t *bp, int set) |
1160 | { | | 1163 | { |
1161 | | | 1164 | |
1162 | mutex_enter(&bufcache_lock); | | 1165 | mutex_enter(&bufcache_lock); |
1163 | brelsel(bp, set); | | 1166 | brelsel(bp, set); |
1164 | mutex_exit(&bufcache_lock); | | 1167 | mutex_exit(&bufcache_lock); |
1165 | } | | 1168 | } |
1166 | | | 1169 | |
1167 | /* | | 1170 | /* |
1168 | * Determine if a block is in the cache. | | 1171 | * Determine if a block is in the cache. |
1169 | * Just look on what would be its hash chain. If it's there, return | | 1172 | * Just look on what would be its hash chain. If it's there, return |
1170 | * a pointer to it, unless it's marked invalid. If it's marked invalid, | | 1173 | * a pointer to it, unless it's marked invalid. If it's marked invalid, |
1171 | * we normally don't return the buffer, unless the caller explicitly | | 1174 | * we normally don't return the buffer, unless the caller explicitly |
1172 | * wants us to. | | 1175 | * wants us to. |
1173 | */ | | 1176 | */ |
1174 | buf_t * | | 1177 | buf_t * |
1175 | incore(struct vnode *vp, daddr_t blkno) | | 1178 | incore(struct vnode *vp, daddr_t blkno) |
1176 | { | | 1179 | { |
1177 | buf_t *bp; | | 1180 | buf_t *bp; |
1178 | | | 1181 | |
1179 | KASSERT(mutex_owned(&bufcache_lock)); | | 1182 | KASSERT(mutex_owned(&bufcache_lock)); |
1180 | | | 1183 | |
1181 | /* Search hash chain */ | | 1184 | /* Search hash chain */ |
1182 | LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) { | | 1185 | LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) { |
1183 | if (bp->b_lblkno == blkno && bp->b_vp == vp && | | 1186 | if (bp->b_lblkno == blkno && bp->b_vp == vp && |
1184 | !ISSET(bp->b_cflags, BC_INVAL)) { | | 1187 | !ISSET(bp->b_cflags, BC_INVAL)) { |
1185 | KASSERT(bp->b_objlock == vp->v_interlock); | | 1188 | KASSERT(bp->b_objlock == vp->v_interlock); |
1186 | return (bp); | | 1189 | return (bp); |
1187 | } | | 1190 | } |
1188 | } | | 1191 | } |
1189 | | | 1192 | |
1190 | return (NULL); | | 1193 | return (NULL); |
1191 | } | | 1194 | } |
1192 | | | 1195 | |
1193 | /* | | 1196 | /* |
1194 | * Get a block of requested size that is associated with | | 1197 | * Get a block of requested size that is associated with |
1195 | * a given vnode and block offset. If it is found in the | | 1198 | * a given vnode and block offset. If it is found in the |
1196 | * block cache, mark it as having been found, make it busy | | 1199 | * block cache, mark it as having been found, make it busy |
1197 | * and return it. Otherwise, return an empty block of the | | 1200 | * and return it. Otherwise, return an empty block of the |
1198 | * correct size. It is up to the caller to insure that the | | 1201 | * correct size. It is up to the caller to insure that the |
1199 | * cached blocks be of the correct size. | | 1202 | * cached blocks be of the correct size. |
1200 | */ | | 1203 | */ |
1201 | buf_t * | | 1204 | buf_t * |
1202 | getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo) | | 1205 | getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo) |
1203 | { | | 1206 | { |
1204 | int err, preserve; | | 1207 | int err, preserve; |
1205 | buf_t *bp; | | 1208 | buf_t *bp; |
1206 | | | 1209 | |
1207 | mutex_enter(&bufcache_lock); | | 1210 | mutex_enter(&bufcache_lock); |
1208 | SDT_PROBE3(io, kernel, , getblk__start, vp, blkno, size); | | 1211 | SDT_PROBE3(io, kernel, , getblk__start, vp, blkno, size); |
1209 | loop: | | 1212 | loop: |
1210 | bp = incore(vp, blkno); | | 1213 | bp = incore(vp, blkno); |
1211 | if (bp != NULL) { | | 1214 | if (bp != NULL) { |
1212 | err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL); | | 1215 | err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL); |
1213 | if (err != 0) { | | 1216 | if (err != 0) { |
1214 | if (err == EPASSTHROUGH) | | 1217 | if (err == EPASSTHROUGH) |
1215 | goto loop; | | 1218 | goto loop; |
1216 | mutex_exit(&bufcache_lock); | | 1219 | mutex_exit(&bufcache_lock); |
1217 | SDT_PROBE4(io, kernel, , getblk__done, | | 1220 | SDT_PROBE4(io, kernel, , getblk__done, |
1218 | vp, blkno, size, NULL); | | 1221 | vp, blkno, size, NULL); |
1219 | return (NULL); | | 1222 | return (NULL); |
1220 | } | | 1223 | } |
1221 | KASSERT(!cv_has_waiters(&bp->b_done)); | | 1224 | KASSERT(!cv_has_waiters(&bp->b_done)); |
1222 | #ifdef DIAGNOSTIC | | 1225 | #ifdef DIAGNOSTIC |
1223 | if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) && | | 1226 | if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) && |
1224 | bp->b_bcount < size && vp->v_type != VBLK) | | 1227 | bp->b_bcount < size && vp->v_type != VBLK) |
1225 | panic("getblk: block size invariant failed"); | | 1228 | panic("getblk: block size invariant failed"); |
1226 | #endif | | 1229 | #endif |
1227 | bremfree(bp); | | 1230 | bremfree(bp); |
1228 | preserve = 1; | | 1231 | preserve = 1; |
1229 | } else { | | 1232 | } else { |
1230 | if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL) | | 1233 | if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL) |
1231 | goto loop; | | 1234 | goto loop; |
1232 | | | 1235 | |
1233 | if (incore(vp, blkno) != NULL) { | | 1236 | if (incore(vp, blkno) != NULL) { |
1234 | /* The block has come into memory in the meantime. */ | | 1237 | /* The block has come into memory in the meantime. */ |
1235 | brelsel(bp, 0); | | 1238 | brelsel(bp, 0); |
1236 | goto loop; | | 1239 | goto loop; |
1237 | } | | 1240 | } |
1238 | | | 1241 | |
1239 | LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash); | | 1242 | LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash); |
1240 | bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno; | | 1243 | bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno; |
1241 | mutex_enter(vp->v_interlock); | | 1244 | mutex_enter(vp->v_interlock); |
1242 | bgetvp(vp, bp); | | 1245 | bgetvp(vp, bp); |
1243 | mutex_exit(vp->v_interlock); | | 1246 | mutex_exit(vp->v_interlock); |
1244 | preserve = 0; | | 1247 | preserve = 0; |
1245 | } | | 1248 | } |
1246 | mutex_exit(&bufcache_lock); | | 1249 | mutex_exit(&bufcache_lock); |
1247 | | | 1250 | |
1248 | /* | | 1251 | /* |
1249 | * LFS can't track total size of B_LOCKED buffer (locked_queue_bytes) | | 1252 | * LFS can't track total size of B_LOCKED buffer (locked_queue_bytes) |
1250 | * if we re-size buffers here. | | 1253 | * if we re-size buffers here. |
1251 | */ | | 1254 | */ |
1252 | if (ISSET(bp->b_flags, B_LOCKED)) { | | 1255 | if (ISSET(bp->b_flags, B_LOCKED)) { |
1253 | KASSERT(bp->b_bufsize >= size); | | 1256 | KASSERT(bp->b_bufsize >= size); |
1254 | } else { | | 1257 | } else { |
1255 | if (allocbuf(bp, size, preserve)) { | | 1258 | if (allocbuf(bp, size, preserve)) { |
1256 | mutex_enter(&bufcache_lock); | | 1259 | mutex_enter(&bufcache_lock); |
1257 | LIST_REMOVE(bp, b_hash); | | 1260 | LIST_REMOVE(bp, b_hash); |
1258 | brelsel(bp, BC_INVAL); | | 1261 | brelsel(bp, BC_INVAL); |
1259 | mutex_exit(&bufcache_lock); | | 1262 | mutex_exit(&bufcache_lock); |
1260 | SDT_PROBE4(io, kernel, , getblk__done, | | 1263 | SDT_PROBE4(io, kernel, , getblk__done, |
1261 | vp, blkno, size, NULL); | | 1264 | vp, blkno, size, NULL); |
1262 | return NULL; | | 1265 | return NULL; |
1263 | } | | 1266 | } |
1264 | } | | 1267 | } |
1265 | BIO_SETPRIO(bp, BPRIO_DEFAULT); | | 1268 | BIO_SETPRIO(bp, BPRIO_DEFAULT); |
1266 | SDT_PROBE4(io, kernel, , getblk__done, vp, blkno, size, bp); | | 1269 | SDT_PROBE4(io, kernel, , getblk__done, vp, blkno, size, bp); |
1267 | return (bp); | | 1270 | return (bp); |
1268 | } | | 1271 | } |
1269 | | | 1272 | |
1270 | /* | | 1273 | /* |
1271 | * Get an empty, disassociated buffer of given size. | | 1274 | * Get an empty, disassociated buffer of given size. |
1272 | */ | | 1275 | */ |
1273 | buf_t * | | 1276 | buf_t * |
1274 | geteblk(int size) | | 1277 | geteblk(int size) |
1275 | { | | 1278 | { |
1276 | buf_t *bp; | | 1279 | buf_t *bp; |
1277 | int error __diagused; | | 1280 | int error __diagused; |
1278 | | | 1281 | |
1279 | mutex_enter(&bufcache_lock); | | 1282 | mutex_enter(&bufcache_lock); |
1280 | while ((bp = getnewbuf(0, 0, 0)) == NULL) | | 1283 | while ((bp = getnewbuf(0, 0, 0)) == NULL) |
1281 | ; | | 1284 | ; |
1282 | | | 1285 | |
1283 | SET(bp->b_cflags, BC_INVAL); | | 1286 | SET(bp->b_cflags, BC_INVAL); |
1284 | LIST_INSERT_HEAD(&invalhash, bp, b_hash); | | 1287 | LIST_INSERT_HEAD(&invalhash, bp, b_hash); |
1285 | mutex_exit(&bufcache_lock); | | 1288 | mutex_exit(&bufcache_lock); |
1286 | BIO_SETPRIO(bp, BPRIO_DEFAULT); | | 1289 | BIO_SETPRIO(bp, BPRIO_DEFAULT); |
1287 | error = allocbuf(bp, size, 0); | | 1290 | error = allocbuf(bp, size, 0); |
1288 | KASSERT(error == 0); | | 1291 | KASSERT(error == 0); |
1289 | return (bp); | | 1292 | return (bp); |
1290 | } | | 1293 | } |
1291 | | | 1294 | |
1292 | /* | | 1295 | /* |
1293 | * Expand or contract the actual memory allocated to a buffer. | | 1296 | * Expand or contract the actual memory allocated to a buffer. |
1294 | * | | 1297 | * |
1295 | * If the buffer shrinks, data is lost, so it's up to the | | 1298 | * If the buffer shrinks, data is lost, so it's up to the |
1296 | * caller to have written it out *first*; this routine will not | | 1299 | * caller to have written it out *first*; this routine will not |
1297 | * start a write. If the buffer grows, it's the callers | | 1300 | * start a write. If the buffer grows, it's the callers |
1298 | * responsibility to fill out the buffer's additional contents. | | 1301 | * responsibility to fill out the buffer's additional contents. |
1299 | */ | | 1302 | */ |
1300 | int | | 1303 | int |
1301 | allocbuf(buf_t *bp, int size, int preserve) | | 1304 | allocbuf(buf_t *bp, int size, int preserve) |
1302 | { | | 1305 | { |
1303 | void *addr; | | 1306 | void *addr; |
1304 | vsize_t oldsize, desired_size; | | 1307 | vsize_t oldsize, desired_size; |
1305 | int oldcount; | | 1308 | int oldcount; |
1306 | int delta; | | 1309 | int delta; |
1307 | | | 1310 | |
1308 | desired_size = buf_roundsize(size); | | 1311 | desired_size = buf_roundsize(size); |
1309 | if (desired_size > MAXBSIZE) | | 1312 | if (desired_size > MAXBSIZE) |
1310 | printf("allocbuf: buffer larger than MAXBSIZE requested"); | | 1313 | printf("allocbuf: buffer larger than MAXBSIZE requested"); |
1311 | | | 1314 | |
1312 | oldcount = bp->b_bcount; | | 1315 | oldcount = bp->b_bcount; |
1313 | | | 1316 | |
1314 | bp->b_bcount = size; | | 1317 | bp->b_bcount = size; |
1315 | | | 1318 | |
1316 | oldsize = bp->b_bufsize; | | 1319 | oldsize = bp->b_bufsize; |
1317 | if (oldsize == desired_size) { | | 1320 | if (oldsize == desired_size) { |
1318 | /* | | 1321 | /* |
1319 | * Do not short cut the WAPBL resize, as the buffer length | | 1322 | * Do not short cut the WAPBL resize, as the buffer length |
1320 | * could still have changed and this would corrupt the | | 1323 | * could still have changed and this would corrupt the |
1321 | * tracking of the transaction length. | | 1324 | * tracking of the transaction length. |
1322 | */ | | 1325 | */ |
1323 | goto out; | | 1326 | goto out; |
1324 | } | | 1327 | } |
1325 | | | 1328 | |
1326 | /* | | 1329 | /* |
1327 | * If we want a buffer of a different size, re-allocate the | | 1330 | * If we want a buffer of a different size, re-allocate the |
1328 | * buffer's memory; copy old content only if needed. | | 1331 | * buffer's memory; copy old content only if needed. |
1329 | */ | | 1332 | */ |
1330 | addr = buf_alloc(desired_size); | | 1333 | addr = buf_alloc(desired_size); |
1331 | if (addr == NULL) | | 1334 | if (addr == NULL) |
1332 | return ENOMEM; | | 1335 | return ENOMEM; |
1333 | if (preserve) | | 1336 | if (preserve) |
1334 | memcpy(addr, bp->b_data, MIN(oldsize,desired_size)); | | 1337 | memcpy(addr, bp->b_data, MIN(oldsize,desired_size)); |
1335 | if (bp->b_data != NULL) | | 1338 | if (bp->b_data != NULL) |
1336 | buf_mrelease(bp->b_data, oldsize); | | 1339 | buf_mrelease(bp->b_data, oldsize); |
1337 | bp->b_data = addr; | | 1340 | bp->b_data = addr; |
1338 | bp->b_bufsize = desired_size; | | 1341 | bp->b_bufsize = desired_size; |
1339 | | | 1342 | |
1340 | /* | | 1343 | /* |
1341 | * Update overall buffer memory counter (protected by bufcache_lock) | | 1344 | * Update overall buffer memory counter (protected by bufcache_lock) |
1342 | */ | | 1345 | */ |
1343 | delta = (long)desired_size - (long)oldsize; | | 1346 | delta = (long)desired_size - (long)oldsize; |
1344 | | | 1347 | |
1345 | mutex_enter(&bufcache_lock); | | 1348 | mutex_enter(&bufcache_lock); |
1346 | if ((bufmem += delta) > bufmem_hiwater) { | | 1349 | if ((bufmem += delta) > bufmem_hiwater) { |
1347 | /* | | 1350 | /* |
1348 | * Need to trim overall memory usage. | | 1351 | * Need to trim overall memory usage. |
1349 | */ | | 1352 | */ |
1350 | while (buf_canrelease()) { | | 1353 | while (buf_canrelease()) { |
1351 | if (preempt_needed()) { | | 1354 | if (preempt_needed()) { |
1352 | mutex_exit(&bufcache_lock); | | 1355 | mutex_exit(&bufcache_lock); |
1353 | preempt(); | | 1356 | preempt(); |
1354 | mutex_enter(&bufcache_lock); | | 1357 | mutex_enter(&bufcache_lock); |
1355 | } | | 1358 | } |
1356 | if (buf_trim() == 0) | | 1359 | if (buf_trim() == 0) |
1357 | break; | | 1360 | break; |
1358 | } | | 1361 | } |
1359 | } | | 1362 | } |
1360 | mutex_exit(&bufcache_lock); | | 1363 | mutex_exit(&bufcache_lock); |
1361 | | | 1364 | |
1362 | out: | | 1365 | out: |
1363 | if (wapbl_vphaswapbl(bp->b_vp)) | | 1366 | if (wapbl_vphaswapbl(bp->b_vp)) |
1364 | WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount); | | 1367 | WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount); |
1365 | | | 1368 | |
1366 | return 0; | | 1369 | return 0; |
1367 | } | | 1370 | } |
1368 | | | 1371 | |
1369 | /* | | 1372 | /* |
1370 | * Find a buffer which is available for use. | | 1373 | * Find a buffer which is available for use. |
1371 | * Select something from a free list. | | 1374 | * Select something from a free list. |
1372 | * Preference is to AGE list, then LRU list. | | 1375 | * Preference is to AGE list, then LRU list. |
1373 | * | | 1376 | * |
1374 | * Called with the buffer queues locked. | | 1377 | * Called with the buffer queues locked. |
1375 | * Return buffer locked. | | 1378 | * Return buffer locked. |
1376 | */ | | 1379 | */ |
1377 | static buf_t * | | 1380 | static buf_t * |
1378 | getnewbuf(int slpflag, int slptimeo, int from_bufq) | | 1381 | getnewbuf(int slpflag, int slptimeo, int from_bufq) |
1379 | { | | 1382 | { |
1380 | buf_t *bp; | | 1383 | buf_t *bp; |
1381 | struct vnode *vp; | | 1384 | struct vnode *vp; |
1382 | struct mount *transmp = NULL; | | 1385 | struct mount *transmp = NULL; |
1383 | | | 1386 | |
1384 | SDT_PROBE0(io, kernel, , getnewbuf__start); | | 1387 | SDT_PROBE0(io, kernel, , getnewbuf__start); |
1385 | | | 1388 | |
1386 | start: | | 1389 | start: |
1387 | KASSERT(mutex_owned(&bufcache_lock)); | | 1390 | KASSERT(mutex_owned(&bufcache_lock)); |
1388 | | | 1391 | |
1389 | /* | | 1392 | /* |
1390 | * Get a new buffer from the pool. | | 1393 | * Get a new buffer from the pool. |
1391 | */ | | 1394 | */ |
1392 | if (!from_bufq && buf_lotsfree()) { | | 1395 | if (!from_bufq && buf_lotsfree()) { |
1393 | mutex_exit(&bufcache_lock); | | 1396 | mutex_exit(&bufcache_lock); |
1394 | bp = pool_cache_get(buf_cache, PR_NOWAIT); | | 1397 | bp = pool_cache_get(buf_cache, PR_NOWAIT); |
1395 | if (bp != NULL) { | | 1398 | if (bp != NULL) { |
1396 | memset((char *)bp, 0, sizeof(*bp)); | | 1399 | memset((char *)bp, 0, sizeof(*bp)); |
1397 | buf_init(bp); | | 1400 | buf_init(bp); |
1398 | SET(bp->b_cflags, BC_BUSY); /* mark buffer busy */ | | 1401 | SET(bp->b_cflags, BC_BUSY); /* mark buffer busy */ |
1399 | mutex_enter(&bufcache_lock); | | 1402 | mutex_enter(&bufcache_lock); |
1400 | #if defined(DIAGNOSTIC) | | 1403 | #if defined(DIAGNOSTIC) |
1401 | bp->b_freelistindex = -1; | | 1404 | bp->b_freelistindex = -1; |
1402 | #endif /* defined(DIAGNOSTIC) */ | | 1405 | #endif /* defined(DIAGNOSTIC) */ |
1403 | SDT_PROBE1(io, kernel, , getnewbuf__done, bp); | | 1406 | SDT_PROBE1(io, kernel, , getnewbuf__done, bp); |
1404 | return (bp); | | 1407 | return (bp); |
1405 | } | | 1408 | } |
1406 | mutex_enter(&bufcache_lock); | | 1409 | mutex_enter(&bufcache_lock); |
1407 | } | | 1410 | } |
1408 | | | 1411 | |
1409 | KASSERT(mutex_owned(&bufcache_lock)); | | 1412 | KASSERT(mutex_owned(&bufcache_lock)); |
1410 | if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL) { | | 1413 | if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL) { |
1411 | KASSERT(!ISSET(bp->b_oflags, BO_DELWRI)); | | 1414 | KASSERT(!ISSET(bp->b_oflags, BO_DELWRI)); |
1412 | } else { | | 1415 | } else { |
1413 | TAILQ_FOREACH(bp, &bufqueues[BQ_LRU].bq_queue, b_freelist) { | | 1416 | TAILQ_FOREACH(bp, &bufqueues[BQ_LRU].bq_queue, b_freelist) { |
1414 | if (ISSET(bp->b_cflags, BC_VFLUSH) || | | 1417 | if (ISSET(bp->b_cflags, BC_VFLUSH) || |
1415 | !ISSET(bp->b_oflags, BO_DELWRI)) | | 1418 | !ISSET(bp->b_oflags, BO_DELWRI)) |
1416 | break; | | 1419 | break; |
1417 | if (fstrans_start_nowait(bp->b_vp->v_mount) == 0) { | | 1420 | if (fstrans_start_nowait(bp->b_vp->v_mount) == 0) { |
1418 | KASSERT(transmp == NULL); | | 1421 | KASSERT(transmp == NULL); |
1419 | transmp = bp->b_vp->v_mount; | | 1422 | transmp = bp->b_vp->v_mount; |
1420 | break; | | 1423 | break; |
1421 | } | | 1424 | } |
1422 | } | | 1425 | } |
1423 | } | | 1426 | } |
1424 | if (bp != NULL) { | | 1427 | if (bp != NULL) { |
1425 | KASSERT(!ISSET(bp->b_cflags, BC_BUSY) || ISSET(bp->b_cflags, BC_VFLUSH)); | | 1428 | KASSERT(!ISSET(bp->b_cflags, BC_BUSY) || ISSET(bp->b_cflags, BC_VFLUSH)); |
1426 | bremfree(bp); | | 1429 | bremfree(bp); |
1427 | | | 1430 | |
1428 | /* Buffer is no longer on free lists. */ | | 1431 | /* Buffer is no longer on free lists. */ |
1429 | SET(bp->b_cflags, BC_BUSY); | | 1432 | SET(bp->b_cflags, BC_BUSY); |
1430 | | | 1433 | |
1431 | /* Wake anyone trying to lock the old identity. */ | | 1434 | /* Wake anyone trying to lock the old identity. */ |
1432 | cv_broadcast(&bp->b_busy); | | 1435 | cv_broadcast(&bp->b_busy); |
1433 | } else { | | 1436 | } else { |
1434 | /* | | 1437 | /* |
1435 | * XXX: !from_bufq should be removed. | | 1438 | * XXX: !from_bufq should be removed. |
1436 | */ | | 1439 | */ |
1437 | if (!from_bufq || curlwp != uvm.pagedaemon_lwp) { | | 1440 | if (!from_bufq || curlwp != uvm.pagedaemon_lwp) { |
1438 | /* wait for a free buffer of any kind */ | | 1441 | /* wait for a free buffer of any kind */ |
1439 | if ((slpflag & PCATCH) != 0) | | 1442 | if ((slpflag & PCATCH) != 0) |
1440 | (void)cv_timedwait_sig(&needbuffer_cv, | | 1443 | (void)cv_timedwait_sig(&needbuffer_cv, |
1441 | &bufcache_lock, slptimeo); | | 1444 | &bufcache_lock, slptimeo); |
1442 | else | | 1445 | else |
1443 | (void)cv_timedwait(&needbuffer_cv, | | 1446 | (void)cv_timedwait(&needbuffer_cv, |
1444 | &bufcache_lock, slptimeo); | | 1447 | &bufcache_lock, slptimeo); |
1445 | } | | 1448 | } |
1446 | SDT_PROBE1(io, kernel, , getnewbuf__done, NULL); | | 1449 | SDT_PROBE1(io, kernel, , getnewbuf__done, NULL); |
1447 | return (NULL); | | 1450 | return (NULL); |
1448 | } | | 1451 | } |
1449 | | | 1452 | |
1450 | #ifdef DIAGNOSTIC | | 1453 | #ifdef DIAGNOSTIC |
1451 | if (bp->b_bufsize <= 0) | | 1454 | if (bp->b_bufsize <= 0) |
1452 | panic("buffer %p: on queue but empty", bp); | | 1455 | panic("buffer %p: on queue but empty", bp); |
1453 | #endif | | 1456 | #endif |
1454 | | | 1457 | |
1455 | if (ISSET(bp->b_cflags, BC_VFLUSH)) { | | 1458 | if (ISSET(bp->b_cflags, BC_VFLUSH)) { |
1456 | /* | | 1459 | /* |
1457 | * This is a delayed write buffer being flushed to disk. Make | | 1460 | * This is a delayed write buffer being flushed to disk. Make |
1458 | * sure it gets aged out of the queue when it's finished, and | | 1461 | * sure it gets aged out of the queue when it's finished, and |
1459 | * leave it off the LRU queue. | | 1462 | * leave it off the LRU queue. |
1460 | */ | | 1463 | */ |
1461 | CLR(bp->b_cflags, BC_VFLUSH); | | 1464 | CLR(bp->b_cflags, BC_VFLUSH); |
1462 | SET(bp->b_cflags, BC_AGE); | | 1465 | SET(bp->b_cflags, BC_AGE); |
1463 | goto start; | | 1466 | goto start; |
1464 | } | | 1467 | } |
1465 | | | 1468 | |
1466 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); | | 1469 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); |
1467 | KASSERT(!cv_has_waiters(&bp->b_done)); | | 1470 | KASSERT(!cv_has_waiters(&bp->b_done)); |
1468 | | | 1471 | |
1469 | /* | | 1472 | /* |
1470 | * If buffer was a delayed write, start it and return NULL | | 1473 | * If buffer was a delayed write, start it and return NULL |
1471 | * (since we might sleep while starting the write). | | 1474 | * (since we might sleep while starting the write). |
1472 | */ | | 1475 | */ |
1473 | if (ISSET(bp->b_oflags, BO_DELWRI)) { | | 1476 | if (ISSET(bp->b_oflags, BO_DELWRI)) { |
1474 | /* | | 1477 | /* |
1475 | * This buffer has gone through the LRU, so make sure it gets | | 1478 | * This buffer has gone through the LRU, so make sure it gets |
1476 | * reused ASAP. | | 1479 | * reused ASAP. |
1477 | */ | | 1480 | */ |
1478 | SET(bp->b_cflags, BC_AGE); | | 1481 | SET(bp->b_cflags, BC_AGE); |
1479 | mutex_exit(&bufcache_lock); | | 1482 | mutex_exit(&bufcache_lock); |
1480 | bawrite(bp); | | 1483 | bawrite(bp); |
1481 | KASSERT(transmp != NULL); | | 1484 | KASSERT(transmp != NULL); |
1482 | fstrans_done(transmp); | | 1485 | fstrans_done(transmp); |
1483 | mutex_enter(&bufcache_lock); | | 1486 | mutex_enter(&bufcache_lock); |
1484 | SDT_PROBE1(io, kernel, , getnewbuf__done, NULL); | | 1487 | SDT_PROBE1(io, kernel, , getnewbuf__done, NULL); |
1485 | return (NULL); | | 1488 | return (NULL); |
1486 | } | | 1489 | } |
1487 | | | 1490 | |
1488 | KASSERT(transmp == NULL); | | 1491 | KASSERT(transmp == NULL); |
1489 | | | 1492 | |
1490 | vp = bp->b_vp; | | 1493 | vp = bp->b_vp; |
1491 | | | 1494 | |
1492 | /* clear out various other fields */ | | 1495 | /* clear out various other fields */ |
1493 | bp->b_cflags = BC_BUSY; | | 1496 | bp->b_cflags = BC_BUSY; |
1494 | bp->b_oflags = 0; | | 1497 | bp->b_oflags = 0; |
1495 | bp->b_flags = 0; | | 1498 | bp->b_flags = 0; |
1496 | bp->b_dev = NODEV; | | 1499 | bp->b_dev = NODEV; |
1497 | bp->b_blkno = 0; | | 1500 | bp->b_blkno = 0; |
1498 | bp->b_lblkno = 0; | | 1501 | bp->b_lblkno = 0; |
1499 | bp->b_rawblkno = 0; | | 1502 | bp->b_rawblkno = 0; |
1500 | bp->b_iodone = 0; | | 1503 | bp->b_iodone = 0; |
1501 | bp->b_error = 0; | | 1504 | bp->b_error = 0; |
1502 | bp->b_resid = 0; | | 1505 | bp->b_resid = 0; |
1503 | bp->b_bcount = 0; | | 1506 | bp->b_bcount = 0; |
1504 | | | 1507 | |
1505 | LIST_REMOVE(bp, b_hash); | | 1508 | LIST_REMOVE(bp, b_hash); |
1506 | | | 1509 | |
1507 | /* Disassociate us from our vnode, if we had one... */ | | 1510 | /* Disassociate us from our vnode, if we had one... */ |
1508 | if (vp != NULL) { | | 1511 | if (vp != NULL) { |
1509 | mutex_enter(vp->v_interlock); | | 1512 | mutex_enter(vp->v_interlock); |
1510 | brelvp(bp); | | 1513 | brelvp(bp); |
1511 | mutex_exit(vp->v_interlock); | | 1514 | mutex_exit(vp->v_interlock); |
1512 | } | | 1515 | } |
1513 | | | 1516 | |
1514 | SDT_PROBE1(io, kernel, , getnewbuf__done, bp); | | 1517 | SDT_PROBE1(io, kernel, , getnewbuf__done, bp); |
1515 | return (bp); | | 1518 | return (bp); |
1516 | } | | 1519 | } |
1517 | | | 1520 | |
1518 | /* | | 1521 | /* |
1519 | * Invalidate the specified buffer if it exists. | | 1522 | * Invalidate the specified buffer if it exists. |
1520 | */ | | 1523 | */ |
1521 | void | | 1524 | void |
1522 | binvalbuf(struct vnode *vp, daddr_t blkno) | | 1525 | binvalbuf(struct vnode *vp, daddr_t blkno) |
1523 | { | | 1526 | { |
1524 | buf_t *bp; | | 1527 | buf_t *bp; |
1525 | int err; | | 1528 | int err; |
1526 | | | 1529 | |
1527 | mutex_enter(&bufcache_lock); | | 1530 | mutex_enter(&bufcache_lock); |
1528 | | | 1531 | |
1529 | loop: | | 1532 | loop: |
1530 | bp = incore(vp, blkno); | | 1533 | bp = incore(vp, blkno); |
1531 | if (bp != NULL) { | | 1534 | if (bp != NULL) { |
1532 | err = bbusy(bp, 0, 0, NULL); | | 1535 | err = bbusy(bp, 0, 0, NULL); |
1533 | if (err == EPASSTHROUGH) | | 1536 | if (err == EPASSTHROUGH) |
1534 | goto loop; | | 1537 | goto loop; |
1535 | bremfree(bp); | | 1538 | bremfree(bp); |
1536 | if (ISSET(bp->b_oflags, BO_DELWRI)) { | | 1539 | if (ISSET(bp->b_oflags, BO_DELWRI)) { |
1537 | SET(bp->b_cflags, BC_NOCACHE); | | 1540 | SET(bp->b_cflags, BC_NOCACHE); |
1538 | mutex_exit(&bufcache_lock); | | 1541 | mutex_exit(&bufcache_lock); |
1539 | bwrite(bp); | | 1542 | bwrite(bp); |
1540 | } else { | | 1543 | } else { |
1541 | brelsel(bp, BC_INVAL); | | 1544 | brelsel(bp, BC_INVAL); |
1542 | mutex_exit(&bufcache_lock); | | 1545 | mutex_exit(&bufcache_lock); |
1543 | } | | 1546 | } |
1544 | } else | | 1547 | } else |
1545 | mutex_exit(&bufcache_lock); | | 1548 | mutex_exit(&bufcache_lock); |
1546 | } | | 1549 | } |
1547 | | | 1550 | |
1548 | /* | | 1551 | /* |
1549 | * Attempt to free an aged buffer off the queues. | | 1552 | * Attempt to free an aged buffer off the queues. |
1550 | * Called with queue lock held. | | 1553 | * Called with queue lock held. |
1551 | * Returns the amount of buffer memory freed. | | 1554 | * Returns the amount of buffer memory freed. |
1552 | */ | | 1555 | */ |
1553 | static int | | 1556 | static int |
1554 | buf_trim(void) | | 1557 | buf_trim(void) |
1555 | { | | 1558 | { |
1556 | buf_t *bp; | | 1559 | buf_t *bp; |
1557 | long size; | | 1560 | long size; |
1558 | | | 1561 | |
1559 | KASSERT(mutex_owned(&bufcache_lock)); | | 1562 | KASSERT(mutex_owned(&bufcache_lock)); |
1560 | | | 1563 | |
1561 | /* Instruct getnewbuf() to get buffers off the queues */ | | 1564 | /* Instruct getnewbuf() to get buffers off the queues */ |
1562 | if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL) | | 1565 | if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL) |
1563 | return 0; | | 1566 | return 0; |
1564 | | | 1567 | |
1565 | KASSERT((bp->b_cflags & BC_WANTED) == 0); | | 1568 | KASSERT((bp->b_cflags & BC_WANTED) == 0); |
1566 | size = bp->b_bufsize; | | 1569 | size = bp->b_bufsize; |
1567 | bufmem -= size; | | 1570 | bufmem -= size; |
1568 | if (size > 0) { | | 1571 | if (size > 0) { |
1569 | buf_mrelease(bp->b_data, size); | | 1572 | buf_mrelease(bp->b_data, size); |
1570 | bp->b_bcount = bp->b_bufsize = 0; | | 1573 | bp->b_bcount = bp->b_bufsize = 0; |
1571 | } | | 1574 | } |
1572 | /* brelse() will return the buffer to the global buffer pool */ | | 1575 | /* brelse() will return the buffer to the global buffer pool */ |
1573 | brelsel(bp, 0); | | 1576 | brelsel(bp, 0); |
1574 | return size; | | 1577 | return size; |
1575 | } | | 1578 | } |
1576 | | | 1579 | |
1577 | int | | 1580 | int |
1578 | buf_drain(int n) | | 1581 | buf_drain(int n) |
1579 | { | | 1582 | { |
1580 | int size = 0, sz; | | 1583 | int size = 0, sz; |
1581 | | | 1584 | |
1582 | KASSERT(mutex_owned(&bufcache_lock)); | | 1585 | KASSERT(mutex_owned(&bufcache_lock)); |
1583 | | | 1586 | |
1584 | while (size < n && bufmem > bufmem_lowater) { | | 1587 | while (size < n && bufmem > bufmem_lowater) { |
1585 | sz = buf_trim(); | | 1588 | sz = buf_trim(); |
1586 | if (sz <= 0) | | 1589 | if (sz <= 0) |
1587 | break; | | 1590 | break; |
1588 | size += sz; | | 1591 | size += sz; |
1589 | } | | 1592 | } |
1590 | | | 1593 | |
1591 | return size; | | 1594 | return size; |
1592 | } | | 1595 | } |
1593 | | | 1596 | |
1594 | /* | | 1597 | /* |
1595 | * Wait for operations on the buffer to complete. | | 1598 | * Wait for operations on the buffer to complete. |
1596 | * When they do, extract and return the I/O's error value. | | 1599 | * When they do, extract and return the I/O's error value. |
1597 | */ | | 1600 | */ |
1598 | int | | 1601 | int |
1599 | biowait(buf_t *bp) | | 1602 | biowait(buf_t *bp) |
1600 | { | | 1603 | { |
1601 | | | 1604 | |
1602 | BIOHIST_FUNC(__func__); | | 1605 | BIOHIST_FUNC(__func__); |
1603 | | | 1606 | |
1604 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); | | 1607 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); |
1605 | | | 1608 | |
1606 | SDT_PROBE1(io, kernel, , wait__start, bp); | | 1609 | SDT_PROBE1(io, kernel, , wait__start, bp); |
1607 | | | 1610 | |
1608 | mutex_enter(bp->b_objlock); | | 1611 | mutex_enter(bp->b_objlock); |
1609 | | | 1612 | |
1610 | BIOHIST_CALLARGS(biohist, "bp=%#jx, oflags=0x%jx, ret_addr=%#jx", | | 1613 | BIOHIST_CALLARGS(biohist, "bp=%#jx, oflags=0x%jx, ret_addr=%#jx", |
1611 | (uintptr_t)bp, bp->b_oflags, | | 1614 | (uintptr_t)bp, bp->b_oflags, |
1612 | (uintptr_t)__builtin_return_address(0), 0); | | 1615 | (uintptr_t)__builtin_return_address(0), 0); |
1613 | | | 1616 | |
1614 | while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI)) { | | 1617 | while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI)) { |
1615 | BIOHIST_LOG(biohist, "waiting bp=%#jx", (uintptr_t)bp, 0, 0, 0); | | 1618 | BIOHIST_LOG(biohist, "waiting bp=%#jx", (uintptr_t)bp, 0, 0, 0); |
1616 | cv_wait(&bp->b_done, bp->b_objlock); | | 1619 | cv_wait(&bp->b_done, bp->b_objlock); |
1617 | } | | 1620 | } |
1618 | mutex_exit(bp->b_objlock); | | 1621 | mutex_exit(bp->b_objlock); |
1619 | | | 1622 | |
1620 | SDT_PROBE1(io, kernel, , wait__done, bp); | | 1623 | SDT_PROBE1(io, kernel, , wait__done, bp); |
1621 | | | 1624 | |
1622 | BIOHIST_LOG(biohist, "return %jd", bp->b_error, 0, 0, 0); | | 1625 | BIOHIST_LOG(biohist, "return %jd", bp->b_error, 0, 0, 0); |
1623 | | | 1626 | |
1624 | return bp->b_error; | | 1627 | return bp->b_error; |
1625 | } | | 1628 | } |
1626 | | | 1629 | |
1627 | /* | | 1630 | /* |
1628 | * Mark I/O complete on a buffer. | | 1631 | * Mark I/O complete on a buffer. |
1629 | * | | 1632 | * |
1630 | * If a callback has been requested, e.g. the pageout | | 1633 | * If a callback has been requested, e.g. the pageout |
1631 | * daemon, do so. Otherwise, awaken waiting processes. | | 1634 | * daemon, do so. Otherwise, awaken waiting processes. |
1632 | * | | 1635 | * |
1633 | * [ Leffler, et al., says on p.247: | | 1636 | * [ Leffler, et al., says on p.247: |
1634 | * "This routine wakes up the blocked process, frees the buffer | | 1637 | * "This routine wakes up the blocked process, frees the buffer |
1635 | * for an asynchronous write, or, for a request by the pagedaemon | | 1638 | * for an asynchronous write, or, for a request by the pagedaemon |
1636 | * process, invokes a procedure specified in the buffer structure" ] | | 1639 | * process, invokes a procedure specified in the buffer structure" ] |
1637 | * | | 1640 | * |
1638 | * In real life, the pagedaemon (or other system processes) wants | | 1641 | * In real life, the pagedaemon (or other system processes) wants |
1639 | * to do async stuff too, and doesn't want the buffer brelse()'d. | | 1642 | * to do async stuff too, and doesn't want the buffer brelse()'d. |
1640 | * (for swap pager, that puts swap buffers on the free lists (!!!), | | 1643 | * (for swap pager, that puts swap buffers on the free lists (!!!), |
1641 | * for the vn device, that puts allocated buffers on the free lists!) | | 1644 | * for the vn device, that puts allocated buffers on the free lists!) |
1642 | */ | | 1645 | */ |
1643 | void | | 1646 | void |
1644 | biodone(buf_t *bp) | | 1647 | biodone(buf_t *bp) |
1645 | { | | 1648 | { |
1646 | int s; | | 1649 | int s; |
1647 | | | 1650 | |
1648 | BIOHIST_FUNC(__func__); | | 1651 | BIOHIST_FUNC(__func__); |
1649 | | | 1652 | |
1650 | KASSERT(!ISSET(bp->b_oflags, BO_DONE)); | | 1653 | KASSERT(!ISSET(bp->b_oflags, BO_DONE)); |
1651 | | | 1654 | |
1652 | if (cpu_intr_p()) { | | 1655 | if (cpu_intr_p()) { |
1653 | /* From interrupt mode: defer to a soft interrupt. */ | | 1656 | /* From interrupt mode: defer to a soft interrupt. */ |
1654 | s = splvm(); | | 1657 | s = splvm(); |
1655 | TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq); | | 1658 | TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq); |
1656 | | | 1659 | |
1657 | BIOHIST_CALLARGS(biohist, "bp=%#jx, softint scheduled", | | 1660 | BIOHIST_CALLARGS(biohist, "bp=%#jx, softint scheduled", |
1658 | (uintptr_t)bp, 0, 0, 0); | | 1661 | (uintptr_t)bp, 0, 0, 0); |
1659 | softint_schedule(biodone_sih); | | 1662 | softint_schedule(biodone_sih); |
1660 | splx(s); | | 1663 | splx(s); |
1661 | } else { | | 1664 | } else { |
1662 | /* Process now - the buffer may be freed soon. */ | | 1665 | /* Process now - the buffer may be freed soon. */ |
1663 | biodone2(bp); | | 1666 | biodone2(bp); |
1664 | } | | 1667 | } |
1665 | } | | 1668 | } |
1666 | | | 1669 | |
1667 | SDT_PROBE_DEFINE1(io, kernel, , done, "struct buf *"/*bp*/); | | 1670 | SDT_PROBE_DEFINE1(io, kernel, , done, "struct buf *"/*bp*/); |
1668 | | | 1671 | |
1669 | static void | | 1672 | static void |
1670 | biodone2(buf_t *bp) | | 1673 | biodone2(buf_t *bp) |
1671 | { | | 1674 | { |
1672 | void (*callout)(buf_t *); | | 1675 | void (*callout)(buf_t *); |
1673 | | | 1676 | |
1674 | SDT_PROBE1(io, kernel, ,done, bp); | | 1677 | SDT_PROBE1(io, kernel, ,done, bp); |
1675 | | | 1678 | |
1676 | BIOHIST_FUNC(__func__); | | 1679 | BIOHIST_FUNC(__func__); |
1677 | BIOHIST_CALLARGS(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0); | | 1680 | BIOHIST_CALLARGS(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0); |
1678 | | | 1681 | |
1679 | mutex_enter(bp->b_objlock); | | 1682 | mutex_enter(bp->b_objlock); |
1680 | /* Note that the transfer is done. */ | | 1683 | /* Note that the transfer is done. */ |
1681 | if (ISSET(bp->b_oflags, BO_DONE)) | | 1684 | if (ISSET(bp->b_oflags, BO_DONE)) |
1682 | panic("biodone2 already"); | | 1685 | panic("biodone2 already"); |
1683 | CLR(bp->b_flags, B_COWDONE); | | 1686 | CLR(bp->b_flags, B_COWDONE); |
1684 | SET(bp->b_oflags, BO_DONE); | | 1687 | SET(bp->b_oflags, BO_DONE); |
1685 | BIO_SETPRIO(bp, BPRIO_DEFAULT); | | 1688 | BIO_SETPRIO(bp, BPRIO_DEFAULT); |
1686 | | | 1689 | |
1687 | /* Wake up waiting writers. */ | | 1690 | /* Wake up waiting writers. */ |
1688 | if (!ISSET(bp->b_flags, B_READ)) | | 1691 | if (!ISSET(bp->b_flags, B_READ)) |
1689 | vwakeup(bp); | | 1692 | vwakeup(bp); |
1690 | | | 1693 | |
1691 | if ((callout = bp->b_iodone) != NULL) { | | 1694 | if ((callout = bp->b_iodone) != NULL) { |
1692 | BIOHIST_LOG(biohist, "callout %#jx", (uintptr_t)callout, | | 1695 | BIOHIST_LOG(biohist, "callout %#jx", (uintptr_t)callout, |
1693 | 0, 0, 0); | | 1696 | 0, 0, 0); |
1694 | | | 1697 | |
1695 | /* Note callout done, then call out. */ | | 1698 | /* Note callout done, then call out. */ |
1696 | KASSERT(!cv_has_waiters(&bp->b_done)); | | 1699 | KASSERT(!cv_has_waiters(&bp->b_done)); |
1697 | bp->b_iodone = NULL; | | 1700 | bp->b_iodone = NULL; |
1698 | mutex_exit(bp->b_objlock); | | 1701 | mutex_exit(bp->b_objlock); |
1699 | (*callout)(bp); | | 1702 | (*callout)(bp); |
1700 | } else if (ISSET(bp->b_flags, B_ASYNC)) { | | 1703 | } else if (ISSET(bp->b_flags, B_ASYNC)) { |
1701 | /* If async, release. */ | | 1704 | /* If async, release. */ |
1702 | BIOHIST_LOG(biohist, "async", 0, 0, 0, 0); | | 1705 | BIOHIST_LOG(biohist, "async", 0, 0, 0, 0); |
1703 | KASSERT(!cv_has_waiters(&bp->b_done)); | | 1706 | KASSERT(!cv_has_waiters(&bp->b_done)); |
1704 | mutex_exit(bp->b_objlock); | | 1707 | mutex_exit(bp->b_objlock); |
1705 | brelse(bp, 0); | | 1708 | brelse(bp, 0); |
1706 | } else { | | 1709 | } else { |
1707 | /* Otherwise just wake up waiters in biowait(). */ | | 1710 | /* Otherwise just wake up waiters in biowait(). */ |
1708 | BIOHIST_LOG(biohist, "wake-up", 0, 0, 0, 0); | | 1711 | BIOHIST_LOG(biohist, "wake-up", 0, 0, 0, 0); |
1709 | cv_broadcast(&bp->b_done); | | 1712 | cv_broadcast(&bp->b_done); |
1710 | mutex_exit(bp->b_objlock); | | 1713 | mutex_exit(bp->b_objlock); |
1711 | } | | 1714 | } |
1712 | } | | 1715 | } |
1713 | | | 1716 | |
1714 | static void | | 1717 | static void |
1715 | biointr(void *cookie) | | 1718 | biointr(void *cookie) |
1716 | { | | 1719 | { |
1717 | struct cpu_info *ci; | | 1720 | struct cpu_info *ci; |
1718 | buf_t *bp; | | 1721 | buf_t *bp; |
1719 | int s; | | 1722 | int s; |
1720 | | | 1723 | |
1721 | BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist); | | 1724 | BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist); |
1722 | | | 1725 | |
1723 | ci = curcpu(); | | 1726 | ci = curcpu(); |
1724 | | | 1727 | |
1725 | s = splvm(); | | 1728 | s = splvm(); |
1726 | while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) { | | 1729 | while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) { |
1727 | KASSERT(curcpu() == ci); | | 1730 | KASSERT(curcpu() == ci); |
1728 | | | 1731 | |
1729 | bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone); | | 1732 | bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone); |
1730 | TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq); | | 1733 | TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq); |
1731 | splx(s); | | 1734 | splx(s); |
1732 | | | 1735 | |
1733 | BIOHIST_LOG(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0); | | 1736 | BIOHIST_LOG(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0); |
1734 | biodone2(bp); | | 1737 | biodone2(bp); |
1735 | | | 1738 | |
1736 | s = splvm(); | | 1739 | s = splvm(); |
1737 | } | | 1740 | } |
1738 | splx(s); | | 1741 | splx(s); |
1739 | } | | 1742 | } |
1740 | | | 1743 | |
1741 | static void | | 1744 | static void |
1742 | sysctl_fillbuf(const buf_t *i, struct buf_sysctl *o) | | 1745 | sysctl_fillbuf(const buf_t *i, struct buf_sysctl *o) |
1743 | { | | 1746 | { |
1744 | const bool allowaddr = get_expose_address(curproc); | | 1747 | const bool allowaddr = get_expose_address(curproc); |
1745 | | | 1748 | |
1746 | memset(o, 0, sizeof(*o)); | | 1749 | memset(o, 0, sizeof(*o)); |
1747 | | | 1750 | |
1748 | o->b_flags = i->b_flags | i->b_cflags | i->b_oflags; | | 1751 | o->b_flags = i->b_flags | i->b_cflags | i->b_oflags; |
1749 | o->b_error = i->b_error; | | 1752 | o->b_error = i->b_error; |
1750 | o->b_prio = i->b_prio; | | 1753 | o->b_prio = i->b_prio; |
1751 | o->b_dev = i->b_dev; | | 1754 | o->b_dev = i->b_dev; |
1752 | o->b_bufsize = i->b_bufsize; | | 1755 | o->b_bufsize = i->b_bufsize; |
1753 | o->b_bcount = i->b_bcount; | | 1756 | o->b_bcount = i->b_bcount; |
1754 | o->b_resid = i->b_resid; | | 1757 | o->b_resid = i->b_resid; |
1755 | COND_SET_VALUE(o->b_addr, PTRTOUINT64(i->b_data), allowaddr); | | 1758 | COND_SET_VALUE(o->b_addr, PTRTOUINT64(i->b_data), allowaddr); |
1756 | o->b_blkno = i->b_blkno; | | 1759 | o->b_blkno = i->b_blkno; |
1757 | o->b_rawblkno = i->b_rawblkno; | | 1760 | o->b_rawblkno = i->b_rawblkno; |
1758 | COND_SET_VALUE(o->b_iodone, PTRTOUINT64(i->b_iodone), allowaddr); | | 1761 | COND_SET_VALUE(o->b_iodone, PTRTOUINT64(i->b_iodone), allowaddr); |
1759 | COND_SET_VALUE(o->b_proc, PTRTOUINT64(i->b_proc), allowaddr); | | 1762 | COND_SET_VALUE(o->b_proc, PTRTOUINT64(i->b_proc), allowaddr); |
1760 | COND_SET_VALUE(o->b_vp, PTRTOUINT64(i->b_vp), allowaddr); | | 1763 | COND_SET_VALUE(o->b_vp, PTRTOUINT64(i->b_vp), allowaddr); |
1761 | COND_SET_VALUE(o->b_saveaddr, PTRTOUINT64(i->b_saveaddr), allowaddr); | | 1764 | COND_SET_VALUE(o->b_saveaddr, PTRTOUINT64(i->b_saveaddr), allowaddr); |
1762 | o->b_lblkno = i->b_lblkno; | | 1765 | o->b_lblkno = i->b_lblkno; |
1763 | } | | 1766 | } |
1764 | | | 1767 | |
1765 | #define KERN_BUFSLOP 20 | | 1768 | #define KERN_BUFSLOP 20 |
1766 | static int | | 1769 | static int |
1767 | sysctl_dobuf(SYSCTLFN_ARGS) | | 1770 | sysctl_dobuf(SYSCTLFN_ARGS) |
1768 | { | | 1771 | { |
1769 | buf_t *bp; | | 1772 | buf_t *bp; |
1770 | struct buf_sysctl bs; | | 1773 | struct buf_sysctl bs; |
1771 | struct bqueue *bq; | | 1774 | struct bqueue *bq; |
1772 | char *dp; | | 1775 | char *dp; |
1773 | u_int i, op, arg; | | 1776 | u_int i, op, arg; |
1774 | size_t len, needed, elem_size, out_size; | | 1777 | size_t len, needed, elem_size, out_size; |
1775 | int error, elem_count, retries; | | 1778 | int error, elem_count, retries; |
1776 | | | 1779 | |
1777 | if (namelen == 1 && name[0] == CTL_QUERY) | | 1780 | if (namelen == 1 && name[0] == CTL_QUERY) |
1778 | return (sysctl_query(SYSCTLFN_CALL(rnode))); | | 1781 | return (sysctl_query(SYSCTLFN_CALL(rnode))); |
1779 | | | 1782 | |
1780 | if (namelen != 4) | | 1783 | if (namelen != 4) |
1781 | return (EINVAL); | | 1784 | return (EINVAL); |
1782 | | | 1785 | |
1783 | retries = 100; | | 1786 | retries = 100; |
1784 | retry: | | 1787 | retry: |
1785 | dp = oldp; | | 1788 | dp = oldp; |
1786 | len = (oldp != NULL) ? *oldlenp : 0; | | 1789 | len = (oldp != NULL) ? *oldlenp : 0; |
1787 | op = name[0]; | | 1790 | op = name[0]; |
1788 | arg = name[1]; | | 1791 | arg = name[1]; |
1789 | elem_size = name[2]; | | 1792 | elem_size = name[2]; |
1790 | elem_count = name[3]; | | 1793 | elem_count = name[3]; |
1791 | out_size = MIN(sizeof(bs), elem_size); | | 1794 | out_size = MIN(sizeof(bs), elem_size); |
1792 | | | 1795 | |
1793 | /* | | 1796 | /* |
1794 | * at the moment, these are just "placeholders" to make the | | 1797 | * at the moment, these are just "placeholders" to make the |
1795 | * API for retrieving kern.buf data more extensible in the | | 1798 | * API for retrieving kern.buf data more extensible in the |
1796 | * future. | | 1799 | * future. |
1797 | * | | 1800 | * |
1798 | * XXX kern.buf currently has "netbsd32" issues. hopefully | | 1801 | * XXX kern.buf currently has "netbsd32" issues. hopefully |
1799 | * these will be resolved at a later point. | | 1802 | * these will be resolved at a later point. |
1800 | */ | | 1803 | */ |
1801 | if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL || | | 1804 | if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL || |
1802 | elem_size < 1 || elem_count < 0) | | 1805 | elem_size < 1 || elem_count < 0) |
1803 | return (EINVAL); | | 1806 | return (EINVAL); |
1804 | | | 1807 | |
1805 | error = 0; | | 1808 | error = 0; |
1806 | needed = 0; | | 1809 | needed = 0; |
1807 | sysctl_unlock(); | | 1810 | sysctl_unlock(); |
1808 | mutex_enter(&bufcache_lock); | | 1811 | mutex_enter(&bufcache_lock); |
1809 | for (i = 0; i < BQUEUES; i++) { | | 1812 | for (i = 0; i < BQUEUES; i++) { |
1810 | bq = &bufqueues[i]; | | 1813 | bq = &bufqueues[i]; |
1811 | TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) { | | 1814 | TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) { |
1812 | bq->bq_marker = bp; | | 1815 | bq->bq_marker = bp; |
1813 | if (len >= elem_size && elem_count > 0) { | | 1816 | if (len >= elem_size && elem_count > 0) { |
1814 | sysctl_fillbuf(bp, &bs); | | 1817 | sysctl_fillbuf(bp, &bs); |
1815 | mutex_exit(&bufcache_lock); | | 1818 | mutex_exit(&bufcache_lock); |
1816 | error = copyout(&bs, dp, out_size); | | 1819 | error = copyout(&bs, dp, out_size); |
1817 | mutex_enter(&bufcache_lock); | | 1820 | mutex_enter(&bufcache_lock); |
1818 | if (error) | | 1821 | if (error) |
1819 | break; | | 1822 | break; |
1820 | if (bq->bq_marker != bp) { | | 1823 | if (bq->bq_marker != bp) { |
1821 | /* | | 1824 | /* |
1822 | * This sysctl node is only for | | 1825 | * This sysctl node is only for |
1823 | * statistics. Retry; if the | | 1826 | * statistics. Retry; if the |
1824 | * queue keeps changing, then | | 1827 | * queue keeps changing, then |
1825 | * bail out. | | 1828 | * bail out. |
1826 | */ | | 1829 | */ |
1827 | if (retries-- == 0) { | | 1830 | if (retries-- == 0) { |
1828 | error = EAGAIN; | | 1831 | error = EAGAIN; |
1829 | break; | | 1832 | break; |
1830 | } | | 1833 | } |
1831 | mutex_exit(&bufcache_lock); | | 1834 | mutex_exit(&bufcache_lock); |
1832 | sysctl_relock(); | | 1835 | sysctl_relock(); |
1833 | goto retry; | | 1836 | goto retry; |
1834 | } | | 1837 | } |
1835 | dp += elem_size; | | 1838 | dp += elem_size; |
1836 | len -= elem_size; | | 1839 | len -= elem_size; |
1837 | } | | 1840 | } |
1838 | needed += elem_size; | | 1841 | needed += elem_size; |
1839 | if (elem_count > 0 && elem_count != INT_MAX) | | 1842 | if (elem_count > 0 && elem_count != INT_MAX) |
1840 | elem_count--; | | 1843 | elem_count--; |
1841 | } | | 1844 | } |
1842 | if (error != 0) | | 1845 | if (error != 0) |
1843 | break; | | 1846 | break; |
1844 | } | | 1847 | } |
1845 | mutex_exit(&bufcache_lock); | | 1848 | mutex_exit(&bufcache_lock); |
1846 | sysctl_relock(); | | 1849 | sysctl_relock(); |
1847 | | | 1850 | |
1848 | *oldlenp = needed; | | 1851 | *oldlenp = needed; |
1849 | if (oldp == NULL) | | 1852 | if (oldp == NULL) |
1850 | *oldlenp += KERN_BUFSLOP * sizeof(buf_t); | | 1853 | *oldlenp += KERN_BUFSLOP * sizeof(buf_t); |
1851 | | | 1854 | |
1852 | return (error); | | 1855 | return (error); |
1853 | } | | 1856 | } |
1854 | | | 1857 | |
1855 | static int | | 1858 | static int |
1856 | sysctl_bufvm_update(SYSCTLFN_ARGS) | | 1859 | sysctl_bufvm_update(SYSCTLFN_ARGS) |
1857 | { | | 1860 | { |
1858 | int error, rv; | | 1861 | int error, rv; |
1859 | struct sysctlnode node; | | 1862 | struct sysctlnode node; |
1860 | unsigned int temp_bufcache; | | 1863 | unsigned int temp_bufcache; |
1861 | unsigned long temp_water; | | 1864 | unsigned long temp_water; |
1862 | | | 1865 | |
1863 | /* Take a copy of the supplied node and its data */ | | 1866 | /* Take a copy of the supplied node and its data */ |
1864 | node = *rnode; | | 1867 | node = *rnode; |
1865 | if (node.sysctl_data == &bufcache) { | | 1868 | if (node.sysctl_data == &bufcache) { |
1866 | node.sysctl_data = &temp_bufcache; | | 1869 | node.sysctl_data = &temp_bufcache; |
1867 | temp_bufcache = *(unsigned int *)rnode->sysctl_data; | | 1870 | temp_bufcache = *(unsigned int *)rnode->sysctl_data; |
1868 | } else { | | 1871 | } else { |
1869 | node.sysctl_data = &temp_water; | | 1872 | node.sysctl_data = &temp_water; |
1870 | temp_water = *(unsigned long *)rnode->sysctl_data; | | 1873 | temp_water = *(unsigned long *)rnode->sysctl_data; |
1871 | } | | 1874 | } |
1872 | | | 1875 | |
1873 | /* Update the copy */ | | 1876 | /* Update the copy */ |
1874 | error = sysctl_lookup(SYSCTLFN_CALL(&node)); | | 1877 | error = sysctl_lookup(SYSCTLFN_CALL(&node)); |
1875 | if (error || newp == NULL) | | 1878 | if (error || newp == NULL) |
1876 | return (error); | | 1879 | return (error); |
1877 | | | 1880 | |
1878 | if (rnode->sysctl_data == &bufcache) { | | 1881 | if (rnode->sysctl_data == &bufcache) { |
1879 | if (temp_bufcache > 100) | | 1882 | if (temp_bufcache > 100) |
1880 | return (EINVAL); | | 1883 | return (EINVAL); |
1881 | bufcache = temp_bufcache; | | 1884 | bufcache = temp_bufcache; |
1882 | buf_setwm(); | | 1885 | buf_setwm(); |
1883 | } else if (rnode->sysctl_data == &bufmem_lowater) { | | 1886 | } else if (rnode->sysctl_data == &bufmem_lowater) { |
1884 | if (bufmem_hiwater - temp_water < 16) | | 1887 | if (bufmem_hiwater - temp_water < 16) |
1885 | return (EINVAL); | | 1888 | return (EINVAL); |
1886 | bufmem_lowater = temp_water; | | 1889 | bufmem_lowater = temp_water; |
1887 | } else if (rnode->sysctl_data == &bufmem_hiwater) { | | 1890 | } else if (rnode->sysctl_data == &bufmem_hiwater) { |
1888 | if (temp_water - bufmem_lowater < 16) | | 1891 | if (temp_water - bufmem_lowater < 16) |
1889 | return (EINVAL); | | 1892 | return (EINVAL); |
1890 | bufmem_hiwater = temp_water; | | 1893 | bufmem_hiwater = temp_water; |
1891 | } else | | 1894 | } else |
1892 | return (EINVAL); | | 1895 | return (EINVAL); |
1893 | | | 1896 | |
1894 | /* Drain until below new high water mark */ | | 1897 | /* Drain until below new high water mark */ |
1895 | sysctl_unlock(); | | 1898 | sysctl_unlock(); |
1896 | mutex_enter(&bufcache_lock); | | 1899 | mutex_enter(&bufcache_lock); |
1897 | while (bufmem > bufmem_hiwater) { | | 1900 | while (bufmem > bufmem_hiwater) { |
1898 | rv = buf_drain((bufmem - bufmem_hiwater) / (2 * 1024)); | | 1901 | rv = buf_drain((bufmem - bufmem_hiwater) / (2 * 1024)); |
1899 | if (rv <= 0) | | 1902 | if (rv <= 0) |
1900 | break; | | 1903 | break; |
1901 | } | | 1904 | } |
1902 | mutex_exit(&bufcache_lock); | | 1905 | mutex_exit(&bufcache_lock); |
1903 | sysctl_relock(); | | 1906 | sysctl_relock(); |
1904 | | | 1907 | |
1905 | return 0; | | 1908 | return 0; |
1906 | } | | 1909 | } |
1907 | | | 1910 | |
1908 | static struct sysctllog *vfsbio_sysctllog; | | 1911 | static struct sysctllog *vfsbio_sysctllog; |
1909 | | | 1912 | |
1910 | static void | | 1913 | static void |
1911 | sysctl_kern_buf_setup(void) | | 1914 | sysctl_kern_buf_setup(void) |
1912 | { | | 1915 | { |
1913 | | | 1916 | |
1914 | sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, | | 1917 | sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, |
1915 | CTLFLAG_PERMANENT, | | 1918 | CTLFLAG_PERMANENT, |
1916 | CTLTYPE_NODE, "buf", | | 1919 | CTLTYPE_NODE, "buf", |
1917 | SYSCTL_DESCR("Kernel buffer cache information"), | | 1920 | SYSCTL_DESCR("Kernel buffer cache information"), |
1918 | sysctl_dobuf, 0, NULL, 0, | | 1921 | sysctl_dobuf, 0, NULL, 0, |
1919 | CTL_KERN, KERN_BUF, CTL_EOL); | | 1922 | CTL_KERN, KERN_BUF, CTL_EOL); |
1920 | } | | 1923 | } |
1921 | | | 1924 | |
1922 | static void | | 1925 | static void |
1923 | sysctl_vm_buf_setup(void) | | 1926 | sysctl_vm_buf_setup(void) |
1924 | { | | 1927 | { |
1925 | | | 1928 | |
1926 | sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, | | 1929 | sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, |
1927 | CTLFLAG_PERMANENT|CTLFLAG_READWRITE, | | 1930 | CTLFLAG_PERMANENT|CTLFLAG_READWRITE, |
1928 | CTLTYPE_INT, "bufcache", | | 1931 | CTLTYPE_INT, "bufcache", |
1929 | SYSCTL_DESCR("Percentage of physical memory to use for " | | 1932 | SYSCTL_DESCR("Percentage of physical memory to use for " |
1930 | "buffer cache"), | | 1933 | "buffer cache"), |
1931 | sysctl_bufvm_update, 0, &bufcache, 0, | | 1934 | sysctl_bufvm_update, 0, &bufcache, 0, |
1932 | CTL_VM, CTL_CREATE, CTL_EOL); | | 1935 | CTL_VM, CTL_CREATE, CTL_EOL); |
1933 | sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, | | 1936 | sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, |
1934 | CTLFLAG_PERMANENT|CTLFLAG_READONLY, | | 1937 | CTLFLAG_PERMANENT|CTLFLAG_READONLY, |
1935 | CTLTYPE_LONG, "bufmem", | | 1938 | CTLTYPE_LONG, "bufmem", |
1936 | SYSCTL_DESCR("Amount of kernel memory used by buffer " | | 1939 | SYSCTL_DESCR("Amount of kernel memory used by buffer " |
1937 | "cache"), | | 1940 | "cache"), |
1938 | NULL, 0, &bufmem, 0, | | 1941 | NULL, 0, &bufmem, 0, |
1939 | CTL_VM, CTL_CREATE, CTL_EOL); | | 1942 | CTL_VM, CTL_CREATE, CTL_EOL); |
1940 | sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, | | 1943 | sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, |
1941 | CTLFLAG_PERMANENT|CTLFLAG_READWRITE, | | 1944 | CTLFLAG_PERMANENT|CTLFLAG_READWRITE, |
1942 | CTLTYPE_LONG, "bufmem_lowater", | | 1945 | CTLTYPE_LONG, "bufmem_lowater", |
1943 | SYSCTL_DESCR("Minimum amount of kernel memory to " | | 1946 | SYSCTL_DESCR("Minimum amount of kernel memory to " |
1944 | "reserve for buffer cache"), | | 1947 | "reserve for buffer cache"), |
1945 | sysctl_bufvm_update, 0, &bufmem_lowater, 0, | | 1948 | sysctl_bufvm_update, 0, &bufmem_lowater, 0, |
1946 | CTL_VM, CTL_CREATE, CTL_EOL); | | 1949 | CTL_VM, CTL_CREATE, CTL_EOL); |
1947 | sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, | | 1950 | sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, |
1948 | CTLFLAG_PERMANENT|CTLFLAG_READWRITE, | | 1951 | CTLFLAG_PERMANENT|CTLFLAG_READWRITE, |
1949 | CTLTYPE_LONG, "bufmem_hiwater", | | 1952 | CTLTYPE_LONG, "bufmem_hiwater", |
1950 | SYSCTL_DESCR("Maximum amount of kernel memory to use " | | 1953 | SYSCTL_DESCR("Maximum amount of kernel memory to use " |
1951 | "for buffer cache"), | | 1954 | "for buffer cache"), |
1952 | sysctl_bufvm_update, 0, &bufmem_hiwater, 0, | | 1955 | sysctl_bufvm_update, 0, &bufmem_hiwater, 0, |
1953 | CTL_VM, CTL_CREATE, CTL_EOL); | | 1956 | CTL_VM, CTL_CREATE, CTL_EOL); |
1954 | } | | 1957 | } |
1955 | | | 1958 | |
| | | 1959 | static int |
| | | 1960 | bufhash_stats(struct hashstat_sysctl *hs, bool fill) |
| | | 1961 | { |
| | | 1962 | buf_t *bp; |
| | | 1963 | uint64_t chain; |
| | | 1964 | |
| | | 1965 | strlcpy(hs->hash_name, "bufhash", sizeof(hs->hash_name)); |
| | | 1966 | strlcpy(hs->hash_desc, "buffer hash", sizeof(hs->hash_desc)); |
| | | 1967 | if (!fill) |
| | | 1968 | return 0; |
| | | 1969 | |
| | | 1970 | hs->hash_size = bufhash + 1; |
| | | 1971 | |
| | | 1972 | for (size_t i = 0; i < hs->hash_size; i++) { |
| | | 1973 | chain = 0; |
| | | 1974 | |
| | | 1975 | mutex_enter(&bufcache_lock); |
| | | 1976 | LIST_FOREACH(bp, &bufhashtbl[i], b_hash) { |
| | | 1977 | chain++; |
| | | 1978 | } |
| | | 1979 | mutex_exit(&bufcache_lock); |
| | | 1980 | |
| | | 1981 | if (chain > 0) { |
| | | 1982 | hs->hash_used++; |
| | | 1983 | hs->hash_items += chain; |
| | | 1984 | if (chain > hs->hash_maxchain) |
| | | 1985 | hs->hash_maxchain = chain; |
| | | 1986 | } |
| | | 1987 | preempt_point(); |
| | | 1988 | } |
| | | 1989 | |
| | | 1990 | return 0; |
| | | 1991 | } |
| | | 1992 | |
#ifdef DEBUG
/*
 * vfs_bufstats: print a summary of each buffer queue.
 *
 * For every queue in bufqueues[] one line is printed with the queue name,
 * the total number of buffers on it, and, for each b_bufsize bucket (in
 * units of PAGE_SIZE) that is non-empty, the bucket size and the number
 * of buffers in it.
 *
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * in vfs_syscalls.c using sysctl.
 */
void
vfs_bufstats(void)
{
	static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" };
	int counts[MAXBSIZE / MIN_PAGE_SIZE + 1];
	struct bqueue *dp;
	buf_t *bp;
	int q, slot, total;

	for (q = 0; q < BQUEUES; q++) {
		dp = &bufqueues[q];

		/* Start each queue with a clean histogram. */
		total = 0;
		memset(counts, 0, sizeof(counts));

		TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) {
			counts[bp->b_bufsize / PAGE_SIZE]++;
			total++;
		}

		printf("%s: total-%d", bname[q], total);
		for (slot = 0; slot <= MAXBSIZE / PAGE_SIZE; slot++) {
			if (counts[slot] == 0)
				continue;
			printf(", %d-%d", slot * PAGE_SIZE, counts[slot]);
		}
		printf("\n");
	}
}
#endif /* DEBUG */
1986 | | | 2023 | |
1987 | /* ------------------------------ */ | | 2024 | /* ------------------------------ */ |
1988 | | | 2025 | |
1989 | buf_t * | | 2026 | buf_t * |
1990 | getiobuf(struct vnode *vp, bool waitok) | | 2027 | getiobuf(struct vnode *vp, bool waitok) |
1991 | { | | 2028 | { |
1992 | buf_t *bp; | | 2029 | buf_t *bp; |
1993 | | | 2030 | |
1994 | bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT)); | | 2031 | bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT)); |
1995 | if (bp == NULL) | | 2032 | if (bp == NULL) |
1996 | return bp; | | 2033 | return bp; |
1997 | | | 2034 | |
1998 | buf_init(bp); | | 2035 | buf_init(bp); |
1999 | | | 2036 | |
2000 | if ((bp->b_vp = vp) != NULL) { | | 2037 | if ((bp->b_vp = vp) != NULL) { |
2001 | bp->b_objlock = vp->v_interlock; | | 2038 | bp->b_objlock = vp->v_interlock; |
2002 | } else { | | 2039 | } else { |
2003 | KASSERT(bp->b_objlock == &buffer_lock); | | 2040 | KASSERT(bp->b_objlock == &buffer_lock); |
2004 | } | | 2041 | } |
2005 | | | 2042 | |
2006 | return bp; | | 2043 | return bp; |
2007 | } | | 2044 | } |
2008 | | | 2045 | |
2009 | void | | 2046 | void |
2010 | putiobuf(buf_t *bp) | | 2047 | putiobuf(buf_t *bp) |
2011 | { | | 2048 | { |
2012 | | | 2049 | |
2013 | buf_destroy(bp); | | 2050 | buf_destroy(bp); |
2014 | pool_cache_put(bufio_cache, bp); | | 2051 | pool_cache_put(bufio_cache, bp); |
2015 | } | | 2052 | } |
2016 | | | 2053 | |
2017 | /* | | 2054 | /* |
2018 | * nestiobuf_iodone: b_iodone callback for nested buffers. | | 2055 | * nestiobuf_iodone: b_iodone callback for nested buffers. |
2019 | */ | | 2056 | */ |
2020 | | | 2057 | |
2021 | void | | 2058 | void |
2022 | nestiobuf_iodone(buf_t *bp) | | 2059 | nestiobuf_iodone(buf_t *bp) |
2023 | { | | 2060 | { |
2024 | buf_t *mbp = bp->b_private; | | 2061 | buf_t *mbp = bp->b_private; |
2025 | int error; | | 2062 | int error; |
2026 | int donebytes; | | 2063 | int donebytes; |
2027 | | | 2064 | |
2028 | KASSERT(bp->b_bcount <= bp->b_bufsize); | | 2065 | KASSERT(bp->b_bcount <= bp->b_bufsize); |
2029 | KASSERT(mbp != bp); | | 2066 | KASSERT(mbp != bp); |
2030 | | | 2067 | |
2031 | error = bp->b_error; | | 2068 | error = bp->b_error; |
2032 | if (bp->b_error == 0 && | | 2069 | if (bp->b_error == 0 && |
2033 | (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) { | | 2070 | (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) { |
2034 | /* | | 2071 | /* |
2035 | * Not all got transferred, raise an error. We have no way to | | 2072 | * Not all got transferred, raise an error. We have no way to |
2036 | * propagate these conditions to mbp. | | 2073 | * propagate these conditions to mbp. |
2037 | */ | | 2074 | */ |
2038 | error = EIO; | | 2075 | error = EIO; |
2039 | } | | 2076 | } |
2040 | | | 2077 | |
2041 | donebytes = bp->b_bufsize; | | 2078 | donebytes = bp->b_bufsize; |
2042 | | | 2079 | |
2043 | putiobuf(bp); | | 2080 | putiobuf(bp); |
2044 | nestiobuf_done(mbp, donebytes, error); | | 2081 | nestiobuf_done(mbp, donebytes, error); |
2045 | } | | 2082 | } |
2046 | | | 2083 | |
2047 | /* | | 2084 | /* |
2048 | * nestiobuf_setup: setup a "nested" buffer. | | 2085 | * nestiobuf_setup: setup a "nested" buffer. |
2049 | * | | 2086 | * |
2050 | * => 'mbp' is a "master" buffer which is being divided into sub pieces. | | 2087 | * => 'mbp' is a "master" buffer which is being divided into sub pieces. |
2051 | * => 'bp' should be a buffer allocated by getiobuf. | | 2088 | * => 'bp' should be a buffer allocated by getiobuf. |
2052 | * => 'offset' is a byte offset in the master buffer. | | 2089 | * => 'offset' is a byte offset in the master buffer. |
2053 | * => 'size' is a size in bytes of this nested buffer. | | 2090 | * => 'size' is a size in bytes of this nested buffer. |
2054 | */ | | 2091 | */ |
2055 | | | 2092 | |
2056 | void | | 2093 | void |
2057 | nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size) | | 2094 | nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size) |
2058 | { | | 2095 | { |
2059 | const int b_pass = mbp->b_flags & (B_READ|B_PHYS|B_RAW|B_MEDIA_FLAGS); | | 2096 | const int b_pass = mbp->b_flags & (B_READ|B_PHYS|B_RAW|B_MEDIA_FLAGS); |
2060 | struct vnode *vp = mbp->b_vp; | | 2097 | struct vnode *vp = mbp->b_vp; |
2061 | | | 2098 | |
2062 | KASSERT(mbp->b_bcount >= offset + size); | | 2099 | KASSERT(mbp->b_bcount >= offset + size); |
2063 | bp->b_vp = vp; | | 2100 | bp->b_vp = vp; |
2064 | bp->b_dev = mbp->b_dev; | | 2101 | bp->b_dev = mbp->b_dev; |
2065 | bp->b_objlock = mbp->b_objlock; | | 2102 | bp->b_objlock = mbp->b_objlock; |
2066 | bp->b_cflags = BC_BUSY; | | 2103 | bp->b_cflags = BC_BUSY; |
2067 | bp->b_flags = B_ASYNC | b_pass; | | 2104 | bp->b_flags = B_ASYNC | b_pass; |
2068 | bp->b_iodone = nestiobuf_iodone; | | 2105 | bp->b_iodone = nestiobuf_iodone; |
2069 | bp->b_data = (char *)mbp->b_data + offset; | | 2106 | bp->b_data = (char *)mbp->b_data + offset; |
2070 | bp->b_resid = bp->b_bcount = size; | | 2107 | bp->b_resid = bp->b_bcount = size; |
2071 | bp->b_bufsize = bp->b_bcount; | | 2108 | bp->b_bufsize = bp->b_bcount; |
2072 | bp->b_private = mbp; | | 2109 | bp->b_private = mbp; |
2073 | BIO_COPYPRIO(bp, mbp); | | 2110 | BIO_COPYPRIO(bp, mbp); |
2074 | if (BUF_ISWRITE(bp) && vp != NULL) { | | 2111 | if (BUF_ISWRITE(bp) && vp != NULL) { |
2075 | mutex_enter(vp->v_interlock); | | 2112 | mutex_enter(vp->v_interlock); |
2076 | vp->v_numoutput++; | | 2113 | vp->v_numoutput++; |
2077 | mutex_exit(vp->v_interlock); | | 2114 | mutex_exit(vp->v_interlock); |
2078 | } | | 2115 | } |
2079 | } | | 2116 | } |
2080 | | | 2117 | |
2081 | /* | | 2118 | /* |
2082 | * nestiobuf_done: propagate completion to the master buffer. | | 2119 | * nestiobuf_done: propagate completion to the master buffer. |
2083 | * | | 2120 | * |
2084 | * => 'donebytes' specifies how many bytes in the 'mbp' is completed. | | 2121 | * => 'donebytes' specifies how many bytes in the 'mbp' is completed. |
2085 | * => 'error' is an errno(2) that 'donebytes' has been completed with. | | 2122 | * => 'error' is an errno(2) that 'donebytes' has been completed with. |
2086 | */ | | 2123 | */ |
2087 | | | 2124 | |
2088 | void | | 2125 | void |
2089 | nestiobuf_done(buf_t *mbp, int donebytes, int error) | | 2126 | nestiobuf_done(buf_t *mbp, int donebytes, int error) |
2090 | { | | 2127 | { |
2091 | | | 2128 | |
2092 | if (donebytes == 0) { | | 2129 | if (donebytes == 0) { |
2093 | return; | | 2130 | return; |
2094 | } | | 2131 | } |
2095 | mutex_enter(mbp->b_objlock); | | 2132 | mutex_enter(mbp->b_objlock); |
2096 | KASSERT(mbp->b_resid >= donebytes); | | 2133 | KASSERT(mbp->b_resid >= donebytes); |
2097 | mbp->b_resid -= donebytes; | | 2134 | mbp->b_resid -= donebytes; |
2098 | if (error) | | 2135 | if (error) |
2099 | mbp->b_error = error; | | 2136 | mbp->b_error = error; |
2100 | if (mbp->b_resid == 0) { | | 2137 | if (mbp->b_resid == 0) { |
2101 | if (mbp->b_error) | | 2138 | if (mbp->b_error) |
2102 | mbp->b_resid = mbp->b_bcount; | | 2139 | mbp->b_resid = mbp->b_bcount; |
2103 | mutex_exit(mbp->b_objlock); | | 2140 | mutex_exit(mbp->b_objlock); |
2104 | biodone(mbp); | | 2141 | biodone(mbp); |
2105 | } else | | 2142 | } else |
2106 | mutex_exit(mbp->b_objlock); | | 2143 | mutex_exit(mbp->b_objlock); |
2107 | } | | 2144 | } |
2108 | | | 2145 | |
2109 | void | | 2146 | void |
2110 | buf_init(buf_t *bp) | | 2147 | buf_init(buf_t *bp) |
2111 | { | | 2148 | { |
2112 | | | 2149 | |
2113 | cv_init(&bp->b_busy, "biolock"); | | 2150 | cv_init(&bp->b_busy, "biolock"); |
2114 | cv_init(&bp->b_done, "biowait"); | | 2151 | cv_init(&bp->b_done, "biowait"); |
2115 | bp->b_dev = NODEV; | | 2152 | bp->b_dev = NODEV; |
2116 | bp->b_error = 0; | | 2153 | bp->b_error = 0; |
2117 | bp->b_flags = 0; | | 2154 | bp->b_flags = 0; |
2118 | bp->b_cflags = 0; | | 2155 | bp->b_cflags = 0; |
2119 | bp->b_oflags = 0; | | 2156 | bp->b_oflags = 0; |
2120 | bp->b_objlock = &buffer_lock; | | 2157 | bp->b_objlock = &buffer_lock; |
2121 | bp->b_iodone = NULL; | | 2158 | bp->b_iodone = NULL; |
2122 | bp->b_dev = NODEV; | | 2159 | bp->b_dev = NODEV; |
2123 | bp->b_vnbufs.le_next = NOLIST; | | 2160 | bp->b_vnbufs.le_next = NOLIST; |
2124 | BIO_SETPRIO(bp, BPRIO_DEFAULT); | | 2161 | BIO_SETPRIO(bp, BPRIO_DEFAULT); |
2125 | } | | 2162 | } |
2126 | | | 2163 | |
2127 | void | | 2164 | void |
2128 | buf_destroy(buf_t *bp) | | 2165 | buf_destroy(buf_t *bp) |
2129 | { | | 2166 | { |
2130 | | | 2167 | |
2131 | cv_destroy(&bp->b_done); | | 2168 | cv_destroy(&bp->b_done); |
2132 | cv_destroy(&bp->b_busy); | | 2169 | cv_destroy(&bp->b_busy); |
2133 | } | | 2170 | } |
2134 | | | 2171 | |
2135 | int | | 2172 | int |
2136 | bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock) | | 2173 | bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock) |
2137 | { | | 2174 | { |
2138 | int error; | | 2175 | int error; |
2139 | | | 2176 | |
2140 | KASSERT(mutex_owned(&bufcache_lock)); | | 2177 | KASSERT(mutex_owned(&bufcache_lock)); |
2141 | | | 2178 | |
2142 | SDT_PROBE4(io, kernel, , bbusy__start, bp, intr, timo, interlock); | | 2179 | SDT_PROBE4(io, kernel, , bbusy__start, bp, intr, timo, interlock); |
2143 | | | 2180 | |
2144 | if ((bp->b_cflags & BC_BUSY) != 0) { | | 2181 | if ((bp->b_cflags & BC_BUSY) != 0) { |
2145 | if (curlwp == uvm.pagedaemon_lwp) { | | 2182 | if (curlwp == uvm.pagedaemon_lwp) { |
2146 | error = EDEADLK; | | 2183 | error = EDEADLK; |
2147 | goto out; | | 2184 | goto out; |
2148 | } | | 2185 | } |
2149 | bp->b_cflags |= BC_WANTED; | | 2186 | bp->b_cflags |= BC_WANTED; |
2150 | if (interlock != NULL) | | 2187 | if (interlock != NULL) |
2151 | mutex_exit(interlock); | | 2188 | mutex_exit(interlock); |
2152 | if (intr) { | | 2189 | if (intr) { |
2153 | error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock, | | 2190 | error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock, |
2154 | timo); | | 2191 | timo); |
2155 | } else { | | 2192 | } else { |
2156 | error = cv_timedwait(&bp->b_busy, &bufcache_lock, | | 2193 | error = cv_timedwait(&bp->b_busy, &bufcache_lock, |
2157 | timo); | | 2194 | timo); |
2158 | } | | 2195 | } |
2159 | /* | | 2196 | /* |
2160 | * At this point the buffer may be gone: don't touch it | | 2197 | * At this point the buffer may be gone: don't touch it |
2161 | * again. The caller needs to find it again and retry. | | 2198 | * again. The caller needs to find it again and retry. |
2162 | */ | | 2199 | */ |
2163 | if (interlock != NULL) | | 2200 | if (interlock != NULL) |
2164 | mutex_enter(interlock); | | 2201 | mutex_enter(interlock); |
2165 | if (error == 0) | | 2202 | if (error == 0) |
2166 | error = EPASSTHROUGH; | | 2203 | error = EPASSTHROUGH; |
2167 | } else { | | 2204 | } else { |
2168 | bp->b_cflags |= BC_BUSY; | | 2205 | bp->b_cflags |= BC_BUSY; |
2169 | error = 0; | | 2206 | error = 0; |
2170 | } | | 2207 | } |
2171 | | | 2208 | |
2172 | out: SDT_PROBE5(io, kernel, , bbusy__done, | | 2209 | out: SDT_PROBE5(io, kernel, , bbusy__done, |
2173 | bp, intr, timo, interlock, error); | | 2210 | bp, intr, timo, interlock, error); |
2174 | return error; | | 2211 | return error; |
2175 | } | | 2212 | } |
2176 | | | 2213 | |
2177 | /* | | 2214 | /* |
2178 | * Nothing outside this file should really need to know about nbuf, | | 2215 | * Nothing outside this file should really need to know about nbuf, |
2179 | * but a few things still want to read it, so give them a way to do that. | | 2216 | * but a few things still want to read it, so give them a way to do that. |
2180 | */ | | 2217 | */ |
2181 | u_int | | 2218 | u_int |
2182 | buf_nbuf(void) | | 2219 | buf_nbuf(void) |
2183 | { | | 2220 | { |
2184 | | | 2221 | |
2185 | return nbuf; | | 2222 | return nbuf; |
2186 | } | | 2223 | } |