| @@ -1,1463 +1,1459 @@ | | | @@ -1,1463 +1,1459 @@ |
1 | /* $NetBSD: vfs_cache.c,v 1.140 2020/04/22 21:35:52 ad Exp $ */ | | 1 | /* $NetBSD: vfs_cache.c,v 1.141 2020/04/23 22:58:36 ad Exp $ */ |
2 | | | 2 | |
3 | /*- | | 3 | /*- |
4 | * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc. | | 4 | * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc. |
5 | * All rights reserved. | | 5 | * All rights reserved. |
6 | * | | 6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation | | 7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Andrew Doran. | | 8 | * by Andrew Doran. |
9 | * | | 9 | * |
10 | * Redistribution and use in source and binary forms, with or without | | 10 | * Redistribution and use in source and binary forms, with or without |
11 | * modification, are permitted provided that the following conditions | | 11 | * modification, are permitted provided that the following conditions |
12 | * are met: | | 12 | * are met: |
13 | * 1. Redistributions of source code must retain the above copyright | | 13 | * 1. Redistributions of source code must retain the above copyright |
14 | * notice, this list of conditions and the following disclaimer. | | 14 | * notice, this list of conditions and the following disclaimer. |
15 | * 2. Redistributions in binary form must reproduce the above copyright | | 15 | * 2. Redistributions in binary form must reproduce the above copyright |
16 | * notice, this list of conditions and the following disclaimer in the | | 16 | * notice, this list of conditions and the following disclaimer in the |
17 | * documentation and/or other materials provided with the distribution. | | 17 | * documentation and/or other materials provided with the distribution. |
18 | * | | 18 | * |
19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | | 19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | | 20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | | 21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | | 22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | | 23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | | 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | | 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | | 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | | 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | | 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
29 | * POSSIBILITY OF SUCH DAMAGE. | | 29 | * POSSIBILITY OF SUCH DAMAGE. |
30 | */ | | 30 | */ |
31 | | | 31 | |
32 | /* | | 32 | /* |
33 | * Copyright (c) 1989, 1993 | | 33 | * Copyright (c) 1989, 1993 |
34 | * The Regents of the University of California. All rights reserved. | | 34 | * The Regents of the University of California. All rights reserved. |
35 | * | | 35 | * |
36 | * Redistribution and use in source and binary forms, with or without | | 36 | * Redistribution and use in source and binary forms, with or without |
37 | * modification, are permitted provided that the following conditions | | 37 | * modification, are permitted provided that the following conditions |
38 | * are met: | | 38 | * are met: |
39 | * 1. Redistributions of source code must retain the above copyright | | 39 | * 1. Redistributions of source code must retain the above copyright |
40 | * notice, this list of conditions and the following disclaimer. | | 40 | * notice, this list of conditions and the following disclaimer. |
41 | * 2. Redistributions in binary form must reproduce the above copyright | | 41 | * 2. Redistributions in binary form must reproduce the above copyright |
42 | * notice, this list of conditions and the following disclaimer in the | | 42 | * notice, this list of conditions and the following disclaimer in the |
43 | * documentation and/or other materials provided with the distribution. | | 43 | * documentation and/or other materials provided with the distribution. |
44 | * 3. Neither the name of the University nor the names of its contributors | | 44 | * 3. Neither the name of the University nor the names of its contributors |
45 | * may be used to endorse or promote products derived from this software | | 45 | * may be used to endorse or promote products derived from this software |
46 | * without specific prior written permission. | | 46 | * without specific prior written permission. |
47 | * | | 47 | * |
48 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | | 48 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
49 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | | 49 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
50 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | | 50 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
51 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | | 51 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
52 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | | 52 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
53 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | | 53 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
54 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | | 54 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
55 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | | 55 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
56 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | | 56 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
57 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | | 57 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
58 | * SUCH DAMAGE. | | 58 | * SUCH DAMAGE. |
59 | * | | 59 | * |
60 | * @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94 | | 60 | * @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94 |
61 | */ | | 61 | */ |
62 | | | 62 | |
63 | /* | | 63 | /* |
64 | * Name caching: | | 64 | * Name caching: |
65 | * | | 65 | * |
66 | * Names found by directory scans are retained in a cache for future | | 66 | * Names found by directory scans are retained in a cache for future |
67 | * reference. It is managed LRU, so frequently used names will hang | | 67 | * reference. It is managed LRU, so frequently used names will hang |
68 | * around. The cache is indexed by hash value obtained from the name. | | 68 | * around. The cache is indexed by hash value obtained from the name. |
69 | * | | 69 | * |
70 | * The name cache is the brainchild of Robert Elz and was introduced in | | 70 | * The name cache is the brainchild of Robert Elz and was introduced in |
71 | * 4.3BSD. See "Using gprof to Tune the 4.2BSD Kernel", Marshall Kirk | | 71 | * 4.3BSD. See "Using gprof to Tune the 4.2BSD Kernel", Marshall Kirk |
72 | * McKusick, May 21 1984. | | 72 | * McKusick, May 21 1984. |
73 | * | | 73 | * |
74 | * Data structures: | | 74 | * Data structures: |
75 | * | | 75 | * |
76 | * Most Unix namecaches very sensibly use a global hash table to index | | 76 | * Most Unix namecaches very sensibly use a global hash table to index |
77 | * names. The global hash table works well, but can cause concurrency | | 77 | * names. The global hash table works well, but can cause concurrency |
78 | * headaches for the kernel hacker. In the NetBSD 10.0 implementation | | 78 | * headaches for the kernel hacker. In the NetBSD 10.0 implementation |
79 | * we are not sensible, and use a per-directory data structure to index | | 79 | * we are not sensible, and use a per-directory data structure to index |
80 | * names, but the cache otherwise functions the same. | | 80 | * names, but the cache otherwise functions the same. |
81 | * | | 81 | * |
82 | * The index is a red-black tree. There are no special concurrency | | 82 | * The index is a red-black tree. There are no special concurrency |
83 | * requirements placed on it, because it's per-directory and protected | | 83 | * requirements placed on it, because it's per-directory and protected |
84 | * by the namecache's per-directory locks. It should therefore not be | | 84 | * by the namecache's per-directory locks. It should therefore not be |
85 | * difficult to experiment with other types of index. | | 85 | * difficult to experiment with other types of index. |
86 | * | | 86 | * |
87 | * Each cached name is stored in a struct namecache, along with a | | 87 | * Each cached name is stored in a struct namecache, along with a |
88 | * pointer to the associated vnode (nc_vp). Names longer than a | | 88 | * pointer to the associated vnode (nc_vp). Names longer than a |
89 | * maximum length of NCHNAMLEN are allocated with kmem_alloc(); they | | 89 | * maximum length of NCHNAMLEN are allocated with kmem_alloc(); they |
90 | * occur infrequently, and names shorter than this are stored directly | | 90 | * occur infrequently, and names shorter than this are stored directly |
91 | * in struct namecache. If it is a "negative" entry (i.e., for a name | | 91 | * in struct namecache. If it is a "negative" entry (i.e., for a name |
92 | * that is known NOT to exist), the vnode pointer will be NULL. | | 92 | * that is known NOT to exist), the vnode pointer will be NULL. |
93 | * | | 93 | * |
94 | * For a directory with 3 cached names for 3 distinct vnodes, the | | 94 | * For a directory with 3 cached names for 3 distinct vnodes, the |
95 | * various vnodes and namecache structs would be connected like this | | 95 | * various vnodes and namecache structs would be connected like this |
96 | * (the root is at the bottom of the diagram): | | 96 | * (the root is at the bottom of the diagram): |
97 | * | | 97 | * |
98 | * ... | | 98 | * ... |
99 | * ^ | | 99 | * ^ |
100 | * |- vi_nc_tree | | 100 | * |- vi_nc_tree |
101 | * | | | 101 | * | |
102 | * +----o----+ +---------+ +---------+ | | 102 | * +----o----+ +---------+ +---------+ |
103 | * | VDIR | | VCHR | | VREG | | | 103 | * | VDIR | | VCHR | | VREG | |
104 | * | vnode o-----+ | vnode o-----+ | vnode o------+ | | 104 | * | vnode o-----+ | vnode o-----+ | vnode o------+ |
105 | * +---------+ | +---------+ | +---------+ | | | 105 | * +---------+ | +---------+ | +---------+ | |
106 | * ^ | ^ | ^ | | | 106 | * ^ | ^ | ^ | |
107 | * |- nc_vp |- vi_nc_list |- nc_vp |- vi_nc_list |- nc_vp | | | 107 | * |- nc_vp |- vi_nc_list |- nc_vp |- vi_nc_list |- nc_vp | |
108 | * | | | | | | | | 108 | * | | | | | | |
109 | * +----o----+ | +----o----+ | +----o----+ | | | 109 | * +----o----+ | +----o----+ | +----o----+ | |
110 | * +---onamecache|<----+ +---onamecache|<----+ +---onamecache|<-----+ | | 110 | * +---onamecache|<----+ +---onamecache|<----+ +---onamecache|<-----+ |
111 | * | +---------+ | +---------+ | +---------+ | | 111 | * | +---------+ | +---------+ | +---------+ |
112 | * | ^ | ^ | ^ | | 112 | * | ^ | ^ | ^ |
113 | * | | | | | | | | 113 | * | | | | | | |
114 | * | | +----------------------+ | | | | 114 | * | | +----------------------+ | | |
115 | * |-nc_dvp | +-------------------------------------------------+ | | 115 | * |-nc_dvp | +-------------------------------------------------+ |
116 | * | |/- vi_nc_tree | | | | 116 | * | |/- vi_nc_tree | | |
117 | * | | |- nc_dvp |- nc_dvp | | 117 | * | | |- nc_dvp |- nc_dvp |
118 | * | +----o----+ | | | | 118 | * | +----o----+ | | |
119 | * +-->| VDIR |<----------+ | | | 119 | * +-->| VDIR |<----------+ | |
120 | * | vnode |<------------------------------------+ | | 120 | * | vnode |<------------------------------------+ |
121 | * +---------+ | | 121 | * +---------+ |
122 | * | | 122 | * |
123 | * START HERE | | 123 | * START HERE |
124 | * | | 124 | * |
125 | * Replacement: | | 125 | * Replacement: |
126 | * | | 126 | * |
127 | * As the cache becomes full, old and unused entries are purged as new | | 127 | * As the cache becomes full, old and unused entries are purged as new |
128 | * entries are added. The synchronization overhead in maintaining a | | 128 | * entries are added. The synchronization overhead in maintaining a |
129 | * strict ordering would be prohibitive, so the VM system's "clock" or | | 129 | * strict ordering would be prohibitive, so the VM system's "clock" or |
130 | * "second chance" page replacement algorithm is aped here. New | | 130 | * "second chance" page replacement algorithm is aped here. New |
131 | * entries go to the tail of the active list. After they age out and | | 131 | * entries go to the tail of the active list. After they age out and |
132 | * reach the head of the list, they are moved to the tail of the | | 132 | * reach the head of the list, they are moved to the tail of the |
133 | * inactive list. Any use of the deactivated cache entry reactivates | | 133 | * inactive list. Any use of the deactivated cache entry reactivates |
134 | * it, saving it from impending doom; if not reactivated, the entry | | 134 | * it, saving it from impending doom; if not reactivated, the entry |
135 | * eventually reaches the head of the inactive list and is purged. | | 135 | * eventually reaches the head of the inactive list and is purged. |
136 | * | | 136 | * |
137 | * Concurrency: | | 137 | * Concurrency: |
138 | * | | 138 | * |
139 | * From a performance perspective, cache_lookup(nameiop == LOOKUP) is | | 139 | * From a performance perspective, cache_lookup(nameiop == LOOKUP) is |
140 | * what really matters; insertion of new entries with cache_enter() is | | 140 | * what really matters; insertion of new entries with cache_enter() is |
141 | * comparatively infrequent, and overshadowed by the cost of expensive | | 141 | * comparatively infrequent, and overshadowed by the cost of expensive |
142 | * file system metadata operations (which may involve disk I/O). We | | 142 | * file system metadata operations (which may involve disk I/O). We |
143 | * therefore want to make everything as simple as possible in the lookup path. | | 143 | * therefore want to make everything as simple as possible in the lookup path. |
144 | * | | 144 | * |
145 | * struct namecache is mostly stable except for list and tree related | | 145 | * struct namecache is mostly stable except for list and tree related |
146 | * entries, changes to which don't affect the cached name or vnode. | | 146 | * entries, changes to which don't affect the cached name or vnode. |
147 | * For changes to name+vnode, entries are purged in preference to | | 147 | * For changes to name+vnode, entries are purged in preference to |
148 | * modifying them. | | 148 | * modifying them. |
149 | * | | 149 | * |
150 | * Read access to namecache entries is made via tree, list, or LRU | | 150 | * Read access to namecache entries is made via tree, list, or LRU |
151 | * list. A lock corresponding to the direction of access should be | | 151 | * list. A lock corresponding to the direction of access should be |
152 | * held. See definition of "struct namecache" in src/sys/namei.src, | | 152 | * held. See definition of "struct namecache" in src/sys/namei.src, |
153 | * and the definition of "struct vnode" for the particulars. | | 153 | * and the definition of "struct vnode" for the particulars. |
154 | * | | 154 | * |
155 | * Per-CPU statistics, and LRU list totals are read unlocked, since | | 155 | * Per-CPU statistics, and LRU list totals are read unlocked, since |
156 | * an approximate value is OK. We maintain 32-bit sized per-CPU | | 156 | * an approximate value is OK. We maintain 32-bit sized per-CPU |
157 | * counters and 64-bit global counters under the theory that 32-bit | | 157 | * counters and 64-bit global counters under the theory that 32-bit |
158 | * sized counters are less likely to be hosed by nonatomic increment | | 158 | * sized counters are less likely to be hosed by nonatomic increment |
159 | * (on 32-bit platforms). | | 159 | * (on 32-bit platforms). |
160 | * | | 160 | * |
161 | * The lock order is: | | 161 | * The lock order is: |
162 | * | | 162 | * |
163 | * 1) vi->vi_nc_lock (tree or parent -> child direction, | | 163 | * 1) vi->vi_nc_lock (tree or parent -> child direction, |
164 | * used during forward lookup) | | 164 | * used during forward lookup) |
165 | * | | 165 | * |
166 | * 2) vi->vi_nc_listlock (list or child -> parent direction, | | 166 | * 2) vi->vi_nc_listlock (list or child -> parent direction, |
167 | * used during reverse lookup) | | 167 | * used during reverse lookup) |
168 | * | | 168 | * |
169 | * 3) cache_lru_lock (LRU list direction, used during reclaim) | | 169 | * 3) cache_lru_lock (LRU list direction, used during reclaim) |
170 | * | | 170 | * |
171 | * 4) vp->v_interlock (what the cache entry points to) | | 171 | * 4) vp->v_interlock (what the cache entry points to) |
172 | */ | | 172 | */ |
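The "Replacement:" notes above are the only description of the clock scheme in this header, so a minimal userspace sketch of the same two-list idea may help. It assumes <sys/queue.h>; the entry type, list names, and helper names are illustrative stand-ins, not the kernel's own:

    #include <sys/queue.h>
    #include <stddef.h>

    enum { ACTIVE, INACTIVE, NLISTS };

    struct entry {
    	TAILQ_ENTRY(entry) lru;
    	int lrulist;
    };

    static TAILQ_HEAD(lruhead, entry) lists[NLISTS] = {
    	TAILQ_HEAD_INITIALIZER(lists[ACTIVE]),
    	TAILQ_HEAD_INITIALIZER(lists[INACTIVE]),
    };

    /* New entries go to the tail of the active list. */
    static void
    lru_insert(struct entry *e)
    {
    	e->lrulist = ACTIVE;
    	TAILQ_INSERT_TAIL(&lists[ACTIVE], e, lru);
    }

    /* Any use of a deactivated entry saves it from impending doom. */
    static void
    lru_touch(struct entry *e)
    {
    	if (e->lrulist != ACTIVE) {
    		TAILQ_REMOVE(&lists[INACTIVE], e, lru);
    		e->lrulist = ACTIVE;
    		TAILQ_INSERT_TAIL(&lists[ACTIVE], e, lru);
    	}
    }

    /* Aging: the oldest active entry moves to the inactive list's tail. */
    static void
    lru_deactivate(void)
    {
    	struct entry *e = TAILQ_FIRST(&lists[ACTIVE]);

    	if (e != NULL) {
    		TAILQ_REMOVE(&lists[ACTIVE], e, lru);
    		e->lrulist = INACTIVE;
    		TAILQ_INSERT_TAIL(&lists[INACTIVE], e, lru);
    	}
    }

    /* Reclaim: victims come from the head of the inactive list. */
    static struct entry *
    lru_evict(void)
    {
    	struct entry *e = TAILQ_FIRST(&lists[INACTIVE]);

    	if (e != NULL)
    		TAILQ_REMOVE(&lists[INACTIVE], e, lru);
    	return e;
    }

A single cheap requeue on use stands in for strict LRU ordering, which is why the unlocked lrulist check in cache_lookup_entry() below can afford to be occasionally wrong.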
173 | | | 173 | |
174 | #include <sys/cdefs.h> | | 174 | #include <sys/cdefs.h> |
175 | __KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.140 2020/04/22 21:35:52 ad Exp $"); | | 175 | __KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.141 2020/04/23 22:58:36 ad Exp $"); |
176 | | | 176 | |
177 | #define __NAMECACHE_PRIVATE | | 177 | #define __NAMECACHE_PRIVATE |
178 | #ifdef _KERNEL_OPT | | 178 | #ifdef _KERNEL_OPT |
179 | #include "opt_ddb.h" | | 179 | #include "opt_ddb.h" |
180 | #include "opt_dtrace.h" | | 180 | #include "opt_dtrace.h" |
181 | #endif | | 181 | #endif |
182 | | | 182 | |
183 | #include <sys/types.h> | | 183 | #include <sys/types.h> |
184 | #include <sys/atomic.h> | | 184 | #include <sys/atomic.h> |
185 | #include <sys/callout.h> | | 185 | #include <sys/callout.h> |
186 | #include <sys/cpu.h> | | 186 | #include <sys/cpu.h> |
187 | #include <sys/errno.h> | | 187 | #include <sys/errno.h> |
188 | #include <sys/evcnt.h> | | 188 | #include <sys/evcnt.h> |
189 | #include <sys/hash.h> | | 189 | #include <sys/hash.h> |
190 | #include <sys/kernel.h> | | 190 | #include <sys/kernel.h> |
191 | #include <sys/mount.h> | | 191 | #include <sys/mount.h> |
192 | #include <sys/mutex.h> | | 192 | #include <sys/mutex.h> |
193 | #include <sys/namei.h> | | 193 | #include <sys/namei.h> |
194 | #include <sys/param.h> | | 194 | #include <sys/param.h> |
195 | #include <sys/pool.h> | | 195 | #include <sys/pool.h> |
196 | #include <sys/sdt.h> | | 196 | #include <sys/sdt.h> |
197 | #include <sys/sysctl.h> | | 197 | #include <sys/sysctl.h> |
198 | #include <sys/systm.h> | | 198 | #include <sys/systm.h> |
199 | #include <sys/time.h> | | 199 | #include <sys/time.h> |
200 | #include <sys/vnode_impl.h> | | 200 | #include <sys/vnode_impl.h> |
201 | | | 201 | |
202 | #include <miscfs/genfs/genfs.h> | | 202 | #include <miscfs/genfs/genfs.h> |
203 | | | 203 | |
204 | static void cache_activate(struct namecache *); | | 204 | static void cache_activate(struct namecache *); |
205 | static void cache_update_stats(void *); | | 205 | static void cache_update_stats(void *); |
206 | static int cache_compare_nodes(void *, const void *, const void *); | | 206 | static int cache_compare_nodes(void *, const void *, const void *); |
207 | static void cache_deactivate(void); | | 207 | static void cache_deactivate(void); |
208 | static void cache_reclaim(void); | | 208 | static void cache_reclaim(void); |
209 | static int cache_stat_sysctl(SYSCTLFN_ARGS); | | 209 | static int cache_stat_sysctl(SYSCTLFN_ARGS); |
210 | | | 210 | |
211 | /* | | 211 | /* |
212 | * Global pool cache. | | 212 | * Global pool cache. |
213 | */ | | 213 | */ |
214 | static pool_cache_t cache_pool __read_mostly; | | 214 | static pool_cache_t cache_pool __read_mostly; |
215 | | | 215 | |
216 | /* | | 216 | /* |
217 | * LRU replacement. | | 217 | * LRU replacement. |
218 | */ | | 218 | */ |
219 | enum cache_lru_id { | | 219 | enum cache_lru_id { |
220 | LRU_ACTIVE, | | 220 | LRU_ACTIVE, |
221 | LRU_INACTIVE, | | 221 | LRU_INACTIVE, |
222 | LRU_COUNT | | 222 | LRU_COUNT |
223 | }; | | 223 | }; |
224 | | | 224 | |
225 | static struct { | | 225 | static struct { |
226 | TAILQ_HEAD(, namecache) list[LRU_COUNT]; | | 226 | TAILQ_HEAD(, namecache) list[LRU_COUNT]; |
227 | u_int count[LRU_COUNT]; | | 227 | u_int count[LRU_COUNT]; |
228 | } cache_lru __cacheline_aligned; | | 228 | } cache_lru __cacheline_aligned; |
229 | | | 229 | |
230 | static kmutex_t cache_lru_lock __cacheline_aligned; | | 230 | static kmutex_t cache_lru_lock __cacheline_aligned; |
231 | | | 231 | |
232 | /* | | 232 | /* |
233 | * Cache effectiveness statistics. nchstats holds system-wide total. | | 233 | * Cache effectiveness statistics. nchstats holds system-wide total. |
234 | */ | | 234 | */ |
235 | struct nchstats nchstats; | | 235 | struct nchstats nchstats; |
236 | struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t); | | 236 | struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t); |
237 | struct nchcpu { | | 237 | struct nchcpu { |
238 | struct nchstats_percpu cur; | | 238 | struct nchstats_percpu cur; |
239 | struct nchstats_percpu last; | | 239 | struct nchstats_percpu last; |
240 | }; | | 240 | }; |
241 | static callout_t cache_stat_callout; | | 241 | static callout_t cache_stat_callout; |
242 | static kmutex_t cache_stat_lock __cacheline_aligned; | | 242 | static kmutex_t cache_stat_lock __cacheline_aligned; |
243 | | | 243 | |
244 | #define COUNT(f) do { \ | | 244 | #define COUNT(f) do { \ |
245 | lwp_t *l = curlwp; \ | | 245 | lwp_t *l = curlwp; \ |
246 | KPREEMPT_DISABLE(l); \ | | 246 | KPREEMPT_DISABLE(l); \ |
247 | ((struct nchstats_percpu *)curcpu()->ci_data.cpu_nch)->f++; \ | | 247 | ((struct nchstats_percpu *)curcpu()->ci_data.cpu_nch)->f++; \ |
248 | KPREEMPT_ENABLE(l); \ | | 248 | KPREEMPT_ENABLE(l); \ |
249 | } while (/* CONSTCOND */ 0) | | 249 | } while (/* CONSTCOND */ 0) |
250 | | | 250 | |
251 | #define UPDATE(nchcpu, f) do { \ | | 251 | #define UPDATE(nchcpu, f) do { \ |
252 | uint32_t cur = atomic_load_relaxed(&nchcpu->cur.f); \ | | 252 | uint32_t cur = atomic_load_relaxed(&nchcpu->cur.f); \ |
253 | nchstats.f += (uint32_t)(cur - nchcpu->last.f); \ | | 253 | nchstats.f += (uint32_t)(cur - nchcpu->last.f); \ |
254 | nchcpu->last.f = cur; \ | | 254 | nchcpu->last.f = cur; \ |
255 | } while (/* CONSTCOND */ 0) | | 255 | } while (/* CONSTCOND */ 0) |
256 | | | 256 | |
257 | /* | | 257 | /* |
258 | * Tunables. cache_maxlen replaces the historical doingcache: | | 258 | * Tunables. cache_maxlen replaces the historical doingcache: |
259 | * set it to zero to disable caching for debugging purposes. | | 259 | * set it to zero to disable caching for debugging purposes. |
260 | */ | | 260 | */ |
261 | int cache_lru_maxdeact __read_mostly = 2; /* max # to deactivate */ | | 261 | int cache_lru_maxdeact __read_mostly = 2; /* max # to deactivate */ |
262 | int cache_lru_maxscan __read_mostly = 64; /* max # to scan/reclaim */ | | 262 | int cache_lru_maxscan __read_mostly = 64; /* max # to scan/reclaim */ |
263 | int cache_maxlen __read_mostly = USHRT_MAX; /* max name length to cache */ | | 263 | int cache_maxlen __read_mostly = USHRT_MAX; /* max name length to cache */ |
264 | int cache_stat_interval __read_mostly = 300; /* in seconds */ | | 264 | int cache_stat_interval __read_mostly = 300; /* in seconds */ |
265 | | | 265 | |
266 | /* | | 266 | /* |
267 | * sysctl stuff. | | 267 | * sysctl stuff. |
268 | */ | | 268 | */ |
269 | static struct sysctllog *cache_sysctllog; | | 269 | static struct sysctllog *cache_sysctllog; |
270 | | | 270 | |
271 | /* | | 271 | /* |
272 | * Red-black tree stuff. | | 272 | * Red-black tree stuff. |
273 | */ | | 273 | */ |
274 | static const rb_tree_ops_t cache_rbtree_ops = { | | 274 | static const rb_tree_ops_t cache_rbtree_ops = { |
275 | .rbto_compare_nodes = cache_compare_nodes, | | 275 | .rbto_compare_nodes = cache_compare_nodes, |
276 | .rbto_compare_key = cache_compare_nodes, | | 276 | .rbto_compare_key = cache_compare_nodes, |
277 | .rbto_node_offset = offsetof(struct namecache, nc_tree), | | 277 | .rbto_node_offset = offsetof(struct namecache, nc_tree), |
278 | .rbto_context = NULL | | 278 | .rbto_context = NULL |
279 | }; | | 279 | }; |
280 | | | 280 | |
281 | /* | | 281 | /* |
282 | * dtrace probes. | | 282 | * dtrace probes. |
283 | */ | | 283 | */ |
284 | SDT_PROVIDER_DEFINE(vfs); | | 284 | SDT_PROVIDER_DEFINE(vfs); |
285 | | | 285 | |
286 | SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *"); | | 286 | SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *"); |
287 | SDT_PROBE_DEFINE1(vfs, namecache, purge, parents, "struct vnode *"); | | 287 | SDT_PROBE_DEFINE1(vfs, namecache, purge, parents, "struct vnode *"); |
288 | SDT_PROBE_DEFINE1(vfs, namecache, purge, children, "struct vnode *"); | | 288 | SDT_PROBE_DEFINE1(vfs, namecache, purge, children, "struct vnode *"); |
289 | SDT_PROBE_DEFINE2(vfs, namecache, purge, name, "char *", "size_t"); | | 289 | SDT_PROBE_DEFINE2(vfs, namecache, purge, name, "char *", "size_t"); |
290 | SDT_PROBE_DEFINE1(vfs, namecache, purge, vfs, "struct mount *"); | | 290 | SDT_PROBE_DEFINE1(vfs, namecache, purge, vfs, "struct mount *"); |
291 | SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", | | 291 | SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", |
292 | "char *", "size_t"); | | 292 | "char *", "size_t"); |
293 | SDT_PROBE_DEFINE3(vfs, namecache, lookup, miss, "struct vnode *", | | 293 | SDT_PROBE_DEFINE3(vfs, namecache, lookup, miss, "struct vnode *", |
294 | "char *", "size_t"); | | 294 | "char *", "size_t"); |
295 | SDT_PROBE_DEFINE3(vfs, namecache, lookup, toolong, "struct vnode *", | | 295 | SDT_PROBE_DEFINE3(vfs, namecache, lookup, toolong, "struct vnode *", |
296 | "char *", "size_t"); | | 296 | "char *", "size_t"); |
297 | SDT_PROBE_DEFINE2(vfs, namecache, revlookup, success, "struct vnode *", | | 297 | SDT_PROBE_DEFINE2(vfs, namecache, revlookup, success, "struct vnode *", |
298 | "struct vnode *"); | | 298 | "struct vnode *"); |
299 | SDT_PROBE_DEFINE2(vfs, namecache, revlookup, fail, "struct vnode *", | | 299 | SDT_PROBE_DEFINE2(vfs, namecache, revlookup, fail, "struct vnode *", |
300 | "int"); | | 300 | "int"); |
301 | SDT_PROBE_DEFINE2(vfs, namecache, prune, done, "int", "int"); | | 301 | SDT_PROBE_DEFINE2(vfs, namecache, prune, done, "int", "int"); |
302 | SDT_PROBE_DEFINE3(vfs, namecache, enter, toolong, "struct vnode *", | | 302 | SDT_PROBE_DEFINE3(vfs, namecache, enter, toolong, "struct vnode *", |
303 | "char *", "size_t"); | | 303 | "char *", "size_t"); |
304 | SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", | | 304 | SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", |
305 | "char *", "size_t"); | | 305 | "char *", "size_t"); |
306 | | | 306 | |
307 | /* | | 307 | /* |
308 | * rbtree: compare two nodes. | | 308 | * rbtree: compare two nodes. |
309 | */ | | 309 | */ |
310 | static int | | 310 | static int |
311 | cache_compare_nodes(void *context, const void *n1, const void *n2) | | 311 | cache_compare_nodes(void *context, const void *n1, const void *n2) |
312 | { | | 312 | { |
313 | const struct namecache *nc1 = n1; | | 313 | const struct namecache *nc1 = n1; |
314 | const struct namecache *nc2 = n2; | | 314 | const struct namecache *nc2 = n2; |
315 | | | 315 | |
316 | if (nc1->nc_key < nc2->nc_key) { | | 316 | if (nc1->nc_key < nc2->nc_key) { |
317 | return -1; | | 317 | return -1; |
318 | } | | 318 | } |
319 | if (nc1->nc_key > nc2->nc_key) { | | 319 | if (nc1->nc_key > nc2->nc_key) { |
320 | return 1; | | 320 | return 1; |
321 | } | | 321 | } |
322 | KASSERT(nc1->nc_nlen == nc2->nc_nlen); | | 322 | KASSERT(nc1->nc_nlen == nc2->nc_nlen); |
323 | return memcmp(nc1->nc_name, nc2->nc_name, nc1->nc_nlen); | | 323 | return memcmp(nc1->nc_name, nc2->nc_name, nc1->nc_nlen); |
324 | } | | 324 | } |
325 | | | 325 | |
326 | /* | | 326 | /* |
327 | * Compute a key value for the given name. The name length is encoded in | | 327 | * Compute a key value for the given name. The name length is encoded in |
328 | * the key value to try and improve uniqueness, and so that length doesn't | | 328 | * the key value to try and improve uniqueness, and so that length doesn't |
329 | * need to be compared separately for string comparisons. | | 329 | * need to be compared separately for string comparisons. |
330 | */ | | 330 | */ |
331 | static inline uint64_t | | 331 | static inline uint64_t |
332 | cache_key(const char *name, size_t nlen) | | 332 | cache_key(const char *name, size_t nlen) |
333 | { | | 333 | { |
334 | uint64_t key; | | 334 | uint64_t key; |
335 | | | 335 | |
336 | KASSERT(nlen <= USHRT_MAX); | | 336 | KASSERT(nlen <= USHRT_MAX); |
337 | | | 337 | |
338 | key = hash32_buf(name, nlen, HASH32_STR_INIT); | | 338 | key = hash32_buf(name, nlen, HASH32_STR_INIT); |
339 | return (key << 32) | nlen; | | 339 | return (key << 32) | nlen; |
340 | } | | 340 | } |
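As a quick worked example of the packing above (hash in the high 32 bits, length in the low 32), with a made-up hash value standing in for hash32_buf():

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
    	uint64_t hash = 0xdeadbeef;	/* pretend hash32_buf("etc", 3, ...) result */
    	size_t nlen = 3;		/* strlen("etc") */
    	uint64_t key = (hash << 32) | nlen;

    	/*
    	 * Prints deadbeef00000003: equal keys imply equal lengths, so
    	 * a match needs only a single memcmp() of the names.
    	 */
    	printf("%016" PRIx64 "\n", key);
    	return 0;
    }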
341 | | | 341 | |
342 | /* | | 342 | /* |
343 | * Remove an entry from the cache. vi_nc_lock must be held, and if dir2node | | 343 | * Remove an entry from the cache. vi_nc_lock must be held, and if dir2node |
344 | * is true, then we're locking in the conventional direction and the list | | 344 | * is true, then we're locking in the conventional direction and the list |
345 | * lock will be acquired when removing the entry from the vnode list. | | 345 | * lock will be acquired when removing the entry from the vnode list. |
346 | */ | | 346 | */ |
347 | static void | | 347 | static void |
348 | cache_remove(struct namecache *ncp, const bool dir2node) | | 348 | cache_remove(struct namecache *ncp, const bool dir2node) |
349 | { | | 349 | { |
350 | struct vnode *vp, *dvp = ncp->nc_dvp; | | 350 | struct vnode *vp, *dvp = ncp->nc_dvp; |
351 | vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); | | 351 | vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); |
352 | | | 352 | |
353 | KASSERT(rw_write_held(&dvi->vi_nc_lock)); | | 353 | KASSERT(rw_write_held(&dvi->vi_nc_lock)); |
354 | KASSERT(cache_key(ncp->nc_name, ncp->nc_nlen) == ncp->nc_key); | | 354 | KASSERT(cache_key(ncp->nc_name, ncp->nc_nlen) == ncp->nc_key); |
355 | KASSERT(rb_tree_find_node(&dvi->vi_nc_tree, ncp) == ncp); | | 355 | KASSERT(rb_tree_find_node(&dvi->vi_nc_tree, ncp) == ncp); |
356 | | | 356 | |
357 | SDT_PROBE(vfs, namecache, invalidate, done, ncp, | | 357 | SDT_PROBE(vfs, namecache, invalidate, done, ncp, |
358 | 0, 0, 0, 0); | | 358 | 0, 0, 0, 0); |
359 | | | 359 | |
360 | /* | | 360 | /* |
361 | * Remove from the vnode's list. This excludes cache_revlookup(), | | 361 | * Remove from the vnode's list. This excludes cache_revlookup(), |
362 | * and then it's safe to remove from the LRU lists. | | 362 | * and then it's safe to remove from the LRU lists. |
363 | */ | | 363 | */ |
364 | if ((vp = ncp->nc_vp) != NULL) { | | 364 | if ((vp = ncp->nc_vp) != NULL) { |
365 | vnode_impl_t *vi = VNODE_TO_VIMPL(vp); | | 365 | vnode_impl_t *vi = VNODE_TO_VIMPL(vp); |
366 | if (__predict_true(dir2node)) { | | 366 | if (__predict_true(dir2node)) { |
367 | rw_enter(&vi->vi_nc_listlock, RW_WRITER); | | 367 | rw_enter(&vi->vi_nc_listlock, RW_WRITER); |
368 | TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list); | | 368 | TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list); |
369 | rw_exit(&vi->vi_nc_listlock); | | 369 | rw_exit(&vi->vi_nc_listlock); |
370 | } else { | | 370 | } else { |
371 | TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list); | | 371 | TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list); |
372 | } | | 372 | } |
373 | } | | 373 | } |
374 | | | 374 | |
375 | /* Remove from the directory's rbtree. */ | | 375 | /* Remove from the directory's rbtree. */ |
376 | rb_tree_remove_node(&dvi->vi_nc_tree, ncp); | | 376 | rb_tree_remove_node(&dvi->vi_nc_tree, ncp); |
377 | | | 377 | |
378 | /* Remove from the LRU lists. */ | | 378 | /* Remove from the LRU lists. */ |
379 | mutex_enter(&cache_lru_lock); | | 379 | mutex_enter(&cache_lru_lock); |
380 | TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru); | | 380 | TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru); |
381 | cache_lru.count[ncp->nc_lrulist]--; | | 381 | cache_lru.count[ncp->nc_lrulist]--; |
382 | mutex_exit(&cache_lru_lock); | | 382 | mutex_exit(&cache_lru_lock); |
383 | | | 383 | |
384 | /* Finally, free it. */ | | 384 | /* Finally, free it. */ |
385 | if (ncp->nc_nlen > NCHNAMLEN) { | | 385 | if (ncp->nc_nlen > NCHNAMLEN) { |
386 | size_t sz = offsetof(struct namecache, nc_name[ncp->nc_nlen]); | | 386 | size_t sz = offsetof(struct namecache, nc_name[ncp->nc_nlen]); |
387 | kmem_free(ncp, sz); | | 387 | kmem_free(ncp, sz); |
388 | } else { | | 388 | } else { |
389 | pool_cache_put(cache_pool, ncp); | | 389 | pool_cache_put(cache_pool, ncp); |
390 | } | | 390 | } |
391 | } | | 391 | } |
392 | | | 392 | |
393 | /* | | 393 | /* |
394 | * Find a single cache entry and return it. vi_nc_lock must be held. | | 394 | * Find a single cache entry and return it. vi_nc_lock must be held. |
395 | */ | | 395 | */ |
396 | static struct namecache * __noinline | | 396 | static struct namecache * __noinline |
397 | cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen, | | 397 | cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen, |
398 | uint64_t key) | | 398 | uint64_t key) |
399 | { | | 399 | { |
400 | vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); | | 400 | vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); |
401 | struct rb_node *node = dvi->vi_nc_tree.rbt_root; | | 401 | struct rb_node *node = dvi->vi_nc_tree.rbt_root; |
402 | struct namecache *ncp; | | 402 | struct namecache *ncp; |
403 | int lrulist, diff; | | 403 | int lrulist, diff; |
404 | | | 404 | |
405 | KASSERT(rw_lock_held(&dvi->vi_nc_lock)); | | 405 | KASSERT(rw_lock_held(&dvi->vi_nc_lock)); |
406 | | | 406 | |
407 | /* | | 407 | /* |
408 | * Search the RB tree for the key. This is an inlined lookup | | 408 | * Search the RB tree for the key. This is an inlined lookup |
409 | * tailored for exactly what's needed here (64-bit key and so on) | | 409 | * tailored for exactly what's needed here (64-bit key and so on) |
410 | * that is quite a bit faster than using rb_tree_find_node(). | | 410 | * that is quite a bit faster than using rb_tree_find_node(). |
411 | * | | 411 | * |
412 | * For a matching key memcmp() needs to be called once to confirm | | 412 | * For a matching key memcmp() needs to be called once to confirm |
413 | * that the correct name has been found. Very rarely there will be | | 413 | * that the correct name has been found. Very rarely there will be |
414 | * a key value collision and the search will continue. | | 414 | * a key value collision and the search will continue. |
415 | */ | | 415 | */ |
416 | for (;;) { | | 416 | for (;;) { |
417 | if (__predict_false(RB_SENTINEL_P(node))) { | | 417 | if (__predict_false(RB_SENTINEL_P(node))) { |
418 | return NULL; | | 418 | return NULL; |
419 | } | | 419 | } |
420 | ncp = (struct namecache *)node; | | 420 | ncp = (struct namecache *)node; |
421 | KASSERT((void *)&ncp->nc_tree == (void *)ncp); | | 421 | KASSERT((void *)&ncp->nc_tree == (void *)ncp); |
422 | KASSERT(ncp->nc_dvp == dvp); | | 422 | KASSERT(ncp->nc_dvp == dvp); |
423 | if (ncp->nc_key == key) { | | 423 | if (ncp->nc_key == key) { |
424 | KASSERT(ncp->nc_nlen == namelen); | | 424 | KASSERT(ncp->nc_nlen == namelen); |
425 | diff = memcmp(ncp->nc_name, name, namelen); | | 425 | diff = memcmp(ncp->nc_name, name, namelen); |
426 | if (__predict_true(diff == 0)) { | | 426 | if (__predict_true(diff == 0)) { |
427 | break; | | 427 | break; |
428 | } | | 428 | } |
429 | node = node->rb_nodes[diff < 0]; | | 429 | node = node->rb_nodes[diff < 0]; |
430 | } else { | | 430 | } else { |
431 | node = node->rb_nodes[ncp->nc_key < key]; | | 431 | node = node->rb_nodes[ncp->nc_key < key]; |
432 | } | | 432 | } |
433 | } | | 433 | } |
434 | | | 434 | |
435 | /* | | 435 | /* |
436 | * If the entry is on the wrong LRU list, requeue it. This is an | | 436 | * If the entry is on the wrong LRU list, requeue it. This is an |
437 | * unlocked check, but it will rarely be wrong and even then there | | 437 | * unlocked check, but it will rarely be wrong and even then there |
438 | * will be no harm caused. | | 438 | * will be no harm caused. |
439 | */ | | 439 | */ |
440 | lrulist = atomic_load_relaxed(&ncp->nc_lrulist); | | 440 | lrulist = atomic_load_relaxed(&ncp->nc_lrulist); |
441 | if (__predict_false(lrulist != LRU_ACTIVE)) { | | 441 | if (__predict_false(lrulist != LRU_ACTIVE)) { |
442 | cache_activate(ncp); | | 442 | cache_activate(ncp); |
443 | } | | 443 | } |
444 | return ncp; | | 444 | return ncp; |
445 | } | | 445 | } |
446 | | | 446 | |
447 | /* | | 447 | /* |
448 | * Look for the name in the cache. We don't do this | | 448 | * Look for the name in the cache. We don't do this |
449 | * if the segment name is long, simply so the cache can avoid | | 449 | * if the segment name is long, simply so the cache can avoid |
450 | * holding long names (which would either waste space, or | | 450 | * holding long names (which would either waste space, or |
451 | * add greatly to the complexity). | | 451 | * add greatly to the complexity). |
452 | * | | 452 | * |
453 | * Lookup is called with DVP pointing to the directory to search, | | 453 | * Lookup is called with DVP pointing to the directory to search, |
454 | * and CNP providing the name of the entry being sought: cn_nameptr | | 454 | * and CNP providing the name of the entry being sought: cn_nameptr |
455 | * is the name, cn_namelen is its length, and cn_flags is the flags | | 455 | * is the name, cn_namelen is its length, and cn_flags is the flags |
456 | * word from the namei operation. | | 456 | * word from the namei operation. |
457 | * | | 457 | * |
458 | * DVP must be locked. | | 458 | * DVP must be locked. |
459 | * | | 459 | * |
460 | * There are three possible non-error return states: | | 460 | * There are three possible non-error return states: |
461 | * 1. Nothing was found in the cache. Nothing is known about | | 461 | * 1. Nothing was found in the cache. Nothing is known about |
462 | * the requested name. | | 462 | * the requested name. |
463 | * 2. A negative entry was found in the cache, meaning that the | | 463 | * 2. A negative entry was found in the cache, meaning that the |
464 | * requested name definitely does not exist. | | 464 | * requested name definitely does not exist. |
465 | * 3. A positive entry was found in the cache, meaning that the | | 465 | * 3. A positive entry was found in the cache, meaning that the |
466 | * requested name does exist and that we are providing the | | 466 | * requested name does exist and that we are providing the |
467 | * vnode. | | 467 | * vnode. |
468 | * In these cases the results are: | | 468 | * In these cases the results are: |
469 | * 1. 0 returned; VN is set to NULL. | | 469 | * 1. 0 returned; VN is set to NULL. |
470 | * 2. 1 returned; VN is set to NULL. | | 470 | * 2. 1 returned; VN is set to NULL. |
471 | * 3. 1 returned; VN is set to the vnode found. | | 471 | * 3. 1 returned; VN is set to the vnode found. |
472 | * | | 472 | * |
473 | * The additional result argument ISWHT is set to zero, unless a | | 473 | * The additional result argument ISWHT is set to zero, unless a |
474 | * negative entry is found that was entered as a whiteout, in which | | 474 | * negative entry is found that was entered as a whiteout, in which |
475 | * case ISWHT is set to one. | | 475 | * case ISWHT is set to one. |
476 | * | | 476 | * |
477 | * The ISWHT_RET argument pointer may be null. In this case an | | 477 | * The ISWHT_RET argument pointer may be null. In this case an |
478 | * assertion is made that the whiteout flag is not set. File systems | | 478 | * assertion is made that the whiteout flag is not set. File systems |
479 | * that do not support whiteouts can/should do this. | | 479 | * that do not support whiteouts can/should do this. |
480 | * | | 480 | * |
481 | * Filesystems that do support whiteouts should add ISWHITEOUT to | | 481 | * Filesystems that do support whiteouts should add ISWHITEOUT to |
482 | * cnp->cn_flags if ISWHT comes back nonzero. | | 482 | * cnp->cn_flags if ISWHT comes back nonzero. |
483 | * | | 483 | * |
484 | * When a vnode is returned, it is locked, as per the vnode lookup | | 484 | * When a vnode is returned, it is locked, as per the vnode lookup |
485 | * locking protocol. | | 485 | * locking protocol. |
486 | * | | 486 | * |
487 | * There is no way for this function to fail, in the sense of | | 487 | * There is no way for this function to fail, in the sense of |
488 | * generating an error that requires aborting the namei operation. | | 488 | * generating an error that requires aborting the namei operation. |
489 | * | | 489 | * |
490 | * (Prior to October 2012, this function returned an integer status, | | 490 | * (Prior to October 2012, this function returned an integer status, |
491 | * and a vnode, and mucked with the flags word in CNP for whiteouts. | | 491 | * and a vnode, and mucked with the flags word in CNP for whiteouts. |
492 | * The integer status was -1 for "nothing found", ENOENT for "a | | 492 | * The integer status was -1 for "nothing found", ENOENT for "a |
493 | * negative entry found", 0 for "a positive entry found", and possibly | | 493 | * negative entry found", 0 for "a positive entry found", and possibly |
494 | * other errors, and the value of VN might or might not have been set | | 494 | * other errors, and the value of VN might or might not have been set |
495 | * depending on what error occurred.) | | 495 | * depending on what error occurred.) |
496 | */ | | 496 | */ |
497 | bool | | 497 | bool |
498 | cache_lookup(struct vnode *dvp, const char *name, size_t namelen, | | 498 | cache_lookup(struct vnode *dvp, const char *name, size_t namelen, |
499 | uint32_t nameiop, uint32_t cnflags, | | 499 | uint32_t nameiop, uint32_t cnflags, |
500 | int *iswht_ret, struct vnode **vn_ret) | | 500 | int *iswht_ret, struct vnode **vn_ret) |
501 | { | | 501 | { |
502 | vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); | | 502 | vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); |
503 | struct namecache *ncp; | | 503 | struct namecache *ncp; |
504 | struct vnode *vp; | | 504 | struct vnode *vp; |
505 | uint64_t key; | | 505 | uint64_t key; |
506 | int error; | | 506 | int error; |
507 | bool hit; | | 507 | bool hit; |
508 | krw_t op; | | 508 | krw_t op; |
509 | | | 509 | |
510 | /* Establish default result values */ | | 510 | /* Establish default result values */ |
511 | if (iswht_ret != NULL) { | | 511 | if (iswht_ret != NULL) { |
512 | *iswht_ret = 0; | | 512 | *iswht_ret = 0; |
513 | } | | 513 | } |
514 | *vn_ret = NULL; | | 514 | *vn_ret = NULL; |
515 | | | 515 | |
516 | if (__predict_false(namelen > cache_maxlen)) { | | 516 | if (__predict_false(namelen > cache_maxlen)) { |
517 | SDT_PROBE(vfs, namecache, lookup, toolong, dvp, | | 517 | SDT_PROBE(vfs, namecache, lookup, toolong, dvp, |
518 | name, namelen, 0, 0); | | 518 | name, namelen, 0, 0); |
519 | COUNT(ncs_long); | | 519 | COUNT(ncs_long); |
520 | return false; | | 520 | return false; |
521 | } | | 521 | } |
522 | | | 522 | |
523 | /* Compute the key up front - don't need the lock. */ | | 523 | /* Compute the key up front - don't need the lock. */ |
524 | key = cache_key(name, namelen); | | 524 | key = cache_key(name, namelen); |
525 | | | 525 | |
526 | /* Could the entry be purged below? */ | | 526 | /* Could the entry be purged below? */ |
527 | if ((cnflags & ISLASTCN) != 0 && | | 527 | if ((cnflags & ISLASTCN) != 0 && |
528 | ((cnflags & MAKEENTRY) == 0 || nameiop == CREATE)) { | | 528 | ((cnflags & MAKEENTRY) == 0 || nameiop == CREATE)) { |
529 | op = RW_WRITER; | | 529 | op = RW_WRITER; |
530 | } else { | | 530 | } else { |
531 | op = RW_READER; | | 531 | op = RW_READER; |
532 | } | | 532 | } |
533 | | | 533 | |
534 | /* Now look for the name. */ | | 534 | /* Now look for the name. */ |
535 | rw_enter(&dvi->vi_nc_lock, op); | | 535 | rw_enter(&dvi->vi_nc_lock, op); |
536 | ncp = cache_lookup_entry(dvp, name, namelen, key); | | 536 | ncp = cache_lookup_entry(dvp, name, namelen, key); |
537 | if (__predict_false(ncp == NULL)) { | | 537 | if (__predict_false(ncp == NULL)) { |
538 | rw_exit(&dvi->vi_nc_lock); | | 538 | rw_exit(&dvi->vi_nc_lock); |
539 | COUNT(ncs_miss); | | 539 | COUNT(ncs_miss); |
540 | SDT_PROBE(vfs, namecache, lookup, miss, dvp, | | 540 | SDT_PROBE(vfs, namecache, lookup, miss, dvp, |
541 | name, namelen, 0, 0); | | 541 | name, namelen, 0, 0); |
542 | return false; | | 542 | return false; |
543 | } | | 543 | } |
544 | if (__predict_false((cnflags & MAKEENTRY) == 0)) { | | 544 | if (__predict_false((cnflags & MAKEENTRY) == 0)) { |
545 | /* | | 545 | /* |
546 | * Last component and we are renaming or deleting, | | 546 | * Last component and we are renaming or deleting, |
547 | * the cache entry is invalid, or we otherwise don't | | 547 | * the cache entry is invalid, or we otherwise don't |
548 | * want the cache entry to exist. | | 548 | * want the cache entry to exist. |
549 | */ | | 549 | */ |
550 | KASSERT((cnflags & ISLASTCN) != 0); | | 550 | KASSERT((cnflags & ISLASTCN) != 0); |
551 | cache_remove(ncp, true); | | 551 | cache_remove(ncp, true); |
552 | rw_exit(&dvi->vi_nc_lock); | | 552 | rw_exit(&dvi->vi_nc_lock); |
553 | COUNT(ncs_badhits); | | 553 | COUNT(ncs_badhits); |
554 | return false; | | 554 | return false; |
555 | } | | 555 | } |
556 | if (ncp->nc_vp == NULL) { | | 556 | if (ncp->nc_vp == NULL) { |
557 | if (iswht_ret != NULL) { | | 557 | if (iswht_ret != NULL) { |
558 | /* | | 558 | /* |
559 | * Restore the ISWHITEOUT flag saved earlier. | | 559 | * Restore the ISWHITEOUT flag saved earlier. |
560 | */ | | 560 | */ |
561 | *iswht_ret = ncp->nc_whiteout; | | 561 | *iswht_ret = ncp->nc_whiteout; |
562 | } else { | | 562 | } else { |
563 | KASSERT(!ncp->nc_whiteout); | | 563 | KASSERT(!ncp->nc_whiteout); |
564 | } | | 564 | } |
565 | if (nameiop == CREATE && (cnflags & ISLASTCN) != 0) { | | 565 | if (nameiop == CREATE && (cnflags & ISLASTCN) != 0) { |
566 | /* | | 566 | /* |
567 | * Last component and we are preparing to create | | 567 | * Last component and we are preparing to create |
568 | * the named object, so flush the negative cache | | 568 | * the named object, so flush the negative cache |
569 | * entry. | | 569 | * entry. |
570 | */ | | 570 | */ |
571 | COUNT(ncs_badhits); | | 571 | COUNT(ncs_badhits); |
572 | cache_remove(ncp, true); | | 572 | cache_remove(ncp, true); |
573 | hit = false; | | 573 | hit = false; |
574 | } else { | | 574 | } else { |
575 | COUNT(ncs_neghits); | | 575 | COUNT(ncs_neghits); |
576 | SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, | | 576 | SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, |
577 | namelen, 0, 0); | | 577 | namelen, 0, 0); |
578 | /* found neg entry; vn is already null from above */ | | 578 | /* found neg entry; vn is already null from above */ |
579 | hit = true; | | 579 | hit = true; |
580 | } | | 580 | } |
581 | rw_exit(&dvi->vi_nc_lock); | | 581 | rw_exit(&dvi->vi_nc_lock); |
582 | return hit; | | 582 | return hit; |
583 | } | | 583 | } |
584 | vp = ncp->nc_vp; | | 584 | vp = ncp->nc_vp; |
585 | mutex_enter(vp->v_interlock); | | 585 | mutex_enter(vp->v_interlock); |
586 | rw_exit(&dvi->vi_nc_lock); | | 586 | rw_exit(&dvi->vi_nc_lock); |
587 | | | 587 | |
588 | /* | | 588 | /* |
589 | * Unlocked except for the vnode interlock. Call vcache_tryvget(). | | 589 | * Unlocked except for the vnode interlock. Call vcache_tryvget(). |
590 | */ | | 590 | */ |
591 | error = vcache_tryvget(vp); | | 591 | error = vcache_tryvget(vp); |
592 | if (error) { | | 592 | if (error) { |
593 | KASSERT(error == EBUSY); | | 593 | KASSERT(error == EBUSY); |
594 | /* | | 594 | /* |
595 | * This vnode is being cleaned out. | | 595 | * This vnode is being cleaned out. |
596 | * XXX badhits? | | 596 | * XXX badhits? |
597 | */ | | 597 | */ |
598 | COUNT(ncs_falsehits); | | 598 | COUNT(ncs_falsehits); |
599 | return false; | | 599 | return false; |
600 | } | | 600 | } |
601 | | | 601 | |
602 | COUNT(ncs_goodhits); | | 602 | COUNT(ncs_goodhits); |
603 | SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); | | 603 | SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); |
604 | /* found it */ | | 604 | /* found it */ |
605 | *vn_ret = vp; | | 605 | *vn_ret = vp; |
606 | return true; | | 606 | return true; |
607 | } | | 607 | } |
608 | | | 608 | |
609 | /* | | 609 | /* |
610 | * Version of the above without the nameiop argument, for NFS. | | 610 | * Version of the above without the nameiop argument, for NFS. |
611 | */ | | 611 | */ |
612 | bool | | 612 | bool |
613 | cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen, | | 613 | cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen, |
614 | uint32_t cnflags, | | 614 | uint32_t cnflags, |
615 | int *iswht_ret, struct vnode **vn_ret) | | 615 | int *iswht_ret, struct vnode **vn_ret) |
616 | { | | 616 | { |
617 | | | 617 | |
618 | return cache_lookup(dvp, name, namelen, LOOKUP, cnflags | MAKEENTRY, | | 618 | return cache_lookup(dvp, name, namelen, LOOKUP, cnflags | MAKEENTRY, |
619 | iswht_ret, vn_ret); | | 619 | iswht_ret, vn_ret); |
620 | } | | 620 | } |
621 | | | 621 | |
622 | /* | | 622 | /* |
623 | * Used by namei() to walk down a path, component by component by looking up | | 623 | * Used by namei() to walk down a path, component by component by looking up |
624 | * names in the cache. The node locks are chained along the way: a parent's | | 624 | * names in the cache. The node locks are chained along the way: a parent's |
625 | * lock is not dropped until the child's is acquired. | | 625 | * lock is not dropped until the child's is acquired. |
626 | */ | | 626 | */ |
627 | bool | | 627 | bool |
628 | cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen, | | 628 | cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen, |
629 | struct vnode **vn_ret, krwlock_t **plock, | | 629 | struct vnode **vn_ret, krwlock_t **plock, |
630 | kauth_cred_t cred) | | 630 | kauth_cred_t cred) |
631 | { | | 631 | { |
632 | vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); | | 632 | vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); |
633 | struct namecache *ncp; | | 633 | struct namecache *ncp; |
634 | uint64_t key; | | 634 | uint64_t key; |
635 | int error; | | 635 | int error; |
636 | | | 636 | |
637 | /* Establish default results. */ | | 637 | /* Establish default results. */ |
638 | *vn_ret = NULL; | | 638 | *vn_ret = NULL; |
639 | | | 639 | |
640 | /* If disabled, or file system doesn't support this, bail out. */ | | 640 | /* If disabled, or file system doesn't support this, bail out. */ |
641 | if (__predict_false((dvp->v_mount->mnt_iflag & IMNT_NCLOOKUP) == 0)) { | | 641 | if (__predict_false((dvp->v_mount->mnt_iflag & IMNT_NCLOOKUP) == 0)) { |
642 | return false; | | 642 | return false; |
643 | } | | 643 | } |
644 | | | 644 | |
645 | if (__predict_false(namelen > cache_maxlen)) { | | 645 | if (__predict_false(namelen > cache_maxlen)) { |
646 | COUNT(ncs_long); | | 646 | COUNT(ncs_long); |
647 | return false; | | 647 | return false; |
648 | } | | 648 | } |
649 | | | 649 | |
650 | /* Compute the key up front - don't need the lock. */ | | 650 | /* Compute the key up front - don't need the lock. */ |
651 | key = cache_key(name, namelen); | | 651 | key = cache_key(name, namelen); |
652 | | | 652 | |
653 | /* | | 653 | /* |
654 | * Acquire the directory lock. Once we have that, we can drop the | | 654 | * Acquire the directory lock. Once we have that, we can drop the |
655 | * previous one (if any). | | 655 | * previous one (if any). |
656 | * | | 656 | * |
657 | * The two lock holds mean that the directory can't go away while | | 657 | * The two lock holds mean that the directory can't go away while |
658 | * here: the directory must be purged with cache_purge() before | | 658 | * here: the directory must be purged with cache_purge() before |
659 | * being freed, and both parent & child's vi_nc_lock must be taken | | 659 | * being freed, and both parent & child's vi_nc_lock must be taken |
660 | * before that point is passed. | | 660 | * before that point is passed. |
661 | * | | 661 | * |
662 | * However if there's no previous lock, like at the root of the | | 662 | * However if there's no previous lock, like at the root of the |
663 | * chain, then "dvp" must be referenced to prevent dvp going away | | 663 | * chain, then "dvp" must be referenced to prevent dvp going away |
664 | * before we get its lock. | | 664 | * before we get its lock. |
665 | * | | 665 | * |
666 | * Note that the two locks can be the same if looking up a dot, for | | 666 | * Note that the two locks can be the same if looking up a dot, for |
667 | * example: /usr/bin/. If looking up the parent (..) we can't wait | | 667 | * example: /usr/bin/. If looking up the parent (..) we can't wait |
668 | * on the lock as child -> parent is the wrong direction. | | 668 | * on the lock as child -> parent is the wrong direction. |
669 | */ | | 669 | */ |
670 | if (*plock != &dvi->vi_nc_lock) { | | 670 | if (*plock != &dvi->vi_nc_lock) { |
671 | if (namelen == 2 && name[0] == '.' && name[1] == '.') { | | 671 | if (!rw_tryenter(&dvi->vi_nc_lock, RW_READER)) { |
672 | if (!rw_tryenter(&dvi->vi_nc_lock, RW_READER)) { | | 672 | return false; |
673 | return false; | | | |
674 | } | | | |
675 | } else { | | | |
676 | rw_enter(&dvi->vi_nc_lock, RW_READER); | | | |
677 | } | | 673 | } |
678 | if (*plock != NULL) { | | 674 | if (*plock != NULL) { |
679 | rw_exit(*plock); | | 675 | rw_exit(*plock); |
680 | } | | 676 | } |
681 | *plock = &dvi->vi_nc_lock; | | 677 | *plock = &dvi->vi_nc_lock; |
682 | } else if (*plock == NULL) { | | 678 | } else if (*plock == NULL) { |
683 | KASSERT(vrefcnt(dvp) > 0); | | 679 | KASSERT(vrefcnt(dvp) > 0); |
684 | } | | 680 | } |
685 | | | 681 | |
686 | /* | | 682 | /* |
687 | * First up, check if the user is allowed to look up files in this | | 683 | * First up, check if the user is allowed to look up files in this |
688 | * directory. | | 684 | * directory. |
689 | */ | | 685 | */ |
690 | KASSERT(dvi->vi_nc_mode != VNOVAL && dvi->vi_nc_uid != VNOVAL && | | 686 | KASSERT(dvi->vi_nc_mode != VNOVAL && dvi->vi_nc_uid != VNOVAL && |
691 | dvi->vi_nc_gid != VNOVAL); | | 687 | dvi->vi_nc_gid != VNOVAL); |
692 | error = kauth_authorize_vnode(cred, KAUTH_ACCESS_ACTION(VEXEC, | | 688 | error = kauth_authorize_vnode(cred, KAUTH_ACCESS_ACTION(VEXEC, |
693 | dvp->v_type, dvi->vi_nc_mode & ALLPERMS), dvp, NULL, | | 689 | dvp->v_type, dvi->vi_nc_mode & ALLPERMS), dvp, NULL, |
694 | genfs_can_access(dvp->v_type, dvi->vi_nc_mode & ALLPERMS, | | 690 | genfs_can_access(dvp->v_type, dvi->vi_nc_mode & ALLPERMS, |
695 | dvi->vi_nc_uid, dvi->vi_nc_gid, VEXEC, cred)); | | 691 | dvi->vi_nc_uid, dvi->vi_nc_gid, VEXEC, cred)); |
696 | if (error != 0) { | | 692 | if (error != 0) { |
697 | COUNT(ncs_denied); | | 693 | COUNT(ncs_denied); |
698 | return false; | | 694 | return false; |
699 | } | | 695 | } |
700 | | | 696 | |
701 | /* | | 697 | /* |
702 | * Now look for a matching cache entry. | | 698 | * Now look for a matching cache entry. |
703 | */ | | 699 | */ |
704 | ncp = cache_lookup_entry(dvp, name, namelen, key); | | 700 | ncp = cache_lookup_entry(dvp, name, namelen, key); |
705 | if (__predict_false(ncp == NULL)) { | | 701 | if (__predict_false(ncp == NULL)) { |
706 | COUNT(ncs_miss); | | 702 | COUNT(ncs_miss); |
707 | SDT_PROBE(vfs, namecache, lookup, miss, dvp, | | 703 | SDT_PROBE(vfs, namecache, lookup, miss, dvp, |
708 | name, namelen, 0, 0); | | 704 | name, namelen, 0, 0); |
709 | return false; | | 705 | return false; |
710 | } | | 706 | } |
711 | if (ncp->nc_vp == NULL) { | | 707 | if (ncp->nc_vp == NULL) { |
712 | /* found negative entry; vn is already null from above */ | | 708 | /* found negative entry; vn is already null from above */ |
713 | COUNT(ncs_neghits); | | 709 | COUNT(ncs_neghits); |
714 | SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); | | 710 | SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); |
715 | return true; | | 711 | return true; |
716 | } | | 712 | } |
717 | | | 713 | |
718 | COUNT(ncs_goodhits); /* XXX can be "badhits" */ | | 714 | COUNT(ncs_goodhits); /* XXX can be "badhits" */ |
719 | SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); | | 715 | SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); |
720 | | | 716 | |
721 | /* | | 717 | /* |
722 | * Return with the directory lock still held. It will either be | | 718 | * Return with the directory lock still held. It will either be |
723 | * returned to us with another call to cache_lookup_linked() when | | 719 | * returned to us with another call to cache_lookup_linked() when |
724 | * looking up the next component, or the caller will release it | | 720 | * looking up the next component, or the caller will release it |
725 | * manually when finished. | | 721 | * manually when finished. |
726 | */ | | 722 | */ |
727 | *vn_ret = ncp->nc_vp; | | 723 | *vn_ret = ncp->nc_vp; |
728 | return true; | | 724 | return true; |
729 | } | | 725 | } |
730 | | | 726 | |
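/*
 * Illustrative sketch, not part of the original file: one plausible
 * shape for a caller chaining cache_lookup_linked() calls across path
 * components, with the directory lock handed along in "plock" and
 * released once at the end.  The helper name and fixed component list
 * are hypothetical; the real consumer lives in vfs_lookup.c.
 */
static bool
lookup_chain_sketch(struct vnode *dvp, kauth_cred_t cred)
{
	static const char *names[] = { "usr", "bin" };	/* stand-ins */
	krwlock_t *plock = NULL;	/* no lock held at the root */
	struct vnode *vp;
	u_int i;

	/* dvp must be referenced here, per the comment above. */
	for (i = 0; i < __arraycount(names); i++) {
		if (!cache_lookup_linked(dvp, names[i], strlen(names[i]),
		    &vp, &plock, cred)) {
			break;	/* miss, denied, or lock contention */
		}
		if (vp == NULL) {
			break;	/* negative hit: name known not to exist */
		}
		/* vp is unreferenced; it's stable while *plock is held. */
		dvp = vp;
	}
	if (plock != NULL) {
		rw_exit(plock);	/* release the last directory lock held */
	}
	return i == __arraycount(names);
}
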
731 | /* | | 727 | /* |
732 | * Scan cache looking for name of directory entry pointing at vp. | | 728 | * Scan cache looking for name of directory entry pointing at vp. |
733 | * Will not search for "." or "..". | | 729 | * Will not search for "." or "..". |
734 | * | | 730 | * |
735 | * If the lookup succeeds the vnode is referenced and stored in dvpp. | | 731 | * If the lookup succeeds the vnode is referenced and stored in dvpp. |
736 | * | | 732 | * |
737 | * If bufp is non-NULL, also place the name in the buffer which starts | | 733 | * If bufp is non-NULL, also place the name in the buffer which starts |
738 | * at bufp, immediately before *bpp, and move bpp backwards to point | | 734 | * at bufp, immediately before *bpp, and move bpp backwards to point |
739 | * at the start of it. (Yes, this is a little baroque, but it's done | | 735 | * at the start of it. (Yes, this is a little baroque, but it's done |
740 | * this way to cater to the whims of getcwd). | | 736 | * this way to cater to the whims of getcwd). |
741 | * | | 737 | * |
742 | * Returns 0 on success, -1 on cache miss, positive errno on failure. | | 738 | * Returns 0 on success, -1 on cache miss, positive errno on failure. |
743 | */ | | 739 | */ |
744 | int | | 740 | int |
745 | cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp, | | 741 | cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp, |
746 | bool checkaccess, int perms) | | 742 | bool checkaccess, int perms) |
747 | { | | 743 | { |
748 | vnode_impl_t *vi = VNODE_TO_VIMPL(vp); | | 744 | vnode_impl_t *vi = VNODE_TO_VIMPL(vp); |
749 | struct namecache *ncp; | | 745 | struct namecache *ncp; |
750 | struct vnode *dvp; | | 746 | struct vnode *dvp; |
751 | int error, nlen, lrulist; | | 747 | int error, nlen, lrulist; |
752 | char *bp; | | 748 | char *bp; |
753 | | | 749 | |
754 | KASSERT(vp != NULL); | | 750 | KASSERT(vp != NULL); |
755 | | | 751 | |
756 | if (cache_maxlen == 0) | | 752 | if (cache_maxlen == 0) |
757 | goto out; | | 753 | goto out; |
758 | | | 754 | |
759 | rw_enter(&vi->vi_nc_listlock, RW_READER); | | 755 | rw_enter(&vi->vi_nc_listlock, RW_READER); |
760 | if (checkaccess) { | | 756 | if (checkaccess) { |
761 | /* | | 757 | /* |
762 | * Check if the user is allowed to see. NOTE: this is | | 758 | * Check if the user is allowed to see. NOTE: this is |
763 | * checking for access on the "wrong" directory. getcwd() | | 759 | * checking for access on the "wrong" directory. getcwd() |
764 | * wants to see that there is access on every component | | 760 | * wants to see that there is access on every component |
765 | * along the way, not that there is access to any individual | | 761 | * along the way, not that there is access to any individual |
766 | * component. Don't use this to check whether you can look in vp. | | 762 | * component. Don't use this to check whether you can look in vp.
767 | * | | 763 | * |
768 | * I don't like it, I didn't come up with it, don't blame me! | | 764 | * I don't like it, I didn't come up with it, don't blame me! |
769 | */ | | 765 | */ |
770 | KASSERT(vi->vi_nc_mode != VNOVAL && vi->vi_nc_uid != VNOVAL && | | 766 | KASSERT(vi->vi_nc_mode != VNOVAL && vi->vi_nc_uid != VNOVAL && |
771 | vi->vi_nc_gid != VNOVAL); | | 767 | vi->vi_nc_gid != VNOVAL); |
772 | error = kauth_authorize_vnode(curlwp->l_cred, | | 768 | error = kauth_authorize_vnode(curlwp->l_cred, |
773 | KAUTH_ACCESS_ACTION(VEXEC, vp->v_type, vi->vi_nc_mode & | | 769 | KAUTH_ACCESS_ACTION(VEXEC, vp->v_type, vi->vi_nc_mode & |
774 | ALLPERMS), vp, NULL, genfs_can_access(vp->v_type, | | 770 | ALLPERMS), vp, NULL, genfs_can_access(vp->v_type, |
775 | vi->vi_nc_mode & ALLPERMS, vi->vi_nc_uid, vi->vi_nc_gid, | | 771 | vi->vi_nc_mode & ALLPERMS, vi->vi_nc_uid, vi->vi_nc_gid, |
776 | perms, curlwp->l_cred)); | | 772 | perms, curlwp->l_cred)); |
777 | if (error != 0) { | | 773 | if (error != 0) { |
778 | rw_exit(&vi->vi_nc_listlock); | | 774 | rw_exit(&vi->vi_nc_listlock); |
779 | COUNT(ncs_denied); | | 775 | COUNT(ncs_denied); |
780 | return EACCES; | | 776 | return EACCES; |
781 | } | | 777 | } |
782 | } | | 778 | } |
783 | TAILQ_FOREACH(ncp, &vi->vi_nc_list, nc_list) { | | 779 | TAILQ_FOREACH(ncp, &vi->vi_nc_list, nc_list) { |
784 | KASSERT(ncp->nc_vp == vp); | | 780 | KASSERT(ncp->nc_vp == vp); |
785 | KASSERT(ncp->nc_dvp != NULL); | | 781 | KASSERT(ncp->nc_dvp != NULL); |
786 | nlen = ncp->nc_nlen; | | 782 | nlen = ncp->nc_nlen; |
787 | | | 783 | |
788 | /* | | 784 | /* |
789 | * The queue is partially sorted. Once we hit dots, nothing | | 785 | * The queue is partially sorted. Once we hit dots, nothing |
790 | * else remains but dots and dotdots, so bail out. | | 786 | * else remains but dots and dotdots, so bail out. |
791 | */ | | 787 | */ |
792 | if (ncp->nc_name[0] == '.') { | | 788 | if (ncp->nc_name[0] == '.') { |
793 | if (nlen == 1 || | | 789 | if (nlen == 1 || |
794 | (nlen == 2 && ncp->nc_name[1] == '.')) { | | 790 | (nlen == 2 && ncp->nc_name[1] == '.')) { |
795 | break; | | 791 | break; |
796 | } | | 792 | } |
797 | } | | 793 | } |
798 | | | 794 | |
799 | /* | | 795 | /* |
800 | * Record a hit on the entry. This is an unlocked read, but | | 796 | * Record a hit on the entry. This is an unlocked read, but
801 | * even if it's wrong it doesn't matter much. | | 797 | * even if it's wrong it doesn't matter much.
802 | */ | | 798 | */ |
803 | lrulist = atomic_load_relaxed(&ncp->nc_lrulist); | | 799 | lrulist = atomic_load_relaxed(&ncp->nc_lrulist); |
804 | if (lrulist != LRU_ACTIVE) { | | 800 | if (lrulist != LRU_ACTIVE) { |
805 | cache_activate(ncp); | | 801 | cache_activate(ncp); |
806 | } | | 802 | } |
807 | | | 803 | |
808 | if (bufp) { | | 804 | if (bufp) { |
809 | bp = *bpp; | | 805 | bp = *bpp; |
810 | bp -= nlen; | | 806 | bp -= nlen; |
811 | if (bp <= bufp) { | | 807 | if (bp <= bufp) { |
812 | *dvpp = NULL; | | 808 | *dvpp = NULL; |
813 | rw_exit(&vi->vi_nc_listlock); | | 809 | rw_exit(&vi->vi_nc_listlock); |
814 | SDT_PROBE(vfs, namecache, revlookup, | | 810 | SDT_PROBE(vfs, namecache, revlookup, |
815 | fail, vp, ERANGE, 0, 0, 0); | | 811 | fail, vp, ERANGE, 0, 0, 0); |
816 | return (ERANGE); | | 812 | return (ERANGE); |
817 | } | | 813 | } |
818 | memcpy(bp, ncp->nc_name, nlen); | | 814 | memcpy(bp, ncp->nc_name, nlen); |
819 | *bpp = bp; | | 815 | *bpp = bp; |
820 | } | | 816 | } |
821 | | | 817 | |
822 | dvp = ncp->nc_dvp; | | 818 | dvp = ncp->nc_dvp; |
823 | mutex_enter(dvp->v_interlock); | | 819 | mutex_enter(dvp->v_interlock); |
824 | rw_exit(&vi->vi_nc_listlock); | | 820 | rw_exit(&vi->vi_nc_listlock); |
825 | error = vcache_tryvget(dvp); | | 821 | error = vcache_tryvget(dvp); |
826 | if (error) { | | 822 | if (error) { |
827 | KASSERT(error == EBUSY); | | 823 | KASSERT(error == EBUSY); |
828 | if (bufp) | | 824 | if (bufp) |
829 | (*bpp) += nlen; | | 825 | (*bpp) += nlen; |
830 | *dvpp = NULL; | | 826 | *dvpp = NULL; |
831 | SDT_PROBE(vfs, namecache, revlookup, fail, vp, | | 827 | SDT_PROBE(vfs, namecache, revlookup, fail, vp, |
832 | error, 0, 0, 0); | | 828 | error, 0, 0, 0); |
833 | return -1; | | 829 | return -1; |
834 | } | | 830 | } |
835 | *dvpp = dvp; | | 831 | *dvpp = dvp; |
836 | SDT_PROBE(vfs, namecache, revlookup, success, vp, dvp, | | 832 | SDT_PROBE(vfs, namecache, revlookup, success, vp, dvp, |
837 | 0, 0, 0); | | 833 | 0, 0, 0); |
838 | COUNT(ncs_revhits); | | 834 | COUNT(ncs_revhits); |
839 | return (0); | | 835 | return (0); |
840 | } | | 836 | } |
841 | rw_exit(&vi->vi_nc_listlock); | | 837 | rw_exit(&vi->vi_nc_listlock); |
842 | COUNT(ncs_revmiss); | | 838 | COUNT(ncs_revmiss); |
843 | out: | | 839 | out: |
844 | *dvpp = NULL; | | 840 | *dvpp = NULL; |
845 | return (-1); | | 841 | return (-1); |
846 | } | | 842 | } |
847 | | | 843 | |
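/*
 * Illustrative sketch, not part of the original file: how a
 * getcwd()-style caller consumes the backwards buffer convention
 * described above.  Each name lands immediately in front of the
 * previous one, so the path assembles right to left and finally
 * begins at "bp" rather than "buf".  The root vnode test and all
 * reference management (vrele() of each parent) are elided.
 */
static int
revpath_sketch(struct vnode *vp, struct vnode *rootvp, char *buf,
    size_t buflen)
{
	struct vnode *dvp;
	char *bp = buf + buflen;
	int error;

	*--bp = '\0';			/* terminate, then build leftwards */
	while (vp != rootvp) {
		error = cache_revlookup(vp, &dvp, &bp, buf, false, 0);
		if (error != 0) {
			/* -1 is a cache miss: scan the directory instead */
			return error;
		}
		*--bp = '/';		/* separator before the new name */
		vp = dvp;		/* dvp came back referenced */
	}
	return 0;			/* assembled path starts at bp */
}
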
848 | /* | | 844 | /* |
849 | * Add an entry to the cache. | | 845 | * Add an entry to the cache. |
850 | */ | | 846 | */ |
851 | void | | 847 | void |
852 | cache_enter(struct vnode *dvp, struct vnode *vp, | | 848 | cache_enter(struct vnode *dvp, struct vnode *vp, |
853 | const char *name, size_t namelen, uint32_t cnflags) | | 849 | const char *name, size_t namelen, uint32_t cnflags) |
854 | { | | 850 | { |
855 | vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); | | 851 | vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); |
856 | struct namecache *ncp, *oncp; | | 852 | struct namecache *ncp, *oncp; |
857 | int total; | | 853 | int total; |
858 | | | 854 | |
859 | /* First, check whether we can/should add a cache entry. */ | | 855 | /* First, check whether we can/should add a cache entry. */ |
860 | if ((cnflags & MAKEENTRY) == 0 || | | 856 | if ((cnflags & MAKEENTRY) == 0 || |
861 | __predict_false(namelen > cache_maxlen)) { | | 857 | __predict_false(namelen > cache_maxlen)) { |
862 | SDT_PROBE(vfs, namecache, enter, toolong, vp, name, namelen, | | 858 | SDT_PROBE(vfs, namecache, enter, toolong, vp, name, namelen, |
863 | 0, 0); | | 859 | 0, 0); |
864 | return; | | 860 | return; |
865 | } | | 861 | } |
866 | | | 862 | |
867 | SDT_PROBE(vfs, namecache, enter, done, vp, name, namelen, 0, 0); | | 863 | SDT_PROBE(vfs, namecache, enter, done, vp, name, namelen, 0, 0); |
868 | | | 864 | |
869 | /* | | 865 | /* |
870 | * Reclaim some entries if over budget. This is an unlocked check, | | 866 | * Reclaim some entries if over budget. This is an unlocked check, |
871 | * but that's fine: we just need to catch up eventually, and | | 867 | * but that's fine: we just need to catch up eventually, and
872 | * going over budget temporarily is harmless. | | 868 | * going over budget temporarily is harmless.
873 | */ | | 869 | */ |
874 | total = atomic_load_relaxed(&cache_lru.count[LRU_ACTIVE]); | | 870 | total = atomic_load_relaxed(&cache_lru.count[LRU_ACTIVE]); |
875 | total += atomic_load_relaxed(&cache_lru.count[LRU_INACTIVE]); | | 871 | total += atomic_load_relaxed(&cache_lru.count[LRU_INACTIVE]); |
876 | if (__predict_false(total > desiredvnodes)) { | | 872 | if (__predict_false(total > desiredvnodes)) { |
877 | cache_reclaim(); | | 873 | cache_reclaim(); |
878 | } | | 874 | } |
879 | | | 875 | |
880 | /* Now allocate a fresh entry. */ | | 876 | /* Now allocate a fresh entry. */ |
881 | if (__predict_true(namelen <= NCHNAMLEN)) { | | 877 | if (__predict_true(namelen <= NCHNAMLEN)) { |
882 | ncp = pool_cache_get(cache_pool, PR_WAITOK); | | 878 | ncp = pool_cache_get(cache_pool, PR_WAITOK); |
883 | } else { | | 879 | } else { |
884 | size_t sz = offsetof(struct namecache, nc_name[namelen]); | | 880 | size_t sz = offsetof(struct namecache, nc_name[namelen]); |
885 | ncp = kmem_alloc(sz, KM_SLEEP); | | 881 | ncp = kmem_alloc(sz, KM_SLEEP); |
886 | } | | 882 | } |
887 | | | 883 | |
888 | /* | | 884 | /* |
889 | * Fill in cache info. For negative hits, save the ISWHITEOUT flag | | 885 | * Fill in cache info. For negative hits, save the ISWHITEOUT flag |
890 | * so we can restore it later when the cache entry is used again. | | 886 | * so we can restore it later when the cache entry is used again. |
891 | */ | | 887 | */ |
892 | ncp->nc_vp = vp; | | 888 | ncp->nc_vp = vp; |
893 | ncp->nc_dvp = dvp; | | 889 | ncp->nc_dvp = dvp; |
894 | ncp->nc_key = cache_key(name, namelen); | | 890 | ncp->nc_key = cache_key(name, namelen); |
895 | ncp->nc_nlen = namelen; | | 891 | ncp->nc_nlen = namelen; |
896 | ncp->nc_whiteout = ((cnflags & ISWHITEOUT) != 0); | | 892 | ncp->nc_whiteout = ((cnflags & ISWHITEOUT) != 0); |
897 | memcpy(ncp->nc_name, name, namelen); | | 893 | memcpy(ncp->nc_name, name, namelen); |
898 | | | 894 | |
899 | /* | | 895 | /* |
900 | * Insert to the directory. Concurrent lookups may race for a cache | | 896 | * Insert to the directory. Concurrent lookups may race for a cache |
901 | * entry. If there's a entry there already, purge it. | | 897 | * entry. If there's a entry there already, purge it. |
902 | */ | | 898 | */ |
903 | rw_enter(&dvi->vi_nc_lock, RW_WRITER); | | 899 | rw_enter(&dvi->vi_nc_lock, RW_WRITER); |
904 | oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp); | | 900 | oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp); |
905 | if (oncp != ncp) { | | 901 | if (oncp != ncp) { |
906 | KASSERT(oncp->nc_key == ncp->nc_key); | | 902 | KASSERT(oncp->nc_key == ncp->nc_key); |
907 | KASSERT(oncp->nc_nlen == ncp->nc_nlen); | | 903 | KASSERT(oncp->nc_nlen == ncp->nc_nlen); |
908 | KASSERT(memcmp(oncp->nc_name, name, namelen) == 0); | | 904 | KASSERT(memcmp(oncp->nc_name, name, namelen) == 0); |
909 | cache_remove(oncp, true); | | 905 | cache_remove(oncp, true); |
910 | oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp); | | 906 | oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp); |
911 | KASSERT(oncp == ncp); | | 907 | KASSERT(oncp == ncp); |
912 | } | | 908 | } |
913 | | | 909 | |
914 | /* | | 910 | /* |
915 | * With the directory lock still held, insert to the tail of the | | 911 | * With the directory lock still held, insert to the tail of the |
916 | * ACTIVE LRU list (new) and take the opportunity to incrementally | | 912 | * ACTIVE LRU list (new) and take the opportunity to incrementally |
917 | * balance the lists. | | 913 | * balance the lists. |
918 | */ | | 914 | */ |
919 | mutex_enter(&cache_lru_lock); | | 915 | mutex_enter(&cache_lru_lock); |
920 | ncp->nc_lrulist = LRU_ACTIVE; | | 916 | ncp->nc_lrulist = LRU_ACTIVE; |
921 | cache_lru.count[LRU_ACTIVE]++; | | 917 | cache_lru.count[LRU_ACTIVE]++; |
922 | TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); | | 918 | TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); |
923 | cache_deactivate(); | | 919 | cache_deactivate(); |
924 | mutex_exit(&cache_lru_lock); | | 920 | mutex_exit(&cache_lru_lock); |
925 | | | 921 | |
926 | /* | | 922 | /* |
927 | * Finally, insert to the vnode and unlock. With everything set up | | 923 | * Finally, insert to the vnode and unlock. With everything set up |
928 | * it's safe to let cache_revlookup() see the entry. Partially sort | | 924 | * it's safe to let cache_revlookup() see the entry. Partially sort |
929 | * the per-vnode list: dots go to the back so cache_revlookup() | | 925 | * the per-vnode list: dots go to the back so cache_revlookup()
930 | * doesn't have to consider them. | | 926 | * doesn't have to consider them.
931 | */ | | 927 | */ |
932 | if (vp != NULL) { | | 928 | if (vp != NULL) { |
933 | vnode_impl_t *vi = VNODE_TO_VIMPL(vp); | | 929 | vnode_impl_t *vi = VNODE_TO_VIMPL(vp); |
934 | rw_enter(&vi->vi_nc_listlock, RW_WRITER); | | 930 | rw_enter(&vi->vi_nc_listlock, RW_WRITER); |
935 | if ((namelen == 1 && name[0] == '.') || | | 931 | if ((namelen == 1 && name[0] == '.') || |
936 | (namelen == 2 && name[0] == '.' && name[1] == '.')) { | | 932 | (namelen == 2 && name[0] == '.' && name[1] == '.')) { |
937 | TAILQ_INSERT_TAIL(&vi->vi_nc_list, ncp, nc_list); | | 933 | TAILQ_INSERT_TAIL(&vi->vi_nc_list, ncp, nc_list); |
938 | } else { | | 934 | } else { |
939 | TAILQ_INSERT_HEAD(&vi->vi_nc_list, ncp, nc_list); | | 935 | TAILQ_INSERT_HEAD(&vi->vi_nc_list, ncp, nc_list); |
940 | } | | 936 | } |
941 | rw_exit(&vi->vi_nc_listlock); | | 937 | rw_exit(&vi->vi_nc_listlock); |
942 | } | | 938 | } |
943 | rw_exit(&dvi->vi_nc_lock); | | 939 | rw_exit(&dvi->vi_nc_lock); |
944 | } | | 940 | } |
945 | | | 941 | |
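/*
 * A note on the two allocation paths above: the name is stored inline
 * at the tail of struct namecache, so offsetof(struct namecache,
 * nc_name[namelen]) is exactly the header plus namelen bytes of
 * (unterminated) name.  A minimal standalone illustration, using a
 * hypothetical structure rather than the kernel one:
 */
struct vname_sketch {
	size_t	vn_len;
	char	vn_name[];		/* C99 flexible array member */
};

static struct vname_sketch *
vname_alloc_sketch(const char *name, size_t len)
{
	struct vname_sketch *vn;

	/* Same arithmetic as cache_enter(): header + len name bytes. */
	vn = kmem_alloc(offsetof(struct vname_sketch, vn_name[len]),
	    KM_SLEEP);
	vn->vn_len = len;
	memcpy(vn->vn_name, name, len);	/* deliberately not NUL-terminated */
	return vn;
}
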
946 | /* | | 942 | /* |
947 | * Set identity info in cache for a vnode. We only care about | | 943 | * Set identity info in cache for a vnode. We only care about
948 | * directories, so ignore other updates. | | 944 | * directories, so ignore other updates.
949 | */ | | 945 | */ |
950 | void | | 946 | void |
951 | cache_enter_id(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid) | | 947 | cache_enter_id(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid) |
952 | { | | 948 | { |
953 | vnode_impl_t *vi = VNODE_TO_VIMPL(vp); | | 949 | vnode_impl_t *vi = VNODE_TO_VIMPL(vp); |
954 | | | 950 | |
955 | if (vp->v_type == VDIR) { | | 951 | if (vp->v_type == VDIR) { |
956 | /* Grab both locks, for forward & reverse lookup. */ | | 952 | /* Grab both locks, for forward & reverse lookup. */ |
957 | rw_enter(&vi->vi_nc_lock, RW_WRITER); | | 953 | rw_enter(&vi->vi_nc_lock, RW_WRITER); |
958 | rw_enter(&vi->vi_nc_listlock, RW_WRITER); | | 954 | rw_enter(&vi->vi_nc_listlock, RW_WRITER); |
959 | vi->vi_nc_mode = mode; | | 955 | vi->vi_nc_mode = mode; |
960 | vi->vi_nc_uid = uid; | | 956 | vi->vi_nc_uid = uid; |
961 | vi->vi_nc_gid = gid; | | 957 | vi->vi_nc_gid = gid; |
962 | rw_exit(&vi->vi_nc_listlock); | | 958 | rw_exit(&vi->vi_nc_listlock); |
963 | rw_exit(&vi->vi_nc_lock); | | 959 | rw_exit(&vi->vi_nc_lock); |
964 | } | | 960 | } |
965 | } | | 961 | } |
966 | | | 962 | |
967 | /* | | 963 | /* |
968 | * Return true if we have identity for the given vnode, and use | | 964 | * Return true if we have identity for the given vnode, and use
969 | * this as an opportunity to confirm that everything squares up. | | 965 | * this as an opportunity to confirm that everything squares up.
970 | * | | 966 | * |
971 | * Because of shared code, some file systems could provide partial | | 967 | * Because of shared code, some file systems could provide partial |
972 | * information, missing some updates, so always check the mount flag | | 968 | * information, missing some updates, so always check the mount flag |
973 | * instead of looking for !VNOVAL. | | 969 | * instead of looking for !VNOVAL. |
974 | */ | | 970 | */ |
975 | bool | | 971 | bool |
976 | cache_have_id(struct vnode *vp) | | 972 | cache_have_id(struct vnode *vp) |
977 | { | | 973 | { |
978 | | | 974 | |
979 | if (vp->v_type == VDIR && | | 975 | if (vp->v_type == VDIR && |
980 | (vp->v_mount->mnt_iflag & IMNT_NCLOOKUP) != 0) { | | 976 | (vp->v_mount->mnt_iflag & IMNT_NCLOOKUP) != 0) { |
981 | KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_mode != VNOVAL); | | 977 | KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_mode != VNOVAL); |
982 | KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_uid != VNOVAL); | | 978 | KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_uid != VNOVAL); |
983 | KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_gid != VNOVAL); | | 979 | KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_gid != VNOVAL); |
984 | return true; | | 980 | return true; |
985 | } else { | | 981 | } else { |
986 | return false; | | 982 | return false; |
987 | } | | 983 | } |
988 | } | | 984 | } |
989 | | | 985 | |
990 | /* | | 986 | /* |
991 | * Name cache initialization, from vfs_init() when the system is booting. | | 987 | * Name cache initialization, from vfs_init() when the system is booting. |
992 | */ | | 988 | */ |
993 | void | | 989 | void |
994 | nchinit(void) | | 990 | nchinit(void) |
995 | { | | 991 | { |
996 | | | 992 | |
997 | cache_pool = pool_cache_init(sizeof(struct namecache), | | 993 | cache_pool = pool_cache_init(sizeof(struct namecache), |
998 | coherency_unit, 0, 0, "namecache", NULL, IPL_NONE, NULL, | | 994 | coherency_unit, 0, 0, "namecache", NULL, IPL_NONE, NULL, |
999 | NULL, NULL); | | 995 | NULL, NULL); |
1000 | KASSERT(cache_pool != NULL); | | 996 | KASSERT(cache_pool != NULL); |
1001 | | | 997 | |
1002 | mutex_init(&cache_lru_lock, MUTEX_DEFAULT, IPL_NONE); | | 998 | mutex_init(&cache_lru_lock, MUTEX_DEFAULT, IPL_NONE); |
1003 | TAILQ_INIT(&cache_lru.list[LRU_ACTIVE]); | | 999 | TAILQ_INIT(&cache_lru.list[LRU_ACTIVE]); |
1004 | TAILQ_INIT(&cache_lru.list[LRU_INACTIVE]); | | 1000 | TAILQ_INIT(&cache_lru.list[LRU_INACTIVE]); |
1005 | | | 1001 | |
1006 | mutex_init(&cache_stat_lock, MUTEX_DEFAULT, IPL_NONE); | | 1002 | mutex_init(&cache_stat_lock, MUTEX_DEFAULT, IPL_NONE); |
1007 | callout_init(&cache_stat_callout, CALLOUT_MPSAFE); | | 1003 | callout_init(&cache_stat_callout, CALLOUT_MPSAFE); |
1008 | callout_setfunc(&cache_stat_callout, cache_update_stats, NULL); | | 1004 | callout_setfunc(&cache_stat_callout, cache_update_stats, NULL); |
1009 | callout_schedule(&cache_stat_callout, cache_stat_interval * hz); | | 1005 | callout_schedule(&cache_stat_callout, cache_stat_interval * hz); |
1010 | | | 1006 | |
1011 | KASSERT(cache_sysctllog == NULL); | | 1007 | KASSERT(cache_sysctllog == NULL); |
1012 | sysctl_createv(&cache_sysctllog, 0, NULL, NULL, | | 1008 | sysctl_createv(&cache_sysctllog, 0, NULL, NULL, |
1013 | CTLFLAG_PERMANENT, | | 1009 | CTLFLAG_PERMANENT, |
1014 | CTLTYPE_STRUCT, "namecache_stats", | | 1010 | CTLTYPE_STRUCT, "namecache_stats", |
1015 | SYSCTL_DESCR("namecache statistics"), | | 1011 | SYSCTL_DESCR("namecache statistics"), |
1016 | cache_stat_sysctl, 0, NULL, 0, | | 1012 | cache_stat_sysctl, 0, NULL, 0, |
1017 | CTL_VFS, CTL_CREATE, CTL_EOL); | | 1013 | CTL_VFS, CTL_CREATE, CTL_EOL); |
1018 | } | | 1014 | } |
1019 | | | 1015 | |
1020 | /* | | 1016 | /* |
1021 | * Called once for each CPU in the system as attached. | | 1017 | * Called once for each CPU in the system as attached. |
1022 | */ | | 1018 | */ |
1023 | void | | 1019 | void |
1024 | cache_cpu_init(struct cpu_info *ci) | | 1020 | cache_cpu_init(struct cpu_info *ci) |
1025 | { | | 1021 | { |
1026 | void *p; | | 1022 | void *p; |
1027 | size_t sz; | | 1023 | size_t sz; |
1028 | | | 1024 | |
1029 | sz = roundup2(sizeof(struct nchstats_percpu), coherency_unit) + | | 1025 | sz = roundup2(sizeof(struct nchstats_percpu), coherency_unit) + |
1030 | coherency_unit; | | 1026 | coherency_unit; |
1031 | p = kmem_zalloc(sz, KM_SLEEP); | | 1027 | p = kmem_zalloc(sz, KM_SLEEP); |
1032 | ci->ci_data.cpu_nch = (void *)roundup2((uintptr_t)p, coherency_unit); | | 1028 | ci->ci_data.cpu_nch = (void *)roundup2((uintptr_t)p, coherency_unit); |
1033 | } | | 1029 | } |
1034 | | | 1030 | |
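/*
 * Why the over-allocation above is safe, sketched as a predicate that
 * always holds (hypothetical helper, not from this file): roundup2()
 * advances a pointer by at most coherency_unit - 1 bytes, and the
 * payload size was itself rounded up to a whole number of units, so
 * the aligned region always fits inside the allocation.
 */
static inline bool
aligned_fit_sketch(uintptr_t p, size_t payload, size_t unit)
{
	size_t sz = roundup2(payload, unit) + unit;	/* as above */
	uintptr_t q = roundup2(p, unit);		/* aligned start */

	/* q - p <= unit - 1, hence the rounded payload ends by p + sz. */
	return q + roundup2(payload, unit) <= p + sz;
}
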
1035 | /* | | 1031 | /* |
1036 | * A vnode is being allocated: set up cache structures. | | 1032 | * A vnode is being allocated: set up cache structures. |
1037 | */ | | 1033 | */ |
1038 | void | | 1034 | void |
1039 | cache_vnode_init(struct vnode *vp) | | 1035 | cache_vnode_init(struct vnode *vp) |
1040 | { | | 1036 | { |
1041 | vnode_impl_t *vi = VNODE_TO_VIMPL(vp); | | 1037 | vnode_impl_t *vi = VNODE_TO_VIMPL(vp); |
1042 | | | 1038 | |
1043 | rw_init(&vi->vi_nc_lock); | | 1039 | rw_init(&vi->vi_nc_lock); |
1044 | rw_init(&vi->vi_nc_listlock); | | 1040 | rw_init(&vi->vi_nc_listlock); |
1045 | rb_tree_init(&vi->vi_nc_tree, &cache_rbtree_ops); | | 1041 | rb_tree_init(&vi->vi_nc_tree, &cache_rbtree_ops); |
1046 | TAILQ_INIT(&vi->vi_nc_list); | | 1042 | TAILQ_INIT(&vi->vi_nc_list); |
1047 | vi->vi_nc_mode = VNOVAL; | | 1043 | vi->vi_nc_mode = VNOVAL; |
1048 | vi->vi_nc_uid = VNOVAL; | | 1044 | vi->vi_nc_uid = VNOVAL; |
1049 | vi->vi_nc_gid = VNOVAL; | | 1045 | vi->vi_nc_gid = VNOVAL; |
1050 | } | | 1046 | } |
1051 | | | 1047 | |
1052 | /* | | 1048 | /* |
1053 | * A vnode is being freed: finish cache structures. | | 1049 | * A vnode is being freed: finish cache structures. |
1054 | */ | | 1050 | */ |
1055 | void | | 1051 | void |
1056 | cache_vnode_fini(struct vnode *vp) | | 1052 | cache_vnode_fini(struct vnode *vp) |
1057 | { | | 1053 | { |
1058 | vnode_impl_t *vi = VNODE_TO_VIMPL(vp); | | 1054 | vnode_impl_t *vi = VNODE_TO_VIMPL(vp); |
1059 | | | 1055 | |
1060 | KASSERT(RB_TREE_MIN(&vi->vi_nc_tree) == NULL); | | 1056 | KASSERT(RB_TREE_MIN(&vi->vi_nc_tree) == NULL); |
1061 | KASSERT(TAILQ_EMPTY(&vi->vi_nc_list)); | | 1057 | KASSERT(TAILQ_EMPTY(&vi->vi_nc_list)); |
1062 | rw_destroy(&vi->vi_nc_lock); | | 1058 | rw_destroy(&vi->vi_nc_lock); |
1063 | rw_destroy(&vi->vi_nc_listlock); | | 1059 | rw_destroy(&vi->vi_nc_listlock); |
1064 | } | | 1060 | } |
1065 | | | 1061 | |
1066 | /* | | 1062 | /* |
1067 | * Helper for cache_purge1(): purge cache entries for the given vnode from | | 1063 | * Helper for cache_purge1(): purge cache entries for the given vnode from |
1068 | * all directories that the vnode is cached in. | | 1064 | * all directories that the vnode is cached in. |
1069 | */ | | 1065 | */ |
1070 | static void | | 1066 | static void |
1071 | cache_purge_parents(struct vnode *vp) | | 1067 | cache_purge_parents(struct vnode *vp) |
1072 | { | | 1068 | { |
1073 | vnode_impl_t *dvi, *vi = VNODE_TO_VIMPL(vp); | | 1069 | vnode_impl_t *dvi, *vi = VNODE_TO_VIMPL(vp); |
1074 | struct vnode *dvp, *blocked; | | 1070 | struct vnode *dvp, *blocked; |
1075 | struct namecache *ncp; | | 1071 | struct namecache *ncp; |
1076 | | | 1072 | |
1077 | SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0); | | 1073 | SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0); |
1078 | | | 1074 | |
1079 | blocked = NULL; | | 1075 | blocked = NULL; |
1080 | | | 1076 | |
1081 | rw_enter(&vi->vi_nc_listlock, RW_WRITER); | | 1077 | rw_enter(&vi->vi_nc_listlock, RW_WRITER); |
1082 | while ((ncp = TAILQ_FIRST(&vi->vi_nc_list)) != NULL) { | | 1078 | while ((ncp = TAILQ_FIRST(&vi->vi_nc_list)) != NULL) { |
1083 | /* | | 1079 | /* |
1084 | * Locking in the wrong direction. Try for a hold on the | | 1080 | * Locking in the wrong direction. Try for a hold on the |
1085 | * directory node's lock; if we get it then all is good: nuke | | 1081 | * directory node's lock; if we get it then all is good: nuke
1086 | * the entry and move on to the next. | | 1082 | * the entry and move on to the next.
1087 | */ | | 1083 | */ |
1088 | dvp = ncp->nc_dvp; | | 1084 | dvp = ncp->nc_dvp; |
1089 | dvi = VNODE_TO_VIMPL(dvp); | | 1085 | dvi = VNODE_TO_VIMPL(dvp); |
1090 | if (rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) { | | 1086 | if (rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) { |
1091 | cache_remove(ncp, false); | | 1087 | cache_remove(ncp, false); |
1092 | rw_exit(&dvi->vi_nc_lock); | | 1088 | rw_exit(&dvi->vi_nc_lock); |
1093 | blocked = NULL; | | 1089 | blocked = NULL; |
1094 | continue; | | 1090 | continue; |
1095 | } | | 1091 | } |
1096 | | | 1092 | |
1097 | /* | | 1093 | /* |
1098 | * We can't wait on the directory node's lock with our list | | 1094 | * We can't wait on the directory node's lock with our list |
1099 | * lock held, or the system could deadlock. | | 1095 | * lock held, or the system could deadlock.
1100 | * | | 1096 | * |
1101 | * Take a hold on the directory vnode to prevent it from | | 1097 | * Take a hold on the directory vnode to prevent it from |
1102 | * being freed (taking the vnode & lock with it). Then | | 1098 | * being freed (taking the vnode & lock with it). Then |
1103 | * wait for the lock to become available with no other locks | | 1099 | * wait for the lock to become available with no other locks |
1104 | * held, and retry. | | 1100 | * held, and retry. |
1105 | * | | 1101 | * |
1106 | * If this happens twice in a row, give the other side a | | 1102 | * If this happens twice in a row, give the other side a |
1107 | * breather; we can do nothing until it lets go. | | 1103 | * breather; we can do nothing until it lets go. |
1108 | */ | | 1104 | */ |
1109 | vhold(dvp); | | 1105 | vhold(dvp); |
1110 | rw_exit(&vi->vi_nc_listlock); | | 1106 | rw_exit(&vi->vi_nc_listlock); |
1111 | rw_enter(&dvi->vi_nc_lock, RW_WRITER); | | 1107 | rw_enter(&dvi->vi_nc_lock, RW_WRITER); |
1112 | /* Do nothing. */ | | 1108 | /* Do nothing. */ |
1113 | rw_exit(&dvi->vi_nc_lock); | | 1109 | rw_exit(&dvi->vi_nc_lock); |
1114 | holdrele(dvp); | | 1110 | holdrele(dvp); |
1115 | if (blocked == dvp) { | | 1111 | if (blocked == dvp) { |
1116 | kpause("ncpurge", false, 1, NULL); | | 1112 | kpause("ncpurge", false, 1, NULL); |
1117 | } | | 1113 | } |
1118 | rw_enter(&vi->vi_nc_listlock, RW_WRITER); | | 1114 | rw_enter(&vi->vi_nc_listlock, RW_WRITER); |
1119 | blocked = dvp; | | 1115 | blocked = dvp; |
1120 | } | | 1116 | } |
1121 | rw_exit(&vi->vi_nc_listlock); | | 1117 | rw_exit(&vi->vi_nc_listlock); |
1122 | } | | 1118 | } |
1123 | | | 1119 | |
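/*
 * The deadlock-avoidance dance above, distilled into a reusable shape
 * (hypothetical helper, not part of the original file): we hold
 * "list_lock", but the sanctioned order takes "dir_lock" first, so
 * dir_lock may only be try-locked.  On failure: pin the directory,
 * back out completely, wait with nothing held, then retry.
 */
static void
wrong_way_lock_sketch(krwlock_t *list_lock, krwlock_t *dir_lock,
    struct vnode *dvp)
{
	for (;;) {
		rw_enter(list_lock, RW_WRITER);
		if (rw_tryenter(dir_lock, RW_WRITER)) {
			/* Both held despite the order: do the work. */
			rw_exit(dir_lock);
			rw_exit(list_lock);
			return;
		}
		vhold(dvp);		/* keep dvp (and dir_lock) alive */
		rw_exit(list_lock);	/* drop our lock before waiting */
		rw_enter(dir_lock, RW_WRITER);
		rw_exit(dir_lock);	/* only needed it to drain */
		holdrele(dvp);
	}
}
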
1124 | /* | | 1120 | /* |
1125 | * Helper for cache_purge1(): purge all cache entries hanging off the given | | 1121 | * Helper for cache_purge1(): purge all cache entries hanging off the given |
1126 | * directory vnode. | | 1122 | * directory vnode. |
1127 | */ | | 1123 | */ |
1128 | static void | | 1124 | static void |
1129 | cache_purge_children(struct vnode *dvp) | | 1125 | cache_purge_children(struct vnode *dvp) |
1130 | { | | 1126 | { |
1131 | vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); | | 1127 | vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); |
1132 | struct namecache *ncp; | | 1128 | struct namecache *ncp; |
1133 | | | 1129 | |
1134 | SDT_PROBE(vfs, namecache, purge, children, dvp, 0, 0, 0, 0); | | 1130 | SDT_PROBE(vfs, namecache, purge, children, dvp, 0, 0, 0, 0); |
1135 | | | 1131 | |
1136 | rw_enter(&dvi->vi_nc_lock, RW_WRITER); | | 1132 | rw_enter(&dvi->vi_nc_lock, RW_WRITER); |
1137 | while ((ncp = RB_TREE_MIN(&dvi->vi_nc_tree)) != NULL) { | | 1133 | while ((ncp = RB_TREE_MIN(&dvi->vi_nc_tree)) != NULL) { |
1138 | cache_remove(ncp, true); | | 1134 | cache_remove(ncp, true); |
1139 | } | | 1135 | } |
1140 | rw_exit(&dvi->vi_nc_lock); | | 1136 | rw_exit(&dvi->vi_nc_lock); |
1141 | } | | 1137 | } |
1142 | | | 1138 | |
1143 | /* | | 1139 | /* |
1144 | * Helper for cache_purge1(): purge cache entry from the given vnode, | | 1140 | * Helper for cache_purge1(): purge cache entry from the given vnode, |
1145 | * finding it by name. | | 1141 | * finding it by name. |
1146 | */ | | 1142 | */ |
1147 | static void | | 1143 | static void |
1148 | cache_purge_name(struct vnode *dvp, const char *name, size_t namelen) | | 1144 | cache_purge_name(struct vnode *dvp, const char *name, size_t namelen) |
1149 | { | | 1145 | { |
1150 | vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); | | 1146 | vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); |
1151 | struct namecache *ncp; | | 1147 | struct namecache *ncp; |
1152 | uint64_t key; | | 1148 | uint64_t key; |
1153 | | | 1149 | |
1154 | SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0); | | 1150 | SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0); |
1155 | | | 1151 | |
1156 | key = cache_key(name, namelen); | | 1152 | key = cache_key(name, namelen); |
1157 | rw_enter(&dvi->vi_nc_lock, RW_WRITER); | | 1153 | rw_enter(&dvi->vi_nc_lock, RW_WRITER); |
1158 | ncp = cache_lookup_entry(dvp, name, namelen, key); | | 1154 | ncp = cache_lookup_entry(dvp, name, namelen, key); |
1159 | if (ncp) { | | 1155 | if (ncp) { |
1160 | cache_remove(ncp, true); | | 1156 | cache_remove(ncp, true); |
1161 | } | | 1157 | } |
1162 | rw_exit(&dvi->vi_nc_lock); | | 1158 | rw_exit(&dvi->vi_nc_lock); |
1163 | } | | 1159 | } |
1164 | | | 1160 | |
1165 | /* | | 1161 | /* |
1166 | * Cache flush, a particular vnode; called when a vnode is renamed to | | 1162 | * Cache flush, a particular vnode; called when a vnode is renamed to |
1167 | * hide entries that would now be invalid. | | 1163 | * hide entries that would now be invalid. |
1168 | */ | | 1164 | */ |
1169 | void | | 1165 | void |
1170 | cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags) | | 1166 | cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags) |
1171 | { | | 1167 | { |
1172 | | | 1168 | |
1173 | if (flags & PURGE_PARENTS) { | | 1169 | if (flags & PURGE_PARENTS) { |
1174 | cache_purge_parents(vp); | | 1170 | cache_purge_parents(vp); |
1175 | } | | 1171 | } |
1176 | if (flags & PURGE_CHILDREN) { | | 1172 | if (flags & PURGE_CHILDREN) { |
1177 | cache_purge_children(vp); | | 1173 | cache_purge_children(vp); |
1178 | } | | 1174 | } |
1179 | if (name != NULL) { | | 1175 | if (name != NULL) { |
1180 | cache_purge_name(vp, name, namelen); | | 1176 | cache_purge_name(vp, name, namelen); |
1181 | } | | 1177 | } |
1182 | } | | 1178 | } |
1183 | | | 1179 | |
1184 | /* | | 1180 | /* |
1185 | * vnode filter for cache_purgevfs(). | | 1181 | * vnode filter for cache_purgevfs(). |
1186 | */ | | 1182 | */ |
1187 | static bool | | 1183 | static bool |
1188 | cache_vdir_filter(void *cookie, vnode_t *vp) | | 1184 | cache_vdir_filter(void *cookie, vnode_t *vp) |
1189 | { | | 1185 | { |
1190 | | | 1186 | |
1191 | return vp->v_type == VDIR; | | 1187 | return vp->v_type == VDIR; |
1192 | } | | 1188 | } |
1193 | | | 1189 | |
1194 | /* | | 1190 | /* |
1195 | * Cache flush, a whole filesystem; called when filesys is umounted to | | 1191 | * Cache flush, a whole filesystem; called when filesys is umounted to |
1196 | * remove entries that would now be invalid. | | 1192 | * remove entries that would now be invalid. |
1197 | */ | | 1193 | */ |
1198 | void | | 1194 | void |
1199 | cache_purgevfs(struct mount *mp) | | 1195 | cache_purgevfs(struct mount *mp) |
1200 | { | | 1196 | { |
1201 | struct vnode_iterator *iter; | | 1197 | struct vnode_iterator *iter; |
1202 | vnode_t *dvp; | | 1198 | vnode_t *dvp; |
1203 | | | 1199 | |
1204 | vfs_vnode_iterator_init(mp, &iter); | | 1200 | vfs_vnode_iterator_init(mp, &iter); |
1205 | for (;;) { | | 1201 | for (;;) { |
1206 | dvp = vfs_vnode_iterator_next(iter, cache_vdir_filter, NULL); | | 1202 | dvp = vfs_vnode_iterator_next(iter, cache_vdir_filter, NULL); |
1207 | if (dvp == NULL) { | | 1203 | if (dvp == NULL) { |
1208 | break; | | 1204 | break; |
1209 | } | | 1205 | } |
1210 | cache_purge_children(dvp); | | 1206 | cache_purge_children(dvp); |
1211 | vrele(dvp); | | 1207 | vrele(dvp); |
1212 | } | | 1208 | } |
1213 | vfs_vnode_iterator_destroy(iter); | | 1209 | vfs_vnode_iterator_destroy(iter); |
1214 | } | | 1210 | } |
1215 | | | 1211 | |
1216 | /* | | 1212 | /* |
1217 | * Re-queue an entry onto the tail of the active LRU list, after it has | | 1213 | * Re-queue an entry onto the tail of the active LRU list, after it has |
1218 | * scored a hit. | | 1214 | * scored a hit. |
1219 | */ | | 1215 | */ |
1220 | static void | | 1216 | static void |
1221 | cache_activate(struct namecache *ncp) | | 1217 | cache_activate(struct namecache *ncp) |
1222 | { | | 1218 | { |
1223 | | | 1219 | |
1224 | mutex_enter(&cache_lru_lock); | | 1220 | mutex_enter(&cache_lru_lock); |
1225 | TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru); | | 1221 | TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru); |
1226 | TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); | | 1222 | TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); |
1227 | cache_lru.count[ncp->nc_lrulist]--; | | 1223 | cache_lru.count[ncp->nc_lrulist]--; |
1228 | cache_lru.count[LRU_ACTIVE]++; | | 1224 | cache_lru.count[LRU_ACTIVE]++; |
1229 | ncp->nc_lrulist = LRU_ACTIVE; | | 1225 | ncp->nc_lrulist = LRU_ACTIVE; |
1230 | mutex_exit(&cache_lru_lock); | | 1226 | mutex_exit(&cache_lru_lock); |
1231 | } | | 1227 | } |
1232 | | | 1228 | |
1233 | /* | | 1229 | /* |
1234 | * Try to balance the LRU lists. Pick some victim entries, and re-queue | | 1230 | * Try to balance the LRU lists. Pick some victim entries, and re-queue |
1235 | * them from the head of the active list to the tail of the inactive list. | | 1231 | * them from the head of the active list to the tail of the inactive list. |
1236 | */ | | 1232 | */ |
1237 | static void | | 1233 | static void |
1238 | cache_deactivate(void) | | 1234 | cache_deactivate(void) |
1239 | { | | 1235 | { |
1240 | struct namecache *ncp; | | 1236 | struct namecache *ncp; |
1241 | int total, i; | | 1237 | int total, i; |
1242 | | | 1238 | |
1243 | KASSERT(mutex_owned(&cache_lru_lock)); | | 1239 | KASSERT(mutex_owned(&cache_lru_lock)); |
1244 | | | 1240 | |
1245 | /* If we're nowhere near budget yet, don't bother. */ | | 1241 | /* If we're nowhere near budget yet, don't bother. */ |
1246 | total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE]; | | 1242 | total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE]; |
1247 | if (total < (desiredvnodes >> 1)) { | | 1243 | if (total < (desiredvnodes >> 1)) { |
1248 | return; | | 1244 | return; |
1249 | } | | 1245 | } |
1250 | | | 1246 | |
1251 | /* | | 1247 | /* |
1252 | * Aim for a 1:1 ratio of active to inactive. This is to allow each | | 1248 | * Aim for a 1:1 ratio of active to inactive. This is to allow each |
1253 | * potential victim a reasonable amount of time to cycle through the | | 1249 | * potential victim a reasonable amount of time to cycle through the |
1254 | * inactive list in order to score a hit and be reactivated, while | | 1250 | * inactive list in order to score a hit and be reactivated, while |
1255 | * trying not to cause reactivations too frequently. | | 1251 | * trying not to cause reactivations too frequently. |
1256 | */ | | 1252 | */ |
1257 | if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) { | | 1253 | if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) { |
1258 | return; | | 1254 | return; |
1259 | } | | 1255 | } |
1260 | | | 1256 | |
1261 | /* Move only a few at a time; will catch up eventually. */ | | 1257 | /* Move only a few at a time; will catch up eventually. */ |
1262 | for (i = 0; i < cache_lru_maxdeact; i++) { | | 1258 | for (i = 0; i < cache_lru_maxdeact; i++) { |
1263 | ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]); | | 1259 | ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]); |
1264 | if (ncp == NULL) { | | 1260 | if (ncp == NULL) { |
1265 | break; | | 1261 | break; |
1266 | } | | 1262 | } |
1267 | KASSERT(ncp->nc_lrulist == LRU_ACTIVE); | | 1263 | KASSERT(ncp->nc_lrulist == LRU_ACTIVE); |
1268 | ncp->nc_lrulist = LRU_INACTIVE; | | 1264 | ncp->nc_lrulist = LRU_INACTIVE; |
1269 | TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); | | 1265 | TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); |
1270 | TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru); | | 1266 | TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru); |
1271 | cache_lru.count[LRU_ACTIVE]--; | | 1267 | cache_lru.count[LRU_ACTIVE]--; |
1272 | cache_lru.count[LRU_INACTIVE]++; | | 1268 | cache_lru.count[LRU_INACTIVE]++; |
1273 | } | | 1269 | } |
1274 | } | | 1270 | } |
1275 | | | 1271 | |
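/*
 * A worked example of the thresholds above, with illustrative
 * numbers: if desiredvnodes is 10000, nothing is deactivated until
 * the cache holds 5000 entries.  Past that, entries migrate from
 * ACTIVE to INACTIVE only while ACTIVE >= INACTIVE, nudging the lists
 * toward the 1:1 split, at most cache_lru_maxdeact entries per call.
 */
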
1276 | /* | | 1272 | /* |
1277 | * Free some entries from the cache, when we have gone over budget. | | 1273 | * Free some entries from the cache, when we have gone over budget. |
1278 | * | | 1274 | * |
1279 | * We don't want to cause too much work for any individual caller, and it | | 1275 | * We don't want to cause too much work for any individual caller, and it |
1280 | * doesn't matter if we temporarily go over budget. This is also "just a | | 1276 | * doesn't matter if we temporarily go over budget. This is also "just a |
1281 | * cache" so it's not a big deal if we screw up and throw out something we | | 1277 | * cache" so it's not a big deal if we screw up and throw out something we |
1282 | * shouldn't. So we take a relaxed attitude to this process to reduce its | | 1278 | * shouldn't. So we take a relaxed attitude to this process to reduce its |
1283 | * impact. | | 1279 | * impact. |
1284 | */ | | 1280 | */ |
1285 | static void | | 1281 | static void |
1286 | cache_reclaim(void) | | 1282 | cache_reclaim(void) |
1287 | { | | 1283 | { |
1288 | struct namecache *ncp; | | 1284 | struct namecache *ncp; |
1289 | vnode_impl_t *dvi; | | 1285 | vnode_impl_t *dvi; |
1290 | int toscan; | | 1286 | int toscan; |
1291 | | | 1287 | |
1292 | /* | | 1288 | /* |
1293 | * Scan up to a preset maximum number of entries, but no more than | | 1289 | * Scan up to a preset maximum number of entries, but no more than
1294 | * 0.8% of the total at once (to allow for very small systems). | | 1290 | * 0.8% of the total at once (to allow for very small systems). |
1295 | * | | 1291 | * |
1296 | * On bigger systems, do a larger chunk of work to reduce the number | | 1292 | * On bigger systems, do a larger chunk of work to reduce the number |
1297 | * of times that cache_lru_lock is held for any length of time. | | 1293 | * of times that cache_lru_lock is held for any length of time. |
1298 | */ | | 1294 | */ |
1299 | mutex_enter(&cache_lru_lock); | | 1295 | mutex_enter(&cache_lru_lock); |
1300 | toscan = MIN(cache_lru_maxscan, desiredvnodes >> 7); | | 1296 | toscan = MIN(cache_lru_maxscan, desiredvnodes >> 7); |
1301 | toscan = MAX(toscan, 1); | | 1297 | toscan = MAX(toscan, 1); |
1302 | SDT_PROBE(vfs, namecache, prune, done, cache_lru.count[LRU_ACTIVE] + | | 1298 | SDT_PROBE(vfs, namecache, prune, done, cache_lru.count[LRU_ACTIVE] + |
1303 | cache_lru.count[LRU_INACTIVE], toscan, 0, 0, 0); | | 1299 | cache_lru.count[LRU_INACTIVE], toscan, 0, 0, 0); |
1304 | while (toscan-- != 0) { | | 1300 | while (toscan-- != 0) { |
1305 | /* First try to balance the lists. */ | | 1301 | /* First try to balance the lists. */ |
1306 | cache_deactivate(); | | 1302 | cache_deactivate(); |
1307 | | | 1303 | |
1308 | /* Now look for a victim on head of inactive list (old). */ | | 1304 | /* Now look for a victim on head of inactive list (old). */ |
1309 | ncp = TAILQ_FIRST(&cache_lru.list[LRU_INACTIVE]); | | 1305 | ncp = TAILQ_FIRST(&cache_lru.list[LRU_INACTIVE]); |
1310 | if (ncp == NULL) { | | 1306 | if (ncp == NULL) { |
1311 | break; | | 1307 | break; |
1312 | } | | 1308 | } |
1313 | dvi = VNODE_TO_VIMPL(ncp->nc_dvp); | | 1309 | dvi = VNODE_TO_VIMPL(ncp->nc_dvp); |
1314 | KASSERT(ncp->nc_lrulist == LRU_INACTIVE); | | 1310 | KASSERT(ncp->nc_lrulist == LRU_INACTIVE); |
1315 | KASSERT(dvi != NULL); | | 1311 | KASSERT(dvi != NULL); |
1316 | | | 1312 | |
1317 | /* | | 1313 | /* |
1318 | * Locking in the wrong direction. If we can't get the | | 1314 | * Locking in the wrong direction. If we can't get the |
1319 | * lock, the directory is actively busy, and it could also | | 1315 | * lock, the directory is actively busy, and it could also |
1320 | * cause problems for the next guy in here, so send the | | 1316 | * cause problems for the next guy in here, so send the |
1321 | * entry to the back of the list. | | 1317 | * entry to the back of the list. |
1322 | */ | | 1318 | */ |
1323 | if (!rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) { | | 1319 | if (!rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) { |
1324 | TAILQ_REMOVE(&cache_lru.list[LRU_INACTIVE], | | 1320 | TAILQ_REMOVE(&cache_lru.list[LRU_INACTIVE], |
1325 | ncp, nc_lru); | | 1321 | ncp, nc_lru); |
1326 | TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], | | 1322 | TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], |
1327 | ncp, nc_lru); | | 1323 | ncp, nc_lru); |
1328 | continue; | | 1324 | continue; |
1329 | } | | 1325 | } |
1330 | | | 1326 | |
1331 | /* | | 1327 | /* |
1332 | * Now have the victim entry locked. Drop the LRU list | | 1328 | * Now have the victim entry locked. Drop the LRU list |
1333 | * lock, purge the entry, and start over. The hold on | | 1329 | * lock, purge the entry, and start over. The hold on |
1334 | * vi_nc_lock will prevent the vnode from vanishing until | | 1330 | * vi_nc_lock will prevent the vnode from vanishing until |
1335 | * finished (cache_purge() will be called on dvp before it | | 1331 | * finished (cache_purge() will be called on dvp before it |
1336 | * disappears, and that will wait on vi_nc_lock). | | 1332 | * disappears, and that will wait on vi_nc_lock). |
1337 | */ | | 1333 | */ |
1338 | mutex_exit(&cache_lru_lock); | | 1334 | mutex_exit(&cache_lru_lock); |
1339 | cache_remove(ncp, true); | | 1335 | cache_remove(ncp, true); |
1340 | rw_exit(&dvi->vi_nc_lock); | | 1336 | rw_exit(&dvi->vi_nc_lock); |
1341 | mutex_enter(&cache_lru_lock); | | 1337 | mutex_enter(&cache_lru_lock); |
1342 | } | | 1338 | } |
1343 | mutex_exit(&cache_lru_lock); | | 1339 | mutex_exit(&cache_lru_lock); |
1344 | } | | 1340 | } |
1345 | | | 1341 | |
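/*
 * A worked example of the scan budget above: ">> 7" is 1/128, i.e.
 * roughly the 0.8% mentioned, so desiredvnodes = 100000 gives
 * toscan = MIN(cache_lru_maxscan, 781), while very small systems are
 * floored at one entry per call by the MAX().
 */
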
1346 | /* | | 1342 | /* |
1347 | * For file system code: count a lookup that required a full re-scan of | | 1343 | * For file system code: count a lookup that required a full re-scan of |
1348 | * directory metadata. | | 1344 | * directory metadata. |
1349 | */ | | 1345 | */ |
1350 | void | | 1346 | void |
1351 | namecache_count_pass2(void) | | 1347 | namecache_count_pass2(void) |
1352 | { | | 1348 | { |
1353 | | | 1349 | |
1354 | COUNT(ncs_pass2); | | 1350 | COUNT(ncs_pass2); |
1355 | } | | 1351 | } |
1356 | | | 1352 | |
1357 | /* | | 1353 | /* |
1358 | * For file system code: count a lookup that scored a hit in the directory | | 1354 | * For file system code: count a lookup that scored a hit in the directory |
1359 | * metadata near the location of the last lookup. | | 1355 | * metadata near the location of the last lookup. |
1360 | */ | | 1356 | */ |
1361 | void | | 1357 | void |
1362 | namecache_count_2passes(void) | | 1358 | namecache_count_2passes(void) |
1363 | { | | 1359 | { |
1364 | | | 1360 | |
1365 | COUNT(ncs_2passes); | | 1361 | COUNT(ncs_2passes); |
1366 | } | | 1362 | } |
1367 | | | 1363 | |
1368 | /* | | 1364 | /* |
1369 | * Sum the stats from all CPUs into nchstats. This needs to run at least | | 1365 | * Sum the stats from all CPUs into nchstats. This needs to run at least |
1370 | * once within every window where a 32-bit counter could roll over. It's | | 1366 | * once within every window where a 32-bit counter could roll over. It's |
1371 | * called regularly by a timer to ensure this. | | 1367 | * called regularly by a timer to ensure this.
1372 | */ | | 1368 | */ |
1373 | static void | | 1369 | static void |
1374 | cache_update_stats(void *cookie) | | 1370 | cache_update_stats(void *cookie) |
1375 | { | | 1371 | { |
1376 | CPU_INFO_ITERATOR cii; | | 1372 | CPU_INFO_ITERATOR cii; |
1377 | struct cpu_info *ci; | | 1373 | struct cpu_info *ci; |
1378 | | | 1374 | |
1379 | mutex_enter(&cache_stat_lock); | | 1375 | mutex_enter(&cache_stat_lock); |
1380 | for (CPU_INFO_FOREACH(cii, ci)) { | | 1376 | for (CPU_INFO_FOREACH(cii, ci)) { |
1381 | struct nchcpu *nchcpu = ci->ci_data.cpu_nch; | | 1377 | struct nchcpu *nchcpu = ci->ci_data.cpu_nch; |
1382 | UPDATE(nchcpu, ncs_goodhits); | | 1378 | UPDATE(nchcpu, ncs_goodhits); |
1383 | UPDATE(nchcpu, ncs_neghits); | | 1379 | UPDATE(nchcpu, ncs_neghits); |
1384 | UPDATE(nchcpu, ncs_badhits); | | 1380 | UPDATE(nchcpu, ncs_badhits); |
1385 | UPDATE(nchcpu, ncs_falsehits); | | 1381 | UPDATE(nchcpu, ncs_falsehits); |
1386 | UPDATE(nchcpu, ncs_miss); | | 1382 | UPDATE(nchcpu, ncs_miss); |
1387 | UPDATE(nchcpu, ncs_long); | | 1383 | UPDATE(nchcpu, ncs_long); |
1388 | UPDATE(nchcpu, ncs_pass2); | | 1384 | UPDATE(nchcpu, ncs_pass2); |
1389 | UPDATE(nchcpu, ncs_2passes); | | 1385 | UPDATE(nchcpu, ncs_2passes); |
1390 | UPDATE(nchcpu, ncs_revhits); | | 1386 | UPDATE(nchcpu, ncs_revhits); |
1391 | UPDATE(nchcpu, ncs_revmiss); | | 1387 | UPDATE(nchcpu, ncs_revmiss); |
1392 | UPDATE(nchcpu, ncs_denied); | | 1388 | UPDATE(nchcpu, ncs_denied); |
1393 | } | | 1389 | } |
1394 | if (cookie != NULL) { | | 1390 | if (cookie != NULL) { |
1395 | memcpy(cookie, &nchstats, sizeof(nchstats)); | | 1391 | memcpy(cookie, &nchstats, sizeof(nchstats)); |
1396 | } | | 1392 | } |
1397 | /* Reset the timer; arrive back here in N minutes at the latest. */ | | 1393 | /* Reset the timer; arrive back here in N minutes at the latest. */
1398 | callout_schedule(&cache_stat_callout, cache_stat_interval * hz); | | 1394 | callout_schedule(&cache_stat_callout, cache_stat_interval * hz); |
1399 | mutex_exit(&cache_stat_lock); | | 1395 | mutex_exit(&cache_stat_lock); |
1400 | } | | 1396 | } |
1401 | | | 1397 | |
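/*
 * Back-of-envelope for the rollover window, at an illustrative rate:
 * a 32-bit counter wraps after 2^32 events, so even at a very hot 10
 * million lookups per second on one CPU that takes about 429 seconds;
 * any cache_stat_interval comfortably below that keeps the global
 * sums exact.
 */
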
1402 | /* | | 1398 | /* |
1403 | * Fetch the current values of the stats for sysctl. | | 1399 | * Fetch the current values of the stats for sysctl. |
1404 | */ | | 1400 | */ |
1405 | static int | | 1401 | static int |
1406 | cache_stat_sysctl(SYSCTLFN_ARGS) | | 1402 | cache_stat_sysctl(SYSCTLFN_ARGS) |
1407 | { | | 1403 | { |
1408 | struct nchstats stats; | | 1404 | struct nchstats stats; |
1409 | | | 1405 | |
1410 | if (oldp == NULL) { | | 1406 | if (oldp == NULL) { |
1411 | *oldlenp = sizeof(nchstats); | | 1407 | *oldlenp = sizeof(nchstats); |
1412 | return 0; | | 1408 | return 0; |
1413 | } | | 1409 | } |
1414 | | | 1410 | |
1415 | if (*oldlenp <= 0) { | | 1411 | if (*oldlenp <= 0) { |
1416 | *oldlenp = 0; | | 1412 | *oldlenp = 0; |
1417 | return 0; | | 1413 | return 0; |
1418 | } | | 1414 | } |
1419 | | | 1415 | |
1420 | /* Refresh the global stats. */ | | 1416 | /* Refresh the global stats. */ |
1421 | sysctl_unlock(); | | 1417 | sysctl_unlock(); |
1422 | cache_update_stats(&stats); | | 1418 | cache_update_stats(&stats); |
1423 | sysctl_relock(); | | 1419 | sysctl_relock(); |
1424 | | | 1420 | |
1425 | *oldlenp = MIN(sizeof(stats), *oldlenp); | | 1421 | *oldlenp = MIN(sizeof(stats), *oldlenp); |
1426 | return sysctl_copyout(l, &stats, oldp, *oldlenp); | | 1422 | return sysctl_copyout(l, &stats, oldp, *oldlenp); |
1427 | } | | 1423 | } |
1428 | | | 1424 | |
1429 | /* | | 1425 | /* |
1430 | * For the debugger, given the address of a vnode, print all associated | | 1426 | * For the debugger, given the address of a vnode, print all associated |
1431 | * names in the cache. | | 1427 | * names in the cache. |
1432 | */ | | 1428 | */ |
1433 | #ifdef DDB | | 1429 | #ifdef DDB |
1434 | void | | 1430 | void |
1435 | namecache_print(struct vnode *vp, void (*pr)(const char *, ...)) | | 1431 | namecache_print(struct vnode *vp, void (*pr)(const char *, ...)) |
1436 | { | | 1432 | { |
1437 | struct vnode *dvp = NULL; | | 1433 | struct vnode *dvp = NULL; |
1438 | struct namecache *ncp; | | 1434 | struct namecache *ncp; |
1439 | enum cache_lru_id id; | | 1435 | enum cache_lru_id id; |
1440 | | | 1436 | |
1441 | for (id = 0; id < LRU_COUNT; id++) { | | 1437 | for (id = 0; id < LRU_COUNT; id++) { |
1442 | TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) { | | 1438 | TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) { |
1443 | if (ncp->nc_vp == vp) { | | 1439 | if (ncp->nc_vp == vp) { |
1444 | (*pr)("name %.*s\n", ncp->nc_nlen, | | 1440 | (*pr)("name %.*s\n", ncp->nc_nlen, |
1445 | ncp->nc_name); | | 1441 | ncp->nc_name); |
1446 | dvp = ncp->nc_dvp; | | 1442 | dvp = ncp->nc_dvp; |
1447 | } | | 1443 | } |
1448 | } | | 1444 | } |
1449 | } | | 1445 | } |
1450 | if (dvp == NULL) { | | 1446 | if (dvp == NULL) { |
1451 | (*pr)("name not found\n"); | | 1447 | (*pr)("name not found\n"); |
1452 | return; | | 1448 | return; |
1453 | } | | 1449 | } |
1454 | for (id = 0; id < LRU_COUNT; id++) { | | 1450 | for (id = 0; id < LRU_COUNT; id++) { |
1455 | TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) { | | 1451 | TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) { |
1456 | if (ncp->nc_vp == dvp) { | | 1452 | if (ncp->nc_vp == dvp) { |
1457 | (*pr)("parent %.*s\n", ncp->nc_nlen, | | 1453 | (*pr)("parent %.*s\n", ncp->nc_nlen, |
1458 | ncp->nc_name); | | 1454 | ncp->nc_name); |
1459 | } | | 1455 | } |
1460 | } | | 1456 | } |
1461 | } | | 1457 | } |
1462 | } | | 1458 | } |
1463 | #endif | | 1459 | #endif |