Thu Apr 23 22:58:36 2020 UTC
cache_lookup_linked(): We can't use the name being looked up to decide how
to lock the directory, since the name refers to the child (the object being
found), not the parent (the directory that is actually being locked).

Fix it by always using rw_tryenter().  There's little to be gained by
optimising for the contended case, and once this routine does lockless
lookups (the eventual goal) it won't be waiting around for changes either.
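
For readers skimming the side-by-side diff below, here is the heart of the
change in cache_lookup_linked(), reproduced from the two revisions (my
reading: a failed try-lock is simply reported as a cache miss, and namei()
retries via the usual vnode-locked lookup path):

	/* Before (1.140): block for ordinary names, try-lock only for ".." */
	if (*plock != &dvi->vi_nc_lock) {
		if (namelen == 2 && name[0] == '.' && name[1] == '.') {
			if (!rw_tryenter(&dvi->vi_nc_lock, RW_READER)) {
				return false;
			}
		} else {
			rw_enter(&dvi->vi_nc_lock, RW_READER);
		}
		if (*plock != NULL) {
			rw_exit(*plock);
		}
		*plock = &dvi->vi_nc_lock;
	}

	/* After (1.141): always try-lock, never wait on the directory */
	if (*plock != &dvi->vi_nc_lock) {
		if (!rw_tryenter(&dvi->vi_nc_lock, RW_READER)) {
			return false;
		}
		if (*plock != NULL) {
			rw_exit(*plock);
		}
		*plock = &dvi->vi_nc_lock;
	}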


(ad)

cvs diff -r1.140 -r1.141 src/sys/kern/vfs_cache.c

--- src/sys/kern/vfs_cache.c 2020/04/22 21:35:52 1.140
+++ src/sys/kern/vfs_cache.c 2020/04/23 22:58:36 1.141
@@ -1,1463 +1,1459 @@
1/* $NetBSD: vfs_cache.c,v 1.140 2020/04/22 21:35:52 ad Exp $ */ 1/* $NetBSD: vfs_cache.c,v 1.141 2020/04/23 22:58:36 ad Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran. 8 * by Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright 15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the 16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution. 17 * documentation and/or other materials provided with the distribution.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32/* 32/*
33 * Copyright (c) 1989, 1993 33 * Copyright (c) 1989, 1993
34 * The Regents of the University of California. All rights reserved. 34 * The Regents of the University of California. All rights reserved.
35 * 35 *
36 * Redistribution and use in source and binary forms, with or without 36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions 37 * modification, are permitted provided that the following conditions
38 * are met: 38 * are met:
39 * 1. Redistributions of source code must retain the above copyright 39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer. 40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright 41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the 42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution. 43 * documentation and/or other materials provided with the distribution.
44 * 3. Neither the name of the University nor the names of its contributors 44 * 3. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software 45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission. 46 * without specific prior written permission.
47 * 47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE. 58 * SUCH DAMAGE.
59 * 59 *
60 * @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94 60 * @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94
61 */ 61 */
62 62
63/* 63/*
64 * Name caching: 64 * Name caching:
65 * 65 *
66 * Names found by directory scans are retained in a cache for future 66 * Names found by directory scans are retained in a cache for future
67 * reference. It is managed LRU, so frequently used names will hang 67 * reference. It is managed LRU, so frequently used names will hang
68 * around. The cache is indexed by hash value obtained from the name. 68 * around. The cache is indexed by hash value obtained from the name.
69 * 69 *
70 * The name cache is the brainchild of Robert Elz and was introduced in 70 * The name cache is the brainchild of Robert Elz and was introduced in
71 * 4.3BSD. See "Using gprof to Tune the 4.2BSD Kernel", Marshall Kirk 71 * 4.3BSD. See "Using gprof to Tune the 4.2BSD Kernel", Marshall Kirk
72 * McKusick, May 21 1984. 72 * McKusick, May 21 1984.
73 * 73 *
74 * Data structures: 74 * Data structures:
75 * 75 *
76 * Most Unix namecaches very sensibly use a global hash table to index 76 * Most Unix namecaches very sensibly use a global hash table to index
77 * names. The global hash table works well, but can cause concurrency 77 * names. The global hash table works well, but can cause concurrency
78 * headaches for the kernel hacker. In the NetBSD 10.0 implementation 78 * headaches for the kernel hacker. In the NetBSD 10.0 implementation
79 * we are not sensible, and use a per-directory data structure to index 79 * we are not sensible, and use a per-directory data structure to index
80 * names, but the cache otherwise functions the same. 80 * names, but the cache otherwise functions the same.
81 * 81 *
82 * The index is a red-black tree. There are no special concurrency 82 * The index is a red-black tree. There are no special concurrency
83 * requirements placed on it, because it's per-directory and protected 83 * requirements placed on it, because it's per-directory and protected
84 * by the namecache's per-directory locks. It should therefore not be 84 * by the namecache's per-directory locks. It should therefore not be
85 * difficult to experiment with other types of index. 85 * difficult to experiment with other types of index.
86 * 86 *
87 * Each cached name is stored in a struct namecache, along with a 87 * Each cached name is stored in a struct namecache, along with a
88 * pointer to the associated vnode (nc_vp). Names longer than a 88 * pointer to the associated vnode (nc_vp). Names longer than a
89 * maximum length of NCHNAMLEN are allocated with kmem_alloc(); they 89 * maximum length of NCHNAMLEN are allocated with kmem_alloc(); they
90 * occur infrequently, and names shorter than this are stored directly 90 * occur infrequently, and names shorter than this are stored directly
91 * in struct namecache. If it is a "negative" entry, (i.e. for a name 91 * in struct namecache. If it is a "negative" entry, (i.e. for a name
92 * that is known NOT to exist) the vnode pointer will be NULL. 92 * that is known NOT to exist) the vnode pointer will be NULL.
93 * 93 *
94 * For a directory with 3 cached names for 3 distinct vnodes, the 94 * For a directory with 3 cached names for 3 distinct vnodes, the
95 * various vnodes and namecache structs would be connected like this 95 * various vnodes and namecache structs would be connected like this
96 * (the root is at the bottom of the diagram): 96 * (the root is at the bottom of the diagram):
97 * 97 *
98 * ... 98 * ...
99 * ^ 99 * ^
100 * |- vi_nc_tree 100 * |- vi_nc_tree
101 * |  101 * |
102 * +----o----+ +---------+ +---------+ 102 * +----o----+ +---------+ +---------+
103 * | VDIR | | VCHR | | VREG | 103 * | VDIR | | VCHR | | VREG |
104 * | vnode o-----+ | vnode o-----+ | vnode o------+ 104 * | vnode o-----+ | vnode o-----+ | vnode o------+
105 * +---------+ | +---------+ | +---------+ | 105 * +---------+ | +---------+ | +---------+ |
106 * ^ | ^ | ^ | 106 * ^ | ^ | ^ |
107 * |- nc_vp |- vi_nc_list |- nc_vp |- vi_nc_list |- nc_vp | 107 * |- nc_vp |- vi_nc_list |- nc_vp |- vi_nc_list |- nc_vp |
108 * | | | | | | 108 * | | | | | |
109 * +----o----+ | +----o----+ | +----o----+ | 109 * +----o----+ | +----o----+ | +----o----+ |
110 * +---onamecache|<----+ +---onamecache|<----+ +---onamecache|<-----+ 110 * +---onamecache|<----+ +---onamecache|<----+ +---onamecache|<-----+
111 * | +---------+ | +---------+ | +---------+ 111 * | +---------+ | +---------+ | +---------+
112 * | ^ | ^ | ^ 112 * | ^ | ^ | ^
113 * | | | | | | 113 * | | | | | |
114 * | | +----------------------+ | | 114 * | | +----------------------+ | |
115 * |-nc_dvp | +-------------------------------------------------+ 115 * |-nc_dvp | +-------------------------------------------------+
116 * | |/- vi_nc_tree | | 116 * | |/- vi_nc_tree | |
117 * | | |- nc_dvp |- nc_dvp 117 * | | |- nc_dvp |- nc_dvp
118 * | +----o----+ | | 118 * | +----o----+ | |
119 * +-->| VDIR |<----------+ | 119 * +-->| VDIR |<----------+ |
120 * | vnode |<------------------------------------+ 120 * | vnode |<------------------------------------+
121 * +---------+ 121 * +---------+
122 * 122 *
123 * START HERE 123 * START HERE
124 * 124 *
125 * Replacement: 125 * Replacement:
126 * 126 *
127 * As the cache becomes full, old and unused entries are purged as new 127 * As the cache becomes full, old and unused entries are purged as new
128 * entries are added. The synchronization overhead in maintaining a 128 * entries are added. The synchronization overhead in maintaining a
129 * strict ordering would be prohibitive, so the VM system's "clock" or 129 * strict ordering would be prohibitive, so the VM system's "clock" or
130 * "second chance" page replacement algorithm is aped here. New 130 * "second chance" page replacement algorithm is aped here. New
131 * entries go to the tail of the active list. After they age out and 131 * entries go to the tail of the active list. After they age out and
132 * reach the head of the list, they are moved to the tail of the 132 * reach the head of the list, they are moved to the tail of the
133 * inactive list. Any use of the deactivated cache entry reactivates 133 * inactive list. Any use of the deactivated cache entry reactivates
134 * it, saving it from impending doom; if not reactivated, the entry 134 * it, saving it from impending doom; if not reactivated, the entry
135 * eventually reaches the head of the inactive list and is purged. 135 * eventually reaches the head of the inactive list and is purged.
136 * 136 *
137 * Concurrency: 137 * Concurrency:
138 * 138 *
139 * From a performance perspective, cache_lookup(nameiop == LOOKUP) is 139 * From a performance perspective, cache_lookup(nameiop == LOOKUP) is
140 * what really matters; insertion of new entries with cache_enter() is 140 * what really matters; insertion of new entries with cache_enter() is
141 * comparatively infrequent, and overshadowed by the cost of expensive 141 * comparatively infrequent, and overshadowed by the cost of expensive
142 * file system metadata operations (which may involve disk I/O). We 142 * file system metadata operations (which may involve disk I/O). We
143 * therefore want to make everything simplest in the lookup path. 143 * therefore want to make everything simplest in the lookup path.
144 * 144 *
145 * struct namecache is mostly stable except for list and tree related 145 * struct namecache is mostly stable except for list and tree related
146 * entries, changes to which don't affect the cached name or vnode.  146 * entries, changes to which don't affect the cached name or vnode.
147 * For changes to name+vnode, entries are purged in preference to 147 * For changes to name+vnode, entries are purged in preference to
148 * modifying them. 148 * modifying them.
149 * 149 *
150 * Read access to namecache entries is made via tree, list, or LRU 150 * Read access to namecache entries is made via tree, list, or LRU
151 * list. A lock corresponding to the direction of access should be 151 * list. A lock corresponding to the direction of access should be
152 * held. See definition of "struct namecache" in src/sys/namei.src, 152 * held. See definition of "struct namecache" in src/sys/namei.src,
153 * and the definition of "struct vnode" for the particulars. 153 * and the definition of "struct vnode" for the particulars.
154 * 154 *
155 * Per-CPU statistics, and LRU list totals are read unlocked, since 155 * Per-CPU statistics, and LRU list totals are read unlocked, since
156 * an approximate value is OK. We maintain 32-bit sized per-CPU 156 * an approximate value is OK. We maintain 32-bit sized per-CPU
157 * counters and 64-bit global counters under the theory that 32-bit 157 * counters and 64-bit global counters under the theory that 32-bit
158 * sized counters are less likely to be hosed by nonatomic increment 158 * sized counters are less likely to be hosed by nonatomic increment
159 * (on 32-bit platforms). 159 * (on 32-bit platforms).
160 * 160 *
161 * The lock order is: 161 * The lock order is:
162 * 162 *
163 * 1) vi->vi_nc_lock (tree or parent -> child direction, 163 * 1) vi->vi_nc_lock (tree or parent -> child direction,
164 * used during forward lookup) 164 * used during forward lookup)
165 * 165 *
166 * 2) vi->vi_nc_listlock (list or child -> parent direction, 166 * 2) vi->vi_nc_listlock (list or child -> parent direction,
167 * used during reverse lookup) 167 * used during reverse lookup)
168 * 168 *
169 * 3) cache_lru_lock (LRU list direction, used during reclaim) 169 * 3) cache_lru_lock (LRU list direction, used during reclaim)
170 * 170 *
171 * 4) vp->v_interlock (what the cache entry points to) 171 * 4) vp->v_interlock (what the cache entry points to)
172 */ 172 */
173 173
174#include <sys/cdefs.h> 174#include <sys/cdefs.h>
175__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.140 2020/04/22 21:35:52 ad Exp $"); 175__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.141 2020/04/23 22:58:36 ad Exp $");
176 176
177#define __NAMECACHE_PRIVATE 177#define __NAMECACHE_PRIVATE
178#ifdef _KERNEL_OPT 178#ifdef _KERNEL_OPT
179#include "opt_ddb.h" 179#include "opt_ddb.h"
180#include "opt_dtrace.h" 180#include "opt_dtrace.h"
181#endif 181#endif
182 182
183#include <sys/types.h> 183#include <sys/types.h>
184#include <sys/atomic.h> 184#include <sys/atomic.h>
185#include <sys/callout.h> 185#include <sys/callout.h>
186#include <sys/cpu.h> 186#include <sys/cpu.h>
187#include <sys/errno.h> 187#include <sys/errno.h>
188#include <sys/evcnt.h> 188#include <sys/evcnt.h>
189#include <sys/hash.h> 189#include <sys/hash.h>
190#include <sys/kernel.h> 190#include <sys/kernel.h>
191#include <sys/mount.h> 191#include <sys/mount.h>
192#include <sys/mutex.h> 192#include <sys/mutex.h>
193#include <sys/namei.h> 193#include <sys/namei.h>
194#include <sys/param.h> 194#include <sys/param.h>
195#include <sys/pool.h> 195#include <sys/pool.h>
196#include <sys/sdt.h> 196#include <sys/sdt.h>
197#include <sys/sysctl.h> 197#include <sys/sysctl.h>
198#include <sys/systm.h> 198#include <sys/systm.h>
199#include <sys/time.h> 199#include <sys/time.h>
200#include <sys/vnode_impl.h> 200#include <sys/vnode_impl.h>
201 201
202#include <miscfs/genfs/genfs.h> 202#include <miscfs/genfs/genfs.h>
203 203
204static void cache_activate(struct namecache *); 204static void cache_activate(struct namecache *);
205static void cache_update_stats(void *); 205static void cache_update_stats(void *);
206static int cache_compare_nodes(void *, const void *, const void *); 206static int cache_compare_nodes(void *, const void *, const void *);
207static void cache_deactivate(void); 207static void cache_deactivate(void);
208static void cache_reclaim(void); 208static void cache_reclaim(void);
209static int cache_stat_sysctl(SYSCTLFN_ARGS); 209static int cache_stat_sysctl(SYSCTLFN_ARGS);
210 210
211/* 211/*
212 * Global pool cache. 212 * Global pool cache.
213 */ 213 */
214static pool_cache_t cache_pool __read_mostly; 214static pool_cache_t cache_pool __read_mostly;
215 215
216/* 216/*
217 * LRU replacement. 217 * LRU replacement.
218 */ 218 */
219enum cache_lru_id { 219enum cache_lru_id {
220 LRU_ACTIVE, 220 LRU_ACTIVE,
221 LRU_INACTIVE, 221 LRU_INACTIVE,
222 LRU_COUNT 222 LRU_COUNT
223}; 223};
224 224
225static struct { 225static struct {
226 TAILQ_HEAD(, namecache) list[LRU_COUNT]; 226 TAILQ_HEAD(, namecache) list[LRU_COUNT];
227 u_int count[LRU_COUNT]; 227 u_int count[LRU_COUNT];
228} cache_lru __cacheline_aligned; 228} cache_lru __cacheline_aligned;
229 229
230static kmutex_t cache_lru_lock __cacheline_aligned; 230static kmutex_t cache_lru_lock __cacheline_aligned;
231 231
232/* 232/*
233 * Cache effectiveness statistics. nchstats holds system-wide total. 233 * Cache effectiveness statistics. nchstats holds system-wide total.
234 */ 234 */
235struct nchstats nchstats; 235struct nchstats nchstats;
236struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t); 236struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);
237struct nchcpu { 237struct nchcpu {
238 struct nchstats_percpu cur; 238 struct nchstats_percpu cur;
239 struct nchstats_percpu last; 239 struct nchstats_percpu last;
240}; 240};
241static callout_t cache_stat_callout; 241static callout_t cache_stat_callout;
242static kmutex_t cache_stat_lock __cacheline_aligned; 242static kmutex_t cache_stat_lock __cacheline_aligned;
243 243
244#define COUNT(f) do { \ 244#define COUNT(f) do { \
245 lwp_t *l = curlwp; \ 245 lwp_t *l = curlwp; \
246 KPREEMPT_DISABLE(l); \ 246 KPREEMPT_DISABLE(l); \
247 ((struct nchstats_percpu *)curcpu()->ci_data.cpu_nch)->f++; \ 247 ((struct nchstats_percpu *)curcpu()->ci_data.cpu_nch)->f++; \
248 KPREEMPT_ENABLE(l); \ 248 KPREEMPT_ENABLE(l); \
249} while (/* CONSTCOND */ 0); 249} while (/* CONSTCOND */ 0);
250 250
251#define UPDATE(nchcpu, f) do { \ 251#define UPDATE(nchcpu, f) do { \
252 uint32_t cur = atomic_load_relaxed(&nchcpu->cur.f); \ 252 uint32_t cur = atomic_load_relaxed(&nchcpu->cur.f); \
253 nchstats.f += (uint32_t)(cur - nchcpu->last.f); \ 253 nchstats.f += (uint32_t)(cur - nchcpu->last.f); \
254 nchcpu->last.f = cur; \ 254 nchcpu->last.f = cur; \
255} while (/* CONSTCOND */ 0) 255} while (/* CONSTCOND */ 0)
256 256
257/* 257/*
258 * Tunables. cache_maxlen replaces the historical doingcache: 258 * Tunables. cache_maxlen replaces the historical doingcache:
259 * set it zero to disable caching for debugging purposes. 259 * set it zero to disable caching for debugging purposes.
260 */ 260 */
261int cache_lru_maxdeact __read_mostly = 2; /* max # to deactivate */ 261int cache_lru_maxdeact __read_mostly = 2; /* max # to deactivate */
262int cache_lru_maxscan __read_mostly = 64; /* max # to scan/reclaim */ 262int cache_lru_maxscan __read_mostly = 64; /* max # to scan/reclaim */
263int cache_maxlen __read_mostly = USHRT_MAX; /* max name length to cache */ 263int cache_maxlen __read_mostly = USHRT_MAX; /* max name length to cache */
264int cache_stat_interval __read_mostly = 300; /* in seconds */ 264int cache_stat_interval __read_mostly = 300; /* in seconds */
265 265
266/* 266/*
267 * sysctl stuff. 267 * sysctl stuff.
268 */ 268 */
269static struct sysctllog *cache_sysctllog; 269static struct sysctllog *cache_sysctllog;
270 270
271/* 271/*
272 * Red-black tree stuff. 272 * Red-black tree stuff.
273 */ 273 */
274static const rb_tree_ops_t cache_rbtree_ops = { 274static const rb_tree_ops_t cache_rbtree_ops = {
275 .rbto_compare_nodes = cache_compare_nodes, 275 .rbto_compare_nodes = cache_compare_nodes,
276 .rbto_compare_key = cache_compare_nodes, 276 .rbto_compare_key = cache_compare_nodes,
277 .rbto_node_offset = offsetof(struct namecache, nc_tree), 277 .rbto_node_offset = offsetof(struct namecache, nc_tree),
278 .rbto_context = NULL 278 .rbto_context = NULL
279}; 279};
280 280
281/* 281/*
282 * dtrace probes. 282 * dtrace probes.
283 */ 283 */
284SDT_PROVIDER_DEFINE(vfs); 284SDT_PROVIDER_DEFINE(vfs);
285 285
286SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *"); 286SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *");
287SDT_PROBE_DEFINE1(vfs, namecache, purge, parents, "struct vnode *"); 287SDT_PROBE_DEFINE1(vfs, namecache, purge, parents, "struct vnode *");
288SDT_PROBE_DEFINE1(vfs, namecache, purge, children, "struct vnode *"); 288SDT_PROBE_DEFINE1(vfs, namecache, purge, children, "struct vnode *");
289SDT_PROBE_DEFINE2(vfs, namecache, purge, name, "char *", "size_t"); 289SDT_PROBE_DEFINE2(vfs, namecache, purge, name, "char *", "size_t");
290SDT_PROBE_DEFINE1(vfs, namecache, purge, vfs, "struct mount *"); 290SDT_PROBE_DEFINE1(vfs, namecache, purge, vfs, "struct mount *");
291SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", 291SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *",
292 "char *", "size_t"); 292 "char *", "size_t");
293SDT_PROBE_DEFINE3(vfs, namecache, lookup, miss, "struct vnode *", 293SDT_PROBE_DEFINE3(vfs, namecache, lookup, miss, "struct vnode *",
294 "char *", "size_t"); 294 "char *", "size_t");
295SDT_PROBE_DEFINE3(vfs, namecache, lookup, toolong, "struct vnode *", 295SDT_PROBE_DEFINE3(vfs, namecache, lookup, toolong, "struct vnode *",
296 "char *", "size_t"); 296 "char *", "size_t");
297SDT_PROBE_DEFINE2(vfs, namecache, revlookup, success, "struct vnode *", 297SDT_PROBE_DEFINE2(vfs, namecache, revlookup, success, "struct vnode *",
298 "struct vnode *"); 298 "struct vnode *");
299SDT_PROBE_DEFINE2(vfs, namecache, revlookup, fail, "struct vnode *", 299SDT_PROBE_DEFINE2(vfs, namecache, revlookup, fail, "struct vnode *",
300 "int"); 300 "int");
301SDT_PROBE_DEFINE2(vfs, namecache, prune, done, "int", "int"); 301SDT_PROBE_DEFINE2(vfs, namecache, prune, done, "int", "int");
302SDT_PROBE_DEFINE3(vfs, namecache, enter, toolong, "struct vnode *", 302SDT_PROBE_DEFINE3(vfs, namecache, enter, toolong, "struct vnode *",
303 "char *", "size_t"); 303 "char *", "size_t");
304SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", 304SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *",
305 "char *", "size_t"); 305 "char *", "size_t");
306 306
307/* 307/*
308 * rbtree: compare two nodes. 308 * rbtree: compare two nodes.
309 */ 309 */
310static int 310static int
311cache_compare_nodes(void *context, const void *n1, const void *n2) 311cache_compare_nodes(void *context, const void *n1, const void *n2)
312{ 312{
313 const struct namecache *nc1 = n1; 313 const struct namecache *nc1 = n1;
314 const struct namecache *nc2 = n2; 314 const struct namecache *nc2 = n2;
315 315
316 if (nc1->nc_key < nc2->nc_key) { 316 if (nc1->nc_key < nc2->nc_key) {
317 return -1; 317 return -1;
318 } 318 }
319 if (nc1->nc_key > nc2->nc_key) { 319 if (nc1->nc_key > nc2->nc_key) {
320 return 1; 320 return 1;
321 } 321 }
322 KASSERT(nc1->nc_nlen == nc2->nc_nlen); 322 KASSERT(nc1->nc_nlen == nc2->nc_nlen);
323 return memcmp(nc1->nc_name, nc2->nc_name, nc1->nc_nlen); 323 return memcmp(nc1->nc_name, nc2->nc_name, nc1->nc_nlen);
324} 324}
325 325
326/* 326/*
327 * Compute a key value for the given name. The name length is encoded in 327 * Compute a key value for the given name. The name length is encoded in
328 * the key value to try and improve uniqueness, and so that length doesn't 328 * the key value to try and improve uniqueness, and so that length doesn't
329 * need to be compared separately for string comparisons. 329 * need to be compared separately for string comparisons.
330 */ 330 */
331static inline uint64_t 331static inline uint64_t
332cache_key(const char *name, size_t nlen) 332cache_key(const char *name, size_t nlen)
333{ 333{
334 uint64_t key; 334 uint64_t key;
335 335
336 KASSERT(nlen <= USHRT_MAX); 336 KASSERT(nlen <= USHRT_MAX);
337 337
338 key = hash32_buf(name, nlen, HASH32_STR_INIT); 338 key = hash32_buf(name, nlen, HASH32_STR_INIT);
339 return (key << 32) | nlen; 339 return (key << 32) | nlen;
340} 340}
341 341
342/* 342/*
343 * Remove an entry from the cache. vi_nc_lock must be held, and if dir2node 343 * Remove an entry from the cache. vi_nc_lock must be held, and if dir2node
344 * is true, then we're locking in the conventional direction and the list 344 * is true, then we're locking in the conventional direction and the list
345 * lock will be acquired when removing the entry from the vnode list. 345 * lock will be acquired when removing the entry from the vnode list.
346 */ 346 */
347static void 347static void
348cache_remove(struct namecache *ncp, const bool dir2node) 348cache_remove(struct namecache *ncp, const bool dir2node)
349{ 349{
350 struct vnode *vp, *dvp = ncp->nc_dvp; 350 struct vnode *vp, *dvp = ncp->nc_dvp;
351 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); 351 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
352 352
353 KASSERT(rw_write_held(&dvi->vi_nc_lock)); 353 KASSERT(rw_write_held(&dvi->vi_nc_lock));
354 KASSERT(cache_key(ncp->nc_name, ncp->nc_nlen) == ncp->nc_key); 354 KASSERT(cache_key(ncp->nc_name, ncp->nc_nlen) == ncp->nc_key);
355 KASSERT(rb_tree_find_node(&dvi->vi_nc_tree, ncp) == ncp); 355 KASSERT(rb_tree_find_node(&dvi->vi_nc_tree, ncp) == ncp);
356 356
357 SDT_PROBE(vfs, namecache, invalidate, done, ncp, 357 SDT_PROBE(vfs, namecache, invalidate, done, ncp,
358 0, 0, 0, 0); 358 0, 0, 0, 0);
359 359
360 /* 360 /*
361 * Remove from the vnode's list. This excludes cache_revlookup(), 361 * Remove from the vnode's list. This excludes cache_revlookup(),
362 * and then it's safe to remove from the LRU lists. 362 * and then it's safe to remove from the LRU lists.
363 */ 363 */
364 if ((vp = ncp->nc_vp) != NULL) { 364 if ((vp = ncp->nc_vp) != NULL) {
365 vnode_impl_t *vi = VNODE_TO_VIMPL(vp); 365 vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
366 if (__predict_true(dir2node)) { 366 if (__predict_true(dir2node)) {
367 rw_enter(&vi->vi_nc_listlock, RW_WRITER); 367 rw_enter(&vi->vi_nc_listlock, RW_WRITER);
368 TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list); 368 TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list);
369 rw_exit(&vi->vi_nc_listlock); 369 rw_exit(&vi->vi_nc_listlock);
370 } else { 370 } else {
371 TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list); 371 TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list);
372 } 372 }
373 } 373 }
374 374
375 /* Remove from the directory's rbtree. */ 375 /* Remove from the directory's rbtree. */
376 rb_tree_remove_node(&dvi->vi_nc_tree, ncp); 376 rb_tree_remove_node(&dvi->vi_nc_tree, ncp);
377 377
378 /* Remove from the LRU lists. */ 378 /* Remove from the LRU lists. */
379 mutex_enter(&cache_lru_lock); 379 mutex_enter(&cache_lru_lock);
380 TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru); 380 TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
381 cache_lru.count[ncp->nc_lrulist]--; 381 cache_lru.count[ncp->nc_lrulist]--;
382 mutex_exit(&cache_lru_lock); 382 mutex_exit(&cache_lru_lock);
383 383
384 /* Finally, free it. */ 384 /* Finally, free it. */
385 if (ncp->nc_nlen > NCHNAMLEN) { 385 if (ncp->nc_nlen > NCHNAMLEN) {
386 size_t sz = offsetof(struct namecache, nc_name[ncp->nc_nlen]); 386 size_t sz = offsetof(struct namecache, nc_name[ncp->nc_nlen]);
387 kmem_free(ncp, sz); 387 kmem_free(ncp, sz);
388 } else { 388 } else {
389 pool_cache_put(cache_pool, ncp); 389 pool_cache_put(cache_pool, ncp);
390 } 390 }
391} 391}
392 392
393/* 393/*
394 * Find a single cache entry and return it. vi_nc_lock must be held. 394 * Find a single cache entry and return it. vi_nc_lock must be held.
395 */ 395 */
396static struct namecache * __noinline 396static struct namecache * __noinline
397cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen, 397cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen,
398 uint64_t key) 398 uint64_t key)
399{ 399{
400 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); 400 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
401 struct rb_node *node = dvi->vi_nc_tree.rbt_root; 401 struct rb_node *node = dvi->vi_nc_tree.rbt_root;
402 struct namecache *ncp; 402 struct namecache *ncp;
403 int lrulist, diff; 403 int lrulist, diff;
404 404
405 KASSERT(rw_lock_held(&dvi->vi_nc_lock)); 405 KASSERT(rw_lock_held(&dvi->vi_nc_lock));
406 406
407 /* 407 /*
408 * Search the RB tree for the key. This is an inlined lookup 408 * Search the RB tree for the key. This is an inlined lookup
409 * tailored for exactly what's needed here (64-bit key and so on) 409 * tailored for exactly what's needed here (64-bit key and so on)
410 * that is quite a bit faster than using rb_tree_find_node().  410 * that is quite a bit faster than using rb_tree_find_node().
411 * 411 *
412 * For a matching key memcmp() needs to be called once to confirm 412 * For a matching key memcmp() needs to be called once to confirm
413 * that the correct name has been found. Very rarely there will be 413 * that the correct name has been found. Very rarely there will be
414 * a key value collision and the search will continue. 414 * a key value collision and the search will continue.
415 */ 415 */
416 for (;;) { 416 for (;;) {
417 if (__predict_false(RB_SENTINEL_P(node))) { 417 if (__predict_false(RB_SENTINEL_P(node))) {
418 return NULL; 418 return NULL;
419 } 419 }
420 ncp = (struct namecache *)node; 420 ncp = (struct namecache *)node;
421 KASSERT((void *)&ncp->nc_tree == (void *)ncp); 421 KASSERT((void *)&ncp->nc_tree == (void *)ncp);
422 KASSERT(ncp->nc_dvp == dvp); 422 KASSERT(ncp->nc_dvp == dvp);
423 if (ncp->nc_key == key) { 423 if (ncp->nc_key == key) {
424 KASSERT(ncp->nc_nlen == namelen); 424 KASSERT(ncp->nc_nlen == namelen);
425 diff = memcmp(ncp->nc_name, name, namelen); 425 diff = memcmp(ncp->nc_name, name, namelen);
426 if (__predict_true(diff == 0)) { 426 if (__predict_true(diff == 0)) {
427 break; 427 break;
428 } 428 }
429 node = node->rb_nodes[diff < 0];  429 node = node->rb_nodes[diff < 0];
430 } else { 430 } else {
431 node = node->rb_nodes[ncp->nc_key < key]; 431 node = node->rb_nodes[ncp->nc_key < key];
432 } 432 }
433 } 433 }
434 434
435 /* 435 /*
436 * If the entry is on the wrong LRU list, requeue it. This is an 436 * If the entry is on the wrong LRU list, requeue it. This is an
437 * unlocked check, but it will rarely be wrong and even then there 437 * unlocked check, but it will rarely be wrong and even then there
438 * will be no harm caused. 438 * will be no harm caused.
439 */ 439 */
440 lrulist = atomic_load_relaxed(&ncp->nc_lrulist); 440 lrulist = atomic_load_relaxed(&ncp->nc_lrulist);
441 if (__predict_false(lrulist != LRU_ACTIVE)) { 441 if (__predict_false(lrulist != LRU_ACTIVE)) {
442 cache_activate(ncp); 442 cache_activate(ncp);
443 } 443 }
444 return ncp; 444 return ncp;
445} 445}
446 446
447/* 447/*
448 * Look for a the name in the cache. We don't do this 448 * Look for a the name in the cache. We don't do this
449 * if the segment name is long, simply so the cache can avoid 449 * if the segment name is long, simply so the cache can avoid
450 * holding long names (which would either waste space, or 450 * holding long names (which would either waste space, or
451 * add greatly to the complexity). 451 * add greatly to the complexity).
452 * 452 *
453 * Lookup is called with DVP pointing to the directory to search, 453 * Lookup is called with DVP pointing to the directory to search,
454 * and CNP providing the name of the entry being sought: cn_nameptr 454 * and CNP providing the name of the entry being sought: cn_nameptr
455 * is the name, cn_namelen is its length, and cn_flags is the flags 455 * is the name, cn_namelen is its length, and cn_flags is the flags
456 * word from the namei operation. 456 * word from the namei operation.
457 * 457 *
458 * DVP must be locked. 458 * DVP must be locked.
459 * 459 *
460 * There are three possible non-error return states: 460 * There are three possible non-error return states:
461 * 1. Nothing was found in the cache. Nothing is known about 461 * 1. Nothing was found in the cache. Nothing is known about
462 * the requested name. 462 * the requested name.
463 * 2. A negative entry was found in the cache, meaning that the 463 * 2. A negative entry was found in the cache, meaning that the
464 * requested name definitely does not exist. 464 * requested name definitely does not exist.
465 * 3. A positive entry was found in the cache, meaning that the 465 * 3. A positive entry was found in the cache, meaning that the
466 * requested name does exist and that we are providing the 466 * requested name does exist and that we are providing the
467 * vnode. 467 * vnode.
468 * In these cases the results are: 468 * In these cases the results are:
469 * 1. 0 returned; VN is set to NULL. 469 * 1. 0 returned; VN is set to NULL.
470 * 2. 1 returned; VN is set to NULL. 470 * 2. 1 returned; VN is set to NULL.
471 * 3. 1 returned; VN is set to the vnode found. 471 * 3. 1 returned; VN is set to the vnode found.
472 * 472 *
473 * The additional result argument ISWHT is set to zero, unless a 473 * The additional result argument ISWHT is set to zero, unless a
474 * negative entry is found that was entered as a whiteout, in which 474 * negative entry is found that was entered as a whiteout, in which
475 * case ISWHT is set to one. 475 * case ISWHT is set to one.
476 * 476 *
477 * The ISWHT_RET argument pointer may be null. In this case an 477 * The ISWHT_RET argument pointer may be null. In this case an
478 * assertion is made that the whiteout flag is not set. File systems 478 * assertion is made that the whiteout flag is not set. File systems
479 * that do not support whiteouts can/should do this. 479 * that do not support whiteouts can/should do this.
480 * 480 *
481 * Filesystems that do support whiteouts should add ISWHITEOUT to 481 * Filesystems that do support whiteouts should add ISWHITEOUT to
482 * cnp->cn_flags if ISWHT comes back nonzero. 482 * cnp->cn_flags if ISWHT comes back nonzero.
483 * 483 *
484 * When a vnode is returned, it is locked, as per the vnode lookup 484 * When a vnode is returned, it is locked, as per the vnode lookup
485 * locking protocol. 485 * locking protocol.
486 * 486 *
487 * There is no way for this function to fail, in the sense of 487 * There is no way for this function to fail, in the sense of
488 * generating an error that requires aborting the namei operation. 488 * generating an error that requires aborting the namei operation.
489 * 489 *
490 * (Prior to October 2012, this function returned an integer status, 490 * (Prior to October 2012, this function returned an integer status,
491 * and a vnode, and mucked with the flags word in CNP for whiteouts. 491 * and a vnode, and mucked with the flags word in CNP for whiteouts.
492 * The integer status was -1 for "nothing found", ENOENT for "a 492 * The integer status was -1 for "nothing found", ENOENT for "a
493 * negative entry found", 0 for "a positive entry found", and possibly 493 * negative entry found", 0 for "a positive entry found", and possibly
494 * other errors, and the value of VN might or might not have been set 494 * other errors, and the value of VN might or might not have been set
495 * depending on what error occurred.) 495 * depending on what error occurred.)
496 */ 496 */
497bool 497bool
498cache_lookup(struct vnode *dvp, const char *name, size_t namelen, 498cache_lookup(struct vnode *dvp, const char *name, size_t namelen,
499 uint32_t nameiop, uint32_t cnflags, 499 uint32_t nameiop, uint32_t cnflags,
500 int *iswht_ret, struct vnode **vn_ret) 500 int *iswht_ret, struct vnode **vn_ret)
501{ 501{
502 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); 502 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
503 struct namecache *ncp; 503 struct namecache *ncp;
504 struct vnode *vp; 504 struct vnode *vp;
505 uint64_t key; 505 uint64_t key;
506 int error; 506 int error;
507 bool hit; 507 bool hit;
508 krw_t op; 508 krw_t op;
509 509
510 /* Establish default result values */ 510 /* Establish default result values */
511 if (iswht_ret != NULL) { 511 if (iswht_ret != NULL) {
512 *iswht_ret = 0; 512 *iswht_ret = 0;
513 } 513 }
514 *vn_ret = NULL; 514 *vn_ret = NULL;
515 515
516 if (__predict_false(namelen > cache_maxlen)) { 516 if (__predict_false(namelen > cache_maxlen)) {
517 SDT_PROBE(vfs, namecache, lookup, toolong, dvp, 517 SDT_PROBE(vfs, namecache, lookup, toolong, dvp,
518 name, namelen, 0, 0); 518 name, namelen, 0, 0);
519 COUNT(ncs_long); 519 COUNT(ncs_long);
520 return false; 520 return false;
521 } 521 }
522 522
523 /* Compute the key up front - don't need the lock. */ 523 /* Compute the key up front - don't need the lock. */
524 key = cache_key(name, namelen); 524 key = cache_key(name, namelen);
525 525
526 /* Could the entry be purged below? */ 526 /* Could the entry be purged below? */
527 if ((cnflags & ISLASTCN) != 0 && 527 if ((cnflags & ISLASTCN) != 0 &&
528 ((cnflags & MAKEENTRY) == 0 || nameiop == CREATE)) { 528 ((cnflags & MAKEENTRY) == 0 || nameiop == CREATE)) {
529 op = RW_WRITER; 529 op = RW_WRITER;
530 } else { 530 } else {
531 op = RW_READER; 531 op = RW_READER;
532 } 532 }
533 533
534 /* Now look for the name. */ 534 /* Now look for the name. */
535 rw_enter(&dvi->vi_nc_lock, op); 535 rw_enter(&dvi->vi_nc_lock, op);
536 ncp = cache_lookup_entry(dvp, name, namelen, key); 536 ncp = cache_lookup_entry(dvp, name, namelen, key);
537 if (__predict_false(ncp == NULL)) { 537 if (__predict_false(ncp == NULL)) {
538 rw_exit(&dvi->vi_nc_lock); 538 rw_exit(&dvi->vi_nc_lock);
539 COUNT(ncs_miss); 539 COUNT(ncs_miss);
540 SDT_PROBE(vfs, namecache, lookup, miss, dvp, 540 SDT_PROBE(vfs, namecache, lookup, miss, dvp,
541 name, namelen, 0, 0); 541 name, namelen, 0, 0);
542 return false; 542 return false;
543 } 543 }
544 if (__predict_false((cnflags & MAKEENTRY) == 0)) { 544 if (__predict_false((cnflags & MAKEENTRY) == 0)) {
545 /* 545 /*
546 * Last component and we are renaming or deleting, 546 * Last component and we are renaming or deleting,
547 * the cache entry is invalid, or otherwise don't 547 * the cache entry is invalid, or otherwise don't
548 * want cache entry to exist. 548 * want cache entry to exist.
549 */ 549 */
550 KASSERT((cnflags & ISLASTCN) != 0); 550 KASSERT((cnflags & ISLASTCN) != 0);
551 cache_remove(ncp, true); 551 cache_remove(ncp, true);
552 rw_exit(&dvi->vi_nc_lock); 552 rw_exit(&dvi->vi_nc_lock);
553 COUNT(ncs_badhits); 553 COUNT(ncs_badhits);
554 return false; 554 return false;
555 } 555 }
556 if (ncp->nc_vp == NULL) { 556 if (ncp->nc_vp == NULL) {
557 if (iswht_ret != NULL) { 557 if (iswht_ret != NULL) {
558 /* 558 /*
559 * Restore the ISWHITEOUT flag saved earlier. 559 * Restore the ISWHITEOUT flag saved earlier.
560 */ 560 */
561 *iswht_ret = ncp->nc_whiteout; 561 *iswht_ret = ncp->nc_whiteout;
562 } else { 562 } else {
563 KASSERT(!ncp->nc_whiteout); 563 KASSERT(!ncp->nc_whiteout);
564 } 564 }
565 if (nameiop == CREATE && (cnflags & ISLASTCN) != 0) { 565 if (nameiop == CREATE && (cnflags & ISLASTCN) != 0) {
566 /* 566 /*
567 * Last component and we are preparing to create 567 * Last component and we are preparing to create
568 * the named object, so flush the negative cache 568 * the named object, so flush the negative cache
569 * entry. 569 * entry.
570 */ 570 */
571 COUNT(ncs_badhits); 571 COUNT(ncs_badhits);
572 cache_remove(ncp, true); 572 cache_remove(ncp, true);
573 hit = false; 573 hit = false;
574 } else { 574 } else {
575 COUNT(ncs_neghits); 575 COUNT(ncs_neghits);
576 SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, 576 SDT_PROBE(vfs, namecache, lookup, hit, dvp, name,
577 namelen, 0, 0); 577 namelen, 0, 0);
578 /* found neg entry; vn is already null from above */ 578 /* found neg entry; vn is already null from above */
579 hit = true; 579 hit = true;
580 } 580 }
581 rw_exit(&dvi->vi_nc_lock); 581 rw_exit(&dvi->vi_nc_lock);
582 return hit; 582 return hit;
583 } 583 }
584 vp = ncp->nc_vp; 584 vp = ncp->nc_vp;
585 mutex_enter(vp->v_interlock); 585 mutex_enter(vp->v_interlock);
586 rw_exit(&dvi->vi_nc_lock); 586 rw_exit(&dvi->vi_nc_lock);
587 587
588 /* 588 /*
589 * Unlocked except for the vnode interlock. Call vcache_tryvget(). 589 * Unlocked except for the vnode interlock. Call vcache_tryvget().
590 */ 590 */
591 error = vcache_tryvget(vp); 591 error = vcache_tryvget(vp);
592 if (error) { 592 if (error) {
593 KASSERT(error == EBUSY); 593 KASSERT(error == EBUSY);
594 /* 594 /*
595 * This vnode is being cleaned out. 595 * This vnode is being cleaned out.
596 * XXX badhits? 596 * XXX badhits?
597 */ 597 */
598 COUNT(ncs_falsehits); 598 COUNT(ncs_falsehits);
599 return false; 599 return false;
600 } 600 }
601 601
602 COUNT(ncs_goodhits); 602 COUNT(ncs_goodhits);
603 SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); 603 SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
604 /* found it */ 604 /* found it */
605 *vn_ret = vp; 605 *vn_ret = vp;
606 return true; 606 return true;
607} 607}
608 608
609/* 609/*
610 * Version of the above without the nameiop argument, for NFS. 610 * Version of the above without the nameiop argument, for NFS.
611 */ 611 */
612bool 612bool
613cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen, 613cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen,
614 uint32_t cnflags, 614 uint32_t cnflags,
615 int *iswht_ret, struct vnode **vn_ret) 615 int *iswht_ret, struct vnode **vn_ret)
616{ 616{
617 617
618 return cache_lookup(dvp, name, namelen, LOOKUP, cnflags | MAKEENTRY, 618 return cache_lookup(dvp, name, namelen, LOOKUP, cnflags | MAKEENTRY,
619 iswht_ret, vn_ret); 619 iswht_ret, vn_ret);
620} 620}
621 621
622/* 622/*
623 * Used by namei() to walk down a path, component by component by looking up 623 * Used by namei() to walk down a path, component by component by looking up
624 * names in the cache. The node locks are chained along the way: a parent's 624 * names in the cache. The node locks are chained along the way: a parent's
625 * lock is not dropped until the child's is acquired. 625 * lock is not dropped until the child's is acquired.
626 */ 626 */
627bool 627bool
628cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen, 628cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen,
629 struct vnode **vn_ret, krwlock_t **plock, 629 struct vnode **vn_ret, krwlock_t **plock,
630 kauth_cred_t cred) 630 kauth_cred_t cred)
631{ 631{
632 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); 632 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
633 struct namecache *ncp; 633 struct namecache *ncp;
634 uint64_t key; 634 uint64_t key;
635 int error; 635 int error;
636 636
637 /* Establish default results. */ 637 /* Establish default results. */
638 *vn_ret = NULL; 638 *vn_ret = NULL;
639 639
640 /* If disabled, or file system doesn't support this, bail out. */ 640 /* If disabled, or file system doesn't support this, bail out. */
641 if (__predict_false((dvp->v_mount->mnt_iflag & IMNT_NCLOOKUP) == 0)) { 641 if (__predict_false((dvp->v_mount->mnt_iflag & IMNT_NCLOOKUP) == 0)) {
642 return false; 642 return false;
643 } 643 }
644 644
645 if (__predict_false(namelen > cache_maxlen)) { 645 if (__predict_false(namelen > cache_maxlen)) {
646 COUNT(ncs_long); 646 COUNT(ncs_long);
647 return false; 647 return false;
648 } 648 }
649 649
650 /* Compute the key up front - don't need the lock. */ 650 /* Compute the key up front - don't need the lock. */
651 key = cache_key(name, namelen); 651 key = cache_key(name, namelen);
652 652
653 /* 653 /*
654 * Acquire the directory lock. Once we have that, we can drop the 654 * Acquire the directory lock. Once we have that, we can drop the
655 * previous one (if any). 655 * previous one (if any).
656 * 656 *
657 * The two lock holds mean that the directory can't go away while 657 * The two lock holds mean that the directory can't go away while
658 * here: the directory must be purged with cache_purge() before 658 * here: the directory must be purged with cache_purge() before
659 * being freed, and both parent & child's vi_nc_lock must be taken 659 * being freed, and both parent & child's vi_nc_lock must be taken
660 * before that point is passed. 660 * before that point is passed.
661 * 661 *
662 * However if there's no previous lock, like at the root of the 662 * However if there's no previous lock, like at the root of the
663 * chain, then "dvp" must be referenced to prevent dvp going away 663 * chain, then "dvp" must be referenced to prevent dvp going away
664 * before we get its lock. 664 * before we get its lock.
665 * 665 *
666 * Note that the two locks can be the same if looking up a dot, for 666 * Note that the two locks can be the same if looking up a dot, for
667 * example: /usr/bin/. If looking up the parent (..) we can't wait 667 * example: /usr/bin/. If looking up the parent (..) we can't wait
668 * on the lock as child -> parent is the wrong direction. 668 * on the lock as child -> parent is the wrong direction.
669 */ 669 */
670 if (*plock != &dvi->vi_nc_lock) { 670 if (*plock != &dvi->vi_nc_lock) {
671 if (namelen == 2 && name[0] == '.' && name[1] == '.') { 671 if (!rw_tryenter(&dvi->vi_nc_lock, RW_READER)) {
672 if (!rw_tryenter(&dvi->vi_nc_lock, RW_READER)) { 672 return false;
673 return false; 
674 } 
675 } else { 
676 rw_enter(&dvi->vi_nc_lock, RW_READER); 
677 } 673 }
678 if (*plock != NULL) { 674 if (*plock != NULL) {
679 rw_exit(*plock); 675 rw_exit(*plock);
680 } 676 }
681 *plock = &dvi->vi_nc_lock; 677 *plock = &dvi->vi_nc_lock;
682 } else if (*plock == NULL) { 678 } else if (*plock == NULL) {
683 KASSERT(vrefcnt(dvp) > 0); 679 KASSERT(vrefcnt(dvp) > 0);
684 } 680 }
685 681
686 /* 682 /*
687 * First up check if the user is allowed to look up files in this 683 * First up check if the user is allowed to look up files in this
688 * directory. 684 * directory.
689 */ 685 */
690 KASSERT(dvi->vi_nc_mode != VNOVAL && dvi->vi_nc_uid != VNOVAL && 686 KASSERT(dvi->vi_nc_mode != VNOVAL && dvi->vi_nc_uid != VNOVAL &&
691 dvi->vi_nc_gid != VNOVAL); 687 dvi->vi_nc_gid != VNOVAL);
692 error = kauth_authorize_vnode(cred, KAUTH_ACCESS_ACTION(VEXEC, 688 error = kauth_authorize_vnode(cred, KAUTH_ACCESS_ACTION(VEXEC,
693 dvp->v_type, dvi->vi_nc_mode & ALLPERMS), dvp, NULL, 689 dvp->v_type, dvi->vi_nc_mode & ALLPERMS), dvp, NULL,
694 genfs_can_access(dvp->v_type, dvi->vi_nc_mode & ALLPERMS, 690 genfs_can_access(dvp->v_type, dvi->vi_nc_mode & ALLPERMS,
695 dvi->vi_nc_uid, dvi->vi_nc_gid, VEXEC, cred)); 691 dvi->vi_nc_uid, dvi->vi_nc_gid, VEXEC, cred));
696 if (error != 0) { 692 if (error != 0) {
697 COUNT(ncs_denied); 693 COUNT(ncs_denied);
698 return false; 694 return false;
699 } 695 }
700 696
701 /* 697 /*
702 * Now look for a matching cache entry. 698 * Now look for a matching cache entry.
703 */ 699 */
704 ncp = cache_lookup_entry(dvp, name, namelen, key); 700 ncp = cache_lookup_entry(dvp, name, namelen, key);
705 if (__predict_false(ncp == NULL)) { 701 if (__predict_false(ncp == NULL)) {
706 COUNT(ncs_miss); 702 COUNT(ncs_miss);
707 SDT_PROBE(vfs, namecache, lookup, miss, dvp, 703 SDT_PROBE(vfs, namecache, lookup, miss, dvp,
708 name, namelen, 0, 0); 704 name, namelen, 0, 0);
709 return false; 705 return false;
710 } 706 }
711 if (ncp->nc_vp == NULL) { 707 if (ncp->nc_vp == NULL) {
712 /* found negative entry; vn is already null from above */ 708 /* found negative entry; vn is already null from above */
713 COUNT(ncs_neghits); 709 COUNT(ncs_neghits);
714 SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); 710 SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
715 return true; 711 return true;
716 } 712 }
717 713
718 COUNT(ncs_goodhits); /* XXX can be "badhits" */ 714 COUNT(ncs_goodhits); /* XXX can be "badhits" */
719 SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); 715 SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
720 716
721 /* 717 /*
722 * Return with the directory lock still held. It will either be 718 * Return with the directory lock still held. It will either be
723 * returned to us with another call to cache_lookup_linked() when 719 * returned to us with another call to cache_lookup_linked() when
724 * looking up the next component, or the caller will release it 720 * looking up the next component, or the caller will release it
725 * manually when finished. 721 * manually when finished.
726 */ 722 */
727 *vn_ret = ncp->nc_vp; 723 *vn_ret = ncp->nc_vp;
728 return true; 724 return true;
729} 725}
730 726
731/* 727/*
732 * Scan cache looking for name of directory entry pointing at vp. 728 * Scan cache looking for name of directory entry pointing at vp.
733 * Will not search for "." or "..". 729 * Will not search for "." or "..".
734 * 730 *
735 * If the lookup succeeds the vnode is referenced and stored in dvpp. 731 * If the lookup succeeds the vnode is referenced and stored in dvpp.
736 * 732 *
737 * If bufp is non-NULL, also place the name in the buffer which starts 733 * If bufp is non-NULL, also place the name in the buffer which starts
738 * at bufp, immediately before *bpp, and move bpp backwards to point 734 * at bufp, immediately before *bpp, and move bpp backwards to point
739 * at the start of it. (Yes, this is a little baroque, but it's done 735 * at the start of it. (Yes, this is a little baroque, but it's done
740 * this way to cater to the whims of getcwd). 736 * this way to cater to the whims of getcwd).
741 * 737 *
742 * Returns 0 on success, -1 on cache miss, positive errno on failure. 738 * Returns 0 on success, -1 on cache miss, positive errno on failure.
743 */ 739 */
744int 740int
745cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp, 741cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp,
746 bool checkaccess, int perms) 742 bool checkaccess, int perms)
747{ 743{
748 vnode_impl_t *vi = VNODE_TO_VIMPL(vp); 744 vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
749 struct namecache *ncp; 745 struct namecache *ncp;
750 struct vnode *dvp; 746 struct vnode *dvp;
751 int error, nlen, lrulist; 747 int error, nlen, lrulist;
752 char *bp; 748 char *bp;
753 749
754 KASSERT(vp != NULL); 750 KASSERT(vp != NULL);
755 751
756 if (cache_maxlen == 0) 752 if (cache_maxlen == 0)
757 goto out; 753 goto out;
758 754
759 rw_enter(&vi->vi_nc_listlock, RW_READER); 755 rw_enter(&vi->vi_nc_listlock, RW_READER);
760 if (checkaccess) { 756 if (checkaccess) {
761 /* 757 /*
		 * Check if the user is allowed to see.  NOTE: this is
		 * checking for access on the "wrong" directory.  getcwd()
		 * wants to see that there is access on every component
		 * along the way, not that there is access to any individual
		 * component.  Don't use this to check you can look in vp.
		 *
		 * I don't like it, I didn't come up with it, don't blame me!
		 */
		KASSERT(vi->vi_nc_mode != VNOVAL && vi->vi_nc_uid != VNOVAL &&
		    vi->vi_nc_gid != VNOVAL);
		error = kauth_authorize_vnode(curlwp->l_cred,
		    KAUTH_ACCESS_ACTION(VEXEC, vp->v_type, vi->vi_nc_mode &
		    ALLPERMS), vp, NULL, genfs_can_access(vp->v_type,
		    vi->vi_nc_mode & ALLPERMS, vi->vi_nc_uid, vi->vi_nc_gid,
		    perms, curlwp->l_cred));
		if (error != 0) {
			rw_exit(&vi->vi_nc_listlock);
			COUNT(ncs_denied);
			return EACCES;
		}
	}
	TAILQ_FOREACH(ncp, &vi->vi_nc_list, nc_list) {
		KASSERT(ncp->nc_vp == vp);
		KASSERT(ncp->nc_dvp != NULL);
		nlen = ncp->nc_nlen;

		/*
		 * The queue is partially sorted.  Once we hit dots, nothing
		 * else remains but dots and dotdots, so bail out.
		 */
		if (ncp->nc_name[0] == '.') {
			if (nlen == 1 ||
			    (nlen == 2 && ncp->nc_name[1] == '.')) {
				break;
			}
		}

		/*
		 * Record a hit on the entry.  This is an unlocked read but
		 * even if wrong it doesn't matter too much.
		 */
		lrulist = atomic_load_relaxed(&ncp->nc_lrulist);
		if (lrulist != LRU_ACTIVE) {
			cache_activate(ncp);
		}

		if (bufp) {
			bp = *bpp;
			bp -= nlen;
			if (bp <= bufp) {
				*dvpp = NULL;
				rw_exit(&vi->vi_nc_listlock);
				SDT_PROBE(vfs, namecache, revlookup,
				    fail, vp, ERANGE, 0, 0, 0);
				return (ERANGE);
			}
			memcpy(bp, ncp->nc_name, nlen);
			*bpp = bp;
		}

		dvp = ncp->nc_dvp;
		mutex_enter(dvp->v_interlock);
		rw_exit(&vi->vi_nc_listlock);
		error = vcache_tryvget(dvp);
		if (error) {
			KASSERT(error == EBUSY);
			if (bufp)
				(*bpp) += nlen;
			*dvpp = NULL;
			SDT_PROBE(vfs, namecache, revlookup, fail, vp,
			    error, 0, 0, 0);
			return -1;
		}
		*dvpp = dvp;
		SDT_PROBE(vfs, namecache, revlookup, success, vp, dvp,
		    0, 0, 0);
		COUNT(ncs_revhits);
		return (0);
	}
	rw_exit(&vi->vi_nc_listlock);
	COUNT(ncs_revmiss);
 out:
	*dvpp = NULL;
	return (-1);
}

/*
 * Add an entry to the cache.
 */
void
cache_enter(struct vnode *dvp, struct vnode *vp,
    const char *name, size_t namelen, uint32_t cnflags)
{
	vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
	struct namecache *ncp, *oncp;
	int total;

	/* First, check whether we can/should add a cache entry. */
	if ((cnflags & MAKEENTRY) == 0 ||
	    __predict_false(namelen > cache_maxlen)) {
		SDT_PROBE(vfs, namecache, enter, toolong, vp, name, namelen,
		    0, 0);
		return;
	}

	SDT_PROBE(vfs, namecache, enter, done, vp, name, namelen, 0, 0);

	/*
	 * Reclaim some entries if over budget.  This is an unlocked check,
	 * but it doesn't matter.  Just need to catch up with things
	 * eventually: it doesn't matter if we go over temporarily.
	 */
	total = atomic_load_relaxed(&cache_lru.count[LRU_ACTIVE]);
	total += atomic_load_relaxed(&cache_lru.count[LRU_INACTIVE]);
	if (__predict_false(total > desiredvnodes)) {
		cache_reclaim();
	}

	/* Now allocate a fresh entry. */
	if (__predict_true(namelen <= NCHNAMLEN)) {
		ncp = pool_cache_get(cache_pool, PR_WAITOK);
	} else {
		size_t sz = offsetof(struct namecache, nc_name[namelen]);
		ncp = kmem_alloc(sz, KM_SLEEP);
	}

	/*
	 * Fill in cache info.  For negative hits, save the ISWHITEOUT flag
	 * so we can restore it later when the cache entry is used again.
	 */
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	ncp->nc_key = cache_key(name, namelen);
	ncp->nc_nlen = namelen;
	ncp->nc_whiteout = ((cnflags & ISWHITEOUT) != 0);
	memcpy(ncp->nc_name, name, namelen);

	/*
	 * Insert to the directory.  Concurrent lookups may race for a cache
	 * entry.  If there's an entry there already, purge it.
902 */ 898 */
903 rw_enter(&dvi->vi_nc_lock, RW_WRITER); 899 rw_enter(&dvi->vi_nc_lock, RW_WRITER);
904 oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp); 900 oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
905 if (oncp != ncp) { 901 if (oncp != ncp) {
906 KASSERT(oncp->nc_key == ncp->nc_key); 902 KASSERT(oncp->nc_key == ncp->nc_key);
907 KASSERT(oncp->nc_nlen == ncp->nc_nlen); 903 KASSERT(oncp->nc_nlen == ncp->nc_nlen);
908 KASSERT(memcmp(oncp->nc_name, name, namelen) == 0); 904 KASSERT(memcmp(oncp->nc_name, name, namelen) == 0);
909 cache_remove(oncp, true); 905 cache_remove(oncp, true);
910 oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp); 906 oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
911 KASSERT(oncp == ncp); 907 KASSERT(oncp == ncp);
912 } 908 }
913 909
914 /* 910 /*
915 * With the directory lock still held, insert to the tail of the 911 * With the directory lock still held, insert to the tail of the
916 * ACTIVE LRU list (new) and take the opportunity to incrementally 912 * ACTIVE LRU list (new) and take the opportunity to incrementally
917 * balance the lists. 913 * balance the lists.
918 */ 914 */
919 mutex_enter(&cache_lru_lock); 915 mutex_enter(&cache_lru_lock);
920 ncp->nc_lrulist = LRU_ACTIVE; 916 ncp->nc_lrulist = LRU_ACTIVE;
921 cache_lru.count[LRU_ACTIVE]++; 917 cache_lru.count[LRU_ACTIVE]++;
922 TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); 918 TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
923 cache_deactivate(); 919 cache_deactivate();
924 mutex_exit(&cache_lru_lock); 920 mutex_exit(&cache_lru_lock);
925 921
926 /* 922 /*
927 * Finally, insert to the vnode and unlock. With everything set up 923 * Finally, insert to the vnode and unlock. With everything set up
928 * it's safe to let cache_revlookup() see the entry. Partially sort 924 * it's safe to let cache_revlookup() see the entry. Partially sort
929 * the per-vnode list: dots go to back so cache_revlookup() doesn't 925 * the per-vnode list: dots go to back so cache_revlookup() doesn't
930 * have to consider them. 926 * have to consider them.
931 */ 927 */
932 if (vp != NULL) { 928 if (vp != NULL) {
933 vnode_impl_t *vi = VNODE_TO_VIMPL(vp); 929 vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
934 rw_enter(&vi->vi_nc_listlock, RW_WRITER); 930 rw_enter(&vi->vi_nc_listlock, RW_WRITER);
935 if ((namelen == 1 && name[0] == '.') || 931 if ((namelen == 1 && name[0] == '.') ||
936 (namelen == 2 && name[0] == '.' && name[1] == '.')) { 932 (namelen == 2 && name[0] == '.' && name[1] == '.')) {
937 TAILQ_INSERT_TAIL(&vi->vi_nc_list, ncp, nc_list); 933 TAILQ_INSERT_TAIL(&vi->vi_nc_list, ncp, nc_list);
938 } else { 934 } else {
939 TAILQ_INSERT_HEAD(&vi->vi_nc_list, ncp, nc_list); 935 TAILQ_INSERT_HEAD(&vi->vi_nc_list, ncp, nc_list);
940 } 936 }
941 rw_exit(&vi->vi_nc_listlock); 937 rw_exit(&vi->vi_nc_listlock);
942 } 938 }
943 rw_exit(&dvi->vi_nc_lock); 939 rw_exit(&dvi->vi_nc_lock);
944} 940}
945 941
946/* 942/*
947 * Set identity info in cache for a vnode. We only care about directories 943 * Set identity info in cache for a vnode. We only care about directories
948 * so ignore other updates. 944 * so ignore other updates.
949 */ 945 */
950void 946void
951cache_enter_id(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid) 947cache_enter_id(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid)
952{ 948{
953 vnode_impl_t *vi = VNODE_TO_VIMPL(vp); 949 vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
954 950
955 if (vp->v_type == VDIR) { 951 if (vp->v_type == VDIR) {
956 /* Grab both locks, for forward & reverse lookup. */ 952 /* Grab both locks, for forward & reverse lookup. */
957 rw_enter(&vi->vi_nc_lock, RW_WRITER); 953 rw_enter(&vi->vi_nc_lock, RW_WRITER);
958 rw_enter(&vi->vi_nc_listlock, RW_WRITER); 954 rw_enter(&vi->vi_nc_listlock, RW_WRITER);
959 vi->vi_nc_mode = mode; 955 vi->vi_nc_mode = mode;
960 vi->vi_nc_uid = uid; 956 vi->vi_nc_uid = uid;
961 vi->vi_nc_gid = gid; 957 vi->vi_nc_gid = gid;
962 rw_exit(&vi->vi_nc_listlock); 958 rw_exit(&vi->vi_nc_listlock);
963 rw_exit(&vi->vi_nc_lock); 959 rw_exit(&vi->vi_nc_lock);
964 } 960 }
965} 961}
966 962
967/* 963/*
968 * Return true if we have identity for the given vnode, and use as an 964 * Return true if we have identity for the given vnode, and use as an
969 * opportunity to confirm that everything squares up. 965 * opportunity to confirm that everything squares up.
970 * 966 *
971 * Because of shared code, some file systems could provide partial 967 * Because of shared code, some file systems could provide partial
972 * information, missing some updates, so always check the mount flag 968 * information, missing some updates, so always check the mount flag
973 * instead of looking for !VNOVAL. 969 * instead of looking for !VNOVAL.
974 */ 970 */
975bool 971bool
976cache_have_id(struct vnode *vp) 972cache_have_id(struct vnode *vp)
977{ 973{
978 974
979 if (vp->v_type == VDIR && 975 if (vp->v_type == VDIR &&
980 (vp->v_mount->mnt_iflag & IMNT_NCLOOKUP) != 0) { 976 (vp->v_mount->mnt_iflag & IMNT_NCLOOKUP) != 0) {
981 KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_mode != VNOVAL); 977 KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_mode != VNOVAL);
982 KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_uid != VNOVAL); 978 KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_uid != VNOVAL);
983 KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_gid != VNOVAL); 979 KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_gid != VNOVAL);
984 return true; 980 return true;
985 } else { 981 } else {
986 return false; 982 return false;
987 } 983 }
988} 984}
989 985
990/* 986/*
991 * Name cache initialization, from vfs_init() when the system is booting. 987 * Name cache initialization, from vfs_init() when the system is booting.
992 */ 988 */
993void 989void
994nchinit(void) 990nchinit(void)
995{ 991{
996 992
997 cache_pool = pool_cache_init(sizeof(struct namecache), 993 cache_pool = pool_cache_init(sizeof(struct namecache),
998 coherency_unit, 0, 0, "namecache", NULL, IPL_NONE, NULL, 994 coherency_unit, 0, 0, "namecache", NULL, IPL_NONE, NULL,
999 NULL, NULL); 995 NULL, NULL);
1000 KASSERT(cache_pool != NULL); 996 KASSERT(cache_pool != NULL);
1001 997
1002 mutex_init(&cache_lru_lock, MUTEX_DEFAULT, IPL_NONE); 998 mutex_init(&cache_lru_lock, MUTEX_DEFAULT, IPL_NONE);
1003 TAILQ_INIT(&cache_lru.list[LRU_ACTIVE]); 999 TAILQ_INIT(&cache_lru.list[LRU_ACTIVE]);
1004 TAILQ_INIT(&cache_lru.list[LRU_INACTIVE]); 1000 TAILQ_INIT(&cache_lru.list[LRU_INACTIVE]);
1005 1001
1006 mutex_init(&cache_stat_lock, MUTEX_DEFAULT, IPL_NONE); 1002 mutex_init(&cache_stat_lock, MUTEX_DEFAULT, IPL_NONE);
1007 callout_init(&cache_stat_callout, CALLOUT_MPSAFE); 1003 callout_init(&cache_stat_callout, CALLOUT_MPSAFE);
1008 callout_setfunc(&cache_stat_callout, cache_update_stats, NULL); 1004 callout_setfunc(&cache_stat_callout, cache_update_stats, NULL);
1009 callout_schedule(&cache_stat_callout, cache_stat_interval * hz); 1005 callout_schedule(&cache_stat_callout, cache_stat_interval * hz);
1010 1006
1011 KASSERT(cache_sysctllog == NULL); 1007 KASSERT(cache_sysctllog == NULL);
1012 sysctl_createv(&cache_sysctllog, 0, NULL, NULL, 1008 sysctl_createv(&cache_sysctllog, 0, NULL, NULL,
1013 CTLFLAG_PERMANENT, 1009 CTLFLAG_PERMANENT,
1014 CTLTYPE_STRUCT, "namecache_stats", 1010 CTLTYPE_STRUCT, "namecache_stats",
1015 SYSCTL_DESCR("namecache statistics"), 1011 SYSCTL_DESCR("namecache statistics"),
1016 cache_stat_sysctl, 0, NULL, 0, 1012 cache_stat_sysctl, 0, NULL, 0,
1017 CTL_VFS, CTL_CREATE, CTL_EOL); 1013 CTL_VFS, CTL_CREATE, CTL_EOL);
1018} 1014}
1019 1015
1020/* 1016/*
1021 * Called once for each CPU in the system as attached. 1017 * Called once for each CPU in the system as attached.
1022 */ 1018 */
1023void 1019void
1024cache_cpu_init(struct cpu_info *ci) 1020cache_cpu_init(struct cpu_info *ci)
1025{ 1021{
1026 void *p; 1022 void *p;
1027 size_t sz; 1023 size_t sz;
1028 1024
1029 sz = roundup2(sizeof(struct nchstats_percpu), coherency_unit) + 1025 sz = roundup2(sizeof(struct nchstats_percpu), coherency_unit) +
1030 coherency_unit; 1026 coherency_unit;
1031 p = kmem_zalloc(sz, KM_SLEEP); 1027 p = kmem_zalloc(sz, KM_SLEEP);
1032 ci->ci_data.cpu_nch = (void *)roundup2((uintptr_t)p, coherency_unit); 1028 ci->ci_data.cpu_nch = (void *)roundup2((uintptr_t)p, coherency_unit);
1033} 1029}
1034 1030
1035/* 1031/*
1036 * A vnode is being allocated: set up cache structures. 1032 * A vnode is being allocated: set up cache structures.
1037 */ 1033 */
1038void 1034void
1039cache_vnode_init(struct vnode *vp) 1035cache_vnode_init(struct vnode *vp)
1040{ 1036{
1041 vnode_impl_t *vi = VNODE_TO_VIMPL(vp); 1037 vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
1042 1038
1043 rw_init(&vi->vi_nc_lock); 1039 rw_init(&vi->vi_nc_lock);
1044 rw_init(&vi->vi_nc_listlock); 1040 rw_init(&vi->vi_nc_listlock);
1045 rb_tree_init(&vi->vi_nc_tree, &cache_rbtree_ops); 1041 rb_tree_init(&vi->vi_nc_tree, &cache_rbtree_ops);
1046 TAILQ_INIT(&vi->vi_nc_list); 1042 TAILQ_INIT(&vi->vi_nc_list);
1047 vi->vi_nc_mode = VNOVAL; 1043 vi->vi_nc_mode = VNOVAL;
1048 vi->vi_nc_uid = VNOVAL; 1044 vi->vi_nc_uid = VNOVAL;
1049 vi->vi_nc_gid = VNOVAL; 1045 vi->vi_nc_gid = VNOVAL;
1050} 1046}
1051 1047
1052/* 1048/*
1053 * A vnode is being freed: finish cache structures. 1049 * A vnode is being freed: finish cache structures.
1054 */ 1050 */
1055void 1051void
1056cache_vnode_fini(struct vnode *vp) 1052cache_vnode_fini(struct vnode *vp)
1057{ 1053{
1058 vnode_impl_t *vi = VNODE_TO_VIMPL(vp); 1054 vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
1059 1055
1060 KASSERT(RB_TREE_MIN(&vi->vi_nc_tree) == NULL); 1056 KASSERT(RB_TREE_MIN(&vi->vi_nc_tree) == NULL);
1061 KASSERT(TAILQ_EMPTY(&vi->vi_nc_list)); 1057 KASSERT(TAILQ_EMPTY(&vi->vi_nc_list));
1062 rw_destroy(&vi->vi_nc_lock); 1058 rw_destroy(&vi->vi_nc_lock);
1063 rw_destroy(&vi->vi_nc_listlock); 1059 rw_destroy(&vi->vi_nc_listlock);
1064} 1060}
1065 1061
1066/* 1062/*
1067 * Helper for cache_purge1(): purge cache entries for the given vnode from 1063 * Helper for cache_purge1(): purge cache entries for the given vnode from
1068 * all directories that the vnode is cached in. 1064 * all directories that the vnode is cached in.
1069 */ 1065 */
1070static void 1066static void
1071cache_purge_parents(struct vnode *vp) 1067cache_purge_parents(struct vnode *vp)
1072{ 1068{
1073 vnode_impl_t *dvi, *vi = VNODE_TO_VIMPL(vp); 1069 vnode_impl_t *dvi, *vi = VNODE_TO_VIMPL(vp);
1074 struct vnode *dvp, *blocked; 1070 struct vnode *dvp, *blocked;
1075 struct namecache *ncp; 1071 struct namecache *ncp;
1076 1072
1077 SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0); 1073 SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0);
1078 1074
1079 blocked = NULL; 1075 blocked = NULL;
1080 1076
1081 rw_enter(&vi->vi_nc_listlock, RW_WRITER); 1077 rw_enter(&vi->vi_nc_listlock, RW_WRITER);
1082 while ((ncp = TAILQ_FIRST(&vi->vi_nc_list)) != NULL) { 1078 while ((ncp = TAILQ_FIRST(&vi->vi_nc_list)) != NULL) {
1083 /* 1079 /*
1084 * Locking in the wrong direction. Try for a hold on the 1080 * Locking in the wrong direction. Try for a hold on the
1085 * directory node's lock, and if we get it then all good, 1081 * directory node's lock, and if we get it then all good,
1086 * nuke the entry and move on to the next. 1082 * nuke the entry and move on to the next.
1087 */ 1083 */
1088 dvp = ncp->nc_dvp; 1084 dvp = ncp->nc_dvp;
1089 dvi = VNODE_TO_VIMPL(dvp); 1085 dvi = VNODE_TO_VIMPL(dvp);
1090 if (rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) { 1086 if (rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) {
1091 cache_remove(ncp, false); 1087 cache_remove(ncp, false);
1092 rw_exit(&dvi->vi_nc_lock); 1088 rw_exit(&dvi->vi_nc_lock);
1093 blocked = NULL; 1089 blocked = NULL;
1094 continue; 1090 continue;
1095 } 1091 }
1096 1092
1097 /* 1093 /*
1098 * We can't wait on the directory node's lock with our list 1094 * We can't wait on the directory node's lock with our list
1099 * lock held or the system could deadlock. 1095 * lock held or the system could deadlock.
1100 * 1096 *
1101 * Take a hold on the directory vnode to prevent it from 1097 * Take a hold on the directory vnode to prevent it from
1102 * being freed (taking the vnode & lock with it). Then 1098 * being freed (taking the vnode & lock with it). Then
1103 * wait for the lock to become available with no other locks 1099 * wait for the lock to become available with no other locks
1104 * held, and retry. 1100 * held, and retry.
1105 * 1101 *
1106 * If this happens twice in a row, give the other side a 1102 * If this happens twice in a row, give the other side a
1107 * breather; we can do nothing until it lets go. 1103 * breather; we can do nothing until it lets go.
1108 */ 1104 */
1109 vhold(dvp); 1105 vhold(dvp);
1110 rw_exit(&vi->vi_nc_listlock); 1106 rw_exit(&vi->vi_nc_listlock);
1111 rw_enter(&dvi->vi_nc_lock, RW_WRITER); 1107 rw_enter(&dvi->vi_nc_lock, RW_WRITER);
1112 /* Do nothing. */ 1108 /* Do nothing. */
1113 rw_exit(&dvi->vi_nc_lock); 1109 rw_exit(&dvi->vi_nc_lock);
1114 holdrele(dvp); 1110 holdrele(dvp);
1115 if (blocked == dvp) { 1111 if (blocked == dvp) {
1116 kpause("ncpurge", false, 1, NULL); 1112 kpause("ncpurge", false, 1, NULL);
1117 } 1113 }
1118 rw_enter(&vi->vi_nc_listlock, RW_WRITER); 1114 rw_enter(&vi->vi_nc_listlock, RW_WRITER);
1119 blocked = dvp; 1115 blocked = dvp;
1120 } 1116 }
1121 rw_exit(&vi->vi_nc_listlock); 1117 rw_exit(&vi->vi_nc_listlock);
1122} 1118}
1123 1119
1124/* 1120/*
1125 * Helper for cache_purge1(): purge all cache entries hanging off the given 1121 * Helper for cache_purge1(): purge all cache entries hanging off the given
1126 * directory vnode. 1122 * directory vnode.
1127 */ 1123 */
1128static void 1124static void
1129cache_purge_children(struct vnode *dvp) 1125cache_purge_children(struct vnode *dvp)
1130{ 1126{
1131 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); 1127 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
1132 struct namecache *ncp; 1128 struct namecache *ncp;
1133 1129
1134 SDT_PROBE(vfs, namecache, purge, children, dvp, 0, 0, 0, 0); 1130 SDT_PROBE(vfs, namecache, purge, children, dvp, 0, 0, 0, 0);
1135 1131
1136 rw_enter(&dvi->vi_nc_lock, RW_WRITER); 1132 rw_enter(&dvi->vi_nc_lock, RW_WRITER);
1137 while ((ncp = RB_TREE_MIN(&dvi->vi_nc_tree)) != NULL) { 1133 while ((ncp = RB_TREE_MIN(&dvi->vi_nc_tree)) != NULL) {
1138 cache_remove(ncp, true); 1134 cache_remove(ncp, true);
1139 } 1135 }
1140 rw_exit(&dvi->vi_nc_lock); 1136 rw_exit(&dvi->vi_nc_lock);
1141} 1137}
1142 1138
1143/* 1139/*
1144 * Helper for cache_purge1(): purge cache entry from the given vnode, 1140 * Helper for cache_purge1(): purge cache entry from the given vnode,
1145 * finding it by name. 1141 * finding it by name.
1146 */ 1142 */
1147static void 1143static void
1148cache_purge_name(struct vnode *dvp, const char *name, size_t namelen) 1144cache_purge_name(struct vnode *dvp, const char *name, size_t namelen)
1149{ 1145{
1150 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); 1146 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
1151 struct namecache *ncp; 1147 struct namecache *ncp;
1152 uint64_t key; 1148 uint64_t key;
1153 1149
1154 SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0); 1150 SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0);
1155 1151
1156 key = cache_key(name, namelen); 1152 key = cache_key(name, namelen);
1157 rw_enter(&dvi->vi_nc_lock, RW_WRITER); 1153 rw_enter(&dvi->vi_nc_lock, RW_WRITER);
1158 ncp = cache_lookup_entry(dvp, name, namelen, key); 1154 ncp = cache_lookup_entry(dvp, name, namelen, key);
1159 if (ncp) { 1155 if (ncp) {
1160 cache_remove(ncp, true); 1156 cache_remove(ncp, true);
1161 } 1157 }
1162 rw_exit(&dvi->vi_nc_lock); 1158 rw_exit(&dvi->vi_nc_lock);
1163} 1159}
1164 1160
1165/* 1161/*
1166 * Cache flush, a particular vnode; called when a vnode is renamed to 1162 * Cache flush, a particular vnode; called when a vnode is renamed to
1167 * hide entries that would now be invalid. 1163 * hide entries that would now be invalid.
1168 */ 1164 */
1169void 1165void
1170cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags) 1166cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags)
1171{ 1167{
1172 1168
1173 if (flags & PURGE_PARENTS) { 1169 if (flags & PURGE_PARENTS) {
1174 cache_purge_parents(vp); 1170 cache_purge_parents(vp);
1175 } 1171 }
1176 if (flags & PURGE_CHILDREN) { 1172 if (flags & PURGE_CHILDREN) {
1177 cache_purge_children(vp); 1173 cache_purge_children(vp);
1178 } 1174 }
1179 if (name != NULL) { 1175 if (name != NULL) {
1180 cache_purge_name(vp, name, namelen); 1176 cache_purge_name(vp, name, namelen);
1181 } 1177 }
1182} 1178}
1183 1179
1184/* 1180/*
1185 * vnode filter for cache_purgevfs(). 1181 * vnode filter for cache_purgevfs().
1186 */ 1182 */
1187static bool 1183static bool
1188cache_vdir_filter(void *cookie, vnode_t *vp) 1184cache_vdir_filter(void *cookie, vnode_t *vp)
1189{ 1185{
1190 1186
1191 return vp->v_type == VDIR; 1187 return vp->v_type == VDIR;
1192} 1188}
1193 1189
1194/* 1190/*
1195 * Cache flush, a whole filesystem; called when filesys is umounted to 1191 * Cache flush, a whole filesystem; called when filesys is umounted to
1196 * remove entries that would now be invalid. 1192 * remove entries that would now be invalid.
1197 */ 1193 */
1198void 1194void
1199cache_purgevfs(struct mount *mp) 1195cache_purgevfs(struct mount *mp)
1200{ 1196{
1201 struct vnode_iterator *iter; 1197 struct vnode_iterator *iter;
1202 vnode_t *dvp; 1198 vnode_t *dvp;
1203 1199
1204 vfs_vnode_iterator_init(mp, &iter); 1200 vfs_vnode_iterator_init(mp, &iter);
1205 for (;;) { 1201 for (;;) {
1206 dvp = vfs_vnode_iterator_next(iter, cache_vdir_filter, NULL); 1202 dvp = vfs_vnode_iterator_next(iter, cache_vdir_filter, NULL);
1207 if (dvp == NULL) { 1203 if (dvp == NULL) {
1208 break; 1204 break;
1209 } 1205 }
1210 cache_purge_children(dvp); 1206 cache_purge_children(dvp);
1211 vrele(dvp); 1207 vrele(dvp);
1212 } 1208 }
1213 vfs_vnode_iterator_destroy(iter); 1209 vfs_vnode_iterator_destroy(iter);
1214} 1210}
1215 1211
1216/* 1212/*
1217 * Re-queue an entry onto the tail of the active LRU list, after it has 1213 * Re-queue an entry onto the tail of the active LRU list, after it has
1218 * scored a hit. 1214 * scored a hit.
1219 */ 1215 */
1220static void 1216static void
1221cache_activate(struct namecache *ncp) 1217cache_activate(struct namecache *ncp)
1222{ 1218{
1223 1219
1224 mutex_enter(&cache_lru_lock); 1220 mutex_enter(&cache_lru_lock);
1225 TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru); 1221 TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
1226 TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); 1222 TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
1227 cache_lru.count[ncp->nc_lrulist]--; 1223 cache_lru.count[ncp->nc_lrulist]--;
1228 cache_lru.count[LRU_ACTIVE]++; 1224 cache_lru.count[LRU_ACTIVE]++;
1229 ncp->nc_lrulist = LRU_ACTIVE; 1225 ncp->nc_lrulist = LRU_ACTIVE;
1230 mutex_exit(&cache_lru_lock); 1226 mutex_exit(&cache_lru_lock);
1231} 1227}
1232 1228
1233/* 1229/*
1234 * Try to balance the LRU lists. Pick some victim entries, and re-queue 1230 * Try to balance the LRU lists. Pick some victim entries, and re-queue
1235 * them from the head of the active list to the tail of the inactive list.  1231 * them from the head of the active list to the tail of the inactive list.
1236 */ 1232 */
1237static void 1233static void
1238cache_deactivate(void) 1234cache_deactivate(void)
1239{ 1235{
1240 struct namecache *ncp; 1236 struct namecache *ncp;
1241 int total, i; 1237 int total, i;
1242 1238
1243 KASSERT(mutex_owned(&cache_lru_lock)); 1239 KASSERT(mutex_owned(&cache_lru_lock));
1244 1240
1245 /* If we're nowhere near budget yet, don't bother. */ 1241 /* If we're nowhere near budget yet, don't bother. */
1246 total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE]; 1242 total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE];
1247 if (total < (desiredvnodes >> 1)) { 1243 if (total < (desiredvnodes >> 1)) {
1248 return; 1244 return;
1249 } 1245 }
1250 1246
1251 /* 1247 /*
1252 * Aim for a 1:1 ratio of active to inactive. This is to allow each 1248 * Aim for a 1:1 ratio of active to inactive. This is to allow each
1253 * potential victim a reasonable amount of time to cycle through the 1249 * potential victim a reasonable amount of time to cycle through the
1254 * inactive list in order to score a hit and be reactivated, while 1250 * inactive list in order to score a hit and be reactivated, while
1255 * trying not to cause reactivations too frequently. 1251 * trying not to cause reactivations too frequently.
1256 */ 1252 */
1257 if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) { 1253 if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) {
1258 return; 1254 return;
1259 } 1255 }
1260 1256
1261 /* Move only a few at a time; will catch up eventually. */ 1257 /* Move only a few at a time; will catch up eventually. */
1262 for (i = 0; i < cache_lru_maxdeact; i++) { 1258 for (i = 0; i < cache_lru_maxdeact; i++) {
1263 ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]); 1259 ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]);
1264 if (ncp == NULL) { 1260 if (ncp == NULL) {
1265 break; 1261 break;
1266 } 1262 }
1267 KASSERT(ncp->nc_lrulist == LRU_ACTIVE); 1263 KASSERT(ncp->nc_lrulist == LRU_ACTIVE);
1268 ncp->nc_lrulist = LRU_INACTIVE; 1264 ncp->nc_lrulist = LRU_INACTIVE;
1269 TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); 1265 TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
1270 TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru); 1266 TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru);
1271 cache_lru.count[LRU_ACTIVE]--; 1267 cache_lru.count[LRU_ACTIVE]--;
1272 cache_lru.count[LRU_INACTIVE]++; 1268 cache_lru.count[LRU_INACTIVE]++;
1273 } 1269 }
1274} 1270}
1275 1271
1276/* 1272/*
1277 * Free some entries from the cache, when we have gone over budget. 1273 * Free some entries from the cache, when we have gone over budget.
1278 * 1274 *
1279 * We don't want to cause too much work for any individual caller, and it 1275 * We don't want to cause too much work for any individual caller, and it
1280 * doesn't matter if we temporarily go over budget. This is also "just a 1276 * doesn't matter if we temporarily go over budget. This is also "just a
1281 * cache" so it's not a big deal if we screw up and throw out something we 1277 * cache" so it's not a big deal if we screw up and throw out something we
1282 * shouldn't. So we take a relaxed attitude to this process to reduce its 1278 * shouldn't. So we take a relaxed attitude to this process to reduce its
1283 * impact. 1279 * impact.
1284 */ 1280 */
1285static void 1281static void
1286cache_reclaim(void) 1282cache_reclaim(void)
1287{ 1283{
1288 struct namecache *ncp; 1284 struct namecache *ncp;
1289 vnode_impl_t *dvi; 1285 vnode_impl_t *dvi;
1290 int toscan; 1286 int toscan;
1291 1287
1292 /* 1288 /*
	 * Scan up to a preset maximum number of entries, but no more than
	 * 0.8% of the total at once (to allow for very small systems).
	 *
	 * On bigger systems, do a larger chunk of work to reduce the number
	 * of times that cache_lru_lock is held for any length of time.
	 */
	mutex_enter(&cache_lru_lock);
	toscan = MIN(cache_lru_maxscan, desiredvnodes >> 7);
	toscan = MAX(toscan, 1);
	SDT_PROBE(vfs, namecache, prune, done, cache_lru.count[LRU_ACTIVE] +
	    cache_lru.count[LRU_INACTIVE], toscan, 0, 0, 0);
	while (toscan-- != 0) {
		/* First try to balance the lists. */
		cache_deactivate();

		/* Now look for a victim on head of inactive list (old). */
		ncp = TAILQ_FIRST(&cache_lru.list[LRU_INACTIVE]);
		if (ncp == NULL) {
			break;
		}
		dvi = VNODE_TO_VIMPL(ncp->nc_dvp);
		KASSERT(ncp->nc_lrulist == LRU_INACTIVE);
		KASSERT(dvi != NULL);

		/*
		 * Locking in the wrong direction.  If we can't get the
		 * lock, the directory is actively busy, and it could also
		 * cause problems for the next guy in here, so send the
		 * entry to the back of the list.
		 */
		if (!rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) {
			TAILQ_REMOVE(&cache_lru.list[LRU_INACTIVE],
			    ncp, nc_lru);
			TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE],
			    ncp, nc_lru);
			continue;
		}

		/*
		 * Now have the victim entry locked.  Drop the LRU list
		 * lock, purge the entry, and start over.  The hold on
		 * vi_nc_lock will prevent the vnode from vanishing until
		 * finished (cache_purge() will be called on dvp before it
		 * disappears, and that will wait on vi_nc_lock).
		 */
		mutex_exit(&cache_lru_lock);
		cache_remove(ncp, true);
		rw_exit(&dvi->vi_nc_lock);
		mutex_enter(&cache_lru_lock);
	}
	mutex_exit(&cache_lru_lock);
}

/*
 * For file system code: count a lookup that required a full re-scan of
 * directory metadata.
 */
void
namecache_count_pass2(void)
{

	COUNT(ncs_pass2);
}

/*
 * For file system code: count a lookup that scored a hit in the directory
 * metadata near the location of the last lookup.
 */
void
namecache_count_2passes(void)
{

	COUNT(ncs_2passes);
}

/*
 * Sum the stats from all CPUs into nchstats.  This needs to run at least
 * once within every window where a 32-bit counter could roll over.  It's
 * called regularly by timer to ensure this.
 */
static void
cache_update_stats(void *cookie)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	mutex_enter(&cache_stat_lock);
	for (CPU_INFO_FOREACH(cii, ci)) {
		struct nchcpu *nchcpu = ci->ci_data.cpu_nch;
		UPDATE(nchcpu, ncs_goodhits);
		UPDATE(nchcpu, ncs_neghits);
		UPDATE(nchcpu, ncs_badhits);
		UPDATE(nchcpu, ncs_falsehits);
		UPDATE(nchcpu, ncs_miss);
		UPDATE(nchcpu, ncs_long);
		UPDATE(nchcpu, ncs_pass2);
		UPDATE(nchcpu, ncs_2passes);
		UPDATE(nchcpu, ncs_revhits);
		UPDATE(nchcpu, ncs_revmiss);
		UPDATE(nchcpu, ncs_denied);
	}
	if (cookie != NULL) {
		memcpy(cookie, &nchstats, sizeof(nchstats));
	}
	/* Reset the timer; arrive back here in N minutes at latest. */
	callout_schedule(&cache_stat_callout, cache_stat_interval * hz);
	mutex_exit(&cache_stat_lock);
}

/*
 * Fetch the current values of the stats for sysctl.
 */
static int
cache_stat_sysctl(SYSCTLFN_ARGS)
{
	struct nchstats stats;

	if (oldp == NULL) {
		*oldlenp = sizeof(nchstats);
		return 0;
	}

	if (*oldlenp <= 0) {
		*oldlenp = 0;
		return 0;
	}

	/* Refresh the global stats. */
	sysctl_unlock();
	cache_update_stats(&stats);
	sysctl_relock();

	*oldlenp = MIN(sizeof(stats), *oldlenp);
	return sysctl_copyout(l, &stats, oldp, *oldlenp);
}

/*
 * For the debugger, given the address of a vnode, print all associated
 * names in the cache.
 */
#ifdef DDB
void
namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
{
	struct vnode *dvp = NULL;
	struct namecache *ncp;
	enum cache_lru_id id;

	for (id = 0; id < LRU_COUNT; id++) {
		TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
			if (ncp->nc_vp == vp) {
				(*pr)("name %.*s\n", ncp->nc_nlen,
				    ncp->nc_name);
				dvp = ncp->nc_dvp;
			}
		}
	}
	if (dvp == NULL) {
		(*pr)("name not found\n");
		return;
	}
	for (id = 0; id < LRU_COUNT; id++) {
		TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
			if (ncp->nc_vp == dvp) {
				(*pr)("parent %.*s\n", ncp->nc_nlen,
				    ncp->nc_name);
			}
		}
	}
}
#endif
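
For context, the counters summed by cache_update_stats() are exported to userland through the "namecache_stats" node that nchinit() registers under CTL_VFS.  A minimal userland sketch of reading them might look like the following; it assumes the node resolves to the name "vfs.namecache_stats" and that struct nchstats with the ncs_* members shown above is available from <sys/namei.h>, neither of which is confirmed by this diff.

/*
 * Hedged sketch: dump a few name cache counters via sysctl.
 * Assumptions (not shown in the diff above): the node is reachable as
 * "vfs.namecache_stats", and struct nchstats with uint64_t ncs_*
 * members comes from <sys/namei.h>.
 */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/namei.h>

#include <err.h>
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	struct nchstats stats;
	size_t len = sizeof(stats);

	/* Mirrors what cache_stat_sysctl() copies out to the caller. */
	if (sysctlbyname("vfs.namecache_stats", &stats, &len, NULL, 0) == -1)
		err(1, "sysctlbyname");

	printf("goodhits %" PRIu64 ", neghits %" PRIu64 ", miss %" PRIu64 "\n",
	    stats.ncs_goodhits, stats.ncs_neghits, stats.ncs_miss);
	return 0;
}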