Wed Apr 22 21:35:52 2020 UTC ()
lookup_fastforward(): handle dotdot lookups and give up less often in
the union mount case.


(ad)
diff -r1.139 -r1.140 src/sys/kern/vfs_cache.c
diff -r1.218 -r1.219 src/sys/kern/vfs_lookup.c

cvs diff -r1.139 -r1.140 src/sys/kern/vfs_cache.c (switch to unified diff)

--- src/sys/kern/vfs_cache.c 2020/04/13 19:23:18 1.139
+++ src/sys/kern/vfs_cache.c 2020/04/22 21:35:52 1.140
@@ -1,1456 +1,1463 @@ @@ -1,1456 +1,1463 @@
1/* $NetBSD: vfs_cache.c,v 1.139 2020/04/13 19:23:18 ad Exp $ */ 1/* $NetBSD: vfs_cache.c,v 1.140 2020/04/22 21:35:52 ad Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran. 8 * by Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright 15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the 16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution. 17 * documentation and/or other materials provided with the distribution.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32/* 32/*
33 * Copyright (c) 1989, 1993 33 * Copyright (c) 1989, 1993
34 * The Regents of the University of California. All rights reserved. 34 * The Regents of the University of California. All rights reserved.
35 * 35 *
36 * Redistribution and use in source and binary forms, with or without 36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions 37 * modification, are permitted provided that the following conditions
38 * are met: 38 * are met:
39 * 1. Redistributions of source code must retain the above copyright 39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer. 40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright 41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the 42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution. 43 * documentation and/or other materials provided with the distribution.
44 * 3. Neither the name of the University nor the names of its contributors 44 * 3. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software 45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission. 46 * without specific prior written permission.
47 * 47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE. 58 * SUCH DAMAGE.
59 * 59 *
60 * @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94 60 * @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94
61 */ 61 */
62 62
63/* 63/*
64 * Name caching: 64 * Name caching:
65 * 65 *
66 * Names found by directory scans are retained in a cache for future 66 * Names found by directory scans are retained in a cache for future
67 * reference. It is managed LRU, so frequently used names will hang 67 * reference. It is managed LRU, so frequently used names will hang
68 * around. The cache is indexed by hash value obtained from the name. 68 * around. The cache is indexed by hash value obtained from the name.
69 * 69 *
70 * The name cache is the brainchild of Robert Elz and was introduced in 70 * The name cache is the brainchild of Robert Elz and was introduced in
71 * 4.3BSD. See "Using gprof to Tune the 4.2BSD Kernel", Marshall Kirk 71 * 4.3BSD. See "Using gprof to Tune the 4.2BSD Kernel", Marshall Kirk
72 * McKusick, May 21 1984. 72 * McKusick, May 21 1984.
73 * 73 *
74 * Data structures: 74 * Data structures:
75 * 75 *
76 * Most Unix namecaches very sensibly use a global hash table to index 76 * Most Unix namecaches very sensibly use a global hash table to index
77 * names. The global hash table works well, but can cause concurrency 77 * names. The global hash table works well, but can cause concurrency
78 * headaches for the kernel hacker. In the NetBSD 10.0 implementation 78 * headaches for the kernel hacker. In the NetBSD 10.0 implementation
79 * we are not sensible, and use a per-directory data structure to index 79 * we are not sensible, and use a per-directory data structure to index
80 * names, but the cache otherwise functions the same. 80 * names, but the cache otherwise functions the same.
81 * 81 *
82 * The index is a red-black tree. There are no special concurrency 82 * The index is a red-black tree. There are no special concurrency
83 * requirements placed on it, because it's per-directory and protected 83 * requirements placed on it, because it's per-directory and protected
84 * by the namecache's per-directory locks. It should therefore not be 84 * by the namecache's per-directory locks. It should therefore not be
85 * difficult to experiment with other types of index. 85 * difficult to experiment with other types of index.
86 * 86 *
87 * Each cached name is stored in a struct namecache, along with a 87 * Each cached name is stored in a struct namecache, along with a
88 * pointer to the associated vnode (nc_vp). Names longer than a 88 * pointer to the associated vnode (nc_vp). Names longer than a
89 * maximum length of NCHNAMLEN are allocated with kmem_alloc(); they 89 * maximum length of NCHNAMLEN are allocated with kmem_alloc(); they
90 * occur infrequently, and names shorter than this are stored directly 90 * occur infrequently, and names shorter than this are stored directly
91 * in struct namecache. If it is a "negative" entry, (i.e. for a name 91 * in struct namecache. If it is a "negative" entry, (i.e. for a name
92 * that is known NOT to exist) the vnode pointer will be NULL. 92 * that is known NOT to exist) the vnode pointer will be NULL.
93 * 93 *
94 * For a directory with 3 cached names for 3 distinct vnodes, the 94 * For a directory with 3 cached names for 3 distinct vnodes, the
95 * various vnodes and namecache structs would be connected like this 95 * various vnodes and namecache structs would be connected like this
96 * (the root is at the bottom of the diagram): 96 * (the root is at the bottom of the diagram):
97 * 97 *
98 * ... 98 * ...
99 * ^ 99 * ^
100 * |- vi_nc_tree 100 * |- vi_nc_tree
101 * |  101 * |
102 * +----o----+ +---------+ +---------+ 102 * +----o----+ +---------+ +---------+
103 * | VDIR | | VCHR | | VREG | 103 * | VDIR | | VCHR | | VREG |
104 * | vnode o-----+ | vnode o-----+ | vnode o------+ 104 * | vnode o-----+ | vnode o-----+ | vnode o------+
105 * +---------+ | +---------+ | +---------+ | 105 * +---------+ | +---------+ | +---------+ |
106 * ^ | ^ | ^ | 106 * ^ | ^ | ^ |
107 * |- nc_vp |- vi_nc_list |- nc_vp |- vi_nc_list |- nc_vp | 107 * |- nc_vp |- vi_nc_list |- nc_vp |- vi_nc_list |- nc_vp |
108 * | | | | | | 108 * | | | | | |
109 * +----o----+ | +----o----+ | +----o----+ | 109 * +----o----+ | +----o----+ | +----o----+ |
110 * +---onamecache|<----+ +---onamecache|<----+ +---onamecache|<-----+ 110 * +---onamecache|<----+ +---onamecache|<----+ +---onamecache|<-----+
111 * | +---------+ | +---------+ | +---------+ 111 * | +---------+ | +---------+ | +---------+
112 * | ^ | ^ | ^ 112 * | ^ | ^ | ^
113 * | | | | | | 113 * | | | | | |
114 * | | +----------------------+ | | 114 * | | +----------------------+ | |
115 * |-nc_dvp | +-------------------------------------------------+ 115 * |-nc_dvp | +-------------------------------------------------+
116 * | |/- vi_nc_tree | | 116 * | |/- vi_nc_tree | |
117 * | | |- nc_dvp |- nc_dvp 117 * | | |- nc_dvp |- nc_dvp
118 * | +----o----+ | | 118 * | +----o----+ | |
119 * +-->| VDIR |<----------+ | 119 * +-->| VDIR |<----------+ |
120 * | vnode |<------------------------------------+ 120 * | vnode |<------------------------------------+
121 * +---------+ 121 * +---------+
122 * 122 *
123 * START HERE 123 * START HERE
124 * 124 *
125 * Replacement: 125 * Replacement:
126 * 126 *
127 * As the cache becomes full, old and unused entries are purged as new 127 * As the cache becomes full, old and unused entries are purged as new
128 * entries are added. The synchronization overhead in maintaining a 128 * entries are added. The synchronization overhead in maintaining a
129 * strict ordering would be prohibitive, so the VM system's "clock" or 129 * strict ordering would be prohibitive, so the VM system's "clock" or
130 * "second chance" page replacement algorithm is aped here. New 130 * "second chance" page replacement algorithm is aped here. New
131 * entries go to the tail of the active list. After they age out and 131 * entries go to the tail of the active list. After they age out and
132 * reach the head of the list, they are moved to the tail of the 132 * reach the head of the list, they are moved to the tail of the
133 * inactive list. Any use of the deactivated cache entry reactivates 133 * inactive list. Any use of the deactivated cache entry reactivates
134 * it, saving it from impending doom; if not reactivated, the entry 134 * it, saving it from impending doom; if not reactivated, the entry
135 * eventually reaches the head of the inactive list and is purged. 135 * eventually reaches the head of the inactive list and is purged.
136 * 136 *
137 * Concurrency: 137 * Concurrency:
138 * 138 *
139 * From a performance perspective, cache_lookup(nameiop == LOOKUP) is 139 * From a performance perspective, cache_lookup(nameiop == LOOKUP) is
140 * what really matters; insertion of new entries with cache_enter() is 140 * what really matters; insertion of new entries with cache_enter() is
141 * comparatively infrequent, and overshadowed by the cost of expensive 141 * comparatively infrequent, and overshadowed by the cost of expensive
142 * file system metadata operations (which may involve disk I/O). We 142 * file system metadata operations (which may involve disk I/O). We
143 * therefore want to make everything simplest in the lookup path. 143 * therefore want to make everything simplest in the lookup path.
144 * 144 *
145 * struct namecache is mostly stable except for list and tree related 145 * struct namecache is mostly stable except for list and tree related
146 * entries, changes to which don't affect the cached name or vnode.  146 * entries, changes to which don't affect the cached name or vnode.
147 * For changes to name+vnode, entries are purged in preference to 147 * For changes to name+vnode, entries are purged in preference to
148 * modifying them. 148 * modifying them.
149 * 149 *
150 * Read access to namecache entries is made via tree, list, or LRU 150 * Read access to namecache entries is made via tree, list, or LRU
151 * list. A lock corresponding to the direction of access should be 151 * list. A lock corresponding to the direction of access should be
152 * held. See definition of "struct namecache" in src/sys/namei.src, 152 * held. See definition of "struct namecache" in src/sys/namei.src,
153 * and the definition of "struct vnode" for the particulars. 153 * and the definition of "struct vnode" for the particulars.
154 * 154 *
155 * Per-CPU statistics, and LRU list totals are read unlocked, since 155 * Per-CPU statistics, and LRU list totals are read unlocked, since
156 * an approximate value is OK. We maintain 32-bit sized per-CPU 156 * an approximate value is OK. We maintain 32-bit sized per-CPU
157 * counters and 64-bit global counters under the theory that 32-bit 157 * counters and 64-bit global counters under the theory that 32-bit
158 * sized counters are less likely to be hosed by nonatomic increment 158 * sized counters are less likely to be hosed by nonatomic increment
159 * (on 32-bit platforms). 159 * (on 32-bit platforms).
160 * 160 *
161 * The lock order is: 161 * The lock order is:
162 * 162 *
163 * 1) vi->vi_nc_lock (tree or parent -> child direction, 163 * 1) vi->vi_nc_lock (tree or parent -> child direction,
164 * used during forward lookup) 164 * used during forward lookup)
165 * 165 *
166 * 2) vi->vi_nc_listlock (list or child -> parent direction, 166 * 2) vi->vi_nc_listlock (list or child -> parent direction,
167 * used during reverse lookup) 167 * used during reverse lookup)
168 * 168 *
169 * 3) cache_lru_lock (LRU list direction, used during reclaim) 169 * 3) cache_lru_lock (LRU list direction, used during reclaim)
170 * 170 *
171 * 4) vp->v_interlock (what the cache entry points to) 171 * 4) vp->v_interlock (what the cache entry points to)
172 */ 172 */
173 173
174#include <sys/cdefs.h> 174#include <sys/cdefs.h>
175__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.139 2020/04/13 19:23:18 ad Exp $"); 175__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.140 2020/04/22 21:35:52 ad Exp $");
176 176
177#define __NAMECACHE_PRIVATE 177#define __NAMECACHE_PRIVATE
178#ifdef _KERNEL_OPT 178#ifdef _KERNEL_OPT
179#include "opt_ddb.h" 179#include "opt_ddb.h"
180#include "opt_dtrace.h" 180#include "opt_dtrace.h"
181#endif 181#endif
182 182
183#include <sys/types.h> 183#include <sys/types.h>
184#include <sys/atomic.h> 184#include <sys/atomic.h>
185#include <sys/callout.h> 185#include <sys/callout.h>
186#include <sys/cpu.h> 186#include <sys/cpu.h>
187#include <sys/errno.h> 187#include <sys/errno.h>
188#include <sys/evcnt.h> 188#include <sys/evcnt.h>
189#include <sys/hash.h> 189#include <sys/hash.h>
190#include <sys/kernel.h> 190#include <sys/kernel.h>
191#include <sys/mount.h> 191#include <sys/mount.h>
192#include <sys/mutex.h> 192#include <sys/mutex.h>
193#include <sys/namei.h> 193#include <sys/namei.h>
194#include <sys/param.h> 194#include <sys/param.h>
195#include <sys/pool.h> 195#include <sys/pool.h>
196#include <sys/sdt.h> 196#include <sys/sdt.h>
197#include <sys/sysctl.h> 197#include <sys/sysctl.h>
198#include <sys/systm.h> 198#include <sys/systm.h>
199#include <sys/time.h> 199#include <sys/time.h>
200#include <sys/vnode_impl.h> 200#include <sys/vnode_impl.h>
201 201
202#include <miscfs/genfs/genfs.h> 202#include <miscfs/genfs/genfs.h>
203 203
204static void cache_activate(struct namecache *); 204static void cache_activate(struct namecache *);
205static void cache_update_stats(void *); 205static void cache_update_stats(void *);
206static int cache_compare_nodes(void *, const void *, const void *); 206static int cache_compare_nodes(void *, const void *, const void *);
207static void cache_deactivate(void); 207static void cache_deactivate(void);
208static void cache_reclaim(void); 208static void cache_reclaim(void);
209static int cache_stat_sysctl(SYSCTLFN_ARGS); 209static int cache_stat_sysctl(SYSCTLFN_ARGS);
210 210
211/* 211/*
212 * Global pool cache. 212 * Global pool cache.
213 */ 213 */
214static pool_cache_t cache_pool __read_mostly; 214static pool_cache_t cache_pool __read_mostly;
215 215
216/* 216/*
217 * LRU replacement. 217 * LRU replacement.
218 */ 218 */
219enum cache_lru_id { 219enum cache_lru_id {
220 LRU_ACTIVE, 220 LRU_ACTIVE,
221 LRU_INACTIVE, 221 LRU_INACTIVE,
222 LRU_COUNT 222 LRU_COUNT
223}; 223};
224 224
225static struct { 225static struct {
226 TAILQ_HEAD(, namecache) list[LRU_COUNT]; 226 TAILQ_HEAD(, namecache) list[LRU_COUNT];
227 u_int count[LRU_COUNT]; 227 u_int count[LRU_COUNT];
228} cache_lru __cacheline_aligned; 228} cache_lru __cacheline_aligned;
229 229
230static kmutex_t cache_lru_lock __cacheline_aligned; 230static kmutex_t cache_lru_lock __cacheline_aligned;
231 231
232/* 232/*
233 * Cache effectiveness statistics. nchstats holds system-wide total. 233 * Cache effectiveness statistics. nchstats holds system-wide total.
234 */ 234 */
235struct nchstats nchstats; 235struct nchstats nchstats;
236struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t); 236struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);
237struct nchcpu { 237struct nchcpu {
238 struct nchstats_percpu cur; 238 struct nchstats_percpu cur;
239 struct nchstats_percpu last; 239 struct nchstats_percpu last;
240}; 240};
241static callout_t cache_stat_callout; 241static callout_t cache_stat_callout;
242static kmutex_t cache_stat_lock __cacheline_aligned; 242static kmutex_t cache_stat_lock __cacheline_aligned;
243 243
244#define COUNT(f) do { \ 244#define COUNT(f) do { \
245 lwp_t *l = curlwp; \ 245 lwp_t *l = curlwp; \
246 KPREEMPT_DISABLE(l); \ 246 KPREEMPT_DISABLE(l); \
247 ((struct nchstats_percpu *)curcpu()->ci_data.cpu_nch)->f++; \ 247 ((struct nchstats_percpu *)curcpu()->ci_data.cpu_nch)->f++; \
248 KPREEMPT_ENABLE(l); \ 248 KPREEMPT_ENABLE(l); \
249} while (/* CONSTCOND */ 0); 249} while (/* CONSTCOND */ 0);
250 250
251#define UPDATE(nchcpu, f) do { \ 251#define UPDATE(nchcpu, f) do { \
252 uint32_t cur = atomic_load_relaxed(&nchcpu->cur.f); \ 252 uint32_t cur = atomic_load_relaxed(&nchcpu->cur.f); \
253 nchstats.f += (uint32_t)(cur - nchcpu->last.f); \ 253 nchstats.f += (uint32_t)(cur - nchcpu->last.f); \
254 nchcpu->last.f = cur; \ 254 nchcpu->last.f = cur; \
255} while (/* CONSTCOND */ 0) 255} while (/* CONSTCOND */ 0)
256 256
257/* 257/*
258 * Tunables. cache_maxlen replaces the historical doingcache: 258 * Tunables. cache_maxlen replaces the historical doingcache:
259 * set it zero to disable caching for debugging purposes. 259 * set it zero to disable caching for debugging purposes.
260 */ 260 */
261int cache_lru_maxdeact __read_mostly = 2; /* max # to deactivate */ 261int cache_lru_maxdeact __read_mostly = 2; /* max # to deactivate */
262int cache_lru_maxscan __read_mostly = 64; /* max # to scan/reclaim */ 262int cache_lru_maxscan __read_mostly = 64; /* max # to scan/reclaim */
263int cache_maxlen __read_mostly = USHRT_MAX; /* max name length to cache */ 263int cache_maxlen __read_mostly = USHRT_MAX; /* max name length to cache */
264int cache_stat_interval __read_mostly = 300; /* in seconds */ 264int cache_stat_interval __read_mostly = 300; /* in seconds */
265 265
266/* 266/*
267 * sysctl stuff. 267 * sysctl stuff.
268 */ 268 */
269static struct sysctllog *cache_sysctllog; 269static struct sysctllog *cache_sysctllog;
270 270
271/* 271/*
272 * Red-black tree stuff. 272 * Red-black tree stuff.
273 */ 273 */
274static const rb_tree_ops_t cache_rbtree_ops = { 274static const rb_tree_ops_t cache_rbtree_ops = {
275 .rbto_compare_nodes = cache_compare_nodes, 275 .rbto_compare_nodes = cache_compare_nodes,
276 .rbto_compare_key = cache_compare_nodes, 276 .rbto_compare_key = cache_compare_nodes,
277 .rbto_node_offset = offsetof(struct namecache, nc_tree), 277 .rbto_node_offset = offsetof(struct namecache, nc_tree),
278 .rbto_context = NULL 278 .rbto_context = NULL
279}; 279};
280 280
281/* 281/*
282 * dtrace probes. 282 * dtrace probes.
283 */ 283 */
284SDT_PROVIDER_DEFINE(vfs); 284SDT_PROVIDER_DEFINE(vfs);
285 285
286SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *"); 286SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *");
287SDT_PROBE_DEFINE1(vfs, namecache, purge, parents, "struct vnode *"); 287SDT_PROBE_DEFINE1(vfs, namecache, purge, parents, "struct vnode *");
288SDT_PROBE_DEFINE1(vfs, namecache, purge, children, "struct vnode *"); 288SDT_PROBE_DEFINE1(vfs, namecache, purge, children, "struct vnode *");
289SDT_PROBE_DEFINE2(vfs, namecache, purge, name, "char *", "size_t"); 289SDT_PROBE_DEFINE2(vfs, namecache, purge, name, "char *", "size_t");
290SDT_PROBE_DEFINE1(vfs, namecache, purge, vfs, "struct mount *"); 290SDT_PROBE_DEFINE1(vfs, namecache, purge, vfs, "struct mount *");
291SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", 291SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *",
292 "char *", "size_t"); 292 "char *", "size_t");
293SDT_PROBE_DEFINE3(vfs, namecache, lookup, miss, "struct vnode *", 293SDT_PROBE_DEFINE3(vfs, namecache, lookup, miss, "struct vnode *",
294 "char *", "size_t"); 294 "char *", "size_t");
295SDT_PROBE_DEFINE3(vfs, namecache, lookup, toolong, "struct vnode *", 295SDT_PROBE_DEFINE3(vfs, namecache, lookup, toolong, "struct vnode *",
296 "char *", "size_t"); 296 "char *", "size_t");
297SDT_PROBE_DEFINE2(vfs, namecache, revlookup, success, "struct vnode *", 297SDT_PROBE_DEFINE2(vfs, namecache, revlookup, success, "struct vnode *",
298 "struct vnode *"); 298 "struct vnode *");
299SDT_PROBE_DEFINE2(vfs, namecache, revlookup, fail, "struct vnode *", 299SDT_PROBE_DEFINE2(vfs, namecache, revlookup, fail, "struct vnode *",
300 "int"); 300 "int");
301SDT_PROBE_DEFINE2(vfs, namecache, prune, done, "int", "int"); 301SDT_PROBE_DEFINE2(vfs, namecache, prune, done, "int", "int");
302SDT_PROBE_DEFINE3(vfs, namecache, enter, toolong, "struct vnode *", 302SDT_PROBE_DEFINE3(vfs, namecache, enter, toolong, "struct vnode *",
303 "char *", "size_t"); 303 "char *", "size_t");
304SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", 304SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *",
305 "char *", "size_t"); 305 "char *", "size_t");
306 306
307/* 307/*
308 * rbtree: compare two nodes. 308 * rbtree: compare two nodes.
309 */ 309 */
310static int 310static int
311cache_compare_nodes(void *context, const void *n1, const void *n2) 311cache_compare_nodes(void *context, const void *n1, const void *n2)
312{ 312{
313 const struct namecache *nc1 = n1; 313 const struct namecache *nc1 = n1;
314 const struct namecache *nc2 = n2; 314 const struct namecache *nc2 = n2;
315 315
316 if (nc1->nc_key < nc2->nc_key) { 316 if (nc1->nc_key < nc2->nc_key) {
317 return -1; 317 return -1;
318 } 318 }
319 if (nc1->nc_key > nc2->nc_key) { 319 if (nc1->nc_key > nc2->nc_key) {
320 return 1; 320 return 1;
321 } 321 }
322 KASSERT(nc1->nc_nlen == nc2->nc_nlen); 322 KASSERT(nc1->nc_nlen == nc2->nc_nlen);
323 return memcmp(nc1->nc_name, nc2->nc_name, nc1->nc_nlen); 323 return memcmp(nc1->nc_name, nc2->nc_name, nc1->nc_nlen);
324} 324}
325 325
326/* 326/*
327 * Compute a key value for the given name. The name length is encoded in 327 * Compute a key value for the given name. The name length is encoded in
328 * the key value to try and improve uniqueness, and so that length doesn't 328 * the key value to try and improve uniqueness, and so that length doesn't
329 * need to be compared separately for string comparisons. 329 * need to be compared separately for string comparisons.
330 */ 330 */
331static inline uint64_t 331static inline uint64_t
332cache_key(const char *name, size_t nlen) 332cache_key(const char *name, size_t nlen)
333{ 333{
334 uint64_t key; 334 uint64_t key;
335 335
336 KASSERT(nlen <= USHRT_MAX); 336 KASSERT(nlen <= USHRT_MAX);
337 337
338 key = hash32_buf(name, nlen, HASH32_STR_INIT); 338 key = hash32_buf(name, nlen, HASH32_STR_INIT);
339 return (key << 32) | nlen; 339 return (key << 32) | nlen;
340} 340}
341 341
342/* 342/*
343 * Remove an entry from the cache. vi_nc_lock must be held, and if dir2node 343 * Remove an entry from the cache. vi_nc_lock must be held, and if dir2node
344 * is true, then we're locking in the conventional direction and the list 344 * is true, then we're locking in the conventional direction and the list
345 * lock will be acquired when removing the entry from the vnode list. 345 * lock will be acquired when removing the entry from the vnode list.
346 */ 346 */
347static void 347static void
348cache_remove(struct namecache *ncp, const bool dir2node) 348cache_remove(struct namecache *ncp, const bool dir2node)
349{ 349{
350 struct vnode *vp, *dvp = ncp->nc_dvp; 350 struct vnode *vp, *dvp = ncp->nc_dvp;
351 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); 351 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
352 352
353 KASSERT(rw_write_held(&dvi->vi_nc_lock)); 353 KASSERT(rw_write_held(&dvi->vi_nc_lock));
354 KASSERT(cache_key(ncp->nc_name, ncp->nc_nlen) == ncp->nc_key); 354 KASSERT(cache_key(ncp->nc_name, ncp->nc_nlen) == ncp->nc_key);
355 KASSERT(rb_tree_find_node(&dvi->vi_nc_tree, ncp) == ncp); 355 KASSERT(rb_tree_find_node(&dvi->vi_nc_tree, ncp) == ncp);
356 356
357 SDT_PROBE(vfs, namecache, invalidate, done, ncp, 357 SDT_PROBE(vfs, namecache, invalidate, done, ncp,
358 0, 0, 0, 0); 358 0, 0, 0, 0);
359 359
360 /* 360 /*
361 * Remove from the vnode's list. This excludes cache_revlookup(), 361 * Remove from the vnode's list. This excludes cache_revlookup(),
362 * and then it's safe to remove from the LRU lists. 362 * and then it's safe to remove from the LRU lists.
363 */ 363 */
364 if ((vp = ncp->nc_vp) != NULL) { 364 if ((vp = ncp->nc_vp) != NULL) {
365 vnode_impl_t *vi = VNODE_TO_VIMPL(vp); 365 vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
366 if (__predict_true(dir2node)) { 366 if (__predict_true(dir2node)) {
367 rw_enter(&vi->vi_nc_listlock, RW_WRITER); 367 rw_enter(&vi->vi_nc_listlock, RW_WRITER);
368 TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list); 368 TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list);
369 rw_exit(&vi->vi_nc_listlock); 369 rw_exit(&vi->vi_nc_listlock);
370 } else { 370 } else {
371 TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list); 371 TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list);
372 } 372 }
373 } 373 }
374 374
375 /* Remove from the directory's rbtree. */ 375 /* Remove from the directory's rbtree. */
376 rb_tree_remove_node(&dvi->vi_nc_tree, ncp); 376 rb_tree_remove_node(&dvi->vi_nc_tree, ncp);
377 377
378 /* Remove from the LRU lists. */ 378 /* Remove from the LRU lists. */
379 mutex_enter(&cache_lru_lock); 379 mutex_enter(&cache_lru_lock);
380 TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru); 380 TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
381 cache_lru.count[ncp->nc_lrulist]--; 381 cache_lru.count[ncp->nc_lrulist]--;
382 mutex_exit(&cache_lru_lock); 382 mutex_exit(&cache_lru_lock);
383 383
384 /* Finally, free it. */ 384 /* Finally, free it. */
385 if (ncp->nc_nlen > NCHNAMLEN) { 385 if (ncp->nc_nlen > NCHNAMLEN) {
386 size_t sz = offsetof(struct namecache, nc_name[ncp->nc_nlen]); 386 size_t sz = offsetof(struct namecache, nc_name[ncp->nc_nlen]);
387 kmem_free(ncp, sz); 387 kmem_free(ncp, sz);
388 } else { 388 } else {
389 pool_cache_put(cache_pool, ncp); 389 pool_cache_put(cache_pool, ncp);
390 } 390 }
391} 391}
392 392
/*
 * Find a single cache entry and return it.  vi_nc_lock must be held.
 *
 * => dvp: the directory being searched; the caller holds its vi_nc_lock
 *    (reader or writer, per the KASSERT below).
 * => name/namelen: the component name to match.
 * => key: 64-bit hash of the name, precomputed with cache_key().
 *
 * Returns the matching namecache entry, or NULL if none exists.  As a
 * side effect a found entry that is not on the ACTIVE LRU list is
 * requeued with cache_activate().
 */
static struct namecache * __noinline
cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen,
    uint64_t key)
{
	vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
	struct rb_node *node = dvi->vi_nc_tree.rbt_root;
	struct namecache *ncp;
	int lrulist, diff;

	KASSERT(rw_lock_held(&dvi->vi_nc_lock));

	/*
	 * Search the RB tree for the key.  This is an inlined lookup
	 * tailored for exactly what's needed here (64-bit key and so on)
	 * that is quite a bit faster than using rb_tree_find_node().
	 *
	 * For a matching key memcmp() needs to be called once to confirm
	 * that the correct name has been found.  Very rarely there will be
	 * a key value collision and the search will continue.
	 */
	for (;;) {
		if (__predict_false(RB_SENTINEL_P(node))) {
			/* Fell off the tree: no such entry. */
			return NULL;
		}
		ncp = (struct namecache *)node;
		/* The cast above relies on nc_tree being at offset zero. */
		KASSERT((void *)&ncp->nc_tree == (void *)ncp);
		KASSERT(ncp->nc_dvp == dvp);
		if (ncp->nc_key == key) {
			KASSERT(ncp->nc_nlen == namelen);
			diff = memcmp(ncp->nc_name, name, namelen);
			if (__predict_true(diff == 0)) {
				break;
			}
			/* Key collision: tie-break on the name bytes. */
			node = node->rb_nodes[diff < 0];
		} else {
			node = node->rb_nodes[ncp->nc_key < key];
		}
	}

	/*
	 * If the entry is on the wrong LRU list, requeue it.  This is an
	 * unlocked check, but it will rarely be wrong and even then there
	 * will be no harm caused.
	 */
	lrulist = atomic_load_relaxed(&ncp->nc_lrulist);
	if (__predict_false(lrulist != LRU_ACTIVE)) {
		cache_activate(ncp);
	}
	return ncp;
}
446 446
/*
 * Look for a the name in the cache. We don't do this
 * if the segment name is long, simply so the cache can avoid
 * holding long names (which would either waste space, or
 * add greatly to the complexity).
 *
 * Lookup is called with DVP pointing to the directory to search,
 * and CNP providing the name of the entry being sought: cn_nameptr
 * is the name, cn_namelen is its length, and cn_flags is the flags
 * word from the namei operation.
 *
 * DVP must be locked.
 *
 * There are three possible non-error return states:
 *    1. Nothing was found in the cache. Nothing is known about
 *       the requested name.
 *    2. A negative entry was found in the cache, meaning that the
 *       requested name definitely does not exist.
 *    3. A positive entry was found in the cache, meaning that the
 *       requested name does exist and that we are providing the
 *       vnode.
 * In these cases the results are:
 *    1. 0 returned; VN is set to NULL.
 *    2. 1 returned; VN is set to NULL.
 *    3. 1 returned; VN is set to the vnode found.
 *
 * The additional result argument ISWHT is set to zero, unless a
 * negative entry is found that was entered as a whiteout, in which
 * case ISWHT is set to one.
 *
 * The ISWHT_RET argument pointer may be null. In this case an
 * assertion is made that the whiteout flag is not set. File systems
 * that do not support whiteouts can/should do this.
 *
 * Filesystems that do support whiteouts should add ISWHITEOUT to
 * cnp->cn_flags if ISWHT comes back nonzero.
 *
 * When a vnode is returned, it is locked, as per the vnode lookup
 * locking protocol.
 *
 * There is no way for this function to fail, in the sense of
 * generating an error that requires aborting the namei operation.
 *
 * (Prior to October 2012, this function returned an integer status,
 * and a vnode, and mucked with the flags word in CNP for whiteouts.
 * The integer status was -1 for "nothing found", ENOENT for "a
 * negative entry found", 0 for "a positive entry found", and possibly
 * other errors, and the value of VN might or might not have been set
 * depending on what error occurred.)
 */
bool
cache_lookup(struct vnode *dvp, const char *name, size_t namelen,
	     uint32_t nameiop, uint32_t cnflags,
	     int *iswht_ret, struct vnode **vn_ret)
{
	vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
	struct namecache *ncp;
	struct vnode *vp;
	uint64_t key;
	int error;
	bool hit;
	krw_t op;

	/* Establish default result values */
	if (iswht_ret != NULL) {
		*iswht_ret = 0;
	}
	*vn_ret = NULL;

	/* Refuse overlong names; the cache never stores them. */
	if (__predict_false(namelen > cache_maxlen)) {
		SDT_PROBE(vfs, namecache, lookup, toolong, dvp,
		    name, namelen, 0, 0);
		COUNT(ncs_long);
		return false;
	}

	/* Compute the key up front - don't need the lock. */
	key = cache_key(name, namelen);

	/*
	 * Could the entry be purged below?  If so take the directory lock
	 * as a writer up front, since cache_remove() needs it; otherwise a
	 * reader hold suffices for the lookup itself.
	 */
	if ((cnflags & ISLASTCN) != 0 &&
	    ((cnflags & MAKEENTRY) == 0 || nameiop == CREATE)) {
		op = RW_WRITER;
	} else {
		op = RW_READER;
	}

	/* Now look for the name. */
	rw_enter(&dvi->vi_nc_lock, op);
	ncp = cache_lookup_entry(dvp, name, namelen, key);
	if (__predict_false(ncp == NULL)) {
		rw_exit(&dvi->vi_nc_lock);
		COUNT(ncs_miss);
		SDT_PROBE(vfs, namecache, lookup, miss, dvp,
		    name, namelen, 0, 0);
		return false;
	}
	if (__predict_false((cnflags & MAKEENTRY) == 0)) {
		/*
		 * Last component and we are renaming or deleting,
		 * the cache entry is invalid, or otherwise don't
		 * want cache entry to exist.
		 */
		KASSERT((cnflags & ISLASTCN) != 0);
		cache_remove(ncp, true);
		rw_exit(&dvi->vi_nc_lock);
		COUNT(ncs_badhits);
		return false;
	}
	if (ncp->nc_vp == NULL) {
		/* Negative entry: the name is known not to exist. */
		if (iswht_ret != NULL) {
			/*
			 * Restore the ISWHITEOUT flag saved earlier.
			 */
			*iswht_ret = ncp->nc_whiteout;
		} else {
			KASSERT(!ncp->nc_whiteout);
		}
		if (nameiop == CREATE && (cnflags & ISLASTCN) != 0) {
			/*
			 * Last component and we are preparing to create
			 * the named object, so flush the negative cache
			 * entry.
			 */
			COUNT(ncs_badhits);
			cache_remove(ncp, true);
			hit = false;
		} else {
			COUNT(ncs_neghits);
			SDT_PROBE(vfs, namecache, lookup, hit, dvp, name,
			    namelen, 0, 0);
			/* found neg entry; vn is already null from above */
			hit = true;
		}
		rw_exit(&dvi->vi_nc_lock);
		return hit;
	}
	/*
	 * Positive hit: take the vnode's interlock before dropping the
	 * directory lock so the vnode can't be reclaimed from under us.
	 */
	vp = ncp->nc_vp;
	mutex_enter(vp->v_interlock);
	rw_exit(&dvi->vi_nc_lock);

	/*
	 * Unlocked except for the vnode interlock.  Call vcache_tryvget().
	 */
	error = vcache_tryvget(vp);
	if (error) {
		KASSERT(error == EBUSY);
		/*
		 * This vnode is being cleaned out.
		 * XXX badhits?
		 */
		COUNT(ncs_falsehits);
		return false;
	}

	COUNT(ncs_goodhits);
	SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
	/* found it */
	*vn_ret = vp;
	return true;
}
608 608
609/* 609/*
610 * Version of the above without the nameiop argument, for NFS. 610 * Version of the above without the nameiop argument, for NFS.
611 */ 611 */
612bool 612bool
613cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen, 613cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen,
614 uint32_t cnflags, 614 uint32_t cnflags,
615 int *iswht_ret, struct vnode **vn_ret) 615 int *iswht_ret, struct vnode **vn_ret)
616{ 616{
617 617
618 return cache_lookup(dvp, name, namelen, LOOKUP, cnflags | MAKEENTRY, 618 return cache_lookup(dvp, name, namelen, LOOKUP, cnflags | MAKEENTRY,
619 iswht_ret, vn_ret); 619 iswht_ret, vn_ret);
620} 620}
621 621
/*
 * Used by namei() to walk down a path, component by component by looking up
 * names in the cache.  The node locks are chained along the way: a parent's
 * lock is not dropped until the child's is acquired.
 *
 * => dvp: directory to search; name/namelen: component to look up.
 * => *plock: lock held from the previous step (NULL at the start of a
 *    chain, in which case the caller must hold a reference on dvp).
 *
 * Returns true on a cache hit.  For a positive hit *vn_ret is set to
 * the vnode; no reference is taken on it - it is kept stable by the
 * directory lock, which is returned held in *plock (see the comment at
 * the bottom).  For a negative hit *vn_ret remains NULL.  Returns
 * false if the walk cannot be continued via the cache; note that on
 * the access-denied and miss paths below, any lock acquired here is
 * still held in *plock for the caller to release.
 */
bool
cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen,
    struct vnode **vn_ret, krwlock_t **plock,
    kauth_cred_t cred)
{
	vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
	struct namecache *ncp;
	uint64_t key;
	int error;

	/* Establish default results. */
	*vn_ret = NULL;

	/* If disabled, or file system doesn't support this, bail out. */
	if (__predict_false((dvp->v_mount->mnt_iflag & IMNT_NCLOOKUP) == 0)) {
		return false;
	}

	if (__predict_false(namelen > cache_maxlen)) {
		COUNT(ncs_long);
		return false;
	}

	/* Compute the key up front - don't need the lock. */
	key = cache_key(name, namelen);

	/*
	 * Acquire the directory lock.  Once we have that, we can drop the
	 * previous one (if any).
	 *
	 * The two lock holds mean that the directory can't go away while
	 * here: the directory must be purged with cache_purge() before
	 * being freed, and both parent & child's vi_nc_lock must be taken
	 * before that point is passed.
	 *
	 * However if there's no previous lock, like at the root of the
	 * chain, then "dvp" must be referenced to prevent dvp going away
	 * before we get its lock.
	 *
	 * Note that the two locks can be the same if looking up a dot, for
	 * example: /usr/bin/.  If looking up the parent (..) we can't wait
	 * on the lock as child -> parent is the wrong direction.
	 */
	if (*plock != &dvi->vi_nc_lock) {
		if (namelen == 2 && name[0] == '.' && name[1] == '.') {
			/*
			 * Dotdot: taking the parent's lock while holding
			 * the child's would invert the usual parent ->
			 * child order, so only try-lock; on contention
			 * fall back to the slow lookup path.
			 */
			if (!rw_tryenter(&dvi->vi_nc_lock, RW_READER)) {
				return false;
			}
		} else {
			rw_enter(&dvi->vi_nc_lock, RW_READER);
		}
		if (*plock != NULL) {
			rw_exit(*plock);
		}
		*plock = &dvi->vi_nc_lock;
	} else if (*plock == NULL) {
		/*
		 * NOTE(review): this branch looks unreachable, since
		 * *plock == &dvi->vi_nc_lock above implies *plock is
		 * non-NULL - TODO confirm intent.
		 */
		KASSERT(vrefcnt(dvp) > 0);
	}

	/*
	 * First up check if the user is allowed to look up files in this
	 * directory.
	 */
	KASSERT(dvi->vi_nc_mode != VNOVAL && dvi->vi_nc_uid != VNOVAL &&
	    dvi->vi_nc_gid != VNOVAL);
	error = kauth_authorize_vnode(cred, KAUTH_ACCESS_ACTION(VEXEC,
	    dvp->v_type, dvi->vi_nc_mode & ALLPERMS), dvp, NULL,
	    genfs_can_access(dvp->v_type, dvi->vi_nc_mode & ALLPERMS,
	    dvi->vi_nc_uid, dvi->vi_nc_gid, VEXEC, cred));
	if (error != 0) {
		COUNT(ncs_denied);
		return false;
	}

	/*
	 * Now look for a matching cache entry.
	 */
	ncp = cache_lookup_entry(dvp, name, namelen, key);
	if (__predict_false(ncp == NULL)) {
		COUNT(ncs_miss);
		SDT_PROBE(vfs, namecache, lookup, miss, dvp,
		    name, namelen, 0, 0);
		return false;
	}
	if (ncp->nc_vp == NULL) {
		/* found negative entry; vn is already null from above */
		COUNT(ncs_neghits);
		SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
		return true;
	}

	COUNT(ncs_goodhits); /* XXX can be "badhits" */
	SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);

	/*
	 * Return with the directory lock still held.  It will either be
	 * returned to us with another call to cache_lookup_linked() when
	 * looking up the next component, or the caller will release it
	 * manually when finished.
	 */
	*vn_ret = ncp->nc_vp;
	return true;
}
723 730
/*
 * Scan cache looking for name of directory entry pointing at vp.
 * Will not search for "." or "..".
 *
 * If the lookup succeeds the vnode is referenced and stored in dvpp.
 *
 * If bufp is non-NULL, also place the name in the buffer which starts
 * at bufp, immediately before *bpp, and move bpp backwards to point
 * at the start of it.  (Yes, this is a little baroque, but it's done
 * this way to cater to the whims of getcwd).
 *
 * If checkaccess is true, the caller's credentials are checked for
 * "perms" access on vp first (see the in-body comment for the caveat).
 *
 * Returns 0 on success, -1 on cache miss, positive errno on failure.
 */
int
cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp,
    bool checkaccess, int perms)
{
	vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
	struct namecache *ncp;
	struct vnode *dvp;
	int error, nlen, lrulist;
	char *bp;

	KASSERT(vp != NULL);

	/* Cache effectively disabled: report a miss. */
	if (cache_maxlen == 0)
		goto out;

	rw_enter(&vi->vi_nc_listlock, RW_READER);
	if (checkaccess) {
		/*
		 * Check if the user is allowed to see.  NOTE: this is
		 * checking for access on the "wrong" directory.  getcwd()
		 * wants to see that there is access on every component
		 * along the way, not that there is access to any individual
		 * component.  Don't use this to check you can look in vp.
		 *
		 * I don't like it, I didn't come up with it, don't blame me!
		 */
		KASSERT(vi->vi_nc_mode != VNOVAL && vi->vi_nc_uid != VNOVAL &&
		    vi->vi_nc_gid != VNOVAL);
		error = kauth_authorize_vnode(curlwp->l_cred,
		    KAUTH_ACCESS_ACTION(VEXEC, vp->v_type, vi->vi_nc_mode &
		    ALLPERMS), vp, NULL, genfs_can_access(vp->v_type,
		    vi->vi_nc_mode & ALLPERMS, vi->vi_nc_uid, vi->vi_nc_gid,
		    perms, curlwp->l_cred));
		if (error != 0) {
			rw_exit(&vi->vi_nc_listlock);
			COUNT(ncs_denied);
			return EACCES;
		}
	}
	TAILQ_FOREACH(ncp, &vi->vi_nc_list, nc_list) {
		KASSERT(ncp->nc_vp == vp);
		KASSERT(ncp->nc_dvp != NULL);
		nlen = ncp->nc_nlen;

		/*
		 * The queue is partially sorted.  Once we hit dots, nothing
		 * else remains but dots and dotdots, so bail out.
		 */
		if (ncp->nc_name[0] == '.') {
			if (nlen == 1 ||
			    (nlen == 2 && ncp->nc_name[1] == '.')) {
				break;
			}
		}

		/*
		 * Record a hit on the entry.  This is an unlocked read but
		 * even if wrong it doesn't matter too much.
		 */
		lrulist = atomic_load_relaxed(&ncp->nc_lrulist);
		if (lrulist != LRU_ACTIVE) {
			cache_activate(ncp);
		}

		if (bufp) {
			/* Copy the name in, working backwards from *bpp. */
			bp = *bpp;
			bp -= nlen;
			if (bp <= bufp) {
				/* Ran out of room in the caller's buffer. */
				*dvpp = NULL;
				rw_exit(&vi->vi_nc_listlock);
				SDT_PROBE(vfs, namecache, revlookup,
				    fail, vp, ERANGE, 0, 0, 0);
				return (ERANGE);
			}
			memcpy(bp, ncp->nc_name, nlen);
			*bpp = bp;
		}

		/*
		 * Take the parent's interlock before dropping the list
		 * lock, then try to get a reference on the parent.
		 */
		dvp = ncp->nc_dvp;
		mutex_enter(dvp->v_interlock);
		rw_exit(&vi->vi_nc_listlock);
		error = vcache_tryvget(dvp);
		if (error) {
			KASSERT(error == EBUSY);
			/* Undo the buffer adjustment made above. */
			if (bufp)
				(*bpp) += nlen;
			*dvpp = NULL;
			SDT_PROBE(vfs, namecache, revlookup, fail, vp,
			    error, 0, 0, 0);
			return -1;
		}
		*dvpp = dvp;
		SDT_PROBE(vfs, namecache, revlookup, success, vp, dvp,
		    0, 0, 0);
		COUNT(ncs_revhits);
		return (0);
	}
	rw_exit(&vi->vi_nc_listlock);
	COUNT(ncs_revmiss);
 out:
	*dvpp = NULL;
	return (-1);
}
840 847
/*
 * Add an entry to the cache.
 *
 * => dvp: directory containing the name; vp: vnode the name resolves
 *    to, or NULL to enter a negative entry.
 * => name/namelen: the component name; cnflags: namei flags word
 *    (MAKEENTRY must be set for an entry to be made; ISWHITEOUT is
 *    recorded for negative entries).
 */
void
cache_enter(struct vnode *dvp, struct vnode *vp,
	    const char *name, size_t namelen, uint32_t cnflags)
{
	vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
	struct namecache *ncp, *oncp;
	int total;

	/* First, check whether we can/should add a cache entry. */
	if ((cnflags & MAKEENTRY) == 0 ||
	    __predict_false(namelen > cache_maxlen)) {
		SDT_PROBE(vfs, namecache, enter, toolong, vp, name, namelen,
		    0, 0);
		return;
	}

	SDT_PROBE(vfs, namecache, enter, done, vp, name, namelen, 0, 0);

	/*
	 * Reclaim some entries if over budget.  This is an unlocked check,
	 * but it doesn't matter.  Just need to catch up with things
	 * eventually: it doesn't matter if we go over temporarily.
	 */
	total = atomic_load_relaxed(&cache_lru.count[LRU_ACTIVE]);
	total += atomic_load_relaxed(&cache_lru.count[LRU_INACTIVE]);
	if (__predict_false(total > desiredvnodes)) {
		cache_reclaim();
	}

	/*
	 * Now allocate a fresh entry.  Short names come from the pool
	 * cache (fixed NCHNAMLEN-sized nc_name); longer ones get a
	 * kmem allocation sized to fit.
	 */
	if (__predict_true(namelen <= NCHNAMLEN)) {
		ncp = pool_cache_get(cache_pool, PR_WAITOK);
	} else {
		size_t sz = offsetof(struct namecache, nc_name[namelen]);
		ncp = kmem_alloc(sz, KM_SLEEP);
	}

	/*
	 * Fill in cache info.  For negative hits, save the ISWHITEOUT flag
	 * so we can restore it later when the cache entry is used again.
	 */
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	ncp->nc_key = cache_key(name, namelen);
	ncp->nc_nlen = namelen;
	ncp->nc_whiteout = ((cnflags & ISWHITEOUT) != 0);
	memcpy(ncp->nc_name, name, namelen);

	/*
	 * Insert to the directory.  Concurrent lookups may race for a cache
	 * entry.  If there's a entry there already, purge it.
	 */
	rw_enter(&dvi->vi_nc_lock, RW_WRITER);
	oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
	if (oncp != ncp) {
		/* Lost the race: evict the old entry and retry. */
		KASSERT(oncp->nc_key == ncp->nc_key);
		KASSERT(oncp->nc_nlen == ncp->nc_nlen);
		KASSERT(memcmp(oncp->nc_name, name, namelen) == 0);
		cache_remove(oncp, true);
		oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
		KASSERT(oncp == ncp);
	}

	/*
	 * With the directory lock still held, insert to the tail of the
	 * ACTIVE LRU list (new) and take the opportunity to incrementally
	 * balance the lists.
	 */
	mutex_enter(&cache_lru_lock);
	ncp->nc_lrulist = LRU_ACTIVE;
	cache_lru.count[LRU_ACTIVE]++;
	TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
	cache_deactivate();
	mutex_exit(&cache_lru_lock);

	/*
	 * Finally, insert to the vnode and unlock.  With everything set up
	 * it's safe to let cache_revlookup() see the entry.  Partially sort
	 * the per-vnode list: dots go to back so cache_revlookup() doesn't
	 * have to consider them.
	 */
	if (vp != NULL) {
		vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
		rw_enter(&vi->vi_nc_listlock, RW_WRITER);
		if ((namelen == 1 && name[0] == '.') ||
		    (namelen == 2 && name[0] == '.' && name[1] == '.')) {
			TAILQ_INSERT_TAIL(&vi->vi_nc_list, ncp, nc_list);
		} else {
			TAILQ_INSERT_HEAD(&vi->vi_nc_list, ncp, nc_list);
		}
		rw_exit(&vi->vi_nc_listlock);
	}
	rw_exit(&dvi->vi_nc_lock);
}
938 945
939/* 946/*
940 * Set identity info in cache for a vnode. We only care about directories 947 * Set identity info in cache for a vnode. We only care about directories
941 * so ignore other updates. 948 * so ignore other updates.
942 */ 949 */
943void 950void
944cache_enter_id(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid) 951cache_enter_id(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid)
945{ 952{
946 vnode_impl_t *vi = VNODE_TO_VIMPL(vp); 953 vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
947 954
948 if (vp->v_type == VDIR) { 955 if (vp->v_type == VDIR) {
949 /* Grab both locks, for forward & reverse lookup. */ 956 /* Grab both locks, for forward & reverse lookup. */
950 rw_enter(&vi->vi_nc_lock, RW_WRITER); 957 rw_enter(&vi->vi_nc_lock, RW_WRITER);
951 rw_enter(&vi->vi_nc_listlock, RW_WRITER); 958 rw_enter(&vi->vi_nc_listlock, RW_WRITER);
952 vi->vi_nc_mode = mode; 959 vi->vi_nc_mode = mode;
953 vi->vi_nc_uid = uid; 960 vi->vi_nc_uid = uid;
954 vi->vi_nc_gid = gid; 961 vi->vi_nc_gid = gid;
955 rw_exit(&vi->vi_nc_listlock); 962 rw_exit(&vi->vi_nc_listlock);
956 rw_exit(&vi->vi_nc_lock); 963 rw_exit(&vi->vi_nc_lock);
957 } 964 }
958} 965}
959 966
960/* 967/*
961 * Return true if we have identity for the given vnode, and use as an 968 * Return true if we have identity for the given vnode, and use as an
962 * opportunity to confirm that everything squares up. 969 * opportunity to confirm that everything squares up.
963 * 970 *
964 * Because of shared code, some file systems could provide partial 971 * Because of shared code, some file systems could provide partial
965 * information, missing some updates, so always check the mount flag 972 * information, missing some updates, so always check the mount flag
966 * instead of looking for !VNOVAL. 973 * instead of looking for !VNOVAL.
967 */ 974 */
968bool 975bool
969cache_have_id(struct vnode *vp) 976cache_have_id(struct vnode *vp)
970{ 977{
971 978
972 if (vp->v_type == VDIR && 979 if (vp->v_type == VDIR &&
973 (vp->v_mount->mnt_iflag & IMNT_NCLOOKUP) != 0) { 980 (vp->v_mount->mnt_iflag & IMNT_NCLOOKUP) != 0) {
974 KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_mode != VNOVAL); 981 KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_mode != VNOVAL);
975 KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_uid != VNOVAL); 982 KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_uid != VNOVAL);
976 KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_gid != VNOVAL); 983 KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_gid != VNOVAL);
977 return true; 984 return true;
978 } else { 985 } else {
979 return false; 986 return false;
980 } 987 }
981} 988}
982 989
983/* 990/*
984 * Name cache initialization, from vfs_init() when the system is booting. 991 * Name cache initialization, from vfs_init() when the system is booting.
985 */ 992 */
986void 993void
987nchinit(void) 994nchinit(void)
988{ 995{
989 996
990 cache_pool = pool_cache_init(sizeof(struct namecache), 997 cache_pool = pool_cache_init(sizeof(struct namecache),
991 coherency_unit, 0, 0, "namecache", NULL, IPL_NONE, NULL, 998 coherency_unit, 0, 0, "namecache", NULL, IPL_NONE, NULL,
992 NULL, NULL); 999 NULL, NULL);
993 KASSERT(cache_pool != NULL); 1000 KASSERT(cache_pool != NULL);
994 1001
995 mutex_init(&cache_lru_lock, MUTEX_DEFAULT, IPL_NONE); 1002 mutex_init(&cache_lru_lock, MUTEX_DEFAULT, IPL_NONE);
996 TAILQ_INIT(&cache_lru.list[LRU_ACTIVE]); 1003 TAILQ_INIT(&cache_lru.list[LRU_ACTIVE]);
997 TAILQ_INIT(&cache_lru.list[LRU_INACTIVE]); 1004 TAILQ_INIT(&cache_lru.list[LRU_INACTIVE]);
998 1005
999 mutex_init(&cache_stat_lock, MUTEX_DEFAULT, IPL_NONE); 1006 mutex_init(&cache_stat_lock, MUTEX_DEFAULT, IPL_NONE);
1000 callout_init(&cache_stat_callout, CALLOUT_MPSAFE); 1007 callout_init(&cache_stat_callout, CALLOUT_MPSAFE);
1001 callout_setfunc(&cache_stat_callout, cache_update_stats, NULL); 1008 callout_setfunc(&cache_stat_callout, cache_update_stats, NULL);
1002 callout_schedule(&cache_stat_callout, cache_stat_interval * hz); 1009 callout_schedule(&cache_stat_callout, cache_stat_interval * hz);
1003 1010
1004 KASSERT(cache_sysctllog == NULL); 1011 KASSERT(cache_sysctllog == NULL);
1005 sysctl_createv(&cache_sysctllog, 0, NULL, NULL, 1012 sysctl_createv(&cache_sysctllog, 0, NULL, NULL,
1006 CTLFLAG_PERMANENT, 1013 CTLFLAG_PERMANENT,
1007 CTLTYPE_STRUCT, "namecache_stats", 1014 CTLTYPE_STRUCT, "namecache_stats",
1008 SYSCTL_DESCR("namecache statistics"), 1015 SYSCTL_DESCR("namecache statistics"),
1009 cache_stat_sysctl, 0, NULL, 0, 1016 cache_stat_sysctl, 0, NULL, 0,
1010 CTL_VFS, CTL_CREATE, CTL_EOL); 1017 CTL_VFS, CTL_CREATE, CTL_EOL);
1011} 1018}
1012 1019
1013/* 1020/*
1014 * Called once for each CPU in the system as attached. 1021 * Called once for each CPU in the system as attached.
1015 */ 1022 */
1016void 1023void
1017cache_cpu_init(struct cpu_info *ci) 1024cache_cpu_init(struct cpu_info *ci)
1018{ 1025{
1019 void *p; 1026 void *p;
1020 size_t sz; 1027 size_t sz;
1021 1028
1022 sz = roundup2(sizeof(struct nchstats_percpu), coherency_unit) + 1029 sz = roundup2(sizeof(struct nchstats_percpu), coherency_unit) +
1023 coherency_unit; 1030 coherency_unit;
1024 p = kmem_zalloc(sz, KM_SLEEP); 1031 p = kmem_zalloc(sz, KM_SLEEP);
1025 ci->ci_data.cpu_nch = (void *)roundup2((uintptr_t)p, coherency_unit); 1032 ci->ci_data.cpu_nch = (void *)roundup2((uintptr_t)p, coherency_unit);
1026} 1033}
1027 1034
1028/* 1035/*
1029 * A vnode is being allocated: set up cache structures. 1036 * A vnode is being allocated: set up cache structures.
1030 */ 1037 */
1031void 1038void
1032cache_vnode_init(struct vnode *vp) 1039cache_vnode_init(struct vnode *vp)
1033{ 1040{
1034 vnode_impl_t *vi = VNODE_TO_VIMPL(vp); 1041 vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
1035 1042
1036 rw_init(&vi->vi_nc_lock); 1043 rw_init(&vi->vi_nc_lock);
1037 rw_init(&vi->vi_nc_listlock); 1044 rw_init(&vi->vi_nc_listlock);
1038 rb_tree_init(&vi->vi_nc_tree, &cache_rbtree_ops); 1045 rb_tree_init(&vi->vi_nc_tree, &cache_rbtree_ops);
1039 TAILQ_INIT(&vi->vi_nc_list); 1046 TAILQ_INIT(&vi->vi_nc_list);
1040 vi->vi_nc_mode = VNOVAL; 1047 vi->vi_nc_mode = VNOVAL;
1041 vi->vi_nc_uid = VNOVAL; 1048 vi->vi_nc_uid = VNOVAL;
1042 vi->vi_nc_gid = VNOVAL; 1049 vi->vi_nc_gid = VNOVAL;
1043} 1050}
1044 1051
1045/* 1052/*
1046 * A vnode is being freed: finish cache structures. 1053 * A vnode is being freed: finish cache structures.
1047 */ 1054 */
1048void 1055void
1049cache_vnode_fini(struct vnode *vp) 1056cache_vnode_fini(struct vnode *vp)
1050{ 1057{
1051 vnode_impl_t *vi = VNODE_TO_VIMPL(vp); 1058 vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
1052 1059
1053 KASSERT(RB_TREE_MIN(&vi->vi_nc_tree) == NULL); 1060 KASSERT(RB_TREE_MIN(&vi->vi_nc_tree) == NULL);
1054 KASSERT(TAILQ_EMPTY(&vi->vi_nc_list)); 1061 KASSERT(TAILQ_EMPTY(&vi->vi_nc_list));
1055 rw_destroy(&vi->vi_nc_lock); 1062 rw_destroy(&vi->vi_nc_lock);
1056 rw_destroy(&vi->vi_nc_listlock); 1063 rw_destroy(&vi->vi_nc_listlock);
1057} 1064}
1058 1065
1059/* 1066/*
1060 * Helper for cache_purge1(): purge cache entries for the given vnode from 1067 * Helper for cache_purge1(): purge cache entries for the given vnode from
1061 * all directories that the vnode is cached in. 1068 * all directories that the vnode is cached in.
1062 */ 1069 */
1063static void 1070static void
1064cache_purge_parents(struct vnode *vp) 1071cache_purge_parents(struct vnode *vp)
1065{ 1072{
1066 vnode_impl_t *dvi, *vi = VNODE_TO_VIMPL(vp); 1073 vnode_impl_t *dvi, *vi = VNODE_TO_VIMPL(vp);
1067 struct vnode *dvp, *blocked; 1074 struct vnode *dvp, *blocked;
1068 struct namecache *ncp; 1075 struct namecache *ncp;
1069 1076
1070 SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0); 1077 SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0);
1071 1078
1072 blocked = NULL; 1079 blocked = NULL;
1073 1080
1074 rw_enter(&vi->vi_nc_listlock, RW_WRITER); 1081 rw_enter(&vi->vi_nc_listlock, RW_WRITER);
1075 while ((ncp = TAILQ_FIRST(&vi->vi_nc_list)) != NULL) { 1082 while ((ncp = TAILQ_FIRST(&vi->vi_nc_list)) != NULL) {
1076 /* 1083 /*
1077 * Locking in the wrong direction. Try for a hold on the 1084 * Locking in the wrong direction. Try for a hold on the
1078 * directory node's lock, and if we get it then all good, 1085 * directory node's lock, and if we get it then all good,
1079 * nuke the entry and move on to the next. 1086 * nuke the entry and move on to the next.
1080 */ 1087 */
1081 dvp = ncp->nc_dvp; 1088 dvp = ncp->nc_dvp;
1082 dvi = VNODE_TO_VIMPL(dvp); 1089 dvi = VNODE_TO_VIMPL(dvp);
1083 if (rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) { 1090 if (rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) {
1084 cache_remove(ncp, false); 1091 cache_remove(ncp, false);
1085 rw_exit(&dvi->vi_nc_lock); 1092 rw_exit(&dvi->vi_nc_lock);
1086 blocked = NULL; 1093 blocked = NULL;
1087 continue; 1094 continue;
1088 } 1095 }
1089 1096
1090 /* 1097 /*
1091 * We can't wait on the directory node's lock with our list 1098 * We can't wait on the directory node's lock with our list
1092 * lock held or the system could deadlock. 1099 * lock held or the system could deadlock.
1093 * 1100 *
1094 * Take a hold on the directory vnode to prevent it from 1101 * Take a hold on the directory vnode to prevent it from
1095 * being freed (taking the vnode & lock with it). Then 1102 * being freed (taking the vnode & lock with it). Then
1096 * wait for the lock to become available with no other locks 1103 * wait for the lock to become available with no other locks
1097 * held, and retry. 1104 * held, and retry.
1098 * 1105 *
1099 * If this happens twice in a row, give the other side a 1106 * If this happens twice in a row, give the other side a
1100 * breather; we can do nothing until it lets go. 1107 * breather; we can do nothing until it lets go.
1101 */ 1108 */
1102 vhold(dvp); 1109 vhold(dvp);
1103 rw_exit(&vi->vi_nc_listlock); 1110 rw_exit(&vi->vi_nc_listlock);
1104 rw_enter(&dvi->vi_nc_lock, RW_WRITER); 1111 rw_enter(&dvi->vi_nc_lock, RW_WRITER);
1105 /* Do nothing. */ 1112 /* Do nothing. */
1106 rw_exit(&dvi->vi_nc_lock); 1113 rw_exit(&dvi->vi_nc_lock);
1107 holdrele(dvp); 1114 holdrele(dvp);
1108 if (blocked == dvp) { 1115 if (blocked == dvp) {
1109 kpause("ncpurge", false, 1, NULL); 1116 kpause("ncpurge", false, 1, NULL);
1110 } 1117 }
1111 rw_enter(&vi->vi_nc_listlock, RW_WRITER); 1118 rw_enter(&vi->vi_nc_listlock, RW_WRITER);
1112 blocked = dvp; 1119 blocked = dvp;
1113 } 1120 }
1114 rw_exit(&vi->vi_nc_listlock); 1121 rw_exit(&vi->vi_nc_listlock);
1115} 1122}
1116 1123
1117/* 1124/*
1118 * Helper for cache_purge1(): purge all cache entries hanging off the given 1125 * Helper for cache_purge1(): purge all cache entries hanging off the given
1119 * directory vnode. 1126 * directory vnode.
1120 */ 1127 */
1121static void 1128static void
1122cache_purge_children(struct vnode *dvp) 1129cache_purge_children(struct vnode *dvp)
1123{ 1130{
1124 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); 1131 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
1125 struct namecache *ncp; 1132 struct namecache *ncp;
1126 1133
1127 SDT_PROBE(vfs, namecache, purge, children, dvp, 0, 0, 0, 0); 1134 SDT_PROBE(vfs, namecache, purge, children, dvp, 0, 0, 0, 0);
1128 1135
1129 rw_enter(&dvi->vi_nc_lock, RW_WRITER); 1136 rw_enter(&dvi->vi_nc_lock, RW_WRITER);
1130 while ((ncp = RB_TREE_MIN(&dvi->vi_nc_tree)) != NULL) { 1137 while ((ncp = RB_TREE_MIN(&dvi->vi_nc_tree)) != NULL) {
1131 cache_remove(ncp, true); 1138 cache_remove(ncp, true);
1132 } 1139 }
1133 rw_exit(&dvi->vi_nc_lock); 1140 rw_exit(&dvi->vi_nc_lock);
1134} 1141}
1135 1142
1136/* 1143/*
1137 * Helper for cache_purge1(): purge cache entry from the given vnode, 1144 * Helper for cache_purge1(): purge cache entry from the given vnode,
1138 * finding it by name. 1145 * finding it by name.
1139 */ 1146 */
1140static void 1147static void
1141cache_purge_name(struct vnode *dvp, const char *name, size_t namelen) 1148cache_purge_name(struct vnode *dvp, const char *name, size_t namelen)
1142{ 1149{
1143 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); 1150 vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
1144 struct namecache *ncp; 1151 struct namecache *ncp;
1145 uint64_t key; 1152 uint64_t key;
1146 1153
1147 SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0); 1154 SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0);
1148 1155
1149 key = cache_key(name, namelen); 1156 key = cache_key(name, namelen);
1150 rw_enter(&dvi->vi_nc_lock, RW_WRITER); 1157 rw_enter(&dvi->vi_nc_lock, RW_WRITER);
1151 ncp = cache_lookup_entry(dvp, name, namelen, key); 1158 ncp = cache_lookup_entry(dvp, name, namelen, key);
1152 if (ncp) { 1159 if (ncp) {
1153 cache_remove(ncp, true); 1160 cache_remove(ncp, true);
1154 } 1161 }
1155 rw_exit(&dvi->vi_nc_lock); 1162 rw_exit(&dvi->vi_nc_lock);
1156} 1163}
1157 1164
1158/* 1165/*
1159 * Cache flush, a particular vnode; called when a vnode is renamed to 1166 * Cache flush, a particular vnode; called when a vnode is renamed to
1160 * hide entries that would now be invalid. 1167 * hide entries that would now be invalid.
1161 */ 1168 */
1162void 1169void
1163cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags) 1170cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags)
1164{ 1171{
1165 1172
1166 if (flags & PURGE_PARENTS) { 1173 if (flags & PURGE_PARENTS) {
1167 cache_purge_parents(vp); 1174 cache_purge_parents(vp);
1168 } 1175 }
1169 if (flags & PURGE_CHILDREN) { 1176 if (flags & PURGE_CHILDREN) {
1170 cache_purge_children(vp); 1177 cache_purge_children(vp);
1171 } 1178 }
1172 if (name != NULL) { 1179 if (name != NULL) {
1173 cache_purge_name(vp, name, namelen); 1180 cache_purge_name(vp, name, namelen);
1174 } 1181 }
1175} 1182}
1176 1183
1177/* 1184/*
1178 * vnode filter for cache_purgevfs(). 1185 * vnode filter for cache_purgevfs().
1179 */ 1186 */
1180static bool 1187static bool
1181cache_vdir_filter(void *cookie, vnode_t *vp) 1188cache_vdir_filter(void *cookie, vnode_t *vp)
1182{ 1189{
1183 1190
1184 return vp->v_type == VDIR; 1191 return vp->v_type == VDIR;
1185} 1192}
1186 1193
1187/* 1194/*
1188 * Cache flush, a whole filesystem; called when filesys is umounted to 1195 * Cache flush, a whole filesystem; called when filesys is umounted to
1189 * remove entries that would now be invalid. 1196 * remove entries that would now be invalid.
1190 */ 1197 */
1191void 1198void
1192cache_purgevfs(struct mount *mp) 1199cache_purgevfs(struct mount *mp)
1193{ 1200{
1194 struct vnode_iterator *iter; 1201 struct vnode_iterator *iter;
1195 vnode_t *dvp; 1202 vnode_t *dvp;
1196 1203
1197 vfs_vnode_iterator_init(mp, &iter); 1204 vfs_vnode_iterator_init(mp, &iter);
1198 for (;;) { 1205 for (;;) {
1199 dvp = vfs_vnode_iterator_next(iter, cache_vdir_filter, NULL); 1206 dvp = vfs_vnode_iterator_next(iter, cache_vdir_filter, NULL);
1200 if (dvp == NULL) { 1207 if (dvp == NULL) {
1201 break; 1208 break;
1202 } 1209 }
1203 cache_purge_children(dvp); 1210 cache_purge_children(dvp);
1204 vrele(dvp); 1211 vrele(dvp);
1205 } 1212 }
1206 vfs_vnode_iterator_destroy(iter); 1213 vfs_vnode_iterator_destroy(iter);
1207} 1214}
1208 1215
1209/* 1216/*
1210 * Re-queue an entry onto the tail of the active LRU list, after it has 1217 * Re-queue an entry onto the tail of the active LRU list, after it has
1211 * scored a hit. 1218 * scored a hit.
1212 */ 1219 */
1213static void 1220static void
1214cache_activate(struct namecache *ncp) 1221cache_activate(struct namecache *ncp)
1215{ 1222{
1216 1223
1217 mutex_enter(&cache_lru_lock); 1224 mutex_enter(&cache_lru_lock);
1218 TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru); 1225 TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
1219 TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); 1226 TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
1220 cache_lru.count[ncp->nc_lrulist]--; 1227 cache_lru.count[ncp->nc_lrulist]--;
1221 cache_lru.count[LRU_ACTIVE]++; 1228 cache_lru.count[LRU_ACTIVE]++;
1222 ncp->nc_lrulist = LRU_ACTIVE; 1229 ncp->nc_lrulist = LRU_ACTIVE;
1223 mutex_exit(&cache_lru_lock); 1230 mutex_exit(&cache_lru_lock);
1224} 1231}
1225 1232
1226/* 1233/*
1227 * Try to balance the LRU lists. Pick some victim entries, and re-queue 1234 * Try to balance the LRU lists. Pick some victim entries, and re-queue
1228 * them from the head of the active list to the tail of the inactive list.  1235 * them from the head of the active list to the tail of the inactive list.
1229 */ 1236 */
1230static void 1237static void
1231cache_deactivate(void) 1238cache_deactivate(void)
1232{ 1239{
1233 struct namecache *ncp; 1240 struct namecache *ncp;
1234 int total, i; 1241 int total, i;
1235 1242
1236 KASSERT(mutex_owned(&cache_lru_lock)); 1243 KASSERT(mutex_owned(&cache_lru_lock));
1237 1244
1238 /* If we're nowhere near budget yet, don't bother. */ 1245 /* If we're nowhere near budget yet, don't bother. */
1239 total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE]; 1246 total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE];
1240 if (total < (desiredvnodes >> 1)) { 1247 if (total < (desiredvnodes >> 1)) {
1241 return; 1248 return;
1242 } 1249 }
1243 1250
1244 /* 1251 /*
1245 * Aim for a 1:1 ratio of active to inactive. This is to allow each 1252 * Aim for a 1:1 ratio of active to inactive. This is to allow each
1246 * potential victim a reasonable amount of time to cycle through the 1253 * potential victim a reasonable amount of time to cycle through the
1247 * inactive list in order to score a hit and be reactivated, while 1254 * inactive list in order to score a hit and be reactivated, while
1248 * trying not to cause reactivations too frequently. 1255 * trying not to cause reactivations too frequently.
1249 */ 1256 */
1250 if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) { 1257 if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) {
1251 return; 1258 return;
1252 } 1259 }
1253 1260
1254 /* Move only a few at a time; will catch up eventually. */ 1261 /* Move only a few at a time; will catch up eventually. */
1255 for (i = 0; i < cache_lru_maxdeact; i++) { 1262 for (i = 0; i < cache_lru_maxdeact; i++) {
1256 ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]); 1263 ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]);
1257 if (ncp == NULL) { 1264 if (ncp == NULL) {
1258 break; 1265 break;
1259 } 1266 }
1260 KASSERT(ncp->nc_lrulist == LRU_ACTIVE); 1267 KASSERT(ncp->nc_lrulist == LRU_ACTIVE);
1261 ncp->nc_lrulist = LRU_INACTIVE; 1268 ncp->nc_lrulist = LRU_INACTIVE;
1262 TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); 1269 TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
1263 TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru); 1270 TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru);
1264 cache_lru.count[LRU_ACTIVE]--; 1271 cache_lru.count[LRU_ACTIVE]--;
1265 cache_lru.count[LRU_INACTIVE]++; 1272 cache_lru.count[LRU_INACTIVE]++;
1266 } 1273 }
1267} 1274}
1268 1275
1269/* 1276/*
1270 * Free some entries from the cache, when we have gone over budget. 1277 * Free some entries from the cache, when we have gone over budget.
1271 * 1278 *
1272 * We don't want to cause too much work for any individual caller, and it 1279 * We don't want to cause too much work for any individual caller, and it
1273 * doesn't matter if we temporarily go over budget. This is also "just a 1280 * doesn't matter if we temporarily go over budget. This is also "just a
1274 * cache" so it's not a big deal if we screw up and throw out something we 1281 * cache" so it's not a big deal if we screw up and throw out something we
1275 * shouldn't. So we take a relaxed attitude to this process to reduce its 1282 * shouldn't. So we take a relaxed attitude to this process to reduce its
1276 * impact. 1283 * impact.
1277 */ 1284 */
1278static void 1285static void
1279cache_reclaim(void) 1286cache_reclaim(void)
1280{ 1287{
1281 struct namecache *ncp; 1288 struct namecache *ncp;
1282 vnode_impl_t *dvi; 1289 vnode_impl_t *dvi;
1283 int toscan; 1290 int toscan;
1284 1291
1285 /* 1292 /*
 1286 * Scan up to a preset maximum number of entries, but no more than 1293 * Scan up to a preset maximum number of entries, but no more than
1287 * 0.8% of the total at once (to allow for very small systems). 1294 * 0.8% of the total at once (to allow for very small systems).
1288 * 1295 *
1289 * On bigger systems, do a larger chunk of work to reduce the number 1296 * On bigger systems, do a larger chunk of work to reduce the number
1290 * of times that cache_lru_lock is held for any length of time. 1297 * of times that cache_lru_lock is held for any length of time.
1291 */ 1298 */
1292 mutex_enter(&cache_lru_lock); 1299 mutex_enter(&cache_lru_lock);
1293 toscan = MIN(cache_lru_maxscan, desiredvnodes >> 7); 1300 toscan = MIN(cache_lru_maxscan, desiredvnodes >> 7);
1294 toscan = MAX(toscan, 1); 1301 toscan = MAX(toscan, 1);
1295 SDT_PROBE(vfs, namecache, prune, done, cache_lru.count[LRU_ACTIVE] + 1302 SDT_PROBE(vfs, namecache, prune, done, cache_lru.count[LRU_ACTIVE] +
1296 cache_lru.count[LRU_INACTIVE], toscan, 0, 0, 0); 1303 cache_lru.count[LRU_INACTIVE], toscan, 0, 0, 0);
1297 while (toscan-- != 0) { 1304 while (toscan-- != 0) {
1298 /* First try to balance the lists. */ 1305 /* First try to balance the lists. */
1299 cache_deactivate(); 1306 cache_deactivate();
1300 1307
1301 /* Now look for a victim on head of inactive list (old). */ 1308 /* Now look for a victim on head of inactive list (old). */
1302 ncp = TAILQ_FIRST(&cache_lru.list[LRU_INACTIVE]); 1309 ncp = TAILQ_FIRST(&cache_lru.list[LRU_INACTIVE]);
1303 if (ncp == NULL) { 1310 if (ncp == NULL) {
1304 break; 1311 break;
1305 } 1312 }
1306 dvi = VNODE_TO_VIMPL(ncp->nc_dvp); 1313 dvi = VNODE_TO_VIMPL(ncp->nc_dvp);
1307 KASSERT(ncp->nc_lrulist == LRU_INACTIVE); 1314 KASSERT(ncp->nc_lrulist == LRU_INACTIVE);
1308 KASSERT(dvi != NULL); 1315 KASSERT(dvi != NULL);
1309 1316
1310 /* 1317 /*
1311 * Locking in the wrong direction. If we can't get the 1318 * Locking in the wrong direction. If we can't get the
1312 * lock, the directory is actively busy, and it could also 1319 * lock, the directory is actively busy, and it could also
1313 * cause problems for the next guy in here, so send the 1320 * cause problems for the next guy in here, so send the
1314 * entry to the back of the list. 1321 * entry to the back of the list.
1315 */ 1322 */
1316 if (!rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) { 1323 if (!rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) {
1317 TAILQ_REMOVE(&cache_lru.list[LRU_INACTIVE], 1324 TAILQ_REMOVE(&cache_lru.list[LRU_INACTIVE],
1318 ncp, nc_lru); 1325 ncp, nc_lru);
1319 TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], 1326 TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE],
1320 ncp, nc_lru); 1327 ncp, nc_lru);
1321 continue; 1328 continue;
1322 } 1329 }
1323 1330
1324 /* 1331 /*
1325 * Now have the victim entry locked. Drop the LRU list 1332 * Now have the victim entry locked. Drop the LRU list
1326 * lock, purge the entry, and start over. The hold on 1333 * lock, purge the entry, and start over. The hold on
1327 * vi_nc_lock will prevent the vnode from vanishing until 1334 * vi_nc_lock will prevent the vnode from vanishing until
1328 * finished (cache_purge() will be called on dvp before it 1335 * finished (cache_purge() will be called on dvp before it
1329 * disappears, and that will wait on vi_nc_lock). 1336 * disappears, and that will wait on vi_nc_lock).
1330 */ 1337 */
1331 mutex_exit(&cache_lru_lock); 1338 mutex_exit(&cache_lru_lock);
1332 cache_remove(ncp, true); 1339 cache_remove(ncp, true);
1333 rw_exit(&dvi->vi_nc_lock); 1340 rw_exit(&dvi->vi_nc_lock);
1334 mutex_enter(&cache_lru_lock); 1341 mutex_enter(&cache_lru_lock);
1335 } 1342 }
1336 mutex_exit(&cache_lru_lock); 1343 mutex_exit(&cache_lru_lock);
1337} 1344}
1338 1345
1339/* 1346/*
1340 * For file system code: count a lookup that required a full re-scan of 1347 * For file system code: count a lookup that required a full re-scan of
1341 * directory metadata. 1348 * directory metadata.
1342 */ 1349 */
1343void 1350void
1344namecache_count_pass2(void) 1351namecache_count_pass2(void)
1345{ 1352{
1346 1353
1347 COUNT(ncs_pass2); 1354 COUNT(ncs_pass2);
1348} 1355}
1349 1356
1350/* 1357/*
1351 * For file system code: count a lookup that scored a hit in the directory 1358 * For file system code: count a lookup that scored a hit in the directory
1352 * metadata near the location of the last lookup. 1359 * metadata near the location of the last lookup.
1353 */ 1360 */
1354void 1361void
1355namecache_count_2passes(void) 1362namecache_count_2passes(void)
1356{ 1363{
1357 1364
1358 COUNT(ncs_2passes); 1365 COUNT(ncs_2passes);
1359} 1366}
1360 1367
1361/* 1368/*
1362 * Sum the stats from all CPUs into nchstats. This needs to run at least 1369 * Sum the stats from all CPUs into nchstats. This needs to run at least
1363 * once within every window where a 32-bit counter could roll over. It's 1370 * once within every window where a 32-bit counter could roll over. It's
1364 * called regularly by timer to ensure this. 1371 * called regularly by timer to ensure this.
1365 */ 1372 */
1366static void 1373static void
1367cache_update_stats(void *cookie) 1374cache_update_stats(void *cookie)
1368{ 1375{
1369 CPU_INFO_ITERATOR cii; 1376 CPU_INFO_ITERATOR cii;
1370 struct cpu_info *ci; 1377 struct cpu_info *ci;
1371 1378
1372 mutex_enter(&cache_stat_lock); 1379 mutex_enter(&cache_stat_lock);
1373 for (CPU_INFO_FOREACH(cii, ci)) { 1380 for (CPU_INFO_FOREACH(cii, ci)) {
1374 struct nchcpu *nchcpu = ci->ci_data.cpu_nch; 1381 struct nchcpu *nchcpu = ci->ci_data.cpu_nch;
1375 UPDATE(nchcpu, ncs_goodhits); 1382 UPDATE(nchcpu, ncs_goodhits);
1376 UPDATE(nchcpu, ncs_neghits); 1383 UPDATE(nchcpu, ncs_neghits);
1377 UPDATE(nchcpu, ncs_badhits); 1384 UPDATE(nchcpu, ncs_badhits);
1378 UPDATE(nchcpu, ncs_falsehits); 1385 UPDATE(nchcpu, ncs_falsehits);
1379 UPDATE(nchcpu, ncs_miss); 1386 UPDATE(nchcpu, ncs_miss);
1380 UPDATE(nchcpu, ncs_long); 1387 UPDATE(nchcpu, ncs_long);
1381 UPDATE(nchcpu, ncs_pass2); 1388 UPDATE(nchcpu, ncs_pass2);
1382 UPDATE(nchcpu, ncs_2passes); 1389 UPDATE(nchcpu, ncs_2passes);
1383 UPDATE(nchcpu, ncs_revhits); 1390 UPDATE(nchcpu, ncs_revhits);
1384 UPDATE(nchcpu, ncs_revmiss); 1391 UPDATE(nchcpu, ncs_revmiss);
1385 UPDATE(nchcpu, ncs_denied); 1392 UPDATE(nchcpu, ncs_denied);
1386 } 1393 }
1387 if (cookie != NULL) { 1394 if (cookie != NULL) {
1388 memcpy(cookie, &nchstats, sizeof(nchstats)); 1395 memcpy(cookie, &nchstats, sizeof(nchstats));
1389 } 1396 }
1390 /* Reset the timer; arrive back here in N minutes at latest. */ 1397 /* Reset the timer; arrive back here in N minutes at latest. */
1391 callout_schedule(&cache_stat_callout, cache_stat_interval * hz); 1398 callout_schedule(&cache_stat_callout, cache_stat_interval * hz);
1392 mutex_exit(&cache_stat_lock); 1399 mutex_exit(&cache_stat_lock);
1393} 1400}
1394 1401
1395/* 1402/*
1396 * Fetch the current values of the stats for sysctl. 1403 * Fetch the current values of the stats for sysctl.
1397 */ 1404 */
1398static int 1405static int
1399cache_stat_sysctl(SYSCTLFN_ARGS) 1406cache_stat_sysctl(SYSCTLFN_ARGS)
1400{ 1407{
1401 struct nchstats stats; 1408 struct nchstats stats;
1402 1409
1403 if (oldp == NULL) { 1410 if (oldp == NULL) {
1404 *oldlenp = sizeof(nchstats); 1411 *oldlenp = sizeof(nchstats);
1405 return 0; 1412 return 0;
1406 } 1413 }
1407 1414
1408 if (*oldlenp <= 0) { 1415 if (*oldlenp <= 0) {
1409 *oldlenp = 0; 1416 *oldlenp = 0;
1410 return 0; 1417 return 0;
1411 } 1418 }
1412 1419
1413 /* Refresh the global stats. */ 1420 /* Refresh the global stats. */
1414 sysctl_unlock(); 1421 sysctl_unlock();
1415 cache_update_stats(&stats); 1422 cache_update_stats(&stats);
1416 sysctl_relock(); 1423 sysctl_relock();
1417 1424
1418 *oldlenp = MIN(sizeof(stats), *oldlenp); 1425 *oldlenp = MIN(sizeof(stats), *oldlenp);
1419 return sysctl_copyout(l, &stats, oldp, *oldlenp); 1426 return sysctl_copyout(l, &stats, oldp, *oldlenp);
1420} 1427}
1421 1428
1422/* 1429/*
1423 * For the debugger, given the address of a vnode, print all associated 1430 * For the debugger, given the address of a vnode, print all associated
1424 * names in the cache. 1431 * names in the cache.
1425 */ 1432 */
1426#ifdef DDB 1433#ifdef DDB
1427void 1434void
1428namecache_print(struct vnode *vp, void (*pr)(const char *, ...)) 1435namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
1429{ 1436{
1430 struct vnode *dvp = NULL; 1437 struct vnode *dvp = NULL;
1431 struct namecache *ncp; 1438 struct namecache *ncp;
1432 enum cache_lru_id id; 1439 enum cache_lru_id id;
1433 1440
1434 for (id = 0; id < LRU_COUNT; id++) { 1441 for (id = 0; id < LRU_COUNT; id++) {
1435 TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) { 1442 TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
1436 if (ncp->nc_vp == vp) { 1443 if (ncp->nc_vp == vp) {
1437 (*pr)("name %.*s\n", ncp->nc_nlen, 1444 (*pr)("name %.*s\n", ncp->nc_nlen,
1438 ncp->nc_name); 1445 ncp->nc_name);
1439 dvp = ncp->nc_dvp; 1446 dvp = ncp->nc_dvp;
1440 } 1447 }
1441 } 1448 }
1442 } 1449 }
1443 if (dvp == NULL) { 1450 if (dvp == NULL) {
1444 (*pr)("name not found\n"); 1451 (*pr)("name not found\n");
1445 return; 1452 return;
1446 } 1453 }
1447 for (id = 0; id < LRU_COUNT; id++) { 1454 for (id = 0; id < LRU_COUNT; id++) {
1448 TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) { 1455 TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
1449 if (ncp->nc_vp == dvp) { 1456 if (ncp->nc_vp == dvp) {
1450 (*pr)("parent %.*s\n", ncp->nc_nlen, 1457 (*pr)("parent %.*s\n", ncp->nc_nlen,
1451 ncp->nc_name); 1458 ncp->nc_name);
1452 } 1459 }
1453 } 1460 }
1454 } 1461 }
1455} 1462}
1456#endif 1463#endif

cvs diff -r1.218 -r1.219 src/sys/kern/vfs_lookup.c (switch to unified diff)

--- src/sys/kern/vfs_lookup.c 2020/04/21 21:42:47 1.218
+++ src/sys/kern/vfs_lookup.c 2020/04/22 21:35:52 1.219
@@ -1,2244 +1,2251 @@ @@ -1,2244 +1,2251 @@
1/* $NetBSD: vfs_lookup.c,v 1.218 2020/04/21 21:42:47 ad Exp $ */ 1/* $NetBSD: vfs_lookup.c,v 1.219 2020/04/22 21:35:52 ad Exp $ */
2 2
3/* 3/*
4 * Copyright (c) 1982, 1986, 1989, 1993 4 * Copyright (c) 1982, 1986, 1989, 1993
5 * The Regents of the University of California. All rights reserved. 5 * The Regents of the University of California. All rights reserved.
6 * (c) UNIX System Laboratories, Inc. 6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed 7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph 8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc. 10 * the permission of UNIX System Laboratories, Inc.
11 * 11 *
12 * Redistribution and use in source and binary forms, with or without 12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions 13 * modification, are permitted provided that the following conditions
14 * are met: 14 * are met:
15 * 1. Redistributions of source code must retain the above copyright 15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer. 16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright 17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the 18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution. 19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors 20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software 21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission. 22 * without specific prior written permission.
23 * 23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE. 34 * SUCH DAMAGE.
35 * 35 *
36 * @(#)vfs_lookup.c 8.10 (Berkeley) 5/27/95 36 * @(#)vfs_lookup.c 8.10 (Berkeley) 5/27/95
37 */ 37 */
38 38
39#include <sys/cdefs.h> 39#include <sys/cdefs.h>
40__KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.218 2020/04/21 21:42:47 ad Exp $"); 40__KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.219 2020/04/22 21:35:52 ad Exp $");
41 41
42#ifdef _KERNEL_OPT 42#ifdef _KERNEL_OPT
43#include "opt_magiclinks.h" 43#include "opt_magiclinks.h"
44#endif 44#endif
45 45
46#include <sys/param.h> 46#include <sys/param.h>
47#include <sys/systm.h> 47#include <sys/systm.h>
48#include <sys/kernel.h> 48#include <sys/kernel.h>
49#include <sys/syslimits.h> 49#include <sys/syslimits.h>
50#include <sys/time.h> 50#include <sys/time.h>
51#include <sys/namei.h> 51#include <sys/namei.h>
52#include <sys/vnode.h> 52#include <sys/vnode.h>
53#include <sys/vnode_impl.h> 53#include <sys/vnode_impl.h>
54#include <sys/mount.h> 54#include <sys/mount.h>
55#include <sys/errno.h> 55#include <sys/errno.h>
56#include <sys/filedesc.h> 56#include <sys/filedesc.h>
57#include <sys/hash.h> 57#include <sys/hash.h>
58#include <sys/proc.h> 58#include <sys/proc.h>
59#include <sys/syslog.h> 59#include <sys/syslog.h>
60#include <sys/kauth.h> 60#include <sys/kauth.h>
61#include <sys/ktrace.h> 61#include <sys/ktrace.h>
62#include <sys/dirent.h> 62#include <sys/dirent.h>
63 63
64#ifndef MAGICLINKS 64#ifndef MAGICLINKS
65#define MAGICLINKS 0 65#define MAGICLINKS 0
66#endif 66#endif
67 67
68int vfs_magiclinks = MAGICLINKS; 68int vfs_magiclinks = MAGICLINKS;
69 69
70__CTASSERT(MAXNAMLEN == NAME_MAX); 70__CTASSERT(MAXNAMLEN == NAME_MAX);
71 71
72/* 72/*
73 * Substitute replacement text for 'magic' strings in symlinks. 73 * Substitute replacement text for 'magic' strings in symlinks.
74 * Returns 0 if successful, and returns non-zero if an error 74 * Returns 0 if successful, and returns non-zero if an error
75 * occurs. (Currently, the only possible error is running out 75 * occurs. (Currently, the only possible error is running out
76 * of temporary pathname space.) 76 * of temporary pathname space.)
77 * 77 *
78 * Looks for "@<string>" and "@<string>/", where <string> is a 78 * Looks for "@<string>" and "@<string>/", where <string> is a
79 * recognized 'magic' string. Replaces the "@<string>" with the 79 * recognized 'magic' string. Replaces the "@<string>" with the
80 * appropriate replacement text. (Note that in some cases the 80 * appropriate replacement text. (Note that in some cases the
81 * replacement text may have zero length.) 81 * replacement text may have zero length.)
82 * 82 *
83 * This would have been table driven, but the variance in 83 * This would have been table driven, but the variance in
84 * replacement strings (and replacement string lengths) made 84 * replacement strings (and replacement string lengths) made
85 * that impractical. 85 * that impractical.
86 */ 86 */
#define VNL(x) \
	(sizeof(x) - 1)

#define VO	'{'
#define VC	'}'

/*
 * True when the magic variable name "str" appears at cp[i], terminated
 * either by the end of the pathname component (termchar == '/') or by
 * the current terminator character (VC for the @{var} form).
 */
#define MATCH(str) \
	((termchar == '/' && i + VNL(str) == *len) || \
	 (i + VNL(str) < *len && \
	 cp[i + VNL(str)] == termchar)) && \
	!strncmp((str), &cp[i], VNL(str))

/*
 * Append the replacement string "s" of length "sl" for the matched
 * magic name "m", failing (return 1) if the result would reach
 * MAXPATHLEN.  Consumes the closing VC for the @{var} form and resets
 * termchar for the next scan.  Deliberately not do {} while (0)
 * wrapped: contains a bare "return".
 */
#define SUBSTITUTE(m, s, sl) \
	if ((newlen + (sl)) >= MAXPATHLEN) \
		return 1; \
	i += VNL(m); \
	if (termchar != '/') \
		i++; \
	(void)memcpy(&tmp[newlen], (s), (sl)); \
	newlen += (sl); \
	change = 1; \
	termchar = '/';

static int
symlink_magic(struct proc *p, char *cp, size_t *len)
{
	char *tmp;
	size_t change, i, newlen, slen;
	char termchar = '/';
	char idtmp[11]; /* enough for 32 bit *unsigned* integer */


	tmp = PNBUF_GET();
	for (change = i = newlen = 0; i < *len; ) {
		if (cp[i] != '@') {
			/* Ordinary character: copy through unchanged. */
			tmp[newlen++] = cp[i++];
			continue;
		}

		i++;

		/* Check for @{var} syntax. */
		if (cp[i] == VO) {
			termchar = VC;
			i++;
		}

		/*
		 * The following checks should be ordered according
		 * to frequency of use.
		 */
		if (MATCH("machine_arch")) {
			slen = VNL(MACHINE_ARCH);
			SUBSTITUTE("machine_arch", MACHINE_ARCH, slen);
		} else if (MATCH("machine")) {
			slen = VNL(MACHINE);
			SUBSTITUTE("machine", MACHINE, slen);
		} else if (MATCH("hostname")) {
			SUBSTITUTE("hostname", hostname, hostnamelen);
		} else if (MATCH("osrelease")) {
			slen = strlen(osrelease);
			SUBSTITUTE("osrelease", osrelease, slen);
		} else if (MATCH("emul")) {
			slen = strlen(p->p_emul->e_name);
			SUBSTITUTE("emul", p->p_emul->e_name, slen);
		} else if (MATCH("kernel_ident")) {
			slen = strlen(kernel_ident);
			SUBSTITUTE("kernel_ident", kernel_ident, slen);
		} else if (MATCH("domainname")) {
			SUBSTITUTE("domainname", domainname, domainnamelen);
		} else if (MATCH("ostype")) {
			slen = strlen(ostype);
			SUBSTITUTE("ostype", ostype, slen);
		} else if (MATCH("uid")) {
			slen = snprintf(idtmp, sizeof(idtmp), "%u",
			    kauth_cred_geteuid(kauth_cred_get()));
			SUBSTITUTE("uid", idtmp, slen);
		} else if (MATCH("ruid")) {
			slen = snprintf(idtmp, sizeof(idtmp), "%u",
			    kauth_cred_getuid(kauth_cred_get()));
			SUBSTITUTE("ruid", idtmp, slen);
		} else if (MATCH("gid")) {
			slen = snprintf(idtmp, sizeof(idtmp), "%u",
			    kauth_cred_getegid(kauth_cred_get()));
			SUBSTITUTE("gid", idtmp, slen);
		} else if (MATCH("rgid")) {
			slen = snprintf(idtmp, sizeof(idtmp), "%u",
			    kauth_cred_getgid(kauth_cred_get()));
			SUBSTITUTE("rgid", idtmp, slen);
		} else {
			/* Not a recognized variable: emit "@" literally. */
			tmp[newlen++] = '@';
			if (termchar == VC)
				tmp[newlen++] = VO;
		}
	}

	/* Rewrite the caller's buffer only if a substitution happened. */
	if (change) {
		(void)memcpy(cp, tmp, newlen);
		*len = newlen;
	}
	PNBUF_PUT(tmp);

	return 0;
}

#undef VNL
#undef VO
#undef VC
#undef MATCH
#undef SUBSTITUTE
197 197
198//////////////////////////////////////////////////////////// 198////////////////////////////////////////////////////////////
199 199
200/* 200/*
201 * Determine the namei hash (for the namecache) for name. 201 * Determine the namei hash (for the namecache) for name.
202 * If *ep != NULL, hash from name to ep-1. 202 * If *ep != NULL, hash from name to ep-1.
203 * If *ep == NULL, hash from name until the first NUL or '/', and 203 * If *ep == NULL, hash from name until the first NUL or '/', and
204 * return the location of this termination character in *ep. 204 * return the location of this termination character in *ep.
205 * 205 *
206 * This function returns an equivalent hash to the MI hash32_strn(). 206 * This function returns an equivalent hash to the MI hash32_strn().
207 * The latter isn't used because in the *ep == NULL case, determining 207 * The latter isn't used because in the *ep == NULL case, determining
208 * the length of the string to the first NUL or `/' and then calling 208 * the length of the string to the first NUL or `/' and then calling
209 * hash32_strn() involves unnecessary double-handling of the data. 209 * hash32_strn() involves unnecessary double-handling of the data.
210 */ 210 */
211uint32_t 211uint32_t
212namei_hash(const char *name, const char **ep) 212namei_hash(const char *name, const char **ep)
213{ 213{
214 uint32_t hash; 214 uint32_t hash;
215 215
216 hash = HASH32_STR_INIT; 216 hash = HASH32_STR_INIT;
217 if (*ep != NULL) { 217 if (*ep != NULL) {
218 for (; name < *ep; name++) 218 for (; name < *ep; name++)
219 hash = hash * 33 + *(const uint8_t *)name; 219 hash = hash * 33 + *(const uint8_t *)name;
220 } else { 220 } else {
221 for (; *name != '\0' && *name != '/'; name++) 221 for (; *name != '\0' && *name != '/'; name++)
222 hash = hash * 33 + *(const uint8_t *)name; 222 hash = hash * 33 + *(const uint8_t *)name;
223 *ep = name; 223 *ep = name;
224 } 224 }
225 return (hash + (hash >> 5)); 225 return (hash + (hash >> 5));
226} 226}
227 227
/*
 * Find the end of the first path component in NAME and return its
 * length.
 */
static size_t
namei_getcomponent(const char *name)
{
	const char *cp;

	/* Scan forward until a separator or the terminating NUL. */
	for (cp = name; *cp != '\0' && *cp != '/'; cp++)
		continue;
	return (size_t)(cp - name);
}
243 243
244//////////////////////////////////////////////////////////// 244////////////////////////////////////////////////////////////
245 245
246/* 246/*
247 * Sealed abstraction for pathnames. 247 * Sealed abstraction for pathnames.
248 * 248 *
249 * System-call-layer level code that is going to call namei should 249 * System-call-layer level code that is going to call namei should
250 * first create a pathbuf and adjust all the bells and whistles on it 250 * first create a pathbuf and adjust all the bells and whistles on it
251 * as needed by context. 251 * as needed by context.
252 */ 252 */
253 253
254struct pathbuf { 254struct pathbuf {
255 char *pb_path; 255 char *pb_path;
256 char *pb_pathcopy; 256 char *pb_pathcopy;
257 unsigned pb_pathcopyuses; 257 unsigned pb_pathcopyuses;
258}; 258};
259 259
260static struct pathbuf * 260static struct pathbuf *
261pathbuf_create_raw(void) 261pathbuf_create_raw(void)
262{ 262{
263 struct pathbuf *pb; 263 struct pathbuf *pb;
264 264
265 pb = kmem_alloc(sizeof(*pb), KM_SLEEP); 265 pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
266 pb->pb_path = PNBUF_GET(); 266 pb->pb_path = PNBUF_GET();
267 if (pb->pb_path == NULL) { 267 if (pb->pb_path == NULL) {
268 kmem_free(pb, sizeof(*pb)); 268 kmem_free(pb, sizeof(*pb));
269 return NULL; 269 return NULL;
270 } 270 }
271 pb->pb_pathcopy = NULL; 271 pb->pb_pathcopy = NULL;
272 pb->pb_pathcopyuses = 0; 272 pb->pb_pathcopyuses = 0;
273 return pb; 273 return pb;
274} 274}
275 275
276void 276void
277pathbuf_destroy(struct pathbuf *pb) 277pathbuf_destroy(struct pathbuf *pb)
278{ 278{
279 KASSERT(pb->pb_pathcopyuses == 0); 279 KASSERT(pb->pb_pathcopyuses == 0);
280 KASSERT(pb->pb_pathcopy == NULL); 280 KASSERT(pb->pb_pathcopy == NULL);
281 PNBUF_PUT(pb->pb_path); 281 PNBUF_PUT(pb->pb_path);
282 kmem_free(pb, sizeof(*pb)); 282 kmem_free(pb, sizeof(*pb));
283} 283}
284 284
285struct pathbuf * 285struct pathbuf *
286pathbuf_assimilate(char *pnbuf) 286pathbuf_assimilate(char *pnbuf)
287{ 287{
288 struct pathbuf *pb; 288 struct pathbuf *pb;
289 289
290 pb = kmem_alloc(sizeof(*pb), KM_SLEEP); 290 pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
291 pb->pb_path = pnbuf; 291 pb->pb_path = pnbuf;
292 pb->pb_pathcopy = NULL; 292 pb->pb_pathcopy = NULL;
293 pb->pb_pathcopyuses = 0; 293 pb->pb_pathcopyuses = 0;
294 return pb; 294 return pb;
295} 295}
296 296
297struct pathbuf * 297struct pathbuf *
298pathbuf_create(const char *path) 298pathbuf_create(const char *path)
299{ 299{
300 struct pathbuf *pb; 300 struct pathbuf *pb;
301 int error; 301 int error;
302 302
303 pb = pathbuf_create_raw(); 303 pb = pathbuf_create_raw();
304 if (pb == NULL) { 304 if (pb == NULL) {
305 return NULL; 305 return NULL;
306 } 306 }
307 error = copystr(path, pb->pb_path, PATH_MAX, NULL); 307 error = copystr(path, pb->pb_path, PATH_MAX, NULL);
308 if (error != 0) { 308 if (error != 0) {
309 KASSERT(!"kernel path too long in pathbuf_create"); 309 KASSERT(!"kernel path too long in pathbuf_create");
310 /* make sure it's null-terminated, just in case */ 310 /* make sure it's null-terminated, just in case */
311 pb->pb_path[PATH_MAX-1] = '\0'; 311 pb->pb_path[PATH_MAX-1] = '\0';
312 } 312 }
313 return pb; 313 return pb;
314} 314}
315 315
316int 316int
317pathbuf_copyin(const char *userpath, struct pathbuf **ret) 317pathbuf_copyin(const char *userpath, struct pathbuf **ret)
318{ 318{
319 struct pathbuf *pb; 319 struct pathbuf *pb;
320 int error; 320 int error;
321 321
322 pb = pathbuf_create_raw(); 322 pb = pathbuf_create_raw();
323 if (pb == NULL) { 323 if (pb == NULL) {
324 return ENOMEM; 324 return ENOMEM;
325 } 325 }
326 error = copyinstr(userpath, pb->pb_path, PATH_MAX, NULL); 326 error = copyinstr(userpath, pb->pb_path, PATH_MAX, NULL);
327 if (error) { 327 if (error) {
328 pathbuf_destroy(pb); 328 pathbuf_destroy(pb);
329 return error; 329 return error;
330 } 330 }
331 *ret = pb; 331 *ret = pb;
332 return 0; 332 return 0;
333} 333}
334 334
335/* 335/*
336 * XXX should not exist: 336 * XXX should not exist:
337 * 1. whether a pointer is kernel or user should be statically checkable. 337 * 1. whether a pointer is kernel or user should be statically checkable.
338 * 2. copyin should be handled by the upper part of the syscall layer, 338 * 2. copyin should be handled by the upper part of the syscall layer,
339 * not in here. 339 * not in here.
340 */ 340 */
341int 341int
342pathbuf_maybe_copyin(const char *path, enum uio_seg seg, struct pathbuf **ret) 342pathbuf_maybe_copyin(const char *path, enum uio_seg seg, struct pathbuf **ret)
343{ 343{
344 if (seg == UIO_USERSPACE) { 344 if (seg == UIO_USERSPACE) {
345 return pathbuf_copyin(path, ret); 345 return pathbuf_copyin(path, ret);
346 } else { 346 } else {
347 *ret = pathbuf_create(path); 347 *ret = pathbuf_create(path);
348 if (*ret == NULL) { 348 if (*ret == NULL) {
349 return ENOMEM; 349 return ENOMEM;
350 } 350 }
351 return 0; 351 return 0;
352 } 352 }
353} 353}
354 354
/*
 * Get a copy of the path buffer as it currently exists.  If this is
 * called after namei starts the results may be arbitrary.
 */
void
pathbuf_copystring(const struct pathbuf *pb, char *buf, size_t maxlen)
{
	/* strlcpy guarantees NUL termination within maxlen. */
	strlcpy(buf, pb->pb_path, maxlen);
}
364 364
365/* 365/*
366 * These two functions allow access to a saved copy of the original 366 * These two functions allow access to a saved copy of the original
367 * path string. The first copy should be gotten before namei is 367 * path string. The first copy should be gotten before namei is
368 * called. Each copy that is gotten should be put back. 368 * called. Each copy that is gotten should be put back.
369 */ 369 */
370 370
371const char * 371const char *
372pathbuf_stringcopy_get(struct pathbuf *pb) 372pathbuf_stringcopy_get(struct pathbuf *pb)
373{ 373{
374 if (pb->pb_pathcopyuses == 0) { 374 if (pb->pb_pathcopyuses == 0) {
375 pb->pb_pathcopy = PNBUF_GET(); 375 pb->pb_pathcopy = PNBUF_GET();
376 strcpy(pb->pb_pathcopy, pb->pb_path); 376 strcpy(pb->pb_pathcopy, pb->pb_path);
377 } 377 }
378 pb->pb_pathcopyuses++; 378 pb->pb_pathcopyuses++;
379 return pb->pb_pathcopy; 379 return pb->pb_pathcopy;
380} 380}
381 381
382void 382void
383pathbuf_stringcopy_put(struct pathbuf *pb, const char *str) 383pathbuf_stringcopy_put(struct pathbuf *pb, const char *str)
384{ 384{
385 KASSERT(str == pb->pb_pathcopy); 385 KASSERT(str == pb->pb_pathcopy);
386 KASSERT(pb->pb_pathcopyuses > 0); 386 KASSERT(pb->pb_pathcopyuses > 0);
387 pb->pb_pathcopyuses--; 387 pb->pb_pathcopyuses--;
388 if (pb->pb_pathcopyuses == 0) { 388 if (pb->pb_pathcopyuses == 0) {
389 PNBUF_PUT(pb->pb_pathcopy); 389 PNBUF_PUT(pb->pb_pathcopy);
390 pb->pb_pathcopy = NULL; 390 pb->pb_pathcopy = NULL;
391 } 391 }
392} 392}
393 393
394 394
395//////////////////////////////////////////////////////////// 395////////////////////////////////////////////////////////////
396 396
397/* 397/*
398 * namei: convert a pathname into a pointer to a (maybe-locked) vnode, 398 * namei: convert a pathname into a pointer to a (maybe-locked) vnode,
399 * and maybe also its parent directory vnode, and assorted other guff. 399 * and maybe also its parent directory vnode, and assorted other guff.
400 * See namei(9) for the interface documentation. 400 * See namei(9) for the interface documentation.
401 * 401 *
402 * 402 *
403 * The FOLLOW flag is set when symbolic links are to be followed 403 * The FOLLOW flag is set when symbolic links are to be followed
404 * when they occur at the end of the name translation process. 404 * when they occur at the end of the name translation process.
405 * Symbolic links are always followed for all other pathname 405 * Symbolic links are always followed for all other pathname
406 * components other than the last. 406 * components other than the last.
407 * 407 *
408 * The segflg defines whether the name is to be copied from user 408 * The segflg defines whether the name is to be copied from user
409 * space or kernel space. 409 * space or kernel space.
410 * 410 *
411 * Overall outline of namei: 411 * Overall outline of namei:
412 * 412 *
413 * copy in name 413 * copy in name
414 * get starting directory 414 * get starting directory
415 * while (!done && !error) { 415 * while (!done && !error) {
416 * call lookup to search path. 416 * call lookup to search path.
417 * if symbolic link, massage name in buffer and continue 417 * if symbolic link, massage name in buffer and continue
418 * } 418 * }
419 */ 419 */
420 420
421/* 421/*
422 * Search a pathname. 422 * Search a pathname.
423 * This is a very central and rather complicated routine. 423 * This is a very central and rather complicated routine.
424 * 424 *
425 * The pathname is pointed to by ni_ptr and is of length ni_pathlen. 425 * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
426 * The starting directory is passed in. The pathname is descended 426 * The starting directory is passed in. The pathname is descended
427 * until done, or a symbolic link is encountered. The variable ni_more 427 * until done, or a symbolic link is encountered. The variable ni_more
428 * is clear if the path is completed; it is set to one if a symbolic 428 * is clear if the path is completed; it is set to one if a symbolic
429 * link needing interpretation is encountered. 429 * link needing interpretation is encountered.
430 * 430 *
431 * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on 431 * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
432 * whether the name is to be looked up, created, renamed, or deleted. 432 * whether the name is to be looked up, created, renamed, or deleted.
433 * When CREATE, RENAME, or DELETE is specified, information usable in 433 * When CREATE, RENAME, or DELETE is specified, information usable in
434 * creating, renaming, or deleting a directory entry may be calculated. 434 * creating, renaming, or deleting a directory entry may be calculated.
435 * If flag has LOCKPARENT or'ed into it, the parent directory is returned 435 * If flag has LOCKPARENT or'ed into it, the parent directory is returned
436 * locked. Otherwise the parent directory is not returned. If the target 436 * locked. Otherwise the parent directory is not returned. If the target
437 * of the pathname exists and LOCKLEAF is or'ed into the flag the target 437 * of the pathname exists and LOCKLEAF is or'ed into the flag the target
438 * is returned locked, otherwise it is returned unlocked. When creating 438 * is returned locked, otherwise it is returned unlocked. When creating
439 * or renaming and LOCKPARENT is specified, the target may not be ".". 439 * or renaming and LOCKPARENT is specified, the target may not be ".".
440 * When deleting and LOCKPARENT is specified, the target may be ".". 440 * When deleting and LOCKPARENT is specified, the target may be ".".
441 * 441 *
442 * Overall outline of lookup: 442 * Overall outline of lookup:
443 * 443 *
444 * dirloop: 444 * dirloop:
445 * identify next component of name at ndp->ni_ptr 445 * identify next component of name at ndp->ni_ptr
446 * handle degenerate case where name is null string 446 * handle degenerate case where name is null string
447 * if .. and crossing mount points and on mounted filesys, find parent 447 * if .. and crossing mount points and on mounted filesys, find parent
448 * call VOP_LOOKUP routine for next component name 448 * call VOP_LOOKUP routine for next component name
449 * directory vnode returned in ni_dvp, locked. 449 * directory vnode returned in ni_dvp, locked.
450 * component vnode returned in ni_vp (if it exists), locked. 450 * component vnode returned in ni_vp (if it exists), locked.
451 * if result vnode is mounted on and crossing mount points, 451 * if result vnode is mounted on and crossing mount points,
452 * find mounted on vnode 452 * find mounted on vnode
453 * if more components of name, do next level at dirloop 453 * if more components of name, do next level at dirloop
454 * return the answer in ni_vp, locked if LOCKLEAF set 454 * return the answer in ni_vp, locked if LOCKLEAF set
455 * if LOCKPARENT set, return locked parent in ni_dvp 455 * if LOCKPARENT set, return locked parent in ni_dvp
456 */ 456 */
457 457
458 458
459/* 459/*
460 * Internal state for a namei operation. 460 * Internal state for a namei operation.
461 * 461 *
462 * cnp is always equal to &ndp->ni_cnp. 462 * cnp is always equal to &ndp->ni_cnp.
463 */ 463 */
struct namei_state {
	struct nameidata *ndp;		/* the namei arguments */
	struct componentname *cnp;	/* always == &ndp->ni_cnd */

	int docache;		/* == 0 do not cache last component */
	int rdonly;		/* lookup read-only flag bit */
	int slashes;		/* count of trailing slashes on component */

	unsigned attempt_retry:1;	/* true if error allows emul retry */
	unsigned root_referenced:1;	/* true if ndp->ni_rootdir and
					   ndp->ni_erootdir were referenced */
};
476 476
477 477
478/* 478/*
479 * Initialize the namei working state. 479 * Initialize the namei working state.
480 */ 480 */
481static void 481static void
482namei_init(struct namei_state *state, struct nameidata *ndp) 482namei_init(struct namei_state *state, struct nameidata *ndp)
483{ 483{
484 484
485 state->ndp = ndp; 485 state->ndp = ndp;
486 state->cnp = &ndp->ni_cnd; 486 state->cnp = &ndp->ni_cnd;
487 487
488 state->docache = 0; 488 state->docache = 0;
489 state->rdonly = 0; 489 state->rdonly = 0;
490 state->slashes = 0; 490 state->slashes = 0;
491 491
492 state->root_referenced = 0; 492 state->root_referenced = 0;
493 493
494 KASSERTMSG((state->cnp->cn_cred != NULL), "namei: bad cred/proc"); 494 KASSERTMSG((state->cnp->cn_cred != NULL), "namei: bad cred/proc");
495 KASSERTMSG(((state->cnp->cn_nameiop & (~OPMASK)) == 0), 495 KASSERTMSG(((state->cnp->cn_nameiop & (~OPMASK)) == 0),
496 "namei: nameiop contaminated with flags: %08"PRIx32, 496 "namei: nameiop contaminated with flags: %08"PRIx32,
497 state->cnp->cn_nameiop); 497 state->cnp->cn_nameiop);
498 KASSERTMSG(((state->cnp->cn_flags & OPMASK) == 0), 498 KASSERTMSG(((state->cnp->cn_flags & OPMASK) == 0),
499 "name: flags contaminated with nameiops: %08"PRIx32, 499 "name: flags contaminated with nameiops: %08"PRIx32,
500 state->cnp->cn_flags); 500 state->cnp->cn_flags);
501 501
502 /* 502 /*
503 * The buffer for name translation shall be the one inside the 503 * The buffer for name translation shall be the one inside the
504 * pathbuf. 504 * pathbuf.
505 */ 505 */
506 state->ndp->ni_pnbuf = state->ndp->ni_pathbuf->pb_path; 506 state->ndp->ni_pnbuf = state->ndp->ni_pathbuf->pb_path;
507} 507}
508 508
509/* 509/*
510 * Clean up the working namei state, leaving things ready for return 510 * Clean up the working namei state, leaving things ready for return
511 * from namei. 511 * from namei.
512 */ 512 */
513static void 513static void
514namei_cleanup(struct namei_state *state) 514namei_cleanup(struct namei_state *state)
515{ 515{
516 KASSERT(state->cnp == &state->ndp->ni_cnd); 516 KASSERT(state->cnp == &state->ndp->ni_cnd);
517 517
518 if (state->root_referenced) { 518 if (state->root_referenced) {
519 if (state->ndp->ni_rootdir != NULL) 519 if (state->ndp->ni_rootdir != NULL)
520 vrele(state->ndp->ni_rootdir); 520 vrele(state->ndp->ni_rootdir);
521 if (state->ndp->ni_erootdir != NULL) 521 if (state->ndp->ni_erootdir != NULL)
522 vrele(state->ndp->ni_erootdir); 522 vrele(state->ndp->ni_erootdir);
523 } 523 }
524} 524}
525 525
526////////////////////////////// 526//////////////////////////////
527 527
528/* 528/*
529 * Get the directory context. 529 * Get the directory context.
530 * Initializes the rootdir and erootdir state and returns a reference 530 * Initializes the rootdir and erootdir state and returns a reference
531 * to the starting dir. 531 * to the starting dir.
532 */ 532 */
533static struct vnode * 533static struct vnode *
534namei_getstartdir(struct namei_state *state) 534namei_getstartdir(struct namei_state *state)
535{ 535{
536 struct nameidata *ndp = state->ndp; 536 struct nameidata *ndp = state->ndp;
537 struct componentname *cnp = state->cnp; 537 struct componentname *cnp = state->cnp;
538 struct cwdinfo *cwdi; /* pointer to cwd state */ 538 struct cwdinfo *cwdi; /* pointer to cwd state */
539 struct lwp *self = curlwp; /* thread doing namei() */ 539 struct lwp *self = curlwp; /* thread doing namei() */
540 struct vnode *rootdir, *erootdir, *curdir, *startdir; 540 struct vnode *rootdir, *erootdir, *curdir, *startdir;
541 541
542 if (state->root_referenced) { 542 if (state->root_referenced) {
543 if (state->ndp->ni_rootdir != NULL) 543 if (state->ndp->ni_rootdir != NULL)
544 vrele(state->ndp->ni_rootdir); 544 vrele(state->ndp->ni_rootdir);
545 if (state->ndp->ni_erootdir != NULL) 545 if (state->ndp->ni_erootdir != NULL)
546 vrele(state->ndp->ni_erootdir); 546 vrele(state->ndp->ni_erootdir);
547 state->root_referenced = 0; 547 state->root_referenced = 0;
548 } 548 }
549 549
550 cwdi = self->l_proc->p_cwdi; 550 cwdi = self->l_proc->p_cwdi;
551 rw_enter(&cwdi->cwdi_lock, RW_READER); 551 rw_enter(&cwdi->cwdi_lock, RW_READER);
552 552
553 /* root dir */ 553 /* root dir */
554 if (cwdi->cwdi_rdir == NULL || (cnp->cn_flags & NOCHROOT)) { 554 if (cwdi->cwdi_rdir == NULL || (cnp->cn_flags & NOCHROOT)) {
555 rootdir = rootvnode; 555 rootdir = rootvnode;
556 } else { 556 } else {
557 rootdir = cwdi->cwdi_rdir; 557 rootdir = cwdi->cwdi_rdir;
558 } 558 }
559 559
560 /* emulation root dir, if any */ 560 /* emulation root dir, if any */
561 if ((cnp->cn_flags & TRYEMULROOT) == 0) { 561 if ((cnp->cn_flags & TRYEMULROOT) == 0) {
562 /* if we don't want it, don't fetch it */ 562 /* if we don't want it, don't fetch it */
563 erootdir = NULL; 563 erootdir = NULL;
564 } else if (cnp->cn_flags & EMULROOTSET) { 564 } else if (cnp->cn_flags & EMULROOTSET) {
565 /* explicitly set emulroot; "/../" doesn't override this */ 565 /* explicitly set emulroot; "/../" doesn't override this */
566 erootdir = ndp->ni_erootdir; 566 erootdir = ndp->ni_erootdir;
567 } else if (!strncmp(ndp->ni_pnbuf, "/../", 4)) { 567 } else if (!strncmp(ndp->ni_pnbuf, "/../", 4)) {
568 /* explicit reference to real rootdir */ 568 /* explicit reference to real rootdir */
569 erootdir = NULL; 569 erootdir = NULL;
570 } else { 570 } else {
571 /* may be null */ 571 /* may be null */
572 erootdir = cwdi->cwdi_edir; 572 erootdir = cwdi->cwdi_edir;
573 } 573 }
574 574
575 /* current dir */ 575 /* current dir */
576 curdir = cwdi->cwdi_cdir; 576 curdir = cwdi->cwdi_cdir;
577 577
578 if (ndp->ni_pnbuf[0] != '/') { 578 if (ndp->ni_pnbuf[0] != '/') {
579 if (ndp->ni_atdir != NULL) { 579 if (ndp->ni_atdir != NULL) {
580 startdir = ndp->ni_atdir; 580 startdir = ndp->ni_atdir;
581 } else { 581 } else {
582 startdir = curdir; 582 startdir = curdir;
583 } 583 }
584 erootdir = NULL; 584 erootdir = NULL;
585 } else if (cnp->cn_flags & TRYEMULROOT && erootdir != NULL) { 585 } else if (cnp->cn_flags & TRYEMULROOT && erootdir != NULL) {
586 startdir = erootdir; 586 startdir = erootdir;
587 } else { 587 } else {
588 startdir = rootdir; 588 startdir = rootdir;
589 erootdir = NULL; 589 erootdir = NULL;
590 } 590 }
591 591
592 state->ndp->ni_rootdir = rootdir; 592 state->ndp->ni_rootdir = rootdir;
593 state->ndp->ni_erootdir = erootdir; 593 state->ndp->ni_erootdir = erootdir;
594 594
595 /* 595 /*
596 * Get a reference to the start dir so we can safely unlock cwdi. 596 * Get a reference to the start dir so we can safely unlock cwdi.
597 * 597 *
598 * Must hold references to rootdir and erootdir while we're running. 598 * Must hold references to rootdir and erootdir while we're running.
599 * A multithreaded process may chroot during namei. 599 * A multithreaded process may chroot during namei.
600 */ 600 */
601 if (startdir != NULL) 601 if (startdir != NULL)
602 vref(startdir); 602 vref(startdir);
603 if (state->ndp->ni_rootdir != NULL) 603 if (state->ndp->ni_rootdir != NULL)
604 vref(state->ndp->ni_rootdir); 604 vref(state->ndp->ni_rootdir);
605 if (state->ndp->ni_erootdir != NULL) 605 if (state->ndp->ni_erootdir != NULL)
606 vref(state->ndp->ni_erootdir); 606 vref(state->ndp->ni_erootdir);
607 state->root_referenced = 1; 607 state->root_referenced = 1;
608 608
609 rw_exit(&cwdi->cwdi_lock); 609 rw_exit(&cwdi->cwdi_lock);
610 return startdir; 610 return startdir;
611} 611}
612 612
613/* 613/*
614 * Get the directory context for the nfsd case, in parallel to 614 * Get the directory context for the nfsd case, in parallel to
615 * getstartdir. Initializes the rootdir and erootdir state and 615 * getstartdir. Initializes the rootdir and erootdir state and
616 * returns a reference to the passed-in starting dir. 616 * returns a reference to the passed-in starting dir.
617 */ 617 */
618static struct vnode * 618static struct vnode *
619namei_getstartdir_for_nfsd(struct namei_state *state) 619namei_getstartdir_for_nfsd(struct namei_state *state)
620{ 620{
621 KASSERT(state->ndp->ni_atdir != NULL); 621 KASSERT(state->ndp->ni_atdir != NULL);
622 622
623 /* always use the real root, and never set an emulation root */ 623 /* always use the real root, and never set an emulation root */
624 if (rootvnode == NULL) { 624 if (rootvnode == NULL) {
625 return NULL; 625 return NULL;
626 } 626 }
627 state->ndp->ni_rootdir = rootvnode; 627 state->ndp->ni_rootdir = rootvnode;
628 state->ndp->ni_erootdir = NULL; 628 state->ndp->ni_erootdir = NULL;
629 629
630 vref(state->ndp->ni_atdir); 630 vref(state->ndp->ni_atdir);
631 KASSERT(! state->root_referenced); 631 KASSERT(! state->root_referenced);
632 vref(state->ndp->ni_rootdir); 632 vref(state->ndp->ni_rootdir);
633 state->root_referenced = 1; 633 state->root_referenced = 1;
634 return state->ndp->ni_atdir; 634 return state->ndp->ni_atdir;
635} 635}
636 636
637 637
638/* 638/*
639 * Ktrace the namei operation. 639 * Ktrace the namei operation.
640 */ 640 */
641static void 641static void
642namei_ktrace(struct namei_state *state) 642namei_ktrace(struct namei_state *state)
643{ 643{
644 struct nameidata *ndp = state->ndp; 644 struct nameidata *ndp = state->ndp;
645 struct componentname *cnp = state->cnp; 645 struct componentname *cnp = state->cnp;
646 struct lwp *self = curlwp; /* thread doing namei() */ 646 struct lwp *self = curlwp; /* thread doing namei() */
647 const char *emul_path; 647 const char *emul_path;
648 648
649 if (ktrpoint(KTR_NAMEI)) { 649 if (ktrpoint(KTR_NAMEI)) {
650 if (ndp->ni_erootdir != NULL) { 650 if (ndp->ni_erootdir != NULL) {
651 /* 651 /*
652 * To make any sense, the trace entry need to have the 652 * To make any sense, the trace entry need to have the
653 * text of the emulation path prepended. 653 * text of the emulation path prepended.
654 * Usually we can get this from the current process, 654 * Usually we can get this from the current process,
655 * but when called from emul_find_interp() it is only 655 * but when called from emul_find_interp() it is only
656 * in the exec_package - so we get it passed in ni_next 656 * in the exec_package - so we get it passed in ni_next
657 * (this is a hack). 657 * (this is a hack).
658 */ 658 */
659 if (cnp->cn_flags & EMULROOTSET) 659 if (cnp->cn_flags & EMULROOTSET)
660 emul_path = ndp->ni_next; 660 emul_path = ndp->ni_next;
661 else 661 else
662 emul_path = self->l_proc->p_emul->e_path; 662 emul_path = self->l_proc->p_emul->e_path;
663 ktrnamei2(emul_path, strlen(emul_path), 663 ktrnamei2(emul_path, strlen(emul_path),
664 ndp->ni_pnbuf, ndp->ni_pathlen); 664 ndp->ni_pnbuf, ndp->ni_pathlen);
665 } else 665 } else
666 ktrnamei(ndp->ni_pnbuf, ndp->ni_pathlen); 666 ktrnamei(ndp->ni_pnbuf, ndp->ni_pathlen);
667 } 667 }
668} 668}
669 669
670/* 670/*
671 * Start up namei. Find the root dir and cwd, establish the starting 671 * Start up namei. Find the root dir and cwd, establish the starting
672 * directory for lookup, and lock it. Also calls ktrace when 672 * directory for lookup, and lock it. Also calls ktrace when
673 * appropriate. 673 * appropriate.
674 */ 674 */
675static int 675static int
676namei_start(struct namei_state *state, int isnfsd, 676namei_start(struct namei_state *state, int isnfsd,
677 struct vnode **startdir_ret) 677 struct vnode **startdir_ret)
678{ 678{
679 struct nameidata *ndp = state->ndp; 679 struct nameidata *ndp = state->ndp;
680 struct vnode *startdir; 680 struct vnode *startdir;
681 681
682 /* length includes null terminator (was originally from copyinstr) */ 682 /* length includes null terminator (was originally from copyinstr) */
683 ndp->ni_pathlen = strlen(ndp->ni_pnbuf) + 1; 683 ndp->ni_pathlen = strlen(ndp->ni_pnbuf) + 1;
684 684
685 /* 685 /*
686 * POSIX.1 requirement: "" is not a valid file name. 686 * POSIX.1 requirement: "" is not a valid file name.
687 */ 687 */
688 if (ndp->ni_pathlen == 1) { 688 if (ndp->ni_pathlen == 1) {
689 ndp->ni_erootdir = NULL; 689 ndp->ni_erootdir = NULL;
690 return ENOENT; 690 return ENOENT;
691 } 691 }
692 692
693 ndp->ni_loopcnt = 0; 693 ndp->ni_loopcnt = 0;
694 694
695 /* Get starting directory, set up root, and ktrace. */ 695 /* Get starting directory, set up root, and ktrace. */
696 if (isnfsd) { 696 if (isnfsd) {
697 startdir = namei_getstartdir_for_nfsd(state); 697 startdir = namei_getstartdir_for_nfsd(state);
698 /* no ktrace */ 698 /* no ktrace */
699 } else { 699 } else {
700 startdir = namei_getstartdir(state); 700 startdir = namei_getstartdir(state);
701 namei_ktrace(state); 701 namei_ktrace(state);
702 } 702 }
703 703
704 if (startdir == NULL) { 704 if (startdir == NULL) {
705 return ENOENT; 705 return ENOENT;
706 } 706 }
707 707
708 /* NDAT may feed us with a non directory namei_getstartdir */ 708 /* NDAT may feed us with a non directory namei_getstartdir */
709 if (startdir->v_type != VDIR) { 709 if (startdir->v_type != VDIR) {
710 vrele(startdir); 710 vrele(startdir);
711 return ENOTDIR; 711 return ENOTDIR;
712 } 712 }
713 713
714 *startdir_ret = startdir; 714 *startdir_ret = startdir;
715 return 0; 715 return 0;
716} 716}
717 717
718/* 718/*
719 * Check for being at a symlink that we're going to follow. 719 * Check for being at a symlink that we're going to follow.
720 */ 720 */
721static inline int 721static inline int
722namei_atsymlink(struct namei_state *state, struct vnode *foundobj) 722namei_atsymlink(struct namei_state *state, struct vnode *foundobj)
723{ 723{
724 return (foundobj->v_type == VLNK) && 724 return (foundobj->v_type == VLNK) &&
725 (state->cnp->cn_flags & (FOLLOW|REQUIREDIR)); 725 (state->cnp->cn_flags & (FOLLOW|REQUIREDIR));
726} 726}
727 727
728/* 728/*
729 * Follow a symlink. 729 * Follow a symlink.
730 * 730 *
731 * Updates searchdir. inhibitmagic causes magic symlinks to not be 731 * Updates searchdir. inhibitmagic causes magic symlinks to not be
732 * interpreted; this is used by nfsd. 732 * interpreted; this is used by nfsd.
733 * 733 *
734 * Unlocks foundobj on success (ugh) 734 * Unlocks foundobj on success (ugh)
735 */ 735 */
736static inline int 736static inline int
737namei_follow(struct namei_state *state, int inhibitmagic, 737namei_follow(struct namei_state *state, int inhibitmagic,
738 struct vnode *searchdir, struct vnode *foundobj, 738 struct vnode *searchdir, struct vnode *foundobj,
739 struct vnode **newsearchdir_ret) 739 struct vnode **newsearchdir_ret)
740{ 740{
741 struct nameidata *ndp = state->ndp; 741 struct nameidata *ndp = state->ndp;
742 struct componentname *cnp = state->cnp; 742 struct componentname *cnp = state->cnp;
743 743
744 struct lwp *self = curlwp; /* thread doing namei() */ 744 struct lwp *self = curlwp; /* thread doing namei() */
745 struct iovec aiov; /* uio for reading symbolic links */ 745 struct iovec aiov; /* uio for reading symbolic links */
746 struct uio auio; 746 struct uio auio;
747 char *cp; /* pointer into pathname argument */ 747 char *cp; /* pointer into pathname argument */
748 size_t linklen; 748 size_t linklen;
749 int error; 749 int error;
750 750
751 if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { 751 if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
752 return ELOOP; 752 return ELOOP;
753 } 753 }
754 754
755 vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); 755 vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
756 if (foundobj->v_mount->mnt_flag & MNT_SYMPERM) { 756 if (foundobj->v_mount->mnt_flag & MNT_SYMPERM) {
757 error = VOP_ACCESS(foundobj, VEXEC, cnp->cn_cred); 757 error = VOP_ACCESS(foundobj, VEXEC, cnp->cn_cred);
758 if (error != 0) { 758 if (error != 0) {
759 VOP_UNLOCK(foundobj); 759 VOP_UNLOCK(foundobj);
760 return error; 760 return error;
761 } 761 }
762 } 762 }
763 763
764 /* FUTURE: fix this to not use a second buffer */ 764 /* FUTURE: fix this to not use a second buffer */
765 cp = PNBUF_GET(); 765 cp = PNBUF_GET();
766 aiov.iov_base = cp; 766 aiov.iov_base = cp;
767 aiov.iov_len = MAXPATHLEN; 767 aiov.iov_len = MAXPATHLEN;
768 auio.uio_iov = &aiov; 768 auio.uio_iov = &aiov;
769 auio.uio_iovcnt = 1; 769 auio.uio_iovcnt = 1;
770 auio.uio_offset = 0; 770 auio.uio_offset = 0;
771 auio.uio_rw = UIO_READ; 771 auio.uio_rw = UIO_READ;
772 auio.uio_resid = MAXPATHLEN; 772 auio.uio_resid = MAXPATHLEN;
773 UIO_SETUP_SYSSPACE(&auio); 773 UIO_SETUP_SYSSPACE(&auio);
774 error = VOP_READLINK(foundobj, &auio, cnp->cn_cred); 774 error = VOP_READLINK(foundobj, &auio, cnp->cn_cred);
775 VOP_UNLOCK(foundobj); 775 VOP_UNLOCK(foundobj);
776 if (error) { 776 if (error) {
777 PNBUF_PUT(cp); 777 PNBUF_PUT(cp);
778 return error; 778 return error;
779 } 779 }
780 linklen = MAXPATHLEN - auio.uio_resid; 780 linklen = MAXPATHLEN - auio.uio_resid;
781 if (linklen == 0) { 781 if (linklen == 0) {
782 PNBUF_PUT(cp); 782 PNBUF_PUT(cp);
783 return ENOENT; 783 return ENOENT;
784 } 784 }
785 785
786 /* 786 /*
787 * Do symlink substitution, if appropriate, and 787 * Do symlink substitution, if appropriate, and
788 * check length for potential overflow. 788 * check length for potential overflow.
789 * 789 *
790 * Inhibit symlink substitution for nfsd. 790 * Inhibit symlink substitution for nfsd.
791 * XXX: This is how it was before; is that a bug or a feature? 791 * XXX: This is how it was before; is that a bug or a feature?
792 */ 792 */
793 if ((!inhibitmagic && vfs_magiclinks && 793 if ((!inhibitmagic && vfs_magiclinks &&
794 symlink_magic(self->l_proc, cp, &linklen)) || 794 symlink_magic(self->l_proc, cp, &linklen)) ||
795 (linklen + ndp->ni_pathlen >= MAXPATHLEN)) { 795 (linklen + ndp->ni_pathlen >= MAXPATHLEN)) {
796 PNBUF_PUT(cp); 796 PNBUF_PUT(cp);
797 return ENAMETOOLONG; 797 return ENAMETOOLONG;
798 } 798 }
799 if (ndp->ni_pathlen > 1) { 799 if (ndp->ni_pathlen > 1) {
800 /* includes a null-terminator */ 800 /* includes a null-terminator */
801 memcpy(cp + linklen, ndp->ni_next, ndp->ni_pathlen); 801 memcpy(cp + linklen, ndp->ni_next, ndp->ni_pathlen);
802 } else { 802 } else {
803 cp[linklen] = '\0'; 803 cp[linklen] = '\0';
804 } 804 }
805 ndp->ni_pathlen += linklen; 805 ndp->ni_pathlen += linklen;
806 memcpy(ndp->ni_pnbuf, cp, ndp->ni_pathlen); 806 memcpy(ndp->ni_pnbuf, cp, ndp->ni_pathlen);
807 PNBUF_PUT(cp); 807 PNBUF_PUT(cp);
808 808
809 /* we're now starting from the beginning of the buffer again */ 809 /* we're now starting from the beginning of the buffer again */
810 cnp->cn_nameptr = ndp->ni_pnbuf; 810 cnp->cn_nameptr = ndp->ni_pnbuf;
811 811
812 /* 812 /*
813 * Check if root directory should replace current directory. 813 * Check if root directory should replace current directory.
814 */ 814 */
815 if (ndp->ni_pnbuf[0] == '/') { 815 if (ndp->ni_pnbuf[0] == '/') {
816 vrele(searchdir); 816 vrele(searchdir);
817 /* Keep absolute symbolic links inside emulation root */ 817 /* Keep absolute symbolic links inside emulation root */
818 searchdir = ndp->ni_erootdir; 818 searchdir = ndp->ni_erootdir;
819 if (searchdir == NULL || 819 if (searchdir == NULL ||
820 (ndp->ni_pnbuf[1] == '.'  820 (ndp->ni_pnbuf[1] == '.'
821 && ndp->ni_pnbuf[2] == '.' 821 && ndp->ni_pnbuf[2] == '.'
822 && ndp->ni_pnbuf[3] == '/')) { 822 && ndp->ni_pnbuf[3] == '/')) {
823 ndp->ni_erootdir = NULL; 823 ndp->ni_erootdir = NULL;
824 searchdir = ndp->ni_rootdir; 824 searchdir = ndp->ni_rootdir;
825 } 825 }
826 vref(searchdir); 826 vref(searchdir);
827 while (cnp->cn_nameptr[0] == '/') { 827 while (cnp->cn_nameptr[0] == '/') {
828 cnp->cn_nameptr++; 828 cnp->cn_nameptr++;
829 ndp->ni_pathlen--; 829 ndp->ni_pathlen--;
830 } 830 }
831 } 831 }
832 832
833 *newsearchdir_ret = searchdir; 833 *newsearchdir_ret = searchdir;
834 return 0; 834 return 0;
835} 835}
836 836
837////////////////////////////// 837//////////////////////////////
838 838
839/* 839/*
840 * Inspect the leading path component and update the state accordingly. 840 * Inspect the leading path component and update the state accordingly.
841 */ 841 */
842static int 842static int
843lookup_parsepath(struct namei_state *state) 843lookup_parsepath(struct namei_state *state)
844{ 844{
845 const char *cp; /* pointer into pathname argument */ 845 const char *cp; /* pointer into pathname argument */
846 846
847 struct componentname *cnp = state->cnp; 847 struct componentname *cnp = state->cnp;
848 struct nameidata *ndp = state->ndp; 848 struct nameidata *ndp = state->ndp;
849 849
850 KASSERT(cnp == &ndp->ni_cnd); 850 KASSERT(cnp == &ndp->ni_cnd);
851 851
852 /* 852 /*
853 * Search a new directory. 853 * Search a new directory.
854 * 854 *
855 * The last component of the filename is left accessible via 855 * The last component of the filename is left accessible via
856 * cnp->cn_nameptr for callers that need the name. Callers needing 856 * cnp->cn_nameptr for callers that need the name. Callers needing
857 * the name set the SAVENAME flag. When done, they assume 857 * the name set the SAVENAME flag. When done, they assume
858 * responsibility for freeing the pathname buffer. 858 * responsibility for freeing the pathname buffer.
859 * 859 *
860 * At this point, our only vnode state is that the search dir 860 * At this point, our only vnode state is that the search dir
861 * is held. 861 * is held.
862 */ 862 */
863 cnp->cn_consume = 0; 863 cnp->cn_consume = 0;
864 cnp->cn_namelen = namei_getcomponent(cnp->cn_nameptr); 864 cnp->cn_namelen = namei_getcomponent(cnp->cn_nameptr);
865 cp = cnp->cn_nameptr + cnp->cn_namelen; 865 cp = cnp->cn_nameptr + cnp->cn_namelen;
866 if (cnp->cn_namelen > KERNEL_NAME_MAX) { 866 if (cnp->cn_namelen > KERNEL_NAME_MAX) {
867 return ENAMETOOLONG; 867 return ENAMETOOLONG;
868 } 868 }
869#ifdef NAMEI_DIAGNOSTIC 869#ifdef NAMEI_DIAGNOSTIC
870 { char c = *cp; 870 { char c = *cp;
871 *(char *)cp = '\0'; 871 *(char *)cp = '\0';
872 printf("{%s}: ", cnp->cn_nameptr); 872 printf("{%s}: ", cnp->cn_nameptr);
873 *(char *)cp = c; } 873 *(char *)cp = c; }
874#endif /* NAMEI_DIAGNOSTIC */ 874#endif /* NAMEI_DIAGNOSTIC */
875 ndp->ni_pathlen -= cnp->cn_namelen; 875 ndp->ni_pathlen -= cnp->cn_namelen;
876 ndp->ni_next = cp; 876 ndp->ni_next = cp;
877 /* 877 /*
878 * If this component is followed by a slash, then move the pointer to 878 * If this component is followed by a slash, then move the pointer to
879 * the next component forward, and remember that this component must be 879 * the next component forward, and remember that this component must be
880 * a directory. 880 * a directory.
881 */ 881 */
882 if (*cp == '/') { 882 if (*cp == '/') {
883 do { 883 do {
884 cp++; 884 cp++;
885 } while (*cp == '/'); 885 } while (*cp == '/');
886 state->slashes = cp - ndp->ni_next; 886 state->slashes = cp - ndp->ni_next;
887 ndp->ni_pathlen -= state->slashes; 887 ndp->ni_pathlen -= state->slashes;
888 ndp->ni_next = cp; 888 ndp->ni_next = cp;
889 cnp->cn_flags |= REQUIREDIR; 889 cnp->cn_flags |= REQUIREDIR;
890 } else { 890 } else {
891 state->slashes = 0; 891 state->slashes = 0;
892 cnp->cn_flags &= ~REQUIREDIR; 892 cnp->cn_flags &= ~REQUIREDIR;
893 } 893 }
894 /* 894 /*
895 * We do special processing on the last component, whether or not it's 895 * We do special processing on the last component, whether or not it's
896 * a directory. Cache all intervening lookups, but not the final one. 896 * a directory. Cache all intervening lookups, but not the final one.
897 */ 897 */
898 if (*cp == '\0') { 898 if (*cp == '\0') {
899 if (state->docache) 899 if (state->docache)
900 cnp->cn_flags |= MAKEENTRY; 900 cnp->cn_flags |= MAKEENTRY;
901 else 901 else
902 cnp->cn_flags &= ~MAKEENTRY; 902 cnp->cn_flags &= ~MAKEENTRY;
903 cnp->cn_flags |= ISLASTCN; 903 cnp->cn_flags |= ISLASTCN;
904 } else { 904 } else {
905 cnp->cn_flags |= MAKEENTRY; 905 cnp->cn_flags |= MAKEENTRY;
906 cnp->cn_flags &= ~ISLASTCN; 906 cnp->cn_flags &= ~ISLASTCN;
907 } 907 }
908 if (cnp->cn_namelen == 2 && 908 if (cnp->cn_namelen == 2 &&
909 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 909 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
910 cnp->cn_flags |= ISDOTDOT; 910 cnp->cn_flags |= ISDOTDOT;
911 else 911 else
912 cnp->cn_flags &= ~ISDOTDOT; 912 cnp->cn_flags &= ~ISDOTDOT;
913 913
914 return 0; 914 return 0;
915} 915}
916 916
917/* 917/*
918 * Take care of crossing a mounted-on vnode. On error, foundobj_ret will be 918 * Take care of crossing a mounted-on vnode. On error, foundobj_ret will be
919 * vrele'd, but searchdir is left alone. 919 * vrele'd, but searchdir is left alone.
920 */ 920 */
921static int 921static int
922lookup_crossmount(struct namei_state *state, 922lookup_crossmount(struct namei_state *state,
923 struct vnode **searchdir_ret, 923 struct vnode **searchdir_ret,
924 struct vnode **foundobj_ret, 924 struct vnode **foundobj_ret,
925 bool *searchdir_locked) 925 bool *searchdir_locked)
926{ 926{
927 struct componentname *cnp = state->cnp; 927 struct componentname *cnp = state->cnp;
928 struct vnode *foundobj; 928 struct vnode *foundobj;
929 struct vnode *searchdir; 929 struct vnode *searchdir;
930 struct mount *mp; 930 struct mount *mp;
931 int error, lktype; 931 int error, lktype;
932 932
933 searchdir = *searchdir_ret; 933 searchdir = *searchdir_ret;
934 foundobj = *foundobj_ret; 934 foundobj = *foundobj_ret;
935 error = 0; 935 error = 0;
936 936
937 KASSERT((cnp->cn_flags & NOCROSSMOUNT) == 0); 937 KASSERT((cnp->cn_flags & NOCROSSMOUNT) == 0);
938 KASSERT(searchdir != NULL); 938 KASSERT(searchdir != NULL);
939 939
940 /* First, unlock searchdir (oof). */ 940 /* First, unlock searchdir (oof). */
941 if (*searchdir_locked) { 941 if (*searchdir_locked) {
942 lktype = VOP_ISLOCKED(searchdir); 942 lktype = VOP_ISLOCKED(searchdir);
943 VOP_UNLOCK(searchdir); 943 VOP_UNLOCK(searchdir);
944 *searchdir_locked = false; 944 *searchdir_locked = false;
945 } else { 945 } else {
946 lktype = LK_NONE; 946 lktype = LK_NONE;
947 } 947 }
948 948
949 /* 949 /*
950 * Do an unlocked check to see if the vnode has been mounted on; if 950 * Do an unlocked check to see if the vnode has been mounted on; if
951 * so find the root of the mounted file system. 951 * so find the root of the mounted file system.
952 */ 952 */
953 while (foundobj->v_type == VDIR && 953 while (foundobj->v_type == VDIR &&
954 (mp = foundobj->v_mountedhere) != NULL && 954 (mp = foundobj->v_mountedhere) != NULL &&
955 (cnp->cn_flags & NOCROSSMOUNT) == 0) { 955 (cnp->cn_flags & NOCROSSMOUNT) == 0) {
956 KASSERTMSG(searchdir != foundobj, "same vn %p", searchdir); 956 KASSERTMSG(searchdir != foundobj, "same vn %p", searchdir);
957 /* 957 /*
958 * First get the vnode stable. LK_SHARED works brilliantly 958 * First get the vnode stable. LK_SHARED works brilliantly
959 * here because almost nothing else wants to lock the 959 * here because almost nothing else wants to lock the
960 * covered vnode. 960 * covered vnode.
961 */ 961 */
962 error = vn_lock(foundobj, LK_SHARED); 962 error = vn_lock(foundobj, LK_SHARED);
963 if (error != 0) { 963 if (error != 0) {
964 vrele(foundobj); 964 vrele(foundobj);
965 foundobj = NULL; 965 foundobj = NULL;
966 break; 966 break;
967 } 967 }
968 968
969 /* Then check to see if something is still mounted on it. */ 969 /* Then check to see if something is still mounted on it. */
970 if ((mp = foundobj->v_mountedhere) == NULL) { 970 if ((mp = foundobj->v_mountedhere) == NULL) {
971 VOP_UNLOCK(foundobj); 971 VOP_UNLOCK(foundobj);
972 break; 972 break;
973 } 973 }
974 974
975 /* Get a reference to the mountpoint, and ditch foundobj. */ 975 /* Get a reference to the mountpoint, and ditch foundobj. */
976 error = vfs_busy(mp); 976 error = vfs_busy(mp);
977 vput(foundobj); 977 vput(foundobj);
978 if (error != 0) { 978 if (error != 0) {
979 foundobj = NULL; 979 foundobj = NULL;
980 break; 980 break;
981 } 981 }
982 982
983 /* Now get a reference on the root vnode, and drop mount. */ 983 /* Now get a reference on the root vnode, and drop mount. */
984 error = VFS_ROOT(mp, LK_NONE, &foundobj); 984 error = VFS_ROOT(mp, LK_NONE, &foundobj);
985 vfs_unbusy(mp); 985 vfs_unbusy(mp);
986 if (error) { 986 if (error) {
987 foundobj = NULL; 987 foundobj = NULL;
988 break; 988 break;
989 } 989 }
990 990
991 /* 991 /*
992 * Avoid locking vnodes from two filesystems because 992 * Avoid locking vnodes from two filesystems because
993 * it's prone to deadlock, e.g. when using puffs. 993 * it's prone to deadlock, e.g. when using puffs.
994 * Also, it isn't a good idea to propagate slowness of 994 * Also, it isn't a good idea to propagate slowness of
995 * a filesystem up to the root directory. For now, 995 * a filesystem up to the root directory. For now,
996 * only handle the common case, where foundobj is 996 * only handle the common case, where foundobj is
997 * VDIR. 997 * VDIR.
998 * 998 *
999 * In this case set searchdir to null to avoid using 999 * In this case set searchdir to null to avoid using
1000 * it again. It is not correct to set searchdir == 1000 * it again. It is not correct to set searchdir ==
1001 * foundobj here as that will confuse the caller. 1001 * foundobj here as that will confuse the caller.
1002 * (See PR 40740.) 1002 * (See PR 40740.)
1003 */ 1003 */
1004 if (searchdir == NULL) { 1004 if (searchdir == NULL) {
1005 /* already been here once; do nothing further */ 1005 /* already been here once; do nothing further */
1006 } else if (foundobj->v_type == VDIR) { 1006 } else if (foundobj->v_type == VDIR) {
1007 vrele(searchdir); 1007 vrele(searchdir);
1008 *searchdir_ret = searchdir = NULL; 1008 *searchdir_ret = searchdir = NULL;
1009 lktype = LK_NONE; 1009 lktype = LK_NONE;
1010 } 1010 }
1011 } 1011 }
1012 1012
1013 /* If searchdir is still around, re-lock it. */ 1013 /* If searchdir is still around, re-lock it. */
1014 if (error == 0 && lktype != LK_NONE) { 1014 if (error == 0 && lktype != LK_NONE) {
1015 vn_lock(searchdir, lktype | LK_RETRY); 1015 vn_lock(searchdir, lktype | LK_RETRY);
1016 *searchdir_locked = true; 1016 *searchdir_locked = true;
1017 } 1017 }
1018 *foundobj_ret = foundobj; 1018 *foundobj_ret = foundobj;
1019 return error; 1019 return error;
1020} 1020}
1021 1021
1022/* 1022/*
1023 * Call VOP_LOOKUP for a single lookup; return a new search directory 1023 * Call VOP_LOOKUP for a single lookup; return a new search directory
1024 * (used when crossing mountpoints up or searching union mounts down) and  1024 * (used when crossing mountpoints up or searching union mounts down) and
1025 * the found object, which for create operations may be NULL on success. 1025 * the found object, which for create operations may be NULL on success.
1026 * 1026 *
1027 * Note that the new search directory may be null, which means the 1027 * Note that the new search directory may be null, which means the
1028 * searchdir was unlocked and released. This happens in the common case 1028 * searchdir was unlocked and released. This happens in the common case
1029 * when crossing a mount point downwards, in order to avoid coupling 1029 * when crossing a mount point downwards, in order to avoid coupling
1030 * locks between different file system volumes. Importantly, this can 1030 * locks between different file system volumes. Importantly, this can
1031 * happen even if the call fails. (XXX: this is gross and should be 1031 * happen even if the call fails. (XXX: this is gross and should be
1032 * tidied somehow.) 1032 * tidied somehow.)
1033 */ 1033 */
1034static int 1034static int
1035lookup_once(struct namei_state *state, 1035lookup_once(struct namei_state *state,
1036 struct vnode *searchdir, 1036 struct vnode *searchdir,
1037 struct vnode **newsearchdir_ret, 1037 struct vnode **newsearchdir_ret,
1038 struct vnode **foundobj_ret, 1038 struct vnode **foundobj_ret,
1039 bool *newsearchdir_locked_ret) 1039 bool *newsearchdir_locked_ret)
1040{ 1040{
1041 struct vnode *tmpvn; /* scratch vnode */ 1041 struct vnode *tmpvn; /* scratch vnode */
1042 struct vnode *foundobj; /* result */ 1042 struct vnode *foundobj; /* result */
1043 struct lwp *l = curlwp; 1043 struct lwp *l = curlwp;
1044 bool searchdir_locked = false; 1044 bool searchdir_locked = false;
1045 int error, lktype; 1045 int error, lktype;
1046 1046
1047 struct componentname *cnp = state->cnp; 1047 struct componentname *cnp = state->cnp;
1048 struct nameidata *ndp = state->ndp; 1048 struct nameidata *ndp = state->ndp;
1049 1049
1050 KASSERT(cnp == &ndp->ni_cnd); 1050 KASSERT(cnp == &ndp->ni_cnd);
1051 *newsearchdir_ret = searchdir; 1051 *newsearchdir_ret = searchdir;
1052 1052
1053 /* 1053 /*
1054 * Handle "..": two special cases. 1054 * Handle "..": two special cases.
1055 * 1. If at root directory (e.g. after chroot) 1055 * 1. If at root directory (e.g. after chroot)
1056 * or at absolute root directory 1056 * or at absolute root directory
1057 * then ignore it so can't get out. 1057 * then ignore it so can't get out.
1058 * 1a. If at the root of the emulation filesystem go to the real 1058 * 1a. If at the root of the emulation filesystem go to the real
1059 * root. So "/../<path>" is always absolute. 1059 * root. So "/../<path>" is always absolute.
1060 * 1b. If we have somehow gotten out of a jail, warn 1060 * 1b. If we have somehow gotten out of a jail, warn
1061 * and also ignore it so we can't get farther out. 1061 * and also ignore it so we can't get farther out.
1062 * 2. If this vnode is the root of a mounted 1062 * 2. If this vnode is the root of a mounted
1063 * filesystem, then replace it with the 1063 * filesystem, then replace it with the
1064 * vnode which was mounted on so we take the 1064 * vnode which was mounted on so we take the
1065 * .. in the other file system. 1065 * .. in the other file system.
1066 */ 1066 */
1067 if (cnp->cn_flags & ISDOTDOT) { 1067 if (cnp->cn_flags & ISDOTDOT) {
1068 struct proc *p = l->l_proc; 1068 struct proc *p = l->l_proc;
1069 1069
1070 for (;;) { 1070 for (;;) {
1071 if (searchdir == ndp->ni_rootdir || 1071 if (searchdir == ndp->ni_rootdir ||
1072 searchdir == rootvnode) { 1072 searchdir == rootvnode) {
1073 foundobj = searchdir; 1073 foundobj = searchdir;
1074 vref(foundobj); 1074 vref(foundobj);
1075 *foundobj_ret = foundobj; 1075 *foundobj_ret = foundobj;
1076 error = 0; 1076 error = 0;
1077 goto done; 1077 goto done;
1078 } 1078 }
1079 if (ndp->ni_rootdir != rootvnode) { 1079 if (ndp->ni_rootdir != rootvnode) {
1080 int retval; 1080 int retval;
1081 1081
1082 retval = vn_isunder(searchdir, ndp->ni_rootdir, l); 1082 retval = vn_isunder(searchdir, ndp->ni_rootdir, l);
1083 if (!retval) { 1083 if (!retval) {
1084 /* Oops! We got out of jail! */ 1084 /* Oops! We got out of jail! */
1085 log(LOG_WARNING, 1085 log(LOG_WARNING,
1086 "chrooted pid %d uid %d (%s) " 1086 "chrooted pid %d uid %d (%s) "
1087 "detected outside of its chroot\n", 1087 "detected outside of its chroot\n",
1088 p->p_pid, kauth_cred_geteuid(l->l_cred), 1088 p->p_pid, kauth_cred_geteuid(l->l_cred),
1089 p->p_comm); 1089 p->p_comm);
1090 /* Put us at the jail root. */ 1090 /* Put us at the jail root. */
1091 vrele(searchdir); 1091 vrele(searchdir);
1092 searchdir = NULL; 1092 searchdir = NULL;
1093 foundobj = ndp->ni_rootdir; 1093 foundobj = ndp->ni_rootdir;
1094 vref(foundobj); 1094 vref(foundobj);
1095 vref(foundobj); 1095 vref(foundobj);
1096 *newsearchdir_ret = foundobj; 1096 *newsearchdir_ret = foundobj;
1097 *foundobj_ret = foundobj; 1097 *foundobj_ret = foundobj;
1098 error = 0; 1098 error = 0;
1099 goto done; 1099 goto done;
1100 } 1100 }
1101 } 1101 }
1102 if ((searchdir->v_vflag & VV_ROOT) == 0 || 1102 if ((searchdir->v_vflag & VV_ROOT) == 0 ||
1103 (cnp->cn_flags & NOCROSSMOUNT)) 1103 (cnp->cn_flags & NOCROSSMOUNT))
1104 break; 1104 break;
1105 tmpvn = searchdir; 1105 tmpvn = searchdir;
1106 searchdir = searchdir->v_mount->mnt_vnodecovered; 1106 searchdir = searchdir->v_mount->mnt_vnodecovered;
1107 vref(searchdir); 1107 vref(searchdir);
1108 vrele(tmpvn); 1108 vrele(tmpvn);
1109 *newsearchdir_ret = searchdir; 1109 *newsearchdir_ret = searchdir;
1110 } 1110 }
1111 } 1111 }
1112 1112
1113 /* 1113 /*
1114 * If the file system supports VOP_LOOKUP() with a shared lock, and 1114 * If the file system supports VOP_LOOKUP() with a shared lock, and
1115 * we are not making any modifications (nameiop LOOKUP) or this is 1115 * we are not making any modifications (nameiop LOOKUP) or this is
1116 * not the last component then get a shared lock. Where we can't do 1116 * not the last component then get a shared lock. Where we can't do
1117 * fast-forwarded lookups (for example with layered file systems) 1117 * fast-forwarded lookups (for example with layered file systems)
1118 * then this is the fallback for reducing lock contention. 1118 * then this is the fallback for reducing lock contention.
1119 */ 1119 */
1120 if ((searchdir->v_mount->mnt_iflag & IMNT_SHRLOOKUP) != 0 && 1120 if ((searchdir->v_mount->mnt_iflag & IMNT_SHRLOOKUP) != 0 &&
1121 (cnp->cn_nameiop == LOOKUP || (cnp->cn_flags & ISLASTCN) == 0)) { 1121 (cnp->cn_nameiop == LOOKUP || (cnp->cn_flags & ISLASTCN) == 0)) {
1122 lktype = LK_SHARED; 1122 lktype = LK_SHARED;
1123 } else { 1123 } else {
1124 lktype = LK_EXCLUSIVE; 1124 lktype = LK_EXCLUSIVE;
1125 } 1125 }
1126 1126
1127 /* 1127 /*
1128 * We now have a segment name to search for, and a directory to search. 1128 * We now have a segment name to search for, and a directory to search.
1129 * Our vnode state here is that "searchdir" is held. 1129 * Our vnode state here is that "searchdir" is held.
1130 */ 1130 */
1131unionlookup: 1131unionlookup:
1132 foundobj = NULL; 1132 foundobj = NULL;
1133 if (!searchdir_locked) { 1133 if (!searchdir_locked) {
1134 vn_lock(searchdir, lktype | LK_RETRY); 1134 vn_lock(searchdir, lktype | LK_RETRY);
1135 searchdir_locked = true; 1135 searchdir_locked = true;
1136 } 1136 }
1137 error = VOP_LOOKUP(searchdir, &foundobj, cnp); 1137 error = VOP_LOOKUP(searchdir, &foundobj, cnp);
1138 1138
1139 if (error != 0) { 1139 if (error != 0) {
1140 KASSERTMSG((foundobj == NULL), 1140 KASSERTMSG((foundobj == NULL),
1141 "leaf `%s' should be empty but is %p", 1141 "leaf `%s' should be empty but is %p",
1142 cnp->cn_nameptr, foundobj); 1142 cnp->cn_nameptr, foundobj);
1143#ifdef NAMEI_DIAGNOSTIC 1143#ifdef NAMEI_DIAGNOSTIC
1144 printf("not found\n"); 1144 printf("not found\n");
1145#endif /* NAMEI_DIAGNOSTIC */ 1145#endif /* NAMEI_DIAGNOSTIC */
1146 1146
1147 /* 1147 /*
1148 * If ENOLCK, the file system needs us to retry the lookup 1148 * If ENOLCK, the file system needs us to retry the lookup
1149 * with an exclusive lock. It's likely nothing was found in 1149 * with an exclusive lock. It's likely nothing was found in
1150 * cache and/or modifications need to be made. 1150 * cache and/or modifications need to be made.
1151 */ 1151 */
1152 if (error == ENOLCK) { 1152 if (error == ENOLCK) {
1153 KASSERT(VOP_ISLOCKED(searchdir) == LK_SHARED); 1153 KASSERT(VOP_ISLOCKED(searchdir) == LK_SHARED);
1154 KASSERT(searchdir_locked); 1154 KASSERT(searchdir_locked);
1155 if (vn_lock(searchdir, LK_UPGRADE | LK_NOWAIT)) { 1155 if (vn_lock(searchdir, LK_UPGRADE | LK_NOWAIT)) {
1156 VOP_UNLOCK(searchdir); 1156 VOP_UNLOCK(searchdir);
1157 searchdir_locked = false; 1157 searchdir_locked = false;
1158 } 1158 }
1159 lktype = LK_EXCLUSIVE; 1159 lktype = LK_EXCLUSIVE;
1160 goto unionlookup; 1160 goto unionlookup;
1161 } 1161 }
1162 1162
1163 if ((error == ENOENT) && 1163 if ((error == ENOENT) &&
1164 (searchdir->v_vflag & VV_ROOT) && 1164 (searchdir->v_vflag & VV_ROOT) &&
1165 (searchdir->v_mount->mnt_flag & MNT_UNION)) { 1165 (searchdir->v_mount->mnt_flag & MNT_UNION)) {
1166 tmpvn = searchdir; 1166 tmpvn = searchdir;
1167 searchdir = searchdir->v_mount->mnt_vnodecovered; 1167 searchdir = searchdir->v_mount->mnt_vnodecovered;
1168 vref(searchdir); 1168 vref(searchdir);
1169 vput(tmpvn); 1169 vput(tmpvn);
1170 searchdir_locked = false; 1170 searchdir_locked = false;
1171 *newsearchdir_ret = searchdir; 1171 *newsearchdir_ret = searchdir;
1172 goto unionlookup; 1172 goto unionlookup;
1173 } 1173 }
1174 1174
1175 if (error != EJUSTRETURN) 1175 if (error != EJUSTRETURN)
1176 goto done; 1176 goto done;
1177 1177
1178 /* 1178 /*
1179 * If this was not the last component, or there were trailing 1179 * If this was not the last component, or there were trailing
1180 * slashes, and we are not going to create a directory, 1180 * slashes, and we are not going to create a directory,
1181 * then the name must exist. 1181 * then the name must exist.
1182 */ 1182 */
1183 if ((cnp->cn_flags & (REQUIREDIR | CREATEDIR)) == REQUIREDIR) { 1183 if ((cnp->cn_flags & (REQUIREDIR | CREATEDIR)) == REQUIREDIR) {
1184 error = ENOENT; 1184 error = ENOENT;
1185 goto done; 1185 goto done;
1186 } 1186 }
1187 1187
1188 /* 1188 /*
1189 * If creating and at end of pathname, then can consider 1189 * If creating and at end of pathname, then can consider
1190 * allowing file to be created. 1190 * allowing file to be created.
1191 */ 1191 */
1192 if (state->rdonly) { 1192 if (state->rdonly) {
1193 error = EROFS; 1193 error = EROFS;
1194 goto done; 1194 goto done;
1195 } 1195 }
1196 1196
1197 /* 1197 /*
1198 * We return success and a NULL foundobj to indicate 1198 * We return success and a NULL foundobj to indicate
1199 * that the entry doesn't currently exist, leaving a 1199 * that the entry doesn't currently exist, leaving a
1200 * pointer to the (normally, locked) directory vnode 1200 * pointer to the (normally, locked) directory vnode
1201 * as searchdir. 1201 * as searchdir.
1202 */ 1202 */
1203 *foundobj_ret = NULL; 1203 *foundobj_ret = NULL;
1204 error = 0; 1204 error = 0;
1205 goto done; 1205 goto done;
1206 } 1206 }
1207#ifdef NAMEI_DIAGNOSTIC 1207#ifdef NAMEI_DIAGNOSTIC
1208 printf("found\n"); 1208 printf("found\n");
1209#endif /* NAMEI_DIAGNOSTIC */ 1209#endif /* NAMEI_DIAGNOSTIC */
1210 1210
1211 /* 1211 /*
1212 * Take into account any additional components consumed by the 1212 * Take into account any additional components consumed by the
1213 * underlying filesystem. This will include any trailing slashes after 1213 * underlying filesystem. This will include any trailing slashes after
1214 * the last component consumed. 1214 * the last component consumed.
1215 */ 1215 */
1216 if (cnp->cn_consume > 0) { 1216 if (cnp->cn_consume > 0) {
1217 ndp->ni_pathlen -= cnp->cn_consume - state->slashes; 1217 ndp->ni_pathlen -= cnp->cn_consume - state->slashes;
1218 ndp->ni_next += cnp->cn_consume - state->slashes; 1218 ndp->ni_next += cnp->cn_consume - state->slashes;
1219 cnp->cn_consume = 0; 1219 cnp->cn_consume = 0;
1220 if (ndp->ni_next[0] == '\0') 1220 if (ndp->ni_next[0] == '\0')
1221 cnp->cn_flags |= ISLASTCN; 1221 cnp->cn_flags |= ISLASTCN;
1222 } 1222 }
1223 1223
1224 /* Unlock, unless the caller needs the parent locked. */ 1224 /* Unlock, unless the caller needs the parent locked. */
1225 if (searchdir != NULL) { 1225 if (searchdir != NULL) {
1226 KASSERT(searchdir_locked); 1226 KASSERT(searchdir_locked);
1227 if ((cnp->cn_flags & (ISLASTCN | LOCKPARENT)) != 1227 if ((cnp->cn_flags & (ISLASTCN | LOCKPARENT)) !=
1228 (ISLASTCN | LOCKPARENT)) { 1228 (ISLASTCN | LOCKPARENT)) {
1229 VOP_UNLOCK(searchdir); 1229 VOP_UNLOCK(searchdir);
1230 searchdir_locked = false; 1230 searchdir_locked = false;
1231 } 1231 }
1232 } else { 1232 } else {
1233 KASSERT(!searchdir_locked); 1233 KASSERT(!searchdir_locked);
1234 } 1234 }
1235 1235
1236 *foundobj_ret = foundobj; 1236 *foundobj_ret = foundobj;
1237 error = 0; 1237 error = 0;
1238done: 1238done:
1239 *newsearchdir_locked_ret = searchdir_locked; 1239 *newsearchdir_locked_ret = searchdir_locked;
1240 return error; 1240 return error;
1241} 1241}
1242 1242
1243/* 1243/*
1244 * Parse out the first path name component that we need to to consider.  1244 * Parse out the first path name component that we need to to consider.
1245 * 1245 *
1246 * While doing this, attempt to use the name cache to fast-forward through 1246 * While doing this, attempt to use the name cache to fast-forward through
1247 * as many "easy" to find components of the path as possible. 1247 * as many "easy" to find components of the path as possible.
1248 * 1248 *
1249 * We use the namecache's node locks to form a chain, and avoid as many 1249 * We use the namecache's node locks to form a chain, and avoid as many
1250 * vnode references and locks as possible. In the ideal case, only the 1250 * vnode references and locks as possible. In the ideal case, only the
1251 * final vnode will have its reference count adjusted and lock taken. 1251 * final vnode will have its reference count adjusted and lock taken.
1252 */ 1252 */
1253static int 1253static int
1254lookup_fastforward(struct namei_state *state, struct vnode **searchdir_ret, 1254lookup_fastforward(struct namei_state *state, struct vnode **searchdir_ret,
1255 struct vnode **foundobj_ret) 1255 struct vnode **foundobj_ret)
1256{ 1256{
1257 struct componentname *cnp = state->cnp; 1257 struct componentname *cnp = state->cnp;
1258 struct nameidata *ndp = state->ndp; 1258 struct nameidata *ndp = state->ndp;
1259 krwlock_t *plock; 1259 krwlock_t *plock;
1260 struct vnode *foundobj, *searchdir; 1260 struct vnode *foundobj, *searchdir;
1261 int error, error2; 1261 int error, error2;
1262 size_t oldpathlen; 1262 size_t oldpathlen;
1263 const char *oldnameptr; 1263 const char *oldnameptr;
1264 1264
1265 /* 1265 /*
1266 * Eat as many path name components as possible before giving up and 1266 * Eat as many path name components as possible before giving up and
1267 * letting lookup_once() handle it. Remember the starting point in 1267 * letting lookup_once() handle it. Remember the starting point in
1268 * case we can't get vnode references and need to roll back. 1268 * case we can't get vnode references and need to roll back.
1269 */ 1269 */
1270 plock = NULL; 1270 plock = NULL;
1271 searchdir = *searchdir_ret; 1271 searchdir = *searchdir_ret;
1272 oldnameptr = cnp->cn_nameptr; 1272 oldnameptr = cnp->cn_nameptr;
1273 oldpathlen = ndp->ni_pathlen; 1273 oldpathlen = ndp->ni_pathlen;
1274 for (;;) { 1274 for (;;) {
1275 foundobj = NULL; 1275 foundobj = NULL;
1276 1276
1277 /* 1277 /*
1278 * Get the next component name. There should be no slashes 1278 * Get the next component name. There should be no slashes
1279 * here, and we shouldn't have looped around if we were 1279 * here, and we shouldn't have looped around if we were
1280 * done. 1280 * done.
1281 */ 1281 */
1282 KASSERT(cnp->cn_nameptr[0] != '/'); 1282 KASSERT(cnp->cn_nameptr[0] != '/');
1283 KASSERT(cnp->cn_nameptr[0] != '\0'); 1283 KASSERT(cnp->cn_nameptr[0] != '\0');
1284 if ((error = lookup_parsepath(state)) != 0) { 1284 if ((error = lookup_parsepath(state)) != 0) {
1285 break; 1285 break;
1286 } 1286 }
1287 1287
1288 /* 1288 /*
1289 * Can't deal with dotdot lookups, because it means lock 1289 * Can't deal with DOTDOT lookups if NOCROSSMOUNT or the
1290 * order reversal, and there are checks in lookup_once() 1290 * lookup is chrooted.
1291 * that need to be made. Also check for missing mountpoints. 
1292 */ 1291 */
1293 if ((cnp->cn_flags & ISDOTDOT) != 0 || 1292 if ((cnp->cn_flags & ISDOTDOT) != 0) {
1294 searchdir->v_mount == NULL) { 1293 if ((searchdir->v_vflag & VV_ROOT) != 0 &&
1295 error = EOPNOTSUPP; 1294 (cnp->cn_flags & NOCROSSMOUNT)) {
1296 break; 1295 error = EOPNOTSUPP;
 1296 break;
 1297 }
 1298 if (ndp->ni_rootdir != rootvnode) {
 1299 error = EOPNOTSUPP;
 1300 break;
 1301 }
1297 } 1302 }
1298 1303
1299 /* 1304 /*
1300 * Can't deal with last component when modifying; this needs 1305 * Can't deal with last component when modifying; this needs
1301 * searchdir locked and VOP_LOOKUP() called (which can and 1306 * searchdir locked and VOP_LOOKUP() called (which can and
1302 * does modify state, despite the name). 1307 * does modify state, despite the name).
1303 */ 1308 */
1304 if ((cnp->cn_flags & ISLASTCN) != 0) { 1309 if ((cnp->cn_flags & ISLASTCN) != 0) {
1305 if (cnp->cn_nameiop != LOOKUP || 1310 if (cnp->cn_nameiop != LOOKUP ||
1306 (cnp->cn_flags & LOCKPARENT) != 0) { 1311 (cnp->cn_flags & LOCKPARENT) != 0) {
1307 error = EOPNOTSUPP; 1312 error = EOPNOTSUPP;
1308 break; 1313 break;
1309 } 1314 }
1310 } 1315 }
1311 1316
1312 /* Can't deal with -o union lookups. */ 
1313 if ((searchdir->v_vflag & VV_ROOT) != 0 && 
1314 (searchdir->v_mount->mnt_flag & MNT_UNION) != 0) { 
1315 error = EOPNOTSUPP; 
1316 break; 
1317 } 
1318 
1319 /* 1317 /*
1320 * Good, now look for it in cache. cache_lookup_linked() 1318 * Good, now look for it in cache. cache_lookup_linked()
1321 * will fail if there's nothing there, or if there's no 1319 * will fail if there's nothing there, or if there's no
1322 * ownership info for the directory, or if the user doesn't 1320 * ownership info for the directory, or if the user doesn't
1323 * have permission to look up files in this directory. 1321 * have permission to look up files in this directory.
1324 */ 1322 */
1325 if (!cache_lookup_linked(searchdir, cnp->cn_nameptr, 1323 if (!cache_lookup_linked(searchdir, cnp->cn_nameptr,
1326 cnp->cn_namelen, &foundobj, &plock, cnp->cn_cred)) { 1324 cnp->cn_namelen, &foundobj, &plock, cnp->cn_cred)) {
1327 error = EOPNOTSUPP; 1325 error = EOPNOTSUPP;
1328 break; 1326 break;
1329 } 1327 }
1330 KASSERT(plock != NULL && rw_lock_held(plock)); 1328 KASSERT(plock != NULL && rw_lock_held(plock));
1331 1329
1332 /* Scored a hit. Negative is good too (ENOENT). */ 1330 /*
 1331 * Scored a hit. Negative is good too (ENOENT). If there's
 1332 * a '-o union' mount here, punt and let lookup_once() deal
 1333 * with it.
 1334 */
1333 if (foundobj == NULL) { 1335 if (foundobj == NULL) {
1334 error = ENOENT; 1336 if ((searchdir->v_vflag & VV_ROOT) != 0 &&
 1337 (searchdir->v_mount->mnt_flag & MNT_UNION) != 0) {
 1338 error = EOPNOTSUPP;
 1339 } else {
 1340 error = ENOENT;
 1341 }
1335 break; 1342 break;
1336 } 1343 }
1337 1344
1338 /* 1345 /*
1339 * Stop and get a hold on the vnode if there's something 1346 * Stop and get a hold on the vnode if there's something
1340 * that can't be handled here: 1347 * that can't be handled here:
1341 * 1348 *
1342 * - we've reached the last component. 1349 * - we've reached the last component.
1343 * - or encountered a mount point that needs to be crossed. 1350 * - or encountered a mount point that needs to be crossed.
1344 * - or encountered something other than a directory. 1351 * - or encountered something other than a directory.
1345 */ 1352 */
1346 if ((cnp->cn_flags & ISLASTCN) != 0 || 1353 if ((cnp->cn_flags & ISLASTCN) != 0 ||
1347 foundobj->v_type != VDIR || 1354 foundobj->v_type != VDIR ||
1348 (foundobj->v_type == VDIR && 1355 (foundobj->v_type == VDIR &&
1349 foundobj->v_mountedhere != NULL)) { 1356 foundobj->v_mountedhere != NULL)) {
1350 mutex_enter(foundobj->v_interlock); 1357 mutex_enter(foundobj->v_interlock);
1351 error = vcache_tryvget(foundobj); 1358 error = vcache_tryvget(foundobj);
1352 /* v_interlock now unheld */ 1359 /* v_interlock now unheld */
1353 if (error != 0) { 1360 if (error != 0) {
1354 foundobj = NULL; 1361 foundobj = NULL;
1355 error = EOPNOTSUPP; 1362 error = EOPNOTSUPP;
1356 } 1363 }
1357 break; 1364 break;
1358 } 1365 }
1359 1366
1360 /* 1367 /*
1361 * Otherwise, we're still in business. Set the found VDIR 1368 * Otherwise, we're still in business. Set the found VDIR
1362 * vnode as the search dir for the next component and 1369 * vnode as the search dir for the next component and
1363 * continue on to it. 1370 * continue on to it.
1364 */ 1371 */
1365 cnp->cn_nameptr = ndp->ni_next; 1372 cnp->cn_nameptr = ndp->ni_next;
1366 searchdir = foundobj; 1373 searchdir = foundobj;
1367 } 1374 }
1368 1375
1369 /* 1376 /*
1370 * If we ended up with a new search dir, ref it before dropping the 1377 * If we ended up with a new search dir, ref it before dropping the
1371 * namecache's lock. The lock prevents both searchdir and foundobj 1378 * namecache's lock. The lock prevents both searchdir and foundobj
1372 * from disappearing. If we can't ref the new searchdir, we have a 1379 * from disappearing. If we can't ref the new searchdir, we have a
1373 * bit of a problem. Roll back the fastforward to the beginning and 1380 * bit of a problem. Roll back the fastforward to the beginning and
1374 * let lookup_once() take care of it. 1381 * let lookup_once() take care of it.
1375 */ 1382 */
1376 if (searchdir != *searchdir_ret) { 1383 if (searchdir != *searchdir_ret) {
1377 mutex_enter(searchdir->v_interlock); 1384 mutex_enter(searchdir->v_interlock);
1378 error2 = vcache_tryvget(searchdir); 1385 error2 = vcache_tryvget(searchdir);
1379 /* v_interlock now unheld */ 1386 /* v_interlock now unheld */
1380 KASSERT(plock != NULL); 1387 KASSERT(plock != NULL);
1381 rw_exit(plock); 1388 rw_exit(plock);
1382 if (__predict_true(error2 == 0)) { 1389 if (__predict_true(error2 == 0)) {
1383 /* Returning new searchdir, and maybe new foundobj. */ 1390 /* Returning new searchdir, and maybe new foundobj. */
1384 vrele(*searchdir_ret); 1391 vrele(*searchdir_ret);
1385 *searchdir_ret = searchdir; 1392 *searchdir_ret = searchdir;
1386 } else { 1393 } else {
1387 /* Returning nothing. */ 1394 /* Returning nothing. */
1388 if (foundobj != NULL) { 1395 if (foundobj != NULL) {
1389 vrele(foundobj); 1396 vrele(foundobj);
1390 foundobj = NULL; 1397 foundobj = NULL;
1391 } 1398 }
1392 cnp->cn_nameptr = oldnameptr; 1399 cnp->cn_nameptr = oldnameptr;
1393 ndp->ni_pathlen = oldpathlen; 1400 ndp->ni_pathlen = oldpathlen;
1394 error = lookup_parsepath(state); 1401 error = lookup_parsepath(state);
1395 if (error == 0) { 1402 if (error == 0) {
1396 error = EOPNOTSUPP; 1403 error = EOPNOTSUPP;
1397 } 1404 }
1398 } 1405 }
1399 } else if (plock != NULL) { 1406 } else if (plock != NULL) {
1400 /* Drop any namecache lock still held. */ 1407 /* Drop any namecache lock still held. */
1401 rw_exit(plock); 1408 rw_exit(plock);
1402 } 1409 }
1403 1410
1404 KASSERT(error == 0 ? foundobj != NULL : foundobj == NULL); 1411 KASSERT(error == 0 ? foundobj != NULL : foundobj == NULL);
1405 *foundobj_ret = foundobj; 1412 *foundobj_ret = foundobj;
1406 return error; 1413 return error;
1407} 1414}
1408 1415
1409////////////////////////////// 1416//////////////////////////////
1410 1417
1411/* 1418/*
1412 * Do a complete path search from a single root directory. 1419 * Do a complete path search from a single root directory.
1413 * (This is called up to twice if TRYEMULROOT is in effect.) 1420 * (This is called up to twice if TRYEMULROOT is in effect.)
1414 */ 1421 */
1415static int 1422static int
1416namei_oneroot(struct namei_state *state, 1423namei_oneroot(struct namei_state *state,
1417 int neverfollow, int inhibitmagic, int isnfsd) 1424 int neverfollow, int inhibitmagic, int isnfsd)
1418{ 1425{
1419 struct nameidata *ndp = state->ndp; 1426 struct nameidata *ndp = state->ndp;
1420 struct componentname *cnp = state->cnp; 1427 struct componentname *cnp = state->cnp;
1421 struct vnode *searchdir, *foundobj; 1428 struct vnode *searchdir, *foundobj;
1422 bool searchdir_locked = false; 1429 bool searchdir_locked = false;
1423 int error; 1430 int error;
1424 1431
1425 error = namei_start(state, isnfsd, &searchdir); 1432 error = namei_start(state, isnfsd, &searchdir);
1426 if (error) { 1433 if (error) {
1427 ndp->ni_dvp = NULL; 1434 ndp->ni_dvp = NULL;
1428 ndp->ni_vp = NULL; 1435 ndp->ni_vp = NULL;
1429 return error; 1436 return error;
1430 } 1437 }
1431 KASSERT(searchdir->v_type == VDIR); 1438 KASSERT(searchdir->v_type == VDIR);
1432 1439
1433 /* 1440 /*
1434 * Setup: break out flag bits into variables. 1441 * Setup: break out flag bits into variables.
1435 */ 1442 */
1436 state->docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; 1443 state->docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
1437 if (cnp->cn_nameiop == DELETE) 1444 if (cnp->cn_nameiop == DELETE)
1438 state->docache = 0; 1445 state->docache = 0;
1439 state->rdonly = cnp->cn_flags & RDONLY; 1446 state->rdonly = cnp->cn_flags & RDONLY;
1440 1447
1441 /* 1448 /*
1442 * Keep going until we run out of path components. 1449 * Keep going until we run out of path components.
1443 */ 1450 */
1444 cnp->cn_nameptr = ndp->ni_pnbuf; 1451 cnp->cn_nameptr = ndp->ni_pnbuf;
1445 1452
1446 /* drop leading slashes (already used them to choose startdir) */ 1453 /* drop leading slashes (already used them to choose startdir) */
1447 while (cnp->cn_nameptr[0] == '/') { 1454 while (cnp->cn_nameptr[0] == '/') {
1448 cnp->cn_nameptr++; 1455 cnp->cn_nameptr++;
1449 ndp->ni_pathlen--; 1456 ndp->ni_pathlen--;
1450 } 1457 }
1451 /* was it just "/"? */ 1458 /* was it just "/"? */
1452 if (cnp->cn_nameptr[0] == '\0') { 1459 if (cnp->cn_nameptr[0] == '\0') {
1453 foundobj = searchdir; 1460 foundobj = searchdir;
1454 searchdir = NULL; 1461 searchdir = NULL;
1455 cnp->cn_flags |= ISLASTCN; 1462 cnp->cn_flags |= ISLASTCN;
1456 1463
1457 /* bleh */ 1464 /* bleh */
1458 goto skiploop; 1465 goto skiploop;
1459 } 1466 }
1460 1467
1461 for (;;) { 1468 for (;;) {
1462 KASSERT(searchdir != NULL); 1469 KASSERT(searchdir != NULL);
1463 KASSERT(!searchdir_locked); 1470 KASSERT(!searchdir_locked);
1464 1471
1465 /* 1472 /*
1466 * Parse out the first path name component that we need to 1473 * Parse out the first path name component that we need to
1467 * to consider. While doing this, attempt to use the name 1474 * to consider. While doing this, attempt to use the name
1468 * cache to fast-forward through as many "easy" to find 1475 * cache to fast-forward through as many "easy" to find
1469 * components of the path as possible. 1476 * components of the path as possible.
1470 */ 1477 */
1471 error = lookup_fastforward(state, &searchdir, &foundobj); 1478 error = lookup_fastforward(state, &searchdir, &foundobj);
1472 1479
1473 /* 1480 /*
1474 * If we didn't get a good answer from the namecache, then 1481 * If we didn't get a good answer from the namecache, then
1475 * go directly to the file system. 1482 * go directly to the file system.
1476 */ 1483 */
1477 if (error == EOPNOTSUPP) { 1484 if (error == EOPNOTSUPP) {
1478 error = lookup_once(state, searchdir, &searchdir, 1485 error = lookup_once(state, searchdir, &searchdir,
1479 &foundobj, &searchdir_locked); 1486 &foundobj, &searchdir_locked);
1480 } 1487 }
1481 1488
1482 /* 1489 /*
1483 * If the vnode we found is mounted on, then cross the mount 1490 * If the vnode we found is mounted on, then cross the mount
1484 * and get the root vnode in foundobj. If this encounters 1491 * and get the root vnode in foundobj. If this encounters
1485 * an error, it will dispose of foundobj, but searchdir is 1492 * an error, it will dispose of foundobj, but searchdir is
1486 * untouched. 1493 * untouched.
1487 */ 1494 */
1488 if (error == 0 && foundobj != NULL && 1495 if (error == 0 && foundobj != NULL &&
1489 foundobj->v_type == VDIR && 1496 foundobj->v_type == VDIR &&
1490 foundobj->v_mountedhere != NULL && 1497 foundobj->v_mountedhere != NULL &&
1491 (cnp->cn_flags & NOCROSSMOUNT) == 0) { 1498 (cnp->cn_flags & NOCROSSMOUNT) == 0) {
1492 error = lookup_crossmount(state, &searchdir, 1499 error = lookup_crossmount(state, &searchdir,
1493 &foundobj, &searchdir_locked); 1500 &foundobj, &searchdir_locked);
1494 } 1501 }
1495 1502
1496 if (error) { 1503 if (error) {
1497 if (searchdir != NULL) { 1504 if (searchdir != NULL) {
1498 if (searchdir_locked) { 1505 if (searchdir_locked) {
1499 searchdir_locked = false; 1506 searchdir_locked = false;
1500 vput(searchdir); 1507 vput(searchdir);
1501 } else { 1508 } else {
1502 vrele(searchdir); 1509 vrele(searchdir);
1503 } 1510 }
1504 } 1511 }
1505 ndp->ni_dvp = NULL; 1512 ndp->ni_dvp = NULL;
1506 ndp->ni_vp = NULL; 1513 ndp->ni_vp = NULL;
1507 /* 1514 /*
1508 * Note that if we're doing TRYEMULROOT we can 1515 * Note that if we're doing TRYEMULROOT we can
1509 * retry with the normal root. Where this is 1516 * retry with the normal root. Where this is
1510 * currently set matches previous practice, 1517 * currently set matches previous practice,
1511 * but the previous practice didn't make much 1518 * but the previous practice didn't make much
1512 * sense and somebody should sit down and 1519 * sense and somebody should sit down and
1513 * figure out which cases should cause retry 1520 * figure out which cases should cause retry
1514 * and which shouldn't. XXX. 1521 * and which shouldn't. XXX.
1515 */ 1522 */
1516 state->attempt_retry = 1; 1523 state->attempt_retry = 1;
1517 return (error); 1524 return (error);
1518 } 1525 }
1519 1526
1520 if (foundobj == NULL) { 1527 if (foundobj == NULL) {
1521 /* 1528 /*
1522 * Success with no object returned means we're 1529 * Success with no object returned means we're
1523 * creating something and it isn't already 1530 * creating something and it isn't already
1524 * there. Break out of the main loop now so 1531 * there. Break out of the main loop now so
1525 * the code below doesn't have to test for 1532 * the code below doesn't have to test for
1526 * foundobj == NULL. 1533 * foundobj == NULL.
1527 */ 1534 */
1528 /* lookup_once can't have dropped the searchdir */ 1535 /* lookup_once can't have dropped the searchdir */
1529 KASSERT(searchdir != NULL); 1536 KASSERT(searchdir != NULL);
1530 break; 1537 break;
1531 } 1538 }
1532 1539
1533 /* 1540 /*
1534 * Check for symbolic link. If we've reached one, 1541 * Check for symbolic link. If we've reached one,
1535 * follow it, unless we aren't supposed to. Back up 1542 * follow it, unless we aren't supposed to. Back up
1536 * over any slashes that we skipped, as we will need 1543 * over any slashes that we skipped, as we will need
1537 * them again. 1544 * them again.
1538 */ 1545 */
1539 if (namei_atsymlink(state, foundobj)) { 1546 if (namei_atsymlink(state, foundobj)) {
1540 /* Don't need searchdir locked any more. */ 1547 /* Don't need searchdir locked any more. */
1541 if (searchdir_locked) { 1548 if (searchdir_locked) {
1542 searchdir_locked = false; 1549 searchdir_locked = false;
1543 VOP_UNLOCK(searchdir); 1550 VOP_UNLOCK(searchdir);
1544 } 1551 }
1545 ndp->ni_pathlen += state->slashes; 1552 ndp->ni_pathlen += state->slashes;
1546 ndp->ni_next -= state->slashes; 1553 ndp->ni_next -= state->slashes;
1547 if (neverfollow) { 1554 if (neverfollow) {
1548 error = EINVAL; 1555 error = EINVAL;
1549 } else if (searchdir == NULL) { 1556 } else if (searchdir == NULL) {
1550 /* 1557 /*
1551 * dholland 20160410: lookup_once only 1558 * dholland 20160410: lookup_once only
1552 * drops searchdir if it crossed a 1559 * drops searchdir if it crossed a
1553 * mount point. Therefore, if we get 1560 * mount point. Therefore, if we get
1554 * here it means we crossed a mount 1561 * here it means we crossed a mount
1555 * point to a mounted filesystem whose 1562 * point to a mounted filesystem whose
1556 * root vnode is a symlink. In theory 1563 * root vnode is a symlink. In theory
1557 * we could continue at this point by 1564 * we could continue at this point by
1558 * using the pre-crossing searchdir 1565 * using the pre-crossing searchdir
1559 * (e.g. just take out an extra 1566 * (e.g. just take out an extra
1560 * reference on it before calling 1567 * reference on it before calling
1561 * lookup_once so we still have it), 1568 * lookup_once so we still have it),
1562 * but this will make an ugly mess and 1569 * but this will make an ugly mess and
1563 * it should never happen in practice 1570 * it should never happen in practice
1564 * as only badly broken filesystems 1571 * as only badly broken filesystems
1565 * have non-directory root vnodes. (I 1572 * have non-directory root vnodes. (I
1566 * have seen this sort of thing with 1573 * have seen this sort of thing with
1567 * NFS occasionally but even then it 1574 * NFS occasionally but even then it
1568 * means something's badly wrong.) 1575 * means something's badly wrong.)
1569 */ 1576 */
1570 error = ENOTDIR; 1577 error = ENOTDIR;
1571 } else { 1578 } else {
1572 /* 1579 /*
1573 * dholland 20110410: if we're at a 1580 * dholland 20110410: if we're at a
1574 * union mount it might make sense to 1581 * union mount it might make sense to
1575 * use the top of the union stack here 1582 * use the top of the union stack here
1576 * rather than the layer we found the 1583 * rather than the layer we found the
1577 * symlink in. (FUTURE) 1584 * symlink in. (FUTURE)
1578 */ 1585 */
1579 error = namei_follow(state, inhibitmagic, 1586 error = namei_follow(state, inhibitmagic,
1580 searchdir, foundobj, 1587 searchdir, foundobj,
1581 &searchdir); 1588 &searchdir);
1582 } 1589 }
1583 if (error) { 1590 if (error) {
1584 KASSERT(searchdir != foundobj); 1591 KASSERT(searchdir != foundobj);
1585 if (searchdir != NULL) { 1592 if (searchdir != NULL) {
1586 vrele(searchdir); 1593 vrele(searchdir);
1587 } 1594 }
1588 vrele(foundobj); 1595 vrele(foundobj);
1589 ndp->ni_dvp = NULL; 1596 ndp->ni_dvp = NULL;
1590 ndp->ni_vp = NULL; 1597 ndp->ni_vp = NULL;
1591 return error; 1598 return error;
1592 } 1599 }
1593 vrele(foundobj); 1600 vrele(foundobj);
1594 foundobj = NULL; 1601 foundobj = NULL;
1595 1602
1596 /* 1603 /*
1597 * If we followed a symlink to `/' and there 1604 * If we followed a symlink to `/' and there
1598 * are no more components after the symlink, 1605 * are no more components after the symlink,
1599 * we're done with the loop and what we found 1606 * we're done with the loop and what we found
1600 * is the searchdir. 1607 * is the searchdir.
1601 */ 1608 */
1602 if (cnp->cn_nameptr[0] == '\0') { 1609 if (cnp->cn_nameptr[0] == '\0') {
1603 KASSERT(searchdir != NULL); 1610 KASSERT(searchdir != NULL);
1604 foundobj = searchdir; 1611 foundobj = searchdir;
1605 searchdir = NULL; 1612 searchdir = NULL;
1606 cnp->cn_flags |= ISLASTCN; 1613 cnp->cn_flags |= ISLASTCN;
1607 break; 1614 break;
1608 } 1615 }
1609 1616
1610 continue; 1617 continue;
1611 } 1618 }
1612 1619
1613 /* 1620 /*
1614 * Not a symbolic link. 1621 * Not a symbolic link.
1615 * 1622 *
1616 * Check for directory, if the component was 1623 * Check for directory, if the component was
1617 * followed by a series of slashes. 1624 * followed by a series of slashes.
1618 */ 1625 */
1619 if ((foundobj->v_type != VDIR) && 1626 if ((foundobj->v_type != VDIR) &&
1620 (cnp->cn_flags & REQUIREDIR)) { 1627 (cnp->cn_flags & REQUIREDIR)) {
1621 KASSERT(foundobj != searchdir); 1628 KASSERT(foundobj != searchdir);
1622 if (searchdir) { 1629 if (searchdir) {
1623 if (searchdir_locked) { 1630 if (searchdir_locked) {
1624 searchdir_locked = false; 1631 searchdir_locked = false;
1625 vput(searchdir); 1632 vput(searchdir);
1626 } else { 1633 } else {
1627 vrele(searchdir); 1634 vrele(searchdir);
1628 } 1635 }
1629 } else { 1636 } else {
1630 KASSERT(!searchdir_locked); 1637 KASSERT(!searchdir_locked);
1631 } 1638 }
1632 vrele(foundobj); 1639 vrele(foundobj);
1633 ndp->ni_dvp = NULL; 1640 ndp->ni_dvp = NULL;
1634 ndp->ni_vp = NULL; 1641 ndp->ni_vp = NULL;
1635 state->attempt_retry = 1; 1642 state->attempt_retry = 1;
1636 return ENOTDIR; 1643 return ENOTDIR;
1637 } 1644 }
1638 1645
1639 /* 1646 /*
1640 * Stop if we've reached the last component. 1647 * Stop if we've reached the last component.
1641 */ 1648 */
1642 if (cnp->cn_flags & ISLASTCN) { 1649 if (cnp->cn_flags & ISLASTCN) {
1643 break; 1650 break;
1644 } 1651 }
1645 1652
1646 /* 1653 /*
1647 * Continue with the next component. 1654 * Continue with the next component.
1648 */ 1655 */
1649 cnp->cn_nameptr = ndp->ni_next; 1656 cnp->cn_nameptr = ndp->ni_next;
1650 if (searchdir != NULL) { 1657 if (searchdir != NULL) {
1651 if (searchdir_locked) { 1658 if (searchdir_locked) {
1652 searchdir_locked = false; 1659 searchdir_locked = false;
1653 vput(searchdir); 1660 vput(searchdir);
1654 } else { 1661 } else {
1655 vrele(searchdir); 1662 vrele(searchdir);
1656 } 1663 }
1657 } 1664 }
1658 searchdir = foundobj; 1665 searchdir = foundobj;
1659 foundobj = NULL; 1666 foundobj = NULL;
1660 } 1667 }
1661 1668
1662 KASSERT((cnp->cn_flags & LOCKPARENT) == 0 || searchdir == NULL || 1669 KASSERT((cnp->cn_flags & LOCKPARENT) == 0 || searchdir == NULL ||
1663 VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE); 1670 VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE);
1664 1671
1665 skiploop: 1672 skiploop:
1666 1673
1667 if (foundobj != NULL) { 1674 if (foundobj != NULL) {
1668 if (foundobj == ndp->ni_erootdir) { 1675 if (foundobj == ndp->ni_erootdir) {
1669 /* 1676 /*
1670 * We are about to return the emulation root. 1677 * We are about to return the emulation root.
1671 * This isn't a good idea because code might 1678 * This isn't a good idea because code might
1672 * repeatedly lookup ".." until the file 1679 * repeatedly lookup ".." until the file
1673 * matches that returned for "/" and loop 1680 * matches that returned for "/" and loop
1674 * forever. So convert it to the real root. 1681 * forever. So convert it to the real root.
1675 */ 1682 */
1676 if (searchdir != NULL) { 1683 if (searchdir != NULL) {
1677 if (searchdir_locked) { 1684 if (searchdir_locked) {
1678 vput(searchdir); 1685 vput(searchdir);
1679 searchdir_locked = false; 1686 searchdir_locked = false;
1680 } else { 1687 } else {
1681 vrele(searchdir); 1688 vrele(searchdir);
1682 } 1689 }
1683 searchdir = NULL; 1690 searchdir = NULL;
1684 } 1691 }
1685 vrele(foundobj); 1692 vrele(foundobj);
1686 foundobj = ndp->ni_rootdir; 1693 foundobj = ndp->ni_rootdir;
1687 vref(foundobj); 1694 vref(foundobj);
1688 } 1695 }
1689 1696
1690 /* 1697 /*
1691 * If the caller requested the parent node (i.e. it's 1698 * If the caller requested the parent node (i.e. it's
1692 * a CREATE, DELETE, or RENAME), and we don't have one 1699 * a CREATE, DELETE, or RENAME), and we don't have one
1693 * (because this is the root directory, or we crossed 1700 * (because this is the root directory, or we crossed
1694 * a mount point), then we must fail. 1701 * a mount point), then we must fail.
1695 */ 1702 */
1696 if (cnp->cn_nameiop != LOOKUP && 1703 if (cnp->cn_nameiop != LOOKUP &&
1697 (searchdir == NULL || 1704 (searchdir == NULL ||
1698 searchdir->v_mount != foundobj->v_mount)) { 1705 searchdir->v_mount != foundobj->v_mount)) {
1699 if (searchdir) { 1706 if (searchdir) {
1700 if (searchdir_locked) { 1707 if (searchdir_locked) {
1701 vput(searchdir); 1708 vput(searchdir);
1702 searchdir_locked = false; 1709 searchdir_locked = false;
1703 } else { 1710 } else {
1704 vrele(searchdir); 1711 vrele(searchdir);
1705 } 1712 }
1706 searchdir = NULL; 1713 searchdir = NULL;
1707 } 1714 }
1708 vrele(foundobj); 1715 vrele(foundobj);
1709 foundobj = NULL; 1716 foundobj = NULL;
1710 ndp->ni_dvp = NULL; 1717 ndp->ni_dvp = NULL;
1711 ndp->ni_vp = NULL; 1718 ndp->ni_vp = NULL;
1712 state->attempt_retry = 1; 1719 state->attempt_retry = 1;
1713 1720
1714 switch (cnp->cn_nameiop) { 1721 switch (cnp->cn_nameiop) {
1715 case CREATE: 1722 case CREATE:
1716 return EEXIST; 1723 return EEXIST;
1717 case DELETE: 1724 case DELETE:
1718 case RENAME: 1725 case RENAME:
1719 return EBUSY; 1726 return EBUSY;
1720 default: 1727 default:
1721 break; 1728 break;
1722 } 1729 }
1723 panic("Invalid nameiop\n"); 1730 panic("Invalid nameiop\n");
1724 } 1731 }
1725 1732
1726 /* 1733 /*
1727 * Disallow directory write attempts on read-only lookups. 1734 * Disallow directory write attempts on read-only lookups.
1728 * Prefers EEXIST over EROFS for the CREATE case. 1735 * Prefers EEXIST over EROFS for the CREATE case.
1729 */ 1736 */
1730 if (state->rdonly && 1737 if (state->rdonly &&
1731 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { 1738 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
1732 if (searchdir) { 1739 if (searchdir) {
1733 if (searchdir_locked) { 1740 if (searchdir_locked) {
1734 vput(searchdir); 1741 vput(searchdir);
1735 searchdir_locked = false; 1742 searchdir_locked = false;
1736 } else { 1743 } else {
1737 vrele(searchdir); 1744 vrele(searchdir);
1738 } 1745 }
1739 searchdir = NULL; 1746 searchdir = NULL;
1740 } 1747 }
1741 vrele(foundobj); 1748 vrele(foundobj);
1742 foundobj = NULL; 1749 foundobj = NULL;
1743 ndp->ni_dvp = NULL; 1750 ndp->ni_dvp = NULL;
1744 ndp->ni_vp = NULL; 1751 ndp->ni_vp = NULL;
1745 state->attempt_retry = 1; 1752 state->attempt_retry = 1;
1746 return EROFS; 1753 return EROFS;
1747 } 1754 }
1748 1755
1749 /* Lock the leaf node if requested. */ 1756 /* Lock the leaf node if requested. */
1750 if ((cnp->cn_flags & (LOCKLEAF | LOCKPARENT)) == LOCKPARENT && 1757 if ((cnp->cn_flags & (LOCKLEAF | LOCKPARENT)) == LOCKPARENT &&
1751 searchdir == foundobj) { 1758 searchdir == foundobj) {
1752 /* 1759 /*
1753 * Note: if LOCKPARENT but not LOCKLEAF is 1760 * Note: if LOCKPARENT but not LOCKLEAF is
1754 * set, and searchdir == foundobj, this code 1761 * set, and searchdir == foundobj, this code
1755 * necessarily unlocks the parent as well as 1762 * necessarily unlocks the parent as well as
1756 * the leaf. That is, just because you specify 1763 * the leaf. That is, just because you specify
1757 * LOCKPARENT doesn't mean you necessarily get 1764 * LOCKPARENT doesn't mean you necessarily get
1758 * a locked parent vnode. The code in 1765 * a locked parent vnode. The code in
1759 * vfs_syscalls.c, and possibly elsewhere, 1766 * vfs_syscalls.c, and possibly elsewhere,
1760 * that uses this combination "knows" this, so 1767 * that uses this combination "knows" this, so
1761 * it can't be safely changed. Feh. XXX 1768 * it can't be safely changed. Feh. XXX
1762 */ 1769 */
1763 KASSERT(searchdir_locked); 1770 KASSERT(searchdir_locked);
1764 VOP_UNLOCK(searchdir); 1771 VOP_UNLOCK(searchdir);
1765 searchdir_locked = false; 1772 searchdir_locked = false;
1766 } else if ((cnp->cn_flags & LOCKLEAF) != 0 && 1773 } else if ((cnp->cn_flags & LOCKLEAF) != 0 &&
1767 (searchdir != foundobj || 1774 (searchdir != foundobj ||
1768 (cnp->cn_flags & LOCKPARENT) == 0)) { 1775 (cnp->cn_flags & LOCKPARENT) == 0)) {
1769 const int lktype = (cnp->cn_flags & LOCKSHARED) != 0 ? 1776 const int lktype = (cnp->cn_flags & LOCKSHARED) != 0 ?
1770 LK_SHARED : LK_EXCLUSIVE; 1777 LK_SHARED : LK_EXCLUSIVE;
1771 vn_lock(foundobj, lktype | LK_RETRY); 1778 vn_lock(foundobj, lktype | LK_RETRY);
1772 } 1779 }
1773 } 1780 }
1774 1781
1775 /* 1782 /*
1776 * Done. 1783 * Done.
1777 */ 1784 */
1778 1785
1779 /* 1786 /*
1780 * If LOCKPARENT is not set, the parent directory isn't returned. 1787 * If LOCKPARENT is not set, the parent directory isn't returned.
1781 */ 1788 */
1782 if ((cnp->cn_flags & LOCKPARENT) == 0 && searchdir != NULL) { 1789 if ((cnp->cn_flags & LOCKPARENT) == 0 && searchdir != NULL) {
1783 vrele(searchdir); 1790 vrele(searchdir);
1784 searchdir = NULL; 1791 searchdir = NULL;
1785 } 1792 }
1786 1793
1787 ndp->ni_dvp = searchdir; 1794 ndp->ni_dvp = searchdir;
1788 ndp->ni_vp = foundobj; 1795 ndp->ni_vp = foundobj;
1789 return 0; 1796 return 0;
1790} 1797}
1791 1798
1792/* 1799/*
1793 * Do namei; wrapper layer that handles TRYEMULROOT. 1800 * Do namei; wrapper layer that handles TRYEMULROOT.
1794 */ 1801 */
1795static int 1802static int
1796namei_tryemulroot(struct namei_state *state, 1803namei_tryemulroot(struct namei_state *state,
1797 int neverfollow, int inhibitmagic, int isnfsd) 1804 int neverfollow, int inhibitmagic, int isnfsd)
1798{ 1805{
1799 int error; 1806 int error;
1800 1807
1801 struct nameidata *ndp = state->ndp; 1808 struct nameidata *ndp = state->ndp;
1802 struct componentname *cnp = state->cnp; 1809 struct componentname *cnp = state->cnp;
1803 const char *savepath = NULL; 1810 const char *savepath = NULL;
1804 1811
1805 KASSERT(cnp == &ndp->ni_cnd); 1812 KASSERT(cnp == &ndp->ni_cnd);
1806 1813
1807 if (cnp->cn_flags & TRYEMULROOT) { 1814 if (cnp->cn_flags & TRYEMULROOT) {
1808 savepath = pathbuf_stringcopy_get(ndp->ni_pathbuf); 1815 savepath = pathbuf_stringcopy_get(ndp->ni_pathbuf);
1809 } 1816 }
1810 1817
1811 emul_retry: 1818 emul_retry:
1812 state->attempt_retry = 0; 1819 state->attempt_retry = 0;
1813 1820
1814 error = namei_oneroot(state, neverfollow, inhibitmagic, isnfsd); 1821 error = namei_oneroot(state, neverfollow, inhibitmagic, isnfsd);
1815 if (error) { 1822 if (error) {
1816 /* 1823 /*
1817 * Once namei has started up, the existence of ni_erootdir 1824 * Once namei has started up, the existence of ni_erootdir
1818 * tells us whether we're working from an emulation root. 1825 * tells us whether we're working from an emulation root.
1819 * The TRYEMULROOT flag isn't necessarily authoritative. 1826 * The TRYEMULROOT flag isn't necessarily authoritative.
1820 */ 1827 */
1821 if (ndp->ni_erootdir != NULL && state->attempt_retry) { 1828 if (ndp->ni_erootdir != NULL && state->attempt_retry) {
1822 /* Retry the whole thing using the normal root */ 1829 /* Retry the whole thing using the normal root */
1823 cnp->cn_flags &= ~TRYEMULROOT; 1830 cnp->cn_flags &= ~TRYEMULROOT;
1824 state->attempt_retry = 0; 1831 state->attempt_retry = 0;
1825 1832
1826 /* kinda gross */ 1833 /* kinda gross */
1827 strcpy(ndp->ni_pathbuf->pb_path, savepath); 1834 strcpy(ndp->ni_pathbuf->pb_path, savepath);
1828 pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath); 1835 pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath);
1829 savepath = NULL; 1836 savepath = NULL;
1830 1837
1831 goto emul_retry; 1838 goto emul_retry;
1832 } 1839 }
1833 } 1840 }
1834 if (savepath != NULL) { 1841 if (savepath != NULL) {
1835 pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath); 1842 pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath);
1836 } 1843 }
1837 return error; 1844 return error;
1838} 1845}
1839 1846
1840/* 1847/*
1841 * External interface. 1848 * External interface.
1842 */ 1849 */
1843int 1850int
1844namei(struct nameidata *ndp) 1851namei(struct nameidata *ndp)
1845{ 1852{
1846 struct namei_state state; 1853 struct namei_state state;
1847 int error; 1854 int error;
1848 1855
1849 namei_init(&state, ndp); 1856 namei_init(&state, ndp);
1850 error = namei_tryemulroot(&state, 1857 error = namei_tryemulroot(&state,
1851 0/*!neverfollow*/, 0/*!inhibitmagic*/, 1858 0/*!neverfollow*/, 0/*!inhibitmagic*/,
1852 0/*isnfsd*/); 1859 0/*isnfsd*/);
1853 namei_cleanup(&state); 1860 namei_cleanup(&state);
1854 1861
1855 if (error) { 1862 if (error) {
1856 /* make sure no stray refs leak out */ 1863 /* make sure no stray refs leak out */
1857 KASSERT(ndp->ni_dvp == NULL); 1864 KASSERT(ndp->ni_dvp == NULL);
1858 KASSERT(ndp->ni_vp == NULL); 1865 KASSERT(ndp->ni_vp == NULL);
1859 } 1866 }
1860 1867
1861 return error; 1868 return error;
1862} 1869}
1863 1870
1864//////////////////////////////////////////////////////////// 1871////////////////////////////////////////////////////////////
1865 1872
1866/* 1873/*
1867 * External interface used by nfsd. This is basically different from 1874 * External interface used by nfsd. This is basically different from
1868 * namei only in that it has the ability to pass in the "current 1875 * namei only in that it has the ability to pass in the "current
1869 * directory", and uses an extra flag "neverfollow" for which there's 1876 * directory", and uses an extra flag "neverfollow" for which there's
1870 * no physical flag defined in namei.h. (There used to be a cut&paste 1877 * no physical flag defined in namei.h. (There used to be a cut&paste
1871 * copy of about half of namei in nfsd to allow these minor 1878 * copy of about half of namei in nfsd to allow these minor
1872 * adjustments to exist.) 1879 * adjustments to exist.)
1873 * 1880 *
1874 * XXX: the namei interface should be adjusted so nfsd can just use 1881 * XXX: the namei interface should be adjusted so nfsd can just use
1875 * ordinary namei(). 1882 * ordinary namei().
1876 */ 1883 */
1877int 1884int
1878lookup_for_nfsd(struct nameidata *ndp, struct vnode *forcecwd, int neverfollow) 1885lookup_for_nfsd(struct nameidata *ndp, struct vnode *forcecwd, int neverfollow)
1879{ 1886{
1880 struct namei_state state; 1887 struct namei_state state;
1881 int error; 1888 int error;
1882 1889
1883 KASSERT(ndp->ni_atdir == NULL); 1890 KASSERT(ndp->ni_atdir == NULL);
1884 ndp->ni_atdir = forcecwd; 1891 ndp->ni_atdir = forcecwd;
1885 1892
1886 namei_init(&state, ndp); 1893 namei_init(&state, ndp);
1887 error = namei_tryemulroot(&state, 1894 error = namei_tryemulroot(&state,
1888 neverfollow, 1/*inhibitmagic*/, 1/*isnfsd*/); 1895 neverfollow, 1/*inhibitmagic*/, 1/*isnfsd*/);
1889 namei_cleanup(&state); 1896 namei_cleanup(&state);
1890 1897
1891 if (error) { 1898 if (error) {
1892 /* make sure no stray refs leak out */ 1899 /* make sure no stray refs leak out */
1893 KASSERT(ndp->ni_dvp == NULL); 1900 KASSERT(ndp->ni_dvp == NULL);
1894 KASSERT(ndp->ni_vp == NULL); 1901 KASSERT(ndp->ni_vp == NULL);
1895 } 1902 }
1896 1903
1897 return error; 1904 return error;
1898} 1905}
1899 1906
1900/* 1907/*
1901 * A second external interface used by nfsd. This turns out to be a 1908 * A second external interface used by nfsd. This turns out to be a
1902 * single lookup used by the WebNFS code (ha!) to get "index.html" or 1909 * single lookup used by the WebNFS code (ha!) to get "index.html" or
1903 * equivalent when asked for a directory. It should eventually evolve 1910 * equivalent when asked for a directory. It should eventually evolve
1904 * into some kind of namei_once() call; for the time being it's kind 1911 * into some kind of namei_once() call; for the time being it's kind
1905 * of a mess. XXX. 1912 * of a mess. XXX.
1906 * 1913 *
1907 * dholland 20110109: I don't think it works, and I don't think it 1914 * dholland 20110109: I don't think it works, and I don't think it
1908 * worked before I started hacking and slashing either, and I doubt 1915 * worked before I started hacking and slashing either, and I doubt
1909 * anyone will ever notice. 1916 * anyone will ever notice.
1910 */ 1917 */
1911 1918
1912/* 1919/*
1913 * Internals. This calls lookup_once() after setting up the assorted 1920 * Internals. This calls lookup_once() after setting up the assorted
1914 * pieces of state the way they ought to be. 1921 * pieces of state the way they ought to be.
1915 */ 1922 */
1916static int 1923static int
1917do_lookup_for_nfsd_index(struct namei_state *state) 1924do_lookup_for_nfsd_index(struct namei_state *state)
1918{ 1925{
1919 int error = 0; 1926 int error = 0;
1920 1927
1921 struct componentname *cnp = state->cnp; 1928 struct componentname *cnp = state->cnp;
1922 struct nameidata *ndp = state->ndp; 1929 struct nameidata *ndp = state->ndp;
1923 struct vnode *startdir; 1930 struct vnode *startdir;
1924 struct vnode *foundobj; 1931 struct vnode *foundobj;
1925 bool startdir_locked; 1932 bool startdir_locked;
1926 const char *cp; /* pointer into pathname argument */ 1933 const char *cp; /* pointer into pathname argument */
1927 1934
1928 KASSERT(cnp == &ndp->ni_cnd); 1935 KASSERT(cnp == &ndp->ni_cnd);
1929 1936
1930 startdir = state->ndp->ni_atdir; 1937 startdir = state->ndp->ni_atdir;
1931 1938
1932 cnp->cn_nameptr = ndp->ni_pnbuf; 1939 cnp->cn_nameptr = ndp->ni_pnbuf;
1933 state->docache = 1; 1940 state->docache = 1;
1934 state->rdonly = cnp->cn_flags & RDONLY; 1941 state->rdonly = cnp->cn_flags & RDONLY;
1935 ndp->ni_dvp = NULL; 1942 ndp->ni_dvp = NULL;
1936 1943
1937 cnp->cn_consume = 0; 1944 cnp->cn_consume = 0;
1938 cnp->cn_namelen = namei_getcomponent(cnp->cn_nameptr); 1945 cnp->cn_namelen = namei_getcomponent(cnp->cn_nameptr);
1939 cp = cnp->cn_nameptr + cnp->cn_namelen; 1946 cp = cnp->cn_nameptr + cnp->cn_namelen;
1940 KASSERT(cnp->cn_namelen <= KERNEL_NAME_MAX); 1947 KASSERT(cnp->cn_namelen <= KERNEL_NAME_MAX);
1941 ndp->ni_pathlen -= cnp->cn_namelen; 1948 ndp->ni_pathlen -= cnp->cn_namelen;
1942 ndp->ni_next = cp; 1949 ndp->ni_next = cp;
1943 state->slashes = 0; 1950 state->slashes = 0;
1944 cnp->cn_flags &= ~REQUIREDIR; 1951 cnp->cn_flags &= ~REQUIREDIR;
1945 cnp->cn_flags |= MAKEENTRY|ISLASTCN; 1952 cnp->cn_flags |= MAKEENTRY|ISLASTCN;
1946 1953
1947 if (cnp->cn_namelen == 2 && 1954 if (cnp->cn_namelen == 2 &&
1948 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 1955 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
1949 cnp->cn_flags |= ISDOTDOT; 1956 cnp->cn_flags |= ISDOTDOT;
1950 else 1957 else
1951 cnp->cn_flags &= ~ISDOTDOT; 1958 cnp->cn_flags &= ~ISDOTDOT;
1952 1959
1953 /* 1960 /*
1954 * Because lookup_once can change the startdir, we need our 1961 * Because lookup_once can change the startdir, we need our
1955 * own reference to it to avoid consuming the caller's. 1962 * own reference to it to avoid consuming the caller's.
1956 */ 1963 */
1957 vref(startdir); 1964 vref(startdir);
1958 error = lookup_once(state, startdir, &startdir, &foundobj, 1965 error = lookup_once(state, startdir, &startdir, &foundobj,
1959 &startdir_locked); 1966 &startdir_locked);
1960 1967
1961 KASSERT((cnp->cn_flags & LOCKPARENT) == 0); 1968 KASSERT((cnp->cn_flags & LOCKPARENT) == 0);
1962 if (startdir_locked) { 1969 if (startdir_locked) {
1963 VOP_UNLOCK(startdir); 1970 VOP_UNLOCK(startdir);
1964 startdir_locked = false; 1971 startdir_locked = false;
1965 } 1972 }
1966 1973
1967 /* 1974 /*
1968 * If the vnode we found is mounted on, then cross the mount and get 1975 * If the vnode we found is mounted on, then cross the mount and get
1969 * the root vnode in foundobj. If this encounters an error, it will 1976 * the root vnode in foundobj. If this encounters an error, it will
1970 * dispose of foundobj, but searchdir is untouched. 1977 * dispose of foundobj, but searchdir is untouched.
1971 */ 1978 */
1972 if (error == 0 && foundobj != NULL && 1979 if (error == 0 && foundobj != NULL &&
1973 foundobj->v_type == VDIR && 1980 foundobj->v_type == VDIR &&
1974 foundobj->v_mountedhere != NULL && 1981 foundobj->v_mountedhere != NULL &&
1975 (cnp->cn_flags & NOCROSSMOUNT) == 0) { 1982 (cnp->cn_flags & NOCROSSMOUNT) == 0) {
1976 error = lookup_crossmount(state, &startdir, &foundobj, 1983 error = lookup_crossmount(state, &startdir, &foundobj,
1977 &startdir_locked); 1984 &startdir_locked);
1978 } 1985 }
1979 1986
1980 /* Now toss startdir and see if we have an error. */ 1987 /* Now toss startdir and see if we have an error. */
1981 if (startdir != NULL) 1988 if (startdir != NULL)
1982 vrele(startdir); 1989 vrele(startdir);
1983 if (error) 1990 if (error)
1984 foundobj = NULL; 1991 foundobj = NULL;
1985 else if (foundobj != NULL && (cnp->cn_flags & LOCKLEAF) != 0) 1992 else if (foundobj != NULL && (cnp->cn_flags & LOCKLEAF) != 0)
1986 vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); 1993 vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
1987 1994
1988 ndp->ni_vp = foundobj; 1995 ndp->ni_vp = foundobj;
1989 return (error); 1996 return (error);
1990} 1997}
1991 1998
1992/* 1999/*
1993 * External interface. The partitioning between this function and the 2000 * External interface. The partitioning between this function and the
1994 * above isn't very clear - the above function exists mostly so code 2001 * above isn't very clear - the above function exists mostly so code
1995 * that uses "state->" can be shuffled around without having to change 2002 * that uses "state->" can be shuffled around without having to change
1996 * it to "state.". 2003 * it to "state.".
1997 */ 2004 */
1998int 2005int
1999lookup_for_nfsd_index(struct nameidata *ndp, struct vnode *startdir) 2006lookup_for_nfsd_index(struct nameidata *ndp, struct vnode *startdir)
2000{ 2007{
2001 struct namei_state state; 2008 struct namei_state state;
2002 int error; 2009 int error;
2003 2010
2004 KASSERT(ndp->ni_atdir == NULL); 2011 KASSERT(ndp->ni_atdir == NULL);
2005 ndp->ni_atdir = startdir; 2012 ndp->ni_atdir = startdir;
2006 2013
2007 /* 2014 /*
2008 * Note: the name sent in here (is not|should not be) allowed 2015 * Note: the name sent in here (is not|should not be) allowed
2009 * to contain a slash. 2016 * to contain a slash.
2010 */ 2017 */
2011 if (strlen(ndp->ni_pathbuf->pb_path) > KERNEL_NAME_MAX) { 2018 if (strlen(ndp->ni_pathbuf->pb_path) > KERNEL_NAME_MAX) {
2012 return ENAMETOOLONG; 2019 return ENAMETOOLONG;
2013 } 2020 }
2014 if (strchr(ndp->ni_pathbuf->pb_path, '/')) { 2021 if (strchr(ndp->ni_pathbuf->pb_path, '/')) {
2015 return EINVAL; 2022 return EINVAL;
2016 } 2023 }
2017 2024
2018 ndp->ni_pathlen = strlen(ndp->ni_pathbuf->pb_path) + 1; 2025 ndp->ni_pathlen = strlen(ndp->ni_pathbuf->pb_path) + 1;
2019 ndp->ni_pnbuf = NULL; 2026 ndp->ni_pnbuf = NULL;
2020 ndp->ni_cnd.cn_nameptr = NULL; 2027 ndp->ni_cnd.cn_nameptr = NULL;
2021 2028
2022 namei_init(&state, ndp); 2029 namei_init(&state, ndp);
2023 error = do_lookup_for_nfsd_index(&state); 2030 error = do_lookup_for_nfsd_index(&state);
2024 namei_cleanup(&state); 2031 namei_cleanup(&state);
2025 2032
2026 return error; 2033 return error;
2027} 2034}
2028 2035
2029//////////////////////////////////////////////////////////// 2036////////////////////////////////////////////////////////////
2030 2037
2031/* 2038/*
2032 * Reacquire a path name component. 2039 * Reacquire a path name component.
2033 * dvp is locked on entry and exit. 2040 * dvp is locked on entry and exit.
2034 * *vpp is locked on exit unless it's NULL. 2041 * *vpp is locked on exit unless it's NULL.
2035 */ 2042 */
2036int 2043int
2037relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int dummy) 2044relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int dummy)
2038{ 2045{
2039 int rdonly; /* lookup read-only flag bit */ 2046 int rdonly; /* lookup read-only flag bit */
2040 int error = 0; 2047 int error = 0;
2041#ifdef DEBUG 2048#ifdef DEBUG
2042 size_t newlen; /* DEBUG: check name len */ 2049 size_t newlen; /* DEBUG: check name len */
2043 const char *cp; /* DEBUG: check name ptr */ 2050 const char *cp; /* DEBUG: check name ptr */
2044#endif /* DEBUG */ 2051#endif /* DEBUG */
2045 2052
2046 (void)dummy; 2053 (void)dummy;
2047 2054
2048 /* 2055 /*
2049 * Setup: break out flag bits into variables. 2056 * Setup: break out flag bits into variables.
2050 */ 2057 */
2051 rdonly = cnp->cn_flags & RDONLY; 2058 rdonly = cnp->cn_flags & RDONLY;
2052 2059
2053 /* 2060 /*
2054 * Search a new directory. 2061 * Search a new directory.
2055 * 2062 *
2056 * The cn_hash value is for use by vfs_cache. 2063 * The cn_hash value is for use by vfs_cache.
2057 * The last component of the filename is left accessible via 2064 * The last component of the filename is left accessible via
2058 * cnp->cn_nameptr for callers that need the name. Callers needing 2065 * cnp->cn_nameptr for callers that need the name. Callers needing
2059 * the name set the SAVENAME flag. When done, they assume 2066 * the name set the SAVENAME flag. When done, they assume
2060 * responsibility for freeing the pathname buffer. 2067 * responsibility for freeing the pathname buffer.
2061 */ 2068 */
2062#ifdef DEBUG 2069#ifdef DEBUG
2063#if 0 2070#if 0
2064 cp = NULL; 2071 cp = NULL;
2065 newhash = namei_hash(cnp->cn_nameptr, &cp); 2072 newhash = namei_hash(cnp->cn_nameptr, &cp);
2066 if ((uint32_t)newhash != (uint32_t)cnp->cn_hash) 2073 if ((uint32_t)newhash != (uint32_t)cnp->cn_hash)
2067 panic("relookup: bad hash"); 2074 panic("relookup: bad hash");
2068#endif 2075#endif
2069 newlen = namei_getcomponent(cnp->cn_nameptr); 2076 newlen = namei_getcomponent(cnp->cn_nameptr);
2070 if (cnp->cn_namelen != newlen) 2077 if (cnp->cn_namelen != newlen)
2071 panic("relookup: bad len"); 2078 panic("relookup: bad len");
2072 cp = cnp->cn_nameptr + cnp->cn_namelen; 2079 cp = cnp->cn_nameptr + cnp->cn_namelen;
2073 while (*cp == '/') 2080 while (*cp == '/')
2074 cp++; 2081 cp++;
2075 if (*cp != 0) 2082 if (*cp != 0)
2076 panic("relookup: not last component"); 2083 panic("relookup: not last component");
2077#endif /* DEBUG */ 2084#endif /* DEBUG */
2078 2085
2079 /* 2086 /*
2080 * Check for degenerate name (e.g. / or "") 2087 * Check for degenerate name (e.g. / or "")
2081 * which is a way of talking about a directory, 2088 * which is a way of talking about a directory,
2082 * e.g. like "/." or ".". 2089 * e.g. like "/." or ".".
2083 */ 2090 */
2084 if (cnp->cn_nameptr[0] == '\0') 2091 if (cnp->cn_nameptr[0] == '\0')
2085 panic("relookup: null name"); 2092 panic("relookup: null name");
2086 2093
2087 if (cnp->cn_flags & ISDOTDOT) 2094 if (cnp->cn_flags & ISDOTDOT)
2088 panic("relookup: lookup on dot-dot"); 2095 panic("relookup: lookup on dot-dot");
2089 2096
2090 /* 2097 /*
2091 * We now have a segment name to search for, and a directory to search. 2098 * We now have a segment name to search for, and a directory to search.
2092 */ 2099 */
2093 *vpp = NULL; 2100 *vpp = NULL;
2094 error = VOP_LOOKUP(dvp, vpp, cnp); 2101 error = VOP_LOOKUP(dvp, vpp, cnp);
2095 if ((error) != 0) { 2102 if ((error) != 0) {
2096 KASSERTMSG((*vpp == NULL), 2103 KASSERTMSG((*vpp == NULL),
2097 "leaf `%s' should be empty but is %p", 2104 "leaf `%s' should be empty but is %p",
2098 cnp->cn_nameptr, *vpp); 2105 cnp->cn_nameptr, *vpp);
2099 if (error != EJUSTRETURN) 2106 if (error != EJUSTRETURN)
2100 goto bad; 2107 goto bad;
2101 } 2108 }
2102 2109
2103 /* 2110 /*
2104 * Check for symbolic link 2111 * Check for symbolic link
2105 */ 2112 */
2106 KASSERTMSG((*vpp == NULL || (*vpp)->v_type != VLNK || 2113 KASSERTMSG((*vpp == NULL || (*vpp)->v_type != VLNK ||
2107 (cnp->cn_flags & FOLLOW) == 0), 2114 (cnp->cn_flags & FOLLOW) == 0),
2108 "relookup: symlink found"); 2115 "relookup: symlink found");
2109 2116
2110 /* 2117 /*
2111 * Check for read-only lookups. 2118 * Check for read-only lookups.
2112 */ 2119 */
2113 if (rdonly && cnp->cn_nameiop != LOOKUP) { 2120 if (rdonly && cnp->cn_nameiop != LOOKUP) {
2114 error = EROFS; 2121 error = EROFS;
2115 if (*vpp) { 2122 if (*vpp) {
2116 vrele(*vpp); 2123 vrele(*vpp);
2117 } 2124 }
2118 goto bad; 2125 goto bad;
2119 } 2126 }
2120 /* 2127 /*
2121 * Lock result. 2128 * Lock result.
2122 */ 2129 */
2123 if (*vpp && *vpp != dvp) { 2130 if (*vpp && *vpp != dvp) {
2124 error = vn_lock(*vpp, LK_EXCLUSIVE); 2131 error = vn_lock(*vpp, LK_EXCLUSIVE);
2125 if (error != 0) { 2132 if (error != 0) {
2126 vrele(*vpp); 2133 vrele(*vpp);
2127 goto bad; 2134 goto bad;
2128 } 2135 }
2129 } 2136 }
2130 return (0); 2137 return (0);
2131 2138
2132bad: 2139bad:
2133 *vpp = NULL; 2140 *vpp = NULL;
2134 return (error); 2141 return (error);
2135} 2142}
2136 2143
2137/* 2144/*
2138 * namei_simple - simple forms of namei. 2145 * namei_simple - simple forms of namei.
2139 * 2146 *
2140 * These are wrappers to allow the simple case callers of namei to be 2147 * These are wrappers to allow the simple case callers of namei to be
2141 * left alone while everything else changes under them. 2148 * left alone while everything else changes under them.
2142 */ 2149 */
2143 2150
2144/* Flags */ 2151/* Flags */
2145struct namei_simple_flags_type { 2152struct namei_simple_flags_type {
2146 int dummy; 2153 int dummy;
2147}; 2154};
2148static const struct namei_simple_flags_type ns_nn, ns_nt, ns_fn, ns_ft; 2155static const struct namei_simple_flags_type ns_nn, ns_nt, ns_fn, ns_ft;
2149const namei_simple_flags_t NSM_NOFOLLOW_NOEMULROOT = &ns_nn; 2156const namei_simple_flags_t NSM_NOFOLLOW_NOEMULROOT = &ns_nn;
2150const namei_simple_flags_t NSM_NOFOLLOW_TRYEMULROOT = &ns_nt; 2157const namei_simple_flags_t NSM_NOFOLLOW_TRYEMULROOT = &ns_nt;
2151const namei_simple_flags_t NSM_FOLLOW_NOEMULROOT = &ns_fn; 2158const namei_simple_flags_t NSM_FOLLOW_NOEMULROOT = &ns_fn;
2152const namei_simple_flags_t NSM_FOLLOW_TRYEMULROOT = &ns_ft; 2159const namei_simple_flags_t NSM_FOLLOW_TRYEMULROOT = &ns_ft;
2153 2160
2154static 2161static
2155int 2162int
2156namei_simple_convert_flags(namei_simple_flags_t sflags) 2163namei_simple_convert_flags(namei_simple_flags_t sflags)
2157{ 2164{
2158 if (sflags == NSM_NOFOLLOW_NOEMULROOT) 2165 if (sflags == NSM_NOFOLLOW_NOEMULROOT)
2159 return NOFOLLOW | 0; 2166 return NOFOLLOW | 0;
2160 if (sflags == NSM_NOFOLLOW_TRYEMULROOT) 2167 if (sflags == NSM_NOFOLLOW_TRYEMULROOT)
2161 return NOFOLLOW | TRYEMULROOT; 2168 return NOFOLLOW | TRYEMULROOT;
2162 if (sflags == NSM_FOLLOW_NOEMULROOT) 2169 if (sflags == NSM_FOLLOW_NOEMULROOT)
2163 return FOLLOW | 0; 2170 return FOLLOW | 0;
2164 if (sflags == NSM_FOLLOW_TRYEMULROOT) 2171 if (sflags == NSM_FOLLOW_TRYEMULROOT)
2165 return FOLLOW | TRYEMULROOT; 2172 return FOLLOW | TRYEMULROOT;
2166 panic("namei_simple_convert_flags: bogus sflags\n"); 2173 panic("namei_simple_convert_flags: bogus sflags\n");
2167 return 0; 2174 return 0;
2168} 2175}
2169 2176
2170int 2177int
2171namei_simple_kernel(const char *path, namei_simple_flags_t sflags, 2178namei_simple_kernel(const char *path, namei_simple_flags_t sflags,
2172 struct vnode **vp_ret) 2179 struct vnode **vp_ret)
2173{ 2180{
2174 return nameiat_simple_kernel(NULL, path, sflags, vp_ret); 2181 return nameiat_simple_kernel(NULL, path, sflags, vp_ret);
2175} 2182}
2176 2183
2177int 2184int
2178nameiat_simple_kernel(struct vnode *dvp, const char *path,  2185nameiat_simple_kernel(struct vnode *dvp, const char *path,
2179 namei_simple_flags_t sflags, struct vnode **vp_ret) 2186 namei_simple_flags_t sflags, struct vnode **vp_ret)
2180{ 2187{
2181 struct nameidata nd; 2188 struct nameidata nd;
2182 struct pathbuf *pb; 2189 struct pathbuf *pb;
2183 int err; 2190 int err;
2184 2191
2185 pb = pathbuf_create(path); 2192 pb = pathbuf_create(path);
2186 if (pb == NULL) { 2193 if (pb == NULL) {
2187 return ENOMEM; 2194 return ENOMEM;
2188 } 2195 }
2189 2196
2190 NDINIT(&nd, 2197 NDINIT(&nd,
2191 LOOKUP, 2198 LOOKUP,
2192 namei_simple_convert_flags(sflags), 2199 namei_simple_convert_flags(sflags),
2193 pb); 2200 pb);
2194 2201
2195 if (dvp != NULL) 2202 if (dvp != NULL)
2196 NDAT(&nd, dvp); 2203 NDAT(&nd, dvp);
2197 2204
2198 err = namei(&nd); 2205 err = namei(&nd);
2199 if (err != 0) { 2206 if (err != 0) {
2200 pathbuf_destroy(pb); 2207 pathbuf_destroy(pb);
2201 return err; 2208 return err;
2202 } 2209 }
2203 *vp_ret = nd.ni_vp; 2210 *vp_ret = nd.ni_vp;
2204 pathbuf_destroy(pb); 2211 pathbuf_destroy(pb);
2205 return 0; 2212 return 0;
2206} 2213}
2207 2214
2208int 2215int
2209namei_simple_user(const char *path, namei_simple_flags_t sflags, 2216namei_simple_user(const char *path, namei_simple_flags_t sflags,
2210 struct vnode **vp_ret) 2217 struct vnode **vp_ret)
2211{ 2218{
2212 return nameiat_simple_user(NULL, path, sflags, vp_ret); 2219 return nameiat_simple_user(NULL, path, sflags, vp_ret);
2213} 2220}
2214 2221
2215int 2222int
2216nameiat_simple_user(struct vnode *dvp, const char *path, 2223nameiat_simple_user(struct vnode *dvp, const char *path,
2217 namei_simple_flags_t sflags, struct vnode **vp_ret) 2224 namei_simple_flags_t sflags, struct vnode **vp_ret)
2218{ 2225{
2219 struct pathbuf *pb; 2226 struct pathbuf *pb;
2220 struct nameidata nd; 2227 struct nameidata nd;
2221 int err; 2228 int err;
2222 2229
2223 err = pathbuf_copyin(path, &pb); 2230 err = pathbuf_copyin(path, &pb);
2224 if (err) { 2231 if (err) {
2225 return err; 2232 return err;
2226 } 2233 }
2227 2234
2228 NDINIT(&nd, 2235 NDINIT(&nd,
2229 LOOKUP, 2236 LOOKUP,
2230 namei_simple_convert_flags(sflags), 2237 namei_simple_convert_flags(sflags),
2231 pb); 2238 pb);
2232 2239
2233 if (dvp != NULL) 2240 if (dvp != NULL)
2234 NDAT(&nd, dvp); 2241 NDAT(&nd, dvp);
2235 2242
2236 err = namei(&nd); 2243 err = namei(&nd);
2237 if (err != 0) { 2244 if (err != 0) {
2238 pathbuf_destroy(pb); 2245 pathbuf_destroy(pb);
2239 return err; 2246 return err;
2240 } 2247 }
2241 *vp_ret = nd.ni_vp; 2248 *vp_ret = nd.ni_vp;
2242 pathbuf_destroy(pb); 2249 pathbuf_destroy(pb);
2243 return 0; 2250 return 0;
2244} 2251}