| @@ -1,14 +1,14 @@ | | | @@ -1,14 +1,14 @@ |
1 | /* $NetBSD: sys_futex.c,v 1.11 2020/05/05 15:25:18 riastradh Exp $ */ | | 1 | /* $NetBSD: sys_futex.c,v 1.11.2.1 2020/11/01 15:16:43 thorpej Exp $ */ |
2 | | | 2 | |
3 | /*- | | 3 | /*- |
4 | * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc. | | 4 | * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc. |
5 | * All rights reserved. | | 5 | * All rights reserved. |
6 | * | | 6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation | | 7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Taylor R. Campbell and Jason R. Thorpe. | | 8 | * by Taylor R. Campbell and Jason R. Thorpe. |
9 | * | | 9 | * |
10 | * Redistribution and use in source and binary forms, with or without | | 10 | * Redistribution and use in source and binary forms, with or without |
11 | * modification, are permitted provided that the following conditions | | 11 | * modification, are permitted provided that the following conditions |
12 | * are met: | | 12 | * are met: |
13 | * 1. Redistributions of source code must retain the above copyright | | 13 | * 1. Redistributions of source code must retain the above copyright |
14 | * notice, this list of conditions and the following disclaimer. | | 14 | * notice, this list of conditions and the following disclaimer. |
| @@ -20,27 +20,27 @@ | | | @@ -20,27 +20,27 @@ |
20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | | 20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | | 21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | | 22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | | 23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | | 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | | 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | | 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | | 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | | 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
29 | * POSSIBILITY OF SUCH DAMAGE. | | 29 | * POSSIBILITY OF SUCH DAMAGE. |
30 | */ | | 30 | */ |
31 | | | 31 | |
32 | #include <sys/cdefs.h> | | 32 | #include <sys/cdefs.h> |
33 | __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.11 2020/05/05 15:25:18 riastradh Exp $"); | | 33 | __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.11.2.1 2020/11/01 15:16:43 thorpej Exp $"); |
34 | | | 34 | |
35 | /* | | 35 | /* |
36 | * Futexes | | 36 | * Futexes |
37 | * | | 37 | * |
38 | * The futex system call coordinates notifying threads waiting for | | 38 | * The futex system call coordinates notifying threads waiting for |
39 | * changes on a 32-bit word of memory. The word can be managed by | | 39 | * changes on a 32-bit word of memory. The word can be managed by |
40 | * CPU atomic operations in userland, without system calls, as long | | 40 | * CPU atomic operations in userland, without system calls, as long |
41 | * as there is no contention. | | 41 | * as there is no contention. |
42 | * | | 42 | * |
43 | * The simplest use case demonstrating the utility is: | | 43 | * The simplest use case demonstrating the utility is: |
44 | * | | 44 | * |
45 | * // 32-bit word of memory shared among threads or | | 45 | * // 32-bit word of memory shared among threads or |
46 | * // processes in userland. lock & 1 means owned; | | 46 | * // processes in userland. lock & 1 means owned; |
| @@ -110,42 +110,98 @@ __KERNEL_RCSID(0, "$NetBSD: sys_futex.c, | | | @@ -110,42 +110,98 @@ __KERNEL_RCSID(0, "$NetBSD: sys_futex.c, |
110 | * | | 110 | * |
111 | * futex_tab.va[vmspace, va] for private futexes | | 111 | * futex_tab.va[vmspace, va] for private futexes |
112 | * futex_tab.oa[uvm_voaddr] for shared futexes | | 112 | * futex_tab.oa[uvm_voaddr] for shared futexes |
113 | * | | 113 | * |
114 | * This implementation does not support priority inheritance. | | 114 | * This implementation does not support priority inheritance. |
115 | */ | | 115 | */ |
116 | | | 116 | |
117 | #include <sys/types.h> | | 117 | #include <sys/types.h> |
118 | #include <sys/atomic.h> | | 118 | #include <sys/atomic.h> |
119 | #include <sys/condvar.h> | | 119 | #include <sys/condvar.h> |
120 | #include <sys/futex.h> | | 120 | #include <sys/futex.h> |
121 | #include <sys/mutex.h> | | 121 | #include <sys/mutex.h> |
122 | #include <sys/rbtree.h> | | 122 | #include <sys/rbtree.h> |
| | | 123 | #include <sys/sleepq.h> |
123 | #include <sys/queue.h> | | 124 | #include <sys/queue.h> |
| | | 125 | #include <sys/sdt.h> |
124 | | | 126 | |
125 | #include <sys/syscall.h> | | 127 | #include <sys/syscall.h> |
126 | #include <sys/syscallargs.h> | | 128 | #include <sys/syscallargs.h> |
127 | #include <sys/syscallvar.h> | | 129 | #include <sys/syscallvar.h> |
128 | | | 130 | |
129 | #include <uvm/uvm_extern.h> | | 131 | #include <uvm/uvm_extern.h> |
130 | | | 132 | |
131 | /* | | 133 | /* |
| | | 134 | * DTrace probes. |
| | | 135 | */ |
| | | 136 | SDT_PROVIDER_DEFINE(futex); |
| | | 137 | |
| | | 138 | /* entry: uaddr, val, bitset, timeout, clkflags, fflags */ |
| | | 139 | /* exit: error */ |
| | | 140 | SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int", |
| | | 141 | "struct timespec *", "int", "int"); |
| | | 142 | SDT_PROBE_DEFINE1(futex, func, wait, exit, "int"); |
| | | 143 | |
| | | 144 | /* entry: uaddr, nwake, bitset, fflags */ |
| | | 145 | /* exit: error, nwoken */ |
| | | 146 | SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int"); |
| | | 147 | SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int"); |
| | | 148 | |
| | | 149 | /* entry: uaddr, nwake, uaddr2, nrequeue, fflags */ |
| | | 150 | /* exit: error, nwoken */ |
| | | 151 | SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int", |
| | | 152 | "int"); |
| | | 153 | SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int"); |
| | | 154 | |
| | | 155 | /* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */ |
| | | 156 | /* exit: error, nwoken */ |
| | | 157 | SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *", |
| | | 158 | "int", "int", "int"); |
| | | 159 | SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int"); |
| | | 160 | |
| | | 161 | /* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */ |
| | | 162 | /* exit: error, nwoken */ |
| | | 163 | SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int", |
| | | 164 | "int", "int"); |
| | | 165 | SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int"); |
| | | 166 | |
| | | 167 | /* entry: uaddr, val, r/w, abstime, fflags */ |
| | | 168 | /* exit: error */ |
| | | 169 | SDT_PROBE_DEFINE5(futex, func, rw_wait, entry, "int *", "int", "int", |
| | | 170 | "struct timespec *", "int"); |
| | | 171 | SDT_PROBE_DEFINE1(futex, func, rw_wait, exit, "int"); |
| | | 172 | |
| | | 173 | /* entry: uaddr, val, fflags */ |
| | | 174 | /* exit: error, nwoken */ |
| | | 175 | SDT_PROBE_DEFINE3(futex, func, rw_handoff, entry, "int *", "int", "int"); |
| | | 176 | SDT_PROBE_DEFINE2(futex, func, rw_handoff, exit, "int", "int"); |
| | | 177 | |
| | | 178 | SDT_PROBE_DEFINE0(futex, wait, finish, normally); |
| | | 179 | SDT_PROBE_DEFINE0(futex, wait, finish, wakerace); |
| | | 180 | SDT_PROBE_DEFINE0(futex, wait, finish, aborted); |
| | | 181 | |
| | | 182 | /* entry: timo */ |
| | | 183 | /* exit: error */ |
| | | 184 | SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int"); |
| | | 185 | SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int"); |
| | | 186 | |
| | | 187 | /* |
132 | * Lock order: | | 188 | * Lock order: |
133 | * | | 189 | * |
134 | * futex_tab.lock | | 190 | * futex_tab.lock |
135 | * futex::fx_qlock ordered by kva of struct futex | | 191 | * futex::fx_op_lock ordered by kva of struct futex |
136 | * -> futex_wait::fw_lock only one at a time | | 192 | * -> futex::fx_sq_lock ordered by kva of sleepq lock |
137 | * futex_wait::fw_lock only one at a time | | 193 | * |
138 | * -> futex::fx_abortlock only one at a time | | 194 | * N.B. multiple futexes can share a single sleepq lock. |
139 | */ | | 195 | */ |
140 | | | 196 | |
141 | /* | | 197 | /* |
142 | * union futex_key | | 198 | * union futex_key |
143 | * | | 199 | * |
144 | * A futex is addressed either by a vmspace+va (private) or by | | 200 | * A futex is addressed either by a vmspace+va (private) or by |
145 | * a uvm_voaddr (shared). | | 201 | * a uvm_voaddr (shared). |
146 | */ | | 202 | */ |
147 | union futex_key { | | 203 | union futex_key { |
148 | struct { | | 204 | struct { |
149 | struct vmspace *vmspace; | | 205 | struct vmspace *vmspace; |
150 | vaddr_t va; | | 206 | vaddr_t va; |
151 | } fk_private; | | 207 | } fk_private; |
| @@ -155,57 +211,75 @@ union futex_key { | | | @@ -155,57 +211,75 @@ union futex_key { |
155 | /* | | 211 | /* |
156 | * struct futex | | 212 | * struct futex |
157 | * | | 213 | * |
158 | * Kernel state for a futex located at a particular address in a | | 214 | * Kernel state for a futex located at a particular address in a |
159 | * particular virtual address space. | | 215 | * particular virtual address space. |
160 | * | | 216 | * |
161 | * N.B. fx_refcnt is an unsigned long because we need to be able | | 217 | * N.B. fx_refcnt is an unsigned long because we need to be able |
162 | * to operate on it atomically on all systems while at the same | | 218 | * to operate on it atomically on all systems while at the same |
163 | * time rendering practically impossible the chance of it reaching | | 219 | * time rendering practically impossible the chance of it reaching |
164 | * its max value. In practice, we're limited by the number of LWPs | | 220 | * its max value. In practice, we're limited by the number of LWPs |
165 | * that can be present on the system at any given time, and the | | 221 | * that can be present on the system at any given time, and the |
166 | * assumption is that limit will be good enough on a 32-bit platform. | | 222 | * assumption is that limit will be good enough on a 32-bit platform. |
167 | * See futex_wake() for why overflow needs to be avoided. | | 223 | * See futex_wake() for why overflow needs to be avoided. |
| | | 224 | * |
| | | 225 | * XXX Since futex addresses must be 4-byte aligned, we could |
| | | 226 | * XXX squirrel away fx_shared and fx_on_tree bits in the "va" |
| | | 227 | * XXX field of the key. Worth it? |
168 | */ | | 228 | */ |
169 | struct futex { | | 229 | struct futex { |
170 | union futex_key fx_key; | | 230 | union futex_key fx_key; |
| | | 231 | struct rb_node fx_node; |
171 | unsigned long fx_refcnt; | | 232 | unsigned long fx_refcnt; |
172 | bool fx_shared; | | 233 | bool fx_shared; |
173 | bool fx_on_tree; | | 234 | bool fx_on_tree; |
174 | struct rb_node fx_node; | | 235 | uint8_t fx_class; |
175 | | | | |
176 | kmutex_t fx_qlock; | | | |
177 | TAILQ_HEAD(, futex_wait) fx_queue; | | | |
178 | | | 236 | |
179 | kmutex_t fx_abortlock; | | 237 | kmutex_t fx_op_lock; /* adaptive */ |
180 | LIST_HEAD(, futex_wait) fx_abortlist; | | 238 | kmutex_t * fx_sq_lock; /* &sleepq_locks[...] */ |
181 | kcondvar_t fx_abortcv; | | 239 | sleepq_t fx_sqs[2]; /* 0=reader, 1=writer */ |
| | | 240 | unsigned int fx_nwaiters[2]; |
182 | }; | | 241 | }; |
183 | | | 242 | |
184 | /* | | 243 | /* |
185 | * struct futex_wait | | 244 | * futex classes: Some futex operations can only be carried out on |
186 | * | | 245 | * futexes that are known to be abiding by a certain protocol. These |
187 | * State for a thread to wait on a futex. Threads wait on fw_cv | | 246 | * classes are assigned to a futex when created due to a wait event, |
188 | * for fw_bitset to be set to zero. The thread may transition to | | 247 | * and when subsequent wake or requeue operations are issued, the |
189 | * a different futex queue at any time under the futex's lock. | | 248 | * class is checked at futex lookup time. If the class does not match, |
190 | */ | | 249 | * EINVAL is the result. |
191 | struct futex_wait { | | 250 | */ |
192 | kmutex_t fw_lock; | | 251 | #define FUTEX_CLASS_ANY 0 /* match any class in lookup */ |
193 | kcondvar_t fw_cv; | | 252 | #define FUTEX_CLASS_NORMAL 1 /* normal futex */ |
194 | struct futex *fw_futex; | | 253 | #define FUTEX_CLASS_RWLOCK 2 /* rwlock futex */ |
195 | TAILQ_ENTRY(futex_wait) fw_entry; /* queue lock */ | | 254 | #define FUTEX_CLASS_PI 3 /* for FUTEX_*_PI ops */ |
196 | LIST_ENTRY(futex_wait) fw_abort; /* queue abortlock */ | | 255 | |
197 | int fw_bitset; | | 256 | /* sleepq definitions */ |
198 | bool fw_aborting; /* fw_lock */ | | 257 | #define FUTEX_READERQ 0 |
| | | 258 | #define FUTEX_WRITERQ 1 |
| | | 259 | |
| | | 260 | CTASSERT(FUTEX_READERQ == FUTEX_RW_READER); |
| | | 261 | CTASSERT(FUTEX_WRITERQ == FUTEX_RW_WRITER); |
| | | 262 | |
| | | 263 | static const char futex_wmesg[] = "futex"; |
| | | 264 | |
| | | 265 | static void futex_unsleep(lwp_t *, bool); |
| | | 266 | |
| | | 267 | static syncobj_t futex_syncobj = { |
| | | 268 | .sobj_flag = SOBJ_SLEEPQ_SORTED, |
| | | 269 | .sobj_unsleep = futex_unsleep, |
| | | 270 | .sobj_changepri = sleepq_changepri, |
| | | 271 | .sobj_lendpri = sleepq_lendpri, |
| | | 272 | .sobj_owner = syncobj_noowner, |
199 | }; | | 273 | }; |
200 | | | 274 | |
201 | /* | | 275 | /* |
202 | * futex_tab | | 276 | * futex_tab |
203 | * | | 277 | * |
204 | * Global trees of futexes by vmspace/va and VM object address. | | 278 | * Global trees of futexes by vmspace/va and VM object address. |
205 | * | | 279 | * |
206 | * XXX This obviously doesn't scale in parallel. We could use a | | 280 | * XXX This obviously doesn't scale in parallel. We could use a |
207 | * pserialize-safe data structure, but there may be a high cost to | | 281 | * pserialize-safe data structure, but there may be a high cost to |
208 | * frequent deletion since we don't cache futexes after we're done | | 282 | * frequent deletion since we don't cache futexes after we're done |
209 | * with them. We could use hashed locks. But for now, just make | | 283 | * with them. We could use hashed locks. But for now, just make |
210 | * sure userland can't DoS the serial performance, by using a | | 284 | * sure userland can't DoS the serial performance, by using a |
211 | * balanced binary tree for lookup. | | 285 | * balanced binary tree for lookup. |
| @@ -269,27 +343,115 @@ compare_futex_shared(void *cookie, const | | | @@ -269,27 +343,115 @@ compare_futex_shared(void *cookie, const |
269 | { | | 343 | { |
270 | const struct futex *fa = na; | | 344 | const struct futex *fa = na; |
271 | const struct futex *fb = nb; | | 345 | const struct futex *fb = nb; |
272 | | | 346 | |
273 | return compare_futex_shared_key(cookie, fa, &fb->fx_key); | | 347 | return compare_futex_shared_key(cookie, fa, &fb->fx_key); |
274 | } | | 348 | } |
275 | | | 349 | |
276 | static const rb_tree_ops_t futex_shared_rb_ops = { | | 350 | static const rb_tree_ops_t futex_shared_rb_ops = { |
277 | .rbto_compare_nodes = compare_futex_shared, | | 351 | .rbto_compare_nodes = compare_futex_shared, |
278 | .rbto_compare_key = compare_futex_shared_key, | | 352 | .rbto_compare_key = compare_futex_shared_key, |
279 | .rbto_node_offset = offsetof(struct futex, fx_node), | | 353 | .rbto_node_offset = offsetof(struct futex, fx_node), |
280 | }; | | 354 | }; |
281 | | | 355 | |
282 | static void futex_wait_dequeue(struct futex_wait *, struct futex *); | | 356 | /* |
| | | 357 | * futex_sq_lock2(f, f2) |
| | | 358 | * |
| | | 359 | * Acquire the sleepq locks for f and f2, which may be null, or |
| | | 360 | * which may be the same. If they are distinct, an arbitrary total |
| | | 361 | * order is chosen on the locks. |
| | | 362 | * |
| | | 363 | * Callers should only ever acquire multiple sleepq locks |
| | | 364 | * simultaneously using futex_sq_lock2. |
| | | 365 | */ |
| | | 366 | static void |
| | | 367 | futex_sq_lock2(struct futex * const f, struct futex * const f2) |
| | | 368 | { |
| | | 369 | |
| | | 370 | /* |
| | | 371 | * If both are null, do nothing; if one is null and the other |
| | | 372 | * is not, lock the other and be done with it. |
| | | 373 | */ |
| | | 374 | if (f == NULL && f2 == NULL) { |
| | | 375 | return; |
| | | 376 | } else if (f == NULL) { |
| | | 377 | mutex_spin_enter(f2->fx_sq_lock); |
| | | 378 | return; |
| | | 379 | } else if (f2 == NULL) { |
| | | 380 | mutex_spin_enter(f->fx_sq_lock); |
| | | 381 | return; |
| | | 382 | } |
| | | 383 | |
| | | 384 | kmutex_t * const m = f->fx_sq_lock; |
| | | 385 | kmutex_t * const m2 = f2->fx_sq_lock; |
| | | 386 | |
| | | 387 | /* If both locks are the same, acquire only one. */ |
| | | 388 | if (m == m2) { |
| | | 389 | mutex_spin_enter(m); |
| | | 390 | return; |
| | | 391 | } |
| | | 392 | |
| | | 393 | /* Otherwise, use the ordering on the kva of the lock pointer. */ |
| | | 394 | if ((uintptr_t)m < (uintptr_t)m2) { |
| | | 395 | mutex_spin_enter(m); |
| | | 396 | mutex_spin_enter(m2); |
| | | 397 | } else { |
| | | 398 | mutex_spin_enter(m2); |
| | | 399 | mutex_spin_enter(m); |
| | | 400 | } |
| | | 401 | } |
| | | 402 | |
| | | 403 | /* |
| | | 404 | * futex_sq_unlock2(f, f2) |
| | | 405 | * |
| | | 406 | * Release the sleep queue locks for f and f2, which may be null, or |
| | | 407 | * which may have the same underlying lock. |
| | | 408 | */ |
| | | 409 | static void |
| | | 410 | futex_sq_unlock2(struct futex * const f, struct futex * const f2) |
| | | 411 | { |
| | | 412 | |
| | | 413 | /* |
| | | 414 | * If both are null, do nothing; if one is null and the other |
| | | 415 | * is not, unlock the other and be done with it. |
| | | 416 | */ |
| | | 417 | if (f == NULL && f2 == NULL) { |
| | | 418 | return; |
| | | 419 | } else if (f == NULL) { |
| | | 420 | mutex_spin_exit(f2->fx_sq_lock); |
| | | 421 | return; |
| | | 422 | } else if (f2 == NULL) { |
| | | 423 | mutex_spin_exit(f->fx_sq_lock); |
| | | 424 | return; |
| | | 425 | } |
| | | 426 | |
| | | 427 | kmutex_t * const m = f->fx_sq_lock; |
| | | 428 | kmutex_t * const m2 = f2->fx_sq_lock; |
| | | 429 | |
| | | 430 | /* If both locks are the same, release only one. */ |
| | | 431 | if (m == m2) { |
| | | 432 | mutex_spin_exit(m); |
| | | 433 | return; |
| | | 434 | } |
| | | 435 | |
| | | 436 | /* Otherwise, use the ordering on the kva of the lock pointer. */ |
| | | 437 | if ((uintptr_t)m < (uintptr_t)m2) { |
| | | 438 | mutex_spin_exit(m2); |
| | | 439 | mutex_spin_exit(m); |
| | | 440 | } else { |
| | | 441 | mutex_spin_exit(m); |
| | | 442 | mutex_spin_exit(m2); |
| | | 443 | } |
| | | 444 | } |
283 | | | 445 | |
284 | /* | | 446 | /* |
285 | * futex_load(uaddr, kaddr) | | 447 | * futex_load(uaddr, kaddr) |
286 | * | | 448 | * |
287 | * Perform a single atomic load to read *uaddr, and return the | | 449 | * Perform a single atomic load to read *uaddr, and return the |
288 | * result in *kaddr. Return 0 on success, EFAULT if uaddr is not | | 450 | * result in *kaddr. Return 0 on success, EFAULT if uaddr is not |
289 | * mapped. | | 451 | * mapped. |
290 | */ | | 452 | */ |
291 | static inline int | | 453 | static inline int |
292 | futex_load(int *uaddr, int *kaddr) | | 454 | futex_load(int *uaddr, int *kaddr) |
293 | { | | 455 | { |
294 | return ufetch_int((u_int *)uaddr, (u_int *)kaddr); | | 456 | return ufetch_int((u_int *)uaddr, (u_int *)kaddr); |
295 | } | | 457 | } |
| @@ -302,118 +464,96 @@ futex_load(int *uaddr, int *kaddr) | | | @@ -302,118 +464,96 @@ futex_load(int *uaddr, int *kaddr) |
302 | */ | | 464 | */ |
303 | static bool | | 465 | static bool |
304 | futex_test(int *uaddr, int expected) | | 466 | futex_test(int *uaddr, int expected) |
305 | { | | 467 | { |
306 | int val; | | 468 | int val; |
307 | int error; | | 469 | int error; |
308 | | | 470 | |
309 | error = futex_load(uaddr, &val); | | 471 | error = futex_load(uaddr, &val); |
310 | if (error) | | 472 | if (error) |
311 | return false; | | 473 | return false; |
312 | return val == expected; | | 474 | return val == expected; |
313 | } | | 475 | } |
314 | | | 476 | |
| | | 477 | static pool_cache_t futex_cache __read_mostly; |
| | | 478 | |
| | | 479 | static int futex_ctor(void *, void *, int); |
| | | 480 | static void futex_dtor(void *, void *); |
| | | 481 | |
315 | /* | | 482 | /* |
316 | * futex_sys_init() | | 483 | * futex_sys_init() |
317 | * | | 484 | * |
318 | * Initialize the futex subsystem. | | 485 | * Initialize the futex subsystem. |
319 | */ | | 486 | */ |
320 | void | | 487 | void |
321 | futex_sys_init(void) | | 488 | futex_sys_init(void) |
322 | { | | 489 | { |
323 | | | 490 | |
324 | mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE); | | 491 | mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE); |
325 | rb_tree_init(&futex_tab.va, &futex_rb_ops); | | 492 | rb_tree_init(&futex_tab.va, &futex_rb_ops); |
326 | rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops); | | 493 | rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops); |
| | | 494 | |
| | | 495 | futex_cache = pool_cache_init(sizeof(struct futex), |
| | | 496 | coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor, |
| | | 497 | futex_dtor, NULL); |
327 | } | | 498 | } |
328 | | | 499 | |
329 | /* | | 500 | /* |
330 | * futex_sys_fini() | | 501 | * futex_sys_fini() |
331 | * | | 502 | * |
332 | * Finalize the futex subsystem. | | 503 | * Finalize the futex subsystem. |
333 | */ | | 504 | */ |
334 | void | | 505 | void |
335 | futex_sys_fini(void) | | 506 | futex_sys_fini(void) |
336 | { | | 507 | { |
337 | | | 508 | |
338 | KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL); | | 509 | KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL); |
339 | KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL); | | 510 | KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL); |
340 | mutex_destroy(&futex_tab.lock); | | 511 | mutex_destroy(&futex_tab.lock); |
341 | } | | 512 | } |
342 | | | 513 | |
343 | /* | | 514 | /* |
344 | * futex_queue_init(f) | | 515 | * futex_ctor() |
345 | * | | | |
346 | * Initialize the futex queue. Caller must call futex_queue_fini | | | |
347 | * when done. | | | |
348 | * | | 516 | * |
349 | * Never sleeps. | | 517 | * Futex object constructor. |
350 | */ | | 518 | */ |
351 | static void | | 519 | static int |
352 | futex_queue_init(struct futex *f) | | 520 | futex_ctor(void *arg __unused, void *obj, int flags __unused) |
353 | { | | 521 | { |
| | | 522 | extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE]; |
| | | 523 | struct futex * const f = obj; |
354 | | | 524 | |
355 | mutex_init(&f->fx_qlock, MUTEX_DEFAULT, IPL_NONE); | | 525 | mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE); |
356 | mutex_init(&f->fx_abortlock, MUTEX_DEFAULT, IPL_NONE); | | 526 | f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock; |
357 | cv_init(&f->fx_abortcv, "fqabort"); | | | |
358 | LIST_INIT(&f->fx_abortlist); | | | |
359 | TAILQ_INIT(&f->fx_queue); | | | |
360 | } | | | |
361 | | | 527 | |
362 | /* | | 528 | sleepq_init(&f->fx_sqs[FUTEX_READERQ]); |
363 | * futex_queue_drain(f) | | 529 | sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]); |
364 | * | | 530 | f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0; |
365 | * Wait for any aborting waiters in f; then empty the queue of | | | |
366 | * any stragglers and wake them. Caller must guarantee no new | | | |
367 | * references to f. | | | |
368 | * | | | |
369 | * May sleep. | | | |
370 | */ | | | |
371 | static void | | | |
372 | futex_queue_drain(struct futex *f) | | | |
373 | { | | | |
374 | struct futex_wait *fw, *fw_next; | | | |
375 | | | | |
376 | mutex_enter(&f->fx_abortlock); | | | |
377 | while (!LIST_EMPTY(&f->fx_abortlist)) | | | |
378 | cv_wait(&f->fx_abortcv, &f->fx_abortlock); | | | |
379 | mutex_exit(&f->fx_abortlock); | | | |
380 | | | 531 | |
381 | mutex_enter(&f->fx_qlock); | | 532 | return 0; |
382 | TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { | | | |
383 | mutex_enter(&fw->fw_lock); | | | |
384 | futex_wait_dequeue(fw, f); | | | |
385 | cv_broadcast(&fw->fw_cv); | | | |
386 | mutex_exit(&fw->fw_lock); | | | |
387 | } | | | |
388 | mutex_exit(&f->fx_qlock); | | | |
389 | } | | 533 | } |
390 | | | 534 | |
391 | /* | | 535 | /* |
392 | * futex_queue_fini(fq) | | 536 | * futex_dtor() |
393 | * | | 537 | * |
394 | * Finalize the futex queue initialized by futex_queue_init. Queue | | 538 | * Futex object destructor. |
395 | * must be empty. Caller must not use f again until a subsequent | | | |
396 | * futex_queue_init. | | | |
397 | */ | | 539 | */ |
398 | static void | | 540 | static void |
399 | futex_queue_fini(struct futex *f) | | 541 | futex_dtor(void *arg __unused, void *obj) |
400 | { | | 542 | { |
| | | 543 | struct futex * const f = obj; |
401 | | | 544 | |
402 | KASSERT(TAILQ_EMPTY(&f->fx_queue)); | | 545 | mutex_destroy(&f->fx_op_lock); |
403 | KASSERT(LIST_EMPTY(&f->fx_abortlist)); | | 546 | f->fx_sq_lock = NULL; |
404 | mutex_destroy(&f->fx_qlock); | | | |
405 | mutex_destroy(&f->fx_abortlock); | | | |
406 | cv_destroy(&f->fx_abortcv); | | | |
407 | } | | 547 | } |
408 | | | 548 | |
409 | /* | | 549 | /* |
410 | * futex_key_init(key, vm, va, shared) | | 550 | * futex_key_init(key, vm, va, shared) |
411 | * | | 551 | * |
412 | * Initialize a futex key for lookup, etc. | | 552 | * Initialize a futex key for lookup, etc. |
413 | */ | | 553 | */ |
414 | static int | | 554 | static int |
415 | futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared) | | 555 | futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared) |
416 | { | | 556 | { |
417 | int error = 0; | | 557 | int error = 0; |
418 | | | 558 | |
419 | if (__predict_false(shared)) { | | 559 | if (__predict_false(shared)) { |
| @@ -441,179 +581,191 @@ futex_key_fini(union futex_key *fk, bool | | | @@ -441,179 +581,191 @@ futex_key_fini(union futex_key *fk, bool |
441 | } | | 581 | } |
442 | | | 582 | |
443 | /* | | 583 | /* |
444 | * futex_create(fk, shared) | | 584 | * futex_create(fk, shared) |
445 | * | | 585 | * |
446 | * Create a futex. Initial reference count is 1, representing the | | 586 | * Create a futex. Initial reference count is 1, representing the |
447 | * caller. Returns NULL on failure. Always takes ownership of the | | 587 | * caller. Returns NULL on failure. Always takes ownership of the |
448 | * key, either transferring it to the newly-created futex, or releasing | | 588 | * key, either transferring it to the newly-created futex, or releasing |
449 | * the key if creation fails. | | 589 | * the key if creation fails. |
450 | * | | 590 | * |
451 | * Never sleeps for memory, but may sleep to acquire a lock. | | 591 | * Never sleeps for memory, but may sleep to acquire a lock. |
452 | */ | | 592 | */ |
453 | static struct futex * | | 593 | static struct futex * |
454 | futex_create(union futex_key *fk, bool shared) | | 594 | futex_create(union futex_key *fk, bool shared, uint8_t class) |
455 | { | | 595 | { |
456 | struct futex *f; | | 596 | struct futex *f; |
457 | | | 597 | |
458 | f = kmem_alloc(sizeof(*f), KM_NOSLEEP); | | 598 | f = pool_cache_get(futex_cache, PR_NOWAIT); |
459 | if (f == NULL) { | | 599 | if (f == NULL) { |
460 | futex_key_fini(fk, shared); | | 600 | futex_key_fini(fk, shared); |
461 | return NULL; | | 601 | return NULL; |
462 | } | | 602 | } |
463 | f->fx_key = *fk; | | 603 | f->fx_key = *fk; |
464 | f->fx_refcnt = 1; | | 604 | f->fx_refcnt = 1; |
465 | f->fx_shared = shared; | | 605 | f->fx_shared = shared; |
466 | f->fx_on_tree = false; | | 606 | f->fx_on_tree = false; |
467 | futex_queue_init(f); | | 607 | f->fx_class = class; |
468 | | | 608 | |
469 | return f; | | 609 | return f; |
470 | } | | 610 | } |
471 | | | 611 | |
472 | /* | | 612 | /* |
473 | * futex_destroy(f) | | 613 | * futex_destroy(f) |
474 | * | | 614 | * |
475 | * Destroy a futex created with futex_create. Reference count | | 615 | * Destroy a futex created with futex_create. Reference count |
476 | * must be zero. | | 616 | * must be zero. |
477 | * | | 617 | * |
478 | * May sleep. | | 618 | * May sleep. |
479 | */ | | 619 | */ |
480 | static void | | 620 | static void |
481 | futex_destroy(struct futex *f) | | 621 | futex_destroy(struct futex *f) |
482 | { | | 622 | { |
483 | | | 623 | |
484 | ASSERT_SLEEPABLE(); | | 624 | ASSERT_SLEEPABLE(); |
485 | | | 625 | |
486 | KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0); | | 626 | KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0); |
487 | KASSERT(!f->fx_on_tree); | | 627 | KASSERT(!f->fx_on_tree); |
488 | | | 628 | KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ])); |
489 | /* Drain and destroy the private queue. */ | | 629 | KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ])); |
490 | futex_queue_drain(f); | | 630 | KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0); |
491 | futex_queue_fini(f); | | 631 | KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0); |
492 | | | 632 | |
493 | futex_key_fini(&f->fx_key, f->fx_shared); | | 633 | futex_key_fini(&f->fx_key, f->fx_shared); |
494 | | | 634 | |
495 | kmem_free(f, sizeof(*f)); | | 635 | pool_cache_put(futex_cache, f); |
496 | } | | 636 | } |
497 | | | 637 | |
498 | /* | | 638 | /* |
499 | * futex_hold(f) | | 639 | * futex_hold_count(f, n) |
500 | * | | 640 | * |
501 | * Attempt to acquire a reference to f. Return 0 on success, | | 641 | * Attempt to acquire a reference to f. Return 0 on success, |
502 | * ENFILE on too many references. | | 642 | * ENFILE on too many references. |
503 | * | | 643 | * |
504 | * Never sleeps. | | 644 | * Never sleeps. |
505 | */ | | 645 | */ |
506 | static int | | 646 | static int |
507 | futex_hold(struct futex *f) | | 647 | futex_hold_count(struct futex *f, unsigned long const count) |
508 | { | | 648 | { |
509 | unsigned long refcnt; | | 649 | unsigned long refcnt; |
510 | | | 650 | |
511 | do { | | 651 | do { |
512 | refcnt = atomic_load_relaxed(&f->fx_refcnt); | | 652 | refcnt = atomic_load_relaxed(&f->fx_refcnt); |
513 | if (refcnt == ULONG_MAX) | | 653 | if (ULONG_MAX - refcnt < count) |
514 | return ENFILE; | | 654 | return ENFILE; |
515 | } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt + 1) != refcnt); | | 655 | } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, |
| | | 656 | refcnt + count) != refcnt); |
516 | | | 657 | |
517 | return 0; | | 658 | return 0; |
518 | } | | 659 | } |
| | | 660 | #define futex_hold(f) futex_hold_count(f, 1) |
519 | | | 661 | |
520 | /* | | 662 | /* |
521 | * futex_rele(f) | | 663 | * futex_rele_count(f, n) |
522 | * | | 664 | * |
523 | * Release a reference to f acquired with futex_create or | | 665 | * Release a reference to f acquired with futex_create or |
524 | * futex_hold. | | 666 | * futex_hold. |
525 | * | | 667 | * |
526 | * May sleep to free f. | | 668 | * May sleep to free f. |
527 | */ | | 669 | */ |
528 | static void | | 670 | static void |
529 | futex_rele(struct futex *f) | | 671 | futex_rele_count(struct futex *f, unsigned long const count) |
530 | { | | 672 | { |
531 | unsigned long refcnt; | | 673 | unsigned long refcnt; |
532 | | | 674 | |
533 | ASSERT_SLEEPABLE(); | | 675 | ASSERT_SLEEPABLE(); |
534 | | | 676 | |
535 | do { | | 677 | do { |
536 | refcnt = atomic_load_relaxed(&f->fx_refcnt); | | 678 | refcnt = atomic_load_relaxed(&f->fx_refcnt); |
537 | if (refcnt == 1) | | 679 | KASSERT(refcnt >= count); |
| | | 680 | if (refcnt - count == 0) |
538 | goto trylast; | | 681 | goto trylast; |
539 | } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt); | | 682 | } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, |
| | | 683 | refcnt - count) != refcnt); |
540 | return; | | 684 | return; |
541 | | | 685 | |
542 | trylast: | | 686 | trylast: |
| | | 687 | KASSERT(count <= LONG_MAX); |
543 | mutex_enter(&futex_tab.lock); | | 688 | mutex_enter(&futex_tab.lock); |
544 | if (atomic_dec_ulong_nv(&f->fx_refcnt) == 0) { | | 689 | if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) { |
545 | if (f->fx_on_tree) { | | 690 | if (f->fx_on_tree) { |
546 | if (__predict_false(f->fx_shared)) | | 691 | if (__predict_false(f->fx_shared)) |
547 | rb_tree_remove_node(&futex_tab.oa, f); | | 692 | rb_tree_remove_node(&futex_tab.oa, f); |
548 | else | | 693 | else |
549 | rb_tree_remove_node(&futex_tab.va, f); | | 694 | rb_tree_remove_node(&futex_tab.va, f); |
550 | f->fx_on_tree = false; | | 695 | f->fx_on_tree = false; |
551 | } | | 696 | } |
552 | } else { | | 697 | } else { |
553 | /* References remain -- don't destroy it. */ | | 698 | /* References remain -- don't destroy it. */ |
554 | f = NULL; | | 699 | f = NULL; |
555 | } | | 700 | } |
556 | mutex_exit(&futex_tab.lock); | | 701 | mutex_exit(&futex_tab.lock); |
557 | if (f != NULL) | | 702 | if (f != NULL) |
558 | futex_destroy(f); | | 703 | futex_destroy(f); |
559 | } | | 704 | } |
| | | 705 | #define futex_rele(f) futex_rele_count(f, 1) |
560 | | | 706 | |
561 | /* | | 707 | /* |
562 | * futex_rele_not_last(f) | | 708 | * futex_rele_count_not_last(f, n) |
563 | * | | 709 | * |
564 | * Release a reference to f acquired with futex_create or | | 710 | * Release a reference to f acquired with futex_create or |
565 | * futex_hold. | | 711 | * futex_hold. |
566 | * | | 712 | * |
567 | * This version asserts that we are not dropping the last | | 713 | * This version asserts that we are not dropping the last |
568 | * reference to f. | | 714 | * reference to f. |
569 | */ | | 715 | */ |
570 | static void | | 716 | static void |
571 | futex_rele_not_last(struct futex *f) | | 717 | futex_rele_count_not_last(struct futex *f, unsigned long const count) |
572 | { | | 718 | { |
573 | unsigned long refcnt; | | 719 | unsigned long refcnt; |
574 | | | 720 | |
575 | do { | | 721 | do { |
576 | refcnt = atomic_load_relaxed(&f->fx_refcnt); | | 722 | refcnt = atomic_load_relaxed(&f->fx_refcnt); |
577 | KASSERT(refcnt > 1); | | 723 | KASSERT(refcnt >= count); |
578 | } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt); | | 724 | } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, |
| | | 725 | refcnt - count) != refcnt); |
579 | } | | 726 | } |
580 | | | 727 | |
581 | /* | | 728 | /* |
582 | * futex_lookup_by_key(key, shared, &f) | | 729 | * futex_lookup_by_key(key, shared, class, &f) |
583 | * | | 730 | * |
584 | * Try to find an existing futex va reference in the specified key. | | 731 | * Try to find an existing futex va reference in the specified key. |
585 | * On success, return 0, set f to found futex or to NULL if not found, | | 732 | * On success, return 0, set f to found futex or to NULL if not found, |
586 | * and increment f's reference count if found. | | 733 | * and increment f's reference count if found. |
587 | * | | 734 | * |
588 | * Return ENFILE if reference count too high. | | 735 | * Return ENFILE if reference count too high. |
589 | * | | 736 | * |
590 | * Internal lookup routine shared by futex_lookup() and | | 737 | * Internal lookup routine shared by futex_lookup() and |
591 | * futex_lookup_create(). | | 738 | * futex_lookup_create(). |
592 | */ | | 739 | */ |
593 | static int | | 740 | static int |
594 | futex_lookup_by_key(union futex_key *fk, bool shared, struct futex **fp) | | 741 | futex_lookup_by_key(union futex_key *fk, bool shared, uint8_t class, |
| | | 742 | struct futex **fp) |
595 | { | | 743 | { |
596 | struct futex *f; | | 744 | struct futex *f; |
597 | int error = 0; | | 745 | int error = 0; |
598 | | | 746 | |
599 | mutex_enter(&futex_tab.lock); | | 747 | mutex_enter(&futex_tab.lock); |
600 | if (__predict_false(shared)) { | | 748 | if (__predict_false(shared)) { |
601 | f = rb_tree_find_node(&futex_tab.oa, fk); | | 749 | f = rb_tree_find_node(&futex_tab.oa, fk); |
602 | } else { | | 750 | } else { |
603 | f = rb_tree_find_node(&futex_tab.va, fk); | | 751 | f = rb_tree_find_node(&futex_tab.va, fk); |
604 | } | | 752 | } |
605 | if (f) { | | 753 | if (f) { |
606 | error = futex_hold(f); | | 754 | if (__predict_true(f->fx_class == class || |
| | | 755 | class == FUTEX_CLASS_ANY)) |
| | | 756 | error = futex_hold(f); |
| | | 757 | else |
| | | 758 | error = EINVAL; |
607 | if (error) | | 759 | if (error) |
608 | f = NULL; | | 760 | f = NULL; |
609 | } | | 761 | } |
610 | *fp = f; | | 762 | *fp = f; |
611 | mutex_exit(&futex_tab.lock); | | 763 | mutex_exit(&futex_tab.lock); |
612 | | | 764 | |
613 | return error; | | 765 | return error; |
614 | } | | 766 | } |
615 | | | 767 | |
616 | /* | | 768 | /* |
617 | * futex_insert(f, fp) | | 769 | * futex_insert(f, fp) |
618 | * | | 770 | * |
619 | * Try to insert the futex f into the tree by va. If there | | 771 | * Try to insert the futex f into the tree by va. If there |
| @@ -644,674 +796,682 @@ futex_insert(struct futex *f, struct fut | | | @@ -644,674 +796,682 @@ futex_insert(struct futex *f, struct fut |
644 | KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0); | | 796 | KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0); |
645 | KASSERT(f0->fx_on_tree); | | 797 | KASSERT(f0->fx_on_tree); |
646 | error = futex_hold(f0); | | 798 | error = futex_hold(f0); |
647 | if (error) | | 799 | if (error) |
648 | goto out; | | 800 | goto out; |
649 | } | | 801 | } |
650 | *fp = f0; | | 802 | *fp = f0; |
651 | out: mutex_exit(&futex_tab.lock); | | 803 | out: mutex_exit(&futex_tab.lock); |
652 | | | 804 | |
653 | return error; | | 805 | return error; |
654 | } | | 806 | } |
655 | | | 807 | |
656 | /* | | 808 | /* |
657 | * futex_lookup(uaddr, shared, &f) | | 809 | * futex_lookup(uaddr, shared, class, &f) |
658 | * | | 810 | * |
659 | * Find a futex at the userland pointer uaddr in the current | | 811 | * Find a futex at the userland pointer uaddr in the current |
660 | * process's VM space. On success, return the futex in f and | | 812 | * process's VM space. On success, return the futex in f and |
661 | * increment its reference count. | | 813 | * increment its reference count. |
662 | * | | 814 | * |
663 | * Caller must call futex_rele when done. | | 815 | * Caller must call futex_rele when done. |
664 | */ | | 816 | */ |
665 | static int | | 817 | static int |
666 | futex_lookup(int *uaddr, bool shared, struct futex **fp) | | 818 | futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp) |
667 | { | | 819 | { |
668 | union futex_key fk; | | 820 | union futex_key fk; |
669 | struct vmspace *vm = curproc->p_vmspace; | | 821 | struct vmspace *vm = curproc->p_vmspace; |
670 | vaddr_t va = (vaddr_t)uaddr; | | 822 | vaddr_t va = (vaddr_t)uaddr; |
671 | int error; | | 823 | int error; |
672 | | | 824 | |
673 | /* | | 825 | /* |
674 | * Reject unaligned user pointers so we don't cross page | | 826 | * Reject unaligned user pointers so we don't cross page |
675 | * boundaries and so atomics will work. | | 827 | * boundaries and so atomics will work. |
676 | */ | | 828 | */ |
677 | if ((va & 3) != 0) | | 829 | if (__predict_false((va & 3) != 0)) |
678 | return EINVAL; | | 830 | return EINVAL; |
679 | | | 831 | |
680 | /* Look it up. */ | | 832 | /* Look it up. */ |
681 | error = futex_key_init(&fk, vm, va, shared); | | 833 | error = futex_key_init(&fk, vm, va, shared); |
682 | if (error) | | 834 | if (error) |
683 | return error; | | 835 | return error; |
684 | | | 836 | |
685 | error = futex_lookup_by_key(&fk, shared, fp); | | 837 | error = futex_lookup_by_key(&fk, shared, class, fp); |
686 | futex_key_fini(&fk, shared); | | 838 | futex_key_fini(&fk, shared); |
687 | if (error) | | 839 | if (error) |
688 | return error; | | 840 | return error; |
689 | | | 841 | |
690 | KASSERT(*fp == NULL || (*fp)->fx_shared == shared); | | 842 | KASSERT(*fp == NULL || (*fp)->fx_shared == shared); |
| | | 843 | KASSERT(*fp == NULL || (*fp)->fx_class == class); |
691 | KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0); | | 844 | KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0); |
692 | | | 845 | |
693 | /* | | 846 | /* |
694 | * Success! (Caller must still check whether we found | | 847 | * Success! (Caller must still check whether we found |
695 | * anything, but nothing went _wrong_ like trying to use | | 848 | * anything, but nothing went _wrong_ like trying to use |
696 | * unmapped memory.) | | 849 | * unmapped memory.) |
697 | */ | | 850 | */ |
698 | KASSERT(error == 0); | | 851 | KASSERT(error == 0); |
699 | | | 852 | |
700 | return error; | | 853 | return error; |
701 | } | | 854 | } |
702 | | | 855 | |
703 | /* | | 856 | /* |
704 | * futex_lookup_create(uaddr, shared, &f) | | 857 | * futex_lookup_create(uaddr, shared, class, &f) |
705 | * | | 858 | * |
706 | * Find or create a futex at the userland pointer uaddr in the | | 859 | * Find or create a futex at the userland pointer uaddr in the |
707 | * current process's VM space. On success, return the futex in f | | 860 | * current process's VM space. On success, return the futex in f |
708 | * and increment its reference count. | | 861 | * and increment its reference count. |
709 | * | | 862 | * |
710 | * Caller must call futex_rele when done. | | 863 | * Caller must call futex_rele when done. |
711 | */ | | 864 | */ |
712 | static int | | 865 | static int |
713 | futex_lookup_create(int *uaddr, bool shared, struct futex **fp) | | 866 | futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp) |
714 | { | | 867 | { |
715 | union futex_key fk; | | 868 | union futex_key fk; |
716 | struct vmspace *vm = curproc->p_vmspace; | | 869 | struct vmspace *vm = curproc->p_vmspace; |
717 | struct futex *f = NULL; | | 870 | struct futex *f = NULL; |
718 | vaddr_t va = (vaddr_t)uaddr; | | 871 | vaddr_t va = (vaddr_t)uaddr; |
719 | int error; | | 872 | int error; |
720 | | | 873 | |
721 | /* | | 874 | /* |
722 | * Reject unaligned user pointers so we don't cross page | | 875 | * Reject unaligned user pointers so we don't cross page |
723 | * boundaries and so atomics will work. | | 876 | * boundaries and so atomics will work. |
724 | */ | | 877 | */ |
725 | if ((va & 3) != 0) | | 878 | if (__predict_false((va & 3) != 0)) |
| | | 879 | return EINVAL; |
| | | 880 | |
| | | 881 | if (__predict_false(class == FUTEX_CLASS_ANY)) |
726 | return EINVAL; | | 882 | return EINVAL; |
727 | | | 883 | |
728 | error = futex_key_init(&fk, vm, va, shared); | | 884 | error = futex_key_init(&fk, vm, va, shared); |
729 | if (error) | | 885 | if (error) |
730 | return error; | | 886 | return error; |
731 | | | 887 | |
732 | /* | | 888 | /* |
733 | * Optimistically assume there already is one, and try to find | | 889 | * Optimistically assume there already is one, and try to find |
734 | * it. | | 890 | * it. |
735 | */ | | 891 | */ |
736 | error = futex_lookup_by_key(&fk, shared, fp); | | 892 | error = futex_lookup_by_key(&fk, shared, class, fp); |
737 | if (error || *fp != NULL) { | | 893 | if (error || *fp != NULL) { |
738 | /* | | 894 | /* |
739 | * We either found one, or there was an error. | | 895 | * We either found one, or there was an error. |
740 | * In either case, we are done with the key. | | 896 | * In either case, we are done with the key. |
741 | */ | | 897 | */ |
742 | futex_key_fini(&fk, shared); | | 898 | futex_key_fini(&fk, shared); |
743 | goto out; | | 899 | goto out; |
744 | } | | 900 | } |
745 | | | 901 | |
746 | /* | | 902 | /* |
747 | * Create a futex record. This transfers ownership of the key | | 903 | * Create a futex record. This transfers ownership of the key |
748 | * in all cases. | | 904 | * in all cases. |
749 | */ | | 905 | */ |
750 | f = futex_create(&fk, shared); | | 906 | f = futex_create(&fk, shared, class); |
751 | if (f == NULL) { | | 907 | if (f == NULL) { |
752 | error = ENOMEM; | | 908 | error = ENOMEM; |
753 | goto out; | | 909 | goto out; |
754 | } | | 910 | } |
755 | | | 911 | |
756 | /* | | 912 | /* |
757 | * Insert our new futex, or use existing if someone else beat | | 913 | * Insert our new futex, or use existing if someone else beat |
758 | * us to it. | | 914 | * us to it. |
759 | */ | | 915 | */ |
760 | error = futex_insert(f, fp); | | 916 | error = futex_insert(f, fp); |
761 | if (error) | | 917 | if (error) |
762 | goto out; | | 918 | goto out; |
763 | if (*fp == f) | | 919 | if (*fp == f) |
764 | f = NULL; /* don't release on exit */ | | 920 | f = NULL; /* don't release on exit */ |
765 | | | 921 | |
766 | /* Success! */ | | 922 | /* Success! */ |
767 | KASSERT(error == 0); | | 923 | KASSERT(error == 0); |
768 | | | 924 | |
769 | out: if (f != NULL) | | 925 | out: if (f != NULL) |
770 | futex_rele(f); | | 926 | futex_rele(f); |
771 | KASSERT(error || *fp != NULL); | | 927 | KASSERT(error || *fp != NULL); |
772 | KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0); | | 928 | KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0); |
773 | return error; | | 929 | return error; |
774 | } | | 930 | } |
775 | | | 931 | |
776 | /* | | 932 | /* |
777 | * futex_wait_init(fw, bitset) | | 933 | * futex_unsleep: |
778 | * | | | |
779 | * Initialize a record for a thread to wait on a futex matching | | | |
780 | * the specified bit set. Should be passed to futex_wait_enqueue | | | |
781 | * before futex_wait, and should be passed to futex_wait_fini when | | | |
782 | * done. | | | |
783 | */ | | | |
784 | static void | | | |
785 | futex_wait_init(struct futex_wait *fw, int bitset) | | | |
786 | { | | | |
787 | | | | |
788 | KASSERT(bitset); | | | |
789 | | | | |
790 | mutex_init(&fw->fw_lock, MUTEX_DEFAULT, IPL_NONE); | | | |
791 | cv_init(&fw->fw_cv, "futex"); | | | |
792 | fw->fw_futex = NULL; | | | |
793 | fw->fw_bitset = bitset; | | | |
794 | fw->fw_aborting = false; | | | |
795 | } | | | |
796 | | | | |
797 | /* | | | |
798 | * futex_wait_fini(fw) | | | |
799 | * | | 934 | * |
800 | * Finalize a record for a futex waiter. Must not be on any | | 935 | * Remove an LWP from the futex and sleep queue. This is called when |
801 | * futex's queue. | | 936 | * the LWP has not been awoken normally but instead interrupted: for |
| | | 937 | * example, when a signal is received. Must be called with the LWP |
| | | 938 | * locked. Will unlock if "unlock" is true. |
802 | */ | | 939 | */ |
803 | static void | | 940 | static void |
804 | futex_wait_fini(struct futex_wait *fw) | | 941 | futex_unsleep(lwp_t *l, bool unlock) |
805 | { | | 942 | { |
| | | 943 | struct futex *f __diagused; |
806 | | | 944 | |
807 | KASSERT(fw->fw_futex == NULL); | | 945 | f = (struct futex *)(uintptr_t)l->l_wchan; |
808 | | | 946 | |
809 | cv_destroy(&fw->fw_cv); | | 947 | KASSERT(mutex_owned(f->fx_sq_lock)); |
810 | mutex_destroy(&fw->fw_lock); | | | |
811 | } | | | |
812 | | | 948 | |
813 | /* | | 949 | /* WRITERQ is more likely */ |
814 | * futex_wait_enqueue(fw, f) | | 950 | if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) { |
815 | * | | 951 | KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0); |
816 | * Put fw on the futex queue. Must be done before futex_wait. | | 952 | f->fx_nwaiters[FUTEX_WRITERQ]--; |
817 | * Caller must hold fw's lock and f's lock, and fw must not be on | | 953 | } else { |
818 | * any existing futex's waiter list. | | 954 | KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]); |
819 | */ | | 955 | KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0); |
820 | static void | | 956 | f->fx_nwaiters[FUTEX_READERQ]--; |
821 | futex_wait_enqueue(struct futex_wait *fw, struct futex *f) | | 957 | } |
822 | { | | | |
823 | | | | |
824 | KASSERT(mutex_owned(&f->fx_qlock)); | | | |
825 | KASSERT(mutex_owned(&fw->fw_lock)); | | | |
826 | KASSERT(fw->fw_futex == NULL); | | | |
827 | KASSERT(!fw->fw_aborting); | | | |
828 | | | | |
829 | fw->fw_futex = f; | | | |
830 | TAILQ_INSERT_TAIL(&f->fx_queue, fw, fw_entry); | | | |
831 | } | | | |
832 | | | | |
833 | /* | | | |
834 | * futex_wait_dequeue(fw, f) | | | |
835 | * | | | |
836 | * Remove fw from the futex queue. Precludes subsequent | | | |
837 | * futex_wait until a futex_wait_enqueue. Caller must hold fw's | | | |
838 | * lock and f's lock, and fw must be on f. | | | |
839 | */ | | | |
840 | static void | | | |
841 | futex_wait_dequeue(struct futex_wait *fw, struct futex *f) | | | |
842 | { | | | |
843 | | | | |
844 | KASSERT(mutex_owned(&f->fx_qlock)); | | | |
845 | KASSERT(mutex_owned(&fw->fw_lock)); | | | |
846 | KASSERT(fw->fw_futex == f); | | | |
847 | | | 958 | |
848 | TAILQ_REMOVE(&f->fx_queue, fw, fw_entry); | | 959 | sleepq_unsleep(l, unlock); |
849 | fw->fw_futex = NULL; | | | |
850 | } | | 960 | } |
851 | | | 961 | |
852 | /* | | 962 | /* |
853 | * futex_wait_abort(fw) | | 963 | * futex_wait(f, timeout, clkid, bitset) |
854 | * | | 964 | * |
855 | * Caller is no longer waiting for fw. Remove it from any queue | | 965 | * Wait until deadline on the clock clkid, or forever if deadline |
856 | * if it was on one. Caller must hold fw->fw_lock. | | 966 | * is NULL, for a futex wakeup. Return 0 on explicit wakeup, |
| | | 967 | * ETIMEDOUT on timeout, EINTR on signal. |
857 | */ | | 968 | */ |
858 | static void | | 969 | static int |
859 | futex_wait_abort(struct futex_wait *fw) | | 970 | futex_wait(struct futex *f, int q, const struct timespec *deadline, |
| | | 971 | clockid_t clkid, int bitset) |
860 | { | | 972 | { |
861 | struct futex *f; | | | |
862 | | | | |
863 | KASSERT(mutex_owned(&fw->fw_lock)); | | | |
864 | | | 973 | |
865 | /* | | 974 | /* |
866 | * Grab the futex queue. It can't go away as long as we hold | | 975 | * Some things to note about how this function works: |
867 | * fw_lock. However, we can't take the queue lock because | | 976 | * |
868 | * that's a lock order reversal. | | 977 | * ==> When we enter futex_wait(), the futex's op lock is |
| | | 978 | * held, preventing us from missing wakes. Once we are on |
| | | 979 | * the futex's sleepq, it is safe to release the op lock. |
| | | 980 | * |
| | | 981 | * ==> We have a single early escape to avoid calling |
| | | 982 | * sleepq_block(): our deadline already having passed. |
| | | 983 | * If this is a no-timeout wait, or if the deadline has |
| | | 984 | * not already passed, we are guaranteed to call sleepq_block(). |
| | | 985 | * |
| | | 986 | * ==> sleepq_block() contains all of the necessary logic |
| | | 987 | * for handling signals; we do not need to check for them |
| | | 988 | * ourselves. |
| | | 989 | * |
| | | 990 | * ==> Once we have blocked, other futex operations will |
| | | 991 | * be able to wake us or requeue us. Until we have blocked, |
| | | 992 | * those other futex operations will not be able to acquire |
| | | 993 | * the sleepq locks in order to perform those operations, |
| | | 994 | * even if we have dropped the op lock. Thus, we will not |
| | | 995 | * miss wakes. This fundamental invariant is relied upon by |
| | | 996 | * every other synchronization object in the kernel. |
| | | 997 | * |
| | | 998 | * ==> sleepq_block() returns for one of three reasons: |
| | | 999 | * |
| | | 1000 | * -- We were awakened. |
| | | 1001 | * -- We were signaled. |
| | | 1002 | * -- We timed out. |
| | | 1003 | * |
| | | 1004 | * ==> Once sleepq_block() returns, we will not call |
| | | 1005 | * sleepq_block() again. |
| | | 1006 | * |
| | | 1007 | * ==> If sleepq_block() returns due to being signaled, |
| | | 1008 | * we must check to see if we were also awakened. This |
| | | 1009 | * is to handle the following sequence: |
| | | 1010 | * |
| | | 1011 | * -- Futex wake takes us off sleepq, places us on runq. |
| | | 1012 | * -- We are signaled while sitting on the runq. |
| | | 1013 | * -- We run, and sleepq_block() notices the signal and |
| | | 1014 | * returns EINTR/ERESTART. |
| | | 1015 | * |
| | | 1016 | * In this situation, we want to indicate a successful wake |
| | | 1017 | * to the caller, because that wake has been reported to the |
| | | 1018 | * thread that issued it. |
| | | 1019 | * |
| | | 1020 | * Note that it is NOT possible for a wake to be issued after |
| | | 1021 | * a signal; the LWP will already have been taken off the sleepq |
| | | 1022 | * by the signal, so the wake will never happen. Note that for |
| | | 1023 | * this reason, we must *never* return ERESTART, because it could |
| | | 1024 | * result in us waiting again with no one to awaken us. |
869 | */ | | 1025 | */ |
870 | f = fw->fw_futex; | | | |
871 | | | 1026 | |
872 | /* Put us on the abort list so that fq won't go away. */ | | 1027 | struct lwp * const l = curlwp; |
873 | mutex_enter(&f->fx_abortlock); | | 1028 | int timo; |
874 | LIST_INSERT_HEAD(&f->fx_abortlist, fw, fw_abort); | | 1029 | int error; |
875 | mutex_exit(&f->fx_abortlock); | | | |
876 | | | 1030 | |
877 | /* | | 1031 | /* |
878 | * Mark fw as aborting so it won't lose wakeups and won't be | | 1032 | * Compute our timeout before taking any locks. |
879 | * transferred to any other queue. | | | |
880 | */ | | 1033 | */ |
881 | fw->fw_aborting = true; | | 1034 | if (deadline) { |
| | | 1035 | struct timespec ts; |
882 | | | 1036 | |
883 | /* f is now stable, so we can release fw_lock. */ | | 1037 | /* Check our watch. */ |
884 | mutex_exit(&fw->fw_lock); | | 1038 | error = clock_gettime1(clkid, &ts); |
| | | 1039 | if (error) { |
| | | 1040 | mutex_exit(&f->fx_op_lock); |
| | | 1041 | return error; |
| | | 1042 | } |
885 | | | 1043 | |
886 | /* Now we can remove fw under the queue lock. */ | | 1044 | /* |
887 | mutex_enter(&f->fx_qlock); | | 1045 | * If we're past the deadline, ETIMEDOUT. |
888 | mutex_enter(&fw->fw_lock); | | 1046 | */ |
889 | futex_wait_dequeue(fw, f); | | 1047 | if (timespeccmp(deadline, &ts, <=)) { |
890 | mutex_exit(&fw->fw_lock); | | 1048 | mutex_exit(&f->fx_op_lock); |
891 | mutex_exit(&f->fx_qlock); | | 1049 | return ETIMEDOUT; |
| | | 1050 | } else { |
| | | 1051 | /* Count how much time is left. */ |
| | | 1052 | timespecsub(deadline, &ts, &ts); |
| | | 1053 | timo = tstohz(&ts); |
| | | 1054 | if (timo == 0) { |
| | | 1055 | /* |
| | | 1056 | * tstohz() already rounds up, and |
| | | 1057 | * a return value of 0 ticks means |
| | | 1058 | * "expired now or in the past". |
| | | 1059 | */ |
| | | 1060 | mutex_exit(&f->fx_op_lock); |
| | | 1061 | return ETIMEDOUT; |
| | | 1062 | } |
| | | 1063 | } |
| | | 1064 | } else { |
| | | 1065 | timo = 0; |
| | | 1066 | } |
892 | | | 1067 | |
893 | /* | | 1068 | /* |
894 | * Finally, remove us from the abort list and notify anyone | | 1069 | * Acquire in sleepq -> lwp order. While we're at it, ensure |
895 | * waiting for the abort to complete if we were the last to go. | | 1070 | * that there's room for us to block. |
896 | */ | | 1071 | */ |
897 | mutex_enter(&f->fx_abortlock); | | 1072 | mutex_spin_enter(f->fx_sq_lock); |
898 | LIST_REMOVE(fw, fw_abort); | | 1073 | if (__predict_false(q == FUTEX_READERQ && |
899 | if (LIST_EMPTY(&f->fx_abortlist)) | | 1074 | f->fx_nwaiters[q] == FUTEX_TID_MASK)) { |
900 | cv_broadcast(&f->fx_abortcv); | | 1075 | mutex_spin_exit(f->fx_sq_lock); |
901 | mutex_exit(&f->fx_abortlock); | | 1076 | mutex_exit(&f->fx_op_lock); |
| | | 1077 | return ENFILE; |
| | | 1078 | } |
| | | 1079 | lwp_lock(l); |
902 | | | 1080 | |
903 | /* | | 1081 | /* |
904 | * Release our reference to the futex now that we are not | | 1082 | * No need for locks here; until we're on a futex's sleepq, these |
905 | * waiting for it. | | 1083 | * values are private to the LWP, and the locks needed to get onto |
| | | 1084 | * the sleepq provide the memory barriers needed to ensure visibility. |
906 | */ | | 1085 | */ |
907 | futex_rele(f); | | 1086 | l->l_futex = f; |
| | | 1087 | l->l_futex_wakesel = bitset; |
908 | | | 1088 | |
909 | /* | | 1089 | /* |
910 | * Reacquire the fw lock as caller expects. Verify that we're | | 1090 | * This is an inlined bit of sleepq_enter(); |
911 | * aborting and no longer associated with a futex. | | 1091 | * we already hold the lwp lock. |
912 | */ | | 1092 | */ |
913 | mutex_enter(&fw->fw_lock); | | 1093 | lwp_unlock_to(l, f->fx_sq_lock); |
914 | KASSERT(fw->fw_aborting); | | 1094 | KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks); |
915 | KASSERT(fw->fw_futex == NULL); | | 1095 | KASSERT(lwp_locked(l, f->fx_sq_lock)); |
916 | } | | | |
917 | | | | |
918 | /* | | | |
919 | * futex_wait(fw, deadline, clkid) | | | |
920 | * | | | |
921 | * fw must be a waiter on a futex's queue. Wait until deadline on | | | |
922 | * the clock clkid, or forever if deadline is NULL, for a futex | | | |
923 | * wakeup. Return 0 on explicit wakeup or destruction of futex, | | | |
924 | * ETIMEDOUT on timeout, EINTR/ERESTART on signal. Either way, fw | | | |
925 | * will no longer be on a futex queue on return. | | | |
926 | */ | | | |
927 | static int | | | |
928 | futex_wait(struct futex_wait *fw, const struct timespec *deadline, | | | |
929 | clockid_t clkid) | | | |
930 | { | | | |
931 | int error = 0; | | | |
932 | | | 1096 | |
933 | /* Test and wait under the wait lock. */ | | 1097 | sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true); |
934 | mutex_enter(&fw->fw_lock); | | 1098 | f->fx_nwaiters[q]++; |
935 | | | 1099 | |
936 | for (;;) { | | 1100 | /* We can now safely release the op lock. */ |
937 | /* If we're done yet, stop and report success. */ | | 1101 | mutex_exit(&f->fx_op_lock); |
938 | if (fw->fw_bitset == 0 || fw->fw_futex == NULL) { | | | |
939 | error = 0; | | | |
940 | break; | | | |
941 | } | | | |
942 | | | | |
943 | /* If anything went wrong in the last iteration, stop. */ | | | |
944 | if (error) | | | |
945 | break; | | | |
946 | | | | |
947 | /* Not done yet. Wait. */ | | | |
948 | if (deadline) { | | | |
949 | struct timespec ts; | | | |
950 | | | | |
951 | /* Check our watch. */ | | | |
952 | error = clock_gettime1(clkid, &ts); | | | |
953 | if (error) | | | |
954 | break; | | | |
955 | | | | |
956 | /* If we're past the deadline, ETIMEDOUT. */ | | | |
957 | if (timespeccmp(deadline, &ts, <=)) { | | | |
958 | error = ETIMEDOUT; | | | |
959 | break; | | | |
960 | } | | | |
961 | | | 1102 | |
962 | /* Count how much time is left. */ | | 1103 | SDT_PROBE1(futex, wait, sleepq_block, entry, timo); |
963 | timespecsub(deadline, &ts, &ts); | | 1104 | error = sleepq_block(timo, true); |
| | | 1105 | SDT_PROBE1(futex, wait, sleepq_block, exit, error); |
964 | | | 1106 | |
965 | /* Wait for that much time, allowing signals. */ | | 1107 | /* |
966 | error = cv_timedwait_sig(&fw->fw_cv, &fw->fw_lock, | | 1108 | * f is no longer valid; we may have been moved to another |
967 | tstohz(&ts)); | | 1109 | * futex sleepq while we slept. |
968 | } else { | | 1110 | */ |
969 | /* Wait indefinitely, allowing signals. */ | | | |
970 | error = cv_wait_sig(&fw->fw_cv, &fw->fw_lock); | | | |
971 | } | | | |
972 | } | | | |
973 | | | 1111 | |
974 | /* | | 1112 | /* |
975 | * If we were woken up, the waker will have removed fw from the | | 1113 | * If something went wrong, we may still have a futex reference. |
976 | * queue. But if anything went wrong, we must remove fw from | | 1114 | * Check for that here and drop it if so. We can do this without |
977 | * the queue ourselves. While here, convert EWOULDBLOCK to | | 1115 | * taking any locks because l->l_futex is private to the LWP when |
978 | * ETIMEDOUT. | | 1116 | * not on any futex's sleepq, and we are not on any sleepq because |
| | | 1117 | * we are running. |
| | | 1118 | * |
| | | 1119 | * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned |
| | | 1120 | * EWOULDBLOCK, and ensure the only other error we return is EINTR. |
979 | */ | | 1121 | */ |
980 | if (error) { | | 1122 | if (error) { |
981 | futex_wait_abort(fw); | | 1123 | f = l->l_futex; |
| | | 1124 | if (f != NULL) { |
| | | 1125 | SDT_PROBE0(futex, wait, finish, aborted); |
| | | 1126 | l->l_futex = NULL; |
| | | 1127 | futex_rele(f); |
| | | 1128 | } else { |
| | | 1129 | /* Raced with wakeup; report success. */ |
| | | 1130 | SDT_PROBE0(futex, wait, finish, wakerace); |
| | | 1131 | error = 0; |
| | | 1132 | } |
982 | if (error == EWOULDBLOCK) | | 1133 | if (error == EWOULDBLOCK) |
983 | error = ETIMEDOUT; | | 1134 | error = ETIMEDOUT; |
| | | 1135 | else if (error != ETIMEDOUT) |
| | | 1136 | error = EINTR; |
| | | 1137 | } else { |
| | | 1138 | KASSERT(l->l_futex == NULL); |
| | | 1139 | SDT_PROBE0(futex, wait, finish, normally); |
984 | } | | 1140 | } |
985 | | | 1141 | |
986 | mutex_exit(&fw->fw_lock); | | | |
987 | | | | |
988 | return error; | | 1142 | return error; |
989 | } | | 1143 | } |
990 | | | 1144 | |
991 | /* | | 1145 | /* |
992 | * futex_wake(f, nwake, f2, nrequeue, bitset) | | 1146 | * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset) |
993 | * | | 1147 | * |
994 | * Wake up to nwake waiters on f matching bitset; then, if f2 is | | 1148 | * Wake up to nwake waiters on f matching bitset; then, if f2 is |
995 | * provided, move up to nrequeue remaining waiters on f matching | | 1149 | * provided, move up to nrequeue remaining waiters on f matching |
996 | * bitset to f2. Return the number of waiters actually woken. | | 1150 | * bitset to f2. Return the number of waiters actually woken. |
997 | * Caller must hold the locks of f and f2, if provided. | | 1151 | * Caller must hold the locks of f and f2, if provided. |
998 | */ | | 1152 | */ |
999 | static unsigned | | 1153 | static unsigned |
1000 | futex_wake(struct futex *f, unsigned nwake, struct futex *f2, | | 1154 | futex_wake(struct futex *f, int q, unsigned int const nwake, |
1001 | unsigned nrequeue, int bitset) | | 1155 | struct futex *f2, int q2, unsigned int const nrequeue, |
| | | 1156 | int bitset) |
1002 | { | | 1157 | { |
1003 | struct futex_wait *fw, *fw_next; | | 1158 | struct lwp *l, *l_next; |
1004 | unsigned nwoken = 0; | | 1159 | unsigned nwoken = 0; |
1005 | int hold_error __diagused; | | 1160 | unsigned nrequeued = 0; |
| | | 1161 | |
| | | 1162 | KASSERT(mutex_owned(&f->fx_op_lock)); |
| | | 1163 | KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock)); |
1006 | | | 1164 | |
1007 | KASSERT(mutex_owned(&f->fx_qlock)); | | 1165 | futex_sq_lock2(f, f2); |
1008 | KASSERT(f2 == NULL || mutex_owned(&f2->fx_qlock)); | | | |
1009 | | | 1166 | |
1010 | /* Wake up to nwake waiters, and count the number woken. */ | | 1167 | /* Wake up to nwake waiters, and count the number woken. */ |
1011 | TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { | | 1168 | LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) { |
1012 | if ((fw->fw_bitset & bitset) == 0) | | 1169 | if (nwoken == nwake) |
1013 | continue; | | | |
1014 | if (nwake > 0) { | | | |
1015 | mutex_enter(&fw->fw_lock); | | | |
1016 | if (__predict_false(fw->fw_aborting)) { | | | |
1017 | mutex_exit(&fw->fw_lock); | | | |
1018 | continue; | | | |
1019 | } | | | |
1020 | futex_wait_dequeue(fw, f); | | | |
1021 | fw->fw_bitset = 0; | | | |
1022 | cv_broadcast(&fw->fw_cv); | | | |
1023 | mutex_exit(&fw->fw_lock); | | | |
1024 | nwake--; | | | |
1025 | nwoken++; | | | |
1026 | /* | | | |
1027 | * Drop the futex reference on behalf of the | | | |
1028 | * waiter. We assert this is not the last | | | |
1029 | * reference on the futex (our caller should | | | |
1030 | * also have one). | | | |
1031 | */ | | | |
1032 | futex_rele_not_last(f); | | | |
1033 | } else { | | | |
1034 | break; | | 1170 | break; |
1035 | } | | 1171 | |
| | | 1172 | KASSERT(l->l_syncobj == &futex_syncobj); |
| | | 1173 | KASSERT(l->l_wchan == (wchan_t)f); |
| | | 1174 | KASSERT(l->l_futex == f); |
| | | 1175 | KASSERT(l->l_sleepq == &f->fx_sqs[q]); |
| | | 1176 | KASSERT(l->l_mutex == f->fx_sq_lock); |
| | | 1177 | |
| | | 1178 | if ((l->l_futex_wakesel & bitset) == 0) |
| | | 1179 | continue; |
| | | 1180 | |
| | | 1181 | l->l_futex_wakesel = 0; |
| | | 1182 | l->l_futex = NULL; |
| | | 1183 | sleepq_remove(&f->fx_sqs[q], l); |
| | | 1184 | f->fx_nwaiters[q]--; |
| | | 1185 | nwoken++; |
| | | 1186 | /* hold counts adjusted in bulk below */ |
1036 | } | | 1187 | } |
1037 | | | 1188 | |
1038 | if (f2) { | | 1189 | if (nrequeue) { |
| | | 1190 | KASSERT(f2 != NULL); |
1039 | /* Move up to nrequeue waiters from f's queue to f2's queue. */ | | 1191 | /* Move up to nrequeue waiters from f's queue to f2's queue. */ |
1040 | TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { | | 1192 | LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) { |
1041 | if ((fw->fw_bitset & bitset) == 0) | | 1193 | if (nrequeued == nrequeue) |
1042 | continue; | | | |
1043 | if (nrequeue > 0) { | | | |
1044 | mutex_enter(&fw->fw_lock); | | | |
1045 | if (__predict_false(fw->fw_aborting)) { | | | |
1046 | mutex_exit(&fw->fw_lock); | | | |
1047 | continue; | | | |
1048 | } | | | |
1049 | futex_wait_dequeue(fw, f); | | | |
1050 | futex_wait_enqueue(fw, f2); | | | |
1051 | mutex_exit(&fw->fw_lock); | | | |
1052 | nrequeue--; | | | |
1053 | /* | | | |
1054 | * Transfer the reference from f to f2. | | | |
1055 | * As above, we assert that we are not | | | |
1056 | * dropping the last reference to f here. | | | |
1057 | * | | | |
1058 | * XXX futex_hold() could theoretically | | | |
1059 | * XXX fail here. | | | |
1060 | */ | | | |
1061 | futex_rele_not_last(f); | | | |
1062 | hold_error = futex_hold(f2); | | | |
1063 | KASSERT(hold_error == 0); | | | |
1064 | } else { | | | |
1065 | break; | | 1194 | break; |
1066 | } | | 1195 | |
| | | 1196 | KASSERT(l->l_syncobj == &futex_syncobj); |
| | | 1197 | KASSERT(l->l_wchan == (wchan_t)f); |
| | | 1198 | KASSERT(l->l_futex == f); |
| | | 1199 | KASSERT(l->l_sleepq == &f->fx_sqs[q]); |
| | | 1200 | KASSERT(l->l_mutex == f->fx_sq_lock); |
| | | 1201 | |
| | | 1202 | if ((l->l_futex_wakesel & bitset) == 0) |
| | | 1203 | continue; |
| | | 1204 | |
| | | 1205 | l->l_futex = f2; |
| | | 1206 | sleepq_transfer(l, &f->fx_sqs[q], |
| | | 1207 | &f2->fx_sqs[q2], f2, futex_wmesg, |
| | | 1208 | &futex_syncobj, f2->fx_sq_lock, true); |
| | | 1209 | f->fx_nwaiters[q]--; |
| | | 1210 | f2->fx_nwaiters[q2]++; |
| | | 1211 | nrequeued++; |
| | | 1212 | /* hold counts adjusted in bulk below */ |
| | | 1213 | } |
| | | 1214 | if (nrequeued) { |
| | | 1215 | /* XXX futex_hold() could theoretically fail here. */ |
| | | 1216 | int hold_error __diagused = |
| | | 1217 | futex_hold_count(f2, nrequeued); |
| | | 1218 | KASSERT(hold_error == 0); |
1067 | } | | 1219 | } |
1068 | } else { | | | |
1069 | KASSERT(nrequeue == 0); | | | |
1070 | } | | 1220 | } |
1071 | | | 1221 | |
1072 | /* Return the number of waiters woken. */ | | 1222 | /* |
1073 | return nwoken; | | 1223 | * Adjust the futex reference counts for the wakes and |
| | | 1224 | * requeues. |
| | | 1225 | */ |
| | | 1226 | KASSERT(nwoken + nrequeued <= LONG_MAX); |
| | | 1227 | futex_rele_count_not_last(f, nwoken + nrequeued); |
| | | 1228 | |
| | | 1229 | futex_sq_unlock2(f, f2); |
| | | 1230 | |
| | | 1231 | /* Return the number of waiters woken and requeued. */ |
| | | 1232 | return nwoken + nrequeued; |
1074 | } | | 1233 | } |
1075 | | | 1234 | |
1076 | /* | | 1235 | /* |
1077 | * futex_queue_lock(f) | | 1236 | * futex_op_lock(f) |
1078 | * | | 1237 | * |
1079 | * Acquire the queue lock of f. Pair with futex_queue_unlock. Do | | 1238 | * Acquire the op lock of f. Pair with futex_op_unlock. Do |
1080 | * not use if caller needs to acquire two locks; use | | 1239 | * not use if caller needs to acquire two locks; use |
1081 | * futex_queue_lock2 instead. | | 1240 | * futex_op_lock2 instead. |
1082 | */ | | 1241 | */ |
1083 | static void | | 1242 | static void |
1084 | futex_queue_lock(struct futex *f) | | 1243 | futex_op_lock(struct futex *f) |
1085 | { | | 1244 | { |
1086 | mutex_enter(&f->fx_qlock); | | 1245 | mutex_enter(&f->fx_op_lock); |
1087 | } | | 1246 | } |
1088 | | | 1247 | |
1089 | /* | | 1248 | /* |
1090 | * futex_queue_unlock(f) | | 1249 | * futex_op_unlock(f) |
1091 | * | | 1250 | * |
1092 | * Release the queue lock of f. | | 1251 | * Release the op lock of f. |
1093 | */ | | 1252 | */ |
1094 | static void | | 1253 | static void |
1095 | futex_queue_unlock(struct futex *f) | | 1254 | futex_op_unlock(struct futex *f) |
1096 | { | | 1255 | { |
1097 | mutex_exit(&f->fx_qlock); | | 1256 | mutex_exit(&f->fx_op_lock); |
1098 | } | | 1257 | } |
1099 | | | 1258 | |
1100 | /* | | 1259 | /* |
1101 | * futex_queue_lock2(f, f2) | | 1260 | * futex_op_lock2(f, f2) |
1102 | * | | 1261 | * |
1103 | * Acquire the queue locks of both f and f2, which may be null, or | | 1262 | * Acquire the op locks of both f and f2, which may be null, or |
1104 | * which may have the same underlying queue. If they are | | 1263 | * which may be the same. If they are distinct, an arbitrary total |
1105 | * distinct, an arbitrary total order is chosen on the locks. | | 1264 | * order is chosen on the locks. |
1106 | * | | 1265 | * |
1107 | * Callers should only ever acquire multiple queue locks | | 1266 | * Callers should only ever acquire multiple op locks |
1108 | * simultaneously using futex_queue_lock2. | | 1267 | * simultaneously using futex_op_lock2. |
1109 | */ | | 1268 | */ |
1110 | static void | | 1269 | static void |
1111 | futex_queue_lock2(struct futex *f, struct futex *f2) | | 1270 | futex_op_lock2(struct futex *f, struct futex *f2) |
1112 | { | | 1271 | { |
1113 | | | 1272 | |
1114 | /* | | 1273 | /* |
1115 | * If both are null, do nothing; if one is null and the other | | 1274 | * If both are null, do nothing; if one is null and the other |
1116 | * is not, lock the other and be done with it. | | 1275 | * is not, lock the other and be done with it. |
1117 | */ | | 1276 | */ |
1118 | if (f == NULL && f2 == NULL) { | | 1277 | if (f == NULL && f2 == NULL) { |
1119 | return; | | 1278 | return; |
1120 | } else if (f == NULL) { | | 1279 | } else if (f == NULL) { |
1121 | mutex_enter(&f2->fx_qlock); | | 1280 | mutex_enter(&f2->fx_op_lock); |
1122 | return; | | 1281 | return; |
1123 | } else if (f2 == NULL) { | | 1282 | } else if (f2 == NULL) { |
1124 | mutex_enter(&f->fx_qlock); | | 1283 | mutex_enter(&f->fx_op_lock); |
1125 | return; | | 1284 | return; |
1126 | } | | 1285 | } |
1127 | | | 1286 | |
1128 | /* If both futexes are the same, acquire only one. */ | | 1287 | /* If both futexes are the same, acquire only one. */ |
1129 | if (f == f2) { | | 1288 | if (f == f2) { |
1130 | mutex_enter(&f->fx_qlock); | | 1289 | mutex_enter(&f->fx_op_lock); |
1131 | return; | | 1290 | return; |
1132 | } | | 1291 | } |
1133 | | | 1292 | |
1134 | /* Otherwise, use the ordering on the kva of the futex pointer. */ | | 1293 | /* Otherwise, use the ordering on the kva of the futex pointer. */ |
1135 | if ((uintptr_t)f < (uintptr_t)f2) { | | 1294 | if ((uintptr_t)f < (uintptr_t)f2) { |
1136 | mutex_enter(&f->fx_qlock); | | 1295 | mutex_enter(&f->fx_op_lock); |
1137 | mutex_enter(&f2->fx_qlock); | | 1296 | mutex_enter(&f2->fx_op_lock); |
1138 | } else { | | 1297 | } else { |
1139 | mutex_enter(&f2->fx_qlock); | | 1298 | mutex_enter(&f2->fx_op_lock); |
1140 | mutex_enter(&f->fx_qlock); | | 1299 | mutex_enter(&f->fx_op_lock); |
1141 | } | | 1300 | } |
1142 | } | | 1301 | } |
1143 | | | 1302 | |
1144 | /* | | 1303 | /* |
1145 | * futex_queue_unlock2(f, f2) | | 1304 | * futex_op_unlock2(f, f2) |
1146 | * | | 1305 | * |
1147 | * Release the queue locks of both f and f2, which may be null, or | | 1306 | * Release the op locks of both f and f2, which may be null, or |
1148 | * which may have the same underlying queue. | | 1307 | * which may be the same. |
1149 | */ | | 1308 | */ |
1150 | static void | | 1309 | static void |
1151 | futex_queue_unlock2(struct futex *f, struct futex *f2) | | 1310 | futex_op_unlock2(struct futex *f, struct futex *f2) |
1152 | { | | 1311 | { |
1153 | | | 1312 | |
1154 | /* | | 1313 | /* |
1155 | * If both are null, do nothing; if one is null and the other | | 1314 | * If both are null, do nothing; if one is null and the other |
1156 | * is not, unlock the other and be done with it. | | 1315 | * is not, unlock the other and be done with it. |
1157 | */ | | 1316 | */ |
1158 | if (f == NULL && f2 == NULL) { | | 1317 | if (f == NULL && f2 == NULL) { |
1159 | return; | | 1318 | return; |
1160 | } else if (f == NULL) { | | 1319 | } else if (f == NULL) { |
1161 | mutex_exit(&f2->fx_qlock); | | 1320 | mutex_exit(&f2->fx_op_lock); |
1162 | return; | | 1321 | return; |
1163 | } else if (f2 == NULL) { | | 1322 | } else if (f2 == NULL) { |
1164 | mutex_exit(&f->fx_qlock); | | 1323 | mutex_exit(&f->fx_op_lock); |
1165 | return; | | 1324 | return; |
1166 | } | | 1325 | } |
1167 | | | 1326 | |
1168 | /* If both futexes are the same, release only one. */ | | 1327 | /* If both futexes are the same, release only one. */ |
1169 | if (f == f2) { | | 1328 | if (f == f2) { |
1170 | mutex_exit(&f->fx_qlock); | | 1329 | mutex_exit(&f->fx_op_lock); |
1171 | return; | | 1330 | return; |
1172 | } | | 1331 | } |
1173 | | | 1332 | |
1174 | /* Otherwise, use the ordering on the kva of the futex pointer. */ | | 1333 | /* Otherwise, use the ordering on the kva of the futex pointer. */ |
1175 | if ((uintptr_t)f < (uintptr_t)f2) { | | 1334 | if ((uintptr_t)f < (uintptr_t)f2) { |
1176 | mutex_exit(&f2->fx_qlock); | | 1335 | mutex_exit(&f2->fx_op_lock); |
1177 | mutex_exit(&f->fx_qlock); | | 1336 | mutex_exit(&f->fx_op_lock); |
1178 | } else { | | 1337 | } else { |
1179 | mutex_exit(&f->fx_qlock); | | 1338 | mutex_exit(&f->fx_op_lock); |
1180 | mutex_exit(&f2->fx_qlock); | | 1339 | mutex_exit(&f2->fx_op_lock); |
1181 | } | | 1340 | } |
1182 | } | | 1341 | } |
1183 | | | 1342 | |
1184 | /* | | 1343 | /* |
1185 | * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval) | | 1344 | * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval) |
1186 | * | | 1345 | * |
1187 | * Implement futex(FUTEX_WAIT). | | 1346 | * Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET). |
1188 | */ | | 1347 | */ |
1189 | static int | | 1348 | static int |
1190 | futex_func_wait(bool shared, int *uaddr, int val, int val3, | | 1349 | futex_func_wait(bool shared, int *uaddr, int val, int val3, |
1191 | const struct timespec *timeout, clockid_t clkid, int clkflags, | | 1350 | const struct timespec *timeout, clockid_t clkid, int clkflags, |
1192 | register_t *retval) | | 1351 | register_t *retval) |
1193 | { | | 1352 | { |
1194 | struct futex *f; | | 1353 | struct futex *f; |
1195 | struct futex_wait wait, *fw = &wait; | | | |
1196 | struct timespec ts; | | 1354 | struct timespec ts; |
1197 | const struct timespec *deadline; | | 1355 | const struct timespec *deadline; |
1198 | int error; | | 1356 | int error; |
1199 | | | 1357 | |
1200 | /* | | 1358 | /* |
1201 | * If there's nothing to wait for, and nobody will ever wake | | 1359 | * If there's nothing to wait for, and nobody will ever wake |
1202 | * us, then don't set anything up to wait -- just stop here. | | 1360 | * us, then don't set anything up to wait -- just stop here. |
1203 | */ | | 1361 | */ |
1204 | if (val3 == 0) | | 1362 | if (val3 == 0) |
1205 | return EINVAL; | | 1363 | return EINVAL; |
1206 | | | 1364 | |
1207 | /* Optimistically test before anything else. */ | | 1365 | /* Optimistically test before anything else. */ |
1208 | if (!futex_test(uaddr, val)) | | 1366 | if (!futex_test(uaddr, val)) |
1209 | return EAGAIN; | | 1367 | return EAGAIN; |
1210 | | | 1368 | |
1211 | /* Determine a deadline on the specified clock. */ | | 1369 | /* Determine a deadline on the specified clock. */ |
1212 | if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) { | | 1370 | if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) { |
1213 | deadline = timeout; | | 1371 | deadline = timeout; |
1214 | } else { | | 1372 | } else { |
| | | 1373 | struct timespec interval = *timeout; |
| | | 1374 | |
| | | 1375 | error = itimespecfix(&interval); |
| | | 1376 | if (error) |
| | | 1377 | return error; |
1215 | error = clock_gettime1(clkid, &ts); | | 1378 | error = clock_gettime1(clkid, &ts); |
1216 | if (error) | | 1379 | if (error) |
1217 | return error; | | 1380 | return error; |
1218 | timespecadd(&ts, timeout, &ts); | | 1381 | timespecadd(&ts, &interval, &ts); |
1219 | deadline = &ts; | | 1382 | deadline = &ts; |
1220 | } | | 1383 | } |
1221 | | | 1384 | |
1222 | /* Get the futex, creating it if necessary. */ | | 1385 | /* Get the futex, creating it if necessary. */ |
1223 | error = futex_lookup_create(uaddr, shared, &f); | | 1386 | error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f); |
1224 | if (error) | | 1387 | if (error) |
1225 | return error; | | 1388 | return error; |
1226 | KASSERT(f); | | 1389 | KASSERT(f); |
1227 | | | 1390 | |
1228 | /* Get ready to wait. */ | | | |
1229 | futex_wait_init(fw, val3); | | | |
1230 | | | | |
1231 | /* | | 1391 | /* |
1232 | * Under the queue lock, check the value again: if it has | | 1392 | * Under the op lock, check the value again: if it has |
1233 | * already changed, EAGAIN; otherwise enqueue the waiter. | | 1393 | * already changed, EAGAIN; otherwise enqueue the waiter. |
1234 | * Since FUTEX_WAKE will use the same lock and be done after | | 1394 | * Since FUTEX_WAKE will use the same lock and be done after |
1235 | * modifying the value, the order in which we check and enqueue | | 1395 | * modifying the value, the order in which we check and enqueue |
1236 | * is immaterial. | | 1396 | * is immaterial. |
1237 | */ | | 1397 | */ |
1238 | futex_queue_lock(f); | | 1398 | futex_op_lock(f); |
1239 | if (!futex_test(uaddr, val)) { | | 1399 | if (!futex_test(uaddr, val)) { |
1240 | futex_queue_unlock(f); | | 1400 | futex_op_unlock(f); |
1241 | error = EAGAIN; | | 1401 | error = EAGAIN; |
1242 | goto out; | | 1402 | goto out; |
1243 | } | | 1403 | } |
1244 | mutex_enter(&fw->fw_lock); | | 1404 | |
1245 | futex_wait_enqueue(fw, f); | | 1405 | /* |
1246 | mutex_exit(&fw->fw_lock); | | 1406 | * Now wait. futex_wait() will drop our op lock once we |
1247 | futex_queue_unlock(f); | | 1407 | * are entered into the sleep queue, thus ensuring atomicity |
| | | 1408 | * of wakes with respect to waits. |
| | | 1409 | */ |
| | | 1410 | error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3); |
1248 | | | 1411 | |
1249 | /* | | 1412 | /* |
1250 | * We cannot drop our reference to the futex here, because | | 1413 | * We cannot drop our reference to the futex here, because |
1251 | * we might be enqueued on a different one when we are awakened. | | 1414 | * we might be enqueued on a different one when we are awakened. |
1252 | * The references will be managed on our behalf in the requeue | | 1415 | * The references will be managed on our behalf in the requeue, |
1253 | * and wake cases. | | 1416 | * wake, and error cases. |
1254 | */ | | 1417 | */ |
1255 | f = NULL; | | 1418 | f = NULL; |
1256 | | | 1419 | |
1257 | /* Wait. */ | | 1420 | if (__predict_true(error == 0)) { |
1258 | error = futex_wait(fw, deadline, clkid); | | 1421 | /* Return 0 on success, error on failure. */ |
1259 | if (error) | | 1422 | *retval = 0; |
1260 | goto out; | | 1423 | } |
1261 | | | | |
1262 | /* Return 0 on success, error on failure. */ | | | |
1263 | *retval = 0; | | | |
1264 | | | 1424 | |
1265 | out: if (f != NULL) | | 1425 | out: if (f != NULL) |
1266 | futex_rele(f); | | 1426 | futex_rele(f); |
1267 | futex_wait_fini(fw); | | | |
1268 | return error; | | 1427 | return error; |
1269 | } | | 1428 | } |
1270 | | | 1429 | |
1271 | /* | | 1430 | /* |
1272 | * futex_func_wake(uaddr, val, val3, retval) | | 1431 | * futex_func_wake(uaddr, val, val3, retval) |
1273 | * | | 1432 | * |
1274 | * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET). | | 1433 | * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET). |
1275 | */ | | 1434 | */ |
1276 | static int | | 1435 | static int |
1277 | futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval) | | 1436 | futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval) |
1278 | { | | 1437 | { |
1279 | struct futex *f; | | 1438 | struct futex *f; |
1280 | unsigned int nwoken = 0; | | 1439 | unsigned int nwoken = 0; |
1281 | int error = 0; | | 1440 | int error = 0; |
1282 | | | 1441 | |
1283 | /* Reject negative number of wakeups. */ | | 1442 | /* Reject negative number of wakeups. */ |
1284 | if (val < 0) { | | 1443 | if (val < 0) { |
1285 | error = EINVAL; | | 1444 | error = EINVAL; |
1286 | goto out; | | 1445 | goto out; |
1287 | } | | 1446 | } |
1288 | | | 1447 | |
1289 | /* Look up the futex, if any. */ | | 1448 | /* Look up the futex, if any. */ |
1290 | error = futex_lookup(uaddr, shared, &f); | | 1449 | error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f); |
1291 | if (error) | | 1450 | if (error) |
1292 | goto out; | | 1451 | goto out; |
1293 | | | 1452 | |
1294 | /* If there's no futex, there are no waiters to wake. */ | | 1453 | /* If there's no futex, there are no waiters to wake. */ |
1295 | if (f == NULL) | | 1454 | if (f == NULL) |
1296 | goto out; | | 1455 | goto out; |
1297 | | | 1456 | |
1298 | /* | | 1457 | /* |
1299 | * Under f's queue lock, wake the waiters and remember the | | 1458 | * Under f's op lock, wake the waiters and remember the |
1300 | * number woken. | | 1459 | * number woken. |
1301 | */ | | 1460 | */ |
1302 | futex_queue_lock(f); | | 1461 | futex_op_lock(f); |
1303 | nwoken = futex_wake(f, val, NULL, 0, val3); | | 1462 | nwoken = futex_wake(f, FUTEX_WRITERQ, val, |
1304 | futex_queue_unlock(f); | | 1463 | NULL, FUTEX_WRITERQ, 0, val3); |
| | | 1464 | futex_op_unlock(f); |
1305 | | | 1465 | |
1306 | /* Release the futex. */ | | 1466 | /* Release the futex. */ |
1307 | futex_rele(f); | | 1467 | futex_rele(f); |
1308 | | | 1468 | |
1309 | out: | | 1469 | out: |
1310 | /* Return the number of waiters woken. */ | | 1470 | /* Return the number of waiters woken. */ |
1311 | *retval = nwoken; | | 1471 | *retval = nwoken; |
1312 | | | 1472 | |
1313 | /* Success! */ | | 1473 | /* Success! */ |
1314 | return error; | | 1474 | return error; |
1315 | } | | 1475 | } |
1316 | | | 1476 | |
1317 | /* | | 1477 | /* |
| @@ -1324,54 +1484,56 @@ futex_func_requeue(bool shared, int op, | | | @@ -1324,54 +1484,56 @@ futex_func_requeue(bool shared, int op, |
1324 | int val2, int val3, register_t *retval) | | 1484 | int val2, int val3, register_t *retval) |
1325 | { | | 1485 | { |
1326 | struct futex *f = NULL, *f2 = NULL; | | 1486 | struct futex *f = NULL, *f2 = NULL; |
1327 | unsigned nwoken = 0; /* default to zero woken on early return */ | | 1487 | unsigned nwoken = 0; /* default to zero woken on early return */ |
1328 | int error; | | 1488 | int error; |
1329 | | | 1489 | |
1330 | /* Reject negative number of wakeups or requeues. */ | | 1490 | /* Reject negative number of wakeups or requeues. */ |
1331 | if (val < 0 || val2 < 0) { | | 1491 | if (val < 0 || val2 < 0) { |
1332 | error = EINVAL; | | 1492 | error = EINVAL; |
1333 | goto out; | | 1493 | goto out; |
1334 | } | | 1494 | } |
1335 | | | 1495 | |
1336 | /* Look up the source futex, if any. */ | | 1496 | /* Look up the source futex, if any. */ |
1337 | error = futex_lookup(uaddr, shared, &f); | | 1497 | error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f); |
1338 | if (error) | | 1498 | if (error) |
1339 | goto out; | | 1499 | goto out; |
1340 | | | 1500 | |
1341 | /* If there is none, nothing to do. */ | | 1501 | /* If there is none, nothing to do. */ |
1342 | if (f == NULL) | | 1502 | if (f == NULL) |
1343 | goto out; | | 1503 | goto out; |
1344 | | | 1504 | |
1345 | /* | | 1505 | /* |
1346 | * We may need to create the destination futex because it's | | 1506 | * We may need to create the destination futex because it's |
1347 | * entirely possible it does not currently have any waiters. | | 1507 | * entirely possible it does not currently have any waiters. |
1348 | */ | | 1508 | */ |
1349 | error = futex_lookup_create(uaddr2, shared, &f2); | | 1509 | error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2); |
1350 | if (error) | | 1510 | if (error) |
1351 | goto out; | | 1511 | goto out; |
1352 | | | 1512 | |
1353 | /* | | 1513 | /* |
1354 | * Under the futexes' queue locks, check the value; if | | 1514 | * Under the futexes' op locks, check the value; if |
1355 | * unchanged from val3, wake the waiters. | | 1515 | * unchanged from val3, wake the waiters. |
1356 | */ | | 1516 | */ |
1357 | futex_queue_lock2(f, f2); | | 1517 | futex_op_lock2(f, f2); |
1358 | if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) { | | 1518 | if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) { |
1359 | error = EAGAIN; | | 1519 | error = EAGAIN; |
1360 | } else { | | 1520 | } else { |
1361 | error = 0; | | 1521 | error = 0; |
1362 | nwoken = futex_wake(f, val, f2, val2, FUTEX_BITSET_MATCH_ANY); | | 1522 | nwoken = futex_wake(f, FUTEX_WRITERQ, val, |
| | | 1523 | f2, FUTEX_WRITERQ, val2, |
| | | 1524 | FUTEX_BITSET_MATCH_ANY); |
1363 | } | | 1525 | } |
1364 | futex_queue_unlock2(f, f2); | | 1526 | futex_op_unlock2(f, f2); |
1365 | | | 1527 | |
1366 | out: | | 1528 | out: |
1367 | /* Return the number of waiters woken. */ | | 1529 | /* Return the number of waiters woken. */ |
1368 | *retval = nwoken; | | 1530 | *retval = nwoken; |
1369 | | | 1531 | |
1370 | /* Release the futexes if we got them. */ | | 1532 | /* Release the futexes if we got them. */ |
1371 | if (f2) | | 1533 | if (f2) |
1372 | futex_rele(f2); | | 1534 | futex_rele(f2); |
1373 | if (f) | | 1535 | if (f) |
1374 | futex_rele(f); | | 1536 | futex_rele(f); |
1375 | return error; | | 1537 | return error; |
1376 | } | | 1538 | } |
1377 | | | 1539 | |
| @@ -1515,119 +1677,661 @@ futex_func_wake_op(bool shared, int *uad | | | @@ -1515,119 +1677,661 @@ futex_func_wake_op(bool shared, int *uad |
1515 | int error; | | 1677 | int error; |
1516 | | | 1678 | |
1517 | /* Reject negative number of wakeups. */ | | 1679 | /* Reject negative number of wakeups. */ |
1518 | if (val < 0 || val2 < 0) { | | 1680 | if (val < 0 || val2 < 0) { |
1519 | error = EINVAL; | | 1681 | error = EINVAL; |
1520 | goto out; | | 1682 | goto out; |
1521 | } | | 1683 | } |
1522 | | | 1684 | |
1523 | /* Reject invalid operations before we start doing things. */ | | 1685 | /* Reject invalid operations before we start doing things. */ |
1524 | if ((error = futex_validate_op_cmp(val3)) != 0) | | 1686 | if ((error = futex_validate_op_cmp(val3)) != 0) |
1525 | goto out; | | 1687 | goto out; |
1526 | | | 1688 | |
1527 | /* Look up the first futex, if any. */ | | 1689 | /* Look up the first futex, if any. */ |
1528 | error = futex_lookup(uaddr, shared, &f); | | 1690 | error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f); |
1529 | if (error) | | 1691 | if (error) |
1530 | goto out; | | 1692 | goto out; |
1531 | | | 1693 | |
1532 | /* Look up the second futex, if any. */ | | 1694 | /* Look up the second futex, if any. */ |
1533 | error = futex_lookup(uaddr2, shared, &f2); | | 1695 | error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2); |
1534 | if (error) | | 1696 | if (error) |
1535 | goto out; | | 1697 | goto out; |
1536 | | | 1698 | |
1537 | /* | | 1699 | /* |
1538 | * Under the queue locks: | | 1700 | * Under the op locks: |
1539 | * | | 1701 | * |
1540 | * 1. Read/modify/write: *uaddr2 op= oparg. | | 1702 | * 1. Read/modify/write: *uaddr2 op= oparg. |
1541 | * 2. Unconditionally wake uaddr. | | 1703 | * 2. Unconditionally wake uaddr. |
1542 | * 3. Conditionally wake uaddr2, if it previously matched val2. | | 1704 | * 3. Conditionally wake uaddr2, if it previously matched val3. |
1543 | */ | | 1705 | */ |
1544 | futex_queue_lock2(f, f2); | | 1706 | futex_op_lock2(f, f2); |
1545 | do { | | 1707 | do { |
1546 | error = futex_load(uaddr2, &oldval); | | 1708 | error = futex_load(uaddr2, &oldval); |
1547 | if (error) | | 1709 | if (error) |
1548 | goto out_unlock; | | 1710 | goto out_unlock; |
1549 | newval = futex_compute_op(oldval, val3); | | 1711 | newval = futex_compute_op(oldval, val3); |
1550 | error = ucas_int(uaddr2, oldval, newval, &actual); | | 1712 | error = ucas_int(uaddr2, oldval, newval, &actual); |
1551 | if (error) | | 1713 | if (error) |
1552 | goto out_unlock; | | 1714 | goto out_unlock; |
1553 | } while (actual != oldval); | | 1715 | } while (actual != oldval); |
1554 | nwoken = (f ? futex_wake(f, val, NULL, 0, FUTEX_BITSET_MATCH_ANY) : 0); | | 1716 | nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val, |
| | | 1717 | NULL, FUTEX_WRITERQ, 0, |
| | | 1718 | FUTEX_BITSET_MATCH_ANY) : 0); |
1555 | if (f2 && futex_compute_cmp(oldval, val3)) | | 1719 | if (f2 && futex_compute_cmp(oldval, val3)) |
1556 | nwoken += futex_wake(f2, val2, NULL, 0, | | 1720 | nwoken += futex_wake(f2, FUTEX_WRITERQ, val2, |
1557 | FUTEX_BITSET_MATCH_ANY); | | 1721 | NULL, FUTEX_WRITERQ, 0, |
| | | 1722 | FUTEX_BITSET_MATCH_ANY); |
1558 | | | 1723 | |
1559 | /* Success! */ | | 1724 | /* Success! */ |
1560 | error = 0; | | 1725 | error = 0; |
1561 | out_unlock: | | 1726 | out_unlock: |
1562 | futex_queue_unlock2(f, f2); | | 1727 | futex_op_unlock2(f, f2); |
1563 | | | 1728 | |
1564 | out: | | 1729 | out: |
1565 | /* Return the number of waiters woken. */ | | 1730 | /* Return the number of waiters woken. */ |
1566 | *retval = nwoken; | | 1731 | *retval = nwoken; |
1567 | | | 1732 | |
1568 | /* Release the futexes, if we got them. */ | | 1733 | /* Release the futexes, if we got them. */ |
1569 | if (f2) | | 1734 | if (f2) |
1570 | futex_rele(f2); | | 1735 | futex_rele(f2); |
1571 | if (f) | | 1736 | if (f) |
1572 | futex_rele(f); | | 1737 | futex_rele(f); |
1573 | return error; | | 1738 | return error; |
1574 | } | | 1739 | } |
1575 | | | 1740 | |
1576 | /* | | 1741 | /* |
| | | 1742 | * futex_read_waiter_prio(l) |
| | | 1743 | * |
| | | 1744 | * Returns the read-waiter priority for purposes of comparing |
| | | 1745 | * against a write-waiter's priority. Read-waiters are only |
| | | 1746 | * prioritized if they are real-time threads. |
| | | 1747 | */ |
| | | 1748 | static inline pri_t |
| | | 1749 | futex_read_waiter_prio(struct lwp * const l) |
| | | 1750 | { |
| | | 1751 | const pri_t pri = lwp_eprio(l); |
| | | 1752 | if (__predict_false(pri >= PRI_USER_RT)) |
| | | 1753 | return pri; |
| | | 1754 | return PRI_NONE; |
| | | 1755 | } |
| | | 1756 | |
| | | 1757 | #if 0 /* XXX */ |
| | | 1758 | /* |
| | | 1759 | * futex_rw_handle_rt_reader(f, uaddr, val, pri, errorp) |
| | | 1760 | * |
| | | 1761 | * Attempt to resolve the case of a real-time thread attempting |
| | | 1762 | * to acquire a read lock. Turns true if the attempt is resolved |
| | | 1763 | * and the wait should be elided. |
| | | 1764 | */ |
| | | 1765 | static int __noinline |
| | | 1766 | futex_rw_handle_rt_reader(struct futex *f, int *uaddr, int val, |
| | | 1767 | pri_t pri_reader) |
| | | 1768 | { |
| | | 1769 | struct lwp *l_writer = NULL; |
| | | 1770 | int newval, next; |
| | | 1771 | int error; |
| | | 1772 | |
| | | 1773 | KASSERT(mutex_owned(&f->fx_op_lock)); |
| | | 1774 | |
| | | 1775 | /* |
| | | 1776 | * If the lock is write-locked, there isn't anything we |
| | | 1777 | * can do but wait. |
| | | 1778 | */ |
| | | 1779 | if (val & FUTEX_RW_WRITE_LOCKED) { |
| | | 1780 | return 0; |
| | | 1781 | } |
| | | 1782 | |
| | | 1783 | /* |
| | | 1784 | * If we're maxed out on readers already, nothing we can do. |
| | | 1785 | */ |
| | | 1786 | if ((val & FUTEX_TID_MASK) == FUTEX_TID_MASK) { |
| | | 1787 | return ENFILE; /* distinct from EAGAIN */ |
| | | 1788 | } |
| | | 1789 | |
| | | 1790 | /* |
| | | 1791 | * The assumption then is that we arrived here with WRITE_WANTED |
| | | 1792 | * set. We're not going to bother testing that, however. We |
| | | 1793 | * will preserve it if we see it. |
| | | 1794 | * |
| | | 1795 | * Because WRITE_WANTED is set, this will cause everyone to enter |
| | | 1796 | * the futex to rw_wait. And we are holding the op lock so no |
| | | 1797 | * additional waiters will appear on the sleepq. We are going |
| | | 1798 | * to do the same tricky dance as rw_handoff to let higher-priority |
| | | 1799 | * real-time waiters slip past the gate. |
| | | 1800 | */ |
| | | 1801 | |
| | | 1802 | /* |
| | | 1803 | * All we want to do here is check if there is a writer waiting. |
| | | 1804 | * If there is, and it is equal or greater priority, this reader |
| | | 1805 | * loses, otherwise we will just make note of it to ensure that |
| | | 1806 | * the WRITE_WANTED bit is set when we update the futex word. |
| | | 1807 | * |
| | | 1808 | * Since we are not going to actually wake someone from the |
| | | 1809 | * queue here, we're not really interested if the write-waiter |
| | | 1810 | * happens to leave based on a timeout or signal; we know that |
| | | 1811 | * any write-waiter *after* the first one is of equal or lower |
| | | 1812 | * priority, so we would still best it. |
| | | 1813 | */ |
| | | 1814 | mutex_spin_enter(f->fx_sq_lock); |
| | | 1815 | l_writer = LIST_FIRST(&f->fx_sqs[FUTEX_WRITERQ]); |
| | | 1816 | if (__predict_true(l_writer != NULL)) { |
| | | 1817 | if (pri_reader <= lwp_eprio(l_writer)) { |
| | | 1818 | return 0; |
| | | 1819 | } |
| | | 1820 | } |
| | | 1821 | mutex_spin_exit(f->fx_sq_lock); |
| | | 1822 | |
| | | 1823 | for (;; val = next) { |
| | | 1824 | if (__predict_true(l_writer != NULL)) { |
| | | 1825 | newval = (val + 1) | FUTEX_RW_WRITE_WANTED; |
| | | 1826 | } else { |
| | | 1827 | /* |
| | | 1828 | * No writer waiting; increment the reader |
| | | 1829 | * count, preserve any WRITE_WANTED bit. |
| | | 1830 | */ |
| | | 1831 | newval = val + 1; |
| | | 1832 | KASSERT(((newval ^ val) & FUTEX_RW_WRITE_WANTED) == 0); |
| | | 1833 | } |
| | | 1834 | |
| | | 1835 | error = ucas_int(uaddr, val, newval, &next); |
| | | 1836 | if (__predict_false(error != 0)) { |
| | | 1837 | return error; |
| | | 1838 | } |
| | | 1839 | if (next == val) { |
| | | 1840 | /* Successfully acquired the read lock. */ |
| | | 1841 | return EJUSTRETURN; |
| | | 1842 | } |
| | | 1843 | /* |
| | | 1844 | * Repeat the terminal checks from above on the new |
| | | 1845 | * value. |
| | | 1846 | */ |
| | | 1847 | if (next & FUTEX_RW_WRITE_LOCKED) { |
| | | 1848 | return 0; |
| | | 1849 | } |
| | | 1850 | if ((next & FUTEX_TID_MASK) == FUTEX_TID_MASK) { |
| | | 1851 | return ENFILE; /* distinct from EAGAIN */ |
| | | 1852 | } |
| | | 1853 | } |
| | | 1854 | } |
| | | 1855 | #endif /* XXX */ |
| | | 1856 | |
/*
 * futex_func_rw_wait(uaddr, val, val3, abstime, clkid, retval)
 *
 *	Implement futex(FUTEX_NETBSD_RW_WAIT): block the calling LWP on
 *	the RW-lock futex at uaddr until it is handed the lock by a
 *	FUTEX_NETBSD_RW_HANDOFF, the wait times out, or a signal arrives.
 *	val is the expected futex word; val3 selects the queue
 *	(FUTEX_RW_READER or FUTEX_RW_WRITER).
 *
 *	NOTE(review): the whole body is compiled out via "#if 1" and
 *	returns ENOSYS -- the NETBSD_RW ops are marked broken.
 */
static int
futex_func_rw_wait(bool shared, int *uaddr, int val, int val3,
    const struct timespec *abstime, clockid_t clkid, register_t *retval)
{
#if 1
	/* XXX NETBSD_RW ops are currently broken XXX */
	return ENOSYS;
#else
	struct futex *f;
	int error;

	/* Must specify READER or WRITER. */
	if (__predict_false(val3 != FUTEX_RW_READER &&
	    val3 != FUTEX_RW_WRITER))
		return EINVAL;

	/* Optimistically test before anything else. */
	if (!futex_test(uaddr, val))
		return EAGAIN;

	/* Get the futex, creating it if necessary. */
	error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_RWLOCK, &f);
	if (error)
		return error;
	KASSERT(f);

	/*
	 * Under the op lock, check the value again: if it has
	 * already changed, EAGAIN; otherwise, enqueue the waiter
	 * on the specified queue.
	 */
	futex_op_lock(f);
	if (!futex_test(uaddr, val)) {
		futex_op_unlock(f);
		error = EAGAIN;
		goto out;
	}

	/*
	 * POSIX dictates that a real-time reader will be prioritized
	 * over writers of lesser priority, when normally we would
	 * prefer the writers.
	 */
	if (__predict_false(val3 == FUTEX_RW_READER)) {
		struct lwp * const l = curlwp;
		const pri_t pri_reader = futex_read_waiter_prio(l);
		if (__predict_false(pri_reader > PRI_NONE)) {
			error = futex_rw_handle_rt_reader(f, uaddr, val,
			    pri_reader);
			if (error) {
				if (__predict_true(error == EJUSTRETURN)) {
					/* RT reader acquired the lock. */
					error = 0;
				}
				futex_op_unlock(f);
				goto out;
			}
		}
	}

	/*
	 * Now wait.  futex_wait() will drop our op lock once we
	 * are entered into the sleep queue, thus ensuring atomicity
	 * of wakes with respect to waits.
	 *
	 * Use a wake selector of 0 so that this waiter won't match on any
	 * of the other operations in case someone makes that error; only
	 * rw_handoff is allowed!  This is critical because a waiter that
	 * awakens from an rw_wait assumes it has been given ownership of
	 * the lock.
	 */
	error = futex_wait(f, val3, abstime, clkid, 0);

	/*
	 * Don't drop our reference here.  We won't be requeued, but
	 * it's best to maintain symmetry with other operations.
	 */
	f = NULL;

out:	if (__predict_true(error == 0)) {
		/* Return 0 on success, error on failure. */
		*retval = 0;
	}

	if (f != NULL)
		futex_rele(f);
	return error;
#endif
}
| | | 1951 | |
/*
 * futex_func_rw_handoff(uaddr, val, retval)
 *
 *	Implement futex(FUTEX_NETBSD_RW_HANDOFF).
 *
 *	This implements direct hand-off to either a single writer
 *	or all readers.  On success, *retval is the number of waiters
 *	woken.
 *
 *	NOTE(review): the whole body is compiled out via "#if 1" and
 *	returns ENOSYS -- the NETBSD_RW ops are marked broken.
 */
static int
futex_func_rw_handoff(bool shared, int *uaddr, int val, register_t *retval)
{
#if 1
	/* XXX NETBSD_RW ops are currently broken XXX */
	return ENOSYS;
#else
	struct lwp *l, *l_next, *l_writer, *l_reader;
	struct futex *f;
	sleepq_t wsq, *sq;
	int error = 0;
	int newval, next, nwake, nwoken = 0;
	int allreaders __diagused = 0;
	unsigned int *nwaitersp;
	pri_t pri_writer;

	/* Look up the futex, if any. */
	error = futex_lookup(uaddr, shared, FUTEX_CLASS_RWLOCK, &f);
	if (error)
		goto out;

	/*
	 * There are a couple of different scenarios where a thread
	 * releasing a RW lock would call rw_handoff, yet we find no
	 * waiters:
	 *
	 * ==> There were waiters on the queue, but they left the queue
	 *     due to a signal.
	 * ==> The waiting thread set the waiter bit(s), but decided to
	 *     try spinning before calling rw_wait.
	 *
	 * In both of these cases, we will ensure that the lock word
	 * gets cleared.
	 */

	/* If there's no futex, there are no waiters to wake. */
	if (__predict_false(f == NULL)) {
		/*
		 * If there are no waiters, ensure that the lock
		 * word is cleared.
		 */
		error = ucas_int(uaddr, val, 0, &next);
		if (__predict_true(error == 0)) {
			if (__predict_false(val != next))
				error = EAGAIN;
		}
		goto out;
	}

	/*
	 * Compute the new value and store it in the futex word.
	 *
	 * This is a little tricky because the ucas could cause
	 * a page fault, and we can't let that happen while holding
	 * the sleepq locks.  But we have to make sure that choices
	 * we make regarding what threads to wake is accurately
	 * reflected in the futex word and that the futex word is
	 * updated before the LWPs can run.
	 *
	 * This is easy enough ... we can transfer the LWPs to a private
	 * sleepq to prevent changes in the original sleepq while we have
	 * it unlocked from affecting the results, but we must also
	 * consider that LWPs might be using timed-wait, so we have to
	 * make sure that won't wake the LWP up out from under us if the
	 * timer fires.  To do this, we have to set the "wchan" to NULL
	 * and use a dummy syncobj that takes no action on "unsleep".
	 *
	 * We thus perform the hand-off in three steps, all with the op
	 * lock held:
	 *
	 * ==> Wake selection: sleepq locked, select LWPs to wake,
	 *     compute new futex word.  LWPs are transferred from the
	 *     futex sleepq to the private sleepq and further sedated.
	 *
	 * ==> Futex word update: sleepq unlocked, use a loop around ucas
	 *     to update the futex word.  There are no scenarios where
	 *     user space code can update the futex in a way that would
	 *     impact the work we do here; because we've been asked to do
	 *     the hand-off, any new LWPs attempting to acquire the lock
	 *     would be entering rw_wait by definition (either because
	 *     they're read-lockers and the lock is write-wanted, or they're
	 *     write-lockers and the lock is read-held).  Those new rw_wait
	 *     LWPs will serialize against the op lock.  We DO, however,
	 *     need to preserve any newly-set WANTED bits, hence the ucas
	 *     loop.  Those newly-set WANTED bits, however, will not impact
	 *     our decisions.  Those LWPs have simply lost the race to
	 *     acquire the lock, and we don't consult those bits anyway;
	 *     we instead use the contents of the sleepqs as the truth
	 *     about who is waiting, and no new waiters will appear on
	 *     the sleepqs while we hold the op lock.
	 *
	 * ==> Wake waiters: sleepq locked, run down our private sleepq
	 *     and actually awaken all of the LWPs we selected earlier.
	 *
	 * If for some reason, the ucas fails because the page backing it
	 * was unmapped, all bets are off.  We still awaken the waiters.
	 * This is either a malicious attempt to screw things up, or a
	 * programming error, and we don't care about either one.
	 */
	sleepq_init(&wsq);

	futex_op_lock(f);
	mutex_spin_enter(f->fx_sq_lock);

	/*
	 * STEP 1
	 */

	/*
	 * POSIX dictates that any real-time waiters will acquire the
	 * lock in priority order.  This implies that a real-time
	 * read-waiter has priority over a non-real-time write-waiter,
	 * where we would otherwise prioritize waking the write-waiter.
	 */
	l_writer = LIST_FIRST(&f->fx_sqs[FUTEX_WRITERQ]);
	l_reader = LIST_FIRST(&f->fx_sqs[FUTEX_READERQ]);
	if (__predict_true(l_writer == NULL)) {
		/* We will wake all the readers. */
		sq = &f->fx_sqs[FUTEX_READERQ];
		nwaitersp = &f->fx_nwaiters[FUTEX_READERQ];
		nwake = allreaders = f->fx_nwaiters[FUTEX_READERQ];
		KASSERT(nwake >= 0 && nwake <= FUTEX_TID_MASK);
		if (__predict_false(nwake == 0)) {
			KASSERT(l_reader == NULL);
			newval = 0;
		} else {
			KASSERT(l_reader != NULL);
			newval = nwake;
		}
		l = l_reader;
	} else if (__predict_false(l_reader != NULL &&
		    futex_read_waiter_prio(l_reader) >
		    (pri_writer = lwp_eprio(l_writer)))) {
		/*
		 * Count the number of real-time read-waiters that
		 * exceed the write-waiter's priority.  We will
		 * wake that many (we already know it's at least one).
		 */
		sq = &f->fx_sqs[FUTEX_READERQ];
		nwaitersp = &f->fx_nwaiters[FUTEX_READERQ];
		for (nwake = 1, l = LIST_NEXT(l_reader, l_sleepchain);
		     l != NULL && futex_read_waiter_prio(l) > pri_writer;
		     l = LIST_NEXT(l, l_sleepchain)) {
			nwake++;
		}
		KASSERT(nwake >= 0 && nwake <= FUTEX_TID_MASK);
		/* We know there is at least one write-waiter. */
		newval = nwake | FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED;
		l = l_reader;
	} else {
		/* We will wake one writer. */
		sq = &f->fx_sqs[FUTEX_WRITERQ];
		nwaitersp = &f->fx_nwaiters[FUTEX_WRITERQ];
		nwake = 1;
		newval = FUTEX_RW_WRITE_LOCKED | l_writer->l_lid;
		if (__predict_false(f->fx_nwaiters[FUTEX_WRITERQ] > 1)) {
			KASSERT(LIST_NEXT(l_writer, l_sleepchain) != NULL);
			newval |= FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED;
		} else if (__predict_true(f->fx_nwaiters[FUTEX_READERQ] != 0)) {
			KASSERT(!LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
			newval |= FUTEX_WAITERS;
		} else {
			KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
		}
		l = l_writer;
	}

	KASSERT(sq == &f->fx_sqs[FUTEX_READERQ] ||
	    sq == &f->fx_sqs[FUTEX_WRITERQ]);
	while (nwake != 0 && l != NULL) {
		l_next = LIST_NEXT(l, l_sleepchain);
		KASSERT(l->l_syncobj == &futex_syncobj);
		KASSERT(l->l_wchan == (wchan_t)f);
		KASSERT(l->l_futex == f);
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == f->fx_sq_lock);

		/*
		 * NULL wchan --> timeout will not wake LWP.
		 * NULL lock --> keep existing lock.
		 */
		sleepq_transfer(l, sq, &wsq, NULL/*wchan*/, futex_wmesg,
		    &sched_syncobj, NULL/*lock*/, false);

		KASSERT(*nwaitersp > 0);
		(*nwaitersp)--;
		nwoken++;
		nwake--;
		/* hold count adjusted below */
		l = l_next;
	}

	mutex_spin_exit(f->fx_sq_lock);

	/*
	 * STEP 2
	 */
	for (;; val = next) {
		error = ucas_int(uaddr, val, newval, &next);
		if (__predict_false(error != 0)) {
			/*
			 * The futex word has become unmapped.  All bets
			 * are off.  Break out of the loop and awaken the
			 * waiters; this is easier than trying to stuff
			 * them back into their previous sleepq, and the
			 * application is screwed anyway.
			 */
			break;
		}
		if (__predict_true(next == val)) {
			/*
			 * Successfully updated the futex word!
			 */
			break;
		}
		/*
		 * The only thing that could have possibly happened
		 * (barring some bug in the thread library) is that
		 * additional waiter bits arrived.  Those new waiters
		 * have lost the race to acquire the lock, but we
		 * need to preserve those bits.
		 */
		newval |= next & (FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED);
	}

	mutex_spin_enter(f->fx_sq_lock);

	/*
	 * STEP 3
	 */

	LIST_FOREACH_SAFE(l, &wsq, l_sleepchain, l_next) {
		KASSERT(l->l_syncobj == &sched_syncobj);
		KASSERT(l->l_wchan == NULL);
		KASSERT(l->l_futex == f);
		KASSERT(l->l_sleepq == &wsq);
		KASSERT(l->l_mutex == f->fx_sq_lock);

		l->l_futex_wakesel = 0;
		l->l_futex = NULL;
		sleepq_remove(&wsq, l);
	}
	/* If we woke all-readers, ensure we will wake them all. */
	KASSERT(allreaders == 0 || allreaders == nwoken);
	KASSERT(allreaders == 0 || LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
	KASSERT(allreaders == 0 || f->fx_nwaiters[FUTEX_READERQ] == 0);

	mutex_spin_exit(f->fx_sq_lock);

	/* Adjust hold count. */
	futex_rele_count_not_last(f, nwoken);

	futex_op_unlock(f);

	/* Release the futex. */
	futex_rele(f);

out:	if (__predict_true(error == 0)) {
		/* Return the number of waiters woken. */
		*retval = nwoken;
	}

	/* Success! */
	return error;
#endif
}
| | | 2226 | |
| | | 2227 | /* |
1577 | * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3) | | 2228 | * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3) |
1578 | * | | 2229 | * |
1579 | * Implement the futex system call with all the parameters | | 2230 | * Implement the futex system call with all the parameters |
1580 | * parsed out. | | 2231 | * parsed out. |
1581 | */ | | 2232 | */ |
1582 | int | | 2233 | int |
1583 | do_futex(int *uaddr, int op, int val, const struct timespec *timeout, | | 2234 | do_futex(int *uaddr, int op, int val, const struct timespec *timeout, |
1584 | int *uaddr2, int val2, int val3, register_t *retval) | | 2235 | int *uaddr2, int val2, int val3, register_t *retval) |
1585 | { | | 2236 | { |
1586 | const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true; | | 2237 | const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true; |
1587 | const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME | | 2238 | const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME |
1588 | : CLOCK_MONOTONIC; | | 2239 | : CLOCK_MONOTONIC; |
| | | 2240 | int error; |
1589 | | | 2241 | |
1590 | op &= FUTEX_CMD_MASK; | | 2242 | op &= FUTEX_CMD_MASK; |
1591 | | | 2243 | |
1592 | switch (op) { | | 2244 | switch (op) { |
1593 | case FUTEX_WAIT: | | 2245 | case FUTEX_WAIT: |
1594 | return futex_func_wait(shared, uaddr, val, | | 2246 | SDT_PROBE6(futex, func, wait, entry, |
| | | 2247 | uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout, |
| | | 2248 | TIMER_RELTIME, op & ~FUTEX_CMD_MASK); |
| | | 2249 | error = futex_func_wait(shared, uaddr, val, |
1595 | FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME, | | 2250 | FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME, |
1596 | retval); | | 2251 | retval); |
| | | 2252 | SDT_PROBE1(futex, func, wait, exit, error); |
| | | 2253 | break; |
| | | 2254 | |
| | | 2255 | case FUTEX_WAIT_BITSET: |
| | | 2256 | SDT_PROBE6(futex, func, wait, entry, |
| | | 2257 | uaddr, val, val3, timeout, |
| | | 2258 | TIMER_ABSTIME, op & ~FUTEX_CMD_MASK); |
| | | 2259 | error = futex_func_wait(shared, uaddr, val, val3, timeout, |
| | | 2260 | clkid, TIMER_ABSTIME, retval); |
| | | 2261 | SDT_PROBE1(futex, func, wait, exit, error); |
| | | 2262 | break; |
1597 | | | 2263 | |
1598 | case FUTEX_WAKE: | | 2264 | case FUTEX_WAKE: |
1599 | val3 = FUTEX_BITSET_MATCH_ANY; | | 2265 | SDT_PROBE4(futex, func, wake, entry, |
1600 | /* FALLTHROUGH */ | | 2266 | uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK); |
| | | 2267 | error = futex_func_wake(shared, uaddr, val, |
| | | 2268 | FUTEX_BITSET_MATCH_ANY, retval); |
| | | 2269 | SDT_PROBE2(futex, func, wake, exit, error, *retval); |
| | | 2270 | break; |
| | | 2271 | |
1601 | case FUTEX_WAKE_BITSET: | | 2272 | case FUTEX_WAKE_BITSET: |
1602 | return futex_func_wake(shared, uaddr, val, val3, retval); | | 2273 | SDT_PROBE4(futex, func, wake, entry, |
| | | 2274 | uaddr, val, val3, op & ~FUTEX_CMD_MASK); |
| | | 2275 | error = futex_func_wake(shared, uaddr, val, val3, retval); |
| | | 2276 | SDT_PROBE2(futex, func, wake, exit, error, *retval); |
| | | 2277 | break; |
1603 | | | 2278 | |
1604 | case FUTEX_REQUEUE: | | 2279 | case FUTEX_REQUEUE: |
| | | 2280 | SDT_PROBE5(futex, func, requeue, entry, |
| | | 2281 | uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK); |
| | | 2282 | error = futex_func_requeue(shared, op, uaddr, val, uaddr2, |
| | | 2283 | val2, 0, retval); |
| | | 2284 | SDT_PROBE2(futex, func, requeue, exit, error, *retval); |
| | | 2285 | break; |
| | | 2286 | |
1605 | case FUTEX_CMP_REQUEUE: | | 2287 | case FUTEX_CMP_REQUEUE: |
1606 | return futex_func_requeue(shared, op, uaddr, val, uaddr2, | | 2288 | SDT_PROBE6(futex, func, cmp_requeue, entry, |
| | | 2289 | uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK); |
| | | 2290 | error = futex_func_requeue(shared, op, uaddr, val, uaddr2, |
1607 | val2, val3, retval); | | 2291 | val2, val3, retval); |
1608 | | | 2292 | SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval); |
1609 | case FUTEX_WAIT_BITSET: | | 2293 | break; |
1610 | return futex_func_wait(shared, uaddr, val, val3, timeout, | | | |
1611 | clkid, TIMER_ABSTIME, retval); | | | |
1612 | | | 2294 | |
1613 | case FUTEX_WAKE_OP: | | 2295 | case FUTEX_WAKE_OP: |
1614 | return futex_func_wake_op(shared, uaddr, val, uaddr2, val2, | | 2296 | SDT_PROBE6(futex, func, wake_op, entry, |
| | | 2297 | uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK); |
| | | 2298 | error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2, |
1615 | val3, retval); | | 2299 | val3, retval); |
| | | 2300 | SDT_PROBE2(futex, func, wake_op, exit, error, *retval); |
| | | 2301 | break; |
| | | 2302 | |
| | | 2303 | case FUTEX_NETBSD_RW_WAIT: |
| | | 2304 | SDT_PROBE5(futex, func, rw_wait, entry, |
| | | 2305 | uaddr, val, val3, timeout, op & ~FUTEX_CMD_MASK); |
| | | 2306 | error = futex_func_rw_wait(shared, uaddr, val, val3, |
| | | 2307 | timeout, clkid, retval); |
| | | 2308 | SDT_PROBE1(futex, func, rw_wait, exit, error); |
| | | 2309 | break; |
| | | 2310 | |
| | | 2311 | case FUTEX_NETBSD_RW_HANDOFF: |
| | | 2312 | SDT_PROBE3(futex, func, rw_handoff, entry, |
| | | 2313 | uaddr, val, op & ~FUTEX_CMD_MASK); |
| | | 2314 | error = futex_func_rw_handoff(shared, uaddr, val, retval); |
| | | 2315 | SDT_PROBE2(futex, func, rw_handoff, exit, error, *retval); |
| | | 2316 | break; |
1616 | | | 2317 | |
1617 | case FUTEX_FD: | | 2318 | case FUTEX_FD: |
1618 | default: | | 2319 | default: |
1619 | return ENOSYS; | | 2320 | error = ENOSYS; |
| | | 2321 | break; |
1620 | } | | 2322 | } |
| | | 2323 | |
| | | 2324 | return error; |
1621 | } | | 2325 | } |
1622 | | | 2326 | |
1623 | /* | | 2327 | /* |
1624 | * sys___futex(l, uap, retval) | | 2328 | * sys___futex(l, uap, retval) |
1625 | * | | 2329 | * |
1626 | * __futex(2) system call: generic futex operations. | | 2330 | * __futex(2) system call: generic futex operations. |
1627 | */ | | 2331 | */ |
1628 | int | | 2332 | int |
1629 | sys___futex(struct lwp *l, const struct sys___futex_args *uap, | | 2333 | sys___futex(struct lwp *l, const struct sys___futex_args *uap, |
1630 | register_t *retval) | | 2334 | register_t *retval) |
1631 | { | | 2335 | { |
1632 | /* { | | 2336 | /* { |
1633 | syscallarg(int *) uaddr; | | 2337 | syscallarg(int *) uaddr; |
| @@ -1719,26 +2423,27 @@ sys___futex_get_robust_list(struct lwp * | | | @@ -1719,26 +2423,27 @@ sys___futex_get_robust_list(struct lwp * |
1719 | * | | 2423 | * |
1720 | * Try to release the robust futex at uva in the current process | | 2424 | * Try to release the robust futex at uva in the current process |
1721 | * on lwp exit. If anything goes wrong, silently fail. It is the | | 2425 | * on lwp exit. If anything goes wrong, silently fail. It is the |
1722 | * userland program's obligation to arrange correct behaviour. | | 2426 | * userland program's obligation to arrange correct behaviour. |
1723 | */ | | 2427 | */ |
1724 | static void | | 2428 | static void |
1725 | release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi, | | 2429 | release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi, |
1726 | bool const is_pending) | | 2430 | bool const is_pending) |
1727 | { | | 2431 | { |
1728 | int *uaddr; | | 2432 | int *uaddr; |
1729 | struct futex *f; | | 2433 | struct futex *f; |
1730 | int oldval, newval, actual; | | 2434 | int oldval, newval, actual; |
1731 | int error; | | 2435 | int error; |
| | | 2436 | bool shared; |
1732 | | | 2437 | |
1733 | /* If it's misaligned, tough. */ | | 2438 | /* If it's misaligned, tough. */ |
1734 | if (__predict_false(uptr & 3)) | | 2439 | if (__predict_false(uptr & 3)) |
1735 | return; | | 2440 | return; |
1736 | uaddr = (int *)uptr; | | 2441 | uaddr = (int *)uptr; |
1737 | | | 2442 | |
1738 | error = futex_load(uaddr, &oldval); | | 2443 | error = futex_load(uaddr, &oldval); |
1739 | if (__predict_false(error)) | | 2444 | if (__predict_false(error)) |
1740 | return; | | 2445 | return; |
1741 | | | 2446 | |
1742 | /* | | 2447 | /* |
1743 | * There are two race conditions we need to handle here: | | 2448 | * There are two race conditions we need to handle here: |
1744 | * | | 2449 | * |
| @@ -1749,28 +2454,31 @@ release_futex(uintptr_t const uptr, lwpi | | | @@ -1749,28 +2454,31 @@ release_futex(uintptr_t const uptr, lwpi |
1749 | * 2. Awakened waiter died before being able to acquire | | 2454 | * 2. Awakened waiter died before being able to acquire |
1750 | * the futex in user space. Any other waiters are | | 2455 | * the futex in user space. Any other waiters are |
1751 | * now stuck, oops! | | 2456 | * now stuck, oops! |
1752 | * | | 2457 | * |
1753 | * In both of these cases, the futex word will be 0 (because | | 2458 | * In both of these cases, the futex word will be 0 (because |
1754 | * it's updated before the wake is issued). The best we can | | 2459 | * it's updated before the wake is issued). The best we can |
1755 | * do is detect this situation if it's the pending futex and | | 2460 | * do is detect this situation if it's the pending futex and |
1756 | * issue a wake without modifying the futex word. | | 2461 | * issue a wake without modifying the futex word. |
1757 | * | | 2462 | * |
1758 | * XXX eventual PI handling? | | 2463 | * XXX eventual PI handling? |
1759 | */ | | 2464 | */ |
1760 | if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) { | | 2465 | if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) { |
1761 | register_t retval; | | 2466 | register_t retval; |
1762 | (void) futex_func_wake(/*shared*/true, uaddr, 1, | | 2467 | error = futex_func_wake(/*shared*/true, uaddr, 1, |
1763 | FUTEX_BITSET_MATCH_ANY, &retval); | | 2468 | FUTEX_BITSET_MATCH_ANY, &retval); |
| | | 2469 | if (error != 0 || retval == 0) |
| | | 2470 | (void) futex_func_wake(/*shared*/false, uaddr, 1, |
| | | 2471 | FUTEX_BITSET_MATCH_ANY, &retval); |
1764 | return; | | 2472 | return; |
1765 | } | | 2473 | } |
1766 | | | 2474 | |
1767 | /* Optimistically test whether we need to do anything at all. */ | | 2475 | /* Optimistically test whether we need to do anything at all. */ |
1768 | if ((oldval & FUTEX_TID_MASK) != tid) | | 2476 | if ((oldval & FUTEX_TID_MASK) != tid) |
1769 | return; | | 2477 | return; |
1770 | | | 2478 | |
1771 | /* | | 2479 | /* |
1772 | * We need to handle the case where this thread owned the futex, | | 2480 | * We need to handle the case where this thread owned the futex, |
1773 | * but it was uncontended. In this case, there won't be any | | 2481 | * but it was uncontended. In this case, there won't be any |
1774 | * kernel state to look up. All we can do is mark the futex | | 2482 | * kernel state to look up. All we can do is mark the futex |
1775 | * as a zombie to be mopped up the next time another thread | | 2483 | * as a zombie to be mopped up the next time another thread |
1776 | * attempts to acquire it. | | 2484 | * attempts to acquire it. |
| @@ -1794,70 +2502,74 @@ release_futex(uintptr_t const uptr, lwpi | | | @@ -1794,70 +2502,74 @@ release_futex(uintptr_t const uptr, lwpi |
1794 | if (error) | | 2502 | if (error) |
1795 | return; | | 2503 | return; |
1796 | } while (actual != oldval); | | 2504 | } while (actual != oldval); |
1797 | | | 2505 | |
1798 | /* | | 2506 | /* |
1799 | * If where is still no indication of waiters, then there is | | 2507 | * If where is still no indication of waiters, then there is |
1800 | * no more work for us to do. | | 2508 | * no more work for us to do. |
1801 | */ | | 2509 | */ |
1802 | if ((oldval & FUTEX_WAITERS) == 0) | | 2510 | if ((oldval & FUTEX_WAITERS) == 0) |
1803 | return; | | 2511 | return; |
1804 | } | | 2512 | } |
1805 | | | 2513 | |
1806 | /* | | 2514 | /* |
1807 | * Look for a shared futex since we have no positive indication | | 2515 | * Look for a futex. Try shared first, then private. If we |
1808 | * it is private. If we can't, tough. | | 2516 | * can't fine one, tough. |
1809 | */ | | 2517 | * |
1810 | error = futex_lookup(uaddr, /*shared*/true, &f); | | 2518 | * Note: the assumption here is that anyone placing a futex on |
1811 | if (error) | | 2519 | * the robust list is adhering to the rules, regardless of the |
1812 | return; | | 2520 | * futex class. |
1813 | | | | |
1814 | /* | | | |
1815 | * If there's no kernel state for this futex, there's nothing to | | | |
1816 | * release. | | | |
1817 | */ | | 2521 | */ |
1818 | if (f == NULL) | | 2522 | for (f = NULL, shared = true; f == NULL; shared = false) { |
1819 | return; | | 2523 | error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f); |
| | | 2524 | if (error) |
| | | 2525 | return; |
| | | 2526 | if (f == NULL && shared == false) |
| | | 2527 | return; |
| | | 2528 | } |
1820 | | | 2529 | |
1821 | /* Work under the futex queue lock. */ | | 2530 | /* Work under the futex op lock. */ |
1822 | futex_queue_lock(f); | | 2531 | futex_op_lock(f); |
1823 | | | 2532 | |
1824 | /* | | 2533 | /* |
1825 | * Fetch the word: if the tid doesn't match ours, skip; | | 2534 | * Fetch the word: if the tid doesn't match ours, skip; |
1826 | * otherwise, set the owner-died bit, atomically. | | 2535 | * otherwise, set the owner-died bit, atomically. |
1827 | */ | | 2536 | */ |
1828 | do { | | 2537 | do { |
1829 | error = futex_load(uaddr, &oldval); | | 2538 | error = futex_load(uaddr, &oldval); |
1830 | if (error) | | 2539 | if (error) |
1831 | goto out; | | 2540 | goto out; |
1832 | if ((oldval & FUTEX_TID_MASK) != tid) | | 2541 | if ((oldval & FUTEX_TID_MASK) != tid) |
1833 | goto out; | | 2542 | goto out; |
1834 | newval = oldval | FUTEX_OWNER_DIED; | | 2543 | newval = oldval | FUTEX_OWNER_DIED; |
1835 | error = ucas_int(uaddr, oldval, newval, &actual); | | 2544 | error = ucas_int(uaddr, oldval, newval, &actual); |
1836 | if (error) | | 2545 | if (error) |
1837 | goto out; | | 2546 | goto out; |
1838 | } while (actual != oldval); | | 2547 | } while (actual != oldval); |
1839 | | | 2548 | |
1840 | /* | | 2549 | /* |
1841 | * If there may be waiters, try to wake one. If anything goes | | 2550 | * If there may be waiters, try to wake one. If anything goes |
1842 | * wrong, tough. | | 2551 | * wrong, tough. |
1843 | * | | 2552 | * |
1844 | * XXX eventual PI handling? | | 2553 | * XXX eventual PI handling? |
1845 | */ | | 2554 | */ |
1846 | if (oldval & FUTEX_WAITERS) | | 2555 | if (oldval & FUTEX_WAITERS) { |
1847 | (void)futex_wake(f, 1, NULL, 0, FUTEX_BITSET_MATCH_ANY); | | 2556 | (void)futex_wake(f, FUTEX_WRITERQ, 1, |
| | | 2557 | NULL, FUTEX_WRITERQ, 0, |
| | | 2558 | FUTEX_BITSET_MATCH_ANY); |
| | | 2559 | } |
1848 | | | 2560 | |
1849 | /* Unlock the queue and release the futex. */ | | 2561 | /* Unlock the queue and release the futex. */ |
1850 | out: futex_queue_unlock(f); | | 2562 | out: futex_op_unlock(f); |
1851 | futex_rele(f); | | 2563 | futex_rele(f); |
1852 | } | | 2564 | } |
1853 | | | 2565 | |
1854 | /* | | 2566 | /* |
1855 | * futex_robust_head_lookup(l, lwpid) | | 2567 | * futex_robust_head_lookup(l, lwpid) |
1856 | * | | 2568 | * |
1857 | * Helper function to look up a robust head by LWP ID. | | 2569 | * Helper function to look up a robust head by LWP ID. |
1858 | */ | | 2570 | */ |
1859 | int | | 2571 | int |
1860 | futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp) | | 2572 | futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp) |
1861 | { | | 2573 | { |
1862 | struct proc *p = l->l_proc; | | 2574 | struct proc *p = l->l_proc; |
1863 | | | 2575 | |