Sun Nov 1 15:16:43 2020 UTC ()
Major overhaul of the futex implementation:
- Use sleepqs directly, rather than using condition variables and
  separate wait queues / structures.  By doing this, and using the
  standard mechanism for keeping sleepqs sorted by priority, we
  can ensure that the highest-priority waiters will be awakened,
  rather than naively awakening in FIFO order.  (A minimal userland
  sketch of the kind of lock this serves follows this list.)
- As part of the data structure re-organization, struct lwp gains
  "l_futex" (the futex an LWP is blocked on) and "l_futex_wakesel"
  (the futex wake selector bitset) fields (and loses l___rsvd1).
  Please note the special locking considerations for these fields
  documented in the comments.
- Add the notion of a "futex class".  This is prep work for eventually
  supporting the FUTEX_*_PI operations, as well as some future NetBSD
  extensions to the futex interface.
- Add a preliminary implementation of the first of those NetBSD extensions,
  FUTEX_NETBSD_RW_WAIT and FUTEX_NETBSD_RW_HANDOFF.  These are designed
  to implement reader/writer locks with direct handoff to the
  correct-priority thread(s) (real-time read-waiters need to have
  priority over non-real-time write-waiters).  NOTE: this is currently
  disabled due to a mysterious panic that hasn't yet been tracked down.
- Add some SDT probes to aid in debugging.
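
A minimal userland sketch (not part of this commit) of the kind of
futex-based mutex the kernel side serves, using the common three-state
protocol (0 = unlocked, 1 = locked, 2 = locked with possible waiters).
The futex_wait()/futex_wake() wrappers are hypothetical stand-ins for
the raw futex system call, shown for illustration only.  With the
sleepqs kept sorted by priority, the single wake in the release path
now goes to the highest-priority sleeper instead of the longest-waiting
one.

	#include <stdatomic.h>

	/* Hypothetical wrappers: sleep while *uaddr == expected / wake n. */
	int futex_wait(_Atomic unsigned *uaddr, unsigned expected);
	int futex_wake(_Atomic unsigned *uaddr, int nwake);

	static _Atomic unsigned lock;	/* shared 32-bit futex word */

	void
	lock_acquire(void)
	{
		unsigned c = 0;

		/* Fast path: 0 -> 1 (locked, no waiters). */
		if (atomic_compare_exchange_strong(&lock, &c, 1))
			return;

		/* Slow path: mark "locked, maybe waiters" and sleep. */
		if (c != 2)
			c = atomic_exchange(&lock, 2);
		while (c != 0) {
			futex_wait(&lock, 2);	/* kernel queues us on the sleepq */
			c = atomic_exchange(&lock, 2);
		}
	}

	void
	lock_release(void)
	{
		/* 2 -> 0 means someone may be sleeping in the kernel. */
		if (atomic_exchange(&lock, 0) == 2)
			futex_wake(&lock, 1);
	}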


(thorpej)
diff -r1.11 -r1.11.2.1 src/sys/kern/sys_futex.c
diff -r1.212 -r1.212.2.1 src/sys/sys/lwp.h

cvs diff -r1.11 -r1.11.2.1 src/sys/kern/sys_futex.c

--- src/sys/kern/sys_futex.c 2020/05/05 15:25:18 1.11
+++ src/sys/kern/sys_futex.c 2020/11/01 15:16:43 1.11.2.1
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: sys_futex.c,v 1.11 2020/05/05 15:25:18 riastradh Exp $ */ 1/* $NetBSD: sys_futex.c,v 1.11.2.1 2020/11/01 15:16:43 thorpej Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Taylor R. Campbell and Jason R. Thorpe. 8 * by Taylor R. Campbell and Jason R. Thorpe.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
@@ -20,27 +20,27 @@ @@ -20,27 +20,27 @@
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32#include <sys/cdefs.h> 32#include <sys/cdefs.h>
33__KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.11 2020/05/05 15:25:18 riastradh Exp $"); 33__KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.11.2.1 2020/11/01 15:16:43 thorpej Exp $");
34 34
35/* 35/*
36 * Futexes 36 * Futexes
37 * 37 *
38 * The futex system call coordinates notifying threads waiting for 38 * The futex system call coordinates notifying threads waiting for
39 * changes on a 32-bit word of memory. The word can be managed by 39 * changes on a 32-bit word of memory. The word can be managed by
40 * CPU atomic operations in userland, without system calls, as long 40 * CPU atomic operations in userland, without system calls, as long
41 * as there is no contention. 41 * as there is no contention.
42 * 42 *
43 * The simplest use case demonstrating the utility is: 43 * The simplest use case demonstrating the utility is:
44 * 44 *
45 * // 32-bit word of memory shared among threads or 45 * // 32-bit word of memory shared among threads or
46 * // processes in userland. lock & 1 means owned; 46 * // processes in userland. lock & 1 means owned;
@@ -110,42 +110,98 @@ __KERNEL_RCSID(0, "$NetBSD: sys_futex.c, @@ -110,42 +110,98 @@ __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,
110 * 110 *
111 * futex_tab.va[vmspace, va] for private futexes 111 * futex_tab.va[vmspace, va] for private futexes
112 * futex_tab.oa[uvm_voaddr] for shared futexes 112 * futex_tab.oa[uvm_voaddr] for shared futexes
113 * 113 *
114 * This implementation does not support priority inheritance. 114 * This implementation does not support priority inheritance.
115 */ 115 */
116 116
117#include <sys/types.h> 117#include <sys/types.h>
118#include <sys/atomic.h> 118#include <sys/atomic.h>
119#include <sys/condvar.h> 119#include <sys/condvar.h>
120#include <sys/futex.h> 120#include <sys/futex.h>
121#include <sys/mutex.h> 121#include <sys/mutex.h>
122#include <sys/rbtree.h> 122#include <sys/rbtree.h>
 123#include <sys/sleepq.h>
123#include <sys/queue.h> 124#include <sys/queue.h>
 125#include <sys/sdt.h>
124 126
125#include <sys/syscall.h> 127#include <sys/syscall.h>
126#include <sys/syscallargs.h> 128#include <sys/syscallargs.h>
127#include <sys/syscallvar.h> 129#include <sys/syscallvar.h>
128 130
129#include <uvm/uvm_extern.h> 131#include <uvm/uvm_extern.h>
130 132
131/* 133/*
 134 * DTrace probes.
 135 */
 136SDT_PROVIDER_DEFINE(futex);
 137
 138/* entry: uaddr, val, bitset, timeout, clkflags, fflags */
 139/* exit: error */
 140SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
 141 "struct timespec *", "int", "int");
 142SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
 143
 144/* entry: uaddr, nwake, bitset, fflags */
 145/* exit: error, nwoken */
 146SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
 147SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
 148
 149/* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
 150/* exit: error, nwoken */
 151SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
 152 "int");
 153SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
 154
 155/* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
 156/* exit: error, nwoken */
 157SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
 158 "int", "int", "int");
 159SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
 160
 161/* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
 162/* exit: error, nwoken */
 163SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
 164 "int", "int");
 165SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
 166
 167/* entry: uaddr, val, r/w, abstime, fflags */
 168/* exit: error */
 169SDT_PROBE_DEFINE5(futex, func, rw_wait, entry, "int *", "int", "int",
 170 "struct timespec *", "int");
 171SDT_PROBE_DEFINE1(futex, func, rw_wait, exit, "int");
 172
 173/* entry: uaddr, val, fflags */
 174/* exit: error, nwoken */
 175SDT_PROBE_DEFINE3(futex, func, rw_handoff, entry, "int *", "int", "int");
 176SDT_PROBE_DEFINE2(futex, func, rw_handoff, exit, "int", "int");
 177
 178SDT_PROBE_DEFINE0(futex, wait, finish, normally);
 179SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
 180SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
 181
 182/* entry: timo */
 183/* exit: error */
 184SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
 185SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
 186
 187/*
132 * Lock order: 188 * Lock order:
133 * 189 *
134 * futex_tab.lock 190 * futex_tab.lock
135 * futex::fx_qlock ordered by kva of struct futex 191 * futex::fx_op_lock ordered by kva of struct futex
136 * -> futex_wait::fw_lock only one at a time 192 * -> futex::fx_sq_lock ordered by kva of sleepq lock
137 * futex_wait::fw_lock only one at a time 193 *
138 * -> futex::fx_abortlock only one at a time 194 * N.B. multiple futexes can share a single sleepq lock.
139 */ 195 */
140 196
141/* 197/*
142 * union futex_key 198 * union futex_key
143 * 199 *
144 * A futex is addressed either by a vmspace+va (private) or by 200 * A futex is addressed either by a vmspace+va (private) or by
145 * a uvm_voaddr (shared). 201 * a uvm_voaddr (shared).
146 */ 202 */
147union futex_key { 203union futex_key {
148 struct { 204 struct {
149 struct vmspace *vmspace; 205 struct vmspace *vmspace;
150 vaddr_t va; 206 vaddr_t va;
151 } fk_private; 207 } fk_private;
@@ -155,57 +211,75 @@ union futex_key { @@ -155,57 +211,75 @@ union futex_key {
155/* 211/*
156 * struct futex 212 * struct futex
157 * 213 *
158 * Kernel state for a futex located at a particular address in a 214 * Kernel state for a futex located at a particular address in a
159 * particular virtual address space. 215 * particular virtual address space.
160 * 216 *
161 * N.B. fx_refcnt is an unsigned long because we need to be able 217 * N.B. fx_refcnt is an unsigned long because we need to be able
162 * to operate on it atomically on all systems while at the same 218 * to operate on it atomically on all systems while at the same
163 * time rendering practically impossible the chance of it reaching 219 * time rendering practically impossible the chance of it reaching
164 * its max value. In practice, we're limited by the number of LWPs 220 * its max value. In practice, we're limited by the number of LWPs
165 * that can be present on the system at any given time, and the 221 * that can be present on the system at any given time, and the
166 * assumption is that limit will be good enough on a 32-bit platform. 222 * assumption is that limit will be good enough on a 32-bit platform.
167 * See futex_wake() for why overflow needs to be avoided. 223 * See futex_wake() for why overflow needs to be avoided.
 224 *
 225 * XXX Since futex addresses must be 4-byte aligned, we could
 226 * XXX squirrel away fx_shared and fx_on_tree bits in the "va"
 227 * XXX field of the key. Worth it?
168 */ 228 */
169struct futex { 229struct futex {
170 union futex_key fx_key; 230 union futex_key fx_key;
 231 struct rb_node fx_node;
171 unsigned long fx_refcnt; 232 unsigned long fx_refcnt;
172 bool fx_shared; 233 bool fx_shared;
173 bool fx_on_tree; 234 bool fx_on_tree;
174 struct rb_node fx_node; 235 uint8_t fx_class;
175 
176 kmutex_t fx_qlock; 
177 TAILQ_HEAD(, futex_wait) fx_queue; 
178 236
179 kmutex_t fx_abortlock; 237 kmutex_t fx_op_lock; /* adaptive */
180 LIST_HEAD(, futex_wait) fx_abortlist; 238 kmutex_t * fx_sq_lock; /* &sleepq_locks[...] */
181 kcondvar_t fx_abortcv; 239 sleepq_t fx_sqs[2]; /* 0=reader, 1=writer */
 240 unsigned int fx_nwaiters[2];
182}; 241};
183 242
184/* 243/*
185 * struct futex_wait 244 * futex classes: Some futex operations can only be carried out on
186 * 245 * futexes that are known to be abiding by a certain protocol. These
187 * State for a thread to wait on a futex. Threads wait on fw_cv 246 * classes are assigned to a futex when created due to a wait event,
188 * for fw_bitset to be set to zero. The thread may transition to 247 * and when subsequent wake or requeue operations are issued, the
189 * a different futex queue at any time under the futex's lock. 248 * class is checked at futex lookup time. If the class does not match,
190 */ 249 * EINVAL is the result.
191struct futex_wait { 250 */
192 kmutex_t fw_lock; 251#define FUTEX_CLASS_ANY 0 /* match any class in lookup */
193 kcondvar_t fw_cv; 252#define FUTEX_CLASS_NORMAL 1 /* normal futex */
194 struct futex *fw_futex; 253#define FUTEX_CLASS_RWLOCK 2 /* rwlock futex */
195 TAILQ_ENTRY(futex_wait) fw_entry; /* queue lock */ 254#define FUTEX_CLASS_PI 3 /* for FUTEX_*_PI ops */
196 LIST_ENTRY(futex_wait) fw_abort; /* queue abortlock */ 255
197 int fw_bitset; 256/* sleepq definitions */
198 bool fw_aborting; /* fw_lock */ 257#define FUTEX_READERQ 0
 258#define FUTEX_WRITERQ 1
 259
 260CTASSERT(FUTEX_READERQ == FUTEX_RW_READER);
 261CTASSERT(FUTEX_WRITERQ == FUTEX_RW_WRITER);
 262
 263static const char futex_wmesg[] = "futex";
 264
 265static void futex_unsleep(lwp_t *, bool);
 266
 267static syncobj_t futex_syncobj = {
 268 .sobj_flag = SOBJ_SLEEPQ_SORTED,
 269 .sobj_unsleep = futex_unsleep,
 270 .sobj_changepri = sleepq_changepri,
 271 .sobj_lendpri = sleepq_lendpri,
 272 .sobj_owner = syncobj_noowner,
199}; 273};
200 274
201/* 275/*
202 * futex_tab 276 * futex_tab
203 * 277 *
204 * Global trees of futexes by vmspace/va and VM object address. 278 * Global trees of futexes by vmspace/va and VM object address.
205 * 279 *
206 * XXX This obviously doesn't scale in parallel. We could use a 280 * XXX This obviously doesn't scale in parallel. We could use a
207 * pserialize-safe data structure, but there may be a high cost to 281 * pserialize-safe data structure, but there may be a high cost to
208 * frequent deletion since we don't cache futexes after we're done 282 * frequent deletion since we don't cache futexes after we're done
209 * with them. We could use hashed locks. But for now, just make 283 * with them. We could use hashed locks. But for now, just make
210 * sure userland can't DoS the serial performance, by using a 284 * sure userland can't DoS the serial performance, by using a
211 * balanced binary tree for lookup. 285 * balanced binary tree for lookup.
@@ -269,27 +343,115 @@ compare_futex_shared(void *cookie, const @@ -269,27 +343,115 @@ compare_futex_shared(void *cookie, const
269{ 343{
270 const struct futex *fa = na; 344 const struct futex *fa = na;
271 const struct futex *fb = nb; 345 const struct futex *fb = nb;
272 346
273 return compare_futex_shared_key(cookie, fa, &fb->fx_key); 347 return compare_futex_shared_key(cookie, fa, &fb->fx_key);
274} 348}
275 349
276static const rb_tree_ops_t futex_shared_rb_ops = { 350static const rb_tree_ops_t futex_shared_rb_ops = {
277 .rbto_compare_nodes = compare_futex_shared, 351 .rbto_compare_nodes = compare_futex_shared,
278 .rbto_compare_key = compare_futex_shared_key, 352 .rbto_compare_key = compare_futex_shared_key,
279 .rbto_node_offset = offsetof(struct futex, fx_node), 353 .rbto_node_offset = offsetof(struct futex, fx_node),
280}; 354};
281 355
282static void futex_wait_dequeue(struct futex_wait *, struct futex *); 356/*
 357 * futex_sq_lock2(f, f2)
 358 *
 359 * Acquire the sleepq locks for f and f2, which may be null, or
 360 * which may be the same. If they are distinct, an arbitrary total
 361 * order is chosen on the locks.
 362 *
 363 * Callers should only ever acquire multiple sleepq locks
 364 * simultaneously using futex_sq_lock2.
 365 */
 366static void
 367futex_sq_lock2(struct futex * const f, struct futex * const f2)
 368{
 369
 370 /*
 371 * If both are null, do nothing; if one is null and the other
 372 * is not, lock the other and be done with it.
 373 */
 374 if (f == NULL && f2 == NULL) {
 375 return;
 376 } else if (f == NULL) {
 377 mutex_spin_enter(f2->fx_sq_lock);
 378 return;
 379 } else if (f2 == NULL) {
 380 mutex_spin_enter(f->fx_sq_lock);
 381 return;
 382 }
 383
 384 kmutex_t * const m = f->fx_sq_lock;
 385 kmutex_t * const m2 = f2->fx_sq_lock;
 386
 387 /* If both locks are the same, acquire only one. */
 388 if (m == m2) {
 389 mutex_spin_enter(m);
 390 return;
 391 }
 392
 393 /* Otherwise, use the ordering on the kva of the lock pointer. */
 394 if ((uintptr_t)m < (uintptr_t)m2) {
 395 mutex_spin_enter(m);
 396 mutex_spin_enter(m2);
 397 } else {
 398 mutex_spin_enter(m2);
 399 mutex_spin_enter(m);
 400 }
 401}
 402
 403/*
 404 * futex_sq_unlock2(f, f2)
 405 *
 406 * Release the sleep queue locks for f and f2, which may be null, or
 407 * which may have the same underlying lock.
 408 */
 409static void
 410futex_sq_unlock2(struct futex * const f, struct futex * const f2)
 411{
 412
 413 /*
 414 * If both are null, do nothing; if one is null and the other
 415 * is not, unlock the other and be done with it.
 416 */
 417 if (f == NULL && f2 == NULL) {
 418 return;
 419 } else if (f == NULL) {
 420 mutex_spin_exit(f2->fx_sq_lock);
 421 return;
 422 } else if (f2 == NULL) {
 423 mutex_spin_exit(f->fx_sq_lock);
 424 return;
 425 }
 426
 427 kmutex_t * const m = f->fx_sq_lock;
 428 kmutex_t * const m2 = f2->fx_sq_lock;
 429
 430 /* If both locks are the same, release only one. */
 431 if (m == m2) {
 432 mutex_spin_exit(m);
 433 return;
 434 }
 435
 436 /* Otherwise, use the ordering on the kva of the lock pointer. */
 437 if ((uintptr_t)m < (uintptr_t)m2) {
 438 mutex_spin_exit(m2);
 439 mutex_spin_exit(m);
 440 } else {
 441 mutex_spin_exit(m);
 442 mutex_spin_exit(m2);
 443 }
 444}
283 445
284/* 446/*
285 * futex_load(uaddr, kaddr) 447 * futex_load(uaddr, kaddr)
286 * 448 *
287 * Perform a single atomic load to read *uaddr, and return the 449 * Perform a single atomic load to read *uaddr, and return the
288 * result in *kaddr. Return 0 on success, EFAULT if uaddr is not 450 * result in *kaddr. Return 0 on success, EFAULT if uaddr is not
289 * mapped. 451 * mapped.
290 */ 452 */
291static inline int 453static inline int
292futex_load(int *uaddr, int *kaddr) 454futex_load(int *uaddr, int *kaddr)
293{ 455{
294 return ufetch_int((u_int *)uaddr, (u_int *)kaddr); 456 return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
295} 457}
@@ -302,118 +464,96 @@ futex_load(int *uaddr, int *kaddr) @@ -302,118 +464,96 @@ futex_load(int *uaddr, int *kaddr)
302 */ 464 */
303static bool 465static bool
304futex_test(int *uaddr, int expected) 466futex_test(int *uaddr, int expected)
305{ 467{
306 int val; 468 int val;
307 int error; 469 int error;
308 470
309 error = futex_load(uaddr, &val); 471 error = futex_load(uaddr, &val);
310 if (error) 472 if (error)
311 return false; 473 return false;
312 return val == expected; 474 return val == expected;
313} 475}
314 476
 477static pool_cache_t futex_cache __read_mostly;
 478
 479static int futex_ctor(void *, void *, int);
 480static void futex_dtor(void *, void *);
 481
315/* 482/*
316 * futex_sys_init() 483 * futex_sys_init()
317 * 484 *
318 * Initialize the futex subsystem. 485 * Initialize the futex subsystem.
319 */ 486 */
320void 487void
321futex_sys_init(void) 488futex_sys_init(void)
322{ 489{
323 490
324 mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE); 491 mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
325 rb_tree_init(&futex_tab.va, &futex_rb_ops); 492 rb_tree_init(&futex_tab.va, &futex_rb_ops);
326 rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops); 493 rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
 494
 495 futex_cache = pool_cache_init(sizeof(struct futex),
 496 coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
 497 futex_dtor, NULL);
327} 498}
328 499
329/* 500/*
330 * futex_sys_fini() 501 * futex_sys_fini()
331 * 502 *
332 * Finalize the futex subsystem. 503 * Finalize the futex subsystem.
333 */ 504 */
334void 505void
335futex_sys_fini(void) 506futex_sys_fini(void)
336{ 507{
337 508
338 KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL); 509 KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
339 KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL); 510 KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
340 mutex_destroy(&futex_tab.lock); 511 mutex_destroy(&futex_tab.lock);
341} 512}
342 513
343/* 514/*
344 * futex_queue_init(f) 515 * futex_ctor()
345 * 
346 * Initialize the futex queue. Caller must call futex_queue_fini 
347 * when done. 
348 * 516 *
349 * Never sleeps. 517 * Futex object constructor.
350 */ 518 */
351static void 519static int
352futex_queue_init(struct futex *f) 520futex_ctor(void *arg __unused, void *obj, int flags __unused)
353{ 521{
 522 extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
 523 struct futex * const f = obj;
354 524
355 mutex_init(&f->fx_qlock, MUTEX_DEFAULT, IPL_NONE); 525 mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
356 mutex_init(&f->fx_abortlock, MUTEX_DEFAULT, IPL_NONE); 526 f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
357 cv_init(&f->fx_abortcv, "fqabort"); 
358 LIST_INIT(&f->fx_abortlist); 
359 TAILQ_INIT(&f->fx_queue); 
360} 
361 527
362/* 528 sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
363 * futex_queue_drain(f) 529 sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
364 * 530 f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
365 * Wait for any aborting waiters in f; then empty the queue of 
366 * any stragglers and wake them. Caller must guarantee no new 
367 * references to f. 
368 * 
369 * May sleep. 
370 */ 
371static void 
372futex_queue_drain(struct futex *f) 
373{ 
374 struct futex_wait *fw, *fw_next; 
375 
376 mutex_enter(&f->fx_abortlock); 
377 while (!LIST_EMPTY(&f->fx_abortlist)) 
378 cv_wait(&f->fx_abortcv, &f->fx_abortlock); 
379 mutex_exit(&f->fx_abortlock); 
380 531
381 mutex_enter(&f->fx_qlock); 532 return 0;
382 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { 
383 mutex_enter(&fw->fw_lock); 
384 futex_wait_dequeue(fw, f); 
385 cv_broadcast(&fw->fw_cv); 
386 mutex_exit(&fw->fw_lock); 
387 } 
388 mutex_exit(&f->fx_qlock); 
389} 533}
390 534
391/* 535/*
392 * futex_queue_fini(fq) 536 * futex_dtor()
393 * 537 *
394 * Finalize the futex queue initialized by futex_queue_init. Queue 538 * Futex object destructor.
395 * must be empty. Caller must not use f again until a subsequent 
396 * futex_queue_init. 
397 */ 539 */
398static void 540static void
399futex_queue_fini(struct futex *f) 541futex_dtor(void *arg __unused, void *obj)
400{ 542{
 543 struct futex * const f = obj;
401 544
402 KASSERT(TAILQ_EMPTY(&f->fx_queue)); 545 mutex_destroy(&f->fx_op_lock);
403 KASSERT(LIST_EMPTY(&f->fx_abortlist)); 546 f->fx_sq_lock = NULL;
404 mutex_destroy(&f->fx_qlock); 
405 mutex_destroy(&f->fx_abortlock); 
406 cv_destroy(&f->fx_abortcv); 
407} 547}
408 548
409/* 549/*
410 * futex_key_init(key, vm, va, shared) 550 * futex_key_init(key, vm, va, shared)
411 * 551 *
412 * Initialize a futex key for lookup, etc. 552 * Initialize a futex key for lookup, etc.
413 */ 553 */
414static int 554static int
415futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared) 555futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
416{ 556{
417 int error = 0; 557 int error = 0;
418 558
419 if (__predict_false(shared)) { 559 if (__predict_false(shared)) {
@@ -441,179 +581,191 @@ futex_key_fini(union futex_key *fk, bool @@ -441,179 +581,191 @@ futex_key_fini(union futex_key *fk, bool
441} 581}
442 582
443/* 583/*
444 * futex_create(fk, shared) 584 * futex_create(fk, shared)
445 * 585 *
446 * Create a futex. Initial reference count is 1, representing the 586 * Create a futex. Initial reference count is 1, representing the
447 * caller. Returns NULL on failure. Always takes ownership of the 587 * caller. Returns NULL on failure. Always takes ownership of the
448 * key, either transferring it to the newly-created futex, or releasing 588 * key, either transferring it to the newly-created futex, or releasing
449 * the key if creation fails. 589 * the key if creation fails.
450 * 590 *
451 * Never sleeps for memory, but may sleep to acquire a lock. 591 * Never sleeps for memory, but may sleep to acquire a lock.
452 */ 592 */
453static struct futex * 593static struct futex *
454futex_create(union futex_key *fk, bool shared) 594futex_create(union futex_key *fk, bool shared, uint8_t class)
455{ 595{
456 struct futex *f; 596 struct futex *f;
457 597
458 f = kmem_alloc(sizeof(*f), KM_NOSLEEP); 598 f = pool_cache_get(futex_cache, PR_NOWAIT);
459 if (f == NULL) { 599 if (f == NULL) {
460 futex_key_fini(fk, shared); 600 futex_key_fini(fk, shared);
461 return NULL; 601 return NULL;
462 } 602 }
463 f->fx_key = *fk; 603 f->fx_key = *fk;
464 f->fx_refcnt = 1; 604 f->fx_refcnt = 1;
465 f->fx_shared = shared; 605 f->fx_shared = shared;
466 f->fx_on_tree = false; 606 f->fx_on_tree = false;
467 futex_queue_init(f); 607 f->fx_class = class;
468 608
469 return f; 609 return f;
470} 610}
471 611
472/* 612/*
473 * futex_destroy(f) 613 * futex_destroy(f)
474 * 614 *
475 * Destroy a futex created with futex_create. Reference count 615 * Destroy a futex created with futex_create. Reference count
476 * must be zero. 616 * must be zero.
477 * 617 *
478 * May sleep. 618 * May sleep.
479 */ 619 */
480static void 620static void
481futex_destroy(struct futex *f) 621futex_destroy(struct futex *f)
482{ 622{
483 623
484 ASSERT_SLEEPABLE(); 624 ASSERT_SLEEPABLE();
485 625
486 KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0); 626 KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
487 KASSERT(!f->fx_on_tree); 627 KASSERT(!f->fx_on_tree);
488 628 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
489 /* Drain and destroy the private queue. */ 629 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
490 futex_queue_drain(f); 630 KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
491 futex_queue_fini(f); 631 KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
492 632
493 futex_key_fini(&f->fx_key, f->fx_shared); 633 futex_key_fini(&f->fx_key, f->fx_shared);
494 634
495 kmem_free(f, sizeof(*f)); 635 pool_cache_put(futex_cache, f);
496} 636}
497 637
498/* 638/*
499 * futex_hold(f) 639 * futex_hold_count(f, n)
500 * 640 *
501 * Attempt to acquire a reference to f. Return 0 on success, 641 * Attempt to acquire a reference to f. Return 0 on success,
502 * ENFILE on too many references. 642 * ENFILE on too many references.
503 * 643 *
504 * Never sleeps. 644 * Never sleeps.
505 */ 645 */
506static int 646static int
507futex_hold(struct futex *f) 647futex_hold_count(struct futex *f, unsigned long const count)
508{ 648{
509 unsigned long refcnt; 649 unsigned long refcnt;
510 650
511 do { 651 do {
512 refcnt = atomic_load_relaxed(&f->fx_refcnt); 652 refcnt = atomic_load_relaxed(&f->fx_refcnt);
513 if (refcnt == ULONG_MAX) 653 if (ULONG_MAX - refcnt < count)
514 return ENFILE; 654 return ENFILE;
515 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt + 1) != refcnt); 655 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
 656 refcnt + count) != refcnt);
516 657
517 return 0; 658 return 0;
518} 659}
 660#define futex_hold(f) futex_hold_count(f, 1)
519 661
520/* 662/*
521 * futex_rele(f) 663 * futex_rele_count(f, n)
522 * 664 *
523 * Release a reference to f acquired with futex_create or 665 * Release a reference to f acquired with futex_create or
524 * futex_hold. 666 * futex_hold.
525 * 667 *
526 * May sleep to free f. 668 * May sleep to free f.
527 */ 669 */
528static void 670static void
529futex_rele(struct futex *f) 671futex_rele_count(struct futex *f, unsigned long const count)
530{ 672{
531 unsigned long refcnt; 673 unsigned long refcnt;
532 674
533 ASSERT_SLEEPABLE(); 675 ASSERT_SLEEPABLE();
534 676
535 do { 677 do {
536 refcnt = atomic_load_relaxed(&f->fx_refcnt); 678 refcnt = atomic_load_relaxed(&f->fx_refcnt);
537 if (refcnt == 1) 679 KASSERT(refcnt >= count);
 680 if (refcnt - count == 0)
538 goto trylast; 681 goto trylast;
539 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt); 682 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
 683 refcnt - count) != refcnt);
540 return; 684 return;
541 685
542trylast: 686trylast:
 687 KASSERT(count <= LONG_MAX);
543 mutex_enter(&futex_tab.lock); 688 mutex_enter(&futex_tab.lock);
544 if (atomic_dec_ulong_nv(&f->fx_refcnt) == 0) { 689 if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
545 if (f->fx_on_tree) { 690 if (f->fx_on_tree) {
546 if (__predict_false(f->fx_shared)) 691 if (__predict_false(f->fx_shared))
547 rb_tree_remove_node(&futex_tab.oa, f); 692 rb_tree_remove_node(&futex_tab.oa, f);
548 else 693 else
549 rb_tree_remove_node(&futex_tab.va, f); 694 rb_tree_remove_node(&futex_tab.va, f);
550 f->fx_on_tree = false; 695 f->fx_on_tree = false;
551 } 696 }
552 } else { 697 } else {
553 /* References remain -- don't destroy it. */ 698 /* References remain -- don't destroy it. */
554 f = NULL; 699 f = NULL;
555 } 700 }
556 mutex_exit(&futex_tab.lock); 701 mutex_exit(&futex_tab.lock);
557 if (f != NULL) 702 if (f != NULL)
558 futex_destroy(f); 703 futex_destroy(f);
559} 704}
 705#define futex_rele(f) futex_rele_count(f, 1)
560 706
561/* 707/*
562 * futex_rele_not_last(f) 708 * futex_rele_count_not_last(f, n)
563 * 709 *
564 * Release a reference to f acquired with futex_create or 710 * Release a reference to f acquired with futex_create or
565 * futex_hold. 711 * futex_hold.
566 * 712 *
567 * This version asserts that we are not dropping the last 713 * This version asserts that we are not dropping the last
568 * reference to f. 714 * reference to f.
569 */ 715 */
570static void 716static void
571futex_rele_not_last(struct futex *f) 717futex_rele_count_not_last(struct futex *f, unsigned long const count)
572{ 718{
573 unsigned long refcnt; 719 unsigned long refcnt;
574 720
575 do { 721 do {
576 refcnt = atomic_load_relaxed(&f->fx_refcnt); 722 refcnt = atomic_load_relaxed(&f->fx_refcnt);
577 KASSERT(refcnt > 1); 723 KASSERT(refcnt >= count);
578 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt); 724 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
 725 refcnt - count) != refcnt);
579} 726}
580 727
581/* 728/*
582 * futex_lookup_by_key(key, shared, &f) 729 * futex_lookup_by_key(key, shared, class, &f)
583 * 730 *
584 * Try to find an existing futex va reference in the specified key 731 * Try to find an existing futex va reference in the specified key
585 * On success, return 0, set f to found futex or to NULL if not found, 732 * On success, return 0, set f to found futex or to NULL if not found,
586 * and increment f's reference count if found. 733 * and increment f's reference count if found.
587 * 734 *
588 * Return ENFILE if reference count too high. 735 * Return ENFILE if reference count too high.
589 * 736 *
590 * Internal lookup routine shared by futex_lookup() and 737 * Internal lookup routine shared by futex_lookup() and
591 * futex_lookup_create(). 738 * futex_lookup_create().
592 */ 739 */
593static int 740static int
594futex_lookup_by_key(union futex_key *fk, bool shared, struct futex **fp) 741futex_lookup_by_key(union futex_key *fk, bool shared, uint8_t class,
 742 struct futex **fp)
595{ 743{
596 struct futex *f; 744 struct futex *f;
597 int error = 0; 745 int error = 0;
598 746
599 mutex_enter(&futex_tab.lock); 747 mutex_enter(&futex_tab.lock);
600 if (__predict_false(shared)) { 748 if (__predict_false(shared)) {
601 f = rb_tree_find_node(&futex_tab.oa, fk); 749 f = rb_tree_find_node(&futex_tab.oa, fk);
602 } else { 750 } else {
603 f = rb_tree_find_node(&futex_tab.va, fk); 751 f = rb_tree_find_node(&futex_tab.va, fk);
604 } 752 }
605 if (f) { 753 if (f) {
606 error = futex_hold(f); 754 if (__predict_true(f->fx_class == class ||
 755 class == FUTEX_CLASS_ANY))
 756 error = futex_hold(f);
 757 else
 758 error = EINVAL;
607 if (error) 759 if (error)
608 f = NULL; 760 f = NULL;
609 } 761 }
610 *fp = f; 762 *fp = f;
611 mutex_exit(&futex_tab.lock); 763 mutex_exit(&futex_tab.lock);
612 764
613 return error; 765 return error;
614} 766}
615 767
616/* 768/*
617 * futex_insert(f, fp) 769 * futex_insert(f, fp)
618 * 770 *
619 * Try to insert the futex f into the tree by va. If there 771 * Try to insert the futex f into the tree by va. If there
@@ -644,674 +796,682 @@ futex_insert(struct futex *f, struct fut @@ -644,674 +796,682 @@ futex_insert(struct futex *f, struct fut
644 KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0); 796 KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
645 KASSERT(f0->fx_on_tree); 797 KASSERT(f0->fx_on_tree);
646 error = futex_hold(f0); 798 error = futex_hold(f0);
647 if (error) 799 if (error)
648 goto out; 800 goto out;
649 } 801 }
650 *fp = f0; 802 *fp = f0;
651out: mutex_exit(&futex_tab.lock); 803out: mutex_exit(&futex_tab.lock);
652 804
653 return error; 805 return error;
654} 806}
655 807
656/* 808/*
657 * futex_lookup(uaddr, shared, &f) 809 * futex_lookup(uaddr, shared, class, &f)
658 * 810 *
659 * Find a futex at the userland pointer uaddr in the current 811 * Find a futex at the userland pointer uaddr in the current
660 * process's VM space. On success, return the futex in f and 812 * process's VM space. On success, return the futex in f and
661 * increment its reference count. 813 * increment its reference count.
662 * 814 *
663 * Caller must call futex_rele when done. 815 * Caller must call futex_rele when done.
664 */ 816 */
665static int 817static int
666futex_lookup(int *uaddr, bool shared, struct futex **fp) 818futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
667{ 819{
668 union futex_key fk; 820 union futex_key fk;
669 struct vmspace *vm = curproc->p_vmspace; 821 struct vmspace *vm = curproc->p_vmspace;
670 vaddr_t va = (vaddr_t)uaddr; 822 vaddr_t va = (vaddr_t)uaddr;
671 int error; 823 int error;
672 824
673 /* 825 /*
674 * Reject unaligned user pointers so we don't cross page 826 * Reject unaligned user pointers so we don't cross page
675 * boundaries and so atomics will work. 827 * boundaries and so atomics will work.
676 */ 828 */
677 if ((va & 3) != 0) 829 if (__predict_false((va & 3) != 0))
678 return EINVAL; 830 return EINVAL;
679 831
680 /* Look it up. */ 832 /* Look it up. */
681 error = futex_key_init(&fk, vm, va, shared); 833 error = futex_key_init(&fk, vm, va, shared);
682 if (error) 834 if (error)
683 return error; 835 return error;
684 836
685 error = futex_lookup_by_key(&fk, shared, fp); 837 error = futex_lookup_by_key(&fk, shared, class, fp);
686 futex_key_fini(&fk, shared); 838 futex_key_fini(&fk, shared);
687 if (error) 839 if (error)
688 return error; 840 return error;
689 841
690 KASSERT(*fp == NULL || (*fp)->fx_shared == shared); 842 KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
 843 KASSERT(*fp == NULL || (*fp)->fx_class == class);
691 KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0); 844 KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
692 845
693 /* 846 /*
694 * Success! (Caller must still check whether we found 847 * Success! (Caller must still check whether we found
695 * anything, but nothing went _wrong_ like trying to use 848 * anything, but nothing went _wrong_ like trying to use
696 * unmapped memory.) 849 * unmapped memory.)
697 */ 850 */
698 KASSERT(error == 0); 851 KASSERT(error == 0);
699 852
700 return error; 853 return error;
701} 854}
702 855
703/* 856/*
704 * futex_lookup_create(uaddr, shared, &f) 857 * futex_lookup_create(uaddr, shared, class, &f)
705 * 858 *
706 * Find or create a futex at the userland pointer uaddr in the 859 * Find or create a futex at the userland pointer uaddr in the
707 * current process's VM space. On success, return the futex in f 860 * current process's VM space. On success, return the futex in f
708 * and increment its reference count. 861 * and increment its reference count.
709 * 862 *
710 * Caller must call futex_rele when done. 863 * Caller must call futex_rele when done.
711 */ 864 */
712static int 865static int
713futex_lookup_create(int *uaddr, bool shared, struct futex **fp) 866futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
714{ 867{
715 union futex_key fk; 868 union futex_key fk;
716 struct vmspace *vm = curproc->p_vmspace; 869 struct vmspace *vm = curproc->p_vmspace;
717 struct futex *f = NULL; 870 struct futex *f = NULL;
718 vaddr_t va = (vaddr_t)uaddr; 871 vaddr_t va = (vaddr_t)uaddr;
719 int error; 872 int error;
720 873
721 /* 874 /*
722 * Reject unaligned user pointers so we don't cross page 875 * Reject unaligned user pointers so we don't cross page
723 * boundaries and so atomics will work. 876 * boundaries and so atomics will work.
724 */ 877 */
725 if ((va & 3) != 0) 878 if (__predict_false((va & 3) != 0))
 879 return EINVAL;
 880
 881 if (__predict_false(class == FUTEX_CLASS_ANY))
726 return EINVAL; 882 return EINVAL;
727 883
728 error = futex_key_init(&fk, vm, va, shared); 884 error = futex_key_init(&fk, vm, va, shared);
729 if (error) 885 if (error)
730 return error; 886 return error;
731 887
732 /* 888 /*
733 * Optimistically assume there already is one, and try to find 889 * Optimistically assume there already is one, and try to find
734 * it. 890 * it.
735 */ 891 */
736 error = futex_lookup_by_key(&fk, shared, fp); 892 error = futex_lookup_by_key(&fk, shared, class, fp);
737 if (error || *fp != NULL) { 893 if (error || *fp != NULL) {
738 /* 894 /*
739 * We either found one, or there was an error. 895 * We either found one, or there was an error.
740 * In either case, we are done with the key. 896 * In either case, we are done with the key.
741 */ 897 */
742 futex_key_fini(&fk, shared); 898 futex_key_fini(&fk, shared);
743 goto out; 899 goto out;
744 } 900 }
745 901
746 /* 902 /*
747 * Create a futex record. This transfers ownership of the key 903 * Create a futex record. This transfers ownership of the key
748 * in all cases. 904 * in all cases.
749 */ 905 */
750 f = futex_create(&fk, shared); 906 f = futex_create(&fk, shared, class);
751 if (f == NULL) { 907 if (f == NULL) {
752 error = ENOMEM; 908 error = ENOMEM;
753 goto out; 909 goto out;
754 } 910 }
755 911
756 /* 912 /*
757 * Insert our new futex, or use existing if someone else beat 913 * Insert our new futex, or use existing if someone else beat
758 * us to it. 914 * us to it.
759 */ 915 */
760 error = futex_insert(f, fp); 916 error = futex_insert(f, fp);
761 if (error) 917 if (error)
762 goto out; 918 goto out;
763 if (*fp == f) 919 if (*fp == f)
764 f = NULL; /* don't release on exit */ 920 f = NULL; /* don't release on exit */
765 921
766 /* Success! */ 922 /* Success! */
767 KASSERT(error == 0); 923 KASSERT(error == 0);
768 924
769out: if (f != NULL) 925out: if (f != NULL)
770 futex_rele(f); 926 futex_rele(f);
771 KASSERT(error || *fp != NULL); 927 KASSERT(error || *fp != NULL);
772 KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0); 928 KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
773 return error; 929 return error;
774} 930}
775 931
776/* 932/*
777 * futex_wait_init(fw, bitset) 933 * futex_unsleep:
778 * 
779 * Initialize a record for a thread to wait on a futex matching 
780 * the specified bit set. Should be passed to futex_wait_enqueue 
781 * before futex_wait, and should be passed to futex_wait_fini when 
782 * done. 
783 */ 
784static void 
785futex_wait_init(struct futex_wait *fw, int bitset) 
786{ 
787 
788 KASSERT(bitset); 
789 
790 mutex_init(&fw->fw_lock, MUTEX_DEFAULT, IPL_NONE); 
791 cv_init(&fw->fw_cv, "futex"); 
792 fw->fw_futex = NULL; 
793 fw->fw_bitset = bitset; 
794 fw->fw_aborting = false; 
795} 
796 
797/* 
798 * futex_wait_fini(fw) 
799 * 934 *
800 * Finalize a record for a futex waiter. Must not be on any 935 * Remove an LWP from the futex and sleep queue. This is called when
801 * futex's queue. 936 * the LWP has not been awoken normally but instead interrupted: for
 937 * example, when a signal is received. Must be called with the LWP
 938 * locked. Will unlock if "unlock" is true.
802 */ 939 */
803static void 940static void
804futex_wait_fini(struct futex_wait *fw) 941futex_unsleep(lwp_t *l, bool unlock)
805{ 942{
 943 struct futex *f __diagused;
806 944
807 KASSERT(fw->fw_futex == NULL); 945 f = (struct futex *)(uintptr_t)l->l_wchan;
808 946
809 cv_destroy(&fw->fw_cv); 947 KASSERT(mutex_owned(f->fx_sq_lock));
810 mutex_destroy(&fw->fw_lock); 
811} 
812 948
813/* 949 /* WRITERQ is more likely */
814 * futex_wait_enqueue(fw, f) 950 if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
815 * 951 KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
816 * Put fw on the futex queue. Must be done before futex_wait. 952 f->fx_nwaiters[FUTEX_WRITERQ]--;
817 * Caller must hold fw's lock and f's lock, and fw must not be on 953 } else {
818 * any existing futex's waiter list. 954 KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
819 */ 955 KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
820static void 956 f->fx_nwaiters[FUTEX_READERQ]--;
821futex_wait_enqueue(struct futex_wait *fw, struct futex *f) 957 }
822{ 
823 
824 KASSERT(mutex_owned(&f->fx_qlock)); 
825 KASSERT(mutex_owned(&fw->fw_lock)); 
826 KASSERT(fw->fw_futex == NULL); 
827 KASSERT(!fw->fw_aborting); 
828 
829 fw->fw_futex = f; 
830 TAILQ_INSERT_TAIL(&f->fx_queue, fw, fw_entry); 
831} 
832 
833/* 
834 * futex_wait_dequeue(fw, f) 
835 * 
836 * Remove fw from the futex queue. Precludes subsequent 
837 * futex_wait until a futex_wait_enqueue. Caller must hold fw's 
838 * lock and f's lock, and fw must be on f. 
839 */ 
840static void 
841futex_wait_dequeue(struct futex_wait *fw, struct futex *f) 
842{ 
843 
844 KASSERT(mutex_owned(&f->fx_qlock)); 
845 KASSERT(mutex_owned(&fw->fw_lock)); 
846 KASSERT(fw->fw_futex == f); 
847 958
848 TAILQ_REMOVE(&f->fx_queue, fw, fw_entry); 959 sleepq_unsleep(l, unlock);
849 fw->fw_futex = NULL; 
850} 960}
851 961
852/* 962/*
853 * futex_wait_abort(fw) 963 * futex_wait(f, timeout, clkid, bitset)
854 * 964 *
855 * Caller is no longer waiting for fw. Remove it from any queue 965 * Wait until deadline on the clock clkid, or forever if deadline
856 * if it was on one. Caller must hold fw->fw_lock. 966 * is NULL, for a futex wakeup. Return 0 on explicit wakeup,
 967 * ETIMEDOUT on timeout, EINTR on signal.
857 */ 968 */
858static void 969static int
859futex_wait_abort(struct futex_wait *fw) 970futex_wait(struct futex *f, int q, const struct timespec *deadline,
 971 clockid_t clkid, int bitset)
860{ 972{
861 struct futex *f; 
862 
863 KASSERT(mutex_owned(&fw->fw_lock)); 
864 973
865 /* 974 /*
866 * Grab the futex queue. It can't go away as long as we hold 975 * Some things to note about how this function works:
867 * fw_lock. However, we can't take the queue lock because 976 *
868 * that's a lock order reversal. 977 * ==> When we enter futex_wait(), the futex's op lock is
 978 * held, preventing us from missing wakes. Once we are on
 979 * the futex's sleepq, it is safe to release the op lock.
 980 *
 981 * ==> We have a single early escape to avoid calling
 982 * sleepq_block(): our deadline already having passed.
 983 * If this is a no-timeout wait, or if the deadline has
 984 * not already passed, we are guaranteed to call sleepq_block().
 985 *
 986 * ==> sleepq_block() contains all of the necessary logic
 987 * for handling signals; we do not need to check for them
 988 * ourselves.
 989 *
 990 * ==> Once we have blocked, other futex operations will
 991 * be able to wake us or requeue us. Until we have blocked,
 992 * those other futex operations will not be able to acquire
 993 * the sleepq locks in order to perform those operations,
 994 * even if we have dropped the op lock. Thus, we will not
 995 * miss wakes. This fundamental invariant is relied upon by
 996 * every other synchronization object in the kernel.
 997 *
 998 * ==> sleepq_block() returns for one of three reasons:
 999 *
 1000 * -- We were awakened.
 1001 * -- We were signaled.
 1002 * -- We timed out.
 1003 *
 1004 * ==> Once sleepq_block() returns, we will not call
 1005 * sleepq_block() again.
 1006 *
 1007 * ==> If sleepq_block() returns due to being signaled,
 1008 * we must check to see if we were also awakened. This
 1009 * is to handle the following sequence:
 1010 *
 1011 * -- Futex wake takes us off sleepq, places us on runq.
 1012 * -- We are signaled while sitting on the runq.
 1013 * -- We run, and sleepq_block() notices the signal and
 1014 * returns EINTR/ERESTART.
 1015 *
 1016 * In this situation, we want to indicate a successful wake
 1017 * to the caller, because that wake has been reported to the
 1018 * thread that issued it.
 1019 *
 1020 * Note that it is NOT possible for a wake to be issued after
 1021 * a signal; the LWP will already have been taken off the sleepq
 1022 * by the signal, so the wake will never happen. Note that for
 1023 * this reason, we must *never* return ERESTART, because it could
 1024 * result in us waiting again with no one to awaken us.
869 */ 1025 */
870 f = fw->fw_futex; 
871 1026
872 /* Put us on the abort list so that fq won't go away. */ 1027 struct lwp * const l = curlwp;
873 mutex_enter(&f->fx_abortlock); 1028 int timo;
874 LIST_INSERT_HEAD(&f->fx_abortlist, fw, fw_abort); 1029 int error;
875 mutex_exit(&f->fx_abortlock); 
876 1030
877 /* 1031 /*
878 * Mark fw as aborting so it won't lose wakeups and won't be 1032 * Compute our timeout before taking any locks.
879 * transferred to any other queue. 
880 */ 1033 */
881 fw->fw_aborting = true; 1034 if (deadline) {
 1035 struct timespec ts;
882 1036
883 /* f is now stable, so we can release fw_lock. */ 1037 /* Check our watch. */
884 mutex_exit(&fw->fw_lock); 1038 error = clock_gettime1(clkid, &ts);
 1039 if (error) {
 1040 mutex_exit(&f->fx_op_lock);
 1041 return error;
 1042 }
885 1043
886 /* Now we can remove fw under the queue lock. */ 1044 /*
887 mutex_enter(&f->fx_qlock); 1045 * If we're past the deadline, ETIMEDOUT.
888 mutex_enter(&fw->fw_lock); 1046 */
889 futex_wait_dequeue(fw, f); 1047 if (timespeccmp(deadline, &ts, <=)) {
890 mutex_exit(&fw->fw_lock); 1048 mutex_exit(&f->fx_op_lock);
891 mutex_exit(&f->fx_qlock); 1049 return ETIMEDOUT;
 1050 } else {
 1051 /* Count how much time is left. */
 1052 timespecsub(deadline, &ts, &ts);
 1053 timo = tstohz(&ts);
 1054 if (timo == 0) {
 1055 /*
 1056 * tstohz() already rounds up, and
 1057 * a return value of 0 ticks means
 1058 * "expired now or in the past".
 1059 */
 1060 mutex_exit(&f->fx_op_lock);
 1061 return ETIMEDOUT;
 1062 }
 1063 }
 1064 } else {
 1065 timo = 0;
 1066 }
892 1067
893 /* 1068 /*
894 * Finally, remove us from the abort list and notify anyone 1069 * Acquire in sleepq -> lwp order. While we're at it, ensure
895 * waiting for the abort to complete if we were the last to go. 1070 * that there's room for us to block.
896 */ 1071 */
897 mutex_enter(&f->fx_abortlock); 1072 mutex_spin_enter(f->fx_sq_lock);
898 LIST_REMOVE(fw, fw_abort); 1073 if (__predict_false(q == FUTEX_READERQ &&
899 if (LIST_EMPTY(&f->fx_abortlist)) 1074 f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
900 cv_broadcast(&f->fx_abortcv); 1075 mutex_spin_exit(f->fx_sq_lock);
901 mutex_exit(&f->fx_abortlock); 1076 mutex_exit(&f->fx_op_lock);
 1077 return ENFILE;
 1078 }
 1079 lwp_lock(l);
902 1080
903 /* 1081 /*
904 * Release our reference to the futex now that we are not 1082 * No need for locks here; until we're on a futex's sleepq, these
905 * waiting for it. 1083 * values are private to the LWP, and the locks needed to get onto
 1084 * the sleepq provide the memory barriers needed to ensure visibility.
906 */ 1085 */
907 futex_rele(f); 1086 l->l_futex = f;
 1087 l->l_futex_wakesel = bitset;
908 1088
909 /* 1089 /*
910 * Reacquire the fw lock as caller expects. Verify that we're 1090 * This is an inlined bit of sleepq_enter();
911 * aborting and no longer associated with a futex. 1091 * we already hold the lwp lock.
912 */ 1092 */
913 mutex_enter(&fw->fw_lock); 1093 lwp_unlock_to(l, f->fx_sq_lock);
914 KASSERT(fw->fw_aborting); 1094 KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
915 KASSERT(fw->fw_futex == NULL); 1095 KASSERT(lwp_locked(l, f->fx_sq_lock));
916} 
917 
918/* 
919 * futex_wait(fw, deadline, clkid) 
920 * 
921 * fw must be a waiter on a futex's queue. Wait until deadline on 
922 * the clock clkid, or forever if deadline is NULL, for a futex 
923 * wakeup. Return 0 on explicit wakeup or destruction of futex, 
924 * ETIMEDOUT on timeout, EINTR/ERESTART on signal. Either way, fw 
925 * will no longer be on a futex queue on return. 
926 */ 
927static int 
928futex_wait(struct futex_wait *fw, const struct timespec *deadline, 
929 clockid_t clkid) 
930{ 
931 int error = 0; 
932 1096
933 /* Test and wait under the wait lock. */ 1097 sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
934 mutex_enter(&fw->fw_lock); 1098 f->fx_nwaiters[q]++;
935 1099
936 for (;;) { 1100 /* We can now safely release the op lock. */
937 /* If we're done yet, stop and report success. */ 1101 mutex_exit(&f->fx_op_lock);
938 if (fw->fw_bitset == 0 || fw->fw_futex == NULL) { 
939 error = 0; 
940 break; 
941 } 
942 
943 /* If anything went wrong in the last iteration, stop. */ 
944 if (error) 
945 break; 
946 
947 /* Not done yet. Wait. */ 
948 if (deadline) { 
949 struct timespec ts; 
950 
951 /* Check our watch. */ 
952 error = clock_gettime1(clkid, &ts); 
953 if (error) 
954 break; 
955 
956 /* If we're past the deadline, ETIMEDOUT. */ 
957 if (timespeccmp(deadline, &ts, <=)) { 
958 error = ETIMEDOUT; 
959 break; 
960 } 
961 1102
962 /* Count how much time is left. */ 1103 SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
963 timespecsub(deadline, &ts, &ts); 1104 error = sleepq_block(timo, true);
 1105 SDT_PROBE1(futex, wait, sleepq_block, exit, error);
964 1106
965 /* Wait for that much time, allowing signals. */ 1107 /*
966 error = cv_timedwait_sig(&fw->fw_cv, &fw->fw_lock, 1108 * f is no longer valid; we may have been moved to another
967 tstohz(&ts)); 1109 * futex sleepq while we slept.
968 } else { 1110 */
969 /* Wait indefinitely, allowing signals. */ 
970 error = cv_wait_sig(&fw->fw_cv, &fw->fw_lock); 
971 } 
972 } 
973 1111
974 /* 1112 /*
975 * If we were woken up, the waker will have removed fw from the 1113 * If something went wrong, we may still have a futex reference.
976 * queue. But if anything went wrong, we must remove fw from 1114 * Check for that here and drop it if so. We can do this without
977 * the queue ourselves. While here, convert EWOULDBLOCK to 1115 * taking any locks because l->l_futex is private to the LWP when
978 * ETIMEDOUT. 1116 * not on any futex's sleepq, and we are not on any sleepq because
 1117 * we are running.
 1118 *
 1119 * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
 1120 * EWOULDBLOCK, and ensure the only other error we return is EINTR.
979 */ 1121 */
980 if (error) { 1122 if (error) {
981 futex_wait_abort(fw); 1123 f = l->l_futex;
 1124 if (f != NULL) {
 1125 SDT_PROBE0(futex, wait, finish, aborted);
 1126 l->l_futex = NULL;
 1127 futex_rele(f);
 1128 } else {
 1129 /* Raced with wakeup; report success. */
 1130 SDT_PROBE0(futex, wait, finish, wakerace);
 1131 error = 0;
 1132 }
982 if (error == EWOULDBLOCK) 1133 if (error == EWOULDBLOCK)
983 error = ETIMEDOUT; 1134 error = ETIMEDOUT;
 1135 else if (error != ETIMEDOUT)
 1136 error = EINTR;
 1137 } else {
 1138 KASSERT(l->l_futex == NULL);
 1139 SDT_PROBE0(futex, wait, finish, normally);
984 } 1140 }
985 1141
986 mutex_exit(&fw->fw_lock); 
987 
988 return error; 1142 return error;
989} 1143}
990 1144
991/* 1145/*
992 * futex_wake(f, nwake, f2, nrequeue, bitset) 1146 * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
993 * 1147 *
994 * Wake up to nwake waiters on f matching bitset; then, if f2 is 1148 * Wake up to nwake waiters on f matching bitset; then, if f2 is
995 * provided, move up to nrequeue remaining waiters on f matching 1149 * provided, move up to nrequeue remaining waiters on f matching
996 * bitset to f2. Return the number of waiters actually woken. 1150 * bitset to f2. Return the number of waiters actually woken.
997 * Caller must hold the locks of f and f2, if provided. 1151 * Caller must hold the locks of f and f2, if provided.
998 */ 1152 */
999static unsigned 1153static unsigned
1000futex_wake(struct futex *f, unsigned nwake, struct futex *f2, 1154futex_wake(struct futex *f, int q, unsigned int const nwake,
1001 unsigned nrequeue, int bitset) 1155 struct futex *f2, int q2, unsigned int const nrequeue,
 1156 int bitset)
1002{ 1157{
1003 struct futex_wait *fw, *fw_next; 1158 struct lwp *l, *l_next;
1004 unsigned nwoken = 0; 1159 unsigned nwoken = 0;
1005 int hold_error __diagused; 1160 unsigned nrequeued = 0;
 1161
 1162 KASSERT(mutex_owned(&f->fx_op_lock));
 1163 KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
1006 1164
1007 KASSERT(mutex_owned(&f->fx_qlock)); 1165 futex_sq_lock2(f, f2);
1008 KASSERT(f2 == NULL || mutex_owned(&f2->fx_qlock)); 
1009 1166
1010 /* Wake up to nwake waiters, and count the number woken. */ 1167 /* Wake up to nwake waiters, and count the number woken. */
1011 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { 1168 LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
1012 if ((fw->fw_bitset & bitset) == 0) 1169 if (nwoken == nwake)
1013 continue; 
1014 if (nwake > 0) { 
1015 mutex_enter(&fw->fw_lock); 
1016 if (__predict_false(fw->fw_aborting)) { 
1017 mutex_exit(&fw->fw_lock); 
1018 continue; 
1019 } 
1020 futex_wait_dequeue(fw, f); 
1021 fw->fw_bitset = 0; 
1022 cv_broadcast(&fw->fw_cv); 
1023 mutex_exit(&fw->fw_lock); 
1024 nwake--; 
1025 nwoken++; 
1026 /* 
1027 * Drop the futex reference on behalf of the 
1028 * waiter. We assert this is not the last 
1029 * reference on the futex (our caller should 
1030 * also have one). 
1031 */ 
1032 futex_rele_not_last(f); 
1033 } else { 
1034 break; 1170 break;
1035 } 1171
 1172 KASSERT(l->l_syncobj == &futex_syncobj);
 1173 KASSERT(l->l_wchan == (wchan_t)f);
 1174 KASSERT(l->l_futex == f);
 1175 KASSERT(l->l_sleepq == &f->fx_sqs[q]);
 1176 KASSERT(l->l_mutex == f->fx_sq_lock);
 1177
 1178 if ((l->l_futex_wakesel & bitset) == 0)
 1179 continue;
 1180
 1181 l->l_futex_wakesel = 0;
 1182 l->l_futex = NULL;
 1183 sleepq_remove(&f->fx_sqs[q], l);
 1184 f->fx_nwaiters[q]--;
 1185 nwoken++;
 1186 /* hold counts adjusted in bulk below */
1036 } 1187 }
1037 1188
1038 if (f2) { 1189 if (nrequeue) {
 1190 KASSERT(f2 != NULL);
1039 /* Move up to nrequeue waiters from f's queue to f2's queue. */ 1191 /* Move up to nrequeue waiters from f's queue to f2's queue. */
1040 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { 1192 LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
1041 if ((fw->fw_bitset & bitset) == 0) 1193 if (nrequeued == nrequeue)
1042 continue; 
1043 if (nrequeue > 0) { 
1044 mutex_enter(&fw->fw_lock); 
1045 if (__predict_false(fw->fw_aborting)) { 
1046 mutex_exit(&fw->fw_lock); 
1047 continue; 
1048 } 
1049 futex_wait_dequeue(fw, f); 
1050 futex_wait_enqueue(fw, f2); 
1051 mutex_exit(&fw->fw_lock); 
1052 nrequeue--; 
1053 /* 
1054 * Transfer the reference from f to f2. 
1055 * As above, we assert that we are not 
1056 * dropping the last reference to f here. 
1057 * 
1058 * XXX futex_hold() could theoretically 
1059 * XXX fail here. 
1060 */ 
1061 futex_rele_not_last(f); 
1062 hold_error = futex_hold(f2); 
1063 KASSERT(hold_error == 0); 
1064 } else { 
1065 break; 1194 break;
1066 } 1195
 1196 KASSERT(l->l_syncobj == &futex_syncobj);
 1197 KASSERT(l->l_wchan == (wchan_t)f);
 1198 KASSERT(l->l_futex == f);
 1199 KASSERT(l->l_sleepq == &f->fx_sqs[q]);
 1200 KASSERT(l->l_mutex == f->fx_sq_lock);
 1201
 1202 if ((l->l_futex_wakesel & bitset) == 0)
 1203 continue;
 1204
 1205 l->l_futex = f2;
 1206 sleepq_transfer(l, &f->fx_sqs[q],
 1207 &f2->fx_sqs[q2], f2, futex_wmesg,
 1208 &futex_syncobj, f2->fx_sq_lock, true);
 1209 f->fx_nwaiters[q]--;
 1210 f2->fx_nwaiters[q2]++;
 1211 nrequeued++;
 1212 /* hold counts adjusted in bulk below */
 1213 }
 1214 if (nrequeued) {
 1215 /* XXX futex_hold() could theoretically fail here. */
 1216 int hold_error __diagused =
 1217 futex_hold_count(f2, nrequeued);
 1218 KASSERT(hold_error == 0);
1067 } 1219 }
1068 } else { 
1069 KASSERT(nrequeue == 0); 
1070 } 1220 }
1071 1221
1072 /* Return the number of waiters woken. */ 1222 /*
1073 return nwoken; 1223 * Adjust the futex reference counts for the wakes and
 1224 * requeues.
 1225 */
 1226 KASSERT(nwoken + nrequeued <= LONG_MAX);
 1227 futex_rele_count_not_last(f, nwoken + nrequeued);
 1228
 1229 futex_sq_unlock2(f, f2);
 1230
 1231 /* Return the number of waiters woken and requeued. */
 1232 return nwoken + nrequeued;
1074} 1233}
1075 1234
1076/* 1235/*
1077 * futex_queue_lock(f) 1236 * futex_op_lock(f)
1078 * 1237 *
1079 * Acquire the queue lock of f. Pair with futex_queue_unlock. Do 1238 * Acquire the op lock of f. Pair with futex_op_unlock. Do
1080 * not use if caller needs to acquire two locks; use 1239 * not use if caller needs to acquire two locks; use
1081 * futex_queue_lock2 instead. 1240 * futex_op_lock2 instead.
1082 */ 1241 */
1083static void 1242static void
1084futex_queue_lock(struct futex *f) 1243futex_op_lock(struct futex *f)
1085{ 1244{
1086 mutex_enter(&f->fx_qlock); 1245 mutex_enter(&f->fx_op_lock);
1087} 1246}
1088 1247
1089/* 1248/*
1090 * futex_queue_unlock(f) 1249 * futex_op_unlock(f)
1091 * 1250 *
1092 * Release the queue lock of f. 1251 * Release the op lock of f.
1093 */ 1252 */
1094static void 1253static void
1095futex_queue_unlock(struct futex *f) 1254futex_op_unlock(struct futex *f)
1096{ 1255{
1097 mutex_exit(&f->fx_qlock); 1256 mutex_exit(&f->fx_op_lock);
1098} 1257}
1099 1258
1100/* 1259/*
1101 * futex_queue_lock2(f, f2) 1260 * futex_op_lock2(f, f2)
1102 * 1261 *
1103 * Acquire the queue locks of both f and f2, which may be null, or 1262 * Acquire the op locks of both f and f2, which may be null, or
1104 * which may have the same underlying queue. If they are 1263 * which may be the same. If they are distinct, an arbitrary total
1105 * distinct, an arbitrary total order is chosen on the locks. 1264 * order is chosen on the locks.
1106 * 1265 *
1107 * Callers should only ever acquire multiple queue locks 1266 * Callers should only ever acquire multiple op locks
1108 * simultaneously using futex_queue_lock2. 1267 * simultaneously using futex_op_lock2.
1109 */ 1268 */
1110static void 1269static void
1111futex_queue_lock2(struct futex *f, struct futex *f2) 1270futex_op_lock2(struct futex *f, struct futex *f2)
1112{ 1271{
1113 1272
1114 /* 1273 /*
1115 * If both are null, do nothing; if one is null and the other 1274 * If both are null, do nothing; if one is null and the other
1116 * is not, lock the other and be done with it. 1275 * is not, lock the other and be done with it.
1117 */ 1276 */
1118 if (f == NULL && f2 == NULL) { 1277 if (f == NULL && f2 == NULL) {
1119 return; 1278 return;
1120 } else if (f == NULL) { 1279 } else if (f == NULL) {
1121 mutex_enter(&f2->fx_qlock); 1280 mutex_enter(&f2->fx_op_lock);
1122 return; 1281 return;
1123 } else if (f2 == NULL) { 1282 } else if (f2 == NULL) {
1124 mutex_enter(&f->fx_qlock); 1283 mutex_enter(&f->fx_op_lock);
1125 return; 1284 return;
1126 } 1285 }
1127 1286
1128 /* If both futexes are the same, acquire only one. */ 1287 /* If both futexes are the same, acquire only one. */
1129 if (f == f2) { 1288 if (f == f2) {
1130 mutex_enter(&f->fx_qlock); 1289 mutex_enter(&f->fx_op_lock);
1131 return; 1290 return;
1132 } 1291 }
1133 1292
1134 /* Otherwise, use the ordering on the kva of the futex pointer. */ 1293 /* Otherwise, use the ordering on the kva of the futex pointer. */
1135 if ((uintptr_t)f < (uintptr_t)f2) { 1294 if ((uintptr_t)f < (uintptr_t)f2) {
1136 mutex_enter(&f->fx_qlock); 1295 mutex_enter(&f->fx_op_lock);
1137 mutex_enter(&f2->fx_qlock); 1296 mutex_enter(&f2->fx_op_lock);
1138 } else { 1297 } else {
1139 mutex_enter(&f2->fx_qlock); 1298 mutex_enter(&f2->fx_op_lock);
1140 mutex_enter(&f->fx_qlock); 1299 mutex_enter(&f->fx_op_lock);
1141 } 1300 }
1142} 1301}
1143 1302
1144/* 1303/*
1145 * futex_queue_unlock2(f, f2) 1304 * futex_op_unlock2(f, f2)
1146 * 1305 *
1147 * Release the queue locks of both f and f2, which may be null, or 1306 * Release the op locks of both f and f2, which may be null, or
1148 * which may have the same underlying queue. 1307 * which may be the same.
1149 */ 1308 */
1150static void 1309static void
1151futex_queue_unlock2(struct futex *f, struct futex *f2) 1310futex_op_unlock2(struct futex *f, struct futex *f2)
1152{ 1311{
1153 1312
1154 /* 1313 /*
1155 * If both are null, do nothing; if one is null and the other 1314 * If both are null, do nothing; if one is null and the other
1156 * is not, unlock the other and be done with it. 1315 * is not, unlock the other and be done with it.
1157 */ 1316 */
1158 if (f == NULL && f2 == NULL) { 1317 if (f == NULL && f2 == NULL) {
1159 return; 1318 return;
1160 } else if (f == NULL) { 1319 } else if (f == NULL) {
1161 mutex_exit(&f2->fx_qlock); 1320 mutex_exit(&f2->fx_op_lock);
1162 return; 1321 return;
1163 } else if (f2 == NULL) { 1322 } else if (f2 == NULL) {
1164 mutex_exit(&f->fx_qlock); 1323 mutex_exit(&f->fx_op_lock);
1165 return; 1324 return;
1166 } 1325 }
1167 1326
1168 /* If both futexes are the same, release only one. */ 1327 /* If both futexes are the same, release only one. */
1169 if (f == f2) { 1328 if (f == f2) {
1170 mutex_exit(&f->fx_qlock); 1329 mutex_exit(&f->fx_op_lock);
1171 return; 1330 return;
1172 } 1331 }
1173 1332
1174 /* Otherwise, use the ordering on the kva of the futex pointer. */ 1333 /* Otherwise, use the ordering on the kva of the futex pointer. */
1175 if ((uintptr_t)f < (uintptr_t)f2) { 1334 if ((uintptr_t)f < (uintptr_t)f2) {
1176 mutex_exit(&f2->fx_qlock); 1335 mutex_exit(&f2->fx_op_lock);
1177 mutex_exit(&f->fx_qlock); 1336 mutex_exit(&f->fx_op_lock);
1178 } else { 1337 } else {
1179 mutex_exit(&f->fx_qlock); 1338 mutex_exit(&f->fx_op_lock);
1180 mutex_exit(&f2->fx_qlock); 1339 mutex_exit(&f2->fx_op_lock);
1181 } 1340 }
1182} 1341}
1183 1342
1184/* 1343/*
1185 * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval) 1344 * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
1186 * 1345 *
1187 * Implement futex(FUTEX_WAIT). 1346 * Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
1188 */ 1347 */
1189static int 1348static int
1190futex_func_wait(bool shared, int *uaddr, int val, int val3, 1349futex_func_wait(bool shared, int *uaddr, int val, int val3,
1191 const struct timespec *timeout, clockid_t clkid, int clkflags, 1350 const struct timespec *timeout, clockid_t clkid, int clkflags,
1192 register_t *retval) 1351 register_t *retval)
1193{ 1352{
1194 struct futex *f; 1353 struct futex *f;
1195 struct futex_wait wait, *fw = &wait; 
1196 struct timespec ts; 1354 struct timespec ts;
1197 const struct timespec *deadline; 1355 const struct timespec *deadline;
1198 int error; 1356 int error;
1199 1357
1200 /* 1358 /*
1201 * If there's nothing to wait for, and nobody will ever wake 1359 * If there's nothing to wait for, and nobody will ever wake
1202 * us, then don't set anything up to wait -- just stop here. 1360 * us, then don't set anything up to wait -- just stop here.
1203 */ 1361 */
1204 if (val3 == 0) 1362 if (val3 == 0)
1205 return EINVAL; 1363 return EINVAL;
1206 1364
1207 /* Optimistically test before anything else. */ 1365 /* Optimistically test before anything else. */
1208 if (!futex_test(uaddr, val)) 1366 if (!futex_test(uaddr, val))
1209 return EAGAIN; 1367 return EAGAIN;
1210 1368
1211 /* Determine a deadline on the specified clock. */ 1369 /* Determine a deadline on the specified clock. */
1212 if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) { 1370 if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
1213 deadline = timeout; 1371 deadline = timeout;
1214 } else { 1372 } else {
 1373 struct timespec interval = *timeout;
 1374
 1375 error = itimespecfix(&interval);
 1376 if (error)
 1377 return error;
1215 error = clock_gettime1(clkid, &ts); 1378 error = clock_gettime1(clkid, &ts);
1216 if (error) 1379 if (error)
1217 return error; 1380 return error;
1218 timespecadd(&ts, timeout, &ts); 1381 timespecadd(&ts, &interval, &ts);
1219 deadline = &ts; 1382 deadline = &ts;
1220 } 1383 }
1221 1384
1222 /* Get the futex, creating it if necessary. */ 1385 /* Get the futex, creating it if necessary. */
1223 error = futex_lookup_create(uaddr, shared, &f); 1386 error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1224 if (error) 1387 if (error)
1225 return error; 1388 return error;
1226 KASSERT(f); 1389 KASSERT(f);
1227 1390
1228 /* Get ready to wait. */ 
1229 futex_wait_init(fw, val3); 
1230 
1231 /* 1391 /*
1232 * Under the queue lock, check the value again: if it has 1392 * Under the op lock, check the value again: if it has
1233 * already changed, EAGAIN; otherwise enqueue the waiter. 1393 * already changed, EAGAIN; otherwise enqueue the waiter.
1234 * Since FUTEX_WAKE will use the same lock and be done after 1394 * Since FUTEX_WAKE will use the same lock and be done after
1235 * modifying the value, the order in which we check and enqueue 1395 * modifying the value, the order in which we check and enqueue
1236 * is immaterial. 1396 * is immaterial.
1237 */ 1397 */
1238 futex_queue_lock(f); 1398 futex_op_lock(f);
1239 if (!futex_test(uaddr, val)) { 1399 if (!futex_test(uaddr, val)) {
1240 futex_queue_unlock(f); 1400 futex_op_unlock(f);
1241 error = EAGAIN; 1401 error = EAGAIN;
1242 goto out; 1402 goto out;
1243 } 1403 }
1244 mutex_enter(&fw->fw_lock); 1404
1245 futex_wait_enqueue(fw, f); 1405 /*
1246 mutex_exit(&fw->fw_lock); 1406 * Now wait. futex_wait() will drop our op lock once we
1247 futex_queue_unlock(f); 1407 * are entered into the sleep queue, thus ensuring atomicity
 1408 * of wakes with respect to waits.
 1409 */
 1410 error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
1248 1411
1249 /* 1412 /*
1250 * We cannot drop our reference to the futex here, because 1413 * We cannot drop our reference to the futex here, because
1251 * we might be enqueued on a different one when we are awakened. 1414 * we might be enqueued on a different one when we are awakened.
1252 * The references will be managed on our behalf in the requeue 1415 * The references will be managed on our behalf in the requeue,
1253 * and wake cases. 1416 * wake, and error cases.
1254 */ 1417 */
1255 f = NULL; 1418 f = NULL;
1256 1419
1257 /* Wait. */ 1420 if (__predict_true(error == 0)) {
1258 error = futex_wait(fw, deadline, clkid); 1421 /* Return 0 on success, error on failure. */
1259 if (error) 1422 *retval = 0;
1260 goto out; 1423 }
1261 
1262 /* Return 0 on success, error on failure. */ 
1263 *retval = 0; 
1264 1424
1265out: if (f != NULL) 1425out: if (f != NULL)
1266 futex_rele(f); 1426 futex_rele(f);
1267 futex_wait_fini(fw); 
1268 return error; 1427 return error;
1269} 1428}
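
For orientation, the wait path above is only the slow half of a user-space primitive: the fast path runs entirely on the futex word, and FUTEX_WAIT is issued only once the word already advertises contention, so the re-check under the op lock can catch a racing unlock. The sketch below shows a conventional three-state mutex lock slow path built on this operation. It is illustrative only: the __futex() shim (argument order as in do_futex(), assumed to return 0 or an errno) and the UNLOCKED/LOCKED/CONTENDED values are assumptions, not part of this change.

#include <sys/atomic.h>		/* atomic_cas_uint(), atomic_swap_uint() */
#include <sys/futex.h>		/* FUTEX_WAIT, FUTEX_WAKE, FUTEX_PRIVATE_FLAG */
#include <time.h>		/* struct timespec */

/* Hypothetical thin shim over the __futex(2) system call; returns an errno. */
int __futex(int *, int, int, const struct timespec *, int *, int, int);

#define UNLOCKED	0	/* free */
#define LOCKED		1	/* held, no waiters */
#define CONTENDED	2	/* held, waiters may be sleeping */

static void
futex_mutex_lock(volatile unsigned int *word)
{
	unsigned int c;

	/* Fast path: take the lock if it is free. */
	c = atomic_cas_uint(word, UNLOCKED, LOCKED);
	if (c == UNLOCKED)
		return;

	/* Slow path: mark the lock contended, then sleep on the word. */
	if (c != CONTENDED)
		c = atomic_swap_uint(word, CONTENDED);
	while (c != UNLOCKED) {
		/* The kernel re-checks *word == CONTENDED before sleeping. */
		(void)__futex((int *)word, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
		    CONTENDED, NULL, NULL, 0, 0);
		c = atomic_swap_uint(word, CONTENDED);
	}
}

The matching unlock path, which decides whether a FUTEX_WAKE is needed at all, is sketched after futex_func_wake() below.
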
1270 1429
1271/* 1430/*
1272 * futex_func_wake(uaddr, val, val3, retval) 1431 * futex_func_wake(uaddr, val, val3, retval)
1273 * 1432 *
1274 * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET). 1433 * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
1275 */ 1434 */
1276static int 1435static int
1277futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval) 1436futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
1278{ 1437{
1279 struct futex *f; 1438 struct futex *f;
1280 unsigned int nwoken = 0; 1439 unsigned int nwoken = 0;
1281 int error = 0; 1440 int error = 0;
1282 1441
1283 /* Reject negative number of wakeups. */ 1442 /* Reject negative number of wakeups. */
1284 if (val < 0) { 1443 if (val < 0) {
1285 error = EINVAL; 1444 error = EINVAL;
1286 goto out; 1445 goto out;
1287 } 1446 }
1288 1447
1289 /* Look up the futex, if any. */ 1448 /* Look up the futex, if any. */
1290 error = futex_lookup(uaddr, shared, &f); 1449 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1291 if (error) 1450 if (error)
1292 goto out; 1451 goto out;
1293 1452
1294 /* If there's no futex, there are no waiters to wake. */ 1453 /* If there's no futex, there are no waiters to wake. */
1295 if (f == NULL) 1454 if (f == NULL)
1296 goto out; 1455 goto out;
1297 1456
1298 /* 1457 /*
1299 * Under f's queue lock, wake the waiters and remember the 1458 * Under f's op lock, wake the waiters and remember the
1300 * number woken. 1459 * number woken.
1301 */ 1460 */
1302 futex_queue_lock(f); 1461 futex_op_lock(f);
1303 nwoken = futex_wake(f, val, NULL, 0, val3); 1462 nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1304 futex_queue_unlock(f); 1463 NULL, FUTEX_WRITERQ, 0, val3);
 1464 futex_op_unlock(f);
1305 1465
1306 /* Release the futex. */ 1466 /* Release the futex. */
1307 futex_rele(f); 1467 futex_rele(f);
1308 1468
1309out: 1469out:
1310 /* Return the number of waiters woken. */ 1470 /* Return the number of waiters woken. */
1311 *retval = nwoken; 1471 *retval = nwoken;
1312 1472
1313 /* Success! */ 1473 /* Success! */
1314 return error; 1474 return error;
1315} 1475}
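
The wake side pairs with the previous sketch: the unlocker publishes the new word value first and only enters the kernel when the old value said waiters may exist, which is exactly the ordering the op-lock re-check in futex_func_wait() relies on. Again illustrative only, reusing the hypothetical __futex() shim and the UNLOCKED/CONTENDED values assumed above.

static void
futex_mutex_unlock(volatile unsigned int *word)
{
	/*
	 * Release in user space first; the previous value tells us
	 * whether anyone could be sleeping in FUTEX_WAIT.
	 */
	if (atomic_swap_uint(word, UNLOCKED) == CONTENDED) {
		(void)__futex((int *)word, FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
		    1, NULL, NULL, 0, 0);
	}
}
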
1316 1476
1317/* 1477/*
@@ -1324,54 +1484,56 @@ futex_func_requeue(bool shared, int op,  @@ -1324,54 +1484,56 @@ futex_func_requeue(bool shared, int op,
1324 int val2, int val3, register_t *retval) 1484 int val2, int val3, register_t *retval)
1325{ 1485{
1326 struct futex *f = NULL, *f2 = NULL; 1486 struct futex *f = NULL, *f2 = NULL;
1327 unsigned nwoken = 0; /* default to zero woken on early return */ 1487 unsigned nwoken = 0; /* default to zero woken on early return */
1328 int error; 1488 int error;
1329 1489
1330 /* Reject negative number of wakeups or requeues. */ 1490 /* Reject negative number of wakeups or requeues. */
1331 if (val < 0 || val2 < 0) { 1491 if (val < 0 || val2 < 0) {
1332 error = EINVAL; 1492 error = EINVAL;
1333 goto out; 1493 goto out;
1334 } 1494 }
1335 1495
1336 /* Look up the source futex, if any. */ 1496 /* Look up the source futex, if any. */
1337 error = futex_lookup(uaddr, shared, &f); 1497 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1338 if (error) 1498 if (error)
1339 goto out; 1499 goto out;
1340 1500
1341 /* If there is none, nothing to do. */ 1501 /* If there is none, nothing to do. */
1342 if (f == NULL) 1502 if (f == NULL)
1343 goto out; 1503 goto out;
1344 1504
1345 /* 1505 /*
1346 * We may need to create the destination futex because it's 1506 * We may need to create the destination futex because it's
1347 * entirely possible it does not currently have any waiters. 1507 * entirely possible it does not currently have any waiters.
1348 */ 1508 */
1349 error = futex_lookup_create(uaddr2, shared, &f2); 1509 error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1350 if (error) 1510 if (error)
1351 goto out; 1511 goto out;
1352 1512
1353 /* 1513 /*
1354 * Under the futexes' queue locks, check the value; if 1514 * Under the futexes' op locks, check the value; if
1355 * unchanged from val3, wake the waiters. 1515 * unchanged from val3, wake the waiters.
1356 */ 1516 */
1357 futex_queue_lock2(f, f2); 1517 futex_op_lock2(f, f2);
1358 if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) { 1518 if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
1359 error = EAGAIN; 1519 error = EAGAIN;
1360 } else { 1520 } else {
1361 error = 0; 1521 error = 0;
1362 nwoken = futex_wake(f, val, f2, val2, FUTEX_BITSET_MATCH_ANY); 1522 nwoken = futex_wake(f, FUTEX_WRITERQ, val,
 1523 f2, FUTEX_WRITERQ, val2,
 1524 FUTEX_BITSET_MATCH_ANY);
1363 } 1525 }
1364 futex_queue_unlock2(f, f2); 1526 futex_op_unlock2(f, f2);
1365 1527
1366out: 1528out:
1367 /* Return the number of waiters woken. */ 1529 /* Return the number of waiters woken. */
1368 *retval = nwoken; 1530 *retval = nwoken;
1369 1531
1370 /* Release the futexes if we got them. */ 1532 /* Release the futexes if we got them. */
1371 if (f2) 1533 if (f2)
1372 futex_rele(f2); 1534 futex_rele(f2);
1373 if (f) 1535 if (f)
1374 futex_rele(f); 1536 futex_rele(f);
1375 return error; 1537 return error;
1376} 1538}
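
FUTEX_CMP_REQUEUE exists mainly as thundering-herd avoidance for condition variables: a broadcast wakes one waiter and migrates the rest onto the mutex word, so they are released one at a time as the mutex is handed around. A hedged sketch of that call shape follows, reusing the hypothetical __futex() shim from the earlier sketch; the condvar/mutex words and the sequence value are assumptions for illustration.

#include <limits.h>	/* INT_MAX */

/*
 * Wake one waiter on the condvar word and requeue the rest onto the
 * mutex word.  val3 carries the sequence value the caller last read;
 * EAGAIN from the kernel means the condvar changed and the broadcast
 * should be retried.
 */
static int
cv_broadcast_requeue(int *cv_word, int *mtx_word, int expected_seq)
{
	return __futex(cv_word, FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG,
	    1, NULL, mtx_word, INT_MAX, expected_seq);
}
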
1377 1539
@@ -1515,119 +1677,661 @@ futex_func_wake_op(bool shared, int *uad @@ -1515,119 +1677,661 @@ futex_func_wake_op(bool shared, int *uad
1515 int error; 1677 int error;
1516 1678
1517 /* Reject negative number of wakeups. */ 1679 /* Reject negative number of wakeups. */
1518 if (val < 0 || val2 < 0) { 1680 if (val < 0 || val2 < 0) {
1519 error = EINVAL; 1681 error = EINVAL;
1520 goto out; 1682 goto out;
1521 } 1683 }
1522 1684
1523 /* Reject invalid operations before we start doing things. */ 1685 /* Reject invalid operations before we start doing things. */
1524 if ((error = futex_validate_op_cmp(val3)) != 0) 1686 if ((error = futex_validate_op_cmp(val3)) != 0)
1525 goto out; 1687 goto out;
1526 1688
1527 /* Look up the first futex, if any. */ 1689 /* Look up the first futex, if any. */
1528 error = futex_lookup(uaddr, shared, &f); 1690 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1529 if (error) 1691 if (error)
1530 goto out; 1692 goto out;
1531 1693
1532 /* Look up the second futex, if any. */ 1694 /* Look up the second futex, if any. */
1533 error = futex_lookup(uaddr2, shared, &f2); 1695 error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1534 if (error) 1696 if (error)
1535 goto out; 1697 goto out;
1536 1698
1537 /* 1699 /*
1538 * Under the queue locks: 1700 * Under the op locks:
1539 * 1701 *
1540 * 1. Read/modify/write: *uaddr2 op= oparg. 1702 * 1. Read/modify/write: *uaddr2 op= oparg.
1541 * 2. Unconditionally wake uaddr. 1703 * 2. Unconditionally wake uaddr.
1542 * 3. Conditionally wake uaddr2, if it previously matched val2. 1704 * 3. Conditionally wake uaddr2, if it previously matched val3.
1543 */ 1705 */
1544 futex_queue_lock2(f, f2); 1706 futex_op_lock2(f, f2);
1545 do { 1707 do {
1546 error = futex_load(uaddr2, &oldval); 1708 error = futex_load(uaddr2, &oldval);
1547 if (error) 1709 if (error)
1548 goto out_unlock; 1710 goto out_unlock;
1549 newval = futex_compute_op(oldval, val3); 1711 newval = futex_compute_op(oldval, val3);
1550 error = ucas_int(uaddr2, oldval, newval, &actual); 1712 error = ucas_int(uaddr2, oldval, newval, &actual);
1551 if (error) 1713 if (error)
1552 goto out_unlock; 1714 goto out_unlock;
1553 } while (actual != oldval); 1715 } while (actual != oldval);
1554 nwoken = (f ? futex_wake(f, val, NULL, 0, FUTEX_BITSET_MATCH_ANY) : 0); 1716 nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
 1717 NULL, FUTEX_WRITERQ, 0,
 1718 FUTEX_BITSET_MATCH_ANY) : 0);
1555 if (f2 && futex_compute_cmp(oldval, val3)) 1719 if (f2 && futex_compute_cmp(oldval, val3))
1556 nwoken += futex_wake(f2, val2, NULL, 0, 1720 nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
1557 FUTEX_BITSET_MATCH_ANY); 1721 NULL, FUTEX_WRITERQ, 0,
 1722 FUTEX_BITSET_MATCH_ANY);
1558 1723
1559 /* Success! */ 1724 /* Success! */
1560 error = 0; 1725 error = 0;
1561out_unlock: 1726out_unlock:
1562 futex_queue_unlock2(f, f2); 1727 futex_op_unlock2(f, f2);
1563 1728
1564out: 1729out:
1565 /* Return the number of waiters woken. */ 1730 /* Return the number of waiters woken. */
1566 *retval = nwoken; 1731 *retval = nwoken;
1567 1732
1568 /* Release the futexes, if we got them. */ 1733 /* Release the futexes, if we got them. */
1569 if (f2) 1734 if (f2)
1570 futex_rele(f2); 1735 futex_rele(f2);
1571 if (f) 1736 if (f)
1572 futex_rele(f); 1737 futex_rele(f);
1573 return error; 1738 return error;
1574} 1739}
1575 1740
1576/* 1741/*
 1742 * futex_read_waiter_prio(l)
 1743 *
 1744 * Returns the read-waiter priority for purposes of comparing
 1745 * against a write-waiter's priority. Read-waiters are only
 1746 * prioritized if they are real-time threads.
 1747 */
 1748static inline pri_t
 1749futex_read_waiter_prio(struct lwp * const l)
 1750{
 1751 const pri_t pri = lwp_eprio(l);
 1752 if (__predict_false(pri >= PRI_USER_RT))
 1753 return pri;
 1754 return PRI_NONE;
 1755}
 1756
 1757#if 0 /* XXX */
 1758/*
 1759 * futex_rw_handle_rt_reader(f, uaddr, val, pri_reader)
 1760 *
 1761 * Attempt to resolve the case of a real-time thread attempting
 1762 * to acquire a read lock. Returns nonzero if the attempt is resolved
 1763 * and the wait should be elided.
 1764 */
 1765static int __noinline
 1766futex_rw_handle_rt_reader(struct futex *f, int *uaddr, int val,
 1767 pri_t pri_reader)
 1768{
 1769 struct lwp *l_writer = NULL;
 1770 int newval, next;
 1771 int error;
 1772
 1773 KASSERT(mutex_owned(&f->fx_op_lock));
 1774
 1775 /*
 1776 * If the lock is write-locked, there isn't anything we
 1777 * can do but wait.
 1778 */
 1779 if (val & FUTEX_RW_WRITE_LOCKED) {
 1780 return 0;
 1781 }
 1782
 1783 /*
 1784 * If we're maxed out on readers already, nothing we can do.
 1785 */
 1786 if ((val & FUTEX_TID_MASK) == FUTEX_TID_MASK) {
 1787 return ENFILE; /* distinct from EAGAIN */
 1788 }
 1789
 1790 /*
 1791 * The assumption then is that we arrived here with WRITE_WANTED
 1792 * set. We're not going to bother testing that, however. We
 1793 * will preserve it if we see it.
 1794 *
 1795 * Because WRITE_WANTED is set, everyone entering the futex will
 1796 * go to rw_wait. And since we are holding the op lock, no
 1797 * additional waiters will appear on the sleepq. We are going
 1798 * to do the same tricky dance as rw_handoff to let higher-priority
 1799 * real-time waiters slip past the gate.
 1800 */
 1801
 1802 /*
 1803 * All we want to do here is check if there is a writer waiting.
 1804 * If there is, and it is of equal or greater priority, this reader
 1805 * loses; otherwise we will just make note of it to ensure that
 1806 * the WRITE_WANTED bit is set when we update the futex word.
 1807 *
 1808 * Since we are not going to actually wake someone from the
 1809 * queue here, we're not really interested if the write-waiter
 1810 * happens to leave based on a timeout or signal; we know that
 1811 * any write-waiter *after* the first one is of equal or lower
 1812 * priority, so we would still best it.
 1813 */
 1814 mutex_spin_enter(f->fx_sq_lock);
 1815 l_writer = LIST_FIRST(&f->fx_sqs[FUTEX_WRITERQ]);
 1816 if (__predict_true(l_writer != NULL) &&
 1817 pri_reader <= lwp_eprio(l_writer)) {
 1818 mutex_spin_exit(f->fx_sq_lock);
 1819 return 0;
 1820 }
 1821 mutex_spin_exit(f->fx_sq_lock);
 1822
 1823 for (;; val = next) {
 1824 if (__predict_true(l_writer != NULL)) {
 1825 newval = (val + 1) | FUTEX_RW_WRITE_WANTED;
 1826 } else {
 1827 /*
 1828 * No writer waiting; increment the reader
 1829 * count, preserve any WRITE_WANTED bit.
 1830 */
 1831 newval = val + 1;
 1832 KASSERT(((newval ^ val) & FUTEX_RW_WRITE_WANTED) == 0);
 1833 }
 1834
 1835 error = ucas_int(uaddr, val, newval, &next);
 1836 if (__predict_false(error != 0)) {
 1837 return error;
 1838 }
 1839 if (next == val) {
 1840 /* Successfully acquired the read lock. */
 1841 return EJUSTRETURN;
 1842 }
 1843 /*
 1844 * Repeat the terminal checks from above on the new
 1845 * value.
 1846 */
 1847 if (next & FUTEX_RW_WRITE_LOCKED) {
 1848 return 0;
 1849 }
 1850 if ((next & FUTEX_TID_MASK) == FUTEX_TID_MASK) {
 1851 return ENFILE; /* distinct from EAGAIN */
 1852 }
 1853 }
 1854}
 1855#endif /* XXX */
 1856
 1857/*
 1858 * futex_func_rw_wait(uaddr, val, val3, abstime, clkid, retval)
 1859 *
 1860 * Implement futex(FUTEX_NETBSD_RW_WAIT).
 1861 */
 1862static int
 1863futex_func_rw_wait(bool shared, int *uaddr, int val, int val3,
 1864 const struct timespec *abstime, clockid_t clkid, register_t *retval)
 1865{
 1866#if 1
 1867 /* XXX NETBSD_RW ops are currently broken XXX */
 1868 return ENOSYS;
 1869#else
 1870 struct futex *f;
 1871 int error;
 1872
 1873 /* Must specify READER or WRITER. */
 1874 if (__predict_false(val3 != FUTEX_RW_READER &&
 1875 val3 != FUTEX_RW_WRITER))
 1876 return EINVAL;
 1877
 1878 /* Optimistically test before anything else. */
 1879 if (!futex_test(uaddr, val))
 1880 return EAGAIN;
 1881
 1882 /* Get the futex, creating it if necessary. */
 1883 error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_RWLOCK, &f);
 1884 if (error)
 1885 return error;
 1886 KASSERT(f);
 1887
 1888 /*
 1889 * Under the op lock, check the value again: if it has
 1890 * already changed, EAGAIN; otherwise, enqueue the waiter
 1891 * on the specified queue.
 1892 */
 1893 futex_op_lock(f);
 1894 if (!futex_test(uaddr, val)) {
 1895 futex_op_unlock(f);
 1896 error = EAGAIN;
 1897 goto out;
 1898 }
 1899
 1900 /*
 1901 * POSIX dictates that a real-time reader will be prioritized
 1902 * over writers of lesser priority, when normally we would
 1903 * prefer the writers.
 1904 */
 1905 if (__predict_false(val3 == FUTEX_RW_READER)) {
 1906 struct lwp * const l = curlwp;
 1907 const pri_t pri_reader = futex_read_waiter_prio(l);
 1908 if (__predict_false(pri_reader > PRI_NONE)) {
 1909 error = futex_rw_handle_rt_reader(f, uaddr, val,
 1910 pri_reader);
 1911 if (error) {
 1912 if (__predict_true(error == EJUSTRETURN)) {
 1913 /* RT reader acquired the lock. */
 1914 error = 0;
 1915 }
 1916 futex_op_unlock(f);
 1917 goto out;
 1918 }
 1919 }
 1920 }
 1921
 1922 /*
 1923 * Now wait. futex_wait() will drop our op lock once we
 1924 * are entered into the sleep queue, thus ensuring atomicity
 1925 * of wakes with respect to waits.
 1926 *
 1927 * Use a wake selector of 0 so that this waiter won't match on any
 1928 * of the other operations in case someone makes that error; only
 1929 * rw_handoff is allowed! This is critical because a waiter that
 1930 * awakens from an rw_wait assumes it has been given ownership of
 1931 * the lock.
 1932 */
 1933 error = futex_wait(f, val3, abstime, clkid, 0);
 1934
 1935 /*
 1936 * Don't drop our reference here. We won't be requeued, but
 1937 * it's best to maintain symmetry with the other operations.
 1938 */
 1939 f = NULL;
 1940
 1941out: if (__predict_true(error == 0)) {
 1942 /* Return 0 on success, error on failure. */
 1943 *retval = 0;
 1944 }
 1945
 1946 if (f != NULL)
 1947 futex_rele(f);
 1948 return error;
 1949#endif
 1950}
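
Although the operation is disabled above, the surrounding comments outline the intended protocol: the futex word counts readers in FUTEX_TID_MASK, and a thread that cannot join the read side hands its last observed word value to FUTEX_NETBSD_RW_WAIT, trusting rw_handoff to grant it the lock before waking it. A speculative sketch of a read acquire built that way follows; the fast path, the elided waiter-bit maintenance, and the __futex() shim are all assumptions, since the final user-space protocol is not part of this change.

#include <errno.h>	/* EAGAIN */

static int
rwlock_rdlock_user(volatile unsigned int *word)
{
	unsigned int v;
	int error;

	for (;;) {
		v = *word;
		if ((v & (FUTEX_RW_WRITE_LOCKED|FUTEX_RW_WRITE_WANTED)) == 0) {
			/* Fast path: bump the reader count in FUTEX_TID_MASK. */
			if (atomic_cas_uint(word, v, v + 1) == v)
				return 0;
			continue;
		}
		/*
		 * A real implementation would also have to set FUTEX_WAITERS
		 * here so the owner knows to call rw_handoff; that step is
		 * elided.  On a 0 return the kernel has already made us an
		 * owner via rw_handoff.
		 */
		error = __futex((int *)word,
		    FUTEX_NETBSD_RW_WAIT | FUTEX_PRIVATE_FLAG,
		    (int)v, NULL, NULL, 0, FUTEX_RW_READER);
		if (error == 0)
			return 0;
		if (error != EAGAIN)
			return error;
		/* EAGAIN: the word changed underneath us; re-evaluate. */
	}
}
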
 1951
 1952/*
 1953 * futex_func_rw_handoff(uaddr, val, retval)
 1954 *
 1955 * Implement futex(FUTEX_NETBSD_RW_HANDOFF).
 1956 *
 1957 * This implements direct hand-off to a single writer, to a set
 1958 * of higher-priority real-time readers, or to all readers.
 1959 */
 1960static int
 1961futex_func_rw_handoff(bool shared, int *uaddr, int val, register_t *retval)
 1962{
 1963#if 1
 1964 /* XXX NETBSD_RW ops are currently broken XXX */
 1965 return ENOSYS;
 1966#else
 1967 struct lwp *l, *l_next, *l_writer, *l_reader;
 1968 struct futex *f;
 1969 sleepq_t wsq, *sq;
 1970 int error = 0;
 1971 int newval, next, nwake, nwoken = 0;
 1972 int allreaders __diagused = 0;
 1973 unsigned int *nwaitersp;
 1974 pri_t pri_writer;
 1975
 1976 /* Look up the futex, if any. */
 1977 error = futex_lookup(uaddr, shared, FUTEX_CLASS_RWLOCK, &f);
 1978 if (error)
 1979 goto out;
 1980
 1981 /*
 1982 * There are a couple of different scenarios where a thread
 1983 * releasing a RW lock would call rw_handoff, yet we find no
 1984 * waiters:
 1985 *
 1986 * ==> There were waiters on the queue, but they left the queue
 1987 * due to a signal.
 1988 * ==> The waiting thread set the waiter bit(s), but decided to
 1989 * try spinning before calling rw_wait.
 1990 *
 1991 * In both of these cases, we will ensure that the lock word
 1992 * gets cleared.
 1993 */
 1994
 1995 /* If there's no futex, there are no waiters to wake. */
 1996 if (__predict_false(f == NULL)) {
 1997 /*
 1998 * If there are no waiters, ensure that the lock
 1999 * word is cleared.
 2000 */
 2001 error = ucas_int(uaddr, val, 0, &next);
 2002 if (__predict_true(error == 0)) {
 2003 if (__predict_false(val != next))
 2004 error = EAGAIN;
 2005 }
 2006 goto out;
 2007 }
 2008
 2009 /*
 2010 * Compute the new value and store it in the futex word.
 2011 *
 2012 * This is a little tricky because the ucas could cause
 2013 * a page fault, and we can't let that happen while holding
 2014 * the sleepq locks. But we have to make sure that choices
 2015 * we make regarding what threads to wake is accurately
 2016 * reflected in the futex word and that the futex word is
 2017 * updated before the LWPs can run.
 2018 *
 2019 * This is easy enough ... we can transfer the LWPs to a private
 2020 * sleepq to prevent changes in the original sleepq while we have
 2021 * it unlocked from affecting the results, but we must also
 2022 * consider that LWPs might be using timed-wait, so we have to
 2023 * make sure that won't wake the LWP up out from under us if the
 2024 * timer fires. To do this, we have to set the "wchan" to NULL
 2025 * and use a dummy syncobj that takes no action on "unsleep".
 2026 *
 2027 * We thus perform the hand-off in three steps, all with the op
 2028 * lock held:
 2029 *
 2030 * ==> Wake selection: sleepq locked, select LWPs to wake,
 2031 * compute new futex word. LWPs are transferred from the
 2032 * futex sleepq to the private sleepq and further sedated.
 2033 *
 2034 * ==> Futex word update: sleepq unlocked, use a loop around ucas
 2035 * to update the futex word. There are no scenarios where
 2036 * user space code can update the futex in a way that would
 2037 * impact the work we do here; because we've been asked to do
 2038 * the hand-off, any new LWPs attempting to acquire the lock
 2039 * would be entering rw_wait by definition (either because
 2040 * they're read-lockers and the lock is write-wanted, or they're
 2041 * write-lockers and the lock is read-held). Those new rw_wait
 2042 * LWPs will serialize against the op lock. We DO, however,
 2043 * need to preserve any newly-set WANTED bits, hence the ucas
 2044 * loop. Those newly-set WANTED bits, however, will not impact
 2045 * our decisions. Those LWPs have simply lost the race to
 2046 * acquire the lock, and we don't consult those bits anyway;
 2047 * we instead use the contents of the sleepqs as the truth
 2048 * about who is waiting, and no new waiters will appear on
 2049 * the sleepqs while we hold the op lock.
 2050 *
 2051 * ==> Wake waiters: sleepq locked, run down our private sleepq
 2052 * and actually awaken all of the LWPs we selected earlier.
 2053 *
 2054 * If, for some reason, the ucas fails because the page backing it
 2055 * was unmapped, all bets are off. We still awaken the waiters.
 2056 * This is either a malicious attempt to screw things up, or a
 2057 * programming error, and we don't care about either one.
 2058 */
 2059 sleepq_init(&wsq);
 2060
 2061 futex_op_lock(f);
 2062 mutex_spin_enter(f->fx_sq_lock);
 2063
 2064 /*
 2065 * STEP 1
 2066 */
 2067
 2068 /*
 2069 * POSIX dictates that any real-time waiters will acquire the
 2070 * lock in priority order. This implies that a real-time
 2071 * read-waiter has priority over a non-real-time write-waiter,
 2072 * where we would otherwise prioritize waking the write-waiter.
 2073 */
 2074 l_writer = LIST_FIRST(&f->fx_sqs[FUTEX_WRITERQ]);
 2075 l_reader = LIST_FIRST(&f->fx_sqs[FUTEX_READERQ]);
 2076 if (__predict_true(l_writer == NULL)) {
 2077 /* We will wake all the readers. */
 2078 sq = &f->fx_sqs[FUTEX_READERQ];
 2079 nwaitersp = &f->fx_nwaiters[FUTEX_READERQ];
 2080 nwake = allreaders = f->fx_nwaiters[FUTEX_READERQ];
 2081 KASSERT(nwake >= 0 && nwake <= FUTEX_TID_MASK);
 2082 if (__predict_false(nwake == 0)) {
 2083 KASSERT(l_reader == NULL);
 2084 newval = 0;
 2085 } else {
 2086 KASSERT(l_reader != NULL);
 2087 newval = nwake;
 2088 }
 2089 l = l_reader;
 2090 } else if (__predict_false(l_reader != NULL &&
 2091 futex_read_waiter_prio(l_reader) >
 2092 (pri_writer = lwp_eprio(l_writer)))) {
 2093 /*
 2094 * Count the number of real-time read-waiters that
 2095 * exceed the write-waiter's priority. We will
 2096 * wake that many (we already know it's at least one).
 2097 */
 2098 sq = &f->fx_sqs[FUTEX_READERQ];
 2099 nwaitersp = &f->fx_nwaiters[FUTEX_READERQ];
 2100 for (nwake = 1, l = LIST_NEXT(l_reader, l_sleepchain);
 2101 l != NULL && futex_read_waiter_prio(l) > pri_writer;
 2102 l = LIST_NEXT(l, l_sleepchain)) {
 2103 nwake++;
 2104 }
 2105 KASSERT(nwake >= 0 && nwake <= FUTEX_TID_MASK);
 2106 /* We know there is at least one write-waiter. */
 2107 newval = nwake | FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED;
 2108 l = l_reader;
 2109 } else {
 2110 /* We will wake one writer. */
 2111 sq = &f->fx_sqs[FUTEX_WRITERQ];
 2112 nwaitersp = &f->fx_nwaiters[FUTEX_WRITERQ];
 2113 nwake = 1;
 2114 newval = FUTEX_RW_WRITE_LOCKED | l_writer->l_lid;
 2115 if (__predict_false(f->fx_nwaiters[FUTEX_WRITERQ] > 1)) {
 2116 KASSERT(LIST_NEXT(l_writer, l_sleepchain) != NULL);
 2117 newval |= FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED;
 2118 } else if (__predict_true(f->fx_nwaiters[FUTEX_READERQ] != 0)) {
 2119 KASSERT(!LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
 2120 newval |= FUTEX_WAITERS;
 2121 } else {
 2122 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
 2123 }
 2124 l = l_writer;
 2125 }
 2126
 2127 KASSERT(sq == &f->fx_sqs[FUTEX_READERQ] ||
 2128 sq == &f->fx_sqs[FUTEX_WRITERQ]);
 2129 while (nwake != 0 && l != NULL) {
 2130 l_next = LIST_NEXT(l, l_sleepchain);
 2131 KASSERT(l->l_syncobj == &futex_syncobj);
 2132 KASSERT(l->l_wchan == (wchan_t)f);
 2133 KASSERT(l->l_futex == f);
 2134 KASSERT(l->l_sleepq == sq);
 2135 KASSERT(l->l_mutex == f->fx_sq_lock);
 2136
 2137 /*
 2138 * NULL wchan --> timeout will not wake LWP.
 2139 * NULL lock --> keep existing lock.
 2140 */
 2141 sleepq_transfer(l, sq, &wsq, NULL/*wchan*/, futex_wmesg,
 2142 &sched_syncobj, NULL/*lock*/, false);
 2143
 2144 KASSERT(*nwaitersp > 0);
 2145 (*nwaitersp)--;
 2146 nwoken++;
 2147 nwake--;
 2148 /* hold count adjusted below */
 2149 l = l_next;
 2150 }
 2151
 2152 mutex_spin_exit(f->fx_sq_lock);
 2153
 2154 /*
 2155 * STEP 2
 2156 */
 2157 for (;; val = next) {
 2158 error = ucas_int(uaddr, val, newval, &next);
 2159 if (__predict_false(error != 0)) {
 2160 /*
 2161 * The futex word has become unmapped. All bets
 2162 * are off. Break out of the loop and awaken the
 2163 * waiters; this is easier than trying to stuff
 2164 * them back into their previous sleepq, and the
 2165 * application is screwed anyway.
 2166 */
 2167 break;
 2168 }
 2169 if (__predict_true(next == val)) {
 2170 /*
 2171 * Successfully updated the futex word!
 2172 */
 2173 break;
 2174 }
 2175 /*
 2176 * The only thing that could have possibly happened
 2177 * (barring some bug in the thread library) is that
 2178 * additional waiter bits arrived. Those new waiters
 2179 * have lost the race to acquire the lock, but we
 2180 * need to preserve those bits.
 2181 */
 2182 newval |= next & (FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED);
 2183 }
 2184
 2185 mutex_spin_enter(f->fx_sq_lock);
 2186
 2187 /*
 2188 * STEP 3
 2189 */
 2190
 2191 LIST_FOREACH_SAFE(l, &wsq, l_sleepchain, l_next) {
 2192 KASSERT(l->l_syncobj == &sched_syncobj);
 2193 KASSERT(l->l_wchan == NULL);
 2194 KASSERT(l->l_futex == f);
 2195 KASSERT(l->l_sleepq == &wsq);
 2196 KASSERT(l->l_mutex == f->fx_sq_lock);
 2197
 2198 l->l_futex_wakesel = 0;
 2199 l->l_futex = NULL;
 2200 sleepq_remove(&wsq, l);
 2201 }
 2202 /* If we chose to wake all readers, assert that we got every one. */
 2203 KASSERT(allreaders == 0 || allreaders == nwoken);
 2204 KASSERT(allreaders == 0 || LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
 2205 KASSERT(allreaders == 0 || f->fx_nwaiters[FUTEX_READERQ] == 0);
 2206
 2207 mutex_spin_exit(f->fx_sq_lock);
 2208
 2209 /* Adjust hold count. */
 2210 futex_rele_count_not_last(f, nwoken);
 2211
 2212 futex_op_unlock(f);
 2213
 2214 /* Release the futex. */
 2215 futex_rele(f);
 2216
 2217out: if (__predict_true(error == 0)) {
 2218 /* Return the number of waiters woken. */
 2219 *retval = nwoken;
 2220 }
 2221
 2222 /* Success! */
 2223 return error;
 2224#endif
 2225}
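
The release side is the mirror image: as the comments above note, a releasing thread that sees waiter bits must not pick the next owner itself, but must let rw_handoff select it and rewrite the word atomically with the wake. A speculative write-unlock sketch under the same assumptions as the read-acquire sketch above; "self" is the caller's LWP ID (e.g. from _lwp_self()).

static int
rwlock_wrunlock_user(volatile unsigned int *word, lwpid_t self)
{
	unsigned int owned = FUTEX_RW_WRITE_LOCKED | self;

	/* Uncontended fast path: no waiter bits, just clear the word. */
	if (atomic_cas_uint(word, owned, 0) == owned)
		return 0;

	/* Waiters exist: the kernel picks the next owner(s) for us. */
	return __futex((int *)word,
	    FUTEX_NETBSD_RW_HANDOFF | FUTEX_PRIVATE_FLAG,
	    (int)*word, NULL, NULL, 0, 0);
}
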
 2226
 2227/*
1577 * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3) 2228 * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
1578 * 2229 *
1579 * Implement the futex system call with all the parameters 2230 * Implement the futex system call with all the parameters
1580 * parsed out. 2231 * parsed out.
1581 */ 2232 */
1582int 2233int
1583do_futex(int *uaddr, int op, int val, const struct timespec *timeout, 2234do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
1584 int *uaddr2, int val2, int val3, register_t *retval) 2235 int *uaddr2, int val2, int val3, register_t *retval)
1585{ 2236{
1586 const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true; 2237 const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
1587 const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME 2238 const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
1588 : CLOCK_MONOTONIC; 2239 : CLOCK_MONOTONIC;
 2240 int error;
1589 2241
1590 op &= FUTEX_CMD_MASK; 2242 op &= FUTEX_CMD_MASK;
1591 2243
1592 switch (op) { 2244 switch (op) {
1593 case FUTEX_WAIT: 2245 case FUTEX_WAIT:
1594 return futex_func_wait(shared, uaddr, val, 2246 SDT_PROBE6(futex, func, wait, entry,
 2247 uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
 2248 TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
 2249 error = futex_func_wait(shared, uaddr, val,
1595 FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME, 2250 FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
1596 retval); 2251 retval);
 2252 SDT_PROBE1(futex, func, wait, exit, error);
 2253 break;
 2254
 2255 case FUTEX_WAIT_BITSET:
 2256 SDT_PROBE6(futex, func, wait, entry,
 2257 uaddr, val, val3, timeout,
 2258 TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
 2259 error = futex_func_wait(shared, uaddr, val, val3, timeout,
 2260 clkid, TIMER_ABSTIME, retval);
 2261 SDT_PROBE1(futex, func, wait, exit, error);
 2262 break;
1597 2263
1598 case FUTEX_WAKE: 2264 case FUTEX_WAKE:
1599 val3 = FUTEX_BITSET_MATCH_ANY; 2265 SDT_PROBE4(futex, func, wake, entry,
1600 /* FALLTHROUGH */ 2266 uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
 2267 error = futex_func_wake(shared, uaddr, val,
 2268 FUTEX_BITSET_MATCH_ANY, retval);
 2269 SDT_PROBE2(futex, func, wake, exit, error, *retval);
 2270 break;
 2271
1601 case FUTEX_WAKE_BITSET: 2272 case FUTEX_WAKE_BITSET:
1602 return futex_func_wake(shared, uaddr, val, val3, retval); 2273 SDT_PROBE4(futex, func, wake, entry,
 2274 uaddr, val, val3, op & ~FUTEX_CMD_MASK);
 2275 error = futex_func_wake(shared, uaddr, val, val3, retval);
 2276 SDT_PROBE2(futex, func, wake, exit, error, *retval);
 2277 break;
1603 2278
1604 case FUTEX_REQUEUE: 2279 case FUTEX_REQUEUE:
 2280 SDT_PROBE5(futex, func, requeue, entry,
 2281 uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
 2282 error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
 2283 val2, 0, retval);
 2284 SDT_PROBE2(futex, func, requeue, exit, error, *retval);
 2285 break;
 2286
1605 case FUTEX_CMP_REQUEUE: 2287 case FUTEX_CMP_REQUEUE:
1606 return futex_func_requeue(shared, op, uaddr, val, uaddr2, 2288 SDT_PROBE6(futex, func, cmp_requeue, entry,
 2289 uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
 2290 error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
1607 val2, val3, retval); 2291 val2, val3, retval);
1608 2292 SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
1609 case FUTEX_WAIT_BITSET: 2293 break;
1610 return futex_func_wait(shared, uaddr, val, val3, timeout, 
1611 clkid, TIMER_ABSTIME, retval); 
1612 2294
1613 case FUTEX_WAKE_OP: 2295 case FUTEX_WAKE_OP:
1614 return futex_func_wake_op(shared, uaddr, val, uaddr2, val2, 2296 SDT_PROBE6(futex, func, wake_op, entry,
 2297 uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
 2298 error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
1615 val3, retval); 2299 val3, retval);
 2300 SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
 2301 break;
 2302
 2303 case FUTEX_NETBSD_RW_WAIT:
 2304 SDT_PROBE5(futex, func, rw_wait, entry,
 2305 uaddr, val, val3, timeout, op & ~FUTEX_CMD_MASK);
 2306 error = futex_func_rw_wait(shared, uaddr, val, val3,
 2307 timeout, clkid, retval);
 2308 SDT_PROBE1(futex, func, rw_wait, exit, error);
 2309 break;
 2310
 2311 case FUTEX_NETBSD_RW_HANDOFF:
 2312 SDT_PROBE3(futex, func, rw_handoff, entry,
 2313 uaddr, val, op & ~FUTEX_CMD_MASK);
 2314 error = futex_func_rw_handoff(shared, uaddr, val, retval);
 2315 SDT_PROBE2(futex, func, rw_handoff, exit, error, *retval);
 2316 break;
1616 2317
1617 case FUTEX_FD: 2318 case FUTEX_FD:
1618 default: 2319 default:
1619 return ENOSYS; 2320 error = ENOSYS;
 2321 break;
1620 } 2322 }
 2323
 2324 return error;
1621} 2325}
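
Putting the dispatch together, an absolute-deadline wait is expressed with FUTEX_WAIT_BITSET (TIMER_ABSTIME above), and FUTEX_CLOCK_REALTIME selects CLOCK_REALTIME instead of the default CLOCK_MONOTONIC. A minimal call sketch, again via the hypothetical __futex() shim introduced earlier:

static int
futex_wait_until(int *word, int expected, const struct timespec *deadline)
{
	/* deadline is absolute on CLOCK_REALTIME; NULL means wait forever. */
	return __futex(word,
	    FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME,
	    expected, deadline, NULL, 0, FUTEX_BITSET_MATCH_ANY);
}
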
1622 2326
1623/* 2327/*
1624 * sys___futex(l, uap, retval) 2328 * sys___futex(l, uap, retval)
1625 * 2329 *
1626 * __futex(2) system call: generic futex operations. 2330 * __futex(2) system call: generic futex operations.
1627 */ 2331 */
1628int 2332int
1629sys___futex(struct lwp *l, const struct sys___futex_args *uap, 2333sys___futex(struct lwp *l, const struct sys___futex_args *uap,
1630 register_t *retval) 2334 register_t *retval)
1631{ 2335{
1632 /* { 2336 /* {
1633 syscallarg(int *) uaddr; 2337 syscallarg(int *) uaddr;
@@ -1719,26 +2423,27 @@ sys___futex_get_robust_list(struct lwp * @@ -1719,26 +2423,27 @@ sys___futex_get_robust_list(struct lwp *
1719 * 2423 *
1720 * Try to release the robust futex at uva in the current process 2424 * Try to release the robust futex at uva in the current process
1721 * on lwp exit. If anything goes wrong, silently fail. It is the 2425 * on lwp exit. If anything goes wrong, silently fail. It is the
1722 * userland program's obligation to arrange correct behaviour. 2426 * userland program's obligation to arrange correct behaviour.
1723 */ 2427 */
1724static void 2428static void
1725release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi, 2429release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
1726 bool const is_pending) 2430 bool const is_pending)
1727{ 2431{
1728 int *uaddr; 2432 int *uaddr;
1729 struct futex *f; 2433 struct futex *f;
1730 int oldval, newval, actual; 2434 int oldval, newval, actual;
1731 int error; 2435 int error;
 2436 bool shared;
1732 2437
1733 /* If it's misaligned, tough. */ 2438 /* If it's misaligned, tough. */
1734 if (__predict_false(uptr & 3)) 2439 if (__predict_false(uptr & 3))
1735 return; 2440 return;
1736 uaddr = (int *)uptr; 2441 uaddr = (int *)uptr;
1737 2442
1738 error = futex_load(uaddr, &oldval); 2443 error = futex_load(uaddr, &oldval);
1739 if (__predict_false(error)) 2444 if (__predict_false(error))
1740 return; 2445 return;
1741 2446
1742 /* 2447 /*
1743 * There are two race conditions we need to handle here: 2448 * There are two race conditions we need to handle here:
1744 * 2449 *
@@ -1749,28 +2454,31 @@ release_futex(uintptr_t const uptr, lwpi @@ -1749,28 +2454,31 @@ release_futex(uintptr_t const uptr, lwpi
1749 * 2. Awakened waiter died before being able to acquire 2454 * 2. Awakened waiter died before being able to acquire
1750 * the futex in user space. Any other waiters are 2455 * the futex in user space. Any other waiters are
1751 * now stuck, oops! 2456 * now stuck, oops!
1752 * 2457 *
1753 * In both of these cases, the futex word will be 0 (because 2458 * In both of these cases, the futex word will be 0 (because
1754 * it's updated before the wake is issued). The best we can 2459 * it's updated before the wake is issued). The best we can
1755 * do is detect this situation if it's the pending futex and 2460 * do is detect this situation if it's the pending futex and
1756 * issue a wake without modifying the futex word. 2461 * issue a wake without modifying the futex word.
1757 * 2462 *
1758 * XXX eventual PI handling? 2463 * XXX eventual PI handling?
1759 */ 2464 */
1760 if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) { 2465 if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
1761 register_t retval; 2466 register_t retval;
1762 (void) futex_func_wake(/*shared*/true, uaddr, 1, 2467 error = futex_func_wake(/*shared*/true, uaddr, 1,
1763 FUTEX_BITSET_MATCH_ANY, &retval); 2468 FUTEX_BITSET_MATCH_ANY, &retval);
 2469 if (error != 0 || retval == 0)
 2470 (void) futex_func_wake(/*shared*/false, uaddr, 1,
 2471 FUTEX_BITSET_MATCH_ANY, &retval);
1764 return; 2472 return;
1765 } 2473 }
1766 2474
1767 /* Optimistically test whether we need to do anything at all. */ 2475 /* Optimistically test whether we need to do anything at all. */
1768 if ((oldval & FUTEX_TID_MASK) != tid) 2476 if ((oldval & FUTEX_TID_MASK) != tid)
1769 return; 2477 return;
1770 2478
1771 /* 2479 /*
1772 * We need to handle the case where this thread owned the futex, 2480 * We need to handle the case where this thread owned the futex,
1773 * but it was uncontended. In this case, there won't be any 2481 * but it was uncontended. In this case, there won't be any
1774 * kernel state to look up. All we can do is mark the futex 2482 * kernel state to look up. All we can do is mark the futex
1775 * as a zombie to be mopped up the next time another thread 2483 * as a zombie to be mopped up the next time another thread
1776 * attempts to acquire it. 2484 * attempts to acquire it.
@@ -1794,70 +2502,74 @@ release_futex(uintptr_t const uptr, lwpi @@ -1794,70 +2502,74 @@ release_futex(uintptr_t const uptr, lwpi
1794 if (error) 2502 if (error)
1795 return; 2503 return;
1796 } while (actual != oldval); 2504 } while (actual != oldval);
1797 2505
1798 /* 2506 /*
1799 * If there is still no indication of waiters, then there is 2507 * If there is still no indication of waiters, then there is
1800 * no more work for us to do. 2508 * no more work for us to do.
1801 */ 2509 */
1802 if ((oldval & FUTEX_WAITERS) == 0) 2510 if ((oldval & FUTEX_WAITERS) == 0)
1803 return; 2511 return;
1804 } 2512 }
1805 2513
1806 /* 2514 /*
1807 * Look for a shared futex since we have no positive indication 2515 * Look for a futex. Try shared first, then private. If we
1808 * it is private. If we can't, tough. 2516 * can't find one, tough.
1809 */ 2517 *
1810 error = futex_lookup(uaddr, /*shared*/true, &f); 2518 * Note: the assumption here is that anyone placing a futex on
1811 if (error) 2519 * the robust list is adhering to the rules, regardless of the
1812 return; 2520 * futex class.
1813 
1814 /* 
1815 * If there's no kernel state for this futex, there's nothing to 
1816 * release. 
1817 */ 2521 */
1818 if (f == NULL) 2522 for (f = NULL, shared = true; f == NULL; shared = false) {
1819 return; 2523 error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
 2524 if (error)
 2525 return;
 2526 if (f == NULL && shared == false)
 2527 return;
 2528 }
1820 2529
1821 /* Work under the futex queue lock. */ 2530 /* Work under the futex op lock. */
1822 futex_queue_lock(f); 2531 futex_op_lock(f);
1823 2532
1824 /* 2533 /*
1825 * Fetch the word: if the tid doesn't match ours, skip; 2534 * Fetch the word: if the tid doesn't match ours, skip;
1826 * otherwise, set the owner-died bit, atomically. 2535 * otherwise, set the owner-died bit, atomically.
1827 */ 2536 */
1828 do { 2537 do {
1829 error = futex_load(uaddr, &oldval); 2538 error = futex_load(uaddr, &oldval);
1830 if (error) 2539 if (error)
1831 goto out; 2540 goto out;
1832 if ((oldval & FUTEX_TID_MASK) != tid) 2541 if ((oldval & FUTEX_TID_MASK) != tid)
1833 goto out; 2542 goto out;
1834 newval = oldval | FUTEX_OWNER_DIED; 2543 newval = oldval | FUTEX_OWNER_DIED;
1835 error = ucas_int(uaddr, oldval, newval, &actual); 2544 error = ucas_int(uaddr, oldval, newval, &actual);
1836 if (error) 2545 if (error)
1837 goto out; 2546 goto out;
1838 } while (actual != oldval); 2547 } while (actual != oldval);
1839 2548
1840 /* 2549 /*
1841 * If there may be waiters, try to wake one. If anything goes 2550 * If there may be waiters, try to wake one. If anything goes
1842 * wrong, tough. 2551 * wrong, tough.
1843 * 2552 *
1844 * XXX eventual PI handling? 2553 * XXX eventual PI handling?
1845 */ 2554 */
1846 if (oldval & FUTEX_WAITERS) 2555 if (oldval & FUTEX_WAITERS) {
1847 (void)futex_wake(f, 1, NULL, 0, FUTEX_BITSET_MATCH_ANY); 2556 (void)futex_wake(f, FUTEX_WRITERQ, 1,
 2557 NULL, FUTEX_WRITERQ, 0,
 2558 FUTEX_BITSET_MATCH_ANY);
 2559 }
1848 2560
1849 /* Unlock the queue and release the futex. */ 2561 /* Unlock the queue and release the futex. */
1850out: futex_queue_unlock(f); 2562out: futex_op_unlock(f);
1851 futex_rele(f); 2563 futex_rele(f);
1852} 2564}
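
For completeness, the recovery visible to user space after release_futex() runs is the FUTEX_OWNER_DIED bit: a later acquirer finds the TID field empty (or stale), takes the lock, and reports owner-death so the application can repair the protected state before clearing the bit. A hedged sketch of that acquirer-side check; the word handling and return conventions here are illustrative assumptions, not a fixed ABI.

#include <errno.h>	/* EOWNERDEAD, EBUSY */

static int
robust_trylock_user(volatile unsigned int *word, lwpid_t self)
{
	unsigned int v = *word;

	/* Free (TID field empty), possibly marked owner-died by the kernel. */
	if ((v & FUTEX_TID_MASK) == 0 &&
	    atomic_cas_uint(word, v,
	        (v & (FUTEX_OWNER_DIED | FUTEX_WAITERS)) | self) == v) {
		/* EOWNERDEAD: caller must recover state, then clear the bit. */
		return (v & FUTEX_OWNER_DIED) ? EOWNERDEAD : 0;
	}
	return EBUSY;
}
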
1853 2565
1854/* 2566/*
1855 * futex_robust_head_lookup(l, lwpid) 2567 * futex_robust_head_lookup(l, lwpid)
1856 * 2568 *
1857 * Helper function to look up a robust head by LWP ID. 2569 * Helper function to look up a robust head by LWP ID.
1858 */ 2570 */
1859int 2571int
1860futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp) 2572futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
1861{ 2573{
1862 struct proc *p = l->l_proc; 2574 struct proc *p = l->l_proc;
1863 2575

cvs diff -r1.212 -r1.212.2.1 src/sys/sys/lwp.h (expand / switch to unified diff)

--- src/sys/sys/lwp.h 2020/10/23 00:25:45 1.212
+++ src/sys/sys/lwp.h 2020/11/01 15:16:43 1.212.2.1
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: lwp.h,v 1.212 2020/10/23 00:25:45 thorpej Exp $ */ 1/* $NetBSD: lwp.h,v 1.212.2.1 2020/11/01 15:16:43 thorpej Exp $ */
2 2
3/* 3/*
4 * Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010, 2019, 2020 4 * Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010, 2019, 2020
5 * The NetBSD Foundation, Inc. 5 * The NetBSD Foundation, Inc.
6 * All rights reserved. 6 * All rights reserved.
7 * 7 *
8 * This code is derived from software contributed to The NetBSD Foundation 8 * This code is derived from software contributed to The NetBSD Foundation
9 * by Nathan J. Williams and Andrew Doran. 9 * by Nathan J. Williams and Andrew Doran.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions 12 * modification, are permitted provided that the following conditions
13 * are met: 13 * are met:
14 * 1. Redistributions of source code must retain the above copyright 14 * 1. Redistributions of source code must retain the above copyright
@@ -62,33 +62,35 @@ static __inline struct cpu_info *lwp_get @@ -62,33 +62,35 @@ static __inline struct cpu_info *lwp_get
62#include <machine/proc.h> /* Machine-dependent proc substruct. */ 62#include <machine/proc.h> /* Machine-dependent proc substruct. */
63 63
64/* 64/*
65 * Lightweight process. Field markings and the corresponding locks: 65 * Lightweight process. Field markings and the corresponding locks:
66 * 66 *
67 * a: proc_lock 67 * a: proc_lock
68 * c: condition variable interlock, passed to cv_wait() 68 * c: condition variable interlock, passed to cv_wait()
69 * l: *l_mutex 69 * l: *l_mutex
70 * p: l_proc->p_lock 70 * p: l_proc->p_lock
71 * s: spc_mutex, which may or may not be referenced by l_mutex 71 * s: spc_mutex, which may or may not be referenced by l_mutex
72 * S: l_selcluster->sc_lock 72 * S: l_selcluster->sc_lock
73 * (: unlocked, stable 73 * (: unlocked, stable
74 * !: unlocked, may only be reliably accessed by the LWP itself 74 * !: unlocked, may only be reliably accessed by the LWP itself
 75 * x: special; see comments for field.
75 * 76 *
76 * Fields are clustered together by usage (to increase the likelihood 77 * Fields are clustered together by usage (to increase the likelihood
77 * of cache hits) and by size (to reduce dead space in the structure). 78 * of cache hits) and by size (to reduce dead space in the structure).
78 */ 79 */
79 80
80#include <sys/pcu.h> 81#include <sys/pcu.h>
81 82
 83struct futex;
82struct lockdebug; 84struct lockdebug;
83struct sysent; 85struct sysent;
84 86
85struct lwp { 87struct lwp {
86 /* Must not be zeroed on free. */ 88 /* Must not be zeroed on free. */
87 struct cpu_info *volatile l_cpu;/* s: CPU we're on if LSONPROC */ 89 struct cpu_info *volatile l_cpu;/* s: CPU we're on if LSONPROC */
88 kmutex_t * volatile l_mutex; /* l: ptr to mutex on sched state */ 90 kmutex_t * volatile l_mutex; /* l: ptr to mutex on sched state */
89 struct turnstile *l_ts; /* l: current turnstile */ 91 struct turnstile *l_ts; /* l: current turnstile */
90 int l_stat; /* l: overall LWP status */ 92 int l_stat; /* l: overall LWP status */
91 int l__reserved; /* : padding - reuse as needed */  93 int l__reserved; /* : padding - reuse as needed */
92 94
93 /* Scheduling and overall state. */ 95 /* Scheduling and overall state. */
94#define l_startzero l_runq 96#define l_startzero l_runq
@@ -129,29 +131,39 @@ struct lwp { @@ -129,29 +131,39 @@ struct lwp {
129 kcpuset_t *l_affinity; /* l: CPU set for affinity */ 131 kcpuset_t *l_affinity; /* l: CPU set for affinity */
130 132
131 /* Synchronisation. */ 133 /* Synchronisation. */
132 struct syncobj *l_syncobj; /* l: sync object operations set */ 134 struct syncobj *l_syncobj; /* l: sync object operations set */
133 LIST_ENTRY(lwp) l_sleepchain; /* l: sleep queue */ 135 LIST_ENTRY(lwp) l_sleepchain; /* l: sleep queue */
134 wchan_t l_wchan; /* l: sleep address */ 136 wchan_t l_wchan; /* l: sleep address */
135 const char *l_wmesg; /* l: reason for sleep */ 137 const char *l_wmesg; /* l: reason for sleep */
136 struct sleepq *l_sleepq; /* l: current sleep queue */ 138 struct sleepq *l_sleepq; /* l: current sleep queue */
137 callout_t l_timeout_ch; /* !: callout for tsleep */ 139 callout_t l_timeout_ch; /* !: callout for tsleep */
138 kcondvar_t l_waitcv; /* a: vfork() wait */ 140 kcondvar_t l_waitcv; /* a: vfork() wait */
139 u_int l_slptime; /* l: time since last blocked */ 141 u_int l_slptime; /* l: time since last blocked */
140 bool l_vforkwaiting; /* a: vfork() waiting */ 142 bool l_vforkwaiting; /* a: vfork() waiting */
141 143
142 /* User-space synchronization. */ 144 /*
 145 * User-space synchronization.
 146 *
 147 * Special locking considerations:
 148 *
 149 * l_futex and l_futex_wakesel are accessed unlocked and are private
 150 * to the LWP *unless* the LWP is present on a futex sleepq, in
 151 * which case they are protected by the lwp_lock (which will in
 152 * actuality be the futex sleepq lock).
 153 */
143 uintptr_t l_robust_head; /* !: list of robust futexes */ 154 uintptr_t l_robust_head; /* !: list of robust futexes */
144 uint32_t l___rsvd1; /* reserved for future use */ 155 struct futex *l_futex; /* x: futex we're waiting on */
 156 uint32_t l_futex_wakesel;/* x: futex wake selector */
145 157
146#if PCU_UNIT_COUNT > 0 158#if PCU_UNIT_COUNT > 0
147 struct cpu_info * volatile l_pcu_cpu[PCU_UNIT_COUNT]; 159 struct cpu_info * volatile l_pcu_cpu[PCU_UNIT_COUNT];
148 uint32_t l_pcu_valid; 160 uint32_t l_pcu_valid;
149#endif 161#endif
150 162
151 /* Process level and global state, misc. */ 163 /* Process level and global state, misc. */
152 lwpid_t l_lid; /* (: LWP identifier; local to proc */ 164 lwpid_t l_lid; /* (: LWP identifier; local to proc */
153 LIST_ENTRY(lwp) l_list; /* a: entry on list of all LWPs */ 165 LIST_ENTRY(lwp) l_list; /* a: entry on list of all LWPs */
154 void *l_ctxlink; /* p: uc_link {get,set}context */ 166 void *l_ctxlink; /* p: uc_link {get,set}context */
155 struct proc *l_proc; /* p: parent process */ 167 struct proc *l_proc; /* p: parent process */
156 LIST_ENTRY(lwp) l_sibling; /* p: entry on proc's list of LWPs */ 168 LIST_ENTRY(lwp) l_sibling; /* p: entry on proc's list of LWPs */
157 char *l_name; /* (: name, optional */ 169 char *l_name; /* (: name, optional */