Fri Feb 11 17:53:28 2022 UTC
ucas(9): Membar audit.

- Omit needless membar_enter before ipi_trigger_broadcast.  This was
  presumably intended to establish a happens-before relation between the
  following two CPUs:

	/* CPU doing ucas */
	ucas_critical_enter()
		ucas_critical_pausing_cpus = ncpu - 1	(A)
		ipi_trigger_broadcast()

	/* other CPU walking by whistling innocently */
	IPI handler
	ucas_critical_cpu_gate()
		load ucas_critical_pausing_cpus		(B)

  That is, this was presumably meant to ensure (A) happens-before (B).
  This relation is already guaranteed by ipi(9), so there is no need
  for any explicit memory barrier.
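
  As a minimal C11 model of the guarantee this relies on (hypothetical
  standalone names, not the ipi(9) implementation): if triggering the
  IPI behaves at least like a store-release of the IPI's own state, and
  entering the handler like a load-acquire of it, then the store at (A)
  is visible at (B) with no extra barrier:

	/* Hypothetical sketch; these names do not exist in the kernel. */
	#include <stdatomic.h>

	static atomic_uint ipi_pending;		/* stands in for ipi(9) state */
	static unsigned pausing_cpus;		/* the word stored at (A) */

	static void trigger_cpu(unsigned ncpu)	/* CPU doing ucas */
	{
		pausing_cpus = ncpu - 1;			/* (A) */
		/* ipi_trigger_broadcast: at least a store-release */
		atomic_store_explicit(&ipi_pending, 1, memory_order_release);
	}

	static unsigned handler_cpu(void)	/* other CPU, IPI handler */
	{
		/* IPI delivery: at least a load-acquire of the trigger */
		while (atomic_load_explicit(&ipi_pending, memory_order_acquire) == 0)
			continue;
		return pausing_cpus;			/* (B) sees (A) */
	}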

- Issue a store-release in ucas_critical_cpu_gate so we have the
  following happens-before relation, which was otherwise not guaranteed
  unless __HAVE_ATOMIC_AS_MEMBAR is defined:

	/* other CPU walking by whistling innocently */
	...other logic touching the target ucas word...	(A)
	IPI handler
	ucas_critical_cpu_gate()
		...
		atomic_dec_uint(&ucas_critical_pausing_cpus)

  happens-before

	/* CPU doing ucas */
	ucas_critical_enter() -> ucas_critical_wait();
	...touching the word with ufetch/ustore...	(B)

  We need to ensure the logic (A) on another CPU touching the target
  ucas word happens-before we actually do the ucas at (B).

  (a) This requires the other CPU to do a store-release on
      ucas_critical_pausing_cpus in ucas_critical_cpu_gate, and

  (b) this requires the ucas CPU to do a load-acquire on
      ucas_critical_pausing_cpus in ucas_critical_wait.

  Without _both_ sides -- store-release and then load-acquire -- there
  is no such happens-before guarantee; another CPU may have a buffered
  store, for instance, that clobbers the ucas.

  For now, do the store-release with membar_exit (under #ifndef
  __HAVE_ATOMIC_AS_MEMBAR) followed by atomic_dec_uint -- later, with
  the C11 API, we can dispense with the #ifdef and just use
  atomic_fetch_add_explicit(..., memory_order_release).  The
  load-acquire we can do with atomic_load_acquire.
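
  A sketch of what that pairing would look like with the C11 API
  (hypothetical standalone names, not the kernel code;
  atomic_fetch_sub_explicit here plays the role of atomic_dec_uint):

	#include <stdatomic.h>

	static atomic_uint pausing_cpus;   /* stands in for ucas_critical_pausing_cpus */

	static void gate_side(void)		/* other CPU, in the IPI handler */
	{
		/* ...logic touching the target ucas word... (A) */
		/* store-release: everything above is visible before the count drops */
		atomic_fetch_sub_explicit(&pausing_cpus, 1, memory_order_release);
	}

	static void wait_side(void)		/* CPU doing ucas */
	{
		/* load-acquire: pairs with the release above */
		while (atomic_load_explicit(&pausing_cpus, memory_order_acquire) > 0)
			continue;
		/* ...now safe to do the ucas with ufetch/ustore... (B) */
	}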

- Issue a load-acquire in ucas_critical_cpu_gate so we have the
  following happens-before relation, which was otherwise not guaranteed:

	/* CPU doing ucas */
	...ufetch/ustore...				(A)
	ucas_critical_exit()
		ucas_critical_pausing_cpus = -1;

	/* other CPU walking by whistling innocently */
	IPI handler
	ucas_critical_cpu_gate()
		...
		while (ucas_critical_pausing_cpus != -1)
			spin;
	...other logic touching the target ucas word...	(B)

  We need to ensure the logic (A) to do the ucas happens-before logic
  that might use it on another CPU at (B).

  (a) This requires that the ucas CPU do a store-release on
      ucas_critical_pausing_cpus in ucas_critical_exit, and

  (b) this requires that the other CPU do a load-acquire on
      ucas_critical_pausing_cpus in ucas_critical_cpu_gate.

  Without _both_ sides -- store-release and then load-acquire -- there
  is no such happens-before guarantee; the other CPU might witness a
  stale cached value of the target location alongside a new value of
  some other location, i.e. observe the two updates in the wrong order.
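
  The exit side of the gate would look analogous with the C11 API
  (again a hypothetical standalone sketch, not the kernel code):

	#include <stdatomic.h>

	static atomic_uint pausing_cpus;   /* stands in for ucas_critical_pausing_cpus */

	static void exit_side(void)		/* CPU doing ucas */
	{
		/* ...ufetch/ustore on the target word... (A) */
		/* store-release: the ucas is visible before the gate opens */
		atomic_store_explicit(&pausing_cpus, (unsigned)-1, memory_order_release);
	}

	static void gate_side(void)		/* other CPU, spinning in the IPI handler */
	{
		/* load-acquire: pairs with the release above */
		while (atomic_load_explicit(&pausing_cpus, memory_order_acquire) != (unsigned)-1)
			continue;
		/* ...other logic touching the target ucas word... (B) */
	}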

- Use atomic_load/store_* to avoid the appearance of races, e.g. for
  sanitizers.
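
  For instance (this line is from the diff below), a relaxed load marks
  the intentionally racy read for sanitizers such as KCSAN without
  imposing any ordering:

	KASSERT(atomic_load_relaxed(&ucas_critical_pausing_cpus) > 0);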

- Document which barriers pair up with which barriers and what they're
  doing.


(riastradh)
cvs diff -r1.14 -r1.15 src/sys/kern/subr_copy.c

--- src/sys/kern/subr_copy.c 2020/05/23 23:42:43 1.14
+++ src/sys/kern/subr_copy.c 2022/02/11 17:53:28 1.15
@@ -1,14 +1,14 @@
-/* $NetBSD: subr_copy.c,v 1.14 2020/05/23 23:42:43 ad Exp $ */
+/* $NetBSD: subr_copy.c,v 1.15 2022/02/11 17:53:28 riastradh Exp $ */
 
 /*-
  * Copyright (c) 1997, 1998, 1999, 2002, 2007, 2008, 2019
  *	The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
  * NASA Ames Research Center.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -70,27 +70,27 @@
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_subr.c	8.4 (Berkeley) 2/14/95
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: subr_copy.c,v 1.14 2020/05/23 23:42:43 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: subr_copy.c,v 1.15 2022/02/11 17:53:28 riastradh Exp $");
 
 #define __UFETCHSTORE_PRIVATE
 #define __UCAS_PRIVATE
 
 #include <sys/param.h>
 #include <sys/fcntl.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 
 #include <uvm/uvm_extern.h>
 
 void
 uio_setup_sysspace(struct uio *uio)
@@ -390,89 +390,129 @@ do { \
 #include <sys/mutex.h>
 #include <sys/ipi.h>
 
 static int ucas_critical_splcookie;
 static volatile u_int ucas_critical_pausing_cpus;
 static u_int ucas_critical_ipi;
 static ONCE_DECL(ucas_critical_init_once)
 
 static void
 ucas_critical_cpu_gate(void *arg __unused)
 {
 	int count = SPINLOCK_BACKOFF_MIN;
 
-	KASSERT(ucas_critical_pausing_cpus > 0);
+	KASSERT(atomic_load_relaxed(&ucas_critical_pausing_cpus) > 0);
+
+	/*
+	 * Notify ucas_critical_wait that we have stopped.  Using
+	 * store-release ensures all our memory operations up to the
+	 * IPI happen before the ucas -- no buffered stores on our end
+	 * can clobber it later on, for instance.
+	 *
+	 * Matches atomic_load_acquire in ucas_critical_wait -- turns
+	 * the following atomic_dec_uint into a store-release.
+	 */
+#ifndef __HAVE_ATOMIC_AS_MEMBAR
+	membar_exit();
+#endif
 	atomic_dec_uint(&ucas_critical_pausing_cpus);
-	while (ucas_critical_pausing_cpus != (u_int)-1) {
+
+	/*
+	 * Wait for ucas_critical_exit to reopen the gate and let us
+	 * proceed.  Using a load-acquire ensures the ucas happens
+	 * before any of our memory operations when we return from the
+	 * IPI and proceed -- we won't observe any stale cached value
+	 * that the ucas overwrote, for instance.
+	 *
+	 * Matches atomic_store_release in ucas_critical_exit.
+	 */
+	while (atomic_load_acquire(&ucas_critical_pausing_cpus) != (u_int)-1) {
 		SPINLOCK_BACKOFF(count);
 	}
 }
 
 static int
 ucas_critical_init(void)
 {
+
 	ucas_critical_ipi = ipi_register(ucas_critical_cpu_gate, NULL);
 	return 0;
 }
 
 static void
 ucas_critical_wait(void)
 {
 	int count = SPINLOCK_BACKOFF_MIN;
 
-	while (ucas_critical_pausing_cpus > 0) {
+	/*
+	 * Wait for all CPUs to stop at the gate.  Using a load-acquire
+	 * ensures all memory operations before they stop at the gate
+	 * happen before the ucas -- no buffered stores in other CPUs
+	 * can clobber it later on, for instance.
+	 *
+	 * Matches membar_exit/atomic_dec_uint (store-release) in
+	 * ucas_critical_cpu_gate.
+	 */
+	while (atomic_load_acquire(&ucas_critical_pausing_cpus) > 0) {
 		SPINLOCK_BACKOFF(count);
 	}
 }
 #endif /* ! __HAVE_UCAS_MP && MULTIPROCESSOR */
 
 static inline void
 ucas_critical_enter(lwp_t * const l)
 {
 
 #if !defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)
 	if (ncpu > 1) {
 		RUN_ONCE(&ucas_critical_init_once, ucas_critical_init);
 
 		/*
 		 * Acquire the mutex first, then go to splhigh() and
 		 * broadcast the IPI to lock all of the other CPUs
 		 * behind the gate.
 		 *
 		 * N.B. Going to splhigh() implicitly disables preemption,
 		 * so there's no need to do it explicitly.
 		 */
 		mutex_enter(&cpu_lock);
 		ucas_critical_splcookie = splhigh();
 		ucas_critical_pausing_cpus = ncpu - 1;
-		membar_enter();
-
 		ipi_trigger_broadcast(ucas_critical_ipi, true);
 		ucas_critical_wait();
 		return;
 	}
 #endif /* ! __HAVE_UCAS_MP && MULTIPROCESSOR */
 
 	KPREEMPT_DISABLE(l);
 }
 
 static inline void
 ucas_critical_exit(lwp_t * const l)
 {
 
 #if !defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)
 	if (ncpu > 1) {
-		membar_exit();
-		ucas_critical_pausing_cpus = (u_int)-1;
+		/*
+		 * Open the gate and notify all CPUs in
+		 * ucas_critical_cpu_gate that they can now proceed.
+		 * Using a store-release ensures the ucas happens
+		 * before any memory operations they issue after the
+		 * IPI -- they won't observe any stale cache of the
+		 * target word, for instance.
+		 *
+		 * Matches atomic_load_acquire in ucas_critical_cpu_gate.
+		 */
+		atomic_store_release(&ucas_critical_pausing_cpus, (u_int)-1);
 		splx(ucas_critical_splcookie);
 		mutex_exit(&cpu_lock);
 		return;
 	}
 #endif /* ! __HAVE_UCAS_MP && MULTIPROCESSOR */
 
 	KPREEMPT_ENABLE(l);
 }
 
 int
 _ucas_32(volatile uint32_t *uaddr, uint32_t old, uint32_t new, uint32_t *ret)
 {
 	lwp_t * const l = curlwp;