Fri Feb 11 17:53:28 2022 UTC
ucas(9): Membar audit.

- Omit needless membar_enter before ipi_trigger_broadcast.  This was
  presumably intended to establish a happens-before relation between
  the following two CPUs:

	/* CPU doing ucas */
	ucas_critical_enter()
		ucas_critical_pausing_cpus = ncpu - 1	(A)
		ipi_trigger_broadcast()

	/* other CPU walking by whistling innocently */
	IPI handler
	ucas_critical_cpu_gate()
		load ucas_critical_pausing_cpus		(B)

  That is, this was presumably meant to ensure (A) happens-before (B).
  This relation is already guaranteed by ipi(9), so there is no need
  for any explicit memory barrier.
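
  Conceptually -- this is a sketch of where the guarantee comes from,
  not the actual ipi(9) internals -- the IPI machinery supplies an
  equivalent release/acquire pairing on its own:

	/* CPU doing ucas, inside ipi_trigger_broadcast */
	...store-release marking the IPI pending...	/* after (A) */

	/* other CPU, in the IPI dispatch path */
	...load-acquire observing the pending IPI...	/* before (B) */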

- Issue a store-release in ucas_critical_cpu_gate so we have the
  following happens-before relation, which was otherwise guaranteed
  only if __HAVE_ATOMIC_AS_MEMBAR is defined:

	/* other CPU walking by whistling innocently */
	...other logic touching the target ucas word...	(A)
	IPI handler
	ucas_critical_cpu_gate()
		...
		atomic_dec_uint(&ucas_critical_pausing_cpus)

  happens-before

	/* CPU doing ucas */
	ucas_critical_enter() -> ucas_critical_wait();
	...touching the word with ufetch/ustore...	(B)

  We need to ensure the logic (A) on another CPU touching the target
  ucas word happens-before we actually do the ucas at (B).

  (a) This requires the other CPU to do a store-release on
      ucas_critical_pausing_cpus in ucas_critical_cpu_gate, and

  (b) this requires the ucas CPU to do a load-acquire on
      ucas_critical_pausing_cpus in ucas_critical_wait.

  Without _both_ sides -- store-release and then load-acquire -- there
  is no such happens-before guarantee; another CPU may have a buffered
  store, for instance, that clobbers the ucas.

  For now, do the store-release with membar_exit conditional on
  __HAVE_ATOMIC_AS_MEMBAR and then atomic_dec_uint -- later with the
  C11 API we can dispense with the #ifdef and just use
  atomic_fetch_add_explicit(..., memory_order_release).  The
  load-acquire we can do with atomic_load_acquire.
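
  As a standalone illustration -- a sketch of that eventual C11 form,
  not the code committed here; gate_side/wait_side are illustrative
  names -- the pairing would read:

	#include <stdatomic.h>

	static atomic_uint ucas_critical_pausing_cpus;

	/* other CPU, in ucas_critical_cpu_gate */
	static void
	gate_side(void)
	{
		/* (a) store-release: decrement and publish all our
		 * prior stores along with it */
		atomic_fetch_add_explicit(&ucas_critical_pausing_cpus,
		    -1, memory_order_release);
	}

	/* CPU doing ucas, in ucas_critical_wait */
	static void
	wait_side(void)
	{
		/* (b) load-acquire: once we see the decrement, we
		 * also see every store published before it */
		while (atomic_load_explicit(&ucas_critical_pausing_cpus,
		    memory_order_acquire) > 0)
			continue;	/* spin */
	}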

- Issue a load-acquire in ucas_critical_cpu_gate so we have the
  following happens-before relation, which was otherwise not
  guaranteed:

	/* CPU doing ucas */
	...ufetch/ustore...				(A)
	ucas_critical_exit()
		ucas_critical_pausing_cpus = -1;

	/* other CPU walking by whistling innocently */
	IPI handler
	ucas_critical_cpu_gate()
		...
		while (ucas_critical_pausing_cpus != -1)
			spin;
	...other logic touching the target ucas word...	(B)

  We need to ensure the ucas logic at (A) happens-before any logic on
  another CPU that might use the result at (B).

  (a) This requires that the ucas CPU do a store-release on
      ucas_critical_pausing_cpus in ucas_critical_exit, and

  (b) this requires that the other CPU do a load-acquire on
      ucas_critical_pausing_cpus in ucas_critical_cpu_gate.

  Without _both_ sides -- store-release and then load-acquire -- there
  is no such happens-before guarantee; the other CPU might witness a
  stale cached value of the target location alongside a new value of
  some other location, i.e., observe the two stores out of order.
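
  In the same eventual C11 idiom -- again a sketch, not the committed
  code -- this second pairing would read:

	/* CPU doing ucas, in ucas_critical_exit */
	atomic_store_explicit(&ucas_critical_pausing_cpus, -1u,
	    memory_order_release);		/* (a) store-release */

	/* other CPU, in ucas_critical_cpu_gate */
	while (atomic_load_explicit(&ucas_critical_pausing_cpus,
	    memory_order_acquire) != -1u)	/* (b) load-acquire */
		continue;			/* spin */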

- Use atomic_load/store_* to avoid the appearance of races, e.g. for
  sanitizers.
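
  For instance, the KASSERT at the top of ucas_critical_cpu_gate
  changes from a plain load to a relaxed atomic load (as in the diff
  below) -- essentially the same generated code, but sanitizers such
  as KCSAN know the concurrent access is intentional:

	/* before: plain load, reported as a data race */
	KASSERT(ucas_critical_pausing_cpus > 0);

	/* after: relaxed atomic load, no ordering implied, but
	 * race-free as far as the sanitizer is concerned */
	KASSERT(atomic_load_relaxed(&ucas_critical_pausing_cpus) > 0);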

- Document which barriers pair up with which barriers and what they're
  doing.


(riastradh)
cvs diff -r1.14 -r1.15 src/sys/kern/subr_copy.c
--- src/sys/kern/subr_copy.c 2020/05/23 23:42:43 1.14
+++ src/sys/kern/subr_copy.c 2022/02/11 17:53:28 1.15
@@ -1,4 +1,4 @@
-/*	$NetBSD: subr_copy.c,v 1.14 2020/05/23 23:42:43 ad Exp $	*/
+/*	$NetBSD: subr_copy.c,v 1.15 2022/02/11 17:53:28 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 1997, 1998, 1999, 2002, 2007, 2008, 2019
@@ -80,7 +80,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: subr_copy.c,v 1.14 2020/05/23 23:42:43 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: subr_copy.c,v 1.15 2022/02/11 17:53:28 riastradh Exp $");
 
 #define	__UFETCHSTORE_PRIVATE
 #define	__UCAS_PRIVATE
@@ -400,9 +400,32 @@
 {
 	int count = SPINLOCK_BACKOFF_MIN;
 
-	KASSERT(ucas_critical_pausing_cpus > 0);
+	KASSERT(atomic_load_relaxed(&ucas_critical_pausing_cpus) > 0);
+
+	/*
+	 * Notify ucas_critical_wait that we have stopped.  Using
+	 * store-release ensures all our memory operations up to the
+	 * IPI happen before the ucas -- no buffered stores on our end
+	 * can clobber it later on, for instance.
+	 *
+	 * Matches atomic_load_acquire in ucas_critical_wait -- turns
+	 * the following atomic_dec_uint into a store-release.
+	 */
+#ifndef __HAVE_ATOMIC_AS_MEMBAR
+	membar_exit();
+#endif
 	atomic_dec_uint(&ucas_critical_pausing_cpus);
-	while (ucas_critical_pausing_cpus != (u_int)-1) {
+
+	/*
+	 * Wait for ucas_critical_exit to reopen the gate and let us
+	 * proceed.  Using a load-acquire ensures the ucas happens
+	 * before any of our memory operations when we return from the
+	 * IPI and proceed -- we won't observe any stale cached value
+	 * that the ucas overwrote, for instance.
+	 *
+	 * Matches atomic_store_release in ucas_critical_exit.
+	 */
+	while (atomic_load_acquire(&ucas_critical_pausing_cpus) != (u_int)-1) {
 		SPINLOCK_BACKOFF(count);
 	}
 }
@@ -410,6 +433,7 @@
 static int
 ucas_critical_init(void)
 {
+
 	ucas_critical_ipi = ipi_register(ucas_critical_cpu_gate, NULL);
 	return 0;
 }
@@ -419,7 +443,16 @@
 {
 	int count = SPINLOCK_BACKOFF_MIN;
 
-	while (ucas_critical_pausing_cpus > 0) {
+	/*
+	 * Wait for all CPUs to stop at the gate.  Using a load-acquire
+	 * ensures all memory operations before they stop at the gate
+	 * happen before the ucas -- no buffered stores in other CPUs
+	 * can clobber it later on, for instance.
+	 *
+	 * Matches membar_exit/atomic_dec_uint (store-release) in
+	 * ucas_critical_cpu_gate.
+	 */
+	while (atomic_load_acquire(&ucas_critical_pausing_cpus) > 0) {
 		SPINLOCK_BACKOFF(count);
 	}
 }
@@ -444,8 +477,6 @@
 		mutex_enter(&cpu_lock);
 		ucas_critical_splcookie = splhigh();
 		ucas_critical_pausing_cpus = ncpu - 1;
-		membar_enter();
-
 		ipi_trigger_broadcast(ucas_critical_ipi, true);
 		ucas_critical_wait();
 		return;
@@ -461,8 +492,17 @@
 
 #if !defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)
 	if (ncpu > 1) {
-		membar_exit();
-		ucas_critical_pausing_cpus = (u_int)-1;
+		/*
+		 * Open the gate and notify all CPUs in
+		 * ucas_critical_cpu_gate that they can now proceed.
+		 * Using a store-release ensures the ucas happens
+		 * before any memory operations they issue after the
+		 * IPI -- they won't observe any stale cache of the
+		 * target word, for instance.
+		 *
+		 * Matches atomic_load_acquire in ucas_critical_cpu_gate.
+		 */
+		atomic_store_release(&ucas_critical_pausing_cpus, (u_int)-1);
 		splx(ucas_critical_splcookie);
 		mutex_exit(&cpu_lock);
 		return;