Sat Feb 12 17:10:02 2022 UTC ()
mips: Brush up __cpu_simple_lock.

- Eradicate last vestiges of mb_* barriers.

- In __cpu_simple_lock_init, omit needless barrier.  It is the
  caller's responsibility to ensure __cpu_simple_lock_init happens
  before other operations on it anyway, so there was never any need
  for a barrier here.

- In __cpu_simple_lock_try, leave comments about memory ordering
  guarantees of the kernel's _atomic_cas_uint, which are inexplicably
  different from the non-underscored atomic_cas_uint.

- In __cpu_simple_unlock, use membar_exit instead of mb_memory, and do
  it unconditionally.

  This ensures that in __cpu_simple_lock/.../__cpu_simple_unlock, all
  memory operations in the ellipsis happen before the store that
  releases the lock.

  - On Octeon, the barrier was omitted altogether, which is a bug --
    it needs to be there or else there is no happens-before relation
    and whoever takes the lock next might see stale values stored or
    even stomp over the unlocking CPU's delayed loads.

  - On non-Octeon, the mb_memory was sync.  Using membar_exit
    preserves this.

  XXX On Octeon, membar_exit only issues syncw -- this seems wrong,
  only store-before-store and not load/store-before-store, unless the
  CNMIPS architecture guarantees it is sufficient here like
  SPARCv8/v9 PSO (`Partial Store Order').

- Leave an essay with citations about why we have an apparently
  pointless syncw _after_ releasing a lock, to work around a design
  bug^W^Wquirk in cnmips which sometimes buffers stores for hundreds
  of thousands of cycles for fun unless you issue syncw.


(riastradh)
diff -r1.10 -r1.11 src/common/lib/libc/arch/mips/atomic/membar_ops.S
diff -r1.21 -r1.22 src/sys/arch/mips/include/lock.h

cvs diff -r1.10 -r1.11 src/common/lib/libc/arch/mips/atomic/membar_ops.S (expand / switch to context diff)
--- src/common/lib/libc/arch/mips/atomic/membar_ops.S 2020/08/10 14:37:38 1.10
+++ src/common/lib/libc/arch/mips/atomic/membar_ops.S 2022/02/12 17:10:02 1.11
@@ -1,4 +1,4 @@
-/*	$NetBSD: membar_ops.S,v 1.10 2020/08/10 14:37:38 skrll Exp $	*/
+/*	$NetBSD: membar_ops.S,v 1.11 2022/02/12 17:10:02 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2006, 2007 The NetBSD Foundation, Inc.
@@ -44,16 +44,6 @@
 	j	ra
 	 syncw
 END(_membar_producer)
-#endif
-
-#ifdef _KERNEL
-STRONG_ALIAS(mb_read, _membar_sync)
-#ifdef __OCTEON__
-STRONG_ALIAS(mb_write, _membar_producer)
-#else
-STRONG_ALIAS(mb_write, _membar_sync)
-#endif
-STRONG_ALIAS(mb_memory, _membar_sync)
 #endif
 
 ATOMIC_OP_ALIAS(membar_sync,_membar_sync)

cvs diff -r1.21 -r1.22 src/sys/arch/mips/include/lock.h (expand / switch to context diff)
--- src/sys/arch/mips/include/lock.h 2020/08/05 05:24:44 1.21
+++ src/sys/arch/mips/include/lock.h 2022/02/12 17:10:02 1.22
@@ -1,4 +1,4 @@
-/*	$NetBSD: lock.h,v 1.21 2020/08/05 05:24:44 simonb Exp $	*/
+/*	$NetBSD: lock.h,v 1.22 2022/02/12 17:10:02 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2001, 2007 The NetBSD Foundation, Inc.
@@ -41,6 +41,8 @@
 
 #include <sys/param.h>
 
+#include <sys/atomic.h>
+
 static __inline int
 __SIMPLELOCK_LOCKED_P(const __cpu_simple_lock_t *__ptr)
 {
@@ -98,63 +100,30 @@
 	return (v0 != 0);
 }
 
-#ifdef MIPS1
-static __inline void
-mb_read(void)
-{
-	__insn_barrier();
-}
-
-static __inline void
-mb_write(void)
-{
-	__insn_barrier();
-}
-
-static __inline void
-mb_memory(void)
-{
-	__insn_barrier();
-}
-#else	/* MIPS1*/
-static __inline void
-mb_read(void)
-{
-	__asm volatile(
-		"	.set push		\n"
-		"	.set mips2		\n"
-		"	sync			\n"
-		"	.set pop"
-		::: "memory"
-	);
-}
-
-static __inline void
-mb_write(void)
-{
-	mb_read();
-}
-
-static __inline void
-mb_memory(void)
-{
-	mb_read();
-}
-#endif	/* MIPS1 */
-
 #else	/* !_HARDKERNEL */
 
 u_int	_atomic_cas_uint(volatile u_int *, u_int, u_int);
 u_long	_atomic_cas_ulong(volatile u_long *, u_long, u_long);
 void *	_atomic_cas_ptr(volatile void *, void *, void *);
-void	mb_read(void);
-void	mb_write(void);
-void	mb_memory(void);
 
 static __inline int
 __cpu_simple_lock_try(__cpu_simple_lock_t *lp)
 {
 
+	/*
+	 * Successful _atomic_cas_uint functions as a load-acquire --
+	 * on MP systems, it issues sync after the LL/SC CAS succeeds;
+	 * on non-MP systems every load is a load-acquire so it's moot.
+	 * This pairs with the membar_exit and store sequence in
+	 * __cpu_simple_unlock that functions as a store-release
+	 * operation.
+	 *
+	 * NOTE: This applies only to _atomic_cas_uint (with the
+	 * underscore), in sys/arch/mips/mips/lock_stubs_*.S.  Not true
+	 * for atomic_cas_uint (without the underscore), from
+	 * common/lib/libc/arch/mips/atomic/atomic_cas.S which does not
+	 * imply a load-acquire.  It is unclear why these disagree.
+	 */
 	return _atomic_cas_uint(lp,
 	    __SIMPLELOCK_UNLOCKED, __SIMPLELOCK_LOCKED) ==
 	    __SIMPLELOCK_UNLOCKED;
@@ -167,7 +136,6 @@
 {
 
 	*lp = __SIMPLELOCK_UNLOCKED;
-	mb_memory();
 }
 
 static __inline void
@@ -184,12 +152,54 @@
 __cpu_simple_unlock(__cpu_simple_lock_t *lp)
 {
 
-#ifndef _MIPS_ARCH_OCTEONP
-	mb_memory();
-#endif
+	/*
+	 * The membar_exit and then store functions as a store-release
+	 * operation that pairs with the load-acquire operation in
+	 * successful __cpu_simple_lock_try.
+	 *
+	 * Can't use atomic_store_release here because that's not
+	 * available in userland at the moment.
+	 */
+	membar_exit();
 	*lp = __SIMPLELOCK_UNLOCKED;
+
 #ifdef _MIPS_ARCH_OCTEONP
-	mb_write();
+	/*
+	 * On Cavium's recommendation, we issue an extra SYNCW that is
+	 * not necessary for correct ordering because apparently stores
+	 * can get stuck in Octeon store buffers for hundreds of
+	 * thousands of cycles, according to the following note:
+	 *
+	 *	Programming Notes:
+	 *	[...]
+	 *	Core A (writer)
+	 *	SW R1, DATA
+	 *	LI R2, 1
+	 *	SYNCW
+	 *	SW R2, FLAG
+	 *	SYNCW
+	 *	[...]
+	 *
+	 *	The second SYNCW instruction executed by core A is not
+	 *	necessary for correctness, but has very important
+	 *	performance effects on OCTEON.  Without it, the store
+	 *	to FLAG may linger in core A's write buffer before it
+	 *	becomes visible to other cores.  (If core A is not
+	 *	performing many stores, this may add hundreds of
+	 *	thousands of cycles to the flag release time since the
+	 *	OCTEON core normally retains stores to attempt to merge
+	 *	them before sending the store on the CMB.)
+	 *	Applications should include this second SYNCW
+	 *	instruction after flag or lock releases.
+	 *
+	 * Cavium Networks OCTEON Plus CN50XX Hardware Reference
+	 * Manual, July 2008, Appendix A, p. 943.
+	 * https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/hactive/CN50XX-HRM-V0.99E.pdf
+	 *
+	 * XXX It might be prudent to put this into
+	 * atomic_store_release itself.
+	 */
+	__asm volatile("syncw" ::: "memory");
 #endif
 }