Sat Feb 12 17:10:02 2022 UTC ()
mips: Brush up __cpu_simple_lock.

- Eradicate last vestiges of mb_* barriers.

- In __cpu_simple_lock_init, omit needless barrier.  It is the
  caller's responsibility to ensure __cpu_simple_lock_init happens
  before other operations on it anyway, so there was never any need
  for a barrier here.

- In __cpu_simple_lock_try, leave comments about memory ordering
  guarantees of the kernel's _atomic_cas_uint, which are inexplicably
  different from the non-underscored atomic_cas_uint.

- In __cpu_simple_unlock, use membar_exit instead of mb_memory, and do
  it unconditionally.

  This ensures that in __cpu_simple_lock/.../__cpu_simple_unlock, all
  memory operations in the ellipsis happen before the store that
  releases the lock.

  - On Octeon, the barrier was omitted altogether, which is a bug --
    it needs to be there or else there is no happens-before relation
    and whoever takes the lock next might see stale values stored or
    even stomp over the unlocking CPU's delayed loads.

  - On non-Octeon, the mb_memory was sync.  Using membar_exit
    preserves this.

  XXX On Octeon, membar_exit only issues syncw -- this seems wrong,
  only store-before-store and not load/store-before-store, unless the
  CNMIPS architecture guarantees it is sufficient here like
  SPARCv8/v9 PSO (`Partial Store Order').

- Leave an essay with citations about why we have an apparently
  pointless syncw _after_ releasing a lock, to work around a design
  bug^W^Wquirk in cnmips which sometimes buffers stores for hundreds
  of thousands of cycles for fun unless you issue syncw.


(riastradh)
diff -r1.10 -r1.11 src/common/lib/libc/arch/mips/atomic/membar_ops.S
diff -r1.21 -r1.22 src/sys/arch/mips/include/lock.h

cvs diff -r1.10 -r1.11 src/common/lib/libc/arch/mips/atomic/membar_ops.S (expand / switch to unified diff)

--- src/common/lib/libc/arch/mips/atomic/membar_ops.S 2020/08/10 14:37:38 1.10
+++ src/common/lib/libc/arch/mips/atomic/membar_ops.S 2022/02/12 17:10:02 1.11
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: membar_ops.S,v 1.10 2020/08/10 14:37:38 skrll Exp $ */ 1/* $NetBSD: membar_ops.S,v 1.11 2022/02/12 17:10:02 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2006, 2007 The NetBSD Foundation, Inc. 4 * Copyright (c) 2006, 2007 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe, and by Andrew Doran. 8 * by Jason R. Thorpe, and by Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
@@ -36,36 +36,26 @@ @@ -36,36 +36,26 @@
36 36
37LEAF(_membar_sync) 37LEAF(_membar_sync)
38 j ra 38 j ra
39 BDSYNC 39 BDSYNC
40END(_membar_sync) 40END(_membar_sync)
41 41
42#ifdef __OCTEON__ 42#ifdef __OCTEON__
43LEAF(_membar_producer) 43LEAF(_membar_producer)
44 j ra 44 j ra
45 syncw 45 syncw
46END(_membar_producer) 46END(_membar_producer)
47#endif 47#endif
48 48
49#ifdef _KERNEL 
50STRONG_ALIAS(mb_read, _membar_sync) 
51#ifdef __OCTEON__ 
52STRONG_ALIAS(mb_write, _membar_producer) 
53#else 
54STRONG_ALIAS(mb_write, _membar_sync) 
55#endif 
56STRONG_ALIAS(mb_memory, _membar_sync) 
57#endif 
58 
59ATOMIC_OP_ALIAS(membar_sync,_membar_sync) 49ATOMIC_OP_ALIAS(membar_sync,_membar_sync)
60ATOMIC_OP_ALIAS(membar_enter,_membar_sync) 50ATOMIC_OP_ALIAS(membar_enter,_membar_sync)
61STRONG_ALIAS(_membar_enter,_membar_sync) 51STRONG_ALIAS(_membar_enter,_membar_sync)
62#ifdef __OCTEON__ 52#ifdef __OCTEON__
63ATOMIC_OP_ALIAS(membar_exit,_membar_producer) 53ATOMIC_OP_ALIAS(membar_exit,_membar_producer)
64STRONG_ALIAS(_membar_exit,_membar_producer) 54STRONG_ALIAS(_membar_exit,_membar_producer)
65STRONG_ALIAS(membar_producer,_membar_producer) 55STRONG_ALIAS(membar_producer,_membar_producer)
66#else 56#else
67ATOMIC_OP_ALIAS(membar_exit,_membar_sync) 57ATOMIC_OP_ALIAS(membar_exit,_membar_sync)
68STRONG_ALIAS(_membar_exit,_membar_sync) 58STRONG_ALIAS(_membar_exit,_membar_sync)
69ATOMIC_OP_ALIAS(membar_producer,_membar_sync) 59ATOMIC_OP_ALIAS(membar_producer,_membar_sync)
70STRONG_ALIAS(_membar_producer,_membar_sync) 60STRONG_ALIAS(_membar_producer,_membar_sync)
71#endif 61#endif

cvs diff -r1.21 -r1.22 src/sys/arch/mips/include/lock.h (expand / switch to unified diff)

--- src/sys/arch/mips/include/lock.h 2020/08/05 05:24:44 1.21
+++ src/sys/arch/mips/include/lock.h 2022/02/12 17:10:02 1.22
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: lock.h,v 1.21 2020/08/05 05:24:44 simonb Exp $ */ 1/* $NetBSD: lock.h,v 1.22 2022/02/12 17:10:02 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2001, 2007 The NetBSD Foundation, Inc. 4 * Copyright (c) 2001, 2007 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Wayne Knowles and Andrew Doran. 8 * by Wayne Knowles and Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
@@ -31,26 +31,28 @@ @@ -31,26 +31,28 @@
31 31
32/* 32/*
33 * Machine-dependent spin lock operations for MIPS processors. 33 * Machine-dependent spin lock operations for MIPS processors.
34 * 34 *
35 * Note: R2000/R3000 doesn't have any atomic update instructions; this 35 * Note: R2000/R3000 doesn't have any atomic update instructions; this
36 * will cause problems for user applications using this header. 36 * will cause problems for user applications using this header.
37 */ 37 */
38 38
39#ifndef _MIPS_LOCK_H_ 39#ifndef _MIPS_LOCK_H_
40#define _MIPS_LOCK_H_ 40#define _MIPS_LOCK_H_
41 41
42#include <sys/param.h> 42#include <sys/param.h>
43 43
 44#include <sys/atomic.h>
 45
44static __inline int 46static __inline int
45__SIMPLELOCK_LOCKED_P(const __cpu_simple_lock_t *__ptr) 47__SIMPLELOCK_LOCKED_P(const __cpu_simple_lock_t *__ptr)
46{ 48{
47 return *__ptr != __SIMPLELOCK_UNLOCKED; 49 return *__ptr != __SIMPLELOCK_UNLOCKED;
48} 50}
49 51
50static __inline int 52static __inline int
51__SIMPLELOCK_UNLOCKED_P(const __cpu_simple_lock_t *__ptr) 53__SIMPLELOCK_UNLOCKED_P(const __cpu_simple_lock_t *__ptr)
52{ 54{
53 return *__ptr == __SIMPLELOCK_UNLOCKED; 55 return *__ptr == __SIMPLELOCK_UNLOCKED;
54} 56}
55 57
56static __inline void 58static __inline void
@@ -88,109 +90,117 @@ __cpu_simple_lock_try(__cpu_simple_lock_ @@ -88,109 +90,117 @@ __cpu_simple_lock_try(__cpu_simple_lock_
88 " j 3f \n" 90 " j 3f \n"
89 " nop \n" 91 " nop \n"
90 " nop \n" 92 " nop \n"
91 "2: li %1, 0 \n" 93 "2: li %1, 0 \n"
92 "3: \n" 94 "3: \n"
93 " .set pop \n" 95 " .set pop \n"
94 "# -- END __cpu_simple_lock_try \n" 96 "# -- END __cpu_simple_lock_try \n"
95 : "=r" (t0), "=r" (v0), "+m" (*lp) 97 : "=r" (t0), "=r" (v0), "+m" (*lp)
96 : "i" (__SIMPLELOCK_LOCKED), "m" (*lp)); 98 : "i" (__SIMPLELOCK_LOCKED), "m" (*lp));
97 99
98 return (v0 != 0); 100 return (v0 != 0);
99} 101}
100 102
101#ifdef MIPS1 
102static __inline void 
103mb_read(void) 
104{ 
105 __insn_barrier(); 
106} 
107 
108static __inline void 
109mb_write(void) 
110{ 
111 __insn_barrier(); 
112} 
113 
114static __inline void 
115mb_memory(void) 
116{ 
117 __insn_barrier(); 
118} 
119#else /* MIPS1*/ 
120static __inline void 
121mb_read(void) 
122{ 
123 __asm volatile( 
124 " .set push \n" 
125 " .set mips2 \n" 
126 " sync \n" 
127 " .set pop" 
128 ::: "memory" 
129 ); 
130} 
131 
132static __inline void 
133mb_write(void) 
134{ 
135 mb_read(); 
136} 
137 
138static __inline void 
139mb_memory(void) 
140{ 
141 mb_read(); 
142} 
143#endif /* MIPS1 */ 
144 
145#else /* !_HARDKERNEL */ 103#else /* !_HARDKERNEL */
146 104
147u_int _atomic_cas_uint(volatile u_int *, u_int, u_int); 105u_int _atomic_cas_uint(volatile u_int *, u_int, u_int);
148u_long _atomic_cas_ulong(volatile u_long *, u_long, u_long); 106u_long _atomic_cas_ulong(volatile u_long *, u_long, u_long);
149void * _atomic_cas_ptr(volatile void *, void *, void *); 107void * _atomic_cas_ptr(volatile void *, void *, void *);
150void mb_read(void); 
151void mb_write(void); 
152void mb_memory(void); 
153 108
154static __inline int 109static __inline int
155__cpu_simple_lock_try(__cpu_simple_lock_t *lp) 110__cpu_simple_lock_try(__cpu_simple_lock_t *lp)
156{ 111{
157 112
 113 /*
 114 * Successful _atomic_cas_uint functions as a load-acquire --
 115 * on MP systems, it issues sync after the LL/SC CAS succeeds;
 116 * on non-MP systems every load is a load-acquire so it's moot.
 117 * This pairs with the membar_exit and store sequence in
 118 * __cpu_simple_unlock that functions as a store-release
 119 * operation.
 120 *
 121 * NOTE: This applies only to _atomic_cas_uint (with the
 122 * underscore), in sys/arch/mips/mips/lock_stubs_*.S. Not true
 123 * for atomic_cas_uint (without the underscore), from
 124 * common/lib/libc/arch/mips/atomic/atomic_cas.S which does not
 125 * imply a load-acquire. It is unclear why these disagree.
 126 */
158 return _atomic_cas_uint(lp, 127 return _atomic_cas_uint(lp,
159 __SIMPLELOCK_UNLOCKED, __SIMPLELOCK_LOCKED) == 128 __SIMPLELOCK_UNLOCKED, __SIMPLELOCK_LOCKED) ==
160 __SIMPLELOCK_UNLOCKED; 129 __SIMPLELOCK_UNLOCKED;
161} 130}
162 131
163#endif /* _HARDKERNEL */ 132#endif /* _HARDKERNEL */
164 133
165static __inline void 134static __inline void
166__cpu_simple_lock_init(__cpu_simple_lock_t *lp) 135__cpu_simple_lock_init(__cpu_simple_lock_t *lp)
167{ 136{
168 137
169 *lp = __SIMPLELOCK_UNLOCKED; 138 *lp = __SIMPLELOCK_UNLOCKED;
170 mb_memory(); 
171} 139}
172 140
173static __inline void 141static __inline void
174__cpu_simple_lock(__cpu_simple_lock_t *lp) 142__cpu_simple_lock(__cpu_simple_lock_t *lp)
175{ 143{
176 144
177 while (!__cpu_simple_lock_try(lp)) { 145 while (!__cpu_simple_lock_try(lp)) {
178 while (*lp == __SIMPLELOCK_LOCKED) 146 while (*lp == __SIMPLELOCK_LOCKED)
179 /* spin */; 147 /* spin */;
180 } 148 }
181} 149}
182 150
183static __inline void 151static __inline void
184__cpu_simple_unlock(__cpu_simple_lock_t *lp) 152__cpu_simple_unlock(__cpu_simple_lock_t *lp)
185{ 153{
186 154
187#ifndef _MIPS_ARCH_OCTEONP 155 /*
188 mb_memory(); 156 * The membar_exit and then store functions as a store-release
189#endif 157 * operation that pairs with the load-acquire operation in
 158 * successful __cpu_simple_lock_try.
 159 *
 160 * Can't use atomic_store_release here because that's not
 161 * available in userland at the moment.
 162 */
 163 membar_exit();
190 *lp = __SIMPLELOCK_UNLOCKED; 164 *lp = __SIMPLELOCK_UNLOCKED;
 165
191#ifdef _MIPS_ARCH_OCTEONP 166#ifdef _MIPS_ARCH_OCTEONP
192 mb_write(); 167 /*
 168 * On Cavium's recommendation, we issue an extra SYNCW that is
 169 * not necessary for correct ordering because apparently stores
 170 * can get stuck in Octeon store buffers for hundreds of
 171 * thousands of cycles, according to the following note:
 172 *
 173 * Programming Notes:
 174 * [...]
 175 * Core A (writer)
 176 * SW R1, DATA
 177 * LI R2, 1
 178 * SYNCW
 179 * SW R2, FLAG
 180 * SYNCW
 181 * [...]
 182 *
 183 * The second SYNCW instruction executed by core A is not
 184 * necessary for correctness, but has very important
 185 * performance effects on OCTEON. Without it, the store
 186 * to FLAG may linger in core A's write buffer before it
 187 * becomes visible to other cores. (If core A is not
 188 * performing many stores, this may add hundreds of
 189 * thousands of cycles to the flag release time since the
 190 * OCTEON core normally retains stores to attempt to merge
 191 * them before sending the store on the CMB.)
 192 * Applications should include this second SYNCW
 193 * instruction after flag or lock releases.
 194 *
 195 * Cavium Networks OCTEON Plus CN50XX Hardware Reference
 196 * Manual, July 2008, Appendix A, p. 943.
 197 * https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/hactive/CN50XX-HRM-V0.99E.pdf
 198 *
 199 * XXX It might be prudent to put this into
 200 * atomic_store_release itself.
 201 */
 202 __asm volatile("syncw" ::: "memory");
193#endif 203#endif
194} 204}
195 205
196#endif /* _MIPS_LOCK_H_ */ 206#endif /* _MIPS_LOCK_H_ */