1 | .\" $NetBSD: atomic_loadstore.9,v 1.5 2019/12/08 00:00:59 uwe Exp $ | | 1 | .\" $NetBSD: atomic_loadstore.9,v 1.6 2020/09/03 00:23:57 riastradh Exp $ |
2 | .\" | | 2 | .\" |
3 | .\" Copyright (c) 2019 The NetBSD Foundation | | 3 | .\" Copyright (c) 2019 The NetBSD Foundation |
4 | .\" All rights reserved. | | 4 | .\" All rights reserved. |
5 | .\" | | 5 | .\" |
6 | .\" This code is derived from software contributed to The NetBSD Foundation | | 6 | .\" This code is derived from software contributed to The NetBSD Foundation |
7 | .\" by Taylor R. Campbell. | | 7 | .\" by Taylor R. Campbell. |
8 | .\" | | 8 | .\" |
9 | .\" Redistribution and use in source and binary forms, with or without | | 9 | .\" Redistribution and use in source and binary forms, with or without |
10 | .\" modification, are permitted provided that the following conditions | | 10 | .\" modification, are permitted provided that the following conditions |
11 | .\" are met: | | 11 | .\" are met: |
12 | .\" 1. Redistributions of source code must retain the above copyright | | 12 | .\" 1. Redistributions of source code must retain the above copyright |
13 | .\" notice, this list of conditions and the following disclaimer. | | 13 | .\" notice, this list of conditions and the following disclaimer. |
14 | .\" 2. Redistributions in binary form must reproduce the above copyright | | 14 | .\" 2. Redistributions in binary form must reproduce the above copyright |
15 | .\" notice, this list of conditions and the following disclaimer in the | | 15 | .\" notice, this list of conditions and the following disclaimer in the |
16 | .\" documentation and/or other materials provided with the distribution. | | 16 | .\" documentation and/or other materials provided with the distribution. |
17 | .\" | | 17 | .\" |
18 | .\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | | 18 | .\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
19 | .\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | | 19 | .\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
20 | .\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | | 20 | .\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
21 | .\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | | 21 | .\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
22 | .\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | | 22 | .\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
23 | .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | | 23 | .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
24 | .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | | 24 | .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
25 | .\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | | 25 | .\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
26 | .\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | | 26 | .\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
27 | .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | | 27 | .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
28 | .\" POSSIBILITY OF SUCH DAMAGE. | | 28 | .\" POSSIBILITY OF SUCH DAMAGE. |
29 | .\" | | 29 | .\" |
30 | .Dd November 25, 2019 | | 30 | .Dd November 25, 2019 |
31 | .Dt ATOMIC_LOADSTORE 9 | | 31 | .Dt ATOMIC_LOADSTORE 9 |
32 | .Os | | 32 | .Os |
33 | .Sh NAME | | 33 | .Sh NAME |
34 | .Nm atomic_load_relaxed , | | 34 | .Nm atomic_load_relaxed , |
35 | .Nm atomic_load_acquire , | | 35 | .Nm atomic_load_acquire , |
36 | .Nm atomic_load_consume , | | 36 | .Nm atomic_load_consume , |
37 | .Nm atomic_store_relaxed , | | 37 | .Nm atomic_store_relaxed , |
38 | .Nm atomic_store_release | | 38 | .Nm atomic_store_release |
39 | .Nd atomic and ordered memory operations | | 39 | .Nd atomic and ordered memory operations |
40 | .Sh SYNOPSIS | | 40 | .Sh SYNOPSIS |
41 | .In sys/atomic.h | | 41 | .In sys/atomic.h |
42 | .Ft T | | 42 | .Ft T |
43 | .Fn atomic_load_relaxed "const volatile T *p" | | 43 | .Fn atomic_load_relaxed "const volatile T *p" |
44 | .Ft T | | 44 | .Ft T |
45 | .Fn atomic_load_acquire "const volatile T *p" | | 45 | .Fn atomic_load_acquire "const volatile T *p" |
46 | .Ft T | | 46 | .Ft T |
47 | .Fn atomic_load_consume "const volatile T *p" | | 47 | .Fn atomic_load_consume "const volatile T *p" |
48 | .Ft void | | 48 | .Ft void |
49 | .Fn atomic_store_relaxed "volatile T *p" "T v" | | 49 | .Fn atomic_store_relaxed "volatile T *p" "T v" |
50 | .Ft void | | 50 | .Ft void |
51 | .Fn atomic_store_release "volatile T *p" "T v" | | 51 | .Fn atomic_store_release "volatile T *p" "T v" |
52 | .Sh DESCRIPTION | | 52 | .Sh DESCRIPTION |
53 | These type-generic macros implement memory operations that are | | 53 | These type-generic macros implement memory operations that are |
54 | .Em atomic | | 54 | .Em atomic |
55 | and that have | | 55 | and that have |
56 | .Em memory ordering constraints . | | 56 | .Em memory ordering constraints . |
57 | Aside from atomicity and ordering, the load operations are equivalent | | 57 | Aside from atomicity and ordering, the load operations are equivalent |
58 | to | | 58 | to |
59 | .Li * Ns Fa p | | 59 | .Li * Ns Fa p |
60 | and the store operations are equivalent to | | 60 | and the store operations are equivalent to |
61 | .Li * Ns Fa p Li "=" Fa v . | | 61 | .Li * Ns Fa p Li "=" Fa v . |
62 | The pointer | | 62 | The pointer |
63 | .Fa p | | 63 | .Fa p |
64 | must be aligned, even on architectures like x86 which generally lack | | 64 | must be aligned, even on architectures like x86 which generally lack |
65 | strict alignment requirements; see | | 65 | strict alignment requirements; see |
66 | .Sx SIZE AND ALIGNMENT | | 66 | .Sx SIZE AND ALIGNMENT |
67 | for details. | | 67 | for details. |
68 | .Pp | | 68 | .Pp |
69 | .Em Atomic | | 69 | .Em Atomic |
70 | means that the memory operations cannot be | | 70 | means that the memory operations cannot be |
71 | .Em fused | | 71 | .Em fused |
72 | or | | 72 | or |
73 | .Em torn : | | 73 | .Em torn : |
74 | .Bl -bullet | | 74 | .Bl -bullet |
75 | .It | | 75 | .It |
76 | .Em Fusing | | 76 | .Em Fusing |
77 | is combining multiple memory operations on a single object into one | | 77 | is combining multiple memory operations on a single object into one |
78 | memory operation, such as replacing | | 78 | memory operation, such as replacing |
79 | .Bd -literal -compact | | 79 | .Bd -literal -compact |
80 | *p = v; | | 80 | *p = v; |
81 | x = *p; | | 81 | x = *p; |
82 | .Ed | | 82 | .Ed |
83 | by | | 83 | by |
84 | .Bd -literal -compact | | 84 | .Bd -literal -compact |
85 | *p = v; | | 85 | *p = v; |
86 | x = v; | | 86 | x = v; |
87 | .Ed | | 87 | .Ed |
88 | since the compiler can prove that | | 88 | since the compiler can prove that |
89 | .Li \&*p | | 89 | .Li \&*p |
90 | will yield | | 90 | will yield |
91 | .Li v | | 91 | .Li v |
92 | after | | 92 | after |
93 | .Li \&*p\ =\ v . | | 93 | .Li \&*p\ =\ v . |
94 | For | | 94 | For |
95 | .Em atomic | | 95 | .Em atomic |
96 | memory operations, the implementation | | 96 | memory operations, the implementation |
97 | .Em will not | | 97 | .Em will not |
98 | assume that | | 98 | assume that |
99 | .Bl -dash -compact | | 99 | .Bl -dash -compact |
100 | .It | | 100 | .It |
101 | consecutive loads of the same object will return the same value, or | | 101 | consecutive loads of the same object will return the same value, or |
102 | .It | | 102 | .It |
103 | a store followed by a load of the same object will return the value | | 103 | a store followed by a load of the same object will return the value |
104 | stored, or | | 104 | stored, or |
105 | .It | | 105 | .It |
106 | consecutive stores of the same object are redundant. | | 106 | consecutive stores of the same object are redundant. |
107 | .El | | 107 | .El |
108 | Thus, the implementation will not replace two consecutive atomic loads | | 108 | Thus, the implementation will not replace two consecutive atomic loads |
109 | by one, will not elide an atomic load following a store, and will not | | 109 | by one, will not elide an atomic load following a store, and will not |
110 | combine two consecutive atomic stores into one. | | 110 | combine two consecutive atomic stores into one. |
111 | .Pp | | 111 | .Pp |
112 | For example, | | 112 | For example, |
113 | .Bd -literal | | 113 | .Bd -literal |
114 | atomic_store_relaxed(&flag, 1); | | 114 | atomic_store_relaxed(&flag, 1); |
115 | while (atomic_load_relaxed(&flag)) | | 115 | while (atomic_load_relaxed(&flag)) |
116 | continue; | | 116 | continue; |
117 | .Ed | | 117 | .Ed |
118 | .Pp | | 118 | .Pp |
119 | may be used to set a flag and then busy-wait until another thread | | 119 | may be used to set a flag and then busy-wait until another thread |
120 | clears it, whereas | | 120 | clears it, whereas |
121 | .Bd -literal | | 121 | .Bd -literal |
122 | flag = 1; | | 122 | flag = 1; |
123 | while (flag) | | 123 | while (flag) |
124 | continue; | | 124 | continue; |
125 | .Ed | | 125 | .Ed |
126 | .Pp | | 126 | .Pp |
127 | may be transformed into the infinite loop | | 127 | may be transformed into the infinite loop |
128 | .Bd -literal | | 128 | .Bd -literal |
129 | flag = 1; | | 129 | flag = 1; |
130 | while (1) | | 130 | while (1) |
131 | continue; | | 131 | continue; |
132 | .Ed | | 132 | .Ed |
133 | .It | | 133 | .It |
134 | .Em Tearing | | 134 | .Em Tearing |
135 | is implementing a memory operation on a large data unit such as a | | 135 | is implementing a memory operation on a large data unit such as a |
136 | 32-bit word by issuing multiple memory operations on smaller data units | | 136 | 32-bit word by issuing multiple memory operations on smaller data units |
137 | such as 8-bit bytes. | | 137 | such as 8-bit bytes. |
138 | The implementation will not tear | | 138 | The implementation will not tear |
139 | .Em atomic | | 139 | .Em atomic |
140 | loads or stores into smaller ones. | | 140 | loads or stores into smaller ones. |
141 | Thus, as far as any interrupt, other thread, or other CPU can tell, an | | 141 | Thus, as far as any interrupt, other thread, or other CPU can tell, an |
142 | atomic memory operation is issued either all at once or not at all. | | 142 | atomic memory operation is issued either all at once or not at all. |
143 | .Pp | | 143 | .Pp |
144 | For example, if a 32-bit word | | 144 | For example, if a 32-bit word |
145 | .Va w | | 145 | .Va w |
146 | is written with | | 146 | is written with |
147 | .Pp | | 147 | .Pp |
148 | .Dl atomic_store_relaxed(&w,\ 0x00010002); | | 148 | .Dl atomic_store_relaxed(&w,\ 0x00010002); |
149 | .Pp | | 149 | .Pp |
150 | then an interrupt, other thread, or other CPU reading it with | | 150 | then an interrupt, other thread, or other CPU reading it with |
151 | .Li atomic_load_relaxed(&w) | | 151 | .Li atomic_load_relaxed(&w) |
152 | will never witness it partially written, whereas | | 152 | will never witness it partially written, whereas |
153 | .Pp | | 153 | .Pp |
154 | .Dl w\ =\ 0x00010002; | | 154 | .Dl w\ =\ 0x00010002; |
155 | .Pp | | 155 | .Pp |
156 | might be compiled into a pair of separate 16-bit store instructions | | 156 | might be compiled into a pair of separate 16-bit store instructions |
157 | instead of one single word-sized store instruction, in which case other | | 157 | instead of one single word-sized store instruction, in which case other |
158 | threads may see the intermediate state with only one of the halves | | 158 | threads may see the intermediate state with only one of the halves |
159 | written. | | 159 | written. |
160 | .El | | 160 | .El |
161 | .Pp | | 161 | .Pp |
162 | Atomic operations on any single object occur in a total order shared by | | 162 | Atomic operations on any single object occur in a total order shared by |
163 | all interrupts, threads, and CPUs, which is consistent with the program | | 163 | all interrupts, threads, and CPUs, which is consistent with the program |
164 | order in every interrupt, thread, and CPU. | | 164 | order in every interrupt, thread, and CPU. |
165 | A single program without interruption or other threads or CPUs will | | 165 | A single program without interruption or other threads or CPUs will |
166 | always observe its own loads and stores in program order, but another | | 166 | always observe its own loads and stores in program order, but another |
167 | program in an interrupt handler, in another thread, or on another CPU | | 167 | program in an interrupt handler, in another thread, or on another CPU |
168 | may issue loads that return values as if the first program's stores | | 168 | may issue loads that return values as if the first program's stores |
169 | occurred out of program order, and vice versa. | | 169 | occurred out of program order, and vice versa. |
170 | Two different threads might each observe a third thread's memory | | 170 | Two different threads might each observe a third thread's memory |
171 | operations in different orders. | | 171 | operations in different orders. |
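.Pp
For example, consider two words
.Va x
and
.Va y ,
both initially zero (a sketch for illustration, not kernel code):
.Bd -literal
/* Thread A */
atomic_store_relaxed(&x, 1);
atomic_store_relaxed(&y, 1);

/* Thread B */
int y1 = atomic_load_relaxed(&y);
int x1 = atomic_load_relaxed(&x);
.Ed
.Pp
Thread B may observe
.Li y1\ ==\ 1
and
.Li x1\ ==\ 0 ,
as if thread A's stores had happened out of program order: relaxed
atomic operations rule out fusing and tearing, but not this kind of
reordering.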
.Pp
The
.Em memory ordering constraints
make limited guarantees of ordering relative to memory operations on
.Em other
objects as witnessed by interrupts, other threads, or other CPUs, and
have the following meanings:
.Bl -tag -width relaxed
.It relaxed
No ordering relative to memory operations on any other objects is
guaranteed.
Relaxed ordering is the default for ordinary non-atomic memory
operations like
.Li "*p"
and
.Li "*p = v" .
.Pp
Atomic operations with relaxed ordering are cheap: they are not
read/modify/write atomic operations, and they do not involve any kind
of inter-CPU ordering barriers.
.It acquire
This memory operation happens before all subsequent memory operations
in program order.
However, prior memory operations in program order may be reordered to
happen after this one.
For example, assuming no aliasing between the pointers, the
implementation is allowed to treat
.Bd -literal
int x = *p;
if (atomic_load_acquire(q)) {
	int y = *r;
	*s = x + y;
	return 1;
}
.Ed
.Pp
as if it were
.Bd -literal
if (atomic_load_acquire(q)) {
	int x = *p;
	int y = *r;
	*s = x + y;
	return 1;
}
.Ed
.Pp
but
.Em not
as if it were
.Bd -literal
int x = *p;
int y = *r;
*s = x + y;
if (atomic_load_acquire(q)) {
	return 1;
}
.Ed
.It consume
This memory operation happens before all memory operations on objects
at addresses that are computed from the value returned by this one.
Otherwise, no ordering relative to memory operations on other objects
is implied.
.Pp
For example, the implementation is allowed to treat
.Bd -literal
struct foo *foo0, *foo1;

struct foo *f0 = atomic_load_consume(&foo0);
struct foo *f1 = atomic_load_consume(&foo1);
int x = f0->x;
int y = f1->y;
.Ed
.Pp
as if it were
.Bd -literal
struct foo *foo0, *foo1;

struct foo *f1 = atomic_load_consume(&foo1);
struct foo *f0 = atomic_load_consume(&foo0);
int y = f1->y;
int x = f0->x;
.Ed
.Pp
but loading
.Li f0->x
is guaranteed to happen after loading
.Li foo0
even if the CPU had a cached value for the address that
.Li f0->x
happened to be at, and likewise for
.Li f1->y
and
.Li foo1 .
.Pp
.Fn atomic_load_consume
functions like
.Fn atomic_load_acquire
as long as the memory operations that must happen after it are limited
to addresses that depend on the value returned by it, but it is almost
always as cheap as
.Fn atomic_load_relaxed .
See
.Sx ACQUIRE OR CONSUME?
below for more details.
.It release
All prior memory operations in program order happen before this one.
However, subsequent memory operations in program order may be reordered
to happen before this one too.
For example, assuming no aliasing between the pointers, the
implementation is allowed to treat
.Bd -literal
int x = *p;
*q = x;
atomic_store_release(r, 0);
int y = *s;
return x + y;
.Ed
.Pp
as if it were
.Bd -literal
int y = *s;
int x = *p;
*q = x;
atomic_store_release(r, 0);
return x + y;
.Ed
.Pp
but
.Em not
as if it were
.Bd -literal
atomic_store_release(r, 0);
int x = *p;
int y = *s;
*q = x;
return x + y;
.Ed
.El
.Ss PAIRING ORDERED MEMORY OPERATIONS
In general, each
.Fn atomic_store_release
.Em must
be paired with either
.Fn atomic_load_acquire
or
.Fn atomic_load_consume
in order to have an effect \(em it is only when a release operation
synchronizes with an acquire or consume operation that any ordering is
guaranteed between memory operations
.Em before
the release operation and memory operations
.Em after
the acquire/consume operation.
.Pp
For example, to set up an entry in a table and then mark the entry
ready, you should:
.Bl -enum
.It
Perform memory operations to initialize the data.
.Bd -literal
tab[i].x = ...;
tab[i].y = ...;
.Ed
.It
Issue
.Fn atomic_store_release
to mark it ready.
.Bd -literal
atomic_store_release(&tab[i].ready, 1);
.Ed
.It
Possibly in another thread, issue
.Fn atomic_load_acquire
to ascertain whether it is ready.
.Bd -literal
if (atomic_load_acquire(&tab[i].ready) == 0)
	return EWOULDBLOCK;
.Ed
.It
Perform memory operations to use the data.
.Bd -literal
do_stuff(tab[i].x, tab[i].y);
.Ed
.El
.Pp
Similarly, if you want to create an object, initialize it, and then
publish it to be used by another thread, then you should:
.Bl -enum
.It
Perform memory operations to initialize the object.
.Bd -literal
struct mumble *m = kmem_alloc(sizeof(*m), KM_SLEEP);
m->x = x;
m->y = y;
m->z = m->x + m->y;
.Ed
.It
Issue
.Fn atomic_store_release
to publish it.
.Bd -literal
atomic_store_release(&the_mumble, m);
.Ed
.It
Possibly in another thread, issue
.Fn atomic_load_consume
to get it.
.Bd -literal
struct mumble *m = atomic_load_consume(&the_mumble);
.Ed
.It
Perform memory operations to use the object's members.
.Bd -literal
m->y &= m->x;
do_things(m->x, m->y, m->z);
.Ed
.El
.Pp
In both examples, assuming that the value written by
.Fn atomic_store_release
in step\~2
is read by
.Fn atomic_load_acquire
or
.Fn atomic_load_consume
in step\~3, this guarantees that all of the memory operations in
step\~1 complete before any of the memory operations in step\~4 \(em
even if they happen on different CPUs.
.Pp
Without
.Em both
the release operation in step\~2
.Em and
the acquire or consume operation in step\~3, no ordering is guaranteed
between the memory operations in steps\~1 and\~4.
In fact, without
.Em both
release and acquire/consume, even the assignment
.Li m->z\ =\ m->x\ +\ m->y
in step\~1 might read values of
.Li m->x
and
.Li m->y
that were written in step\~4.
.Ss ACQUIRE OR CONSUME?
You must use
.Fn atomic_load_acquire
when subsequent memory operations in program order that must happen
after the load are on objects at
.Em addresses that might not depend arithmetically on the resulting value .
This applies particularly when the choice of whether to do the
subsequent memory operation depends on a
.Em control-flow decision based on the resulting value :
.Bd -literal
struct gadget {
	int ready, x;
} the_gadget;

/* Producer */
the_gadget.x = 42;
atomic_store_release(&the_gadget.ready, 1);

/* Consumer */
if (atomic_load_acquire(&the_gadget.ready) == 0)
	return EWOULDBLOCK;
int x = the_gadget.x;
.Ed
.Pp
Here the
.Em decision of whether to load
.Li the_gadget.x
depends on a control-flow decision based on the value loaded from
.Li the_gadget.ready ,
and loading
.Li the_gadget.x
must happen after loading
.Li the_gadget.ready .
Using
.Fn atomic_load_acquire
guarantees that the compiler and CPU do not conspire to load
.Li the_gadget.x
before we have ascertained that it is ready.
.Pp
You may use
.Fn atomic_load_consume
if all subsequent memory operations in program order that must happen
after the load are performed on objects at
.Em addresses computed arithmetically from the resulting value ,
such as loading a pointer to a structure object and then dereferencing
it:
.Bd -literal
struct gizmo {
	int x, y, z;
};
struct gizmo null_gizmo;
struct gizmo *the_gizmo = &null_gizmo;

/* Producer */
struct gizmo *g = kmem_alloc(sizeof(*g), KM_SLEEP);
g->x = 12;
g->y = 34;
g->z = 56;
atomic_store_release(&the_gizmo, g);

/* Consumer */
struct gizmo *g = atomic_load_consume(&the_gizmo);
int y = g->y;
.Ed
.Pp
Here the
.Em address
of
.Li g->y
depends on the value of the pointer loaded from
.Li the_gizmo .
Using
.Fn atomic_load_consume
guarantees that we do not witness a stale cache for that address.
.Pp
In some cases it may be unclear.
For example:
.Bd -literal
int x[2];
bool b;

/* Producer */
x[0] = 42;
atomic_store_release(&b, 0);

/* Consumer 1 */
int y = atomic_load_???(&b) ? x[0] : x[1];

/* Consumer 2 */
int y = x[atomic_load_???(&b) ? 0 : 1];

/* Consumer 3 */
int y = x[atomic_load_???(&b) ^ 1];
.Ed
.Pp
Although the three consumers seem to be equivalent, by the letter of
C11 consumers\~1 and\~2 require
.Fn atomic_load_acquire
because the value determines the address of a subsequent load only via
control-flow decisions in the
.Li ?:
operator, whereas consumer\~3 can use
.Fn atomic_load_consume .
However, if you're not sure, you should err on the side of
.Fn atomic_load_acquire
until C11 implementations have ironed out the kinks in the semantics.
.Pp
On all CPUs other than DEC Alpha,
.Fn atomic_load_consume
is cheap \(em it is identical to
.Fn atomic_load_relaxed .
In contrast,
.Fn atomic_load_acquire
usually implies an expensive memory barrier.
.Ss SIZE AND ALIGNMENT
The pointer
.Fa p
must be aligned \(em that is, if the object it points to is
.\"
2\c
.ie t \s-2\v'-0.4m'n\v'+0.4m'\s+2
.el ^n
.\"
bytes long, then the low-order
.Ar n
bits of
.Fa p
must be zero.
.Pp
All
.Nx
ports support atomic loads and stores on units of data up to 32 bits.
Some ports additionally support atomic loads and stores on larger
quantities, like 64-bit quantities, if
.Dv __HAVE_ATOMIC64_LOADSTORE
is defined.
The macros are not allowed on larger quantities of data than the port
supports atomically; attempts to use them for such quantities should
result in a compile-time assertion failure.
.Pp
For example, as long as you use
.Fn atomic_store_*
to write a 32-bit quantity, you can safely use
.Fn atomic_load_relaxed
to optimistically read it outside a lock, but for a 64-bit quantity it
must be conditional on
.Dv __HAVE_ATOMIC64_LOADSTORE
\(em otherwise it will lead to compile-time errors on platforms without
64-bit atomic loads and stores:
.Bd -literal
struct foo {
	kmutex_t	f_lock;
	uint32_t	f_refcnt;
	uint64_t	f_ticket;
};

if (atomic_load_relaxed(&foo->f_refcnt) == 0)
	return 123;
#ifdef __HAVE_ATOMIC64_LOADSTORE
if (atomic_load_relaxed(&foo->f_ticket) == ticket)
	return 123;
#endif
mutex_enter(&foo->f_lock);
if (foo->f_refcnt == 0 || foo->f_ticket == ticket)
	ret = 123;
...
#ifdef __HAVE_ATOMIC64_LOADSTORE
atomic_store_relaxed(&foo->f_ticket, foo->f_ticket + 1);
#else
foo->f_ticket++;
#endif
...
mutex_exit(&foo->f_lock);
.Ed
.Sh C11 COMPATIBILITY
These macros are meant to follow
.Tn C11
semantics, in terms of
.Li atomic_load_explicit()
and
.Li atomic_store_explicit()
with the appropriate memory order specifiers, and are meant to make
future adoption of the
.Tn C11
atomic API easier.
Eventually it may be mandatory to use the
.Tn C11
.Vt _Atomic
type qualifier or equivalent for the operands.
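.Pp
For illustration, the intended correspondence, sketched in terms of the
standard
.In stdatomic.h
interface rather than the
.In sys/atomic.h
macros themselves:
.Bd -literal
_Atomic int ready;

/* atomic_load_acquire(&ready), in C11 terms: */
int r = atomic_load_explicit(&ready, memory_order_acquire);

/* atomic_store_release(&ready, 1), in C11 terms: */
atomic_store_explicit(&ready, 1, memory_order_release);

/* Likewise, the relaxed operations correspond to
   memory_order_relaxed, and atomic_load_consume to
   memory_order_consume. */
.Ed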
.Sh LINUX ANALOGUES
The Linux kernel provides two macros
.Li READ_ONCE(x)
and
.Li WRITE_ONCE(x,\ v)
which are similar to
.Li atomic_load_consume(&x)
and
.Li atomic_store_relaxed(&x,\ v) ,
respectively.
However, while Linux's
.Li READ_ONCE
and
.Li WRITE_ONCE
prevent fusing, they may in some cases be torn \(em and therefore fail
to guarantee atomicity \(em because:
.Bl -bullet
.It
They do not require the address
.Li "&x"
to be aligned.
.It
They do not require
.Li sizeof(x)
to be at most the largest size of available atomic loads and stores on
the host architecture.
.El
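.Pp
As a rough sketch of the correspondence, assuming a suitably aligned
word-sized object:
.Bd -literal
/* Linux */			/* NetBSD */
p = READ_ONCE(the_ptr);		p = atomic_load_consume(&the_ptr);
WRITE_ONCE(counter, v);		atomic_store_relaxed(&counter, v);
.Ed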
.Sh MEMORY BARRIERS AND ATOMIC READ/MODIFY/WRITE
The atomic read/modify/write operations in
.Xr atomic_ops 3
have relaxed ordering by default, but can be combined with the memory
barriers in
.Xr membar_ops 3
for the same effect as an acquire operation and a release operation for
the purposes of pairing with
.Fn atomic_store_release
and
.Fn atomic_load_acquire
or
.Fn atomic_load_consume .
If
.Li atomic_r/m/w()
is an atomic read/modify/write operation in
.Xr atomic_ops 3 ,
then
.Bd -literal
membar_exit();
atomic_r/m/w(obj, ...);
.Ed
.Pp
functions like a release operation on
.Va obj ,
and
.Bd -literal
atomic_r/m/w(obj, ...);
membar_enter();
.Ed
.Pp
functions like an acquire operation on
.Va obj .
.Pp
.Sy WARNING :
The combination of
.Fn atomic_load_relaxed
and
.Xr membar_enter 3
.Em does not
make an acquire operation; only read/modify/write atomics may be
combined with
.Xr membar_enter 3
this way.
.Pp
On architectures where
.Dv __HAVE_ATOMIC_AS_MEMBAR
is defined, all the
.Xr atomic_ops 3
imply release and acquire operations, so the
.Xr membar_enter 3
and
.Xr membar_exit 3
are redundant.
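.Pp
For example, a reference count might be dropped with a release-style
barrier before the decrement and an acquire-style barrier before
freeing, omitting both where the atomics already imply them.
This is a sketch, with
.Li obj ,
.Li refcnt ,
and
.Li free_obj()
as illustrative names:
.Bd -literal
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_exit();
#endif
if (atomic_dec_uint_nv(&obj->refcnt) == 0) {
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_enter();
#endif
	free_obj(obj);
}
.Ed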
.Sh EXAMPLES
Maintaining lossy counters.
These may lose some counts, because the read/modify/write cycle as a
whole is not atomic.
But this guarantees that the count will increase by at most one each
time.
In contrast, without atomic operations, in principle a write to a
32-bit counter might be torn into multiple smaller stores, which could
appear to happen out of order from another CPU's perspective, leading
to nonsensical counter readouts.
(For frequent events, consider using per-CPU counters instead in
practice.)
.Bd -literal
unsigned count;

void
record_event(void)
{
	atomic_store_relaxed(&count,
	    1 + atomic_load_relaxed(&count));
}

unsigned
read_event_count(void)
{
	return atomic_load_relaxed(&count);
}
.Ed
.Pp
Initialization barrier.
.Bd -literal
int ready;
struct data d;

void
setup_and_notify(void)
{
	setup_data(&d.things);
	atomic_store_release(&ready, 1);
}

void
try_if_ready(void)
{
	if (atomic_load_acquire(&ready))
		do_stuff(d.things);
}
.Ed
.Pp
Publishing a pointer to the current snapshot of data.
(Caller must arrange that only one call to
.Li take_snapshot()
happens at any given time; generally this should be done in
coordination with
.Xr pserialize 9
or similar to enable resource reclamation.)
.Bd -literal
struct data *current_d;

void
take_snapshot(void)
{
	struct data *d = kmem_alloc(sizeof(*d), KM_SLEEP);

	d->things = ...;

	atomic_store_release(&current_d, d);
}

struct data *
get_snapshot(void)
{
	return atomic_load_consume(&current_d);
}
.Ed
.Sh CODE REFERENCES
.Pa sys/sys/atomic.h
.Sh SEE ALSO
.Xr atomic_ops 3 ,
.Xr membar_ops 3 ,
.Xr pserialize 9
.Sh HISTORY
These atomic operations first appeared in
.Nx 9.0 .
.Sh CAVEATS
C11 formally specifies that all subexpressions, except the left
operands of the
.Ql && ,
.Ql || ,
.Ql ?: ,
and
.Ql \&,
operators and the
.Li kill_dependency()
macro, carry dependencies for which
.Dv memory_order_consume
guarantees ordering, but most or all implementations to date simply
treat
.Dv memory_order_consume
as
.Dv memory_order_acquire
and do not take advantage of data dependencies to elide costly memory
barriers or load-acquire CPU instructions.
.Pp
Instead, we implement
.Fn atomic_load_consume
as
.Fn atomic_load_relaxed
followed by
.Xr membar_datadep_consumer 3 ,
which is equivalent to
.Xr membar_consumer 3
on DEC Alpha and
.Xr __insn_barrier 3
elsewhere.
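.Pp
In effect, a sketch of what
.Fn atomic_load_consume
does (the actual definition in
.Pa sys/sys/atomic.h
may differ in detail):
.Bd -literal
/* atomic_load_consume(p), in effect: */
v = atomic_load_relaxed(p);
membar_datadep_consumer();	/* real barrier only on DEC Alpha */
/* ... memory operations at addresses computed from v ... */
.Ed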
.Sh BUGS
Some idiot decided to call it
.Em tearing ,
depriving us of the opportunity to say that atomic operations prevent
fusion and
.Em fission .