Sun Jul 30 12:09:51 2023 UTC
Pull up following revision(s) (requested by riastradh in ticket #262):

	sys/kern/kern_descrip.c: revision 1.252
	sys/kern/kern_descrip.c: revision 1.253
	sys/kern/kern_descrip.c: revision 1.254

kern_descrip.c: Fix membars around reference count decrement.

In general, the `last one out hit the lights' style of reference
counting (as opposed to the `whoever's destroying must wait for
pending users to finish' style) requires memory barriers like so:
        ... usage of resources associated with object ...
        membar_release();
        if (atomic_dec_uint_nv(&obj->refcnt) != 0)
                return;
        membar_acquire();
        ... freeing of resources associated with object ...

This way, all usage happens-before all freeing.  This fixes several
errors:
- fd_close failed to ensure whatever its caller did would
  happen-before the freeing, in the case where another thread is
  concurrently trying to close the fd (ff->ff_file == NULL).
  Fix: Add membar_release before atomic_dec_uint(&ff->ff_refcnt) in
  that branch.
- fd_close failed to ensure all loads its caller had issued will have
  happened-before the freeing, in the case where the fd is still in
  use by another thread (fdp->fd_refcnt > 1 and ff->ff_refcnt-- > 0).
  Fix: Change membar_producer to membar_release before
  atomic_dec_uint(&ff->ff_refcnt).
- fd_close failed to ensure that any usage of fp by other callers
  would happen-before any freeing it does.
  Fix: Add membar_acquire after atomic_dec_uint_nv(&ff->ff_refcnt).
- fd_free failed to ensure that any usage of fdp by other callers
  would happen-before any freeing it does.
  Fix: Add membar_acquire after atomic_dec_uint_nv(&fdp->fd_refcnt).
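Concretely, applied to the multi-threaded drop paths in fd_close and
fd_free, the corrected pattern looks like this (an illustrative
sketch only, not the verbatim kernel code; obj_destroy() is a
hypothetical stand-in for the actual freeing, and the #ifndef guards
match the diff below, which skips the membars on architectures whose
atomics already imply them):

	static void
	obj_put(struct obj *obj)
	{
		/* ... usage of *obj by this thread ends here ... */
	#ifndef __HAVE_ATOMIC_AS_MEMBAR
		membar_release();	/* usage happens-before decrement */
	#endif
		if (atomic_dec_uint_nv(&obj->refcnt) != 0)
			return;		/* other holders remain */
	#ifndef __HAVE_ATOMIC_AS_MEMBAR
		membar_acquire();	/* decrement happens-before freeing */
	#endif
		obj_destroy(obj);	/* hypothetical: free the resources */
	}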

While here, change membar_exit -> membar_release.  No semantic
change, just updating away from the legacy API.

kern_descrip.c: Use atomic_store_relaxed/release for ff->ff_file.
1. atomic_store_relaxed in fd_close avoids the appearance of a race
   in sanitizers (minor bug).
2. atomic_store_release in fd_affix is necessary because the lock
   activity was not, in fact, enough to guarantee ordering (real bug
   on some architectures like aarch64).
   The premise appears to have been that the mutex_enter/exit earlier
   in fd_affix is enough to guarantee that initialization of fp (A)
   happens before use of fp by a user once fp is published (B):
        fp->f_... = ...;                // A
        /* fd_affix */
        mutex_enter(&fp->f_lock);
        fp->f_count++;
        mutex_exit(&fp->f_lock);
        ...
        ff->ff_file = fp;               // B
   But actually mutex_enter/exit allow the following reordering by
   the CPU:
        mutex_enter(&fp->f_lock);
        ff->ff_file = fp;               // B
        fp->f_count++;
        fp->f_... = ...;                // A
        mutex_exit(&fp->f_lock);
   The only constraints they imply are:
        1. fp->f_count++ and B cannot precede mutex_enter
        2. mutex_exit cannot precede A and fp->f_count++
   They imply no constraint on the relative ordering of A, B, and
   fp->f_count++ amongst each other, however.
   This affects any architecture whose mutex_enter/exit are built
   from native load-acquire or store-release operations, like
   aarch64, rather than from explicit load-before-load/store and
   load/store-before-store barriers.

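A minimal sketch of the corrected pairing (field names and the
surrounding code are simplified for illustration; atomic_store_release
and atomic_load_consume are the sys/atomic.h primitives used in the
diff below):

	/* fd_affix (producer): initialize fp, then publish with release */
	fp->f_... = ...;			/* A: initialization */
	atomic_store_release(&ff->ff_file, fp);	/* B: publication */

	/* fd_getfile (consumer): the consume load pairs with the release */
	fp = atomic_load_consume(&ff->ff_file);
	if (fp != NULL) {
		/* any use of fp here is guaranteed to observe A */
	}

The release store keeps A ordered before B, and the consume load on
the other side keeps B ordered before any dependent use of fp, so a
consumer can never observe a half-initialized file.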
No need for atomic_store_* in fd_copy or fd_free because we have
exclusive access to ff as is.

kern_descrip.c: Change membar_enter to membar_acquire in fd_getfile.
membar_acquire is cheaper on many CPUs, and unlikely to be costlier
on any CPUs, than the legacy membar_enter.
Add a long comment explaining the interaction between fd_getfile and
fd_close and why membar_acquire is safe.
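Condensed, the pairing that comment describes looks like this (a
sketch, with the __HAVE_ATOMIC_AS_MEMBAR conditionals elided):

	/* fd_close */
	atomic_store_relaxed(&ff->ff_file, NULL);	/* A */
	membar_release();
	refcnt = atomic_dec_uint_nv(&ff->ff_refcnt);	/* B */

	/* fd_getfile */
	atomic_inc_uint(&ff->ff_refcnt);		/* C */
	membar_acquire();
	fp = atomic_load_consume(&ff->ff_file);		/* D */

Since B and C are atomic operations on the same word, either B happens
before C (then A happens before D, so fd_getfile reads NULL and fails
safely) or C happens before B (then fd_close sees a nonzero count and
safely waits for the reference to drain).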


(martin)
cvs diff -r1.251 -r1.251.10.1 src/sys/kern/kern_descrip.c

--- src/sys/kern/kern_descrip.c 2021/06/29 22:40:53 1.251
+++ src/sys/kern/kern_descrip.c 2023/07/30 12:09:51 1.251.10.1
@@ -1,14 +1,14 @@
-/*	$NetBSD: kern_descrip.c,v 1.251 2021/06/29 22:40:53 dholland Exp $	*/
+/*	$NetBSD: kern_descrip.c,v 1.251.10.1 2023/07/30 12:09:51 martin Exp $	*/
 
 /*-
  * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Andrew Doran.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
@@ -60,27 +60,27 @@
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_descrip.c	8.8 (Berkeley) 2/14/95
  */
 
 /*
  * File descriptor management.
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.251 2021/06/29 22:40:53 dholland Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.251.10.1 2023/07/30 12:09:51 martin Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/file.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/ioctl.h>
 #include <sys/fcntl.h>
 #include <sys/pool.h>
@@ -382,30 +382,65 @@ fd_getfile(unsigned fd)
 	/* Now get a reference to the descriptor. */
 	if (fdp->fd_refcnt == 1) {
 		/*
 		 * Single threaded: don't need to worry about concurrent
 		 * access (other than earlier calls to kqueue, which may
 		 * hold a reference to the descriptor).
 		 */
 		ff->ff_refcnt++;
 	} else {
 		/*
 		 * Multi threaded: issue a memory barrier to ensure that we
 		 * acquire the file pointer _after_ adding a reference.  If
 		 * no memory barrier, we could fetch a stale pointer.
+		 *
+		 * In particular, we must coordinate the following four
+		 * memory operations:
+		 *
+		 *	A. fd_close store ff->ff_file = NULL
+		 *	B. fd_close refcnt = atomic_dec_uint_nv(&ff->ff_refcnt)
+		 *	C. fd_getfile atomic_inc_uint(&ff->ff_refcnt)
+		 *	D. fd_getfile load fp = ff->ff_file
+		 *
+		 * If the order is D;A;B;C:
+		 *
+		 *	1. D: fp = ff->ff_file
+		 *	2. A: ff->ff_file = NULL
+		 *	3. B: refcnt = atomic_dec_uint_nv(&ff->ff_refcnt)
+		 *	4. C: atomic_inc_uint(&ff->ff_refcnt)
+		 *
+		 * then fd_close determines that there are no more
+		 * references and decides to free fp immediately, at
+		 * the same time that fd_getfile ends up with an fp
+		 * that's about to be freed.  *boom*
+		 *
+		 * By making B a release operation in fd_close, and by
+		 * making C an acquire operation in fd_getfile, since
+		 * they are atomic operations on the same object, which
+		 * has a total modification order, we guarantee either:
+		 *
+		 *	- B happens before C.  Then since A is
+		 *	  sequenced before B in fd_close, and C is
+		 *	  sequenced before D in fd_getfile, we
+		 *	  guarantee A happens before D, so fd_getfile
+		 *	  reads a null fp and safely fails.
+		 *
+		 *	- C happens before B.  Then fd_getfile may read
+		 *	  null or nonnull, but either way, fd_close
+		 *	  will safely wait for references to drain.
 		 */
 		atomic_inc_uint(&ff->ff_refcnt);
 #ifndef __HAVE_ATOMIC_AS_MEMBAR
-		membar_enter();
+		membar_acquire();
 #endif
 	}
 
 	/*
 	 * If the file is not open or is being closed then put the
 	 * reference back.
 	 */
 	fp = atomic_load_consume(&ff->ff_file);
 	if (__predict_true(fp != NULL)) {
 		return fp;
 	}
 	fd_putfile(fd);
 	return NULL;
@@ -441,27 +476,27 @@ fd_putfile(unsigned fd)
 		}
 		ff->ff_refcnt--;
 		return;
 	}
 
 	/*
 	 * Ensure that any use of the file is complete and globally
 	 * visible before dropping the final reference.  If no membar,
 	 * the current CPU could still access memory associated with
 	 * the file after it has been freed or recycled by another
 	 * CPU.
 	 */
 #ifndef __HAVE_ATOMIC_AS_MEMBAR
-	membar_exit();
+	membar_release();
 #endif
 
 	/*
 	 * Be optimistic and start out with the assumption that no other
 	 * threads are trying to close the descriptor.  If the CAS fails,
 	 * we lost a race and/or it's being closed.
 	 */
 	for (u = ff->ff_refcnt & FR_MASK;; u = v) {
 		v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
 		if (__predict_true(u == v)) {
 			return;
 		}
 		if (__predict_false((v & FR_CLOSING) != 0)) {
@@ -592,64 +627,70 @@ fd_close(unsigned fd)
 	ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
 
 	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
 
 	mutex_enter(&fdp->fd_lock);
 	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
 	fp = atomic_load_consume(&ff->ff_file);
 	if (__predict_false(fp == NULL)) {
 		/*
 		 * Another user of the file is already closing, and is
 		 * waiting for other users of the file to drain.  Release
 		 * our reference, and wake up the closer.
 		 */
+#ifndef __HAVE_ATOMIC_AS_MEMBAR
+		membar_release();
+#endif
 		atomic_dec_uint(&ff->ff_refcnt);
 		cv_broadcast(&ff->ff_closing);
 		mutex_exit(&fdp->fd_lock);
 
 		/*
 		 * An application error, so pretend that the descriptor
 		 * was already closed.  We can't safely wait for it to
 		 * be closed without potentially deadlocking.
 		 */
 		return (EBADF);
 	}
 	KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
 
 	/*
 	 * There may be multiple users of this file within the process.
 	 * Notify existing and new users that the file is closing.  This
 	 * will prevent them from adding additional uses to this file
 	 * while we are closing it.
 	 */
-	ff->ff_file = NULL;
+	atomic_store_relaxed(&ff->ff_file, NULL);
 	ff->ff_exclose = false;
 
 	/*
 	 * We expect the caller to hold a descriptor reference - drop it.
 	 * The reference count may increase beyond zero at this point due
 	 * to an erroneous descriptor reference by an application, but
 	 * fd_getfile() will notice that the file is being closed and drop
 	 * the reference again.
 	 */
 	if (fdp->fd_refcnt == 1) {
 		/* Single threaded. */
 		refcnt = --(ff->ff_refcnt);
 	} else {
 		/* Multi threaded. */
 #ifndef __HAVE_ATOMIC_AS_MEMBAR
-		membar_producer();
+		membar_release();
 #endif
 		refcnt = atomic_dec_uint_nv(&ff->ff_refcnt);
+#ifndef __HAVE_ATOMIC_AS_MEMBAR
+		membar_acquire();
+#endif
 	}
 	if (__predict_false(refcnt != 0)) {
 		/*
 		 * Wait for other references to drain.  This is typically
 		 * an application error - the descriptor is being closed
 		 * while still in use.
 		 * (Or just a threaded application trying to unblock its
 		 * thread that sleeps in (say) accept()).
 		 */
 		atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
 
 		/*
 		 * Remove any knotes attached to the file.  A knote
@@ -1136,45 +1177,39 @@ fd_affix(proc_t *p, file_t *fp, unsigned
 	fdfile_t *ff;
 	filedesc_t *fdp;
 	fdtab_t *dt;
 
 	KASSERT(p == curproc || p == &proc0);
 
 	/* Add a reference to the file structure. */
 	mutex_enter(&fp->f_lock);
 	fp->f_count++;
 	mutex_exit(&fp->f_lock);
 
 	/*
 	 * Insert the new file into the descriptor slot.
-	 *
-	 * The memory barriers provided by lock activity in this routine
-	 * ensure that any updates to the file structure become globally
-	 * visible before the file becomes visible to other LWPs in the
-	 * current process; otherwise we would set ff->ff_file with
-	 * atomic_store_release(&ff->ff_file, fp) at the bottom.
 	 */
 	fdp = p->p_fd;
 	dt = atomic_load_consume(&fdp->fd_dt);
 	ff = dt->dt_ff[fd];
 
 	KASSERT(ff != NULL);
 	KASSERT(ff->ff_file == NULL);
 	KASSERT(ff->ff_allocated);
 	KASSERT(fd_isused(fdp, fd));
 	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
 
 	/* No need to lock in order to make file initially visible. */
-	ff->ff_file = fp;
+	atomic_store_release(&ff->ff_file, fp);
 }
 
 /*
  * Abort creation of a new descriptor: free descriptor slot and file.
  */
 void
 fd_abort(proc_t *p, file_t *fp, unsigned fd)
 {
 	filedesc_t *fdp;
 	fdfile_t *ff;
 
 	KASSERT(p == curproc || p == &proc0);
 
@@ -1522,30 +1557,33 @@ fd_free(void)
 	file_t *fp;
 	int fd, nf;
 	fdtab_t *dt;
 	lwp_t * const l = curlwp;
 	filedesc_t * const fdp = l->l_fd;
 	const bool noadvlock = (l->l_proc->p_flag & PK_ADVLOCK) == 0;
 
 	KASSERT(atomic_load_consume(&fdp->fd_dt)->dt_ff[0] ==
 	    (fdfile_t *)fdp->fd_dfdfile[0]);
 	KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
 	KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
 
 #ifndef __HAVE_ATOMIC_AS_MEMBAR
-	membar_exit();
+	membar_release();
 #endif
 	if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
 		return;
+#ifndef __HAVE_ATOMIC_AS_MEMBAR
+	membar_acquire();
+#endif
 
 	/*
 	 * Close any files that the process holds open.
 	 */
 	dt = fdp->fd_dt;
 	fd_checkmaps(fdp);
 #ifdef DEBUG
 	fdp->fd_refcnt = -1; /* see fd_checkmaps */
 #endif
 	for (fd = 0, nf = dt->dt_nfiles; fd < nf; fd++) {
 		ff = dt->dt_ff[fd];
 		KASSERT(fd >= NDFDFILE ||
 		    ff == (fdfile_t *)fdp->fd_dfdfile[fd]);