Tue Mar 7 19:56:45 2023 UTC ()
Pull up following revision(s) (requested by riastradh in ticket #115):

	sys/kern/vfs_syscalls.c: revision 1.557

open(2): Don't map ERESTART to EINTR.

If a file or device's open function returns ERESTART, respect that --
restart the syscall; don't pretend a signal has been delivered when
it was not.  If an SA_RESTART signal was delivered, POSIX does not
allow it to fail with EINTR:

    SA_RESTART
        This flag affects the behavior of interruptible functions;
        that is, those specified to fail with errno set to [EINTR].
        If set, and a function specified as interruptible is
        interrupted by this signal, the function shall restart and
        shall not fail with [EINTR] unless otherwise specified.  If
        an interruptible function which uses a timeout is restarted,
        the duration of the timeout following the restart is set to
        an unspecified value that does not exceed the original
        timeout value.  If the flag is not set, interruptible
        functions interrupted by this signal shall fail with errno
        set to [EINTR].

https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigaction.html

Nothing in the POSIX definition of open specifies otherwise.

In 1990, Kirk McKusick added these lines with a mysterious commit
message:
Author: Kirk McKusick <mckusick>
Date:   Tue Apr 10 19:36:33 1990 -0800
    eliminate longjmp from the kernel (for karels)
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 7bc7b39bbf..d572d3a32d 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -14,7 +14,7 @@
  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  *
- *     @(#)vfs_syscalls.c      7.42 (Berkeley) 3/26/90
+ *     @(#)vfs_syscalls.c      7.43 (Berkeley) 4/10/90
  */
 #include "param.h"
@@ -530,8 +530,10 @@ copen(scp, fmode, cmode, ndp, resultfd)
        if (error = vn_open(ndp, fmode, (cmode & 07777) &~ S_ISVTX)) {
                crfree(fp->f_cred);
                fp->f_count--;
-               if (error == -1)        /* XXX from fdopen */
-                       return (0);     /* XXX from fdopen */
+               if (error == EJUSTRETURN)       /* XXX from fdopen */
+                       return (0);             /* XXX from fdopen */
+               if (error == ERESTART)
+                       error = EINTR;
                scp->sc_ofile[indx] = NULL;
                return (error);
        }

(found via this git import of the CSRG history:
https://github.com/robohack/ucb-csrg-bsd/commit/cce2869b7ae5d360921eb411005b328a29c4a3fe

This change appears to have served two related purposes:
1. The fdopen function (the erstwhile open routine for /dev/fd/N)
   used to return -1 as a hack to mean it had just duplicated the fd;
   it was recently changed by Mike Karels, in kern_descrip.c 7.9, to
   return EJUSTRETURN, now defined to be -2, presumably to avoid a
   conflict with ERESTART, defined to be -1.  So this change finished
   part of the change by Mike Karels to use a different magic return
   code from fdopen.
   Of course, today we use still another disgusting hack, EDUPFD, for
   the same purpose, so none of this is relevant any more.
2. Prior to April 1990, the kernel handled signals during tsleep(9)
   by longjmping out to the system call entry point or similar.  In
   April 1990, Mike Karels worked to convert all of that into
   explicit unwind logic by passing through EINTR or ERESTART as
   appropriate, instead of setjmp at each entry point.

However, it's not clear to me why this setjmp/longjmp and
fdopen/-1/EJUSTRETURN renovation justifies unconditional logic to map
ERESTART to EINTR in open(2).  I suspect it was a mistake.

In 2013, the corresponding logic to map ERESTART to EINTR in open(2)
was removed from FreeBSD:

   r246472 | kib | 2013-02-07 14:53:33 +0000 (Thu, 07 Feb 2013) | 11 lines
   Stop translating the ERESTART error from the open(2) into EINTR.
   Posix requires that open(2) is restartable for SA_RESTART.
   For non-posix objects, in particular, devfs nodes, still disable
   automatic restart of the opens. The open call to a driver could have
   significant side effects for the hardware.
   Noted and reviewed by:  jilles
   Discussed with: bde
   MFC after:      2 weeks

Index: vfs_syscalls.c
===================================================================
--- vfs_syscalls.c      (revision 246471)
+++ vfs_syscalls.c      (revision 246472)
@@ -1106,8 +1106,6 @@
                                goto success;
                }
-               if (error == ERESTART)
-                       error = EINTR;
                goto bad;
        }
        td->td_dupfd = 0;

https://cgit.freebsd.org/src/commit/sys/kern/vfs_syscalls.c?id=2ca49983425886121b506cb5126b60a705afc38c">https://cgit.freebsd.org/src/commit/sys/kern/vfs_syscalls.c?id=2ca49983425886121b506cb5126b60a705afc38c

It's not clear to me that there's any reason to treat device nodes
specially here; in fact, if a driver's .d_open routine sleeps and is
woken by a concurrent revoke without a signal pending or with an
SA_RESTART signal pending, it is wrong for it to fail with EINTR.

But it MUST restart the whole system call rather than continue
sleeping in a loop or just exit the loop and continue to open,
because it is mandatory in the security model of revoke for open(2)
to retry the permissions check at that point.

PR kern/57260


(martin)
diff -r1.556 -r1.556.2.1 src/sys/kern/vfs_syscalls.c

cvs diff -r1.556 -r1.556.2.1 src/sys/kern/vfs_syscalls.c (expand / switch to unified diff)

--- src/sys/kern/vfs_syscalls.c 2022/11/02 20:38:22 1.556
+++ src/sys/kern/vfs_syscalls.c 2023/03/07 19:56:45 1.556.2.1
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: vfs_syscalls.c,v 1.556 2022/11/02 20:38:22 andvar Exp $ */ 1/* $NetBSD: vfs_syscalls.c,v 1.556.2.1 2023/03/07 19:56:45 martin Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran. 8 * by Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
@@ -60,27 +60,27 @@ @@ -60,27 +60,27 @@
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE. 63 * SUCH DAMAGE.
64 * 64 *
65 * @(#)vfs_syscalls.c 8.42 (Berkeley) 7/31/95 65 * @(#)vfs_syscalls.c 8.42 (Berkeley) 7/31/95
66 */ 66 */
67 67
68/* 68/*
69 * Virtual File System System Calls 69 * Virtual File System System Calls
70 */ 70 */
71 71
72#include <sys/cdefs.h> 72#include <sys/cdefs.h>
73__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.556 2022/11/02 20:38:22 andvar Exp $"); 73__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.556.2.1 2023/03/07 19:56:45 martin Exp $");
74 74
75#ifdef _KERNEL_OPT 75#ifdef _KERNEL_OPT
76#include "opt_fileassoc.h" 76#include "opt_fileassoc.h"
77#include "veriexec.h" 77#include "veriexec.h"
78#endif 78#endif
79 79
80#include <sys/param.h> 80#include <sys/param.h>
81#include <sys/systm.h> 81#include <sys/systm.h>
82#include <sys/namei.h> 82#include <sys/namei.h>
83#include <sys/filedesc.h> 83#include <sys/filedesc.h>
84#include <sys/kernel.h> 84#include <sys/kernel.h>
85#include <sys/file.h> 85#include <sys/file.h>
86#include <sys/fcntl.h> 86#include <sys/fcntl.h>
@@ -1743,28 +1743,26 @@ do_open(lwp_t *l, struct vnode *dvp, str @@ -1743,28 +1743,26 @@ do_open(lwp_t *l, struct vnode *dvp, str
1743 return EINVAL; 1743 return EINVAL;
1744 1744
1745 if ((error = fd_allocfile(&fp, &indx)) != 0) { 1745 if ((error = fd_allocfile(&fp, &indx)) != 0) {
1746 return error; 1746 return error;
1747 } 1747 }
1748 1748
1749 /* We're going to read cwdi->cwdi_cmask unlocked here. */ 1749 /* We're going to read cwdi->cwdi_cmask unlocked here. */
1750 cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT; 1750 cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1751  1751
1752 error = vn_open(dvp, pb, TRYEMULROOT, flags, cmode, 1752 error = vn_open(dvp, pb, TRYEMULROOT, flags, cmode,
1753 &vp, &dupfd_move, &dupfd); 1753 &vp, &dupfd_move, &dupfd);
1754 if (error != 0) { 1754 if (error != 0) {
1755 fd_abort(p, fp, indx); 1755 fd_abort(p, fp, indx);
1756 if (error == ERESTART) 
1757 error = EINTR; 
1758 return error; 1756 return error;
1759 } 1757 }
1760 1758
1761 if (vp == NULL) { 1759 if (vp == NULL) {
1762 fd_abort(p, fp, indx); 1760 fd_abort(p, fp, indx);
1763 error = fd_dupopen(dupfd, dupfd_move, flags, &indx); 1761 error = fd_dupopen(dupfd, dupfd_move, flags, &indx);
1764 if (error) 1762 if (error)
1765 return error; 1763 return error;
1766 *fd = indx; 1764 *fd = indx;
1767 } else { 1765 } else {
1768 error = open_setfp(l, fp, vp, indx, flags); 1766 error = open_setfp(l, fp, vp, indx, flags);
1769 if (error) 1767 if (error)
1770 return error; 1768 return error;