| @@ -1,22 +1,22 @@ | | | @@ -1,22 +1,22 @@ |
1 | /* $NetBSD: uipc_usrreq.c,v 1.119.4.1 2009/02/16 03:31:13 snj Exp $ */ | | 1 | /* $NetBSD: uipc_usrreq.c,v 1.119.4.2 2009/03/18 05:33:23 snj Exp $ */ |
2 | | | 2 | |
3 | /*- | | 3 | /*- |
4 | * Copyright (c) 1998, 2000, 2004, 2008 The NetBSD Foundation, Inc. | | 4 | * Copyright (c) 1998, 2000, 2004, 2008, 2009 The NetBSD Foundation, Inc. |
5 | * All rights reserved. | | 5 | * All rights reserved. |
6 | * | | 6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation | | 7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, | | 8 | * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, |
9 | * NASA Ames Research Center. | | 9 | * NASA Ames Research Center, and by Andrew Doran. |
10 | * | | 10 | * |
11 | * Redistribution and use in source and binary forms, with or without | | 11 | * Redistribution and use in source and binary forms, with or without |
12 | * modification, are permitted provided that the following conditions | | 12 | * modification, are permitted provided that the following conditions |
13 | * are met: | | 13 | * are met: |
14 | * 1. Redistributions of source code must retain the above copyright | | 14 | * 1. Redistributions of source code must retain the above copyright |
15 | * notice, this list of conditions and the following disclaimer. | | 15 | * notice, this list of conditions and the following disclaimer. |
16 | * 2. Redistributions in binary form must reproduce the above copyright | | 16 | * 2. Redistributions in binary form must reproduce the above copyright |
17 | * notice, this list of conditions and the following disclaimer in the | | 17 | * notice, this list of conditions and the following disclaimer in the |
18 | * documentation and/or other materials provided with the distribution. | | 18 | * documentation and/or other materials provided with the distribution. |
19 | * | | 19 | * |
20 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | | 20 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
21 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | | 21 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
22 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | | 22 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| @@ -86,47 +86,49 @@ | | | @@ -86,47 +86,49 @@ |
86 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | | 86 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
87 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | | 87 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
88 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | | 88 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
89 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | | 89 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
90 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | | 90 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
91 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | | 91 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
92 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | | 92 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
93 | * SUCH DAMAGE. | | 93 | * SUCH DAMAGE. |
94 | * | | 94 | * |
95 | * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95 | | 95 | * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95 |
96 | */ | | 96 | */ |
97 | | | 97 | |
98 | #include <sys/cdefs.h> | | 98 | #include <sys/cdefs.h> |
99 | __KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.119.4.1 2009/02/16 03:31:13 snj Exp $"); | | 99 | __KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.119.4.2 2009/03/18 05:33:23 snj Exp $"); |
100 | | | 100 | |
101 | #include <sys/param.h> | | 101 | #include <sys/param.h> |
102 | #include <sys/systm.h> | | 102 | #include <sys/systm.h> |
103 | #include <sys/proc.h> | | 103 | #include <sys/proc.h> |
104 | #include <sys/filedesc.h> | | 104 | #include <sys/filedesc.h> |
105 | #include <sys/domain.h> | | 105 | #include <sys/domain.h> |
106 | #include <sys/protosw.h> | | 106 | #include <sys/protosw.h> |
107 | #include <sys/socket.h> | | 107 | #include <sys/socket.h> |
108 | #include <sys/socketvar.h> | | 108 | #include <sys/socketvar.h> |
109 | #include <sys/unpcb.h> | | 109 | #include <sys/unpcb.h> |
110 | #include <sys/un.h> | | 110 | #include <sys/un.h> |
111 | #include <sys/namei.h> | | 111 | #include <sys/namei.h> |
112 | #include <sys/vnode.h> | | 112 | #include <sys/vnode.h> |
113 | #include <sys/file.h> | | 113 | #include <sys/file.h> |
114 | #include <sys/stat.h> | | 114 | #include <sys/stat.h> |
115 | #include <sys/mbuf.h> | | 115 | #include <sys/mbuf.h> |
116 | #include <sys/kauth.h> | | 116 | #include <sys/kauth.h> |
117 | #include <sys/kmem.h> | | 117 | #include <sys/kmem.h> |
118 | #include <sys/atomic.h> | | 118 | #include <sys/atomic.h> |
119 | #include <sys/uidinfo.h> | | 119 | #include <sys/uidinfo.h> |
| | | 120 | #include <sys/kernel.h> |
| | | 121 | #include <sys/kthread.h> |
120 | | | 122 | |
121 | /* | | 123 | /* |
122 | * Unix communications domain. | | 124 | * Unix communications domain. |
123 | * | | 125 | * |
124 | * TODO: | | 126 | * TODO: |
125 | * SEQPACKET, RDM | | 127 | * SEQPACKET, RDM |
126 | * rethink name space problems | | 128 | * rethink name space problems |
127 | * need a proper out-of-band | | 129 | * need a proper out-of-band |
128 | * | | 130 | * |
129 | * Notes on locking: | | 131 | * Notes on locking: |
130 | * | | 132 | * |
131 | * The generic rules noted in uipc_socket2.c apply. In addition: | | 133 | * The generic rules noted in uipc_socket2.c apply. In addition: |
132 | * | | 134 | * |
| @@ -159,36 +161,54 @@ __KERNEL_RCSID(0, "$NetBSD: uipc_usrreq. | | | @@ -159,36 +161,54 @@ __KERNEL_RCSID(0, "$NetBSD: uipc_usrreq. |
159 | * independent lock because of visibility / garbage collection issues: | | 161 | * independent lock because of visibility / garbage collection issues: |
160 | * if a socket has been associated with a lock at any point, that lock | | 162 | * if a socket has been associated with a lock at any point, that lock |
161 | * must remain valid until the socket is no longer visible in the system. | | 163 | * must remain valid until the socket is no longer visible in the system. |
162 | * The lock must not be freed or otherwise destroyed until any sockets | | 164 | * The lock must not be freed or otherwise destroyed until any sockets |
163 | * that had referenced it have also been destroyed. | | 165 | * that had referenced it have also been destroyed. |
164 | */ | | 166 | */ |
165 | const struct sockaddr_un sun_noname = { | | 167 | const struct sockaddr_un sun_noname = { |
166 | .sun_len = sizeof(sun_noname), | | 168 | .sun_len = sizeof(sun_noname), |
167 | .sun_family = AF_LOCAL, | | 169 | .sun_family = AF_LOCAL, |
168 | }; | | 170 | }; |
169 | ino_t unp_ino; /* prototype for fake inode numbers */ | | 171 | ino_t unp_ino; /* prototype for fake inode numbers */ |
170 | | | 172 | |
171 | struct mbuf *unp_addsockcred(struct lwp *, struct mbuf *); | | 173 | struct mbuf *unp_addsockcred(struct lwp *, struct mbuf *); |
| | | 174 | static void unp_mark(file_t *); |
| | | 175 | static void unp_scan(struct mbuf *, void (*)(file_t *), int); |
| | | 176 | static void unp_discard_now(file_t *); |
| | | 177 | static void unp_discard_later(file_t *); |
| | | 178 | static void unp_thread(void *); |
| | | 179 | static void unp_thread_kick(void); |
172 | static kmutex_t *uipc_lock; | | 180 | static kmutex_t *uipc_lock; |
173 | | | 181 | |
| | | 182 | static kcondvar_t unp_thread_cv; |
| | | 183 | static lwp_t *unp_thread_lwp; |
| | | 184 | static SLIST_HEAD(,file) unp_thread_discard; |
| | | 185 | static int unp_defer; |
| | | 186 | |
174 | /* | | 187 | /* |
175 | * Initialize Unix protocols. | | 188 | * Initialize Unix protocols. |
176 | */ | | 189 | */ |
177 | void | | 190 | void |
178 | uipc_init(void) | | 191 | uipc_init(void) |
179 | { | | 192 | { |
| | | 193 | int error; |
180 | | | 194 | |
181 | uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); | | 195 | uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); |
| | | 196 | cv_init(&unp_thread_cv, "unpgc"); |
| | | 197 | |
| | | 198 | error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, unp_thread, |
| | | 199 | NULL, &unp_thread_lwp, "unpgc"); |
| | | 200 | if (error != 0) |
| | | 201 | panic("uipc_init %d", error); |
182 | } | | 202 | } |
183 | | | 203 | |
184 | /* | | 204 | /* |
185 | * A connection succeeded: disassociate both endpoints from the head's | | 205 | * A connection succeeded: disassociate both endpoints from the head's |
186 | * lock, and make them share their own lock. There is a race here: for | | 206 | * lock, and make them share their own lock. There is a race here: for |
187 | * a very brief time one endpoint will be locked by a different lock | | 207 | * a very brief time one endpoint will be locked by a different lock |
188 | * than the other end. However, since the current thread holds the old | | 208 | * than the other end. However, since the current thread holds the old |
189 | * lock (the listening socket's lock, the head) access can still only be | | 209 | * lock (the listening socket's lock, the head) access can still only be |
190 | * made to one side of the connection. | | 210 | * made to one side of the connection. |
191 | */ | | 211 | */ |
192 | static void | | 212 | static void |
193 | unp_setpeerlocks(struct socket *so, struct socket *so2) | | 213 | unp_setpeerlocks(struct socket *so, struct socket *so2) |
194 | { | | 214 | { |
| @@ -280,31 +300,29 @@ unp_output(struct mbuf *m, struct mbuf * | | | @@ -280,31 +300,29 @@ unp_output(struct mbuf *m, struct mbuf * |
280 | so2 = unp->unp_conn->unp_socket; | | 300 | so2 = unp->unp_conn->unp_socket; |
281 | | | 301 | |
282 | KASSERT(solocked(so2)); | | 302 | KASSERT(solocked(so2)); |
283 | | | 303 | |
284 | if (unp->unp_addr) | | 304 | if (unp->unp_addr) |
285 | sun = unp->unp_addr; | | 305 | sun = unp->unp_addr; |
286 | else | | 306 | else |
287 | sun = &sun_noname; | | 307 | sun = &sun_noname; |
288 | if (unp->unp_conn->unp_flags & UNP_WANTCRED) | | 308 | if (unp->unp_conn->unp_flags & UNP_WANTCRED) |
289 | control = unp_addsockcred(l, control); | | 309 | control = unp_addsockcred(l, control); |
290 | if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m, | | 310 | if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m, |
291 | control) == 0) { | | 311 | control) == 0) { |
292 | so2->so_rcv.sb_overflowed++; | | 312 | so2->so_rcv.sb_overflowed++; |
293 | sounlock(so2); | | | |
294 | unp_dispose(control); | | 313 | unp_dispose(control); |
295 | m_freem(control); | | 314 | m_freem(control); |
296 | m_freem(m); | | 315 | m_freem(m); |
297 | solock(so2); | | | |
298 | return (ENOBUFS); | | 316 | return (ENOBUFS); |
299 | } else { | | 317 | } else { |
300 | sorwakeup(so2); | | 318 | sorwakeup(so2); |
301 | return (0); | | 319 | return (0); |
302 | } | | 320 | } |
303 | } | | 321 | } |
304 | | | 322 | |
305 | void | | 323 | void |
306 | unp_setaddr(struct socket *so, struct mbuf *nam, bool peeraddr) | | 324 | unp_setaddr(struct socket *so, struct mbuf *nam, bool peeraddr) |
307 | { | | 325 | { |
308 | const struct sockaddr_un *sun; | | 326 | const struct sockaddr_un *sun; |
309 | struct unpcb *unp; | | 327 | struct unpcb *unp; |
310 | bool ext; | | 328 | bool ext; |
| @@ -508,31 +526,29 @@ uipc_usrreq(struct socket *so, int req, | | | @@ -508,31 +526,29 @@ uipc_usrreq(struct socket *so, int req, |
508 | * dropped until we have sent | | 526 | * dropped until we have sent |
509 | * the message and disconnected. | | 527 | * the message and disconnected. |
510 | * This is necessary to prevent | | 528 | * This is necessary to prevent |
511 | * intervening control ops, like | | 529 | * intervening control ops, like |
512 | * another connection. | | 530 | * another connection. |
513 | */ | | 531 | */ |
514 | error = unp_connect(so, nam, l); | | 532 | error = unp_connect(so, nam, l); |
515 | } | | 533 | } |
516 | } else { | | 534 | } else { |
517 | if ((so->so_state & SS_ISCONNECTED) == 0) | | 535 | if ((so->so_state & SS_ISCONNECTED) == 0) |
518 | error = ENOTCONN; | | 536 | error = ENOTCONN; |
519 | } | | 537 | } |
520 | if (error) { | | 538 | if (error) { |
521 | sounlock(so); | | | |
522 | unp_dispose(control); | | 539 | unp_dispose(control); |
523 | m_freem(control); | | 540 | m_freem(control); |
524 | m_freem(m); | | 541 | m_freem(m); |
525 | solock(so); | | | |
526 | break; | | 542 | break; |
527 | } | | 543 | } |
528 | KASSERT(p != NULL); | | 544 | KASSERT(p != NULL); |
529 | error = unp_output(m, control, unp, l); | | 545 | error = unp_output(m, control, unp, l); |
530 | if (nam) | | 546 | if (nam) |
531 | unp_disconnect(unp); | | 547 | unp_disconnect(unp); |
532 | break; | | 548 | break; |
533 | } | | 549 | } |
534 | | | 550 | |
535 | case SOCK_STREAM: | | 551 | case SOCK_STREAM: |
536 | #define rcv (&so2->so_rcv) | | 552 | #define rcv (&so2->so_rcv) |
537 | #define snd (&so->so_snd) | | 553 | #define snd (&so->so_snd) |
538 | if (unp->unp_conn == NULL) { | | 554 | if (unp->unp_conn == NULL) { |
| @@ -561,30 +577,28 @@ uipc_usrreq(struct socket *so, int req, | | | @@ -561,30 +577,28 @@ uipc_usrreq(struct socket *so, int req, |
561 | sbappend(rcv, m); | | 577 | sbappend(rcv, m); |
562 | snd->sb_mbmax -= | | 578 | snd->sb_mbmax -= |
563 | rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt; | | 579 | rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt; |
564 | unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt; | | 580 | unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt; |
565 | newhiwat = snd->sb_hiwat - | | 581 | newhiwat = snd->sb_hiwat - |
566 | (rcv->sb_cc - unp->unp_conn->unp_cc); | | 582 | (rcv->sb_cc - unp->unp_conn->unp_cc); |
567 | (void)chgsbsize(so->so_uidinfo, | | 583 | (void)chgsbsize(so->so_uidinfo, |
568 | &snd->sb_hiwat, newhiwat, RLIM_INFINITY); | | 584 | &snd->sb_hiwat, newhiwat, RLIM_INFINITY); |
569 | unp->unp_conn->unp_cc = rcv->sb_cc; | | 585 | unp->unp_conn->unp_cc = rcv->sb_cc; |
570 | sorwakeup(so2); | | 586 | sorwakeup(so2); |
571 | #undef snd | | 587 | #undef snd |
572 | #undef rcv | | 588 | #undef rcv |
573 | if (control != NULL) { | | 589 | if (control != NULL) { |
574 | sounlock(so); | | | |
575 | unp_dispose(control); | | 590 | unp_dispose(control); |
576 | m_freem(control); | | 591 | m_freem(control); |
577 | solock(so); | | | |
578 | } | | 592 | } |
579 | break; | | 593 | break; |
580 | | | 594 | |
581 | default: | | 595 | default: |
582 | panic("uipc 4"); | | 596 | panic("uipc 4"); |
583 | } | | 597 | } |
584 | break; | | 598 | break; |
585 | | | 599 | |
586 | case PRU_ABORT: | | 600 | case PRU_ABORT: |
587 | (void)unp_drop(unp, ECONNABORTED); | | 601 | (void)unp_drop(unp, ECONNABORTED); |
588 | | | 602 | |
589 | KASSERT(so->so_head == NULL); | | 603 | KASSERT(so->so_head == NULL); |
590 | #ifdef DIAGNOSTIC | | 604 | #ifdef DIAGNOSTIC |
| @@ -714,27 +728,28 @@ uipc_ctloutput(int op, struct socket *so | | | @@ -714,27 +728,28 @@ uipc_ctloutput(int op, struct socket *so |
714 | * Both send and receive buffers are allocated PIPSIZ bytes of buffering | | 728 | * Both send and receive buffers are allocated PIPSIZ bytes of buffering |
715 | * for stream sockets, although the total for sender and receiver is | | 729 | * for stream sockets, although the total for sender and receiver is |
716 | * actually only PIPSIZ. | | 730 | * actually only PIPSIZ. |
717 | * Datagram sockets really use the sendspace as the maximum datagram size, | | 731 | * Datagram sockets really use the sendspace as the maximum datagram size, |
718 | * and don't really want to reserve the sendspace. Their recvspace should | | 732 | * and don't really want to reserve the sendspace. Their recvspace should |
719 | * be large enough for at least one max-size datagram plus address. | | 733 | * be large enough for at least one max-size datagram plus address. |
720 | */ | | 734 | */ |
721 | #define PIPSIZ 4096 | | 735 | #define PIPSIZ 4096 |
722 | u_long unpst_sendspace = PIPSIZ; | | 736 | u_long unpst_sendspace = PIPSIZ; |
723 | u_long unpst_recvspace = PIPSIZ; | | 737 | u_long unpst_recvspace = PIPSIZ; |
724 | u_long unpdg_sendspace = 2*1024; /* really max datagram size */ | | 738 | u_long unpdg_sendspace = 2*1024; /* really max datagram size */ |
725 | u_long unpdg_recvspace = 4*1024; | | 739 | u_long unpdg_recvspace = 4*1024; |
726 | | | 740 | |
727 | u_int unp_rights; /* file descriptors in flight */ | | 741 | u_int unp_rights; /* files in flight */ |
| | | 742 | u_int unp_rights_ratio = 2; /* limit, fraction of maxfiles */ |
728 | | | 743 | |
729 | int | | 744 | int |
730 | unp_attach(struct socket *so) | | 745 | unp_attach(struct socket *so) |
731 | { | | 746 | { |
732 | struct unpcb *unp; | | 747 | struct unpcb *unp; |
733 | int error; | | 748 | int error; |
734 | | | 749 | |
735 | switch (so->so_type) { | | 750 | switch (so->so_type) { |
736 | case SOCK_STREAM: | | 751 | case SOCK_STREAM: |
737 | if (so->so_lock == NULL) { | | 752 | if (so->so_lock == NULL) { |
738 | /* | | 753 | /* |
739 | * XXX Assuming that no socket locks are held, | | 754 | * XXX Assuming that no socket locks are held, |
740 | * as this call may sleep. | | 755 | * as this call may sleep. |
| @@ -798,37 +813,34 @@ unp_detach(struct unpcb *unp) | | | @@ -798,37 +813,34 @@ unp_detach(struct unpcb *unp) |
798 | if (unp->unp_conn) | | 813 | if (unp->unp_conn) |
799 | unp_disconnect(unp); | | 814 | unp_disconnect(unp); |
800 | while (unp->unp_refs) { | | 815 | while (unp->unp_refs) { |
801 | KASSERT(solocked2(so, unp->unp_refs->unp_socket)); | | 816 | KASSERT(solocked2(so, unp->unp_refs->unp_socket)); |
802 | if (unp_drop(unp->unp_refs, ECONNRESET)) { | | 817 | if (unp_drop(unp->unp_refs, ECONNRESET)) { |
803 | solock(so); | | 818 | solock(so); |
804 | goto retry; | | 819 | goto retry; |
805 | } | | 820 | } |
806 | } | | 821 | } |
807 | soisdisconnected(so); | | 822 | soisdisconnected(so); |
808 | so->so_pcb = NULL; | | 823 | so->so_pcb = NULL; |
809 | if (unp_rights) { | | 824 | if (unp_rights) { |
810 | /* | | 825 | /* |
811 | * Normally the receive buffer is flushed later, | | 826 | * Normally the receive buffer is flushed later, in sofree, |
812 | * in sofree, but if our receive buffer holds references | | 827 | * but if our receive buffer holds references to files that |
813 | * to descriptors that are now garbage, we will dispose | | 828 | * are now garbage, we will enqueue those file references to |
814 | * of those descriptor references after the garbage collector | | 829 | * the garbage collector and kick it into action. |
815 | * gets them (resulting in a "panic: closef: count < 0"). | | | |
816 | */ | | 830 | */ |
817 | sorflush(so); | | 831 | sorflush(so); |
818 | unp_free(unp); | | 832 | unp_free(unp); |
819 | sounlock(so); | | 833 | unp_thread_kick(); |
820 | unp_gc(); | | | |
821 | solock(so); | | | |
822 | } else | | 834 | } else |
823 | unp_free(unp); | | 835 | unp_free(unp); |
824 | } | | 836 | } |
825 | | | 837 | |
826 | int | | 838 | int |
827 | unp_bind(struct socket *so, struct mbuf *nam, struct lwp *l) | | 839 | unp_bind(struct socket *so, struct mbuf *nam, struct lwp *l) |
828 | { | | 840 | { |
829 | struct sockaddr_un *sun; | | 841 | struct sockaddr_un *sun; |
830 | struct unpcb *unp; | | 842 | struct unpcb *unp; |
831 | vnode_t *vp; | | 843 | vnode_t *vp; |
832 | struct vattr vattr; | | 844 | struct vattr vattr; |
833 | size_t addrlen; | | 845 | size_t addrlen; |
834 | int error; | | 846 | int error; |
| @@ -1155,66 +1167,62 @@ unp_externalize(struct mbuf *rights, str | | | @@ -1155,66 +1167,62 @@ unp_externalize(struct mbuf *rights, str |
1155 | struct proc *p = l->l_proc; | | 1167 | struct proc *p = l->l_proc; |
1156 | int i, *fdp; | | 1168 | int i, *fdp; |
1157 | file_t **rp; | | 1169 | file_t **rp; |
1158 | file_t *fp; | | 1170 | file_t *fp; |
1159 | int nfds, error = 0; | | 1171 | int nfds, error = 0; |
1160 | | | 1172 | |
1161 | nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / | | 1173 | nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / |
1162 | sizeof(file_t *); | | 1174 | sizeof(file_t *); |
1163 | rp = (file_t **)CMSG_DATA(cm); | | 1175 | rp = (file_t **)CMSG_DATA(cm); |
1164 | | | 1176 | |
1165 | fdp = malloc(nfds * sizeof(int), M_TEMP, M_WAITOK); | | 1177 | fdp = malloc(nfds * sizeof(int), M_TEMP, M_WAITOK); |
1166 | rw_enter(&p->p_cwdi->cwdi_lock, RW_READER); | | 1178 | rw_enter(&p->p_cwdi->cwdi_lock, RW_READER); |
1167 | | | 1179 | |
1168 | /* Make sure the recipient should be able to see the descriptors.. */ | | 1180 | /* Make sure the recipient should be able to see the files.. */ |
1169 | if (p->p_cwdi->cwdi_rdir != NULL) { | | 1181 | if (p->p_cwdi->cwdi_rdir != NULL) { |
1170 | rp = (file_t **)CMSG_DATA(cm); | | 1182 | rp = (file_t **)CMSG_DATA(cm); |
1171 | for (i = 0; i < nfds; i++) { | | 1183 | for (i = 0; i < nfds; i++) { |
1172 | fp = *rp++; | | 1184 | fp = *rp++; |
1173 | /* | | 1185 | /* |
1174 | * If we are in a chroot'ed directory, and | | 1186 | * If we are in a chroot'ed directory, and |
1175 | * someone wants to pass us a directory, make | | 1187 | * someone wants to pass us a directory, make |
1176 | * sure it's inside the subtree we're allowed | | 1188 | * sure it's inside the subtree we're allowed |
1177 | * to access. | | 1189 | * to access. |
1178 | */ | | 1190 | */ |
1179 | if (fp->f_type == DTYPE_VNODE) { | | 1191 | if (fp->f_type == DTYPE_VNODE) { |
1180 | vnode_t *vp = (vnode_t *)fp->f_data; | | 1192 | vnode_t *vp = (vnode_t *)fp->f_data; |
1181 | if ((vp->v_type == VDIR) && | | 1193 | if ((vp->v_type == VDIR) && |
1182 | !vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) { | | 1194 | !vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) { |
1183 | error = EPERM; | | 1195 | error = EPERM; |
1184 | break; | | 1196 | break; |
1185 | } | | 1197 | } |
1186 | } | | 1198 | } |
1187 | } | | 1199 | } |
1188 | } | | 1200 | } |
1189 | | | 1201 | |
1190 | restart: | | 1202 | restart: |
1191 | rp = (file_t **)CMSG_DATA(cm); | | 1203 | rp = (file_t **)CMSG_DATA(cm); |
1192 | if (error != 0) { | | 1204 | if (error != 0) { |
1193 | for (i = 0; i < nfds; i++) { | | 1205 | for (i = 0; i < nfds; i++) { |
1194 | fp = *rp; | | 1206 | fp = *rp; |
1195 | /* | | | |
1196 | * zero the pointer before calling unp_discard, | | | |
1197 | * since it may end up in unp_gc().. | | | |
1198 | */ | | | |
1199 | *rp++ = 0; | | 1207 | *rp++ = 0; |
1200 | unp_discard(fp); | | 1208 | unp_discard_now(fp); |
1201 | } | | 1209 | } |
1202 | goto out; | | 1210 | goto out; |
1203 | } | | 1211 | } |
1204 | | | 1212 | |
1205 | /* | | 1213 | /* |
1206 | * First loop -- allocate file descriptor table slots for the | | 1214 | * First loop -- allocate file descriptor table slots for the |
1207 | * new descriptors. | | 1215 | * new files. |
1208 | */ | | 1216 | */ |
1209 | for (i = 0; i < nfds; i++) { | | 1217 | for (i = 0; i < nfds; i++) { |
1210 | fp = *rp++; | | 1218 | fp = *rp++; |
1211 | if ((error = fd_alloc(p, 0, &fdp[i])) != 0) { | | 1219 | if ((error = fd_alloc(p, 0, &fdp[i])) != 0) { |
1212 | /* | | 1220 | /* |
1213 | * Back out what we've done so far. | | 1221 | * Back out what we've done so far. |
1214 | */ | | 1222 | */ |
1215 | for (--i; i >= 0; i--) { | | 1223 | for (--i; i >= 0; i--) { |
1216 | fd_abort(p, NULL, fdp[i]); | | 1224 | fd_abort(p, NULL, fdp[i]); |
1217 | } | | 1225 | } |
1218 | if (error == ENOSPC) { | | 1226 | if (error == ENOSPC) { |
1219 | fd_tryexpand(p); | | 1227 | fd_tryexpand(p); |
1220 | error = 0; | | 1228 | error = 0; |
| @@ -1222,27 +1230,27 @@ unp_externalize(struct mbuf *rights, str | | | @@ -1222,27 +1230,27 @@ unp_externalize(struct mbuf *rights, str |
1222 | /* | | 1230 | /* |
1223 | * This is the error that has historically | | 1231 | * This is the error that has historically |
1224 | * been returned, and some callers may | | 1232 | * been returned, and some callers may |
1225 | * expect it. | | 1233 | * expect it. |
1226 | */ | | 1234 | */ |
1227 | error = EMSGSIZE; | | 1235 | error = EMSGSIZE; |
1228 | } | | 1236 | } |
1229 | goto restart; | | 1237 | goto restart; |
1230 | } | | 1238 | } |
1231 | } | | 1239 | } |
1232 | | | 1240 | |
1233 | /* | | 1241 | /* |
1234 | * Now that adding them has succeeded, update all of the | | 1242 | * Now that adding them has succeeded, update all of the |
1235 | * descriptor passing state. | | 1243 | * file passing state and affix the descriptors. |
1236 | */ | | 1244 | */ |
1237 | rp = (file_t **)CMSG_DATA(cm); | | 1245 | rp = (file_t **)CMSG_DATA(cm); |
1238 | for (i = 0; i < nfds; i++) { | | 1246 | for (i = 0; i < nfds; i++) { |
1239 | fp = *rp++; | | 1247 | fp = *rp++; |
1240 | atomic_dec_uint(&unp_rights); | | 1248 | atomic_dec_uint(&unp_rights); |
1241 | fd_affix(p, fp, fdp[i]); | | 1249 | fd_affix(p, fp, fdp[i]); |
1242 | mutex_enter(&fp->f_lock); | | 1250 | mutex_enter(&fp->f_lock); |
1243 | fp->f_msgcount--; | | 1251 | fp->f_msgcount--; |
1244 | mutex_exit(&fp->f_lock); | | 1252 | mutex_exit(&fp->f_lock); |
1245 | /* | | 1253 | /* |
1246 | * Note that fd_affix() adds a reference to the file. | | 1254 | * Note that fd_affix() adds a reference to the file. |
1247 | * The file may already have been closed by another | | 1255 | * The file may already have been closed by another |
1248 | * LWP in the process, so we must drop the reference | | 1256 | * LWP in the process, so we must drop the reference |
| @@ -1257,52 +1265,61 @@ unp_externalize(struct mbuf *rights, str | | | @@ -1257,52 +1265,61 @@ unp_externalize(struct mbuf *rights, str |
1257 | */ | | 1265 | */ |
1258 | memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int)); | | 1266 | memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int)); |
1259 | cm->cmsg_len = CMSG_LEN(nfds * sizeof(int)); | | 1267 | cm->cmsg_len = CMSG_LEN(nfds * sizeof(int)); |
1260 | rights->m_len = CMSG_SPACE(nfds * sizeof(int)); | | 1268 | rights->m_len = CMSG_SPACE(nfds * sizeof(int)); |
1261 | out: | | 1269 | out: |
1262 | rw_exit(&p->p_cwdi->cwdi_lock); | | 1270 | rw_exit(&p->p_cwdi->cwdi_lock); |
1263 | free(fdp, M_TEMP); | | 1271 | free(fdp, M_TEMP); |
1264 | return (error); | | 1272 | return (error); |
1265 | } | | 1273 | } |
1266 | | | 1274 | |
1267 | int | | 1275 | int |
1268 | unp_internalize(struct mbuf **controlp) | | 1276 | unp_internalize(struct mbuf **controlp) |
1269 | { | | 1277 | { |
1270 | struct filedesc *fdescp = curlwp->l_fd; | | 1278 | filedesc_t *fdescp = curlwp->l_fd; |
1271 | struct mbuf *control = *controlp; | | 1279 | struct mbuf *control = *controlp; |
1272 | struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *); | | 1280 | struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *); |
1273 | file_t **rp, **files; | | 1281 | file_t **rp, **files; |
1274 | file_t *fp; | | 1282 | file_t *fp; |
1275 | int i, fd, *fdp; | | 1283 | int i, fd, *fdp; |
1276 | int nfds, error; | | 1284 | int nfds, error; |
| | | 1285 | u_int maxmsg; |
1277 | | | 1286 | |
1278 | error = 0; | | 1287 | error = 0; |
1279 | newcm = NULL; | | 1288 | newcm = NULL; |
1280 | | | 1289 | |
1281 | /* Sanity check the control message header. */ | | 1290 | /* Sanity check the control message header. */ |
1282 | if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET || | | 1291 | if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET || |
1283 | cm->cmsg_len > control->m_len || | | 1292 | cm->cmsg_len > control->m_len || |
1284 | cm->cmsg_len < CMSG_ALIGN(sizeof(*cm))) | | 1293 | cm->cmsg_len < CMSG_ALIGN(sizeof(*cm))) |
1285 | return (EINVAL); | | 1294 | return (EINVAL); |
1286 | | | 1295 | |
1287 | /* | | 1296 | /* |
1288 | * Verify that the file descriptors are valid, and acquire | | 1297 | * Verify that the file descriptors are valid, and acquire |
1289 | * a reference to each. | | 1298 | * a reference to each. |
1290 | */ | | 1299 | */ |
1291 | nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int); | | 1300 | nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int); |
1292 | fdp = (int *)CMSG_DATA(cm); | | 1301 | fdp = (int *)CMSG_DATA(cm); |
| | | 1302 | maxmsg = maxfiles / unp_rights_ratio; |
1293 | for (i = 0; i < nfds; i++) { | | 1303 | for (i = 0; i < nfds; i++) { |
1294 | fd = *fdp++; | | 1304 | fd = *fdp++; |
| | | 1305 | if (atomic_inc_uint_nv(&unp_rights) > maxmsg) { |
| | | 1306 | atomic_dec_uint(&unp_rights); |
| | | 1307 | nfds = i; |
| | | 1308 | error = EAGAIN; |
| | | 1309 | goto out; |
| | | 1310 | } |
1295 | if ((fp = fd_getfile(fd)) == NULL) { | | 1311 | if ((fp = fd_getfile(fd)) == NULL) { |
| | | 1312 | atomic_dec_uint(&unp_rights); |
1296 | nfds = i; | | 1313 | nfds = i; |
1297 | error = EBADF; | | 1314 | error = EBADF; |
1298 | goto out; | | 1315 | goto out; |
1299 | } | | 1316 | } |
1300 | } | | 1317 | } |
1301 | | | 1318 | |
1302 | /* Allocate new space and copy header into it. */ | | 1319 | /* Allocate new space and copy header into it. */ |
1303 | newcm = malloc(CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, M_WAITOK); | | 1320 | newcm = malloc(CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, M_WAITOK); |
1304 | if (newcm == NULL) { | | 1321 | if (newcm == NULL) { |
1305 | error = E2BIG; | | 1322 | error = E2BIG; |
1306 | goto out; | | 1323 | goto out; |
1307 | } | | 1324 | } |
1308 | memcpy(newcm, cm, sizeof(struct cmsghdr)); | | 1325 | memcpy(newcm, cm, sizeof(struct cmsghdr)); |
| @@ -1314,34 +1331,36 @@ unp_internalize(struct mbuf **controlp) | | | @@ -1314,34 +1331,36 @@ unp_internalize(struct mbuf **controlp) |
1314 | * int won't get until we're done. No need to lock, as we have | | 1331 | * int won't get until we're done. No need to lock, as we have |
1315 | * already validated the descriptors with fd_getfile(). | | 1332 | * already validated the descriptors with fd_getfile(). |
1316 | */ | | 1333 | */ |
1317 | fdp = (int *)CMSG_DATA(cm) + nfds; | | 1334 | fdp = (int *)CMSG_DATA(cm) + nfds; |
1318 | rp = files + nfds; | | 1335 | rp = files + nfds; |
1319 | for (i = 0; i < nfds; i++) { | | 1336 | for (i = 0; i < nfds; i++) { |
1320 | fp = fdescp->fd_ofiles[*--fdp]->ff_file; | | 1337 | fp = fdescp->fd_ofiles[*--fdp]->ff_file; |
1321 | KASSERT(fp != NULL); | | 1338 | KASSERT(fp != NULL); |
1322 | mutex_enter(&fp->f_lock); | | 1339 | mutex_enter(&fp->f_lock); |
1323 | *--rp = fp; | | 1340 | *--rp = fp; |
1324 | fp->f_count++; | | 1341 | fp->f_count++; |
1325 | fp->f_msgcount++; | | 1342 | fp->f_msgcount++; |
1326 | mutex_exit(&fp->f_lock); | | 1343 | mutex_exit(&fp->f_lock); |
1327 | atomic_inc_uint(&unp_rights); | | | |
1328 | } | | 1344 | } |
1329 | | | 1345 | |
1330 | out: | | 1346 | out: |
1331 | /* Release descriptor references. */ | | 1347 | /* Release descriptor references. */ |
1332 | fdp = (int *)CMSG_DATA(cm); | | 1348 | fdp = (int *)CMSG_DATA(cm); |
1333 | for (i = 0; i < nfds; i++) { | | 1349 | for (i = 0; i < nfds; i++) { |
1334 | fd_putfile(*fdp++); | | 1350 | fd_putfile(*fdp++); |
| | | 1351 | if (error != 0) { |
| | | 1352 | atomic_dec_uint(&unp_rights); |
| | | 1353 | } |
1335 | } | | 1354 | } |
1336 | | | 1355 | |
1337 | if (error == 0) { | | 1356 | if (error == 0) { |
1338 | if (control->m_flags & M_EXT) { | | 1357 | if (control->m_flags & M_EXT) { |
1339 | m_freem(control); | | 1358 | m_freem(control); |
1340 | *controlp = control = m_get(M_WAIT, MT_CONTROL); | | 1359 | *controlp = control = m_get(M_WAIT, MT_CONTROL); |
1341 | } | | 1360 | } |
1342 | MEXTADD(control, newcm, CMSG_SPACE(nfds * sizeof(file_t *)), | | 1361 | MEXTADD(control, newcm, CMSG_SPACE(nfds * sizeof(file_t *)), |
1343 | M_MBUF, NULL, NULL); | | 1362 | M_MBUF, NULL, NULL); |
1344 | cm = newcm; | | 1363 | cm = newcm; |
1345 | /* | | 1364 | /* |
1346 | * Adjust message & mbuf to note amount of space | | 1365 | * Adjust message & mbuf to note amount of space |
1347 | * actually used. | | 1366 | * actually used. |
| @@ -1394,297 +1413,340 @@ unp_addsockcred(struct lwp *l, struct mb | | | @@ -1394,297 +1413,340 @@ unp_addsockcred(struct lwp *l, struct mb |
1394 | /* | | 1413 | /* |
1395 | * If a control message already exists, append us to the end. | | 1414 | * If a control message already exists, append us to the end. |
1396 | */ | | 1415 | */ |
1397 | if (control != NULL) { | | 1416 | if (control != NULL) { |
1398 | for (n = control; n->m_next != NULL; n = n->m_next) | | 1417 | for (n = control; n->m_next != NULL; n = n->m_next) |
1399 | ; | | 1418 | ; |
1400 | n->m_next = m; | | 1419 | n->m_next = m; |
1401 | } else | | 1420 | } else |
1402 | control = m; | | 1421 | control = m; |
1403 | | | 1422 | |
1404 | return (control); | | 1423 | return (control); |
1405 | } | | 1424 | } |
1406 | | | 1425 | |
1407 | int unp_defer, unp_gcing; | | | |
1408 | extern struct domain unixdomain; | | | |
1409 | | | | |
1410 | /* | | 1426 | /* |
1411 | * Comment added long after the fact explaining what's going on here. | | 1427 | * Do a mark-sweep GC of files in the system, to free up any which are |
1412 | * Do a mark-sweep GC of file descriptors on the system, to free up | | 1428 | * caught in flight to an about-to-be-closed socket. Additionally, |
1413 | * any which are caught in flight to an about-to-be-closed socket. | | 1429 | * process deferred file closures. |
1414 | * | | | |
1415 | * Traditional mark-sweep gc's start at the "root", and mark | | | |
1416 | * everything reachable from the root (which, in our case would be the | | | |
1417 | * process table). The mark bits are cleared during the sweep. | | | |
1418 | * | | | |
1419 | * XXX For some inexplicable reason (perhaps because the file | | | |
1420 | * descriptor tables used to live in the u area which could be swapped | | | |
1421 | * out and thus hard to reach), we do multiple scans over the set of | | | |
1422 | * descriptors, using use *two* mark bits per object (DEFER and MARK). | | | |
1423 | * Whenever we find a descriptor which references other descriptors, | | | |
1424 | * the ones it references are marked with both bits, and we iterate | | | |
1425 | * over the whole file table until there are no more DEFER bits set. | | | |
1426 | * We also make an extra pass *before* the GC to clear the mark bits, | | | |
1427 | * which could have been cleared at almost no cost during the previous | | | |
1428 | * sweep. | | | |
1429 | */ | | 1430 | */ |
1430 | void | | 1431 | static void |
1431 | unp_gc(void) | | 1432 | unp_gc(file_t *dp) |
1432 | { | | 1433 | { |
1433 | file_t *fp, *nextfp; | | 1434 | extern struct domain unixdomain; |
| | | 1435 | file_t *fp, *np; |
1434 | struct socket *so, *so1; | | 1436 | struct socket *so, *so1; |
1435 | file_t **extra_ref, **fpp; | | 1437 | u_int i, old, new; |
1436 | int nunref, nslots, i; | | 1438 | bool didwork; |
1437 | | | 1439 | |
1438 | if (atomic_swap_uint(&unp_gcing, 1) == 1) | | 1440 | KASSERT(curlwp == unp_thread_lwp); |
1439 | return; | | 1441 | KASSERT(mutex_owned(&filelist_lock)); |
1440 | | | 1442 | |
1441 | restart: | | 1443 | /* |
1442 | nslots = nfiles * 2; | | 1444 | * First, process deferred file closures. |
1443 | extra_ref = kmem_alloc(nslots * sizeof(file_t *), KM_SLEEP); | | 1445 | */ |
| | | 1446 | while (!SLIST_EMPTY(&unp_thread_discard)) { |
| | | 1447 | fp = SLIST_FIRST(&unp_thread_discard); |
| | | 1448 | KASSERT(fp->f_unpcount > 0); |
| | | 1449 | KASSERT(fp->f_count > 0); |
| | | 1450 | KASSERT(fp->f_msgcount > 0); |
| | | 1451 | KASSERT(fp->f_count >= fp->f_unpcount); |
| | | 1452 | KASSERT(fp->f_count >= fp->f_msgcount); |
| | | 1453 | KASSERT(fp->f_msgcount >= fp->f_unpcount); |
| | | 1454 | SLIST_REMOVE_HEAD(&unp_thread_discard, f_unplist); |
| | | 1455 | i = fp->f_unpcount; |
| | | 1456 | fp->f_unpcount = 0; |
| | | 1457 | mutex_exit(&filelist_lock); |
| | | 1458 | for (; i != 0; i--) { |
| | | 1459 | unp_discard_now(fp); |
| | | 1460 | } |
| | | 1461 | mutex_enter(&filelist_lock); |
| | | 1462 | } |
1444 | | | 1463 | |
1445 | mutex_enter(&filelist_lock); | | 1464 | /* |
| | | 1465 | * Clear mark bits. Ensure that we don't consider new files |
| | | 1466 | * entering the file table during this loop (they will not have |
| | | 1467 | * FSCAN set). |
| | | 1468 | */ |
1446 | unp_defer = 0; | | 1469 | unp_defer = 0; |
1447 | | | | |
1448 | /* Clear mark bits */ | | | |
1449 | LIST_FOREACH(fp, &filehead, f_list) { | | 1470 | LIST_FOREACH(fp, &filehead, f_list) { |
1450 | atomic_and_uint(&fp->f_flag, ~(FMARK|FDEFER)); | | 1471 | for (old = fp->f_flag;; old = new) { |
| | | 1472 | new = atomic_cas_uint(&fp->f_flag, old, |
| | | 1473 | (old | FSCAN) & ~(FMARK|FDEFER)); |
| | | 1474 | if (__predict_true(old == new)) { |
| | | 1475 | break; |
| | | 1476 | } |
| | | 1477 | } |
1451 | } | | 1478 | } |
1452 | | | 1479 | |
1453 | /* | | 1480 | /* |
1454 | * Iterate over the set of descriptors, marking ones believed | | 1481 | * Iterate over the set of sockets, marking ones believed (based on |
1455 | * (based on refcount) to be referenced from a process, and | | 1482 | * refcount) to be referenced from a process, and marking for rescan |
1456 | * marking for rescan descriptors which are queued on a socket. | | 1483 | * sockets which are queued on a socket. Rescan continues descending |
| | | 1484 | * and searching for sockets referenced by sockets (FDEFER), until |
| | | 1485 | * there are no more socket->socket references to be discovered. |
1457 | */ | | 1486 | */ |
1458 | do { | | 1487 | do { |
1459 | LIST_FOREACH(fp, &filehead, f_list) { | | 1488 | didwork = false; |
| | | 1489 | for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) { |
| | | 1490 | KASSERT(mutex_owned(&filelist_lock)); |
| | | 1491 | np = LIST_NEXT(fp, f_list); |
1460 | mutex_enter(&fp->f_lock); | | 1492 | mutex_enter(&fp->f_lock); |
1461 | if (fp->f_flag & FDEFER) { | | 1493 | if ((fp->f_flag & FDEFER) != 0) { |
1462 | atomic_and_uint(&fp->f_flag, ~FDEFER); | | 1494 | atomic_and_uint(&fp->f_flag, ~FDEFER); |
1463 | unp_defer--; | | 1495 | unp_defer--; |
1464 | KASSERT(fp->f_count != 0); | | 1496 | KASSERT(fp->f_count != 0); |
1465 | } else { | | 1497 | } else { |
1466 | if (fp->f_count == 0 || | | 1498 | if (fp->f_count == 0 || |
1467 | (fp->f_flag & FMARK) || | | 1499 | (fp->f_flag & FMARK) != 0 || |
1468 | fp->f_count == fp->f_msgcount) { | | 1500 | fp->f_count == fp->f_msgcount || |
| | | 1501 | fp->f_unpcount != 0) { |
1469 | mutex_exit(&fp->f_lock); | | 1502 | mutex_exit(&fp->f_lock); |
1470 | continue; | | 1503 | continue; |
1471 | } | | 1504 | } |
1472 | } | | 1505 | } |
1473 | atomic_or_uint(&fp->f_flag, FMARK); | | 1506 | atomic_or_uint(&fp->f_flag, FMARK); |
1474 | | | 1507 | |
1475 | if (fp->f_type != DTYPE_SOCKET || | | 1508 | if (fp->f_type != DTYPE_SOCKET || |
1476 | (so = fp->f_data) == NULL || | | 1509 | (so = fp->f_data) == NULL || |
1477 | so->so_proto->pr_domain != &unixdomain || | | 1510 | so->so_proto->pr_domain != &unixdomain || |
1478 | (so->so_proto->pr_flags&PR_RIGHTS) == 0) { | | 1511 | (so->so_proto->pr_flags & PR_RIGHTS) == 0) { |
1479 | mutex_exit(&fp->f_lock); | | 1512 | mutex_exit(&fp->f_lock); |
1480 | continue; | | 1513 | continue; |
1481 | } | | 1514 | } |
1482 | #ifdef notdef | | 1515 | |
1483 | if (so->so_rcv.sb_flags & SB_LOCK) { | | 1516 | /* Gain file ref, mark our position, and unlock. */ |
1484 | mutex_exit(&fp->f_lock); | | 1517 | didwork = true; |
1485 | mutex_exit(&filelist_lock); | | 1518 | LIST_INSERT_AFTER(fp, dp, f_list); |
1486 | kmem_free(extra_ref, nslots * sizeof(file_t *)); | | 1519 | fp->f_count++; |
1487 | /* | | | |
1488 | * This is problematical; it's not clear | | | |
1489 | * we need to wait for the sockbuf to be | | | |
1490 | * unlocked (on a uniprocessor, at least), | | | |
1491 | * and it's also not clear what to do | | | |
1492 | * if sbwait returns an error due to receipt | | | |
1493 | * of a signal. If sbwait does return | | | |
1494 | * an error, we'll go into an infinite | | | |
1495 | * loop. Delete all of this for now. | | | |
1496 | */ | | | |
1497 | (void) sbwait(&so->so_rcv); | | | |
1498 | goto restart; | | | |
1499 | } | | | |
1500 | #endif | | | |
1501 | mutex_exit(&fp->f_lock); | | 1520 | mutex_exit(&fp->f_lock); |
| | | 1521 | mutex_exit(&filelist_lock); |
1502 | | | 1522 | |
1503 | /* | | 1523 | /* |
1504 | * XXX Locking a socket with filelist_lock held | | 1524 | * Mark files referenced from sockets queued on the |
1505 | * is ugly. filelist_lock can be taken by the | | 1525 | * accept queue as well. |
1506 | * pagedaemon when reclaiming items from file_cache. | | | |
1507 | * Socket activity could delay the pagedaemon. | | | |
1508 | */ | | 1526 | */ |
1509 | solock(so); | | 1527 | solock(so); |
1510 | unp_scan(so->so_rcv.sb_mb, unp_mark, 0); | | 1528 | unp_scan(so->so_rcv.sb_mb, unp_mark, 0); |
1511 | /* | | 1529 | if ((so->so_options & SO_ACCEPTCONN) != 0) { |
1512 | * Mark descriptors referenced from sockets queued | | | |
1513 | * on the accept queue as well. | | | |
1514 | */ | | | |
1515 | if (so->so_options & SO_ACCEPTCONN) { | | | |
1516 | TAILQ_FOREACH(so1, &so->so_q0, so_qe) { | | 1530 | TAILQ_FOREACH(so1, &so->so_q0, so_qe) { |
1517 | unp_scan(so1->so_rcv.sb_mb, unp_mark, 0); | | 1531 | unp_scan(so1->so_rcv.sb_mb, unp_mark, 0); |
1518 | } | | 1532 | } |
1519 | TAILQ_FOREACH(so1, &so->so_q, so_qe) { | | 1533 | TAILQ_FOREACH(so1, &so->so_q, so_qe) { |
1520 | unp_scan(so1->so_rcv.sb_mb, unp_mark, 0); | | 1534 | unp_scan(so1->so_rcv.sb_mb, unp_mark, 0); |
1521 | } | | 1535 | } |
1522 | } | | 1536 | } |
1523 | sounlock(so); | | 1537 | sounlock(so); |
| | | 1538 | |
| | | 1539 | /* Re-lock and restart from where we left off. */ |
| | | 1540 | closef(fp); |
| | | 1541 | mutex_enter(&filelist_lock); |
| | | 1542 | np = LIST_NEXT(dp, f_list); |
| | | 1543 | LIST_REMOVE(dp, f_list); |
1524 | } | | 1544 | } |
1525 | } while (unp_defer); | | 1545 | /* |
| | | 1546 | * Bail early if we did nothing in the loop above. Could |
| | | 1547 | * happen because of concurrent activity causing unp_defer |
| | | 1548 | * to get out of sync. |
| | | 1549 | */ |
| | | 1550 | } while (unp_defer != 0 && didwork); |
1526 | | | 1551 | |
1527 | /* | | 1552 | /* |
1528 | * Sweep pass. Find unmarked descriptors, and free them. | | 1553 | * Sweep pass. |
1529 | * | | | |
1530 | * We grab an extra reference to each of the file table entries | | | |
1531 | * that are not otherwise accessible and then free the rights | | | |
1532 | * that are stored in messages on them. | | | |
1533 | * | | | |
1534 | * The bug in the original code is a little tricky, so I'll describe | | | |
1535 | * what's wrong with it here. | | | |
1536 | * | | | |
1537 | * It is incorrect to simply unp_discard each entry for f_msgcount | | | |
1538 | * times -- consider the case of sockets A and B that contain | | | |
1539 | * references to each other. On a last close of some other socket, | | | |
1540 | * we trigger a gc since the number of outstanding rights (unp_rights) | | | |
1541 | * is non-zero. If during the sweep phase the gc code un_discards, | | | |
1542 | * we end up doing a (full) closef on the descriptor. A closef on A | | | |
1543 | * results in the following chain. Closef calls soo_close, which | | | |
1544 | * calls soclose. Soclose calls first (through the switch | | | |
1545 | * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply | | | |
1546 | * returns because the previous instance had set unp_gcing, and | | | |
1547 | * we return all the way back to soclose, which marks the socket | | | |
1548 | * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush | | | |
1549 | * to free up the rights that are queued in messages on the socket A, | | | |
1550 | * i.e., the reference on B. The sorflush calls via the dom_dispose | | | |
1551 | * switch unp_dispose, which unp_scans with unp_discard. This second | | | |
1552 | * instance of unp_discard just calls closef on B. | | | |
1553 | * | | 1554 | * |
1554 | * Well, a similar chain occurs on B, resulting in a sorflush on B, | | 1555 | * We grab an extra reference to each of the files that are |
1555 | * which results in another closef on A. Unfortunately, A is already | | 1556 | * not otherwise accessible and then free the rights that are |
1556 | * being closed, and the descriptor has already been marked with | | 1557 | * stored in messages on them. |
1557 | * SS_NOFDREF, and soclose panics at this point. | | | |
1558 | * | | | |
1559 | * Here, we first take an extra reference to each inaccessible | | | |
1560 | * descriptor. Then, if the inaccessible descriptor is a | | | |
1561 | * socket, we call sorflush in case it is a Unix domain | | | |
1562 | * socket. After we destroy all the rights carried in | | | |
1563 | * messages, we do a last closef to get rid of our extra | | | |
1564 | * reference. This is the last close, and the unp_detach etc | | | |
1565 | * will shut down the socket. | | | |
1566 | * | | | |
1567 | * 91/09/19, bsy@cs.cmu.edu | | | |
1568 | */ | | 1558 | */ |
1569 | if (nslots < nfiles) { | | 1559 | for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) { |
1570 | mutex_exit(&filelist_lock); | | 1560 | KASSERT(mutex_owned(&filelist_lock)); |
1571 | kmem_free(extra_ref, nslots * sizeof(file_t *)); | | 1561 | np = LIST_NEXT(fp, f_list); |
1572 | goto restart; | | | |
1573 | } | | | |
1574 | for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != 0; | | | |
1575 | fp = nextfp) { | | | |
1576 | nextfp = LIST_NEXT(fp, f_list); | | | |
1577 | mutex_enter(&fp->f_lock); | | 1562 | mutex_enter(&fp->f_lock); |
1578 | if (fp->f_count != 0 && | | 1563 | |
1579 | fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) { | | 1564 | /* |
1580 | *fpp++ = fp; | | 1565 | * Ignore non-sockets. |
1581 | nunref++; | | 1566 | * Ignore dead sockets, or sockets with pending close. |
1582 | fp->f_count++; | | 1567 | * Ignore sockets obviously referenced elsewhere. |
| | | 1568 | * Ignore sockets marked as referenced by our scan. |
| | | 1569 | * Ignore new sockets that did not exist during the scan. |
| | | 1570 | */ |
| | | 1571 | if (fp->f_type != DTYPE_SOCKET || |
| | | 1572 | fp->f_count == 0 || fp->f_unpcount != 0 || |
| | | 1573 | fp->f_count != fp->f_msgcount || |
| | | 1574 | (fp->f_flag & (FMARK | FSCAN)) != FSCAN) { |
| | | 1575 | mutex_exit(&fp->f_lock); |
| | | 1576 | continue; |
1583 | } | | 1577 | } |
| | | 1578 | |
| | | 1579 | /* Gain file ref, mark our position, and unlock. */ |
| | | 1580 | LIST_INSERT_AFTER(fp, dp, f_list); |
| | | 1581 | fp->f_count++; |
1584 | mutex_exit(&fp->f_lock); | | 1582 | mutex_exit(&fp->f_lock); |
| | | 1583 | mutex_exit(&filelist_lock); |
| | | 1584 | |
| | | 1585 | /* |
| | | 1586 | * Flush all data from the socket's receive buffer. |
| | | 1587 | * This will cause files referenced only by the |
| | | 1588 | * socket to be queued for close. |
| | | 1589 | */ |
| | | 1590 | so = fp->f_data; |
| | | 1591 | solock(so); |
| | | 1592 | sorflush(so); |
| | | 1593 | sounlock(so); |
| | | 1594 | |
| | | 1595 | /* Re-lock and restart from where we left off. */ |
| | | 1596 | closef(fp); |
| | | 1597 | mutex_enter(&filelist_lock); |
| | | 1598 | np = LIST_NEXT(dp, f_list); |
| | | 1599 | LIST_REMOVE(dp, f_list); |
1585 | } | | 1600 | } |
1586 | mutex_exit(&filelist_lock); | | 1601 | } |
1587 | | | 1602 | |
1588 | for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { | | 1603 | /* |
1589 | fp = *fpp; | | 1604 | * Garbage collector thread. While SCM_RIGHTS messages are in transit, |
1590 | if (fp->f_type == DTYPE_SOCKET) { | | 1605 | * wake once per second to garbage collect. Run continually while we |
1591 | so = fp->f_data; | | 1606 | * have deferred closes to process. |
1592 | solock(so); | | 1607 | */ |
1593 | sorflush(fp->f_data); | | 1608 | static void |
1594 | sounlock(so); | | 1609 | unp_thread(void *cookie) |
| | | 1610 | { |
| | | 1611 | file_t *dp; |
| | | 1612 | |
| | | 1613 | /* Allocate a dummy file for our scans. */ |
| | | 1614 | if ((dp = fgetdummy()) == NULL) { |
| | | 1615 | panic("unp_thread"); |
| | | 1616 | } |
| | | 1617 | |
| | | 1618 | mutex_enter(&filelist_lock); |
| | | 1619 | for (;;) { |
| | | 1620 | KASSERT(mutex_owned(&filelist_lock)); |
| | | 1621 | if (SLIST_EMPTY(&unp_thread_discard)) { |
| | | 1622 | if (unp_rights != 0) { |
| | | 1623 | (void)cv_timedwait(&unp_thread_cv, |
| | | 1624 | &filelist_lock, hz); |
| | | 1625 | } else { |
| | | 1626 | cv_wait(&unp_thread_cv, &filelist_lock); |
| | | 1627 | } |
1595 | } | | 1628 | } |
| | | 1629 | unp_gc(dp); |
1596 | } | | 1630 | } |
1597 | for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { | | 1631 | /* NOTREACHED */ |
1598 | closef(*fpp); | | 1632 | } |
| | | 1633 | |
| | | 1634 | /* |
| | | 1635 | * Kick the garbage collector into action if there is something for |
| | | 1636 | * it to process. |
| | | 1637 | */ |
| | | 1638 | static void |
| | | 1639 | unp_thread_kick(void) |
| | | 1640 | { |
| | | 1641 | |
| | | 1642 | if (!SLIST_EMPTY(&unp_thread_discard) || unp_rights != 0) { |
| | | 1643 | mutex_enter(&filelist_lock); |
| | | 1644 | cv_signal(&unp_thread_cv); |
| | | 1645 | mutex_exit(&filelist_lock); |
1599 | } | | 1646 | } |
1600 | kmem_free(extra_ref, nslots * sizeof(file_t *)); | | | |
1601 | atomic_swap_uint(&unp_gcing, 0); | | | |
1602 | } | | 1647 | } |
1603 | | | 1648 | |
1604 | void | | 1649 | void |
1605 | unp_dispose(struct mbuf *m) | | 1650 | unp_dispose(struct mbuf *m) |
1606 | { | | 1651 | { |
1607 | | | 1652 | |
1608 | if (m) | | 1653 | if (m) |
1609 | unp_scan(m, unp_discard, 1); | | 1654 | unp_scan(m, unp_discard_later, 1); |
1610 | } | | 1655 | } |
1611 | | | 1656 | |
1612 | void | | 1657 | void |
1613 | unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard) | | 1658 | unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard) |
1614 | { | | 1659 | { |
1615 | struct mbuf *m; | | 1660 | struct mbuf *m; |
1616 | file_t **rp; | | 1661 | file_t **rp, *fp; |
1617 | struct cmsghdr *cm; | | 1662 | struct cmsghdr *cm; |
1618 | int i; | | 1663 | int i, qfds; |
1619 | int qfds; | | | |
1620 | | | 1664 | |
1621 | while (m0) { | | 1665 | while (m0) { |
1622 | for (m = m0; m; m = m->m_next) { | | 1666 | for (m = m0; m; m = m->m_next) { |
1623 | if (m->m_type == MT_CONTROL && | | 1667 | if (m->m_type != MT_CONTROL || |
1624 | m->m_len >= sizeof(*cm)) { | | 1668 | m->m_len < sizeof(*cm)) { |
1625 | cm = mtod(m, struct cmsghdr *); | | 1669 | continue; |
1626 | if (cm->cmsg_level != SOL_SOCKET || | | 1670 | } |
1627 | cm->cmsg_type != SCM_RIGHTS) | | 1671 | cm = mtod(m, struct cmsghdr *); |
1628 | continue; | | 1672 | if (cm->cmsg_level != SOL_SOCKET || |
1629 | qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) | | 1673 | cm->cmsg_type != SCM_RIGHTS) |
1630 | / sizeof(file_t *); | | 1674 | continue; |
1631 | rp = (file_t **)CMSG_DATA(cm); | | 1675 | qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) |
1632 | for (i = 0; i < qfds; i++) { | | 1676 | / sizeof(file_t *); |
1633 | file_t *fp = *rp; | | 1677 | rp = (file_t **)CMSG_DATA(cm); |
1634 | if (discard) | | 1678 | for (i = 0; i < qfds; i++) { |
1635 | *rp = 0; | | 1679 | fp = *rp; |
1636 | (*op)(fp); | | 1680 | if (discard) { |
1637 | rp++; | | 1681 | *rp = 0; |
1638 | } | | 1682 | } |
1639 | break; /* XXX, but saves time */ | | 1683 | (*op)(fp); |
| | | 1684 | rp++; |
1640 | } | | 1685 | } |
1641 | } | | 1686 | } |
1642 | m0 = m0->m_nextpkt; | | 1687 | m0 = m0->m_nextpkt; |
1643 | } | | 1688 | } |
1644 | } | | 1689 | } |
1645 | | | 1690 | |
1646 | void | | 1691 | void |
1647 | unp_mark(file_t *fp) | | 1692 | unp_mark(file_t *fp) |
1648 | { | | 1693 | { |
1649 | | | 1694 | |
1650 | if (fp == NULL) | | 1695 | if (fp == NULL) |
1651 | return; | | 1696 | return; |
1652 | | | 1697 | |
1653 | /* If we're already deferred, don't screw up the defer count */ | | 1698 | /* If we're already deferred, don't screw up the defer count */ |
1654 | mutex_enter(&fp->f_lock); | | 1699 | mutex_enter(&fp->f_lock); |
1655 | if (fp->f_flag & (FMARK | FDEFER)) { | | 1700 | if (fp->f_flag & (FMARK | FDEFER)) { |
1656 | mutex_exit(&fp->f_lock); | | 1701 | mutex_exit(&fp->f_lock); |
1657 | return; | | 1702 | return; |
1658 | } | | 1703 | } |
1659 | | | 1704 | |
1660 | /* | | 1705 | /* |
1661 | * Minimize the number of deferrals... Sockets are the only | | 1706 | * Minimize the number of deferrals... Sockets are the only type of |
1662 | * type of descriptor which can hold references to another | | 1707 | * file which can hold references to another file, so just mark |
1663 | * descriptor, so just mark other descriptors, and defer | | 1708 | * other files, and defer unmarked sockets for the next pass. |
1664 | * unmarked sockets for the next pass. | | | |
1665 | */ | | 1709 | */ |
1666 | if (fp->f_type == DTYPE_SOCKET) { | | 1710 | if (fp->f_type == DTYPE_SOCKET) { |
1667 | unp_defer++; | | 1711 | unp_defer++; |
1668 | KASSERT(fp->f_count != 0); | | 1712 | KASSERT(fp->f_count != 0); |
1669 | atomic_or_uint(&fp->f_flag, FDEFER); | | 1713 | atomic_or_uint(&fp->f_flag, FDEFER); |
1670 | } else { | | 1714 | } else { |
1671 | atomic_or_uint(&fp->f_flag, FMARK); | | 1715 | atomic_or_uint(&fp->f_flag, FMARK); |
1672 | } | | 1716 | } |
1673 | mutex_exit(&fp->f_lock); | | 1717 | mutex_exit(&fp->f_lock); |
1674 | return; | | | |
1675 | } | | 1718 | } |
1676 | | | 1719 | |
1677 | void | | 1720 | static void |
1678 | unp_discard(file_t *fp) | | 1721 | unp_discard_now(file_t *fp) |
1679 | { | | 1722 | { |
1680 | | | 1723 | |
1681 | if (fp == NULL) | | 1724 | if (fp == NULL) |
1682 | return; | | 1725 | return; |
1683 | | | 1726 | |
1684 | mutex_enter(&fp->f_lock); | | | |
1685 | KASSERT(fp->f_count > 0); | | 1727 | KASSERT(fp->f_count > 0); |
| | | 1728 | KASSERT(fp->f_msgcount > 0); |
| | | 1729 | |
| | | 1730 | mutex_enter(&fp->f_lock); |
1686 | fp->f_msgcount--; | | 1731 | fp->f_msgcount--; |
1687 | mutex_exit(&fp->f_lock); | | 1732 | mutex_exit(&fp->f_lock); |
1688 | atomic_dec_uint(&unp_rights); | | 1733 | atomic_dec_uint(&unp_rights); |
1689 | (void)closef(fp); | | 1734 | (void)closef(fp); |
1690 | } | | 1735 | } |
| | | 1736 | |
| | | 1737 | static void |
| | | 1738 | unp_discard_later(file_t *fp) |
| | | 1739 | { |
| | | 1740 | |
| | | 1741 | if (fp == NULL) |
| | | 1742 | return; |
| | | 1743 | |
| | | 1744 | KASSERT(fp->f_count > 0); |
| | | 1745 | KASSERT(fp->f_msgcount > 0); |
| | | 1746 | |
| | | 1747 | mutex_enter(&filelist_lock); |
| | | 1748 | if (fp->f_unpcount++ == 0) { |
| | | 1749 | SLIST_INSERT_HEAD(&unp_thread_discard, fp, f_unplist); |
| | | 1750 | } |
| | | 1751 | mutex_exit(&filelist_lock); |
| | | 1752 | } |