Wed Nov 29 19:21:45 2017 UTC
Make tap(4) MP-safe.


(jmcneill)
diff -r1.101 -r1.102 src/sys/net/if_tap.c

cvs diff -r1.101 -r1.102 src/sys/net/if_tap.c

--- src/sys/net/if_tap.c 2017/10/30 16:01:19 1.101
+++ src/sys/net/if_tap.c 2017/11/29 19:21:44 1.102
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: if_tap.c,v 1.101 2017/10/30 16:01:19 ozaki-r Exp $ */ 1/* $NetBSD: if_tap.c,v 1.102 2017/11/29 19:21:44 jmcneill Exp $ */
2 2
3/* 3/*
4 * Copyright (c) 2003, 2004, 2008, 2009 The NetBSD Foundation. 4 * Copyright (c) 2003, 2004, 2008, 2009 The NetBSD Foundation.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -23,47 +23,48 @@ @@ -23,47 +23,48 @@
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29/* 29/*
30 * tap(4) is a virtual Ethernet interface. It appears as a real Ethernet 30 * tap(4) is a virtual Ethernet interface. It appears as a real Ethernet
31 * device to the system, but can also be accessed by userland through a 31 * device to the system, but can also be accessed by userland through a
32 * character device interface, which allows reading and injecting frames. 32 * character device interface, which allows reading and injecting frames.
33 */ 33 */
34 34
35#include <sys/cdefs.h> 35#include <sys/cdefs.h>
36__KERNEL_RCSID(0, "$NetBSD: if_tap.c,v 1.101 2017/10/30 16:01:19 ozaki-r Exp $"); 36__KERNEL_RCSID(0, "$NetBSD: if_tap.c,v 1.102 2017/11/29 19:21:44 jmcneill Exp $");
37 37
38#if defined(_KERNEL_OPT) 38#if defined(_KERNEL_OPT)
39 39
40#include "opt_modular.h" 40#include "opt_modular.h"
41#include "opt_compat_netbsd.h" 41#include "opt_compat_netbsd.h"
42#endif 42#endif
43 43
44#include <sys/param.h> 44#include <sys/param.h>
45#include <sys/atomic.h> 45#include <sys/atomic.h>
46#include <sys/conf.h> 46#include <sys/conf.h>
47#include <sys/cprng.h> 47#include <sys/cprng.h>
48#include <sys/device.h> 48#include <sys/device.h>
49#include <sys/file.h> 49#include <sys/file.h>
50#include <sys/filedesc.h> 50#include <sys/filedesc.h>
51#include <sys/intr.h> 51#include <sys/intr.h>
52#include <sys/kauth.h> 52#include <sys/kauth.h>
53#include <sys/kernel.h> 53#include <sys/kernel.h>
54#include <sys/kmem.h> 54#include <sys/kmem.h>
55#include <sys/module.h> 55#include <sys/module.h>
56#include <sys/mutex.h> 56#include <sys/mutex.h>
 57#include <sys/condvar.h>
57#include <sys/poll.h> 58#include <sys/poll.h>
58#include <sys/proc.h> 59#include <sys/proc.h>
59#include <sys/select.h> 60#include <sys/select.h>
60#include <sys/sockio.h> 61#include <sys/sockio.h>
61#include <sys/stat.h> 62#include <sys/stat.h>
62#include <sys/sysctl.h> 63#include <sys/sysctl.h>
63#include <sys/systm.h> 64#include <sys/systm.h>
64 65
65#include <net/if.h> 66#include <net/if.h>
66#include <net/if_dl.h> 67#include <net/if_dl.h>
67#include <net/if_ether.h> 68#include <net/if_ether.h>
68#include <net/if_media.h> 69#include <net/if_media.h>
69#include <net/if_tap.h> 70#include <net/if_tap.h>
@@ -99,28 +100,28 @@ static void sysctl_tap_setup(struct sysc @@ -99,28 +100,28 @@ static void sysctl_tap_setup(struct sysc
99 */ 100 */
100 101
101struct tap_softc { 102struct tap_softc {
102 device_t sc_dev; 103 device_t sc_dev;
103 struct ifmedia sc_im; 104 struct ifmedia sc_im;
104 struct ethercom sc_ec; 105 struct ethercom sc_ec;
105 int sc_flags; 106 int sc_flags;
106#define TAP_INUSE 0x00000001 /* tap device can only be opened once */ 107#define TAP_INUSE 0x00000001 /* tap device can only be opened once */
107#define TAP_ASYNCIO 0x00000002 /* user is using async I/O (SIGIO) on the device */ 108#define TAP_ASYNCIO 0x00000002 /* user is using async I/O (SIGIO) on the device */
108#define TAP_NBIO 0x00000004 /* user wants calls to avoid blocking */ 109#define TAP_NBIO 0x00000004 /* user wants calls to avoid blocking */
109#define TAP_GOING 0x00000008 /* interface is being destroyed */ 110#define TAP_GOING 0x00000008 /* interface is being destroyed */
110 struct selinfo sc_rsel; 111 struct selinfo sc_rsel;
111 pid_t sc_pgid; /* For async. IO */ 112 pid_t sc_pgid; /* For async. IO */
112 kmutex_t sc_rdlock; 113 kmutex_t sc_lock;
113 kmutex_t sc_kqlock; 114 kcondvar_t sc_cv;
114 void *sc_sih; 115 void *sc_sih;
115 struct timespec sc_atime; 116 struct timespec sc_atime;
116 struct timespec sc_mtime; 117 struct timespec sc_mtime;
117 struct timespec sc_btime; 118 struct timespec sc_btime;
118}; 119};
119 120
120/* autoconf(9) glue */ 121/* autoconf(9) glue */
121 122
122static int tap_match(device_t, cfdata_t, void *); 123static int tap_match(device_t, cfdata_t, void *);
123static void tap_attach(device_t, device_t, void *); 124static void tap_attach(device_t, device_t, void *);
124static int tap_detach(device_t, int); 125static int tap_detach(device_t, int);
125 126
126CFATTACH_DECL_NEW(tap, sizeof(struct tap_softc), 127CFATTACH_DECL_NEW(tap, sizeof(struct tap_softc),
@@ -172,27 +173,27 @@ static int tap_cdev_kqfilter(dev_t, stru @@ -172,27 +173,27 @@ static int tap_cdev_kqfilter(dev_t, stru
172 173
173const struct cdevsw tap_cdevsw = { 174const struct cdevsw tap_cdevsw = {
174 .d_open = tap_cdev_open, 175 .d_open = tap_cdev_open,
175 .d_close = tap_cdev_close, 176 .d_close = tap_cdev_close,
176 .d_read = tap_cdev_read, 177 .d_read = tap_cdev_read,
177 .d_write = tap_cdev_write, 178 .d_write = tap_cdev_write,
178 .d_ioctl = tap_cdev_ioctl, 179 .d_ioctl = tap_cdev_ioctl,
179 .d_stop = nostop, 180 .d_stop = nostop,
180 .d_tty = notty, 181 .d_tty = notty,
181 .d_poll = tap_cdev_poll, 182 .d_poll = tap_cdev_poll,
182 .d_mmap = nommap, 183 .d_mmap = nommap,
183 .d_kqfilter = tap_cdev_kqfilter, 184 .d_kqfilter = tap_cdev_kqfilter,
184 .d_discard = nodiscard, 185 .d_discard = nodiscard,
185 .d_flag = D_OTHER 186 .d_flag = D_OTHER | D_MPSAFE
186}; 187};
187 188
188#define TAP_CLONER 0xfffff /* Maximal minor value */ 189#define TAP_CLONER 0xfffff /* Maximal minor value */
189 190
190/* kqueue-related routines */ 191/* kqueue-related routines */
191static void tap_kqdetach(struct knote *); 192static void tap_kqdetach(struct knote *);
192static int tap_kqread(struct knote *, long); 193static int tap_kqread(struct knote *, long);
193 194
194/* 195/*
195 * Those are needed by the if_media interface. 196 * Those are needed by the if_media interface.
196 */ 197 */
197 198
198static int tap_mediachange(struct ifnet *); 199static int tap_mediachange(struct ifnet *);
@@ -305,42 +306,28 @@ tap_attach(device_t parent, device_t sel @@ -305,42 +306,28 @@ tap_attach(device_t parent, device_t sel
305 const struct sysctlnode *node; 306 const struct sysctlnode *node;
306 int error; 307 int error;
307 uint8_t enaddr[ETHER_ADDR_LEN] = 308 uint8_t enaddr[ETHER_ADDR_LEN] =
308 { 0xf2, 0x0b, 0xa4, 0xff, 0xff, 0xff }; 309 { 0xf2, 0x0b, 0xa4, 0xff, 0xff, 0xff };
309 char enaddrstr[3 * ETHER_ADDR_LEN]; 310 char enaddrstr[3 * ETHER_ADDR_LEN];
310 311
311 sc->sc_dev = self; 312 sc->sc_dev = self;
312 sc->sc_sih = NULL; 313 sc->sc_sih = NULL;
313 getnanotime(&sc->sc_btime); 314 getnanotime(&sc->sc_btime);
314 sc->sc_atime = sc->sc_mtime = sc->sc_btime; 315 sc->sc_atime = sc->sc_mtime = sc->sc_btime;
315 sc->sc_flags = 0; 316 sc->sc_flags = 0;
316 selinit(&sc->sc_rsel); 317 selinit(&sc->sc_rsel);
317 318
318 /* 319 cv_init(&sc->sc_cv, "tapread");
319 * Initialize the two locks for the device. 320 mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_NET);
320 * 
321 * We need a lock here because even though the tap device can be 
322 * opened only once, the file descriptor might be passed to another 
323 * process, say a fork(2)ed child. 
324 * 
325 * The Giant saves us from most of the hassle, but since the read 
326 * operation can sleep, we don't want two processes to wake up at 
327 * the same moment and both try and dequeue a single packet. 
328 * 
329 * The queue for event listeners (used by kqueue(9), see below) has 
330 * to be protected too, so use a spin lock. 
331 */ 
332 mutex_init(&sc->sc_rdlock, MUTEX_DEFAULT, IPL_NONE); 
333 mutex_init(&sc->sc_kqlock, MUTEX_DEFAULT, IPL_VM); 
334 321
335 if (!pmf_device_register(self, NULL, NULL)) 322 if (!pmf_device_register(self, NULL, NULL))
336 aprint_error_dev(self, "couldn't establish power handler\n"); 323 aprint_error_dev(self, "couldn't establish power handler\n");
337 324
338 /* 325 /*
339 * In order to obtain unique initial Ethernet address on a host, 326 * In order to obtain unique initial Ethernet address on a host,
340 * do some randomisation. It's not meant for anything but avoiding 327 * do some randomisation. It's not meant for anything but avoiding
341 * hard-coding an address. 328 * hard-coding an address.
342 */ 329 */
343 cprng_fast(&enaddr[3], 3); 330 cprng_fast(&enaddr[3], 3);
344 331
345 aprint_verbose_dev(self, "Ethernet address %s\n", 332 aprint_verbose_dev(self, "Ethernet address %s\n",
346 ether_snprintf(enaddrstr, sizeof(enaddrstr), enaddr)); 333 ether_snprintf(enaddrstr, sizeof(enaddrstr), enaddr));
@@ -375,32 +362,32 @@ tap_attach(device_t parent, device_t sel @@ -375,32 +362,32 @@ tap_attach(device_t parent, device_t sel
375 ifp->if_start = tap_start; 362 ifp->if_start = tap_start;
376 ifp->if_stop = tap_stop; 363 ifp->if_stop = tap_stop;
377 ifp->if_init = tap_init; 364 ifp->if_init = tap_init;
378 IFQ_SET_READY(&ifp->if_snd); 365 IFQ_SET_READY(&ifp->if_snd);
379 366
380 sc->sc_ec.ec_capabilities = ETHERCAP_VLAN_MTU | ETHERCAP_JUMBO_MTU; 367 sc->sc_ec.ec_capabilities = ETHERCAP_VLAN_MTU | ETHERCAP_JUMBO_MTU;
381 368
382 /* Those steps are mandatory for an Ethernet driver. */ 369 /* Those steps are mandatory for an Ethernet driver. */
383 error = if_initialize(ifp); 370 error = if_initialize(ifp);
384 if (error != 0) { 371 if (error != 0) {
385 aprint_error_dev(self, "if_initialize failed(%d)\n", error); 372 aprint_error_dev(self, "if_initialize failed(%d)\n", error);
386 ifmedia_removeall(&sc->sc_im); 373 ifmedia_removeall(&sc->sc_im);
387 pmf_device_deregister(self); 374 pmf_device_deregister(self);
388 mutex_destroy(&sc->sc_rdlock); 375 mutex_destroy(&sc->sc_lock);
389 mutex_destroy(&sc->sc_kqlock); 
390 seldestroy(&sc->sc_rsel); 376 seldestroy(&sc->sc_rsel);
391 377
392 return; /* Error */ 378 return; /* Error */
393 } 379 }
 380 ifp->if_percpuq = if_percpuq_create(ifp);
394 ether_ifattach(ifp, enaddr); 381 ether_ifattach(ifp, enaddr);
395 if_register(ifp); 382 if_register(ifp);
396 383
397 /* 384 /*
398 * Add a sysctl node for that interface. 385 * Add a sysctl node for that interface.
399 * 386 *
400 * The pointer transmitted is not a string, but instead a pointer to 387 * The pointer transmitted is not a string, but instead a pointer to
401 * the softc structure, which we can use to build the string value on 388 * the softc structure, which we can use to build the string value on
402 * the fly in the helper function of the node. See the comments for 389 * the fly in the helper function of the node. See the comments for
403 * tap_sysctl_handler for details. 390 * tap_sysctl_handler for details.
404 * 391 *
405 * Usually sysctl_createv is called with CTL_CREATE as the before-last 392 * Usually sysctl_createv is called with CTL_CREATE as the before-last
406 * component. However, we can allocate a number ourselves, as we are 393 * component. However, we can allocate a number ourselves, as we are
@@ -418,54 +405,51 @@ tap_attach(device_t parent, device_t sel @@ -418,54 +405,51 @@ tap_attach(device_t parent, device_t sel
418 "sysctl_createv returned %d, ignoring\n", error); 405 "sysctl_createv returned %d, ignoring\n", error);
419} 406}
420 407
421/* 408/*
422 * When detaching, we do the inverse of what is done in the attach 409 * When detaching, we do the inverse of what is done in the attach
423 * routine, in reversed order. 410 * routine, in reversed order.
424 */ 411 */
425static int 412static int
426tap_detach(device_t self, int flags) 413tap_detach(device_t self, int flags)
427{ 414{
428 struct tap_softc *sc = device_private(self); 415 struct tap_softc *sc = device_private(self);
429 struct ifnet *ifp = &sc->sc_ec.ec_if; 416 struct ifnet *ifp = &sc->sc_ec.ec_if;
430 int error; 417 int error;
431 int s; 
432 418
433 sc->sc_flags |= TAP_GOING; 419 sc->sc_flags |= TAP_GOING;
434 s = splnet(); 
435 tap_stop(ifp, 1); 420 tap_stop(ifp, 1);
436 if_down(ifp); 421 if_down(ifp);
437 splx(s); 
438 422
439 if (sc->sc_sih != NULL) { 423 if (sc->sc_sih != NULL) {
440 softint_disestablish(sc->sc_sih); 424 softint_disestablish(sc->sc_sih);
441 sc->sc_sih = NULL; 425 sc->sc_sih = NULL;
442 } 426 }
443 427
444 /* 428 /*
445 * Destroying a single leaf is a very straightforward operation using 429 * Destroying a single leaf is a very straightforward operation using
446 * sysctl_destroyv. One should be sure to always end the path with 430 * sysctl_destroyv. One should be sure to always end the path with
447 * CTL_EOL. 431 * CTL_EOL.
448 */ 432 */
449 if ((error = sysctl_destroyv(NULL, CTL_NET, AF_LINK, tap_node, 433 if ((error = sysctl_destroyv(NULL, CTL_NET, AF_LINK, tap_node,
450 device_unit(sc->sc_dev), CTL_EOL)) != 0) 434 device_unit(sc->sc_dev), CTL_EOL)) != 0)
451 aprint_error_dev(self, 435 aprint_error_dev(self,
452 "sysctl_destroyv returned %d, ignoring\n", error); 436 "sysctl_destroyv returned %d, ignoring\n", error);
453 ether_ifdetach(ifp); 437 ether_ifdetach(ifp);
454 if_detach(ifp); 438 if_detach(ifp);
455 ifmedia_removeall(&sc->sc_im); 439 ifmedia_removeall(&sc->sc_im);
456 seldestroy(&sc->sc_rsel); 440 seldestroy(&sc->sc_rsel);
457 mutex_destroy(&sc->sc_rdlock); 441 mutex_destroy(&sc->sc_lock);
458 mutex_destroy(&sc->sc_kqlock); 442 cv_destroy(&sc->sc_cv);
459 443
460 pmf_device_deregister(self); 444 pmf_device_deregister(self);
461 445
462 return 0; 446 return 0;
463} 447}
464 448
465/* 449/*
466 * This function is called by the ifmedia layer to notify the driver 450 * This function is called by the ifmedia layer to notify the driver
467 * that the user requested a media change. A real driver would 451 * that the user requested a media change. A real driver would
468 * reconfigure the hardware. 452 * reconfigure the hardware.
469 */ 453 */
470static int 454static int
471tap_mediachange(struct ifnet *ifp) 455tap_mediachange(struct ifnet *ifp)
@@ -506,45 +490,48 @@ tap_mediastatus(struct ifnet *ifp, struc @@ -506,45 +490,48 @@ tap_mediastatus(struct ifnet *ifp, struc
506 * userland. For that we stay in OACTIVE mode while the userland gets 490 * userland. For that we stay in OACTIVE mode while the userland gets
507 * the packets, and we send a signal to the processes waiting to read. 491 * the packets, and we send a signal to the processes waiting to read.
508 * 492 *
509 * wakeup(sc) is the counterpart to the tsleep call in 493 * wakeup(sc) is the counterpart to the tsleep call in
510 * tap_dev_read, while selnotify() is used for kevent(2) and 494 * tap_dev_read, while selnotify() is used for kevent(2) and
511 * poll(2) (which includes select(2)) listeners. 495 * poll(2) (which includes select(2)) listeners.
512 */ 496 */
513static void 497static void
514tap_start(struct ifnet *ifp) 498tap_start(struct ifnet *ifp)
515{ 499{
516 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc; 500 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc;
517 struct mbuf *m0; 501 struct mbuf *m0;
518 502
 503 mutex_enter(&sc->sc_lock);
519 if ((sc->sc_flags & TAP_INUSE) == 0) { 504 if ((sc->sc_flags & TAP_INUSE) == 0) {
520 /* Simply drop packets */ 505 /* Simply drop packets */
521 for(;;) { 506 for(;;) {
522 IFQ_DEQUEUE(&ifp->if_snd, m0); 507 IFQ_DEQUEUE(&ifp->if_snd, m0);
523 if (m0 == NULL) 508 if (m0 == NULL)
524 return; 509 goto done;
525 510
526 ifp->if_opackets++; 511 ifp->if_opackets++;
527 bpf_mtap(ifp, m0); 512 bpf_mtap(ifp, m0);
528 513
529 m_freem(m0); 514 m_freem(m0);
530 } 515 }
531 } else if (!IFQ_IS_EMPTY(&ifp->if_snd)) { 516 } else if (!IFQ_IS_EMPTY(&ifp->if_snd)) {
532 ifp->if_flags |= IFF_OACTIVE; 517 ifp->if_flags |= IFF_OACTIVE;
533 wakeup(sc); 518 cv_broadcast(&sc->sc_cv);
534 selnotify(&sc->sc_rsel, 0, 1); 519 selnotify(&sc->sc_rsel, 0, 1);
535 if (sc->sc_flags & TAP_ASYNCIO) 520 if (sc->sc_flags & TAP_ASYNCIO)
536 softint_schedule(sc->sc_sih); 521 softint_schedule(sc->sc_sih);
537 } 522 }
 523done:
 524 mutex_exit(&sc->sc_lock);
538} 525}
539 526
540static void 527static void
541tap_softintr(void *cookie) 528tap_softintr(void *cookie)
542{ 529{
543 struct tap_softc *sc; 530 struct tap_softc *sc;
544 struct ifnet *ifp; 531 struct ifnet *ifp;
545 int a, b; 532 int a, b;
546 533
547 sc = cookie; 534 sc = cookie;
548 535
549 if (sc->sc_flags & TAP_ASYNCIO) { 536 if (sc->sc_flags & TAP_ASYNCIO) {
550 ifp = &sc->sc_ec.ec_if; 537 ifp = &sc->sc_ec.ec_if;
@@ -635,31 +622,33 @@ tap_init(struct ifnet *ifp) @@ -635,31 +622,33 @@ tap_init(struct ifnet *ifp)
635/* 622/*
636 * _stop() is called when an interface goes down. It is our 623 * _stop() is called when an interface goes down. It is our
637 * responsability to validate that state by clearing the 624 * responsability to validate that state by clearing the
638 * IFF_RUNNING flag. 625 * IFF_RUNNING flag.
639 * 626 *
640 * We have to wake up all the sleeping processes to have the pending 627 * We have to wake up all the sleeping processes to have the pending
641 * read requests cancelled. 628 * read requests cancelled.
642 */ 629 */
643static void 630static void
644tap_stop(struct ifnet *ifp, int disable) 631tap_stop(struct ifnet *ifp, int disable)
645{ 632{
646 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc; 633 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc;
647 634
 635 mutex_enter(&sc->sc_lock);
648 ifp->if_flags &= ~IFF_RUNNING; 636 ifp->if_flags &= ~IFF_RUNNING;
649 wakeup(sc); 637 cv_broadcast(&sc->sc_cv);
650 selnotify(&sc->sc_rsel, 0, 1); 638 selnotify(&sc->sc_rsel, 0, 1);
651 if (sc->sc_flags & TAP_ASYNCIO) 639 if (sc->sc_flags & TAP_ASYNCIO)
652 softint_schedule(sc->sc_sih); 640 softint_schedule(sc->sc_sih);
 641 mutex_exit(&sc->sc_lock);
653} 642}
654 643
655/* 644/*
656 * The 'create' command of ifconfig can be used to create 645 * The 'create' command of ifconfig can be used to create
657 * any numbered instance of a given device. Thus we have to 646 * any numbered instance of a given device. Thus we have to
658 * make sure we have enough room in cd_devs to create the 647 * make sure we have enough room in cd_devs to create the
659 * user-specified instance. config_attach_pseudo will do this 648 * user-specified instance. config_attach_pseudo will do this
660 * for us. 649 * for us.
661 */ 650 */
662static int 651static int
663tap_clone_create(struct if_clone *ifc, int unit) 652tap_clone_create(struct if_clone *ifc, int unit)
664{ 653{
665 if (tap_clone_creator(unit) == NULL) { 654 if (tap_clone_creator(unit) == NULL) {
@@ -921,100 +910,90 @@ tap_fops_read(file_t *fp, off_t *offp, s @@ -921,100 +910,90 @@ tap_fops_read(file_t *fp, off_t *offp, s
921 910
922 KERNEL_LOCK(1, NULL); 911 KERNEL_LOCK(1, NULL);
923 error = tap_dev_read(fp->f_devunit, uio, flags); 912 error = tap_dev_read(fp->f_devunit, uio, flags);
924 KERNEL_UNLOCK_ONE(NULL); 913 KERNEL_UNLOCK_ONE(NULL);
925 return error; 914 return error;
926} 915}
927 916
928static int 917static int
929tap_dev_read(int unit, struct uio *uio, int flags) 918tap_dev_read(int unit, struct uio *uio, int flags)
930{ 919{
931 struct tap_softc *sc = device_lookup_private(&tap_cd, unit); 920 struct tap_softc *sc = device_lookup_private(&tap_cd, unit);
932 struct ifnet *ifp; 921 struct ifnet *ifp;
933 struct mbuf *m, *n; 922 struct mbuf *m, *n;
934 int error = 0, s; 923 int error = 0;
935 924
936 if (sc == NULL) 925 if (sc == NULL)
937 return ENXIO; 926 return ENXIO;
938 927
939 getnanotime(&sc->sc_atime); 928 getnanotime(&sc->sc_atime);
940 929
941 ifp = &sc->sc_ec.ec_if; 930 ifp = &sc->sc_ec.ec_if;
942 if ((ifp->if_flags & IFF_UP) == 0) 931 if ((ifp->if_flags & IFF_UP) == 0)
943 return EHOSTDOWN; 932 return EHOSTDOWN;
944 933
945 /* 934 /*
946 * In the TAP_NBIO case, we have to make sure we won't be sleeping 935 * In the TAP_NBIO case, we have to make sure we won't be sleeping
947 */ 936 */
948 if ((sc->sc_flags & TAP_NBIO) != 0) { 937 if ((sc->sc_flags & TAP_NBIO) != 0) {
949 if (!mutex_tryenter(&sc->sc_rdlock)) 938 if (!mutex_tryenter(&sc->sc_lock))
950 return EWOULDBLOCK; 939 return EWOULDBLOCK;
951 } else { 940 } else {
952 mutex_enter(&sc->sc_rdlock); 941 mutex_enter(&sc->sc_lock);
953 } 942 }
954 943
955 s = splnet(); 
956 if (IFQ_IS_EMPTY(&ifp->if_snd)) { 944 if (IFQ_IS_EMPTY(&ifp->if_snd)) {
957 ifp->if_flags &= ~IFF_OACTIVE; 945 ifp->if_flags &= ~IFF_OACTIVE;
958 /* 
959 * We must release the lock before sleeping, and re-acquire it 
960 * after. 
961 */ 
962 mutex_exit(&sc->sc_rdlock); 
963 if (sc->sc_flags & TAP_NBIO) 946 if (sc->sc_flags & TAP_NBIO)
964 error = EWOULDBLOCK; 947 error = EWOULDBLOCK;
965 else 948 else
966 error = tsleep(sc, PSOCK|PCATCH, "tap", 0); 949 error = cv_wait_sig(&sc->sc_cv, &sc->sc_lock);
967 splx(s); 
968 950
969 if (error != 0) 951 if (error != 0) {
 952 mutex_exit(&sc->sc_lock);
970 return error; 953 return error;
 954 }
971 /* The device might have been downed */ 955 /* The device might have been downed */
972 if ((ifp->if_flags & IFF_UP) == 0) 956 if ((ifp->if_flags & IFF_UP) == 0) {
 957 mutex_exit(&sc->sc_lock);
973 return EHOSTDOWN; 958 return EHOSTDOWN;
974 if ((sc->sc_flags & TAP_NBIO)) { 
975 if (!mutex_tryenter(&sc->sc_rdlock)) 
976 return EWOULDBLOCK; 
977 } else { 
978 mutex_enter(&sc->sc_rdlock); 
979 } 959 }
980 s = splnet(); 
981 } 960 }
982 961
983 IFQ_DEQUEUE(&ifp->if_snd, m); 962 IFQ_DEQUEUE(&ifp->if_snd, m);
 963 mutex_exit(&sc->sc_lock);
 964
984 ifp->if_flags &= ~IFF_OACTIVE; 965 ifp->if_flags &= ~IFF_OACTIVE;
985 splx(s); 
986 if (m == NULL) { 966 if (m == NULL) {
987 error = 0; 967 error = 0;
988 goto out; 968 goto out;
989 } 969 }
990 970
991 ifp->if_opackets++; 971 ifp->if_opackets++;
992 bpf_mtap(ifp, m); 972 bpf_mtap(ifp, m);
993 973
994 /* 974 /*
995 * One read is one packet. 975 * One read is one packet.
996 */ 976 */
997 do { 977 do {
998 error = uiomove(mtod(m, void *), 978 error = uiomove(mtod(m, void *),
999 min(m->m_len, uio->uio_resid), uio); 979 min(m->m_len, uio->uio_resid), uio);
1000 m = n = m_free(m); 980 m = n = m_free(m);
1001 } while (m != NULL && uio->uio_resid > 0 && error == 0); 981 } while (m != NULL && uio->uio_resid > 0 && error == 0);
1002 982
1003 if (m != NULL) 983 if (m != NULL)
1004 m_freem(m); 984 m_freem(m);
1005 985
1006out: 986out:
1007 mutex_exit(&sc->sc_rdlock); 
1008 return error; 987 return error;
1009} 988}
1010 989
1011static int 990static int
1012tap_fops_stat(file_t *fp, struct stat *st) 991tap_fops_stat(file_t *fp, struct stat *st)
1013{ 992{
1014 int error = 0; 993 int error = 0;
1015 struct tap_softc *sc; 994 struct tap_softc *sc;
1016 int unit = fp->f_devunit; 995 int unit = fp->f_devunit;
1017 996
1018 (void)memset(st, 0, sizeof(*st)); 997 (void)memset(st, 0, sizeof(*st));
1019 998
1020 KERNEL_LOCK(1, NULL); 999 KERNEL_LOCK(1, NULL);
@@ -1051,27 +1030,26 @@ tap_fops_write(file_t *fp, off_t *offp,  @@ -1051,27 +1030,26 @@ tap_fops_write(file_t *fp, off_t *offp,
1051 error = tap_dev_write(fp->f_devunit, uio, flags); 1030 error = tap_dev_write(fp->f_devunit, uio, flags);
1052 KERNEL_UNLOCK_ONE(NULL); 1031 KERNEL_UNLOCK_ONE(NULL);
1053 return error; 1032 return error;
1054} 1033}
1055 1034
1056static int 1035static int
1057tap_dev_write(int unit, struct uio *uio, int flags) 1036tap_dev_write(int unit, struct uio *uio, int flags)
1058{ 1037{
1059 struct tap_softc *sc = 1038 struct tap_softc *sc =
1060 device_lookup_private(&tap_cd, unit); 1039 device_lookup_private(&tap_cd, unit);
1061 struct ifnet *ifp; 1040 struct ifnet *ifp;
1062 struct mbuf *m, **mp; 1041 struct mbuf *m, **mp;
1063 int error = 0; 1042 int error = 0;
1064 int s; 
1065 1043
1066 if (sc == NULL) 1044 if (sc == NULL)
1067 return ENXIO; 1045 return ENXIO;
1068 1046
1069 getnanotime(&sc->sc_mtime); 1047 getnanotime(&sc->sc_mtime);
1070 ifp = &sc->sc_ec.ec_if; 1048 ifp = &sc->sc_ec.ec_if;
1071 1049
1072 /* One write, one packet, that's the rule */ 1050 /* One write, one packet, that's the rule */
1073 MGETHDR(m, M_DONTWAIT, MT_DATA); 1051 MGETHDR(m, M_DONTWAIT, MT_DATA);
1074 if (m == NULL) { 1052 if (m == NULL) {
1075 ifp->if_ierrors++; 1053 ifp->if_ierrors++;
1076 return ENOBUFS; 1054 return ENOBUFS;
1077 } 1055 }
@@ -1088,29 +1066,27 @@ tap_dev_write(int unit, struct uio *uio, @@ -1088,29 +1066,27 @@ tap_dev_write(int unit, struct uio *uio,
1088 } 1066 }
1089 (*mp)->m_len = min(MHLEN, uio->uio_resid); 1067 (*mp)->m_len = min(MHLEN, uio->uio_resid);
1090 error = uiomove(mtod(*mp, void *), (*mp)->m_len, uio); 1068 error = uiomove(mtod(*mp, void *), (*mp)->m_len, uio);
1091 mp = &(*mp)->m_next; 1069 mp = &(*mp)->m_next;
1092 } 1070 }
1093 if (error) { 1071 if (error) {
1094 ifp->if_ierrors++; 1072 ifp->if_ierrors++;
1095 m_freem(m); 1073 m_freem(m);
1096 return error; 1074 return error;
1097 } 1075 }
1098 1076
1099 m_set_rcvif(m, ifp); 1077 m_set_rcvif(m, ifp);
1100 1078
1101 s = splnet(); 1079 if_percpuq_enqueue(ifp->if_percpuq, m);
1102 if_input(ifp, m); 
1103 splx(s); 
1104 1080
1105 return 0; 1081 return 0;
1106} 1082}
1107 1083
1108static int 1084static int
1109tap_cdev_ioctl(dev_t dev, u_long cmd, void *data, int flags, 1085tap_cdev_ioctl(dev_t dev, u_long cmd, void *data, int flags,
1110 struct lwp *l) 1086 struct lwp *l)
1111{ 1087{
1112 return tap_dev_ioctl(minor(dev), cmd, data, l); 1088 return tap_dev_ioctl(minor(dev), cmd, data, l);
1113} 1089}
1114 1090
1115static int 1091static int
1116tap_fops_ioctl(file_t *fp, u_long cmd, void *data) 1092tap_fops_ioctl(file_t *fp, u_long cmd, void *data)
@@ -1211,29 +1187,29 @@ tap_dev_poll(int unit, int events, struc @@ -1211,29 +1187,29 @@ tap_dev_poll(int unit, int events, struc
1211 return POLLERR; 1187 return POLLERR;
1212 1188
1213 if (events & (POLLIN|POLLRDNORM)) { 1189 if (events & (POLLIN|POLLRDNORM)) {
1214 struct ifnet *ifp = &sc->sc_ec.ec_if; 1190 struct ifnet *ifp = &sc->sc_ec.ec_if;
1215 struct mbuf *m; 1191 struct mbuf *m;
1216 int s; 1192 int s;
1217 1193
1218 s = splnet(); 1194 s = splnet();
1219 IFQ_POLL(&ifp->if_snd, m); 1195 IFQ_POLL(&ifp->if_snd, m);
1220 1196
1221 if (m != NULL) 1197 if (m != NULL)
1222 revents |= events & (POLLIN|POLLRDNORM); 1198 revents |= events & (POLLIN|POLLRDNORM);
1223 else { 1199 else {
1224 mutex_spin_enter(&sc->sc_kqlock); 1200 mutex_spin_enter(&sc->sc_lock);
1225 selrecord(l, &sc->sc_rsel); 1201 selrecord(l, &sc->sc_rsel);
1226 mutex_spin_exit(&sc->sc_kqlock); 1202 mutex_spin_exit(&sc->sc_lock);
1227 } 1203 }
1228 splx(s); 1204 splx(s);
1229 } 1205 }
1230 revents |= events & (POLLOUT|POLLWRNORM); 1206 revents |= events & (POLLOUT|POLLWRNORM);
1231 1207
1232 return revents; 1208 return revents;
1233} 1209}
1234 1210
1235static struct filterops tap_read_filterops = { 1, NULL, tap_kqdetach, 1211static struct filterops tap_read_filterops = { 1, NULL, tap_kqdetach,
1236 tap_kqread }; 1212 tap_kqread };
1237static struct filterops tap_seltrue_filterops = { 1, NULL, tap_kqdetach, 1213static struct filterops tap_seltrue_filterops = { 1, NULL, tap_kqdetach,
1238 filt_seltrue }; 1214 filt_seltrue };
1239 1215
@@ -1262,42 +1238,42 @@ tap_dev_kqfilter(int unit, struct knote  @@ -1262,42 +1238,42 @@ tap_dev_kqfilter(int unit, struct knote
1262 switch(kn->kn_filter) { 1238 switch(kn->kn_filter) {
1263 case EVFILT_READ: 1239 case EVFILT_READ:
1264 kn->kn_fop = &tap_read_filterops; 1240 kn->kn_fop = &tap_read_filterops;
1265 break; 1241 break;
1266 case EVFILT_WRITE: 1242 case EVFILT_WRITE:
1267 kn->kn_fop = &tap_seltrue_filterops; 1243 kn->kn_fop = &tap_seltrue_filterops;
1268 break; 1244 break;
1269 default: 1245 default:
1270 KERNEL_UNLOCK_ONE(NULL); 1246 KERNEL_UNLOCK_ONE(NULL);
1271 return EINVAL; 1247 return EINVAL;
1272 } 1248 }
1273 1249
1274 kn->kn_hook = sc; 1250 kn->kn_hook = sc;
1275 mutex_spin_enter(&sc->sc_kqlock); 1251 mutex_spin_enter(&sc->sc_lock);
1276 SLIST_INSERT_HEAD(&sc->sc_rsel.sel_klist, kn, kn_selnext); 1252 SLIST_INSERT_HEAD(&sc->sc_rsel.sel_klist, kn, kn_selnext);
1277 mutex_spin_exit(&sc->sc_kqlock); 1253 mutex_spin_exit(&sc->sc_lock);
1278 KERNEL_UNLOCK_ONE(NULL); 1254 KERNEL_UNLOCK_ONE(NULL);
1279 return 0; 1255 return 0;
1280} 1256}
1281 1257
1282static void 1258static void
1283tap_kqdetach(struct knote *kn) 1259tap_kqdetach(struct knote *kn)
1284{ 1260{
1285 struct tap_softc *sc = (struct tap_softc *)kn->kn_hook; 1261 struct tap_softc *sc = (struct tap_softc *)kn->kn_hook;
1286 1262
1287 KERNEL_LOCK(1, NULL); 1263 KERNEL_LOCK(1, NULL);
1288 mutex_spin_enter(&sc->sc_kqlock); 1264 mutex_spin_enter(&sc->sc_lock);
1289 SLIST_REMOVE(&sc->sc_rsel.sel_klist, kn, knote, kn_selnext); 1265 SLIST_REMOVE(&sc->sc_rsel.sel_klist, kn, knote, kn_selnext);
1290 mutex_spin_exit(&sc->sc_kqlock); 1266 mutex_spin_exit(&sc->sc_lock);
1291 KERNEL_UNLOCK_ONE(NULL); 1267 KERNEL_UNLOCK_ONE(NULL);
1292} 1268}
1293 1269
1294static int 1270static int
1295tap_kqread(struct knote *kn, long hint) 1271tap_kqread(struct knote *kn, long hint)
1296{ 1272{
1297 struct tap_softc *sc = (struct tap_softc *)kn->kn_hook; 1273 struct tap_softc *sc = (struct tap_softc *)kn->kn_hook;
1298 struct ifnet *ifp = &sc->sc_ec.ec_if; 1274 struct ifnet *ifp = &sc->sc_ec.ec_if;
1299 struct mbuf *m; 1275 struct mbuf *m;
1300 int s, rv; 1276 int s, rv;
1301 1277
1302 KERNEL_LOCK(1, NULL); 1278 KERNEL_LOCK(1, NULL);
1303 s = splnet(); 1279 s = splnet();