Mon Apr 15 08:06:36 2024 UTC (35d)
Add a newline to a printf message.


(simonb)
diff -r1.6 -r1.7 src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c

cvs diff -r1.6 -r1.7 src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c (switch to unified diff)

--- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c 2018/11/15 04:55:06 1.6
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c 2024/04/15 08:06:36 1.7
@@ -2623,1013 +2623,1013 @@ vdev_online(spa_t *spa, uint64_t guid, u
2623 2623
2624 if (newstate) 2624 if (newstate)
2625 *newstate = vd->vdev_state; 2625 *newstate = vd->vdev_state;
2626 if ((flags & ZFS_ONLINE_UNSPARE) && 2626 if ((flags & ZFS_ONLINE_UNSPARE) &&
2627 !vdev_is_dead(vd) && vd->vdev_parent && 2627 !vdev_is_dead(vd) && vd->vdev_parent &&
2628 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2628 vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2629 vd->vdev_parent->vdev_child[0] == vd) 2629 vd->vdev_parent->vdev_child[0] == vd)
2630 vd->vdev_unspare = B_TRUE; 2630 vd->vdev_unspare = B_TRUE;
2631 2631
2632 if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { 2632 if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
2633 2633
2634 /* XXX - L2ARC 1.0 does not support expansion */ 2634 /* XXX - L2ARC 1.0 does not support expansion */
2635 if (vd->vdev_aux) 2635 if (vd->vdev_aux)
2636 return (spa_vdev_state_exit(spa, vd, ENOTSUP)); 2636 return (spa_vdev_state_exit(spa, vd, ENOTSUP));
2637 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2637 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2638 } 2638 }
2639 2639
2640 if (postevent) 2640 if (postevent)
2641 spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE); 2641 spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE);
2642 2642
2643 return (spa_vdev_state_exit(spa, vd, 0)); 2643 return (spa_vdev_state_exit(spa, vd, 0));
2644} 2644}
2645 2645
2646static int 2646static int
2647vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) 2647vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
2648{ 2648{
2649 vdev_t *vd, *tvd; 2649 vdev_t *vd, *tvd;
2650 int error = 0; 2650 int error = 0;
2651 uint64_t generation; 2651 uint64_t generation;
2652 metaslab_group_t *mg; 2652 metaslab_group_t *mg;
2653 2653
2654top: 2654top:
2655 spa_vdev_state_enter(spa, SCL_ALLOC); 2655 spa_vdev_state_enter(spa, SCL_ALLOC);
2656 2656
2657 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2657 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2658 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2658 return (spa_vdev_state_exit(spa, NULL, ENODEV));
2659 2659
2660 if (!vd->vdev_ops->vdev_op_leaf) 2660 if (!vd->vdev_ops->vdev_op_leaf)
2661 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2661 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2662 2662
2663 tvd = vd->vdev_top; 2663 tvd = vd->vdev_top;
2664 mg = tvd->vdev_mg; 2664 mg = tvd->vdev_mg;
2665 generation = spa->spa_config_generation + 1; 2665 generation = spa->spa_config_generation + 1;
2666 2666
2667 /* 2667 /*
2668 * If the device isn't already offline, try to offline it. 2668 * If the device isn't already offline, try to offline it.
2669 */ 2669 */
2670 if (!vd->vdev_offline) { 2670 if (!vd->vdev_offline) {
2671 /* 2671 /*
2672 * If this device has the only valid copy of some data, 2672 * If this device has the only valid copy of some data,
2673 * don't allow it to be offlined. Log devices are always 2673 * don't allow it to be offlined. Log devices are always
2674 * expendable. 2674 * expendable.
2675 */ 2675 */
2676 if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2676 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
2677 vdev_dtl_required(vd)) 2677 vdev_dtl_required(vd))
2678 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2678 return (spa_vdev_state_exit(spa, NULL, EBUSY));
2679 2679
2680 /* 2680 /*
2681 * If the top-level is a slog and it has had allocations 2681 * If the top-level is a slog and it has had allocations
2682 * then proceed. We check that the vdev's metaslab group 2682 * then proceed. We check that the vdev's metaslab group
2683 * is not NULL since it's possible that we may have just 2683 * is not NULL since it's possible that we may have just
2684 * added this vdev but not yet initialized its metaslabs. 2684 * added this vdev but not yet initialized its metaslabs.
2685 */ 2685 */
2686 if (tvd->vdev_islog && mg != NULL) { 2686 if (tvd->vdev_islog && mg != NULL) {
2687 /* 2687 /*
2688 * Prevent any future allocations. 2688 * Prevent any future allocations.
2689 */ 2689 */
2690 metaslab_group_passivate(mg); 2690 metaslab_group_passivate(mg);
2691 (void) spa_vdev_state_exit(spa, vd, 0); 2691 (void) spa_vdev_state_exit(spa, vd, 0);
2692 2692
2693 error = spa_offline_log(spa); 2693 error = spa_offline_log(spa);
2694 2694
2695 spa_vdev_state_enter(spa, SCL_ALLOC); 2695 spa_vdev_state_enter(spa, SCL_ALLOC);
2696 2696
2697 /* 2697 /*
2698 * Check to see if the config has changed. 2698 * Check to see if the config has changed.
2699 */ 2699 */
2700 if (error || generation != spa->spa_config_generation) { 2700 if (error || generation != spa->spa_config_generation) {
2701 metaslab_group_activate(mg); 2701 metaslab_group_activate(mg);
2702 if (error) 2702 if (error)
2703 return (spa_vdev_state_exit(spa, 2703 return (spa_vdev_state_exit(spa,
2704 vd, error)); 2704 vd, error));
2705 (void) spa_vdev_state_exit(spa, vd, 0); 2705 (void) spa_vdev_state_exit(spa, vd, 0);
2706 goto top; 2706 goto top;
2707 } 2707 }
2708 ASSERT0(tvd->vdev_stat.vs_alloc); 2708 ASSERT0(tvd->vdev_stat.vs_alloc);
2709 } 2709 }
2710 2710
2711 /* 2711 /*
2712 * Offline this device and reopen its top-level vdev. 2712 * Offline this device and reopen its top-level vdev.
2713 * If the top-level vdev is a log device then just offline 2713 * If the top-level vdev is a log device then just offline
2714 * it. Otherwise, if this action results in the top-level 2714 * it. Otherwise, if this action results in the top-level
2715 * vdev becoming unusable, undo it and fail the request. 2715 * vdev becoming unusable, undo it and fail the request.
2716 */ 2716 */
2717 vd->vdev_offline = B_TRUE; 2717 vd->vdev_offline = B_TRUE;
2718 vdev_reopen(tvd); 2718 vdev_reopen(tvd);
2719 2719
2720 if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2720 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
2721 vdev_is_dead(tvd)) { 2721 vdev_is_dead(tvd)) {
2722 vd->vdev_offline = B_FALSE; 2722 vd->vdev_offline = B_FALSE;
2723 vdev_reopen(tvd); 2723 vdev_reopen(tvd);
2724 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2724 return (spa_vdev_state_exit(spa, NULL, EBUSY));
2725 } 2725 }
2726 2726
2727 /* 2727 /*
2728 * Add the device back into the metaslab rotor so that 2728 * Add the device back into the metaslab rotor so that
2729 * once we online the device it's open for business. 2729 * once we online the device it's open for business.
2730 */ 2730 */
2731 if (tvd->vdev_islog && mg != NULL) 2731 if (tvd->vdev_islog && mg != NULL)
2732 metaslab_group_activate(mg); 2732 metaslab_group_activate(mg);
2733 } 2733 }
2734 2734
2735 vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); 2735 vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
2736 2736
2737 return (spa_vdev_state_exit(spa, vd, 0)); 2737 return (spa_vdev_state_exit(spa, vd, 0));
2738} 2738}
2739 2739
2740int 2740int
2741vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) 2741vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
2742{ 2742{
2743 int error; 2743 int error;
2744 2744
2745 mutex_enter(&spa->spa_vdev_top_lock); 2745 mutex_enter(&spa->spa_vdev_top_lock);
2746 error = vdev_offline_locked(spa, guid, flags); 2746 error = vdev_offline_locked(spa, guid, flags);
2747 mutex_exit(&spa->spa_vdev_top_lock); 2747 mutex_exit(&spa->spa_vdev_top_lock);
2748 2748
2749 return (error); 2749 return (error);
2750} 2750}
2751 2751
2752/* 2752/*
2753 * Clear the error counts associated with this vdev. Unlike vdev_online() and 2753 * Clear the error counts associated with this vdev. Unlike vdev_online() and
2754 * vdev_offline(), we assume the spa config is locked. We also clear all 2754 * vdev_offline(), we assume the spa config is locked. We also clear all
2755 * children. If 'vd' is NULL, then the user wants to clear all vdevs. 2755 * children. If 'vd' is NULL, then the user wants to clear all vdevs.
2756 */ 2756 */
2757void 2757void
2758vdev_clear(spa_t *spa, vdev_t *vd) 2758vdev_clear(spa_t *spa, vdev_t *vd)
2759{ 2759{
2760 vdev_t *rvd = spa->spa_root_vdev; 2760 vdev_t *rvd = spa->spa_root_vdev;
2761 2761
2762 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2762 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2763 2763
2764 if (vd == NULL) 2764 if (vd == NULL)
2765 vd = rvd; 2765 vd = rvd;
2766 2766
2767 vd->vdev_stat.vs_read_errors = 0; 2767 vd->vdev_stat.vs_read_errors = 0;
2768 vd->vdev_stat.vs_write_errors = 0; 2768 vd->vdev_stat.vs_write_errors = 0;
2769 vd->vdev_stat.vs_checksum_errors = 0; 2769 vd->vdev_stat.vs_checksum_errors = 0;
2770 2770
2771 for (int c = 0; c < vd->vdev_children; c++) 2771 for (int c = 0; c < vd->vdev_children; c++)
2772 vdev_clear(spa, vd->vdev_child[c]); 2772 vdev_clear(spa, vd->vdev_child[c]);
2773 2773
2774 if (vd == rvd) { 2774 if (vd == rvd) {
2775 for (int c = 0; c < spa->spa_l2cache.sav_count; c++) 2775 for (int c = 0; c < spa->spa_l2cache.sav_count; c++)
2776 vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]); 2776 vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]);
2777 2777
2778 for (int c = 0; c < spa->spa_spares.sav_count; c++) 2778 for (int c = 0; c < spa->spa_spares.sav_count; c++)
2779 vdev_clear(spa, spa->spa_spares.sav_vdevs[c]); 2779 vdev_clear(spa, spa->spa_spares.sav_vdevs[c]);
2780 } 2780 }
2781 2781
2782 /* 2782 /*
2783 * If we're in the FAULTED state or have experienced failed I/O, then 2783 * If we're in the FAULTED state or have experienced failed I/O, then
2784 * clear the persistent state and attempt to reopen the device. We 2784 * clear the persistent state and attempt to reopen the device. We
2785 * also mark the vdev config dirty, so that the new faulted state is 2785 * also mark the vdev config dirty, so that the new faulted state is
2786 * written out to disk. 2786 * written out to disk.
2787 */ 2787 */
2788 if (vd->vdev_faulted || vd->vdev_degraded || 2788 if (vd->vdev_faulted || vd->vdev_degraded ||
2789 !vdev_readable(vd) || !vdev_writeable(vd)) { 2789 !vdev_readable(vd) || !vdev_writeable(vd)) {
2790 2790
2791 /* 2791 /*
2792 * When reopening in reponse to a clear event, it may be due to 2792 * When reopening in reponse to a clear event, it may be due to
2793 * a fmadm repair request. In this case, if the device is 2793 * a fmadm repair request. In this case, if the device is
2794 * still broken, we want to still post the ereport again. 2794 * still broken, we want to still post the ereport again.
2795 */ 2795 */
2796 vd->vdev_forcefault = B_TRUE; 2796 vd->vdev_forcefault = B_TRUE;
2797 2797
2798 vd->vdev_faulted = vd->vdev_degraded = 0ULL; 2798 vd->vdev_faulted = vd->vdev_degraded = 0ULL;
2799 vd->vdev_cant_read = B_FALSE; 2799 vd->vdev_cant_read = B_FALSE;
2800 vd->vdev_cant_write = B_FALSE; 2800 vd->vdev_cant_write = B_FALSE;
2801 2801
2802 vdev_reopen(vd == rvd ? rvd : vd->vdev_top); 2802 vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
2803 2803
2804 vd->vdev_forcefault = B_FALSE; 2804 vd->vdev_forcefault = B_FALSE;
2805 2805
2806 if (vd != rvd && vdev_writeable(vd->vdev_top)) 2806 if (vd != rvd && vdev_writeable(vd->vdev_top))
2807 vdev_state_dirty(vd->vdev_top); 2807 vdev_state_dirty(vd->vdev_top);
2808 2808
2809 if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) 2809 if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
2810 spa_async_request(spa, SPA_ASYNC_RESILVER); 2810 spa_async_request(spa, SPA_ASYNC_RESILVER);
2811 2811
2812 spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); 2812 spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
2813 } 2813 }
2814 2814
2815 /* 2815 /*
2816 * When clearing a FMA-diagnosed fault, we always want to 2816 * When clearing a FMA-diagnosed fault, we always want to
2817 * unspare the device, as we assume that the original spare was 2817 * unspare the device, as we assume that the original spare was
2818 * done in response to the FMA fault. 2818 * done in response to the FMA fault.
2819 */ 2819 */
2820 if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && 2820 if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
2821 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2821 vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2822 vd->vdev_parent->vdev_child[0] == vd) 2822 vd->vdev_parent->vdev_child[0] == vd)
2823 vd->vdev_unspare = B_TRUE; 2823 vd->vdev_unspare = B_TRUE;
2824} 2824}
2825 2825
2826boolean_t 2826boolean_t
2827vdev_is_dead(vdev_t *vd) 2827vdev_is_dead(vdev_t *vd)
2828{ 2828{
2829 /* 2829 /*
2830 * Holes and missing devices are always considered "dead". 2830 * Holes and missing devices are always considered "dead".
2831 * This simplifies the code since we don't have to check for 2831 * This simplifies the code since we don't have to check for
2832 * these types of devices in the various code paths. 2832 * these types of devices in the various code paths.
2833 * Instead we rely on the fact that we skip over dead devices 2833 * Instead we rely on the fact that we skip over dead devices
2834 * before issuing I/O to them. 2834 * before issuing I/O to them.
2835 */ 2835 */
2836 return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || 2836 return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
2837 vd->vdev_ops == &vdev_missing_ops); 2837 vd->vdev_ops == &vdev_missing_ops);
2838} 2838}
2839 2839
2840boolean_t 2840boolean_t
2841vdev_readable(vdev_t *vd) 2841vdev_readable(vdev_t *vd)
2842{ 2842{
2843 return (!vdev_is_dead(vd) && !vd->vdev_cant_read); 2843 return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
2844} 2844}
2845 2845
2846boolean_t 2846boolean_t
2847vdev_writeable(vdev_t *vd) 2847vdev_writeable(vdev_t *vd)
2848{ 2848{
2849 return (!vdev_is_dead(vd) && !vd->vdev_cant_write); 2849 return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
2850} 2850}
2851 2851
2852boolean_t 2852boolean_t
2853vdev_allocatable(vdev_t *vd) 2853vdev_allocatable(vdev_t *vd)
2854{ 2854{
2855 uint64_t state = vd->vdev_state; 2855 uint64_t state = vd->vdev_state;
2856 2856
2857 /* 2857 /*
2858 * We currently allow allocations from vdevs which may be in the 2858 * We currently allow allocations from vdevs which may be in the
2859 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device 2859 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
2860 * fails to reopen then we'll catch it later when we're holding 2860 * fails to reopen then we'll catch it later when we're holding
2861 * the proper locks. Note that we have to get the vdev state 2861 * the proper locks. Note that we have to get the vdev state
2862 * in a local variable because although it changes atomically, 2862 * in a local variable because although it changes atomically,
2863 * we're asking two separate questions about it. 2863 * we're asking two separate questions about it.
2864 */ 2864 */
2865 return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && 2865 return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
2866 !vd->vdev_cant_write && !vd->vdev_ishole && 2866 !vd->vdev_cant_write && !vd->vdev_ishole &&
2867 vd->vdev_mg->mg_initialized); 2867 vd->vdev_mg->mg_initialized);
2868} 2868}
2869 2869
2870boolean_t 2870boolean_t
2871vdev_accessible(vdev_t *vd, zio_t *zio) 2871vdev_accessible(vdev_t *vd, zio_t *zio)
2872{ 2872{
2873 ASSERT(zio->io_vd == vd); 2873 ASSERT(zio->io_vd == vd);
2874 2874
2875 if (vdev_is_dead(vd) || vd->vdev_remove_wanted) 2875 if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
2876 return (B_FALSE); 2876 return (B_FALSE);
2877 2877
2878 if (zio->io_type == ZIO_TYPE_READ) 2878 if (zio->io_type == ZIO_TYPE_READ)
2879 return (!vd->vdev_cant_read); 2879 return (!vd->vdev_cant_read);
2880 2880
2881 if (zio->io_type == ZIO_TYPE_WRITE) 2881 if (zio->io_type == ZIO_TYPE_WRITE)
2882 return (!vd->vdev_cant_write); 2882 return (!vd->vdev_cant_write);
2883 2883
2884 return (B_TRUE); 2884 return (B_TRUE);
2885} 2885}
2886 2886
2887/* 2887/*
2888 * Get statistics for the given vdev. 2888 * Get statistics for the given vdev.
2889 */ 2889 */
2890void 2890void
2891vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) 2891vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
2892{ 2892{
2893 spa_t *spa = vd->vdev_spa; 2893 spa_t *spa = vd->vdev_spa;
2894 vdev_t *rvd = spa->spa_root_vdev; 2894 vdev_t *rvd = spa->spa_root_vdev;
2895 vdev_t *tvd = vd->vdev_top; 2895 vdev_t *tvd = vd->vdev_top;
2896 2896
2897 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 2897 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
2898 2898
2899 mutex_enter(&vd->vdev_stat_lock); 2899 mutex_enter(&vd->vdev_stat_lock);
2900 bcopy(&vd->vdev_stat, vs, sizeof (*vs)); 2900 bcopy(&vd->vdev_stat, vs, sizeof (*vs));
2901 vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 2901 vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
2902 vs->vs_state = vd->vdev_state; 2902 vs->vs_state = vd->vdev_state;
2903 vs->vs_rsize = vdev_get_min_asize(vd); 2903 vs->vs_rsize = vdev_get_min_asize(vd);
2904 if (vd->vdev_ops->vdev_op_leaf) 2904 if (vd->vdev_ops->vdev_op_leaf)
2905 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 2905 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
2906 /* 2906 /*
2907 * Report expandable space on top-level, non-auxillary devices only. 2907 * Report expandable space on top-level, non-auxillary devices only.
2908 * The expandable space is reported in terms of metaslab sized units 2908 * The expandable space is reported in terms of metaslab sized units
2909 * since that determines how much space the pool can expand. 2909 * since that determines how much space the pool can expand.
2910 */ 2910 */
2911 if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) { 2911 if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) {
2912 vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize, 2912 vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize,
2913 1ULL << tvd->vdev_ms_shift); 2913 1ULL << tvd->vdev_ms_shift);
2914 } 2914 }
2915 vs->vs_configured_ashift = vd->vdev_top != NULL 2915 vs->vs_configured_ashift = vd->vdev_top != NULL
2916 ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; 2916 ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
2917 vs->vs_logical_ashift = vd->vdev_logical_ashift; 2917 vs->vs_logical_ashift = vd->vdev_logical_ashift;
2918 vs->vs_physical_ashift = vd->vdev_physical_ashift; 2918 vs->vs_physical_ashift = vd->vdev_physical_ashift;
2919 if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) { 2919 if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) {
2920 vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; 2920 vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
2921 } 2921 }
2922 2922
2923 /* 2923 /*
2924 * If we're getting stats on the root vdev, aggregate the I/O counts 2924 * If we're getting stats on the root vdev, aggregate the I/O counts
2925 * over all top-level vdevs (i.e. the direct children of the root). 2925 * over all top-level vdevs (i.e. the direct children of the root).
2926 */ 2926 */
2927 if (vd == rvd) { 2927 if (vd == rvd) {
2928 for (int c = 0; c < rvd->vdev_children; c++) { 2928 for (int c = 0; c < rvd->vdev_children; c++) {
2929 vdev_t *cvd = rvd->vdev_child[c]; 2929 vdev_t *cvd = rvd->vdev_child[c];
2930 vdev_stat_t *cvs = &cvd->vdev_stat; 2930 vdev_stat_t *cvs = &cvd->vdev_stat;
2931 2931
2932 for (int t = 0; t < ZIO_TYPES; t++) { 2932 for (int t = 0; t < ZIO_TYPES; t++) {
2933 vs->vs_ops[t] += cvs->vs_ops[t]; 2933 vs->vs_ops[t] += cvs->vs_ops[t];
2934 vs->vs_bytes[t] += cvs->vs_bytes[t]; 2934 vs->vs_bytes[t] += cvs->vs_bytes[t];
2935 } 2935 }
2936 cvs->vs_scan_removing = cvd->vdev_removing; 2936 cvs->vs_scan_removing = cvd->vdev_removing;
2937 } 2937 }
2938 } 2938 }
2939 mutex_exit(&vd->vdev_stat_lock); 2939 mutex_exit(&vd->vdev_stat_lock);
2940} 2940}
2941 2941
2942void 2942void
2943vdev_clear_stats(vdev_t *vd) 2943vdev_clear_stats(vdev_t *vd)
2944{ 2944{
2945 mutex_enter(&vd->vdev_stat_lock); 2945 mutex_enter(&vd->vdev_stat_lock);
2946 vd->vdev_stat.vs_space = 0; 2946 vd->vdev_stat.vs_space = 0;
2947 vd->vdev_stat.vs_dspace = 0; 2947 vd->vdev_stat.vs_dspace = 0;
2948 vd->vdev_stat.vs_alloc = 0; 2948 vd->vdev_stat.vs_alloc = 0;
2949 mutex_exit(&vd->vdev_stat_lock); 2949 mutex_exit(&vd->vdev_stat_lock);
2950} 2950}
2951 2951
2952void 2952void
2953vdev_scan_stat_init(vdev_t *vd) 2953vdev_scan_stat_init(vdev_t *vd)
2954{ 2954{
2955 vdev_stat_t *vs = &vd->vdev_stat; 2955 vdev_stat_t *vs = &vd->vdev_stat;
2956 2956
2957 for (int c = 0; c < vd->vdev_children; c++) 2957 for (int c = 0; c < vd->vdev_children; c++)
2958 vdev_scan_stat_init(vd->vdev_child[c]); 2958 vdev_scan_stat_init(vd->vdev_child[c]);
2959 2959
2960 mutex_enter(&vd->vdev_stat_lock); 2960 mutex_enter(&vd->vdev_stat_lock);
2961 vs->vs_scan_processed = 0; 2961 vs->vs_scan_processed = 0;
2962 mutex_exit(&vd->vdev_stat_lock); 2962 mutex_exit(&vd->vdev_stat_lock);
2963} 2963}
2964 2964
2965void 2965void
2966vdev_stat_update(zio_t *zio, uint64_t psize) 2966vdev_stat_update(zio_t *zio, uint64_t psize)
2967{ 2967{
2968 spa_t *spa = zio->io_spa; 2968 spa_t *spa = zio->io_spa;
2969 vdev_t *rvd = spa->spa_root_vdev; 2969 vdev_t *rvd = spa->spa_root_vdev;
2970 vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; 2970 vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
2971 vdev_t *pvd; 2971 vdev_t *pvd;
2972 uint64_t txg = zio->io_txg; 2972 uint64_t txg = zio->io_txg;
2973 vdev_stat_t *vs = &vd->vdev_stat; 2973 vdev_stat_t *vs = &vd->vdev_stat;
2974 zio_type_t type = zio->io_type; 2974 zio_type_t type = zio->io_type;
2975 int flags = zio->io_flags; 2975 int flags = zio->io_flags;
2976 2976
2977 /* 2977 /*
2978 * If this i/o is a gang leader, it didn't do any actual work. 2978 * If this i/o is a gang leader, it didn't do any actual work.
2979 */ 2979 */
2980 if (zio->io_gang_tree) 2980 if (zio->io_gang_tree)
2981 return; 2981 return;
2982 2982
2983 if (zio->io_error == 0) { 2983 if (zio->io_error == 0) {
2984 /* 2984 /*
2985 * If this is a root i/o, don't count it -- we've already 2985 * If this is a root i/o, don't count it -- we've already
2986 * counted the top-level vdevs, and vdev_get_stats() will 2986 * counted the top-level vdevs, and vdev_get_stats() will
2987 * aggregate them when asked. This reduces contention on 2987 * aggregate them when asked. This reduces contention on
2988 * the root vdev_stat_lock and implicitly handles blocks 2988 * the root vdev_stat_lock and implicitly handles blocks
2989 * that compress away to holes, for which there is no i/o. 2989 * that compress away to holes, for which there is no i/o.
2990 * (Holes never create vdev children, so all the counters 2990 * (Holes never create vdev children, so all the counters
2991 * remain zero, which is what we want.) 2991 * remain zero, which is what we want.)
2992 * 2992 *
2993 * Note: this only applies to successful i/o (io_error == 0) 2993 * Note: this only applies to successful i/o (io_error == 0)
2994 * because unlike i/o counts, errors are not additive. 2994 * because unlike i/o counts, errors are not additive.
2995 * When reading a ditto block, for example, failure of 2995 * When reading a ditto block, for example, failure of
2996 * one top-level vdev does not imply a root-level error. 2996 * one top-level vdev does not imply a root-level error.
2997 */ 2997 */
2998 if (vd == rvd) 2998 if (vd == rvd)
2999 return; 2999 return;
3000 3000
3001 ASSERT(vd == zio->io_vd); 3001 ASSERT(vd == zio->io_vd);
3002 3002
3003 if (flags & ZIO_FLAG_IO_BYPASS) 3003 if (flags & ZIO_FLAG_IO_BYPASS)
3004 return; 3004 return;
3005 3005
3006 mutex_enter(&vd->vdev_stat_lock); 3006 mutex_enter(&vd->vdev_stat_lock);
3007 3007
3008 if (flags & ZIO_FLAG_IO_REPAIR) { 3008 if (flags & ZIO_FLAG_IO_REPAIR) {
3009 if (flags & ZIO_FLAG_SCAN_THREAD) { 3009 if (flags & ZIO_FLAG_SCAN_THREAD) {
3010 dsl_scan_phys_t *scn_phys = 3010 dsl_scan_phys_t *scn_phys =
3011 &spa->spa_dsl_pool->dp_scan->scn_phys; 3011 &spa->spa_dsl_pool->dp_scan->scn_phys;
3012 uint64_t *processed = &scn_phys->scn_processed; 3012 uint64_t *processed = &scn_phys->scn_processed;
3013 3013
3014 /* XXX cleanup? */ 3014 /* XXX cleanup? */
3015 if (vd->vdev_ops->vdev_op_leaf) 3015 if (vd->vdev_ops->vdev_op_leaf)
3016 atomic_add_64(processed, psize); 3016 atomic_add_64(processed, psize);
3017 vs->vs_scan_processed += psize; 3017 vs->vs_scan_processed += psize;
3018 } 3018 }
3019 3019
3020 if (flags & ZIO_FLAG_SELF_HEAL) 3020 if (flags & ZIO_FLAG_SELF_HEAL)
3021 vs->vs_self_healed += psize; 3021 vs->vs_self_healed += psize;
3022 } 3022 }
3023 3023
3024 vs->vs_ops[type]++; 3024 vs->vs_ops[type]++;
3025 vs->vs_bytes[type] += psize; 3025 vs->vs_bytes[type] += psize;
3026 3026
3027 mutex_exit(&vd->vdev_stat_lock); 3027 mutex_exit(&vd->vdev_stat_lock);
3028 return; 3028 return;
3029 } 3029 }
3030 3030
3031 if (flags & ZIO_FLAG_SPECULATIVE) 3031 if (flags & ZIO_FLAG_SPECULATIVE)
3032 return; 3032 return;
3033 3033
3034 /* 3034 /*
3035 * If this is an I/O error that is going to be retried, then ignore the 3035 * If this is an I/O error that is going to be retried, then ignore the
3036 * error. Otherwise, the user may interpret B_FAILFAST I/O errors as 3036 * error. Otherwise, the user may interpret B_FAILFAST I/O errors as
3037 * hard errors, when in reality they can happen for any number of 3037 * hard errors, when in reality they can happen for any number of
3038 * innocuous reasons (bus resets, MPxIO link failure, etc). 3038 * innocuous reasons (bus resets, MPxIO link failure, etc).
3039 */ 3039 */
3040 if (zio->io_error == EIO && 3040 if (zio->io_error == EIO &&
3041 !(zio->io_flags & ZIO_FLAG_IO_RETRY)) 3041 !(zio->io_flags & ZIO_FLAG_IO_RETRY))
3042 return; 3042 return;
3043 3043
3044 /* 3044 /*
3045 * Intent logs writes won't propagate their error to the root 3045 * Intent logs writes won't propagate their error to the root
3046 * I/O so don't mark these types of failures as pool-level 3046 * I/O so don't mark these types of failures as pool-level
3047 * errors. 3047 * errors.
3048 */ 3048 */
3049 if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 3049 if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
3050 return; 3050 return;
3051 3051
3052 mutex_enter(&vd->vdev_stat_lock); 3052 mutex_enter(&vd->vdev_stat_lock);
3053 if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { 3053 if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
3054 if (zio->io_error == ECKSUM) 3054 if (zio->io_error == ECKSUM)
3055 vs->vs_checksum_errors++; 3055 vs->vs_checksum_errors++;
3056 else 3056 else
3057 vs->vs_read_errors++; 3057 vs->vs_read_errors++;
3058 } 3058 }
3059 if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) 3059 if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
3060 vs->vs_write_errors++; 3060 vs->vs_write_errors++;
3061 mutex_exit(&vd->vdev_stat_lock); 3061 mutex_exit(&vd->vdev_stat_lock);
3062 3062
3063 if (type == ZIO_TYPE_WRITE && txg != 0 && 3063 if (type == ZIO_TYPE_WRITE && txg != 0 &&
3064 (!(flags & ZIO_FLAG_IO_REPAIR) || 3064 (!(flags & ZIO_FLAG_IO_REPAIR) ||
3065 (flags & ZIO_FLAG_SCAN_THREAD) || 3065 (flags & ZIO_FLAG_SCAN_THREAD) ||
3066 spa->spa_claiming)) { 3066 spa->spa_claiming)) {
3067 /* 3067 /*
3068 * This is either a normal write (not a repair), or it's 3068 * This is either a normal write (not a repair), or it's
3069 * a repair induced by the scrub thread, or it's a repair 3069 * a repair induced by the scrub thread, or it's a repair
3070 * made by zil_claim() during spa_load() in the first txg. 3070 * made by zil_claim() during spa_load() in the first txg.
3071 * In the normal case, we commit the DTL change in the same 3071 * In the normal case, we commit the DTL change in the same
3072 * txg as the block was born. In the scrub-induced repair 3072 * txg as the block was born. In the scrub-induced repair
3073 * case, we know that scrubs run in first-pass syncing context, 3073 * case, we know that scrubs run in first-pass syncing context,
3074 * so we commit the DTL change in spa_syncing_txg(spa). 3074 * so we commit the DTL change in spa_syncing_txg(spa).
3075 * In the zil_claim() case, we commit in spa_first_txg(spa). 3075 * In the zil_claim() case, we commit in spa_first_txg(spa).
3076 * 3076 *
3077 * We currently do not make DTL entries for failed spontaneous 3077 * We currently do not make DTL entries for failed spontaneous
3078 * self-healing writes triggered by normal (non-scrubbing) 3078 * self-healing writes triggered by normal (non-scrubbing)
3079 * reads, because we have no transactional context in which to 3079 * reads, because we have no transactional context in which to
3080 * do so -- and it's not clear that it'd be desirable anyway. 3080 * do so -- and it's not clear that it'd be desirable anyway.
3081 */ 3081 */
3082 if (vd->vdev_ops->vdev_op_leaf) { 3082 if (vd->vdev_ops->vdev_op_leaf) {
3083 uint64_t commit_txg = txg; 3083 uint64_t commit_txg = txg;
3084 if (flags & ZIO_FLAG_SCAN_THREAD) { 3084 if (flags & ZIO_FLAG_SCAN_THREAD) {
3085 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 3085 ASSERT(flags & ZIO_FLAG_IO_REPAIR);
3086 ASSERT(spa_sync_pass(spa) == 1); 3086 ASSERT(spa_sync_pass(spa) == 1);
3087 vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); 3087 vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
3088 commit_txg = spa_syncing_txg(spa); 3088 commit_txg = spa_syncing_txg(spa);
3089 } else if (spa->spa_claiming) { 3089 } else if (spa->spa_claiming) {
3090 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 3090 ASSERT(flags & ZIO_FLAG_IO_REPAIR);
3091 commit_txg = spa_first_txg(spa); 3091 commit_txg = spa_first_txg(spa);
3092 } 3092 }
3093 ASSERT(commit_txg >= spa_syncing_txg(spa)); 3093 ASSERT(commit_txg >= spa_syncing_txg(spa));
3094 if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) 3094 if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
3095 return; 3095 return;
3096 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 3096 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
3097 vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); 3097 vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
3098 vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); 3098 vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
3099 } 3099 }
3100 if (vd != rvd) 3100 if (vd != rvd)
3101 vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); 3101 vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
3102 } 3102 }
3103} 3103}
3104 3104
/*
 * Update the in-core space usage stats for this vdev, its metaslab class,
 * and the root vdev.
 */
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
    int64_t space_delta)
{
	/* Deflated ("dspace") delta starts equal to the raw space delta. */
	int64_t dspace_delta = space_delta;
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_group_t *mg = vd->vdev_mg;
	metaslab_class_t *mc = mg ? mg->mg_class : NULL;

	/* Space accounting is only performed on top-level vdevs. */
	ASSERT(vd == vd->vdev_top);

	/*
	 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
	 * factor.  We must calculate this here and not at the root vdev
	 * because the root vdev's psize-to-asize is simply the max of its
	 * children's, thus not accurate enough for us.
	 */
	ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
	/* L2ARC devices carry no deflate ratio; everything else must. */
	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
	dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
	    vd->vdev_deflate_ratio;

	/* Update this top-level vdev's own stats under its stat lock. */
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_alloc += alloc_delta;
	vd->vdev_stat.vs_space += space_delta;
	vd->vdev_stat.vs_dspace += dspace_delta;
	mutex_exit(&vd->vdev_stat_lock);

	/*
	 * Only vdevs in the normal metaslab class contribute to the
	 * root vdev's aggregate stats.
	 */
	if (mc == spa_normal_class(spa)) {
		mutex_enter(&rvd->vdev_stat_lock);
		rvd->vdev_stat.vs_alloc += alloc_delta;
		rvd->vdev_stat.vs_space += space_delta;
		rvd->vdev_stat.vs_dspace += dspace_delta;
		mutex_exit(&rvd->vdev_stat_lock);
	}

	if (mc != NULL) {
		ASSERT(rvd == vd->vdev_parent);
		ASSERT(vd->vdev_ms_count != 0);

		/* Fold the deltas into the metaslab class totals as well. */
		metaslab_class_space_update(mc,
		    alloc_delta, defer_delta, space_delta, dspace_delta);
	}
}
3154 3154
/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
void
vdev_config_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	ASSERT(spa_writeable(spa));

	/*
	 * If this is an aux vdev (as with l2cache and spare devices), then we
	 * update the vdev config manually and set the sync flag.
	 */
	if (vd->vdev_aux != NULL) {
		spa_aux_vdev_t *sav = vd->vdev_aux;
		nvlist_t **aux;
		uint_t naux;

		/* Find this vdev's slot in the aux vdev array. */
		for (c = 0; c < sav->sav_count; c++) {
			if (sav->sav_vdevs[c] == vd)
				break;
		}

		if (c == sav->sav_count) {
			/*
			 * We're being removed.  There's nothing more to do.
			 */
			ASSERT(sav->sav_sync == B_TRUE);
			return;
		}

		sav->sav_sync = B_TRUE;

		/*
		 * The aux config stores its nvlists under one of two names:
		 * try the l2cache array first, then fall back to spares.
		 */
		if (nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
		}

		ASSERT(c < naux);

		/*
		 * Setting the nvlist in the middle of the array is a little
		 * sketchy, but it will work.
		 */
		nvlist_free(aux[c]);
		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);

		return;
	}

	/*
	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
	 * must either hold SCL_CONFIG as writer, or must be the sync thread
	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	if (vd == rvd) {
		/* Root vdev: recurse to dirty every top-level vdev. */
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);
	} else {
		ASSERT(vd == vd->vdev_top);

		/* Holes are never synced; don't place them on the list. */
		if (!list_link_active(&vd->vdev_config_dirty_node) &&
		    !vd->vdev_ishole)
			list_insert_head(&spa->spa_config_dirty_list, vd);
	}
}
3232 3232
3233void 3233void
3234vdev_config_clean(vdev_t *vd) 3234vdev_config_clean(vdev_t *vd)
3235{ 3235{
3236 spa_t *spa = vd->vdev_spa; 3236 spa_t *spa = vd->vdev_spa;
3237 3237
3238 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 3238 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
3239 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3239 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
3240 spa_config_held(spa, SCL_CONFIG, RW_READER))); 3240 spa_config_held(spa, SCL_CONFIG, RW_READER)));
3241 3241
3242 ASSERT(list_link_active(&vd->vdev_config_dirty_node)); 3242 ASSERT(list_link_active(&vd->vdev_config_dirty_node));
3243 list_remove(&spa->spa_config_dirty_list, vd); 3243 list_remove(&spa->spa_config_dirty_list, vd);
3244} 3244}
3245 3245
3246/* 3246/*
3247 * Mark a top-level vdev's state as dirty, so that the next pass of 3247 * Mark a top-level vdev's state as dirty, so that the next pass of
3248 * spa_sync() can convert this into vdev_config_dirty(). We distinguish 3248 * spa_sync() can convert this into vdev_config_dirty(). We distinguish
3249 * the state changes from larger config changes because they require 3249 * the state changes from larger config changes because they require
3250 * much less locking, and are often needed for administrative actions. 3250 * much less locking, and are often needed for administrative actions.
3251 */ 3251 */
3252void 3252void
3253vdev_state_dirty(vdev_t *vd) 3253vdev_state_dirty(vdev_t *vd)
3254{ 3254{
3255 spa_t *spa = vd->vdev_spa; 3255 spa_t *spa = vd->vdev_spa;
3256 3256
3257 ASSERT(spa_writeable(spa)); 3257 ASSERT(spa_writeable(spa));
3258 ASSERT(vd == vd->vdev_top); 3258 ASSERT(vd == vd->vdev_top);
3259 3259
3260 /* 3260 /*
3261 * The state list is protected by the SCL_STATE lock. The caller 3261 * The state list is protected by the SCL_STATE lock. The caller
3262 * must either hold SCL_STATE as writer, or must be the sync thread 3262 * must either hold SCL_STATE as writer, or must be the sync thread
3263 * (which holds SCL_STATE as reader). There's only one sync thread, 3263 * (which holds SCL_STATE as reader). There's only one sync thread,
3264 * so this is sufficient to ensure mutual exclusion. 3264 * so this is sufficient to ensure mutual exclusion.
3265 */ 3265 */
3266 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 3266 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
3267 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3267 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
3268 spa_config_held(spa, SCL_STATE, RW_READER))); 3268 spa_config_held(spa, SCL_STATE, RW_READER)));
3269 3269
3270 if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) 3270 if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
3271 list_insert_head(&spa->spa_state_dirty_list, vd); 3271 list_insert_head(&spa->spa_state_dirty_list, vd);
3272} 3272}
3273 3273
3274void 3274void
3275vdev_state_clean(vdev_t *vd) 3275vdev_state_clean(vdev_t *vd)
3276{ 3276{
3277 spa_t *spa = vd->vdev_spa; 3277 spa_t *spa = vd->vdev_spa;
3278 3278
3279 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 3279 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
3280 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3280 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
3281 spa_config_held(spa, SCL_STATE, RW_READER))); 3281 spa_config_held(spa, SCL_STATE, RW_READER)));
3282 3282
3283 ASSERT(list_link_active(&vd->vdev_state_dirty_node)); 3283 ASSERT(list_link_active(&vd->vdev_state_dirty_node));
3284 list_remove(&spa->spa_state_dirty_list, vd); 3284 list_remove(&spa->spa_state_dirty_list, vd);
3285} 3285}
3286 3286
/*
 * Propagate vdev state up from children to parent.
 */
void
vdev_propagate_state(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int degraded = 0, faulted = 0;
	int corrupted = 0;
	vdev_t *child;

	if (vd->vdev_children > 0) {
		/* Tally each child's health to derive this vdev's state. */
		for (int c = 0; c < vd->vdev_children; c++) {
			child = vd->vdev_child[c];

			/*
			 * Don't factor holes into the decision.
			 */
			if (child->vdev_ishole)
				continue;

			if (!vdev_readable(child) ||
			    (!vdev_writeable(child) && spa_writeable(spa))) {
				/*
				 * Root special: if there is a top-level log
				 * device, treat the root vdev as if it were
				 * degraded.
				 */
				if (child->vdev_islog && vd == rvd)
					degraded++;
				else
					faulted++;
			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
				degraded++;
			}

			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
				corrupted++;
		}

		/*
		 * Let the vdev-type-specific code (mirror, raidz, root, ...)
		 * translate the fault/degrade counts into a new state.
		 */
		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

		/*
		 * Root special: if there is a top-level vdev that cannot be
		 * opened due to corrupted metadata, then propagate the root
		 * vdev's aux state as 'corrupt' rather than 'insufficient
		 * replicas'.
		 */
		if (corrupted && vd == rvd &&
		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
	}

	/* Recurse upward so the change is visible all the way to the root. */
	if (vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}
3345 3345
/*
 * Set a vdev's state.  If this is during an open, we don't update the parent
 * state, because we're in the process of opening children depth-first.
 * Otherwise, we propagate the change to the parent.
 *
 * If this routine places a device in a faulted state, an appropriate ereport is
 * generated.
 */
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
	uint64_t save_state;
	spa_t *spa = vd->vdev_spa;

	/* No state change: just refresh the aux reason and return. */
	if (state == vd->vdev_state) {
		vd->vdev_stat.vs_aux = aux;
		return;
	}

	/* Remember the prior state for the ereport posted below. */
	save_state = vd->vdev_state;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	 * If we are setting the vdev state to anything but an open state, then
	 * always close the underlying device unless the device has requested
	 * a delayed close (i.e. we're about to remove or fault the device).
	 * Otherwise, we keep accessible but invalid devices open forever.
	 * We don't call vdev_close() itself, because that implies some extra
	 * checks (offline, etc) that we don't want here.  This is limited to
	 * leaf devices, because otherwise closing the device will affect other
	 * children.
	 */
	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
	    vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_close(vd);

	if (vd->vdev_removed &&
	    state == VDEV_STATE_CANT_OPEN &&
	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
		/*
		 * If the previous state is set to VDEV_STATE_REMOVED, then this
		 * device was previously marked removed and someone attempted to
		 * reopen it.  If this failed due to a nonexistent device, then
		 * keep the device in the REMOVED state.  We also let this be if
		 * it is one of our special test online cases, which is only
		 * attempting to online the device and shouldn't generate an FMA
		 * fault.
		 */
		vd->vdev_state = VDEV_STATE_REMOVED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
		vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
		/*
		 * If we fail to open a vdev during an import or recovery, we
		 * mark it as "not available", which signifies that it was
		 * never there to begin with.  Failure to open such a device
		 * is not considered an error.
		 */
		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_not_present = 1;

		/*
		 * Post the appropriate ereport.  If the 'prevstate' field is
		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
		 * that this is part of a vdev_reopen().  In this case, we don't
		 * want to post the ereport if the device was already in the
		 * CANT_OPEN state beforehand.
		 *
		 * If the 'checkremove' flag is set, then this is an attempt to
		 * online the device in response to an insertion event.  If we
		 * hit this case, then we have detected an insertion event for a
		 * faulted or offline device that wasn't in the removed state.
		 * In this scenario, we don't post an ereport because we are
		 * about to replace the device, or attempt an online with
		 * vdev_forcefault, which will generate the fault for us.
		 */
		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
		    !vd->vdev_not_present && !vd->vdev_checkremove &&
		    vd != spa->spa_root_vdev) {
			const char *class;

			/* Map the aux failure reason to an ereport class. */
			switch (aux) {
			case VDEV_AUX_OPEN_FAILED:
				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
				break;
			case VDEV_AUX_CORRUPT_DATA:
				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
				break;
			case VDEV_AUX_NO_REPLICAS:
				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
				break;
			case VDEV_AUX_BAD_GUID_SUM:
				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
				break;
			case VDEV_AUX_TOO_SMALL:
				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
				break;
			case VDEV_AUX_BAD_LABEL:
				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
				break;
			default:
				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
			}

			zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
		}

		/* Erase any notion of persistent removed state */
		vd->vdev_removed = B_FALSE;
	} else {
		vd->vdev_removed = B_FALSE;
	}

	/*
	 * Notify the fmd of the state change.  Be verbose and post
	 * notifications even for stuff that's not important; the fmd agent can
	 * sort it out.  Don't emit state change events for non-leaf vdevs since
	 * they can't change state on their own.  The FMD can check their state
	 * if it wants to when it sees that a leaf vdev had a state change.
	 */
	if (vd->vdev_ops->vdev_op_leaf)
		zfs_post_state_change(spa, vd);

	/* During open, parents are updated depth-first by the caller. */
	if (!isopen && vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}
3477 3477
3478/* 3478/*
3479 * Check the vdev configuration to ensure that it's capable of supporting 3479 * Check the vdev configuration to ensure that it's capable of supporting
3480 * a root pool. We do not support partial configuration. 3480 * a root pool. We do not support partial configuration.
3481 * In addition, only a single top-level vdev is allowed. 3481 * In addition, only a single top-level vdev is allowed.
3482 * 3482 *
3483 * FreeBSD does not have above limitations. 3483 * FreeBSD does not have above limitations.
3484 */ 3484 */
3485boolean_t 3485boolean_t
3486vdev_is_bootable(vdev_t *vd) 3486vdev_is_bootable(vdev_t *vd)
3487{ 3487{
3488#ifdef illumos 3488#ifdef illumos
3489 if (!vd->vdev_ops->vdev_op_leaf) { 3489 if (!vd->vdev_ops->vdev_op_leaf) {
3490 char *vdev_type = vd->vdev_ops->vdev_op_type; 3490 char *vdev_type = vd->vdev_ops->vdev_op_type;
3491 3491
3492 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 3492 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
3493 vd->vdev_children > 1) { 3493 vd->vdev_children > 1) {
3494 return (B_FALSE); 3494 return (B_FALSE);
3495 } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { 3495 } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
3496 return (B_FALSE); 3496 return (B_FALSE);
3497 } 3497 }
3498 } 3498 }
3499 3499
3500 for (int c = 0; c < vd->vdev_children; c++) { 3500 for (int c = 0; c < vd->vdev_children; c++) {
3501 if (!vdev_is_bootable(vd->vdev_child[c])) 3501 if (!vdev_is_bootable(vd->vdev_child[c]))
3502 return (B_FALSE); 3502 return (B_FALSE);
3503 } 3503 }
3504#endif /* illumos */ 3504#endif /* illumos */
3505 return (B_TRUE); 3505 return (B_TRUE);
3506} 3506}
3507 3507
3508/* 3508/*
3509 * Load the state from the original vdev tree (ovd) which 3509 * Load the state from the original vdev tree (ovd) which
3510 * we've retrieved from the MOS config object. If the original 3510 * we've retrieved from the MOS config object. If the original
3511 * vdev was offline or faulted then we transfer that state to the 3511 * vdev was offline or faulted then we transfer that state to the
3512 * device in the current vdev tree (nvd). 3512 * device in the current vdev tree (nvd).
3513 */ 3513 */
3514void 3514void
3515vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) 3515vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
3516{ 3516{
3517 spa_t *spa = nvd->vdev_spa; 3517 spa_t *spa = nvd->vdev_spa;
3518 3518
3519 ASSERT(nvd->vdev_top->vdev_islog); 3519 ASSERT(nvd->vdev_top->vdev_islog);
3520 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 3520 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
3521 ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); 3521 ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
3522 3522
3523 for (int c = 0; c < nvd->vdev_children; c++) 3523 for (int c = 0; c < nvd->vdev_children; c++)
3524 vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); 3524 vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
3525 3525
3526 if (nvd->vdev_ops->vdev_op_leaf) { 3526 if (nvd->vdev_ops->vdev_op_leaf) {
3527 /* 3527 /*
3528 * Restore the persistent vdev state 3528 * Restore the persistent vdev state
3529 */ 3529 */
3530 nvd->vdev_offline = ovd->vdev_offline; 3530 nvd->vdev_offline = ovd->vdev_offline;
3531 nvd->vdev_faulted = ovd->vdev_faulted; 3531 nvd->vdev_faulted = ovd->vdev_faulted;
3532 nvd->vdev_degraded = ovd->vdev_degraded; 3532 nvd->vdev_degraded = ovd->vdev_degraded;
3533 nvd->vdev_removed = ovd->vdev_removed; 3533 nvd->vdev_removed = ovd->vdev_removed;
3534 } 3534 }
3535} 3535}
3536 3536
3537/* 3537/*
3538 * Determine if a log device has valid content. If the vdev was 3538 * Determine if a log device has valid content. If the vdev was
3539 * removed or faulted in the MOS config then we know that 3539 * removed or faulted in the MOS config then we know that
3540 * the content on the log device has already been written to the pool. 3540 * the content on the log device has already been written to the pool.
3541 */ 3541 */
3542boolean_t 3542boolean_t
3543vdev_log_state_valid(vdev_t *vd) 3543vdev_log_state_valid(vdev_t *vd)
3544{ 3544{
3545 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 3545 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
3546 !vd->vdev_removed) 3546 !vd->vdev_removed)
3547 return (B_TRUE); 3547 return (B_TRUE);
3548 3548
3549 for (int c = 0; c < vd->vdev_children; c++) 3549 for (int c = 0; c < vd->vdev_children; c++)
3550 if (vdev_log_state_valid(vd->vdev_child[c])) 3550 if (vdev_log_state_valid(vd->vdev_child[c]))
3551 return (B_TRUE); 3551 return (B_TRUE);
3552 3552
3553 return (B_FALSE); 3553 return (B_FALSE);
3554} 3554}
3555 3555
3556/* 3556/*
3557 * Expand a vdev if possible. 3557 * Expand a vdev if possible.
3558 */ 3558 */
3559void 3559void
3560vdev_expand(vdev_t *vd, uint64_t txg) 3560vdev_expand(vdev_t *vd, uint64_t txg)
3561{ 3561{
3562 ASSERT(vd->vdev_top == vd); 3562 ASSERT(vd->vdev_top == vd);
3563 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3563 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3564 3564
3565 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { 3565 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
3566 VERIFY(vdev_metaslab_init(vd, txg) == 0); 3566 VERIFY(vdev_metaslab_init(vd, txg) == 0);
3567 vdev_config_dirty(vd); 3567 vdev_config_dirty(vd);
3568 } 3568 }
3569} 3569}
3570 3570
3571/* 3571/*
3572 * Split a vdev. 3572 * Split a vdev.
3573 */ 3573 */
3574void 3574void
3575vdev_split(vdev_t *vd) 3575vdev_split(vdev_t *vd)
3576{ 3576{
3577 vdev_t *cvd, *pvd = vd->vdev_parent; 3577 vdev_t *cvd, *pvd = vd->vdev_parent;
3578 3578
3579 vdev_remove_child(pvd, vd); 3579 vdev_remove_child(pvd, vd);
3580 vdev_compact_children(pvd); 3580 vdev_compact_children(pvd);
3581 3581
3582 cvd = pvd->vdev_child[0]; 3582 cvd = pvd->vdev_child[0];
3583 if (pvd->vdev_children == 1) { 3583 if (pvd->vdev_children == 1) {
3584 vdev_remove_parent(cvd); 3584 vdev_remove_parent(cvd);
3585 cvd->vdev_splitting = B_TRUE; 3585 cvd->vdev_splitting = B_TRUE;
3586 } 3586 }
3587 vdev_propagate_state(cvd); 3587 vdev_propagate_state(cvd);
3588} 3588}
3589 3589
/*
 * Deadman check: recursively inspect every leaf vdev's active I/O queue
 * and panic the system if the oldest outstanding I/O has exceeded the
 * pool's deadman threshold.
 */
void
vdev_deadman(vdev_t *vd)
{
	/* Depth-first: check all children before this vdev's own queue. */
	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		vdev_deadman(cvd);
	}

	if (vd->vdev_ops->vdev_op_leaf) {
		vdev_queue_t *vq = &vd->vdev_queue;

		mutex_enter(&vq->vq_lock);
		if (avl_numnodes(&vq->vq_active_tree) > 0) {
			spa_t *spa = vd->vdev_spa;
			zio_t *fio;
			uint64_t delta;

			/*
			 * Look at the head of all the pending queues,
			 * if any I/O has been outstanding for longer than
			 * the spa_deadman_synctime we panic the system.
			 */
			fio = avl_first(&vq->vq_active_tree);
			delta = gethrtime() - fio->io_timestamp;
			if (delta > spa_deadman_synctime(spa)) {
				/*
				 * NOTE(review): the zfs_dbgmsg() format lacks
				 * the trailing newline that the printf() below
				 * gained in r1.7 — presumably zfs_dbgmsg adds
				 * its own line termination; confirm.  The
				 * %llu specifiers are applied to hrtime_t
				 * values here — verify width/signedness on
				 * all supported platforms.
				 */
				zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
				    "delta %"PRIu64"ns, last io %lluns",
				    fio->io_timestamp, delta,
				    vq->vq_io_complete_ts);

				/* Echo to the console before panicking. */
				printf("SLOW IO: zio timestamp %lluns, "
				    "delta %"PRIu64"ns, last io %lluns\n",
				    fio->io_timestamp, delta,
				    vq->vq_io_complete_ts);

				fm_panic("I/O to pool '%s' appears to be "
				    "hung on vdev guid %llu at '%s'.",
				    spa_name(spa),
				    (long long unsigned int) vd->vdev_guid,
				    vd->vdev_path);
			}
		}
		mutex_exit(&vq->vq_lock);
	}
}