@@ -2623,1013 +2623,1013 @@ vdev_online(spa_t *spa, uint64_t guid, u

	if (newstate)
		*newstate = vd->vdev_state;
	if ((flags & ZFS_ONLINE_UNSPARE) &&
	    !vdev_is_dead(vd) && vd->vdev_parent &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;

	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {

		/* XXX - L2ARC 1.0 does not support expansion */
		if (vd->vdev_aux)
			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	if (postevent)
		spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE);

	return (spa_vdev_state_exit(spa, vd, 0));
}

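/*
 * Offline the leaf vdev identified by 'guid'. The caller is expected to
 * hold spa_vdev_top_lock; vdev_offline() below is the public wrapper that
 * acquires it.
 */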
static int
vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
{
	vdev_t *vd, *tvd;
	int error = 0;
	uint64_t generation;
	metaslab_group_t *mg;

top:
	spa_vdev_state_enter(spa, SCL_ALLOC);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	tvd = vd->vdev_top;
	mg = tvd->vdev_mg;
	generation = spa->spa_config_generation + 1;

	/*
	 * If the device isn't already offline, try to offline it.
	 */
	if (!vd->vdev_offline) {
		/*
		 * If this device has the only valid copy of some data,
		 * don't allow it to be offlined. Log devices are always
		 * expendable.
		 */
		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_dtl_required(vd))
			return (spa_vdev_state_exit(spa, NULL, EBUSY));

		/*
		 * If the top-level is a slog and it has had allocations
		 * then proceed. We check that the vdev's metaslab group
		 * is not NULL since it's possible that we may have just
		 * added this vdev but not yet initialized its metaslabs.
		 */
		if (tvd->vdev_islog && mg != NULL) {
			/*
			 * Prevent any future allocations.
			 */
			metaslab_group_passivate(mg);
			(void) spa_vdev_state_exit(spa, vd, 0);

			error = spa_offline_log(spa);

			spa_vdev_state_enter(spa, SCL_ALLOC);

			/*
			 * Check to see if the config has changed.
			 */
			if (error || generation != spa->spa_config_generation) {
				metaslab_group_activate(mg);
				if (error)
					return (spa_vdev_state_exit(spa,
					    vd, error));
				(void) spa_vdev_state_exit(spa, vd, 0);
				goto top;
			}
			ASSERT0(tvd->vdev_stat.vs_alloc);
		}

		/*
		 * Offline this device and reopen its top-level vdev.
		 * If the top-level vdev is a log device then just offline
		 * it. Otherwise, if this action results in the top-level
		 * vdev becoming unusable, undo it and fail the request.
		 */
		vd->vdev_offline = B_TRUE;
		vdev_reopen(tvd);

		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_is_dead(tvd)) {
			vd->vdev_offline = B_FALSE;
			vdev_reopen(tvd);
			return (spa_vdev_state_exit(spa, NULL, EBUSY));
		}

		/*
		 * Add the device back into the metaslab rotor so that
		 * once we online the device it's open for business.
		 */
		if (tvd->vdev_islog && mg != NULL)
			metaslab_group_activate(mg);
	}

	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);

	return (spa_vdev_state_exit(spa, vd, 0));
}

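/*
 * Public entry point: serialize on spa_vdev_top_lock and let
 * vdev_offline_locked() do the actual work.
 */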
int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
	int error;

	mutex_enter(&spa->spa_vdev_top_lock);
	error = vdev_offline_locked(spa, guid, flags);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * Clear the error counts associated with this vdev. Unlike vdev_online() and
 * vdev_offline(), we assume the spa config is locked. We also clear all
 * children. If 'vd' is NULL, then the user wants to clear all vdevs.
 */
void
vdev_clear(spa_t *spa, vdev_t *vd)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == NULL)
		vd = rvd;

	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_clear(spa, vd->vdev_child[c]);

	if (vd == rvd) {
		for (int c = 0; c < spa->spa_l2cache.sav_count; c++)
			vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]);

		for (int c = 0; c < spa->spa_spares.sav_count; c++)
			vdev_clear(spa, spa->spa_spares.sav_vdevs[c]);
	}

	/*
	 * If we're in the FAULTED state or have experienced failed I/O, then
	 * clear the persistent state and attempt to reopen the device. We
	 * also mark the vdev config dirty, so that the new faulted state is
	 * written out to disk.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded ||
	    !vdev_readable(vd) || !vdev_writeable(vd)) {

		/*
		 * When reopening in response to a clear event, it may be due to
		 * a fmadm repair request. In this case, if the device is
		 * still broken, we still want to post the ereport again.
		 */
		vd->vdev_forcefault = B_TRUE;

		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
		vd->vdev_cant_read = B_FALSE;
		vd->vdev_cant_write = B_FALSE;

		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);

		vd->vdev_forcefault = B_FALSE;

		if (vd != rvd && vdev_writeable(vd->vdev_top))
			vdev_state_dirty(vd->vdev_top);

		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
	}

	/*
	 * When clearing a FMA-diagnosed fault, we always want to
	 * unspare the device, as we assume that the original spare was
	 * done in response to the FMA fault.
	 */
	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;
}

boolean_t
vdev_is_dead(vdev_t *vd)
{
	/*
	 * Holes and missing devices are always considered "dead".
	 * This simplifies the code since we don't have to check for
	 * these types of devices in the various code paths.
	 * Instead we rely on the fact that we skip over dead devices
	 * before issuing I/O to them.
	 */
	return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
	    vd->vdev_ops == &vdev_missing_ops);
}

boolean_t
vdev_readable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
}

boolean_t
vdev_writeable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
}

boolean_t
vdev_allocatable(vdev_t *vd)
{
	uint64_t state = vd->vdev_state;

	/*
	 * We currently allow allocations from vdevs which may be in the
	 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
	 * fails to reopen then we'll catch it later when we're holding
	 * the proper locks. Note that we have to get the vdev state
	 * in a local variable because although it changes atomically,
	 * we're asking two separate questions about it.
	 */
	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
	    !vd->vdev_cant_write && !vd->vdev_ishole &&
	    vd->vdev_mg->mg_initialized);
}

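/*
 * Determine whether the given zio can be issued to this vdev, based on the
 * vdev's state and the direction (read vs. write) of the I/O.
 */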
boolean_t
vdev_accessible(vdev_t *vd, zio_t *zio)
{
	ASSERT(zio->io_vd == vd);

	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
		return (B_FALSE);

	if (zio->io_type == ZIO_TYPE_READ)
		return (!vd->vdev_cant_read);

	if (zio->io_type == ZIO_TYPE_WRITE)
		return (!vd->vdev_cant_write);

	return (B_TRUE);
}

/*
 * Get statistics for the given vdev.
 */
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd = vd->vdev_top;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	mutex_enter(&vd->vdev_stat_lock);
	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
	vs->vs_state = vd->vdev_state;
	vs->vs_rsize = vdev_get_min_asize(vd);
	if (vd->vdev_ops->vdev_op_leaf)
		vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	/*
	 * Report expandable space on top-level, non-auxiliary devices only.
	 * The expandable space is reported in terms of metaslab sized units
	 * since that determines how much space the pool can expand.
	 */
	if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) {
		vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize,
		    1ULL << tvd->vdev_ms_shift);
	}
	vs->vs_configured_ashift = vd->vdev_top != NULL
	    ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
	vs->vs_logical_ashift = vd->vdev_logical_ashift;
	vs->vs_physical_ashift = vd->vdev_physical_ashift;
	if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) {
		vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
	}

	/*
	 * If we're getting stats on the root vdev, aggregate the I/O counts
	 * over all top-level vdevs (i.e. the direct children of the root).
	 */
	if (vd == rvd) {
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *cvd = rvd->vdev_child[c];
			vdev_stat_t *cvs = &cvd->vdev_stat;

			for (int t = 0; t < ZIO_TYPES; t++) {
				vs->vs_ops[t] += cvs->vs_ops[t];
				vs->vs_bytes[t] += cvs->vs_bytes[t];
			}
			cvs->vs_scan_removing = cvd->vdev_removing;
		}
	}
	mutex_exit(&vd->vdev_stat_lock);
}

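/*
 * Zero out the in-core space accounting (space, dspace and alloc) for this
 * vdev.
 */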
void
vdev_clear_stats(vdev_t *vd)
{
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_space = 0;
	vd->vdev_stat.vs_dspace = 0;
	vd->vdev_stat.vs_alloc = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

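/*
 * Recursively reset the scan-processed counters on this vdev and all of its
 * children, in preparation for a new scan.
 */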
void
vdev_scan_stat_init(vdev_t *vd)
{
	vdev_stat_t *vs = &vd->vdev_stat;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_scan_stat_init(vd->vdev_child[c]);

	mutex_enter(&vd->vdev_stat_lock);
	vs->vs_scan_processed = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

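/*
 * Update the I/O and error statistics of the vdev (or the root vdev, for
 * zio's with no vdev) for a completed zio, and record DTL entries for
 * failed writes.
 */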
void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
	spa_t *spa = zio->io_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	/*
	 * If this i/o is a gang leader, it didn't do any actual work.
	 */
	if (zio->io_gang_tree)
		return;

	if (zio->io_error == 0) {
		/*
		 * If this is a root i/o, don't count it -- we've already
		 * counted the top-level vdevs, and vdev_get_stats() will
		 * aggregate them when asked. This reduces contention on
		 * the root vdev_stat_lock and implicitly handles blocks
		 * that compress away to holes, for which there is no i/o.
		 * (Holes never create vdev children, so all the counters
		 * remain zero, which is what we want.)
		 *
		 * Note: this only applies to successful i/o (io_error == 0)
		 * because unlike i/o counts, errors are not additive.
		 * When reading a ditto block, for example, failure of
		 * one top-level vdev does not imply a root-level error.
		 */
		if (vd == rvd)
			return;

		ASSERT(vd == zio->io_vd);

		if (flags & ZIO_FLAG_IO_BYPASS)
			return;

		mutex_enter(&vd->vdev_stat_lock);

		if (flags & ZIO_FLAG_IO_REPAIR) {
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				dsl_scan_phys_t *scn_phys =
				    &spa->spa_dsl_pool->dp_scan->scn_phys;
				uint64_t *processed = &scn_phys->scn_processed;

				/* XXX cleanup? */
				if (vd->vdev_ops->vdev_op_leaf)
					atomic_add_64(processed, psize);
				vs->vs_scan_processed += psize;
			}

			if (flags & ZIO_FLAG_SELF_HEAL)
				vs->vs_self_healed += psize;
		}

		vs->vs_ops[type]++;
		vs->vs_bytes[type] += psize;

		mutex_exit(&vd->vdev_stat_lock);
		return;
	}

	if (flags & ZIO_FLAG_SPECULATIVE)
		return;

	/*
	 * If this is an I/O error that is going to be retried, then ignore the
	 * error. Otherwise, the user may interpret B_FAILFAST I/O errors as
	 * hard errors, when in reality they can happen for any number of
	 * innocuous reasons (bus resets, MPxIO link failure, etc).
	 */
	if (zio->io_error == EIO &&
	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
		return;

	/*
	 * Intent log writes won't propagate their error to the root
	 * I/O so don't mark these types of failures as pool-level
	 * errors.
	 */
	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		return;

	mutex_enter(&vd->vdev_stat_lock);
	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
		if (zio->io_error == ECKSUM)
			vs->vs_checksum_errors++;
		else
			vs->vs_read_errors++;
	}
	if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
		vs->vs_write_errors++;
	mutex_exit(&vd->vdev_stat_lock);

	if (type == ZIO_TYPE_WRITE && txg != 0 &&
	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
	    (flags & ZIO_FLAG_SCAN_THREAD) ||
	    spa->spa_claiming)) {
		/*
		 * This is either a normal write (not a repair), or it's
		 * a repair induced by the scrub thread, or it's a repair
		 * made by zil_claim() during spa_load() in the first txg.
		 * In the normal case, we commit the DTL change in the same
		 * txg as the block was born. In the scrub-induced repair
		 * case, we know that scrubs run in first-pass syncing context,
		 * so we commit the DTL change in spa_syncing_txg(spa).
		 * In the zil_claim() case, we commit in spa_first_txg(spa).
		 *
		 * We currently do not make DTL entries for failed spontaneous
		 * self-healing writes triggered by normal (non-scrubbing)
		 * reads, because we have no transactional context in which to
		 * do so -- and it's not clear that it'd be desirable anyway.
		 */
		if (vd->vdev_ops->vdev_op_leaf) {
			uint64_t commit_txg = txg;
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				ASSERT(spa_sync_pass(spa) == 1);
				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
				commit_txg = spa_syncing_txg(spa);
			} else if (spa->spa_claiming) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				commit_txg = spa_first_txg(spa);
			}
			ASSERT(commit_txg >= spa_syncing_txg(spa));
			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
				return;
			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
		}
		if (vd != rvd)
			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
	}
}

/*
 * Update the in-core space usage stats for this vdev, its metaslab class,
 * and the root vdev.
 */
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
    int64_t space_delta)
{
	int64_t dspace_delta = space_delta;
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_group_t *mg = vd->vdev_mg;
	metaslab_class_t *mc = mg ? mg->mg_class : NULL;

	ASSERT(vd == vd->vdev_top);

	/*
	 * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion
	 * factor. We must calculate this here and not at the root vdev
	 * because the root vdev's psize-to-asize is simply the max of its
	 * children's, thus not accurate enough for us.
	 */
	ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
	dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
	    vd->vdev_deflate_ratio;

	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_alloc += alloc_delta;
	vd->vdev_stat.vs_space += space_delta;
	vd->vdev_stat.vs_dspace += dspace_delta;
	mutex_exit(&vd->vdev_stat_lock);

	if (mc == spa_normal_class(spa)) {
		mutex_enter(&rvd->vdev_stat_lock);
		rvd->vdev_stat.vs_alloc += alloc_delta;
		rvd->vdev_stat.vs_space += space_delta;
		rvd->vdev_stat.vs_dspace += dspace_delta;
		mutex_exit(&rvd->vdev_stat_lock);
	}

	if (mc != NULL) {
		ASSERT(rvd == vd->vdev_parent);
		ASSERT(vd->vdev_ms_count != 0);

		metaslab_class_space_update(mc,
		    alloc_delta, defer_delta, space_delta, dspace_delta);
	}
}

/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
void
vdev_config_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	ASSERT(spa_writeable(spa));

	/*
	 * If this is an aux vdev (as with l2cache and spare devices), then we
	 * update the vdev config manually and set the sync flag.
	 */
	if (vd->vdev_aux != NULL) {
		spa_aux_vdev_t *sav = vd->vdev_aux;
		nvlist_t **aux;
		uint_t naux;

		for (c = 0; c < sav->sav_count; c++) {
			if (sav->sav_vdevs[c] == vd)
				break;
		}

		if (c == sav->sav_count) {
			/*
			 * We're being removed. There's nothing more to do.
			 */
			ASSERT(sav->sav_sync == B_TRUE);
			return;
		}

		sav->sav_sync = B_TRUE;

		if (nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
		}

		ASSERT(c < naux);

		/*
		 * Setting the nvlist in the middle of the array is a little
		 * sketchy, but it will work.
		 */
		nvlist_free(aux[c]);
		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);

		return;
	}

	/*
	 * The dirty list is protected by the SCL_CONFIG lock. The caller
	 * must either hold SCL_CONFIG as writer, or must be the sync thread
	 * (which holds SCL_CONFIG as reader). There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);
	} else {
		ASSERT(vd == vd->vdev_top);

		if (!list_link_active(&vd->vdev_config_dirty_node) &&
		    !vd->vdev_ishole)
			list_insert_head(&spa->spa_config_dirty_list, vd);
	}
}

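/*
 * Remove a top-level vdev from the config dirty list once its configuration
 * has been synced.
 */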
void
vdev_config_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
	list_remove(&spa->spa_config_dirty_list, vd);
}

/*
 * Mark a top-level vdev's state as dirty, so that the next pass of
 * spa_sync() can convert this into vdev_config_dirty(). We distinguish
 * the state changes from larger config changes because they require
 * much less locking, and are often needed for administrative actions.
 */
void
vdev_state_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_writeable(spa));
	ASSERT(vd == vd->vdev_top);

	/*
	 * The state list is protected by the SCL_STATE lock. The caller
	 * must either hold SCL_STATE as writer, or must be the sync thread
	 * (which holds SCL_STATE as reader). There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
		list_insert_head(&spa->spa_state_dirty_list, vd);
}

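/*
 * Remove a top-level vdev from the state dirty list once its state has been
 * synced.
 */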
void
vdev_state_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
	list_remove(&spa->spa_state_dirty_list, vd);
}

/*
 * Propagate vdev state up from children to parent.
 */
void
vdev_propagate_state(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int degraded = 0, faulted = 0;
	int corrupted = 0;
	vdev_t *child;

	if (vd->vdev_children > 0) {
		for (int c = 0; c < vd->vdev_children; c++) {
			child = vd->vdev_child[c];

			/*
			 * Don't factor holes into the decision.
			 */
			if (child->vdev_ishole)
				continue;

			if (!vdev_readable(child) ||
			    (!vdev_writeable(child) && spa_writeable(spa))) {
				/*
				 * Root special: if there is a top-level log
				 * device, treat the root vdev as if it were
				 * degraded.
				 */
				if (child->vdev_islog && vd == rvd)
					degraded++;
				else
					faulted++;
			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
				degraded++;
			}

			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
				corrupted++;
		}

		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

		/*
		 * Root special: if there is a top-level vdev that cannot be
		 * opened due to corrupted metadata, then propagate the root
		 * vdev's aux state as 'corrupt' rather than 'insufficient
		 * replicas'.
		 */
		if (corrupted && vd == rvd &&
		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
	}

	if (vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}

/*
 * Set a vdev's state. If this is during an open, we don't update the parent
 * state, because we're in the process of opening children depth-first.
 * Otherwise, we propagate the change to the parent.
 *
 * If this routine places a device in a faulted state, an appropriate ereport is
 * generated.
 */
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
	uint64_t save_state;
	spa_t *spa = vd->vdev_spa;

	if (state == vd->vdev_state) {
		vd->vdev_stat.vs_aux = aux;
		return;
	}

	save_state = vd->vdev_state;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	 * If we are setting the vdev state to anything but an open state, then
	 * always close the underlying device unless the device has requested
	 * a delayed close (i.e. we're about to remove or fault the device).
	 * Otherwise, we keep accessible but invalid devices open forever.
	 * We don't call vdev_close() itself, because that implies some extra
	 * checks (offline, etc) that we don't want here. This is limited to
	 * leaf devices, because otherwise closing the device will affect other
	 * children.
	 */
	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
	    vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_close(vd);

	if (vd->vdev_removed &&
	    state == VDEV_STATE_CANT_OPEN &&
	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
		/*
		 * If the previous state is set to VDEV_STATE_REMOVED, then this
		 * device was previously marked removed and someone attempted to
		 * reopen it. If this failed due to a nonexistent device, then
		 * keep the device in the REMOVED state. We also let this be if
		 * it is one of our special test online cases, which is only
		 * attempting to online the device and shouldn't generate an FMA
		 * fault.
		 */
		vd->vdev_state = VDEV_STATE_REMOVED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
		vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
		/*
		 * If we fail to open a vdev during an import or recovery, we
		 * mark it as "not available", which signifies that it was
		 * never there to begin with. Failure to open such a device
		 * is not considered an error.
		 */
		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_not_present = 1;

		/*
		 * Post the appropriate ereport. If the 'prevstate' field is
		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
		 * that this is part of a vdev_reopen(). In this case, we don't
		 * want to post the ereport if the device was already in the
		 * CANT_OPEN state beforehand.
		 *
		 * If the 'checkremove' flag is set, then this is an attempt to
		 * online the device in response to an insertion event. If we
		 * hit this case, then we have detected an insertion event for a
		 * faulted or offline device that wasn't in the removed state.
		 * In this scenario, we don't post an ereport because we are
		 * about to replace the device, or attempt an online with
		 * vdev_forcefault, which will generate the fault for us.
		 */
		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
		    !vd->vdev_not_present && !vd->vdev_checkremove &&
		    vd != spa->spa_root_vdev) {
			const char *class;

			switch (aux) {
			case VDEV_AUX_OPEN_FAILED:
				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
				break;
			case VDEV_AUX_CORRUPT_DATA:
				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
3438 | break; | | 3438 | break; |
3439 | case VDEV_AUX_NO_REPLICAS: | | 3439 | case VDEV_AUX_NO_REPLICAS: |
3440 | class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; | | 3440 | class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; |
3441 | break; | | 3441 | break; |
3442 | case VDEV_AUX_BAD_GUID_SUM: | | 3442 | case VDEV_AUX_BAD_GUID_SUM: |
3443 | class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; | | 3443 | class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; |
3444 | break; | | 3444 | break; |
3445 | case VDEV_AUX_TOO_SMALL: | | 3445 | case VDEV_AUX_TOO_SMALL: |
3446 | class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; | | 3446 | class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; |
3447 | break; | | 3447 | break; |
3448 | case VDEV_AUX_BAD_LABEL: | | 3448 | case VDEV_AUX_BAD_LABEL: |
3449 | class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; | | 3449 | class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; |
3450 | break; | | 3450 | break; |
3451 | default: | | 3451 | default: |
3452 | class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; | | 3452 | class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; |
3453 | } | | 3453 | } |
3454 | | | 3454 | |
3455 | zfs_ereport_post(class, spa, vd, NULL, save_state, 0); | | 3455 | zfs_ereport_post(class, spa, vd, NULL, save_state, 0); |
3456 | } | | 3456 | } |
3457 | | | 3457 | |
3458 | /* Erase any notion of persistent removed state */ | | 3458 | /* Erase any notion of persistent removed state */ |
3459 | vd->vdev_removed = B_FALSE; | | 3459 | vd->vdev_removed = B_FALSE; |
3460 | } else { | | 3460 | } else { |
3461 | vd->vdev_removed = B_FALSE; | | 3461 | vd->vdev_removed = B_FALSE; |
3462 | } | | 3462 | } |
3463 | | | 3463 | |
3464 | /* | | 3464 | /* |
3465 | * Notify the fmd of the state change. Be verbose and post | | 3465 | * Notify the fmd of the state change. Be verbose and post |
3466 | * notifications even for stuff that's not important; the fmd agent can | | 3466 | * notifications even for stuff that's not important; the fmd agent can |
3467 | * sort it out. Don't emit state change events for non-leaf vdevs since | | 3467 | * sort it out. Don't emit state change events for non-leaf vdevs since |
3468 | * they can't change state on their own. The FMD can check their state | | 3468 | * they can't change state on their own. The FMD can check their state |
3469 | * if it wants to when it sees that a leaf vdev had a state change. | | 3469 | * if it wants to when it sees that a leaf vdev had a state change. |
3470 | */ | | 3470 | */ |
3471 | if (vd->vdev_ops->vdev_op_leaf) | | 3471 | if (vd->vdev_ops->vdev_op_leaf) |
3472 | zfs_post_state_change(spa, vd); | | 3472 | zfs_post_state_change(spa, vd); |
3473 | | | 3473 | |
3474 | if (!isopen && vd->vdev_parent) | | 3474 | if (!isopen && vd->vdev_parent) |
3475 | vdev_propagate_state(vd->vdev_parent); | | 3475 | vdev_propagate_state(vd->vdev_parent); |
3476 | } | | 3476 | } |
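
To make the REMOVED-preservation branch above concrete, here is a small userland sketch; the toy enums and the effective_state() helper are invented for this illustration (and it ignores the vdev_checkremove case), so it is a simplified model rather than the kernel logic itself.

#include <stdio.h>

/* Stand-in states for this sketch only, not the real vdev enums. */
enum toy_state { TOY_CANT_OPEN, TOY_REMOVED };
enum toy_aux   { TOY_AUX_NONE, TOY_AUX_OPEN_FAILED };

/*
 * A device that was already marked removed and whose reopen fails
 * with OPEN_FAILED keeps the REMOVED state instead of dropping to
 * CANT_OPEN, matching the branch in vdev_set_state() above.
 */
static enum toy_state
effective_state(int was_removed, enum toy_state req, enum toy_aux aux)
{
	if (was_removed && req == TOY_CANT_OPEN && aux == TOY_AUX_OPEN_FAILED)
		return (TOY_REMOVED);
	return (req);
}

int
main(void)
{
	/* Prints 1 (TOY_REMOVED): the failed reopen does not demote it. */
	printf("%d\n", effective_state(1, TOY_CANT_OPEN, TOY_AUX_OPEN_FAILED));
	return (0);
}
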
3477 | | | 3477 | |
3478 | /* | | 3478 | /* |
3479 | * Check the vdev configuration to ensure that it's capable of supporting | | 3479 | * Check the vdev configuration to ensure that it's capable of supporting |
3480 | * a root pool. We do not support partial configuration. | | 3480 | * a root pool. We do not support partial configuration. |
3481 | * In addition, only a single top-level vdev is allowed. | | 3481 | * In addition, only a single top-level vdev is allowed. |
3482 | * | | 3482 | * |
3483 | * FreeBSD does not have the above limitations. | | 3483 | * FreeBSD does not have the above limitations. |
3484 | */ | | 3484 | */ |
3485 | boolean_t | | 3485 | boolean_t |
3486 | vdev_is_bootable(vdev_t *vd) | | 3486 | vdev_is_bootable(vdev_t *vd) |
3487 | { | | 3487 | { |
3488 | #ifdef illumos | | 3488 | #ifdef illumos |
3489 | if (!vd->vdev_ops->vdev_op_leaf) { | | 3489 | if (!vd->vdev_ops->vdev_op_leaf) { |
3490 | char *vdev_type = vd->vdev_ops->vdev_op_type; | | 3490 | char *vdev_type = vd->vdev_ops->vdev_op_type; |
3491 | | | 3491 | |
3492 | if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && | | 3492 | if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && |
3493 | vd->vdev_children > 1) { | | 3493 | vd->vdev_children > 1) { |
3494 | return (B_FALSE); | | 3494 | return (B_FALSE); |
3495 | } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { | | 3495 | } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { |
3496 | return (B_FALSE); | | 3496 | return (B_FALSE); |
3497 | } | | 3497 | } |
3498 | } | | 3498 | } |
3499 | | | 3499 | |
3500 | for (int c = 0; c < vd->vdev_children; c++) { | | 3500 | for (int c = 0; c < vd->vdev_children; c++) { |
3501 | if (!vdev_is_bootable(vd->vdev_child[c])) | | 3501 | if (!vdev_is_bootable(vd->vdev_child[c])) |
3502 | return (B_FALSE); | | 3502 | return (B_FALSE); |
3503 | } | | 3503 | } |
3504 | #endif /* illumos */ | | 3504 | #endif /* illumos */ |
3505 | return (B_TRUE); | | 3505 | return (B_TRUE); |
3506 | } | | 3506 | } |
3507 | | | 3507 | |
3508 | /* | | 3508 | /* |
3509 | * Load the state from the original vdev tree (ovd) which | | 3509 | * Load the state from the original vdev tree (ovd) which |
3510 | * we've retrieved from the MOS config object. If the original | | 3510 | * we've retrieved from the MOS config object. If the original |
3511 | * vdev was offline or faulted then we transfer that state to the | | 3511 | * vdev was offline or faulted then we transfer that state to the |
3512 | * device in the current vdev tree (nvd). | | 3512 | * device in the current vdev tree (nvd). |
3513 | */ | | 3513 | */ |
3514 | void | | 3514 | void |
3515 | vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) | | 3515 | vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) |
3516 | { | | 3516 | { |
3517 | spa_t *spa = nvd->vdev_spa; | | 3517 | spa_t *spa = nvd->vdev_spa; |
3518 | | | 3518 | |
3519 | ASSERT(nvd->vdev_top->vdev_islog); | | 3519 | ASSERT(nvd->vdev_top->vdev_islog); |
3520 | ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); | | 3520 | ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); |
3521 | ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); | | 3521 | ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); |
3522 | | | 3522 | |
3523 | for (int c = 0; c < nvd->vdev_children; c++) | | 3523 | for (int c = 0; c < nvd->vdev_children; c++) |
3524 | vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); | | 3524 | vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); |
3525 | | | 3525 | |
3526 | if (nvd->vdev_ops->vdev_op_leaf) { | | 3526 | if (nvd->vdev_ops->vdev_op_leaf) { |
3527 | /* | | 3527 | /* |
3528 | * Restore the persistent vdev state | | 3528 | * Restore the persistent vdev state |
3529 | */ | | 3529 | */ |
3530 | nvd->vdev_offline = ovd->vdev_offline; | | 3530 | nvd->vdev_offline = ovd->vdev_offline; |
3531 | nvd->vdev_faulted = ovd->vdev_faulted; | | 3531 | nvd->vdev_faulted = ovd->vdev_faulted; |
3532 | nvd->vdev_degraded = ovd->vdev_degraded; | | 3532 | nvd->vdev_degraded = ovd->vdev_degraded; |
3533 | nvd->vdev_removed = ovd->vdev_removed; | | 3533 | nvd->vdev_removed = ovd->vdev_removed; |
3534 | } | | 3534 | } |
3535 | } | | 3535 | } |
3536 | | | 3536 | |
3537 | /* | | 3537 | /* |
3538 | * Determine if a log device has valid content. If the vdev was | | 3538 | * Determine if a log device has valid content. If the vdev was |
3539 | * removed or faulted in the MOS config then we know that | | 3539 | * removed or faulted in the MOS config then we know that |
3540 | * the content on the log device has already been written to the pool. | | 3540 | * the content on the log device has already been written to the pool. |
3541 | */ | | 3541 | */ |
3542 | boolean_t | | 3542 | boolean_t |
3543 | vdev_log_state_valid(vdev_t *vd) | | 3543 | vdev_log_state_valid(vdev_t *vd) |
3544 | { | | 3544 | { |
3545 | if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && | | 3545 | if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && |
3546 | !vd->vdev_removed) | | 3546 | !vd->vdev_removed) |
3547 | return (B_TRUE); | | 3547 | return (B_TRUE); |
3548 | | | 3548 | |
3549 | for (int c = 0; c < vd->vdev_children; c++) | | 3549 | for (int c = 0; c < vd->vdev_children; c++) |
3550 | if (vdev_log_state_valid(vd->vdev_child[c])) | | 3550 | if (vdev_log_state_valid(vd->vdev_child[c])) |
3551 | return (B_TRUE); | | 3551 | return (B_TRUE); |
3552 | | | 3552 | |
3553 | return (B_FALSE); | | 3553 | return (B_FALSE); |
3554 | } | | 3554 | } |
3555 | | | 3555 | |
3556 | /* | | 3556 | /* |
3557 | * Expand a vdev if possible. | | 3557 | * Expand a vdev if possible. |
3558 | */ | | 3558 | */ |
3559 | void | | 3559 | void |
3560 | vdev_expand(vdev_t *vd, uint64_t txg) | | 3560 | vdev_expand(vdev_t *vd, uint64_t txg) |
3561 | { | | 3561 | { |
3562 | ASSERT(vd->vdev_top == vd); | | 3562 | ASSERT(vd->vdev_top == vd); |
3563 | ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); | | 3563 | ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); |
3564 | | | 3564 | |
3565 | if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { | | 3565 | if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { |
3566 | VERIFY(vdev_metaslab_init(vd, txg) == 0); | | 3566 | VERIFY(vdev_metaslab_init(vd, txg) == 0); |
3567 | vdev_config_dirty(vd); | | 3567 | vdev_config_dirty(vd); |
3568 | } | | 3568 | } |
3569 | } | | 3569 | } |
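
The expansion test above is plain integer arithmetic on the metaslab shift: the vdev gains metaslabs only when the grown asize, divided by the fixed metaslab size, exceeds the current metaslab count. A minimal self-contained sketch of that arithmetic follows; the 512 MiB metaslab size (ms_shift = 29) and the 100 GiB/200 GiB sizes are assumed example values, not taken from the source.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Assumed example values; real vdevs derive these at creation. */
	uint64_t ms_shift = 29;                         /* 512 MiB metaslabs */
	uint64_t ms_count = (100ULL << 30) >> ms_shift; /* 100 GiB -> 200 */
	uint64_t new_asize = 200ULL << 30;              /* grown to 200 GiB */

	/* Same shape as the check in vdev_expand() above. */
	if ((new_asize >> ms_shift) > ms_count)
		printf("expand: %llu -> %llu metaslabs\n",
		    (unsigned long long)ms_count,
		    (unsigned long long)(new_asize >> ms_shift));
	return (0);
}
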
3570 | | | 3570 | |
3571 | /* | | 3571 | /* |
3572 | * Split a vdev. | | 3572 | * Split a vdev. |
3573 | */ | | 3573 | */ |
3574 | void | | 3574 | void |
3575 | vdev_split(vdev_t *vd) | | 3575 | vdev_split(vdev_t *vd) |
3576 | { | | 3576 | { |
3577 | vdev_t *cvd, *pvd = vd->vdev_parent; | | 3577 | vdev_t *cvd, *pvd = vd->vdev_parent; |
3578 | | | 3578 | |
3579 | vdev_remove_child(pvd, vd); | | 3579 | vdev_remove_child(pvd, vd); |
3580 | vdev_compact_children(pvd); | | 3580 | vdev_compact_children(pvd); |
3581 | | | 3581 | |
3582 | cvd = pvd->vdev_child[0]; | | 3582 | cvd = pvd->vdev_child[0]; |
3583 | if (pvd->vdev_children == 1) { | | 3583 | if (pvd->vdev_children == 1) { |
3584 | vdev_remove_parent(cvd); | | 3584 | vdev_remove_parent(cvd); |
3585 | cvd->vdev_splitting = B_TRUE; | | 3585 | cvd->vdev_splitting = B_TRUE; |
3586 | } | | 3586 | } |
3587 | vdev_propagate_state(cvd); | | 3587 | vdev_propagate_state(cvd); |
3588 | } | | 3588 | } |
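
For context (a description added here, not text from the source): vdev_split() runs on the pool-split path, where one child of each mirror is detached into a new pool. After the departing child is removed and the remaining children compacted, a mirror left with a single child is collapsed by vdev_remove_parent(), so the survivor becomes a direct child of its former grandparent. The shape, with made-up device names:

/*
 * Illustrative only:
 *
 *      before split              after split
 *      root                      root
 *        mirror-0                  da1
 *          da0   (split off)
 *          da1
 */
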
3589 | | | 3589 | |
3590 | void | | 3590 | void |
3591 | vdev_deadman(vdev_t *vd) | | 3591 | vdev_deadman(vdev_t *vd) |
3592 | { | | 3592 | { |
3593 | for (int c = 0; c < vd->vdev_children; c++) { | | 3593 | for (int c = 0; c < vd->vdev_children; c++) { |
3594 | vdev_t *cvd = vd->vdev_child[c]; | | 3594 | vdev_t *cvd = vd->vdev_child[c]; |
3595 | | | 3595 | |
3596 | vdev_deadman(cvd); | | 3596 | vdev_deadman(cvd); |
3597 | } | | 3597 | } |
3598 | | | 3598 | |
3599 | if (vd->vdev_ops->vdev_op_leaf) { | | 3599 | if (vd->vdev_ops->vdev_op_leaf) { |
3600 | vdev_queue_t *vq = &vd->vdev_queue; | | 3600 | vdev_queue_t *vq = &vd->vdev_queue; |
3601 | | | 3601 | |
3602 | mutex_enter(&vq->vq_lock); | | 3602 | mutex_enter(&vq->vq_lock); |
3603 | if (avl_numnodes(&vq->vq_active_tree) > 0) { | | 3603 | if (avl_numnodes(&vq->vq_active_tree) > 0) { |
3604 | spa_t *spa = vd->vdev_spa; | | 3604 | spa_t *spa = vd->vdev_spa; |
3605 | zio_t *fio; | | 3605 | zio_t *fio; |
3606 | uint64_t delta; | | 3606 | uint64_t delta; |
3607 | | | 3607 | |
3608 | /* | | 3608 | /* |
3609 | * Look at the head of the active queue; if any I/O has been | | 3609 | * Look at the head of the active queue; if any I/O has been |
3610 | * outstanding for longer than spa_deadman_synctime, we panic | | 3610 | * outstanding for longer than spa_deadman_synctime, we panic |
3611 | * the system. | | 3611 | * the system. |
3612 | */ | | 3612 | */ |
3613 | fio = avl_first(&vq->vq_active_tree); | | 3613 | fio = avl_first(&vq->vq_active_tree); |
3614 | delta = gethrtime() - fio->io_timestamp; | | 3614 | delta = gethrtime() - fio->io_timestamp; |
3615 | if (delta > spa_deadman_synctime(spa)) { | | 3615 | if (delta > spa_deadman_synctime(spa)) { |
3616 | zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " | | 3616 | zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " |
3617 | "delta %"PRIu64"ns, last io %lluns", | | 3617 | "delta %"PRIu64"ns, last io %lluns", |
3618 | fio->io_timestamp, delta, | | 3618 | fio->io_timestamp, delta, |
3619 | vq->vq_io_complete_ts); | | 3619 | vq->vq_io_complete_ts); |
3620 | | | 3620 | |
3621 | printf("SLOW IO: zio timestamp %lluns, " | | 3621 | printf("SLOW IO: zio timestamp %lluns, " |
3622 | "delta %"PRIu64"ns, last io %lluns", | | 3622 | "delta %"PRIu64"ns, last io %lluns\n", |
3623 | fio->io_timestamp, delta, | | 3623 | fio->io_timestamp, delta, |
3624 | vq->vq_io_complete_ts); | | 3624 | vq->vq_io_complete_ts); |
3625 | | | 3625 | |
3626 | fm_panic("I/O to pool '%s' appears to be " | | 3626 | fm_panic("I/O to pool '%s' appears to be " |
3627 | "hung on vdev guid %llu at '%s'.", | | 3627 | "hung on vdev guid %llu at '%s'.", |
3628 | spa_name(spa), | | 3628 | spa_name(spa), |
3629 | (long long unsigned int) vd->vdev_guid, | | 3629 | (long long unsigned int) vd->vdev_guid, |
3630 | vd->vdev_path); | | 3630 | vd->vdev_path); |
3631 | } | | 3631 | } |
3632 | } | | 3632 | } |
3633 | mutex_exit(&vq->vq_lock); | | 3633 | mutex_exit(&vq->vq_lock); |
3634 | } | | 3634 | } |
3635 | } | | 3635 | } |
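
To make the age computation in vdev_deadman() concrete: io_timestamp is recorded with gethrtime() when the I/O is queued, so the delta above is a plain nanosecond subtraction compared against spa_deadman_synctime(). A minimal sketch follows; the 1000-second threshold and the io_is_hung() helper are assumptions for illustration, not the tunable's actual value or a real function.

#include <stdio.h>
#include <stdint.h>

#define NANOSEC			1000000000ULL
#define EXAMPLE_SYNCTIME	(1000ULL * NANOSEC)	/* assumed threshold */

/* Hypothetical helper mirroring the comparison in vdev_deadman(). */
static int
io_is_hung(uint64_t now, uint64_t io_timestamp)
{
	return ((now - io_timestamp) > EXAMPLE_SYNCTIME);
}

int
main(void)
{
	/* An I/O issued 1200 seconds ago trips the example threshold. */
	uint64_t now = 5000ULL * NANOSEC;
	uint64_t issued = 3800ULL * NANOSEC;

	printf("hung: %d\n", io_is_hung(now, issued));
	return (0);
}
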