Mon Mar 9 15:37:46 2020 UTC
external/cddl/osnet: Fix possible signed integer overflow

Detected by UBSan and fixed upstream

Cherry-pick:
From 05852b3467b44cdf88541ec67624cd1f5f2ded1d Mon Sep 17 00:00:00 2001
From: luozhengzheng <luo.zhengzheng@zte.com.cn>
Date: Fri, 14 Oct 2016 05:25:05 +0800
Subject: [PATCH] Fix coverity defects: CID 147571, 147574

CID 147571: Unintentional integer overflow (OVERFLOW_BEFORE_WIDEN)
CID 147574: Unintentional integer overflow (OVERFLOW_BEFORE_WIDEN)

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: luozhengzheng <luo.zhengzheng@zte.com.cn>
Closes #5268

Reviewed by: kamil@


(fox)
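
The defect class fixed here, OVERFLOW_BEFORE_WIDEN, is a shift (or multiply) that is evaluated in 32-bit int arithmetic and only widened to 64 bits afterwards. Below is a minimal, hypothetical sketch of the pattern and of the cast-before-shift fix; the variable names are illustrative and do not come from the ZFS sources.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Stand-ins for the narrow fields that feed the shift in dmu_tx.c. */
	uint8_t nlevels = 3;
	uint8_t indblkshift = 31;	/* exaggerated so the 32-bit shift overflows */

	/*
	 * Bad: both operands promote to int, so the shift is evaluated in
	 * 32-bit signed arithmetic and only the (possibly overflowed)
	 * result is widened to uint64_t.
	 */
	uint64_t bad = (nlevels - 1) << indblkshift;

	/*
	 * Fixed: widen one operand first so the whole expression is
	 * evaluated in 64-bit arithmetic.  The cast must bind to the
	 * operand, not wrap the already-overflowed result.
	 */
	uint64_t good = (uint64_t)(nlevels - 1) << indblkshift;

	printf("bad=%llu good=%llu\n",
	    (unsigned long long)bad, (unsigned long long)good);
	return 0;
}

Compiled with -fsanitize=undefined (gcc or clang), the first shift should trigger a UBSan runtime report while the second form stays quiet; the dmu_tx.c hunk below applies the same one-operand cast.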
diff -r1.4 -r1.5 src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c
diff -r1.10 -r1.11 src/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c

cvs diff -r1.4 -r1.5 src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c

--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c 2019/06/21 10:59:50 1.4
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c 2020/03/09 15:37:46 1.5
@@ -1,1414 +1,1414 @@ @@ -1,1414 +1,1414 @@
1/* 1/*
2 * CDDL HEADER START 2 * CDDL HEADER START
3 * 3 *
4 * The contents of this file are subject to the terms of the 4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License"). 5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License. 6 * You may not use this file except in compliance with the License.
7 * 7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing. 9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions 10 * See the License for the specific language governing permissions
11 * and limitations under the License. 11 * and limitations under the License.
12 * 12 *
13 * When distributing Covered Code, include this CDDL HEADER in each 13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the 15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner] 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 * 18 *
19 * CDDL HEADER END 19 * CDDL HEADER END
20 */ 20 */
21/* 21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 24 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com] 25 * Copyright (c) 2014 Integros [integros.com]
26 */ 26 */
27 27
28#include <sys/dmu.h> 28#include <sys/dmu.h>
29#include <sys/dmu_impl.h> 29#include <sys/dmu_impl.h>
30#include <sys/dbuf.h> 30#include <sys/dbuf.h>
31#include <sys/dmu_tx.h> 31#include <sys/dmu_tx.h>
32#include <sys/dmu_objset.h> 32#include <sys/dmu_objset.h>
33#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */ 33#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
34#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */ 34#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
35#include <sys/dsl_pool.h> 35#include <sys/dsl_pool.h>
36#include <sys/zap_impl.h> /* for fzap_default_block_shift */ 36#include <sys/zap_impl.h> /* for fzap_default_block_shift */
37#include <sys/spa.h> 37#include <sys/spa.h>
38#include <sys/sa.h> 38#include <sys/sa.h>
39#include <sys/sa_impl.h> 39#include <sys/sa_impl.h>
40#include <sys/zfs_context.h> 40#include <sys/zfs_context.h>
41#include <sys/varargs.h> 41#include <sys/varargs.h>
42 42
43typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, 43typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
44 uint64_t arg1, uint64_t arg2); 44 uint64_t arg1, uint64_t arg2);
45 45
46 46
47dmu_tx_t * 47dmu_tx_t *
48dmu_tx_create_dd(dsl_dir_t *dd) 48dmu_tx_create_dd(dsl_dir_t *dd)
49{ 49{
50 dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); 50 dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
51 tx->tx_dir = dd; 51 tx->tx_dir = dd;
52 if (dd != NULL) 52 if (dd != NULL)
53 tx->tx_pool = dd->dd_pool; 53 tx->tx_pool = dd->dd_pool;
54 list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), 54 list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
55 offsetof(dmu_tx_hold_t, txh_node)); 55 offsetof(dmu_tx_hold_t, txh_node));
56 list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), 56 list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
57 offsetof(dmu_tx_callback_t, dcb_node)); 57 offsetof(dmu_tx_callback_t, dcb_node));
58 tx->tx_start = gethrtime(); 58 tx->tx_start = gethrtime();
59#ifdef ZFS_DEBUG 59#ifdef ZFS_DEBUG
60 refcount_create(&tx->tx_space_written); 60 refcount_create(&tx->tx_space_written);
61 refcount_create(&tx->tx_space_freed); 61 refcount_create(&tx->tx_space_freed);
62#endif 62#endif
63 return (tx); 63 return (tx);
64} 64}
65 65
66dmu_tx_t * 66dmu_tx_t *
67dmu_tx_create(objset_t *os) 67dmu_tx_create(objset_t *os)
68{ 68{
69 dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); 69 dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
70 tx->tx_objset = os; 70 tx->tx_objset = os;
71 tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset); 71 tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
72 return (tx); 72 return (tx);
73} 73}
74 74
75dmu_tx_t * 75dmu_tx_t *
76dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) 76dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
77{ 77{
78 dmu_tx_t *tx = dmu_tx_create_dd(NULL); 78 dmu_tx_t *tx = dmu_tx_create_dd(NULL);
79 79
80 ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); 80 ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
81 tx->tx_pool = dp; 81 tx->tx_pool = dp;
82 tx->tx_txg = txg; 82 tx->tx_txg = txg;
83 tx->tx_anyobj = TRUE; 83 tx->tx_anyobj = TRUE;
84 84
85 return (tx); 85 return (tx);
86} 86}
87 87
88int 88int
89dmu_tx_is_syncing(dmu_tx_t *tx) 89dmu_tx_is_syncing(dmu_tx_t *tx)
90{ 90{
91 return (tx->tx_anyobj); 91 return (tx->tx_anyobj);
92} 92}
93 93
94int 94int
95dmu_tx_private_ok(dmu_tx_t *tx) 95dmu_tx_private_ok(dmu_tx_t *tx)
96{ 96{
97 return (tx->tx_anyobj); 97 return (tx->tx_anyobj);
98} 98}
99 99
100static dmu_tx_hold_t * 100static dmu_tx_hold_t *
101dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, 101dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
102 enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) 102 enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
103{ 103{
104 dmu_tx_hold_t *txh; 104 dmu_tx_hold_t *txh;
105 dnode_t *dn = NULL; 105 dnode_t *dn = NULL;
106 int err; 106 int err;
107 107
108 if (object != DMU_NEW_OBJECT) { 108 if (object != DMU_NEW_OBJECT) {
109 err = dnode_hold(os, object, tx, &dn); 109 err = dnode_hold(os, object, tx, &dn);
110 if (err) { 110 if (err) {
111 tx->tx_err = err; 111 tx->tx_err = err;
112 return (NULL); 112 return (NULL);
113 } 113 }
114 114
115 if (err == 0 && tx->tx_txg != 0) { 115 if (err == 0 && tx->tx_txg != 0) {
116 mutex_enter(&dn->dn_mtx); 116 mutex_enter(&dn->dn_mtx);
117 /* 117 /*
118 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a 118 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
119 * problem, but there's no way for it to happen (for 119 * problem, but there's no way for it to happen (for
120 * now, at least). 120 * now, at least).
121 */ 121 */
122 ASSERT(dn->dn_assigned_txg == 0); 122 ASSERT(dn->dn_assigned_txg == 0);
123 dn->dn_assigned_txg = tx->tx_txg; 123 dn->dn_assigned_txg = tx->tx_txg;
124 (void) refcount_add(&dn->dn_tx_holds, tx); 124 (void) refcount_add(&dn->dn_tx_holds, tx);
125 mutex_exit(&dn->dn_mtx); 125 mutex_exit(&dn->dn_mtx);
126 } 126 }
127 } 127 }
128 128
129 txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); 129 txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
130 txh->txh_tx = tx; 130 txh->txh_tx = tx;
131 txh->txh_dnode = dn; 131 txh->txh_dnode = dn;
132 refcount_create(&txh->txh_space_towrite); 132 refcount_create(&txh->txh_space_towrite);
133 refcount_create(&txh->txh_space_tofree); 133 refcount_create(&txh->txh_space_tofree);
134 refcount_create(&txh->txh_space_tooverwrite); 134 refcount_create(&txh->txh_space_tooverwrite);
135 refcount_create(&txh->txh_space_tounref); 135 refcount_create(&txh->txh_space_tounref);
136 refcount_create(&txh->txh_memory_tohold); 136 refcount_create(&txh->txh_memory_tohold);
137 refcount_create(&txh->txh_fudge); 137 refcount_create(&txh->txh_fudge);
138#ifdef ZFS_DEBUG 138#ifdef ZFS_DEBUG
139 txh->txh_type = type; 139 txh->txh_type = type;
140 txh->txh_arg1 = arg1; 140 txh->txh_arg1 = arg1;
141 txh->txh_arg2 = arg2; 141 txh->txh_arg2 = arg2;
142#endif 142#endif
143 list_insert_tail(&tx->tx_holds, txh); 143 list_insert_tail(&tx->tx_holds, txh);
144 144
145 return (txh); 145 return (txh);
146} 146}
147 147
148void 148void
149dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) 149dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
150{ 150{
151 /* 151 /*
152 * If we're syncing, they can manipulate any object anyhow, and 152 * If we're syncing, they can manipulate any object anyhow, and
153 * the hold on the dnode_t can cause problems. 153 * the hold on the dnode_t can cause problems.
154 */ 154 */
155 if (!dmu_tx_is_syncing(tx)) { 155 if (!dmu_tx_is_syncing(tx)) {
156 (void) dmu_tx_hold_object_impl(tx, os, 156 (void) dmu_tx_hold_object_impl(tx, os,
157 object, THT_NEWOBJECT, 0, 0); 157 object, THT_NEWOBJECT, 0, 0);
158 } 158 }
159} 159}
160 160
161static int 161static int
162dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) 162dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
163{ 163{
164 int err; 164 int err;
165 dmu_buf_impl_t *db; 165 dmu_buf_impl_t *db;
166 166
167 rw_enter(&dn->dn_struct_rwlock, RW_READER); 167 rw_enter(&dn->dn_struct_rwlock, RW_READER);
168 db = dbuf_hold_level(dn, level, blkid, FTAG); 168 db = dbuf_hold_level(dn, level, blkid, FTAG);
169 rw_exit(&dn->dn_struct_rwlock); 169 rw_exit(&dn->dn_struct_rwlock);
170 if (db == NULL) 170 if (db == NULL)
171 return (SET_ERROR(EIO)); 171 return (SET_ERROR(EIO));
172 err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); 172 err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
173 dbuf_rele(db, FTAG); 173 dbuf_rele(db, FTAG);
174 return (err); 174 return (err);
175} 175}
176 176
177static void 177static void
178dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db, 178dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
179 int level, uint64_t blkid, boolean_t freeable, uint64_t *history) 179 int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
180{ 180{
181 objset_t *os = dn->dn_objset; 181 objset_t *os = dn->dn_objset;
182 dsl_dataset_t *ds = os->os_dsl_dataset; 182 dsl_dataset_t *ds = os->os_dsl_dataset;
183 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 183 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
184 dmu_buf_impl_t *parent = NULL; 184 dmu_buf_impl_t *parent = NULL;
185 blkptr_t *bp = NULL; 185 blkptr_t *bp = NULL;
186 uint64_t space; 186 uint64_t space;
187 187
188 if (level >= dn->dn_nlevels || history[level] == blkid) 188 if (level >= dn->dn_nlevels || history[level] == blkid)
189 return; 189 return;
190 190
191 history[level] = blkid; 191 history[level] = blkid;
192 192
193 space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift); 193 space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
194 194
195 if (db == NULL || db == dn->dn_dbuf) { 195 if (db == NULL || db == dn->dn_dbuf) {
196 ASSERT(level != 0); 196 ASSERT(level != 0);
197 db = NULL; 197 db = NULL;
198 } else { 198 } else {
199 ASSERT(DB_DNODE(db) == dn); 199 ASSERT(DB_DNODE(db) == dn);
200 ASSERT(db->db_level == level); 200 ASSERT(db->db_level == level);
201 ASSERT(db->db.db_size == space); 201 ASSERT(db->db.db_size == space);
202 ASSERT(db->db_blkid == blkid); 202 ASSERT(db->db_blkid == blkid);
203 bp = db->db_blkptr; 203 bp = db->db_blkptr;
204 parent = db->db_parent; 204 parent = db->db_parent;
205 } 205 }
206 206
207 freeable = (bp && (freeable || 207 freeable = (bp && (freeable ||
208 dsl_dataset_block_freeable(ds, bp, bp->blk_birth))); 208 dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
209 209
210 if (freeable) { 210 if (freeable) {
211 (void) refcount_add_many(&txh->txh_space_tooverwrite, 211 (void) refcount_add_many(&txh->txh_space_tooverwrite,
212 space, FTAG); 212 space, FTAG);
213 } else { 213 } else {
214 (void) refcount_add_many(&txh->txh_space_towrite, 214 (void) refcount_add_many(&txh->txh_space_towrite,
215 space, FTAG); 215 space, FTAG);
216 } 216 }
217 217
218 if (bp) { 218 if (bp) {
219 (void) refcount_add_many(&txh->txh_space_tounref, 219 (void) refcount_add_many(&txh->txh_space_tounref,
220 bp_get_dsize(os->os_spa, bp), FTAG); 220 bp_get_dsize(os->os_spa, bp), FTAG);
221 } 221 }
222 222
223 dmu_tx_count_twig(txh, dn, parent, level + 1, 223 dmu_tx_count_twig(txh, dn, parent, level + 1,
224 blkid >> epbs, freeable, history); 224 blkid >> epbs, freeable, history);
225} 225}
226 226
227/* ARGSUSED */ 227/* ARGSUSED */
228static void 228static void
229dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) 229dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
230{ 230{
231 dnode_t *dn = txh->txh_dnode; 231 dnode_t *dn = txh->txh_dnode;
232 uint64_t start, end, i; 232 uint64_t start, end, i;
233 int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; 233 int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
234 int err = 0; 234 int err = 0;
235 235
236 if (len == 0) 236 if (len == 0)
237 return; 237 return;
238 238
239 min_bs = SPA_MINBLOCKSHIFT; 239 min_bs = SPA_MINBLOCKSHIFT;
240 max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1; 240 max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
241 min_ibs = DN_MIN_INDBLKSHIFT; 241 min_ibs = DN_MIN_INDBLKSHIFT;
242 max_ibs = DN_MAX_INDBLKSHIFT; 242 max_ibs = DN_MAX_INDBLKSHIFT;
243 243
244 if (dn) { 244 if (dn) {
245 uint64_t history[DN_MAX_LEVELS]; 245 uint64_t history[DN_MAX_LEVELS];
246 int nlvls = dn->dn_nlevels; 246 int nlvls = dn->dn_nlevels;
247 int delta; 247 int delta;
248 248
249 /* 249 /*
250 * For i/o error checking, read the first and last level-0 250 * For i/o error checking, read the first and last level-0
251 * blocks (if they are not aligned), and all the level-1 blocks. 251 * blocks (if they are not aligned), and all the level-1 blocks.
252 */ 252 */
253 if (dn->dn_maxblkid == 0) { 253 if (dn->dn_maxblkid == 0) {
254 delta = dn->dn_datablksz; 254 delta = dn->dn_datablksz;
255 start = (off < dn->dn_datablksz) ? 0 : 1; 255 start = (off < dn->dn_datablksz) ? 0 : 1;
256 end = (off+len <= dn->dn_datablksz) ? 0 : 1; 256 end = (off+len <= dn->dn_datablksz) ? 0 : 1;
257 if (start == 0 && (off > 0 || len < dn->dn_datablksz)) { 257 if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
258 err = dmu_tx_check_ioerr(NULL, dn, 0, 0); 258 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
259 if (err) 259 if (err)
260 goto out; 260 goto out;
261 delta -= off; 261 delta -= off;
262 } 262 }
263 } else { 263 } else {
264 zio_t *zio = zio_root(dn->dn_objset->os_spa, 264 zio_t *zio = zio_root(dn->dn_objset->os_spa,
265 NULL, NULL, ZIO_FLAG_CANFAIL); 265 NULL, NULL, ZIO_FLAG_CANFAIL);
266 266
267 /* first level-0 block */ 267 /* first level-0 block */
268 start = off >> dn->dn_datablkshift; 268 start = off >> dn->dn_datablkshift;
269 if (P2PHASE(off, dn->dn_datablksz) || 269 if (P2PHASE(off, dn->dn_datablksz) ||
270 len < dn->dn_datablksz) { 270 len < dn->dn_datablksz) {
271 err = dmu_tx_check_ioerr(zio, dn, 0, start); 271 err = dmu_tx_check_ioerr(zio, dn, 0, start);
272 if (err) 272 if (err)
273 goto out; 273 goto out;
274 } 274 }
275 275
276 /* last level-0 block */ 276 /* last level-0 block */
277 end = (off+len-1) >> dn->dn_datablkshift; 277 end = (off+len-1) >> dn->dn_datablkshift;
278 if (end != start && end <= dn->dn_maxblkid && 278 if (end != start && end <= dn->dn_maxblkid &&
279 P2PHASE(off+len, dn->dn_datablksz)) { 279 P2PHASE(off+len, dn->dn_datablksz)) {
280 err = dmu_tx_check_ioerr(zio, dn, 0, end); 280 err = dmu_tx_check_ioerr(zio, dn, 0, end);
281 if (err) 281 if (err)
282 goto out; 282 goto out;
283 } 283 }
284 284
285 /* level-1 blocks */ 285 /* level-1 blocks */
286 if (nlvls > 1) { 286 if (nlvls > 1) {
287 int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 287 int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
288 for (i = (start>>shft)+1; i < end>>shft; i++) { 288 for (i = (start>>shft)+1; i < end>>shft; i++) {
289 err = dmu_tx_check_ioerr(zio, dn, 1, i); 289 err = dmu_tx_check_ioerr(zio, dn, 1, i);
290 if (err) 290 if (err)
291 goto out; 291 goto out;
292 } 292 }
293 } 293 }
294 294
295 err = zio_wait(zio); 295 err = zio_wait(zio);
296 if (err) 296 if (err)
297 goto out; 297 goto out;
298 delta = P2NPHASE(off, dn->dn_datablksz); 298 delta = P2NPHASE(off, dn->dn_datablksz);
299 } 299 }
300 300
301 min_ibs = max_ibs = dn->dn_indblkshift; 301 min_ibs = max_ibs = dn->dn_indblkshift;
302 if (dn->dn_maxblkid > 0) { 302 if (dn->dn_maxblkid > 0) {
303 /* 303 /*
304 * The blocksize can't change, 304 * The blocksize can't change,
305 * so we can make a more precise estimate. 305 * so we can make a more precise estimate.
306 */ 306 */
307 ASSERT(dn->dn_datablkshift != 0); 307 ASSERT(dn->dn_datablkshift != 0);
308 min_bs = max_bs = dn->dn_datablkshift; 308 min_bs = max_bs = dn->dn_datablkshift;
309 } else { 309 } else {
310 /* 310 /*
311 * The blocksize can increase up to the recordsize, 311 * The blocksize can increase up to the recordsize,
312 * or if it is already more than the recordsize, 312 * or if it is already more than the recordsize,
313 * up to the next power of 2. 313 * up to the next power of 2.
314 */ 314 */
315 min_bs = highbit64(dn->dn_datablksz - 1); 315 min_bs = highbit64(dn->dn_datablksz - 1);
316 max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1)); 316 max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
317 } 317 }
318 318
319 /* 319 /*
320 * If this write is not off the end of the file 320 * If this write is not off the end of the file
321 * we need to account for overwrites/unref. 321 * we need to account for overwrites/unref.
322 */ 322 */
323 if (start <= dn->dn_maxblkid) { 323 if (start <= dn->dn_maxblkid) {
324 for (int l = 0; l < DN_MAX_LEVELS; l++) 324 for (int l = 0; l < DN_MAX_LEVELS; l++)
325 history[l] = -1ULL; 325 history[l] = -1ULL;
326 } 326 }
327 while (start <= dn->dn_maxblkid) { 327 while (start <= dn->dn_maxblkid) {
328 dmu_buf_impl_t *db; 328 dmu_buf_impl_t *db;
329 329
330 rw_enter(&dn->dn_struct_rwlock, RW_READER); 330 rw_enter(&dn->dn_struct_rwlock, RW_READER);
331 err = dbuf_hold_impl(dn, 0, start, 331 err = dbuf_hold_impl(dn, 0, start,
332 FALSE, FALSE, FTAG, &db); 332 FALSE, FALSE, FTAG, &db);
333 rw_exit(&dn->dn_struct_rwlock); 333 rw_exit(&dn->dn_struct_rwlock);
334 334
335 if (err) { 335 if (err) {
336 txh->txh_tx->tx_err = err; 336 txh->txh_tx->tx_err = err;
337 return; 337 return;
338 } 338 }
339 339
340 dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, 340 dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
341 history); 341 history);
342 dbuf_rele(db, FTAG); 342 dbuf_rele(db, FTAG);
343 if (++start > end) { 343 if (++start > end) {
344 /* 344 /*
345 * Account for new indirects appearing 345 * Account for new indirects appearing
346 * before this IO gets assigned into a txg. 346 * before this IO gets assigned into a txg.
347 */ 347 */
348 bits = 64 - min_bs; 348 bits = 64 - min_bs;
349 epbs = min_ibs - SPA_BLKPTRSHIFT; 349 epbs = min_ibs - SPA_BLKPTRSHIFT;
350 for (bits -= epbs * (nlvls - 1); 350 for (bits -= epbs * (nlvls - 1);
351 bits >= 0; bits -= epbs) { 351 bits >= 0; bits -= epbs) {
352 (void) refcount_add_many( 352 (void) refcount_add_many(
353 &txh->txh_fudge, 353 &txh->txh_fudge,
354 1ULL << max_ibs, FTAG); 354 1ULL << max_ibs, FTAG);
355 } 355 }
356 goto out; 356 goto out;
357 } 357 }
358 off += delta; 358 off += delta;
359 if (len >= delta) 359 if (len >= delta)
360 len -= delta; 360 len -= delta;
361 delta = dn->dn_datablksz; 361 delta = dn->dn_datablksz;
362 } 362 }
363 } 363 }
364 364
365 /* 365 /*
366 * 'end' is the last thing we will access, not one past. 366 * 'end' is the last thing we will access, not one past.
367 * This way we won't overflow when accessing the last byte. 367 * This way we won't overflow when accessing the last byte.
368 */ 368 */
369 start = P2ALIGN(off, 1ULL << max_bs); 369 start = P2ALIGN(off, 1ULL << max_bs);
370 end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1; 370 end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
371 (void) refcount_add_many(&txh->txh_space_towrite, 371 (void) refcount_add_many(&txh->txh_space_towrite,
372 end - start + 1, FTAG); 372 end - start + 1, FTAG);
373 373
374 start >>= min_bs; 374 start >>= min_bs;
375 end >>= min_bs; 375 end >>= min_bs;
376 376
377 epbs = min_ibs - SPA_BLKPTRSHIFT; 377 epbs = min_ibs - SPA_BLKPTRSHIFT;
378 378
379 /* 379 /*
380 * The object contains at most 2^(64 - min_bs) blocks, 380 * The object contains at most 2^(64 - min_bs) blocks,
381 * and each indirect level maps 2^epbs. 381 * and each indirect level maps 2^epbs.
382 */ 382 */
383 for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { 383 for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
384 start >>= epbs; 384 start >>= epbs;
385 end >>= epbs; 385 end >>= epbs;
386 ASSERT3U(end, >=, start); 386 ASSERT3U(end, >=, start);
387 (void) refcount_add_many(&txh->txh_space_towrite, 387 (void) refcount_add_many(&txh->txh_space_towrite,
388 (end - start + 1) << max_ibs, FTAG); 388 (end - start + 1) << max_ibs, FTAG);
389 if (start != 0) { 389 if (start != 0) {
390 /* 390 /*
391 * We also need a new blkid=0 indirect block 391 * We also need a new blkid=0 indirect block
392 * to reference any existing file data. 392 * to reference any existing file data.
393 */ 393 */
394 (void) refcount_add_many(&txh->txh_space_towrite, 394 (void) refcount_add_many(&txh->txh_space_towrite,
395 1ULL << max_ibs, FTAG); 395 1ULL << max_ibs, FTAG);
396 } 396 }
397 } 397 }
398 398
399out: 399out:
400 if (refcount_count(&txh->txh_space_towrite) + 400 if (refcount_count(&txh->txh_space_towrite) +
401 refcount_count(&txh->txh_space_tooverwrite) > 401 refcount_count(&txh->txh_space_tooverwrite) >
402 2 * DMU_MAX_ACCESS) 402 2 * DMU_MAX_ACCESS)
403 err = SET_ERROR(EFBIG); 403 err = SET_ERROR(EFBIG);
404 404
405 if (err) 405 if (err)
406 txh->txh_tx->tx_err = err; 406 txh->txh_tx->tx_err = err;
407} 407}
408 408
409static void 409static void
410dmu_tx_count_dnode(dmu_tx_hold_t *txh) 410dmu_tx_count_dnode(dmu_tx_hold_t *txh)
411{ 411{
412 dnode_t *dn = txh->txh_dnode; 412 dnode_t *dn = txh->txh_dnode;
413 dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset); 413 dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
414 uint64_t space = mdn->dn_datablksz + 414 uint64_t space = mdn->dn_datablksz +
415 ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); 415 ((uint64_t)(mdn->dn_nlevels-1) << mdn->dn_indblkshift);
416 416
417 if (dn && dn->dn_dbuf->db_blkptr && 417 if (dn && dn->dn_dbuf->db_blkptr &&
418 dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, 418 dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
419 dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) { 419 dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
420 (void) refcount_add_many(&txh->txh_space_tooverwrite, 420 (void) refcount_add_many(&txh->txh_space_tooverwrite,
421 space, FTAG); 421 space, FTAG);
422 (void) refcount_add_many(&txh->txh_space_tounref, space, FTAG); 422 (void) refcount_add_many(&txh->txh_space_tounref, space, FTAG);
423 } else { 423 } else {
424 (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); 424 (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
425 if (dn && dn->dn_dbuf->db_blkptr) { 425 if (dn && dn->dn_dbuf->db_blkptr) {
426 (void) refcount_add_many(&txh->txh_space_tounref, 426 (void) refcount_add_many(&txh->txh_space_tounref,
427 space, FTAG); 427 space, FTAG);
428 } 428 }
429 } 429 }
430} 430}
431 431
432void 432void
433dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) 433dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
434{ 434{
435 dmu_tx_hold_t *txh; 435 dmu_tx_hold_t *txh;
436 436
437 ASSERT(tx->tx_txg == 0); 437 ASSERT(tx->tx_txg == 0);
438 ASSERT(len < DMU_MAX_ACCESS); 438 ASSERT(len < DMU_MAX_ACCESS);
439 ASSERT(len == 0 || UINT64_MAX - off >= len - 1); 439 ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
440 440
441 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 441 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
442 object, THT_WRITE, off, len); 442 object, THT_WRITE, off, len);
443 if (txh == NULL) 443 if (txh == NULL)
444 return; 444 return;
445 445
446 dmu_tx_count_write(txh, off, len); 446 dmu_tx_count_write(txh, off, len);
447 dmu_tx_count_dnode(txh); 447 dmu_tx_count_dnode(txh);
448} 448}
449 449
450static void 450static void
451dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) 451dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
452{ 452{
453 uint64_t blkid, nblks, lastblk; 453 uint64_t blkid, nblks, lastblk;
454 uint64_t space = 0, unref = 0, skipped = 0; 454 uint64_t space = 0, unref = 0, skipped = 0;
455 dnode_t *dn = txh->txh_dnode; 455 dnode_t *dn = txh->txh_dnode;
456 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 456 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
457 spa_t *spa = txh->txh_tx->tx_pool->dp_spa; 457 spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
458 int epbs; 458 int epbs;
459 uint64_t l0span = 0, nl1blks = 0; 459 uint64_t l0span = 0, nl1blks = 0;
460 460
461 if (dn->dn_nlevels == 0) 461 if (dn->dn_nlevels == 0)
462 return; 462 return;
463 463
464 /* 464 /*
465 * The struct_rwlock protects us against dn_nlevels 465 * The struct_rwlock protects us against dn_nlevels
466 * changing, in case (against all odds) we manage to dirty & 466 * changing, in case (against all odds) we manage to dirty &
467 * sync out the changes after we check for being dirty. 467 * sync out the changes after we check for being dirty.
468 * Also, dbuf_hold_impl() wants us to have the struct_rwlock. 468 * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
469 */ 469 */
470 rw_enter(&dn->dn_struct_rwlock, RW_READER); 470 rw_enter(&dn->dn_struct_rwlock, RW_READER);
471 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 471 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
472 if (dn->dn_maxblkid == 0) { 472 if (dn->dn_maxblkid == 0) {
473 if (off == 0 && len >= dn->dn_datablksz) { 473 if (off == 0 && len >= dn->dn_datablksz) {
474 blkid = 0; 474 blkid = 0;
475 nblks = 1; 475 nblks = 1;
476 } else { 476 } else {
477 rw_exit(&dn->dn_struct_rwlock); 477 rw_exit(&dn->dn_struct_rwlock);
478 return; 478 return;
479 } 479 }
480 } else { 480 } else {
481 blkid = off >> dn->dn_datablkshift; 481 blkid = off >> dn->dn_datablkshift;
482 nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; 482 nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
483 483
484 if (blkid > dn->dn_maxblkid) { 484 if (blkid > dn->dn_maxblkid) {
485 rw_exit(&dn->dn_struct_rwlock); 485 rw_exit(&dn->dn_struct_rwlock);
486 return; 486 return;
487 } 487 }
488 if (blkid + nblks > dn->dn_maxblkid) 488 if (blkid + nblks > dn->dn_maxblkid)
489 nblks = dn->dn_maxblkid - blkid + 1; 489 nblks = dn->dn_maxblkid - blkid + 1;
490 490
491 } 491 }
492 l0span = nblks; /* save for later use to calc level > 1 overhead */ 492 l0span = nblks; /* save for later use to calc level > 1 overhead */
493 if (dn->dn_nlevels == 1) { 493 if (dn->dn_nlevels == 1) {
494 int i; 494 int i;
495 for (i = 0; i < nblks; i++) { 495 for (i = 0; i < nblks; i++) {
496 blkptr_t *bp = dn->dn_phys->dn_blkptr; 496 blkptr_t *bp = dn->dn_phys->dn_blkptr;
497 ASSERT3U(blkid + i, <, dn->dn_nblkptr); 497 ASSERT3U(blkid + i, <, dn->dn_nblkptr);
498 bp += blkid + i; 498 bp += blkid + i;
499 if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) { 499 if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
500 dprintf_bp(bp, "can free old%s", ""); 500 dprintf_bp(bp, "can free old%s", "");
501 space += bp_get_dsize(spa, bp); 501 space += bp_get_dsize(spa, bp);
502 } 502 }
503 unref += BP_GET_ASIZE(bp); 503 unref += BP_GET_ASIZE(bp);
504 } 504 }
505 nl1blks = 1; 505 nl1blks = 1;
506 nblks = 0; 506 nblks = 0;
507 } 507 }
508 508
509 lastblk = blkid + nblks - 1; 509 lastblk = blkid + nblks - 1;
510 while (nblks) { 510 while (nblks) {
511 dmu_buf_impl_t *dbuf; 511 dmu_buf_impl_t *dbuf;
512 uint64_t ibyte, new_blkid; 512 uint64_t ibyte, new_blkid;
513 int epb = 1 << epbs; 513 int epb = 1 << epbs;
514 int err, i, blkoff, tochk; 514 int err, i, blkoff, tochk;
515 blkptr_t *bp; 515 blkptr_t *bp;
516 516
517 ibyte = blkid << dn->dn_datablkshift; 517 ibyte = blkid << dn->dn_datablkshift;
518 err = dnode_next_offset(dn, 518 err = dnode_next_offset(dn,
519 DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); 519 DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
520 new_blkid = ibyte >> dn->dn_datablkshift; 520 new_blkid = ibyte >> dn->dn_datablkshift;
521 if (err == ESRCH) { 521 if (err == ESRCH) {
522 skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; 522 skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
523 break; 523 break;
524 } 524 }
525 if (err) { 525 if (err) {
526 txh->txh_tx->tx_err = err; 526 txh->txh_tx->tx_err = err;
527 break; 527 break;
528 } 528 }
529 if (new_blkid > lastblk) { 529 if (new_blkid > lastblk) {
530 skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; 530 skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
531 break; 531 break;
532 } 532 }
533 533
534 if (new_blkid > blkid) { 534 if (new_blkid > blkid) {
535 ASSERT((new_blkid >> epbs) > (blkid >> epbs)); 535 ASSERT((new_blkid >> epbs) > (blkid >> epbs));
536 skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1; 536 skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
537 nblks -= new_blkid - blkid; 537 nblks -= new_blkid - blkid;
538 blkid = new_blkid; 538 blkid = new_blkid;
539 } 539 }
540 blkoff = P2PHASE(blkid, epb); 540 blkoff = P2PHASE(blkid, epb);
541 tochk = MIN(epb - blkoff, nblks); 541 tochk = MIN(epb - blkoff, nblks);
542 542
543 err = dbuf_hold_impl(dn, 1, blkid >> epbs, 543 err = dbuf_hold_impl(dn, 1, blkid >> epbs,
544 FALSE, FALSE, FTAG, &dbuf); 544 FALSE, FALSE, FTAG, &dbuf);
545 if (err) { 545 if (err) {
546 txh->txh_tx->tx_err = err; 546 txh->txh_tx->tx_err = err;
547 break; 547 break;
548 } 548 }
549 549
550 (void) refcount_add_many(&txh->txh_memory_tohold, 550 (void) refcount_add_many(&txh->txh_memory_tohold,
551 dbuf->db.db_size, FTAG); 551 dbuf->db.db_size, FTAG);
552 552
553 /* 553 /*
554 * We don't check memory_tohold against DMU_MAX_ACCESS because 554 * We don't check memory_tohold against DMU_MAX_ACCESS because
555 * memory_tohold is an over-estimation (especially the >L1 555 * memory_tohold is an over-estimation (especially the >L1
556 * indirect blocks), so it could fail. Callers should have 556 * indirect blocks), so it could fail. Callers should have
557 * already verified that they will not be holding too much 557 * already verified that they will not be holding too much
558 * memory. 558 * memory.
559 */ 559 */
560 560
561 err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); 561 err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
562 if (err != 0) { 562 if (err != 0) {
563 txh->txh_tx->tx_err = err; 563 txh->txh_tx->tx_err = err;
564 dbuf_rele(dbuf, FTAG); 564 dbuf_rele(dbuf, FTAG);
565 break; 565 break;
566 } 566 }
567 567
568 bp = dbuf->db.db_data; 568 bp = dbuf->db.db_data;
569 bp += blkoff; 569 bp += blkoff;
570 570
571 for (i = 0; i < tochk; i++) { 571 for (i = 0; i < tochk; i++) {
572 if (dsl_dataset_block_freeable(ds, &bp[i], 572 if (dsl_dataset_block_freeable(ds, &bp[i],
573 bp[i].blk_birth)) { 573 bp[i].blk_birth)) {
574 dprintf_bp(&bp[i], "can free old%s", ""); 574 dprintf_bp(&bp[i], "can free old%s", "");
575 space += bp_get_dsize(spa, &bp[i]); 575 space += bp_get_dsize(spa, &bp[i]);
576 } 576 }
577 unref += BP_GET_ASIZE(bp); 577 unref += BP_GET_ASIZE(bp);
578 } 578 }
579 dbuf_rele(dbuf, FTAG); 579 dbuf_rele(dbuf, FTAG);
580 580
581 ++nl1blks; 581 ++nl1blks;
582 blkid += tochk; 582 blkid += tochk;
583 nblks -= tochk; 583 nblks -= tochk;
584 } 584 }
585 rw_exit(&dn->dn_struct_rwlock); 585 rw_exit(&dn->dn_struct_rwlock);
586 586
587 /* 587 /*
588 * Add in memory requirements of higher-level indirects. 588 * Add in memory requirements of higher-level indirects.
589 * This assumes a worst-possible scenario for dn_nlevels and a 589 * This assumes a worst-possible scenario for dn_nlevels and a
590 * worst-possible distribution of l1-blocks over the region to free. 590 * worst-possible distribution of l1-blocks over the region to free.
591 */ 591 */
592 { 592 {
593 uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs); 593 uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
594 int level = 2; 594 int level = 2;
595 /* 595 /*
596 * Here we don't use DN_MAX_LEVEL, but calculate it with the 596 * Here we don't use DN_MAX_LEVEL, but calculate it with the
597 * given datablkshift and indblkshift. This makes the 597 * given datablkshift and indblkshift. This makes the
598 * difference between 19 and 8 on large files. 598 * difference between 19 and 8 on large files.
599 */ 599 */
600 int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) / 600 int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
601 (dn->dn_indblkshift - SPA_BLKPTRSHIFT); 601 (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
602 602
603 while (level++ < maxlevel) { 603 while (level++ < maxlevel) {
604 (void) refcount_add_many(&txh->txh_memory_tohold, 604 (void) refcount_add_many(&txh->txh_memory_tohold,
605 MAX(MIN(blkcnt, nl1blks), 1) << dn->dn_indblkshift, 605 MAX(MIN(blkcnt, nl1blks), 1) << dn->dn_indblkshift,
606 FTAG); 606 FTAG);
607 blkcnt = 1 + (blkcnt >> epbs); 607 blkcnt = 1 + (blkcnt >> epbs);
608 } 608 }
609 } 609 }
610 610
611 /* account for new level 1 indirect blocks that might show up */ 611 /* account for new level 1 indirect blocks that might show up */
612 if (skipped > 0) { 612 if (skipped > 0) {
613 (void) refcount_add_many(&txh->txh_fudge, 613 (void) refcount_add_many(&txh->txh_fudge,
614 skipped << dn->dn_indblkshift, FTAG); 614 skipped << dn->dn_indblkshift, FTAG);
615 skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); 615 skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
616 (void) refcount_add_many(&txh->txh_memory_tohold, 616 (void) refcount_add_many(&txh->txh_memory_tohold,
617 skipped << dn->dn_indblkshift, FTAG); 617 skipped << dn->dn_indblkshift, FTAG);
618 } 618 }
619 (void) refcount_add_many(&txh->txh_space_tofree, space, FTAG); 619 (void) refcount_add_many(&txh->txh_space_tofree, space, FTAG);
620 (void) refcount_add_many(&txh->txh_space_tounref, unref, FTAG); 620 (void) refcount_add_many(&txh->txh_space_tounref, unref, FTAG);
621} 621}
622 622
623/* 623/*
624 * This function marks the transaction as being a "net free". The end 624 * This function marks the transaction as being a "net free". The end
625 * result is that refquotas will be disabled for this transaction, and 625 * result is that refquotas will be disabled for this transaction, and
626 * this transaction will be able to use half of the pool space overhead 626 * this transaction will be able to use half of the pool space overhead
627 * (see dsl_pool_adjustedsize()). Therefore this function should only 627 * (see dsl_pool_adjustedsize()). Therefore this function should only
628 * be called for transactions that we expect will not cause a net increase 628 * be called for transactions that we expect will not cause a net increase
629 * in the amount of space used (but it's OK if that is occasionally not true). 629 * in the amount of space used (but it's OK if that is occasionally not true).
630 */ 630 */
631void 631void
632dmu_tx_mark_netfree(dmu_tx_t *tx) 632dmu_tx_mark_netfree(dmu_tx_t *tx)
633{ 633{
634 dmu_tx_hold_t *txh; 634 dmu_tx_hold_t *txh;
635 635
636 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 636 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
637 DMU_NEW_OBJECT, THT_FREE, 0, 0); 637 DMU_NEW_OBJECT, THT_FREE, 0, 0);
638 638
639 /* 639 /*
640 * Pretend that this operation will free 1GB of space. This 640 * Pretend that this operation will free 1GB of space. This
641 * should be large enough to cancel out the largest write. 641 * should be large enough to cancel out the largest write.
642 * We don't want to use something like UINT64_MAX, because that would 642 * We don't want to use something like UINT64_MAX, because that would
643 * cause overflows when doing math with these values (e.g. in 643 * cause overflows when doing math with these values (e.g. in
644 * dmu_tx_try_assign()). 644 * dmu_tx_try_assign()).
645 */ 645 */
646 (void) refcount_add_many(&txh->txh_space_tofree, 646 (void) refcount_add_many(&txh->txh_space_tofree,
647 1024 * 1024 * 1024, FTAG); 647 1024 * 1024 * 1024, FTAG);
648 (void) refcount_add_many(&txh->txh_space_tounref, 648 (void) refcount_add_many(&txh->txh_space_tounref,
649 1024 * 1024 * 1024, FTAG); 649 1024 * 1024 * 1024, FTAG);
650} 650}
651 651
652void 652void
653dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) 653dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
654{ 654{
655 dmu_tx_hold_t *txh; 655 dmu_tx_hold_t *txh;
656 dnode_t *dn; 656 dnode_t *dn;
657 int err; 657 int err;
658 zio_t *zio; 658 zio_t *zio;
659 659
660 ASSERT(tx->tx_txg == 0); 660 ASSERT(tx->tx_txg == 0);
661 661
662 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 662 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
663 object, THT_FREE, off, len); 663 object, THT_FREE, off, len);
664 if (txh == NULL) 664 if (txh == NULL)
665 return; 665 return;
666 dn = txh->txh_dnode; 666 dn = txh->txh_dnode;
667 dmu_tx_count_dnode(txh); 667 dmu_tx_count_dnode(txh);
668 668
669 if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) 669 if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
670 return; 670 return;
671 if (len == DMU_OBJECT_END) 671 if (len == DMU_OBJECT_END)
672 len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; 672 len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
673 673
674 674
675 /* 675 /*
676 * For i/o error checking, we read the first and last level-0 676 * For i/o error checking, we read the first and last level-0
677 * blocks if they are not aligned, and all the level-1 blocks. 677 * blocks if they are not aligned, and all the level-1 blocks.
678 * 678 *
679 * Note: dbuf_free_range() assumes that we have not instantiated 679 * Note: dbuf_free_range() assumes that we have not instantiated
680 * any level-0 dbufs that will be completely freed. Therefore we must 680 * any level-0 dbufs that will be completely freed. Therefore we must
681 * exercise care to not read or count the first and last blocks 681 * exercise care to not read or count the first and last blocks
682 * if they are blocksize-aligned. 682 * if they are blocksize-aligned.
683 */ 683 */
684 if (dn->dn_datablkshift == 0) { 684 if (dn->dn_datablkshift == 0) {
685 if (off != 0 || len < dn->dn_datablksz) 685 if (off != 0 || len < dn->dn_datablksz)
686 dmu_tx_count_write(txh, 0, dn->dn_datablksz); 686 dmu_tx_count_write(txh, 0, dn->dn_datablksz);
687 } else { 687 } else {
688 /* first block will be modified if it is not aligned */ 688 /* first block will be modified if it is not aligned */
689 if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) 689 if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
690 dmu_tx_count_write(txh, off, 1); 690 dmu_tx_count_write(txh, off, 1);
691 /* last block will be modified if it is not aligned */ 691 /* last block will be modified if it is not aligned */
692 if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) 692 if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
693 dmu_tx_count_write(txh, off+len, 1); 693 dmu_tx_count_write(txh, off+len, 1);
694 } 694 }
695 695
696 /* 696 /*
697 * Check level-1 blocks. 697 * Check level-1 blocks.
698 */ 698 */
699 if (dn->dn_nlevels > 1) { 699 if (dn->dn_nlevels > 1) {
700 int shift = dn->dn_datablkshift + dn->dn_indblkshift - 700 int shift = dn->dn_datablkshift + dn->dn_indblkshift -
701 SPA_BLKPTRSHIFT; 701 SPA_BLKPTRSHIFT;
702 uint64_t start = off >> shift; 702 uint64_t start = off >> shift;
703 uint64_t end = (off + len) >> shift; 703 uint64_t end = (off + len) >> shift;
704 704
705 ASSERT(dn->dn_indblkshift != 0); 705 ASSERT(dn->dn_indblkshift != 0);
706 706
707 /* 707 /*
708 * dnode_reallocate() can result in an object with indirect 708 * dnode_reallocate() can result in an object with indirect
709 * blocks having an odd data block size. In this case, 709 * blocks having an odd data block size. In this case,
710 * just check the single block. 710 * just check the single block.
711 */ 711 */
712 if (dn->dn_datablkshift == 0) 712 if (dn->dn_datablkshift == 0)
713 start = end = 0; 713 start = end = 0;
714 714
715 zio = zio_root(tx->tx_pool->dp_spa, 715 zio = zio_root(tx->tx_pool->dp_spa,
716 NULL, NULL, ZIO_FLAG_CANFAIL); 716 NULL, NULL, ZIO_FLAG_CANFAIL);
717 for (uint64_t i = start; i <= end; i++) { 717 for (uint64_t i = start; i <= end; i++) {
718 uint64_t ibyte = i << shift; 718 uint64_t ibyte = i << shift;
719 err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); 719 err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
720 i = ibyte >> shift; 720 i = ibyte >> shift;
721 if (err == ESRCH || i > end) 721 if (err == ESRCH || i > end)
722 break; 722 break;
723 if (err) { 723 if (err) {
724 tx->tx_err = err; 724 tx->tx_err = err;
725 return; 725 return;
726 } 726 }
727 727
728 err = dmu_tx_check_ioerr(zio, dn, 1, i); 728 err = dmu_tx_check_ioerr(zio, dn, 1, i);
729 if (err) { 729 if (err) {
730 tx->tx_err = err; 730 tx->tx_err = err;
731 return; 731 return;
732 } 732 }
733 } 733 }
734 err = zio_wait(zio); 734 err = zio_wait(zio);
735 if (err) { 735 if (err) {
736 tx->tx_err = err; 736 tx->tx_err = err;
737 return; 737 return;
738 } 738 }
739 } 739 }
740 740
741 dmu_tx_count_free(txh, off, len); 741 dmu_tx_count_free(txh, off, len);
742} 742}
743 743
744void 744void
745dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) 745dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
746{ 746{
747 dmu_tx_hold_t *txh; 747 dmu_tx_hold_t *txh;
748 dnode_t *dn; 748 dnode_t *dn;
749 int err; 749 int err;
750 750
751 ASSERT(tx->tx_txg == 0); 751 ASSERT(tx->tx_txg == 0);
752 752
753 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 753 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
754 object, THT_ZAP, add, (uintptr_t)name); 754 object, THT_ZAP, add, (uintptr_t)name);
755 if (txh == NULL) 755 if (txh == NULL)
756 return; 756 return;
757 dn = txh->txh_dnode; 757 dn = txh->txh_dnode;
758 758
759 dmu_tx_count_dnode(txh); 759 dmu_tx_count_dnode(txh);
760 760
761 if (dn == NULL) { 761 if (dn == NULL) {
762 /* 762 /*
763 * We will be able to fit a new object's entries into one leaf 763 * We will be able to fit a new object's entries into one leaf
764 * block. So there will be at most 2 blocks total, 764 * block. So there will be at most 2 blocks total,
765 * including the header block. 765 * including the header block.
766 */ 766 */
767 dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift); 767 dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
768 return; 768 return;
769 } 769 }
770 770
771 ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); 771 ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
772 772
773 if (dn->dn_maxblkid == 0 && !add) { 773 if (dn->dn_maxblkid == 0 && !add) {
774 blkptr_t *bp; 774 blkptr_t *bp;
775 775
776 /* 776 /*
777 * If there is only one block (i.e. this is a micro-zap) 777 * If there is only one block (i.e. this is a micro-zap)
778 * and we are not adding anything, the accounting is simple. 778 * and we are not adding anything, the accounting is simple.
779 */ 779 */
780 err = dmu_tx_check_ioerr(NULL, dn, 0, 0); 780 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
781 if (err) { 781 if (err) {
782 tx->tx_err = err; 782 tx->tx_err = err;
783 return; 783 return;
784 } 784 }
785 785
786 /* 786 /*
787 * Use max block size here, since we don't know how much 787 * Use max block size here, since we don't know how much
788 * the size will change between now and the dbuf dirty call. 788 * the size will change between now and the dbuf dirty call.
789 */ 789 */
790 bp = &dn->dn_phys->dn_blkptr[0]; 790 bp = &dn->dn_phys->dn_blkptr[0];
791 if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, 791 if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
792 bp, bp->blk_birth)) { 792 bp, bp->blk_birth)) {
793 (void) refcount_add_many(&txh->txh_space_tooverwrite, 793 (void) refcount_add_many(&txh->txh_space_tooverwrite,
794 MZAP_MAX_BLKSZ, FTAG); 794 MZAP_MAX_BLKSZ, FTAG);
795 } else { 795 } else {
796 (void) refcount_add_many(&txh->txh_space_towrite, 796 (void) refcount_add_many(&txh->txh_space_towrite,
797 MZAP_MAX_BLKSZ, FTAG); 797 MZAP_MAX_BLKSZ, FTAG);
798 } 798 }
799 if (!BP_IS_HOLE(bp)) { 799 if (!BP_IS_HOLE(bp)) {
800 (void) refcount_add_many(&txh->txh_space_tounref, 800 (void) refcount_add_many(&txh->txh_space_tounref,
801 MZAP_MAX_BLKSZ, FTAG); 801 MZAP_MAX_BLKSZ, FTAG);
802 } 802 }
803 return; 803 return;
804 } 804 }
805 805
806 if (dn->dn_maxblkid > 0 && name) { 806 if (dn->dn_maxblkid > 0 && name) {
807 /* 807 /*
808 * access the name in this fat-zap so that we'll check 808 * access the name in this fat-zap so that we'll check
809 * for i/o errors to the leaf blocks, etc. 809 * for i/o errors to the leaf blocks, etc.
810 */ 810 */
811 err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); 811 err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
812 if (err == EIO) { 812 if (err == EIO) {
813 tx->tx_err = err; 813 tx->tx_err = err;
814 return; 814 return;
815 } 815 }
816 } 816 }
817 817
818 err = zap_count_write_by_dnode(dn, name, add, 818 err = zap_count_write_by_dnode(dn, name, add,
819 &txh->txh_space_towrite, &txh->txh_space_tooverwrite); 819 &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
820 820
821 /* 821 /*
822 * If the modified blocks are scattered to the four winds, 822 * If the modified blocks are scattered to the four winds,
823 * we'll have to modify an indirect twig for each. We can make 823 * we'll have to modify an indirect twig for each. We can make
824 * modifications at up to 3 locations: 824 * modifications at up to 3 locations:
825 * - header block at the beginning of the object 825 * - header block at the beginning of the object
826 * - target leaf block 826 * - target leaf block
827 * - end of the object, where we might need to write: 827 * - end of the object, where we might need to write:
828 * - a new leaf block if the target block needs to be split 828 * - a new leaf block if the target block needs to be split
829 * - the new pointer table, if it is growing 829 * - the new pointer table, if it is growing
830 * - the new cookie table, if it is growing 830 * - the new cookie table, if it is growing
831 */ 831 */
832 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 832 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
833 dsl_dataset_phys_t *ds_phys = 833 dsl_dataset_phys_t *ds_phys =
834 dsl_dataset_phys(dn->dn_objset->os_dsl_dataset); 834 dsl_dataset_phys(dn->dn_objset->os_dsl_dataset);
835 for (int lvl = 1; lvl < dn->dn_nlevels; lvl++) { 835 for (int lvl = 1; lvl < dn->dn_nlevels; lvl++) {
836 uint64_t num_indirects = 1 + (dn->dn_maxblkid >> (epbs * lvl)); 836 uint64_t num_indirects = 1 + (dn->dn_maxblkid >> (epbs * lvl));
837 uint64_t spc = MIN(3, num_indirects) << dn->dn_indblkshift; 837 uint64_t spc = MIN(3, num_indirects) << dn->dn_indblkshift;
838 if (ds_phys->ds_prev_snap_obj != 0) { 838 if (ds_phys->ds_prev_snap_obj != 0) {
839 (void) refcount_add_many(&txh->txh_space_towrite, 839 (void) refcount_add_many(&txh->txh_space_towrite,
840 spc, FTAG); 840 spc, FTAG);
841 } else { 841 } else {
842 (void) refcount_add_many(&txh->txh_space_tooverwrite, 842 (void) refcount_add_many(&txh->txh_space_tooverwrite,
843 spc, FTAG); 843 spc, FTAG);
844 } 844 }
845 } 845 }
846} 846}
847 847
848void 848void
849dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) 849dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
850{ 850{
851 dmu_tx_hold_t *txh; 851 dmu_tx_hold_t *txh;
852 852
853 ASSERT(tx->tx_txg == 0); 853 ASSERT(tx->tx_txg == 0);
854 854
855 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 855 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
856 object, THT_BONUS, 0, 0); 856 object, THT_BONUS, 0, 0);
857 if (txh) 857 if (txh)
858 dmu_tx_count_dnode(txh); 858 dmu_tx_count_dnode(txh);
859} 859}
860 860
861void 861void
862dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) 862dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
863{ 863{
864 dmu_tx_hold_t *txh; 864 dmu_tx_hold_t *txh;
865 ASSERT(tx->tx_txg == 0); 865 ASSERT(tx->tx_txg == 0);
866 866
867 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 867 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
868 DMU_NEW_OBJECT, THT_SPACE, space, 0); 868 DMU_NEW_OBJECT, THT_SPACE, space, 0);
869 869
870 (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); 870 (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
871} 871}
872 872
873int 873int
874dmu_tx_holds(dmu_tx_t *tx, uint64_t object) 874dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
875{ 875{
876 dmu_tx_hold_t *txh; 876 dmu_tx_hold_t *txh;
877 int holds = 0; 877 int holds = 0;
878 878
879 /* 879 /*
880 * By asserting that the tx is assigned, we're counting the 880 * By asserting that the tx is assigned, we're counting the
881 * number of dn_tx_holds, which is the same as the number of 881 * number of dn_tx_holds, which is the same as the number of
882 * dn_holds. Otherwise, we'd be counting dn_holds, but 882 * dn_holds. Otherwise, we'd be counting dn_holds, but
883 * dn_tx_holds could be 0. 883 * dn_tx_holds could be 0.
884 */ 884 */
885 ASSERT(tx->tx_txg != 0); 885 ASSERT(tx->tx_txg != 0);
886 886
887 /* if (tx->tx_anyobj == TRUE) */ 887 /* if (tx->tx_anyobj == TRUE) */
888 /* return (0); */ 888 /* return (0); */
889 889
890 for (txh = list_head(&tx->tx_holds); txh; 890 for (txh = list_head(&tx->tx_holds); txh;
891 txh = list_next(&tx->tx_holds, txh)) { 891 txh = list_next(&tx->tx_holds, txh)) {
892 if (txh->txh_dnode && txh->txh_dnode->dn_object == object) 892 if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
893 holds++; 893 holds++;
894 } 894 }
895 895
896 return (holds); 896 return (holds);
897} 897}
898 898
899#ifdef ZFS_DEBUG 899#ifdef ZFS_DEBUG
900void 900void
901dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) 901dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
902{ 902{
903 dmu_tx_hold_t *txh; 903 dmu_tx_hold_t *txh;
904 int match_object = FALSE, match_offset = FALSE; 904 int match_object = FALSE, match_offset = FALSE;
905 dnode_t *dn; 905 dnode_t *dn;
906 906
907 DB_DNODE_ENTER(db); 907 DB_DNODE_ENTER(db);
908 dn = DB_DNODE(db); 908 dn = DB_DNODE(db);
909 ASSERT(tx->tx_txg != 0); 909 ASSERT(tx->tx_txg != 0);
910 ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); 910 ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
911 ASSERT3U(dn->dn_object, ==, db->db.db_object); 911 ASSERT3U(dn->dn_object, ==, db->db.db_object);
912 912
913 if (tx->tx_anyobj) { 913 if (tx->tx_anyobj) {
914 DB_DNODE_EXIT(db); 914 DB_DNODE_EXIT(db);
915 return; 915 return;
916 } 916 }
917 917
918 /* XXX No checking on the meta dnode for now */ 918 /* XXX No checking on the meta dnode for now */
919 if (db->db.db_object == DMU_META_DNODE_OBJECT) { 919 if (db->db.db_object == DMU_META_DNODE_OBJECT) {
920 DB_DNODE_EXIT(db); 920 DB_DNODE_EXIT(db);
921 return; 921 return;
922 } 922 }
923 923
924 for (txh = list_head(&tx->tx_holds); txh; 924 for (txh = list_head(&tx->tx_holds); txh;
925 txh = list_next(&tx->tx_holds, txh)) { 925 txh = list_next(&tx->tx_holds, txh)) {
926 ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); 926 ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
927 if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) 927 if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
928 match_object = TRUE; 928 match_object = TRUE;
929 if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { 929 if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
930 int datablkshift = dn->dn_datablkshift ? 930 int datablkshift = dn->dn_datablkshift ?
931 dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; 931 dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
932 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 932 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
933 int shift = datablkshift + epbs * db->db_level; 933 int shift = datablkshift + epbs * db->db_level;
934 uint64_t beginblk = shift >= 64 ? 0 : 934 uint64_t beginblk = shift >= 64 ? 0 :
935 (txh->txh_arg1 >> shift); 935 (txh->txh_arg1 >> shift);
936 uint64_t endblk = shift >= 64 ? 0 : 936 uint64_t endblk = shift >= 64 ? 0 :
937 ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); 937 ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
938 uint64_t blkid = db->db_blkid; 938 uint64_t blkid = db->db_blkid;
939 939
940 /* XXX txh_arg2 better not be zero... */ 940 /* XXX txh_arg2 better not be zero... */
941 941
942 dprintf("found txh type %x beginblk=%llx endblk=%llx\n", 942 dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
943 txh->txh_type, beginblk, endblk); 943 txh->txh_type, beginblk, endblk);
944 944
945 switch (txh->txh_type) { 945 switch (txh->txh_type) {
946 case THT_WRITE: 946 case THT_WRITE:
947 if (blkid >= beginblk && blkid <= endblk) 947 if (blkid >= beginblk && blkid <= endblk)
948 match_offset = TRUE; 948 match_offset = TRUE;
949 /* 949 /*
950 * We will let this hold work for the bonus 950 * We will let this hold work for the bonus
951 * or spill buffer so that we don't need to 951 * or spill buffer so that we don't need to
952 * hold it when creating a new object. 952 * hold it when creating a new object.
953 */ 953 */
954 if (blkid == DMU_BONUS_BLKID || 954 if (blkid == DMU_BONUS_BLKID ||
955 blkid == DMU_SPILL_BLKID) 955 blkid == DMU_SPILL_BLKID)
956 match_offset = TRUE; 956 match_offset = TRUE;
957 /* 957 /*
958 * They might have to increase nlevels, 958 * They might have to increase nlevels,
959 * thus dirtying the new TLIBs. Or the 959 * thus dirtying the new TLIBs. Or the
960 * might have to change the block size, 960 * might have to change the block size,
961 * thus dirying the new lvl=0 blk=0. 961 * thus dirying the new lvl=0 blk=0.
962 */ 962 */
963 if (blkid == 0) 963 if (blkid == 0)
964 match_offset = TRUE; 964 match_offset = TRUE;
965 break; 965 break;
966 case THT_FREE: 966 case THT_FREE:
967 /* 967 /*
968 * We will dirty all the level 1 blocks in 968 * We will dirty all the level 1 blocks in
969 * the free range and perhaps the first and 969 * the free range and perhaps the first and
970 * last level 0 block. 970 * last level 0 block.
971 */ 971 */
972 if (blkid >= beginblk && (blkid <= endblk || 972 if (blkid >= beginblk && (blkid <= endblk ||
973 txh->txh_arg2 == DMU_OBJECT_END)) 973 txh->txh_arg2 == DMU_OBJECT_END))
974 match_offset = TRUE; 974 match_offset = TRUE;
975 break; 975 break;
976 case THT_SPILL: 976 case THT_SPILL:
977 if (blkid == DMU_SPILL_BLKID) 977 if (blkid == DMU_SPILL_BLKID)
978 match_offset = TRUE; 978 match_offset = TRUE;
979 break; 979 break;
980 case THT_BONUS: 980 case THT_BONUS:
981 if (blkid == DMU_BONUS_BLKID) 981 if (blkid == DMU_BONUS_BLKID)
982 match_offset = TRUE; 982 match_offset = TRUE;
983 break; 983 break;
984 case THT_ZAP: 984 case THT_ZAP:
985 match_offset = TRUE; 985 match_offset = TRUE;
986 break; 986 break;
987 case THT_NEWOBJECT: 987 case THT_NEWOBJECT:
988 match_object = TRUE; 988 match_object = TRUE;
989 break; 989 break;
990 default: 990 default:
991 ASSERT(!"bad txh_type"); 991 ASSERT(!"bad txh_type");
992 } 992 }
993 } 993 }
994 if (match_object && match_offset) { 994 if (match_object && match_offset) {
995 DB_DNODE_EXIT(db); 995 DB_DNODE_EXIT(db);
996 return; 996 return;
997 } 997 }
998 } 998 }
999 DB_DNODE_EXIT(db); 999 DB_DNODE_EXIT(db);
1000 panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", 1000 panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
1001 (u_longlong_t)db->db.db_object, db->db_level, 1001 (u_longlong_t)db->db.db_object, db->db_level,
1002 (u_longlong_t)db->db_blkid); 1002 (u_longlong_t)db->db_blkid);
1003} 1003}
1004#endif 1004#endif
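
To make the hold-matching logic above easier to follow, here is a minimal standalone sketch of how a held byte range (txh_arg1, txh_arg2) is mapped to the beginblk/endblk block-ID range at a given indirection level, including the shift >= 64 guard that avoids an undefined shift when a single block covers the entire object. The helper name and bare-integer parameters are illustrative, not part of the DMU API.

#include <stdint.h>

/*
 * Illustrative only: map a byte range [off, off + len) to the range of
 * block IDs it touches at a level whose blocks each cover (1ULL << shift)
 * bytes.  Mirrors the beginblk/endblk computation in dmu_tx_dirty_buf().
 */
static void
range_to_blkids(uint64_t off, uint64_t len, int shift,
    uint64_t *beginblk, uint64_t *endblk)
{
	if (shift >= 64) {
		/* A single block spans the whole object: only blkid 0. */
		*beginblk = *endblk = 0;
		return;
	}
	*beginblk = off >> shift;
	*endblk = (off + len - 1) >> shift;	/* len is assumed nonzero */
}

For example, with typical values such as datablkshift = 17 and epbs = 7, a level-1 dbuf has shift = 24, so a hold on bytes [0, 64 MiB) covers block IDs 0 through 3.
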
1005 1005
1006/* 1006/*
1007 * If we can't do 10 iops, something is wrong. Let us go ahead 1007 * If we can't do 10 iops, something is wrong. Let us go ahead
1008 * and hit zfs_dirty_data_max. 1008 * and hit zfs_dirty_data_max.
1009 */ 1009 */
1010hrtime_t zfs_delay_max_ns = MSEC2NSEC(100); 1010hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
1011int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ 1011int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
1012 1012
1013/* 1013/*
1014 * We delay transactions when we've determined that the backend storage 1014 * We delay transactions when we've determined that the backend storage
1015 * isn't able to accommodate the rate of incoming writes. 1015 * isn't able to accommodate the rate of incoming writes.
1016 * 1016 *
1017 * If there is already a transaction waiting, we delay relative to when 1017 * If there is already a transaction waiting, we delay relative to when
1018 * that transaction finishes waiting. This way the calculated min_time 1018 * that transaction finishes waiting. This way the calculated min_time
1019 * is independent of the number of threads concurrently executing 1019 * is independent of the number of threads concurrently executing
1020 * transactions. 1020 * transactions.
1021 * 1021 *
1022 * If we are the only waiter, wait relative to when the transaction 1022 * If we are the only waiter, wait relative to when the transaction
1023 * started, rather than the current time. This credits the transaction for 1023 * started, rather than the current time. This credits the transaction for
1024 * "time already served", e.g. reading indirect blocks. 1024 * "time already served", e.g. reading indirect blocks.
1025 * 1025 *
1026 * The minimum time for a transaction to take is calculated as: 1026 * The minimum time for a transaction to take is calculated as:
1027 * min_time = scale * (dirty - min) / (max - dirty) 1027 * min_time = scale * (dirty - min) / (max - dirty)
1028 * min_time is then capped at zfs_delay_max_ns. 1028 * min_time is then capped at zfs_delay_max_ns.
1029 * 1029 *
1030 * The delay has two degrees of freedom that can be adjusted via tunables. 1030 * The delay has two degrees of freedom that can be adjusted via tunables.
1031 * The percentage of dirty data at which we start to delay is defined by 1031 * The percentage of dirty data at which we start to delay is defined by
1032 * zfs_delay_min_dirty_percent. This should typically be at or above 1032 * zfs_delay_min_dirty_percent. This should typically be at or above
1033 * zfs_vdev_async_write_active_max_dirty_percent so that we only start to 1033 * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
1034 * delay after writing at full speed has failed to keep up with the incoming 1034 * delay after writing at full speed has failed to keep up with the incoming
1035 * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly 1035 * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
1036 * speaking, this variable determines the amount of delay at the midpoint of 1036 * speaking, this variable determines the amount of delay at the midpoint of
1037 * the curve. 1037 * the curve.
1038 * 1038 *
1039 * delay 1039 * delay
1040 * 10ms +-------------------------------------------------------------*+ 1040 * 10ms +-------------------------------------------------------------*+
1041 * | *| 1041 * | *|
1042 * 9ms + *+ 1042 * 9ms + *+
1043 * | *| 1043 * | *|
1044 * 8ms + *+ 1044 * 8ms + *+
1045 * | * | 1045 * | * |
1046 * 7ms + * + 1046 * 7ms + * +
1047 * | * | 1047 * | * |
1048 * 6ms + * + 1048 * 6ms + * +
1049 * | * | 1049 * | * |
1050 * 5ms + * + 1050 * 5ms + * +
1051 * | * | 1051 * | * |
1052 * 4ms + * + 1052 * 4ms + * +
1053 * | * | 1053 * | * |
1054 * 3ms + * + 1054 * 3ms + * +
1055 * | * | 1055 * | * |
1056 * 2ms + (midpoint) * + 1056 * 2ms + (midpoint) * +
1057 * | | ** | 1057 * | | ** |
1058 * 1ms + v *** + 1058 * 1ms + v *** +
1059 * | zfs_delay_scale ----------> ******** | 1059 * | zfs_delay_scale ----------> ******** |
1060 * 0 +-------------------------------------*********----------------+ 1060 * 0 +-------------------------------------*********----------------+
1061 * 0% <- zfs_dirty_data_max -> 100% 1061 * 0% <- zfs_dirty_data_max -> 100%
1062 * 1062 *
1063 * Note that since the delay is added to the outstanding time remaining on the 1063 * Note that since the delay is added to the outstanding time remaining on the
1064 * most recent transaction, the delay is effectively the inverse of IOPS. 1064 * most recent transaction, the delay is effectively the inverse of IOPS.
1065 * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve 1065 * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
1066 * was chosen such that small changes in the amount of accumulated dirty data 1066 * was chosen such that small changes in the amount of accumulated dirty data
1067 * in the first 3/4 of the curve yield relatively small differences in the 1067 * in the first 3/4 of the curve yield relatively small differences in the
1068 * amount of delay. 1068 * amount of delay.
1069 * 1069 *
1070 * The effects can be easier to understand when the amount of delay is 1070 * The effects can be easier to understand when the amount of delay is
1071 * represented on a log scale: 1071 * represented on a log scale:
1072 * 1072 *
1073 * delay 1073 * delay
1074 * 100ms +-------------------------------------------------------------++ 1074 * 100ms +-------------------------------------------------------------++
1075 * + + 1075 * + +
1076 * | | 1076 * | |
1077 * + *+ 1077 * + *+
1078 * 10ms + *+ 1078 * 10ms + *+
1079 * + ** + 1079 * + ** +
1080 * | (midpoint) ** | 1080 * | (midpoint) ** |
1081 * + | ** + 1081 * + | ** +
1082 * 1ms + v **** + 1082 * 1ms + v **** +
1083 * + zfs_delay_scale ----------> ***** + 1083 * + zfs_delay_scale ----------> ***** +
1084 * | **** | 1084 * | **** |
1085 * + **** + 1085 * + **** +
1086 * 100us + ** + 1086 * 100us + ** +
1087 * + * + 1087 * + * +
1088 * | * | 1088 * | * |
1089 * + * + 1089 * + * +
1090 * 10us + * + 1090 * 10us + * +
1091 * + + 1091 * + +
1092 * | | 1092 * | |
1093 * + + 1093 * + +
1094 * +--------------------------------------------------------------+ 1094 * +--------------------------------------------------------------+
1095 * 0% <- zfs_dirty_data_max -> 100% 1095 * 0% <- zfs_dirty_data_max -> 100%
1096 * 1096 *
1097 * Note here that only as the amount of dirty data approaches its limit does 1097 * Note here that only as the amount of dirty data approaches its limit does
1098 * the delay start to increase rapidly. The goal of a properly tuned system 1098 * the delay start to increase rapidly. The goal of a properly tuned system
1099 * should be to keep the amount of dirty data out of that range by first 1099 * should be to keep the amount of dirty data out of that range by first
1100 * ensuring that the appropriate limits are set for the I/O scheduler to reach 1100 * ensuring that the appropriate limits are set for the I/O scheduler to reach
1101 * optimal throughput on the backend storage, and then by changing the value 1101 * optimal throughput on the backend storage, and then by changing the value
1102 * of zfs_delay_scale to increase the steepness of the curve. 1102 * of zfs_delay_scale to increase the steepness of the curve.
1103 */ 1103 */
1104static void 1104static void
1105dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) 1105dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
1106{ 1106{
1107 dsl_pool_t *dp = tx->tx_pool; 1107 dsl_pool_t *dp = tx->tx_pool;
1108 uint64_t delay_min_bytes = 1108 uint64_t delay_min_bytes =
1109 zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; 1109 zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
1110 hrtime_t wakeup, min_tx_time, now; 1110 hrtime_t wakeup, min_tx_time, now;
1111 1111
1112 if (dirty <= delay_min_bytes) 1112 if (dirty <= delay_min_bytes)
1113 return; 1113 return;
1114 1114
1115 /* 1115 /*
1116 * The caller has already waited until we are under the max. 1116 * The caller has already waited until we are under the max.
1117 * We make them pass us the amount of dirty data so we don't 1117 * We make them pass us the amount of dirty data so we don't
1118 * have to handle the case of it being >= the max, which could 1118 * have to handle the case of it being >= the max, which could
1119 * cause a divide-by-zero if it's == the max. 1119 * cause a divide-by-zero if it's == the max.
1120 */ 1120 */
1121 ASSERT3U(dirty, <, zfs_dirty_data_max); 1121 ASSERT3U(dirty, <, zfs_dirty_data_max);
1122 1122
1123 now = gethrtime(); 1123 now = gethrtime();
1124 min_tx_time = zfs_delay_scale * 1124 min_tx_time = zfs_delay_scale *
1125 (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); 1125 (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
1126 if (now > tx->tx_start + min_tx_time) 1126 if (now > tx->tx_start + min_tx_time)
1127 return; 1127 return;
1128 1128
1129 min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); 1129 min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
1130 1130
1131 DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, 1131 DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
1132 uint64_t, min_tx_time); 1132 uint64_t, min_tx_time);
1133 1133
1134 mutex_enter(&dp->dp_lock); 1134 mutex_enter(&dp->dp_lock);
1135 wakeup = MAX(tx->tx_start + min_tx_time, 1135 wakeup = MAX(tx->tx_start + min_tx_time,
1136 dp->dp_last_wakeup + min_tx_time); 1136 dp->dp_last_wakeup + min_tx_time);
1137 dp->dp_last_wakeup = wakeup; 1137 dp->dp_last_wakeup = wakeup;
1138 mutex_exit(&dp->dp_lock); 1138 mutex_exit(&dp->dp_lock);
1139 1139
1140#ifdef _KERNEL 1140#ifdef _KERNEL
1141#ifdef illumos 1141#ifdef illumos
1142 mutex_enter(&curthread->t_delay_lock); 1142 mutex_enter(&curthread->t_delay_lock);
1143 while (cv_timedwait_hires(&curthread->t_delay_cv, 1143 while (cv_timedwait_hires(&curthread->t_delay_cv,
1144 &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns, 1144 &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
1145 CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0) 1145 CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
1146 continue; 1146 continue;
1147 mutex_exit(&curthread->t_delay_lock); 1147 mutex_exit(&curthread->t_delay_lock);
1148#endif 1148#endif
1149#ifdef __FreeBSD__ 1149#ifdef __FreeBSD__
1150 pause_sbt("dmu_tx_delay", wakeup * SBT_1NS, 1150 pause_sbt("dmu_tx_delay", wakeup * SBT_1NS,
1151 zfs_delay_resolution_ns * SBT_1NS, C_ABSOLUTE); 1151 zfs_delay_resolution_ns * SBT_1NS, C_ABSOLUTE);
1152#endif 1152#endif
1153#ifdef __NetBSD__ 1153#ifdef __NetBSD__
1154 int timo = (wakeup - now) * hz / 1000000000; 1154 int timo = (wakeup - now) * hz / 1000000000;
1155 1155
1156 if (timo == 0) 1156 if (timo == 0)
1157 timo = 1; 1157 timo = 1;
1158 kpause("dmu_tx_delay", false, timo, NULL); 1158 kpause("dmu_tx_delay", false, timo, NULL);
1159#endif 1159#endif
1160#else 1160#else
1161 hrtime_t delta = wakeup - gethrtime(); 1161 hrtime_t delta = wakeup - gethrtime();
1162 struct timespec ts; 1162 struct timespec ts;
1163 ts.tv_sec = delta / NANOSEC; 1163 ts.tv_sec = delta / NANOSEC;
1164 ts.tv_nsec = delta % NANOSEC; 1164 ts.tv_nsec = delta % NANOSEC;
1165 (void) nanosleep(&ts, NULL); 1165 (void) nanosleep(&ts, NULL);
1166#endif 1166#endif
1167} 1167}
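
As a rough illustration of the curve documented above, the sketch below evaluates min_time = scale * (dirty - min) / (max - dirty), capped at the maximum delay. The tunables are passed in as plain parameters purely for clarity; dmu_tx_delay() itself reads zfs_delay_scale, zfs_dirty_data_max, zfs_delay_min_dirty_percent and zfs_delay_max_ns.

#include <stdint.h>

/*
 * Illustrative sketch of the delay curve: return the minimum time (in
 * nanoseconds) a transaction should take, given the current amount of
 * dirty data.  delay_min_bytes corresponds to
 * zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100.
 */
static uint64_t
tx_delay_ns(uint64_t dirty, uint64_t dirty_max, uint64_t delay_min_bytes,
    uint64_t scale, uint64_t max_delay_ns)
{
	uint64_t min_time;

	if (dirty <= delay_min_bytes)
		return (0);		/* below the knee of the curve */
	if (dirty >= dirty_max)
		dirty = dirty_max - 1;	/* the real caller guarantees this */

	min_time = scale * (dirty - delay_min_bytes) / (dirty_max - dirty);
	return (min_time < max_delay_ns ? min_time : max_delay_ns);
}

At the point where (dirty - min) equals (max - dirty) the result is exactly scale, which matches the 500us midpoint / 2000 IOPS figure given in the comment.
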
1168 1168
1169static int 1169static int
1170dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) 1170dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
1171{ 1171{
1172 dmu_tx_hold_t *txh; 1172 dmu_tx_hold_t *txh;
1173 spa_t *spa = tx->tx_pool->dp_spa; 1173 spa_t *spa = tx->tx_pool->dp_spa;
1174 uint64_t memory, asize, fsize, usize; 1174 uint64_t memory, asize, fsize, usize;
1175 uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge; 1175 uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
1176 1176
1177 ASSERT0(tx->tx_txg); 1177 ASSERT0(tx->tx_txg);
1178 1178
1179 if (tx->tx_err) 1179 if (tx->tx_err)
1180 return (tx->tx_err); 1180 return (tx->tx_err);
1181 1181
1182 if (spa_suspended(spa)) { 1182 if (spa_suspended(spa)) {
1183 /* 1183 /*
1184 * If the user has indicated a blocking failure mode 1184 * If the user has indicated a blocking failure mode
1185 * then return ERESTART which will block in dmu_tx_wait(). 1185 * then return ERESTART which will block in dmu_tx_wait().
1186 * Otherwise, return EIO so that an error can get 1186 * Otherwise, return EIO so that an error can get
1187 * propagated back to the VOP calls. 1187 * propagated back to the VOP calls.
1188 * 1188 *
1189 * Note that we always honor the txg_how flag regardless 1189 * Note that we always honor the txg_how flag regardless
1190 * of the failuremode setting. 1190 * of the failuremode setting.
1191 */ 1191 */
1192 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && 1192 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
1193 txg_how != TXG_WAIT) 1193 txg_how != TXG_WAIT)
1194 return (SET_ERROR(EIO)); 1194 return (SET_ERROR(EIO));
1195 1195
1196 return (SET_ERROR(ERESTART)); 1196 return (SET_ERROR(ERESTART));
1197 } 1197 }
1198 1198
1199 if (!tx->tx_waited && 1199 if (!tx->tx_waited &&
1200 dsl_pool_need_dirty_delay(tx->tx_pool)) { 1200 dsl_pool_need_dirty_delay(tx->tx_pool)) {
1201 tx->tx_wait_dirty = B_TRUE; 1201 tx->tx_wait_dirty = B_TRUE;
1202 return (SET_ERROR(ERESTART)); 1202 return (SET_ERROR(ERESTART));
1203 } 1203 }
1204 1204
1205 tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); 1205 tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
1206 tx->tx_needassign_txh = NULL; 1206 tx->tx_needassign_txh = NULL;
1207 1207
1208 /* 1208 /*
1209 * NB: No error returns are allowed after txg_hold_open, but 1209 * NB: No error returns are allowed after txg_hold_open, but
1210 * before processing the dnode holds, due to the 1210 * before processing the dnode holds, due to the
1211 * dmu_tx_unassign() logic. 1211 * dmu_tx_unassign() logic.
1212 */ 1212 */
1213 1213
1214 towrite = tofree = tooverwrite = tounref = tohold = fudge = 0; 1214 towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
1215 for (txh = list_head(&tx->tx_holds); txh; 1215 for (txh = list_head(&tx->tx_holds); txh;
1216 txh = list_next(&tx->tx_holds, txh)) { 1216 txh = list_next(&tx->tx_holds, txh)) {
1217 dnode_t *dn = txh->txh_dnode; 1217 dnode_t *dn = txh->txh_dnode;
1218 if (dn != NULL) { 1218 if (dn != NULL) {
1219 mutex_enter(&dn->dn_mtx); 1219 mutex_enter(&dn->dn_mtx);
1220 if (dn->dn_assigned_txg == tx->tx_txg - 1) { 1220 if (dn->dn_assigned_txg == tx->tx_txg - 1) {
1221 mutex_exit(&dn->dn_mtx); 1221 mutex_exit(&dn->dn_mtx);
1222 tx->tx_needassign_txh = txh; 1222 tx->tx_needassign_txh = txh;
1223 return (SET_ERROR(ERESTART)); 1223 return (SET_ERROR(ERESTART));
1224 } 1224 }
1225 if (dn->dn_assigned_txg == 0) 1225 if (dn->dn_assigned_txg == 0)
1226 dn->dn_assigned_txg = tx->tx_txg; 1226 dn->dn_assigned_txg = tx->tx_txg;
1227 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 1227 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1228 (void) refcount_add(&dn->dn_tx_holds, tx); 1228 (void) refcount_add(&dn->dn_tx_holds, tx);
1229 mutex_exit(&dn->dn_mtx); 1229 mutex_exit(&dn->dn_mtx);
1230 } 1230 }
1231 towrite += refcount_count(&txh->txh_space_towrite); 1231 towrite += refcount_count(&txh->txh_space_towrite);
1232 tofree += refcount_count(&txh->txh_space_tofree); 1232 tofree += refcount_count(&txh->txh_space_tofree);
1233 tooverwrite += refcount_count(&txh->txh_space_tooverwrite); 1233 tooverwrite += refcount_count(&txh->txh_space_tooverwrite);
1234 tounref += refcount_count(&txh->txh_space_tounref); 1234 tounref += refcount_count(&txh->txh_space_tounref);
1235 tohold += refcount_count(&txh->txh_memory_tohold); 1235 tohold += refcount_count(&txh->txh_memory_tohold);
1236 fudge += refcount_count(&txh->txh_fudge); 1236 fudge += refcount_count(&txh->txh_fudge);
1237 } 1237 }
1238 1238
1239 /* 1239 /*
1240 * If a snapshot has been taken since we made our estimates, 1240 * If a snapshot has been taken since we made our estimates,
1241 * assume that we won't be able to free or overwrite anything. 1241 * assume that we won't be able to free or overwrite anything.
1242 */ 1242 */
1243 if (tx->tx_objset && 1243 if (tx->tx_objset &&
1244 dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) > 1244 dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
1245 tx->tx_lastsnap_txg) { 1245 tx->tx_lastsnap_txg) {
1246 towrite += tooverwrite; 1246 towrite += tooverwrite;
1247 tooverwrite = tofree = 0; 1247 tooverwrite = tofree = 0;
1248 } 1248 }
1249 1249
1250 /* needed allocation: worst-case estimate of write space */ 1250 /* needed allocation: worst-case estimate of write space */
1251 asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite); 1251 asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
1252 /* freed space estimate: worst-case overwrite + free estimate */ 1252 /* freed space estimate: worst-case overwrite + free estimate */
1253 fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; 1253 fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
1254 /* convert unrefd space to worst-case estimate */ 1254 /* convert unrefd space to worst-case estimate */
1255 usize = spa_get_asize(tx->tx_pool->dp_spa, tounref); 1255 usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
1256 /* calculate memory footprint estimate */ 1256 /* calculate memory footprint estimate */
1257 memory = towrite + tooverwrite + tohold; 1257 memory = towrite + tooverwrite + tohold;
1258 1258
1259#ifdef ZFS_DEBUG 1259#ifdef ZFS_DEBUG
1260 /* 1260 /*
1261 * Add in 'tohold' to account for our dirty holds on this memory 1261 * Add in 'tohold' to account for our dirty holds on this memory
1262 * XXX - the "fudge" factor is to account for skipped blocks that 1262 * XXX - the "fudge" factor is to account for skipped blocks that
1263 * we missed because dnode_next_offset() misses in-core-only blocks. 1263 * we missed because dnode_next_offset() misses in-core-only blocks.
1264 */ 1264 */
1265 tx->tx_space_towrite = asize + 1265 tx->tx_space_towrite = asize +
1266 spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge); 1266 spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
1267 tx->tx_space_tofree = tofree; 1267 tx->tx_space_tofree = tofree;
1268 tx->tx_space_tooverwrite = tooverwrite; 1268 tx->tx_space_tooverwrite = tooverwrite;
1269 tx->tx_space_tounref = tounref; 1269 tx->tx_space_tounref = tounref;
1270#endif 1270#endif
1271 1271
1272 if (tx->tx_dir && asize != 0) { 1272 if (tx->tx_dir && asize != 0) {
1273 int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, 1273 int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
1274 asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); 1274 asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
1275 if (err) 1275 if (err)
1276 return (err); 1276 return (err);
1277 } 1277 }
1278 1278
1279 return (0); 1279 return (0);
1280} 1280}
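
The space-estimate folding at the end of dmu_tx_try_assign() can be summarized as a pure function of the accumulated hold counters. The sketch below is illustrative only: asize_of() stands in for spa_get_asize() with a made-up inflation factor, and the struct is not a real ZFS type.

#include <stdint.h>

struct tx_space_est {		/* illustrative, not a ZFS type */
	uint64_t asize;		/* worst-case allocation for the write */
	uint64_t fsize;		/* worst-case space freed */
	uint64_t usize;		/* worst-case unreferenced space */
	uint64_t memory;	/* in-core footprint estimate */
};

/* Stand-in for spa_get_asize(): a made-up worst-case inflation factor. */
static uint64_t
asize_of(uint64_t lsize)
{
	return (lsize * 3);
}

/*
 * Mirrors the estimate folding in dmu_tx_try_assign(): if a snapshot has
 * been taken since the holds were computed, assume nothing can be freed
 * or overwritten in place.
 */
static struct tx_space_est
fold_estimates(uint64_t towrite, uint64_t tofree, uint64_t tooverwrite,
    uint64_t tounref, uint64_t tohold, int snapshot_since_estimate)
{
	struct tx_space_est est;

	if (snapshot_since_estimate) {
		towrite += tooverwrite;
		tooverwrite = tofree = 0;
	}
	est.asize = asize_of(towrite + tooverwrite);
	est.fsize = asize_of(tooverwrite) + tofree;
	est.usize = asize_of(tounref);
	est.memory = towrite + tooverwrite + tohold;
	return (est);
}
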
1281 1281
1282static void 1282static void
1283dmu_tx_unassign(dmu_tx_t *tx) 1283dmu_tx_unassign(dmu_tx_t *tx)
1284{ 1284{
1285 dmu_tx_hold_t *txh; 1285 dmu_tx_hold_t *txh;
1286 1286
1287 if (tx->tx_txg == 0) 1287 if (tx->tx_txg == 0)
1288 return; 1288 return;
1289 1289
1290 txg_rele_to_quiesce(&tx->tx_txgh); 1290 txg_rele_to_quiesce(&tx->tx_txgh);
1291 1291
1292 /* 1292 /*
1293 * Walk the transaction's hold list, removing the hold on the 1293 * Walk the transaction's hold list, removing the hold on the
1294 * associated dnode, and notifying waiters if the refcount drops to 0. 1294 * associated dnode, and notifying waiters if the refcount drops to 0.
1295 */ 1295 */
1296 for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; 1296 for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
1297 txh = list_next(&tx->tx_holds, txh)) { 1297 txh = list_next(&tx->tx_holds, txh)) {
1298 dnode_t *dn = txh->txh_dnode; 1298 dnode_t *dn = txh->txh_dnode;
1299 1299
1300 if (dn == NULL) 1300 if (dn == NULL)
1301 continue; 1301 continue;
1302 mutex_enter(&dn->dn_mtx); 1302 mutex_enter(&dn->dn_mtx);
1303 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 1303 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1304 1304
1305 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { 1305 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1306 dn->dn_assigned_txg = 0; 1306 dn->dn_assigned_txg = 0;
1307 cv_broadcast(&dn->dn_notxholds); 1307 cv_broadcast(&dn->dn_notxholds);
1308 } 1308 }
1309 mutex_exit(&dn->dn_mtx); 1309 mutex_exit(&dn->dn_mtx);
1310 } 1310 }
1311 1311
1312 txg_rele_to_sync(&tx->tx_txgh); 1312 txg_rele_to_sync(&tx->tx_txgh);
1313 1313
1314 tx->tx_lasttried_txg = tx->tx_txg; 1314 tx->tx_lasttried_txg = tx->tx_txg;
1315 tx->tx_txg = 0; 1315 tx->tx_txg = 0;
1316} 1316}
1317 1317
1318/* 1318/*
1319 * Assign tx to a transaction group. txg_how can be one of: 1319 * Assign tx to a transaction group. txg_how can be one of:
1320 * 1320 *
1321 * (1) TXG_WAIT. If the current open txg is full, waits until there's 1321 * (1) TXG_WAIT. If the current open txg is full, waits until there's
1322 * a new one. This should be used when you're not holding locks. 1322 * a new one. This should be used when you're not holding locks.
1323 * It will only fail if we're truly out of space (or over quota). 1323 * It will only fail if we're truly out of space (or over quota).
1324 * 1324 *
1325 * (2) TXG_NOWAIT. If we can't assign into the current open txg without 1325 * (2) TXG_NOWAIT. If we can't assign into the current open txg without
1326 * blocking, returns immediately with ERESTART. This should be used 1326 * blocking, returns immediately with ERESTART. This should be used
1327 * whenever you're holding locks. On an ERESTART error, the caller 1327 * whenever you're holding locks. On an ERESTART error, the caller
1328 * should drop locks, do a dmu_tx_wait(tx), and try again. 1328 * should drop locks, do a dmu_tx_wait(tx), and try again.
1329 * 1329 *
1330 * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait() 1330 * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait()
1331 * has already been called on behalf of this operation (though 1331 * has already been called on behalf of this operation (though
1332 * most likely on a different tx). 1332 * most likely on a different tx).
1333 */ 1333 */
1334int 1334int
1335dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) 1335dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
1336{ 1336{
1337 int err; 1337 int err;
1338 1338
1339 ASSERT(tx->tx_txg == 0); 1339 ASSERT(tx->tx_txg == 0);
1340 ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT || 1340 ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
1341 txg_how == TXG_WAITED); 1341 txg_how == TXG_WAITED);
1342 ASSERT(!dsl_pool_sync_context(tx->tx_pool)); 1342 ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1343 1343
1344 /* If we might wait, we must not hold the config lock. */ 1344 /* If we might wait, we must not hold the config lock. */
1345 ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool)); 1345 ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1346 1346
1347 if (txg_how == TXG_WAITED) 1347 if (txg_how == TXG_WAITED)
1348 tx->tx_waited = B_TRUE; 1348 tx->tx_waited = B_TRUE;
1349 1349
1350 while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { 1350 while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1351 dmu_tx_unassign(tx); 1351 dmu_tx_unassign(tx);
1352 1352
1353 if (err != ERESTART || txg_how != TXG_WAIT) 1353 if (err != ERESTART || txg_how != TXG_WAIT)
1354 return (err); 1354 return (err);
1355 1355
1356 dmu_tx_wait(tx); 1356 dmu_tx_wait(tx);
1357 } 1357 }
1358 1358
1359 txg_rele_to_quiesce(&tx->tx_txgh); 1359 txg_rele_to_quiesce(&tx->tx_txgh);
1360 1360
1361 return (0); 1361 return (0);
1362} 1362}
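
The TXG_NOWAIT protocol described in the comment above is easiest to see from the caller's side. The sketch below shows the conventional retry loop with placeholder locking and a hypothetical function name and hold size; only the dmu_tx_*() calls are real DMU interfaces.

#include <sys/dmu.h>
#include <sys/errno.h>

/*
 * Illustrative caller of dmu_tx_assign() using TXG_NOWAIT: on ERESTART,
 * drop any locks, wait for the blocking condition to clear, and retry.
 * example_update() and the 512-byte hold are hypothetical.
 */
static int
example_update(objset_t *os, uint64_t object)
{
	dmu_tx_t *tx;
	int err;

top:
	/* ... acquire whatever locks the operation needs ... */
	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, 0, 512);
	err = dmu_tx_assign(tx, TXG_NOWAIT);
	if (err != 0) {
		/* ... drop the locks acquired above ... */
		if (err == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		return (err);
	}

	/* ... dirty buffers under this tx ... */

	dmu_tx_commit(tx);
	/* ... release locks ... */
	return (0);
}
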
1363 1363
1364void 1364void
1365dmu_tx_wait(dmu_tx_t *tx) 1365dmu_tx_wait(dmu_tx_t *tx)
1366{ 1366{
1367 spa_t *spa = tx->tx_pool->dp_spa; 1367 spa_t *spa = tx->tx_pool->dp_spa;
1368 dsl_pool_t *dp = tx->tx_pool; 1368 dsl_pool_t *dp = tx->tx_pool;
1369 1369
1370 ASSERT(tx->tx_txg == 0); 1370 ASSERT(tx->tx_txg == 0);
1371 ASSERT(!dsl_pool_config_held(tx->tx_pool)); 1371 ASSERT(!dsl_pool_config_held(tx->tx_pool));
1372 1372
1373 if (tx->tx_wait_dirty) { 1373 if (tx->tx_wait_dirty) {
1374 /* 1374 /*
1375 * dmu_tx_try_assign() has determined that we need to wait 1375 * dmu_tx_try_assign() has determined that we need to wait
1376 * because we've consumed much or all of the dirty buffer 1376 * because we've consumed much or all of the dirty buffer
1377 * space. 1377 * space.
1378 */ 1378 */
1379 mutex_enter(&dp->dp_lock); 1379 mutex_enter(&dp->dp_lock);
1380 while (dp->dp_dirty_total >= zfs_dirty_data_max) 1380 while (dp->dp_dirty_total >= zfs_dirty_data_max)
1381 cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); 1381 cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
1382 uint64_t dirty = dp->dp_dirty_total; 1382 uint64_t dirty = dp->dp_dirty_total;
1383 mutex_exit(&dp->dp_lock); 1383 mutex_exit(&dp->dp_lock);
1384 1384
1385 dmu_tx_delay(tx, dirty); 1385 dmu_tx_delay(tx, dirty);
1386 1386
1387 tx->tx_wait_dirty = B_FALSE; 1387 tx->tx_wait_dirty = B_FALSE;
1388 1388
1389 /* 1389 /*
1390 * Note: setting tx_waited only has effect if the caller 1390 * Note: setting tx_waited only has effect if the caller
1391			 * used TXG_WAIT. Otherwise they are going to destroy 1391			 * used TXG_WAIT. Otherwise they are going to destroy
1392			 * this tx and try again. The common case, zfs_write(), 1392			 * this tx and try again. The common case, zfs_write(),
1393			 * uses TXG_WAIT. 1393			 * uses TXG_WAIT.
1394 */ 1394 */
1395 tx->tx_waited = B_TRUE; 1395 tx->tx_waited = B_TRUE;
1396 } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { 1396 } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1397 /* 1397 /*
1398 * If the pool is suspended we need to wait until it 1398 * If the pool is suspended we need to wait until it
1399 * is resumed. Note that it's possible that the pool 1399 * is resumed. Note that it's possible that the pool
1400 * has become active after this thread has tried to 1400 * has become active after this thread has tried to
1401 * obtain a tx. If that's the case then tx_lasttried_txg 1401 * obtain a tx. If that's the case then tx_lasttried_txg
1402 * would not have been set. 1402 * would not have been set.
1403 */ 1403 */
1404 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); 1404 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1405 } else if (tx->tx_needassign_txh) { 1405 } else if (tx->tx_needassign_txh) {
1406 /* 1406 /*
1407 * A dnode is assigned to the quiescing txg. Wait for its 1407 * A dnode is assigned to the quiescing txg. Wait for its
1408 * transaction to complete. 1408 * transaction to complete.
1409 */ 1409 */
1410 dnode_t *dn = tx->tx_needassign_txh->txh_dnode; 1410 dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1411 1411
1412 mutex_enter(&dn->dn_mtx); 1412 mutex_enter(&dn->dn_mtx);
1413 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) 1413 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1414 cv_wait(&dn->dn_notxholds, &dn->dn_mtx); 1414 cv_wait(&dn->dn_notxholds, &dn->dn_mtx);

cvs diff -r1.10 -r1.11 src/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c (switch to unified diff)

--- src/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c 2019/07/23 07:46:22 1.10
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c 2020/03/09 15:37:46 1.11
@@ -5297,1999 +5297,1999 @@ spa_vdev_detach(spa_t *spa, uint64_t gui @@ -5297,1999 +5297,1999 @@ spa_vdev_detach(spa_t *spa, uint64_t gui
5297 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 5297 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
5298 mutex_enter(&spa_namespace_lock); 5298 mutex_enter(&spa_namespace_lock);
5299 spa_close(altspa, FTAG); 5299 spa_close(altspa, FTAG);
5300 } 5300 }
5301 mutex_exit(&spa_namespace_lock); 5301 mutex_exit(&spa_namespace_lock);
5302 5302
5303 /* search the rest of the vdevs for spares to remove */ 5303 /* search the rest of the vdevs for spares to remove */
5304 spa_vdev_resilver_done(spa); 5304 spa_vdev_resilver_done(spa);
5305 } 5305 }
5306 5306
5307 /* all done with the spa; OK to release */ 5307 /* all done with the spa; OK to release */
5308 mutex_enter(&spa_namespace_lock); 5308 mutex_enter(&spa_namespace_lock);
5309 spa_close(spa, FTAG); 5309 spa_close(spa, FTAG);
5310 mutex_exit(&spa_namespace_lock); 5310 mutex_exit(&spa_namespace_lock);
5311 5311
5312 return (error); 5312 return (error);
5313} 5313}
5314 5314
5315/* 5315/*
5316 * Split a set of devices from their mirrors, and create a new pool from them. 5316 * Split a set of devices from their mirrors, and create a new pool from them.
5317 */ 5317 */
5318int 5318int
5319spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 5319spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
5320 nvlist_t *props, boolean_t exp) 5320 nvlist_t *props, boolean_t exp)
5321{ 5321{
5322 int error = 0; 5322 int error = 0;
5323 uint64_t txg, *glist; 5323 uint64_t txg, *glist;
5324 spa_t *newspa; 5324 spa_t *newspa;
5325 uint_t c, children, lastlog; 5325 uint_t c, children, lastlog;
5326 nvlist_t **child, *nvl, *tmp; 5326 nvlist_t **child, *nvl, *tmp;
5327 dmu_tx_t *tx; 5327 dmu_tx_t *tx;
5328 char *altroot = NULL; 5328 char *altroot = NULL;
5329 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 5329 vdev_t *rvd, **vml = NULL; /* vdev modify list */
5330 boolean_t activate_slog; 5330 boolean_t activate_slog;
5331 5331
5332 ASSERT(spa_writeable(spa)); 5332 ASSERT(spa_writeable(spa));
5333 5333
5334 txg = spa_vdev_enter(spa); 5334 txg = spa_vdev_enter(spa);
5335 5335
5336 /* clear the log and flush everything up to now */ 5336 /* clear the log and flush everything up to now */
5337 activate_slog = spa_passivate_log(spa); 5337 activate_slog = spa_passivate_log(spa);
5338 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5338 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5339 error = spa_offline_log(spa); 5339 error = spa_offline_log(spa);
5340 txg = spa_vdev_config_enter(spa); 5340 txg = spa_vdev_config_enter(spa);
5341 5341
5342 if (activate_slog) 5342 if (activate_slog)
5343 spa_activate_log(spa); 5343 spa_activate_log(spa);
5344 5344
5345 if (error != 0) 5345 if (error != 0)
5346 return (spa_vdev_exit(spa, NULL, txg, error)); 5346 return (spa_vdev_exit(spa, NULL, txg, error));
5347 5347
5348 /* check new spa name before going any further */ 5348 /* check new spa name before going any further */
5349 if (spa_lookup(newname) != NULL) 5349 if (spa_lookup(newname) != NULL)
5350 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 5350 return (spa_vdev_exit(spa, NULL, txg, EEXIST));
5351 5351
5352 /* 5352 /*
5353 * scan through all the children to ensure they're all mirrors 5353 * scan through all the children to ensure they're all mirrors
5354 */ 5354 */
5355 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 5355 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
5356 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 5356 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
5357 &children) != 0) 5357 &children) != 0)
5358 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5358 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5359 5359
5360 /* first, check to ensure we've got the right child count */ 5360 /* first, check to ensure we've got the right child count */
5361 rvd = spa->spa_root_vdev; 5361 rvd = spa->spa_root_vdev;
5362 lastlog = 0; 5362 lastlog = 0;
5363 for (c = 0; c < rvd->vdev_children; c++) { 5363 for (c = 0; c < rvd->vdev_children; c++) {
5364 vdev_t *vd = rvd->vdev_child[c]; 5364 vdev_t *vd = rvd->vdev_child[c];
5365 5365
5366 /* don't count the holes & logs as children */ 5366 /* don't count the holes & logs as children */
5367 if (vd->vdev_islog || vd->vdev_ishole) { 5367 if (vd->vdev_islog || vd->vdev_ishole) {
5368 if (lastlog == 0) 5368 if (lastlog == 0)
5369 lastlog = c; 5369 lastlog = c;
5370 continue; 5370 continue;
5371 } 5371 }
5372 5372
5373 lastlog = 0; 5373 lastlog = 0;
5374 } 5374 }
5375 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 5375 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
5376 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5376 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5377 5377
5378 /* next, ensure no spare or cache devices are part of the split */ 5378 /* next, ensure no spare or cache devices are part of the split */
5379 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5379 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
5380 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5380 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
5381 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5381 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5382 5382
5383 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5383 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
5384 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5384 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
5385 5385
5386 /* then, loop over each vdev and validate it */ 5386 /* then, loop over each vdev and validate it */
5387 for (c = 0; c < children; c++) { 5387 for (c = 0; c < children; c++) {
5388 uint64_t is_hole = 0; 5388 uint64_t is_hole = 0;
5389 5389
5390 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5390 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
5391 &is_hole); 5391 &is_hole);
5392 5392
5393 if (is_hole != 0) { 5393 if (is_hole != 0) {
5394 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5394 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
5395 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5395 spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
5396 continue; 5396 continue;
5397 } else { 5397 } else {
5398 error = SET_ERROR(EINVAL); 5398 error = SET_ERROR(EINVAL);
5399 break; 5399 break;
5400 } 5400 }
5401 } 5401 }
5402 5402
5403 /* which disk is going to be split? */ 5403 /* which disk is going to be split? */
5404 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5404 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
5405 &glist[c]) != 0) { 5405 &glist[c]) != 0) {
5406 error = SET_ERROR(EINVAL); 5406 error = SET_ERROR(EINVAL);
5407 break; 5407 break;
5408 } 5408 }
5409 5409
5410 /* look it up in the spa */ 5410 /* look it up in the spa */
5411 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5411 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
5412 if (vml[c] == NULL) { 5412 if (vml[c] == NULL) {
5413 error = SET_ERROR(ENODEV); 5413 error = SET_ERROR(ENODEV);
5414 break; 5414 break;
5415 } 5415 }
5416 5416
5417 /* make sure there's nothing stopping the split */ 5417 /* make sure there's nothing stopping the split */
5418 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5418 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
5419 vml[c]->vdev_islog || 5419 vml[c]->vdev_islog ||
5420 vml[c]->vdev_ishole || 5420 vml[c]->vdev_ishole ||
5421 vml[c]->vdev_isspare || 5421 vml[c]->vdev_isspare ||
5422 vml[c]->vdev_isl2cache || 5422 vml[c]->vdev_isl2cache ||
5423 !vdev_writeable(vml[c]) || 5423 !vdev_writeable(vml[c]) ||
5424 vml[c]->vdev_children != 0 || 5424 vml[c]->vdev_children != 0 ||
5425 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5425 vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
5426 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5426 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
5427 error = SET_ERROR(EINVAL); 5427 error = SET_ERROR(EINVAL);
5428 break; 5428 break;
5429 } 5429 }
5430 5430
5431 if (vdev_dtl_required(vml[c])) { 5431 if (vdev_dtl_required(vml[c])) {
5432 error = SET_ERROR(EBUSY); 5432 error = SET_ERROR(EBUSY);
5433 break; 5433 break;
5434 } 5434 }
5435 5435
5436 /* we need certain info from the top level */ 5436 /* we need certain info from the top level */
5437 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5437 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
5438 vml[c]->vdev_top->vdev_ms_array) == 0); 5438 vml[c]->vdev_top->vdev_ms_array) == 0);
5439 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5439 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
5440 vml[c]->vdev_top->vdev_ms_shift) == 0); 5440 vml[c]->vdev_top->vdev_ms_shift) == 0);
5441 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5441 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
5442 vml[c]->vdev_top->vdev_asize) == 0); 5442 vml[c]->vdev_top->vdev_asize) == 0);
5443 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5443 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
5444 vml[c]->vdev_top->vdev_ashift) == 0); 5444 vml[c]->vdev_top->vdev_ashift) == 0);
5445 5445
5446 /* transfer per-vdev ZAPs */ 5446 /* transfer per-vdev ZAPs */
5447 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 5447 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
5448 VERIFY0(nvlist_add_uint64(child[c], 5448 VERIFY0(nvlist_add_uint64(child[c],
5449 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 5449 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
5450 5450
5451 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 5451 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
5452 VERIFY0(nvlist_add_uint64(child[c], 5452 VERIFY0(nvlist_add_uint64(child[c],
5453 ZPOOL_CONFIG_VDEV_TOP_ZAP, 5453 ZPOOL_CONFIG_VDEV_TOP_ZAP,
5454 vml[c]->vdev_parent->vdev_top_zap)); 5454 vml[c]->vdev_parent->vdev_top_zap));
5455 } 5455 }
5456 5456
5457 if (error != 0) { 5457 if (error != 0) {
5458 kmem_free(vml, children * sizeof (vdev_t *)); 5458 kmem_free(vml, children * sizeof (vdev_t *));
5459 kmem_free(glist, children * sizeof (uint64_t)); 5459 kmem_free(glist, children * sizeof (uint64_t));
5460 return (spa_vdev_exit(spa, NULL, txg, error)); 5460 return (spa_vdev_exit(spa, NULL, txg, error));
5461 } 5461 }
5462 5462
5463 /* stop writers from using the disks */ 5463 /* stop writers from using the disks */
5464 for (c = 0; c < children; c++) { 5464 for (c = 0; c < children; c++) {
5465 if (vml[c] != NULL) 5465 if (vml[c] != NULL)
5466 vml[c]->vdev_offline = B_TRUE; 5466 vml[c]->vdev_offline = B_TRUE;
5467 } 5467 }
5468 vdev_reopen(spa->spa_root_vdev); 5468 vdev_reopen(spa->spa_root_vdev);
5469 5469
5470 /* 5470 /*
5471 * Temporarily record the splitting vdevs in the spa config. This 5471 * Temporarily record the splitting vdevs in the spa config. This
5472 * will disappear once the config is regenerated. 5472 * will disappear once the config is regenerated.
5473 */ 5473 */
5474 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5474 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5475 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5475 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
5476 glist, children) == 0); 5476 glist, children) == 0);
5477 kmem_free(glist, children * sizeof (uint64_t)); 5477 kmem_free(glist, children * sizeof (uint64_t));
5478 5478
5479 mutex_enter(&spa->spa_props_lock); 5479 mutex_enter(&spa->spa_props_lock);
5480 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5480 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
5481 nvl) == 0); 5481 nvl) == 0);
5482 mutex_exit(&spa->spa_props_lock); 5482 mutex_exit(&spa->spa_props_lock);
5483 spa->spa_config_splitting = nvl; 5483 spa->spa_config_splitting = nvl;
5484 vdev_config_dirty(spa->spa_root_vdev); 5484 vdev_config_dirty(spa->spa_root_vdev);
5485 5485
5486 /* configure and create the new pool */ 5486 /* configure and create the new pool */
5487 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5487 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
5488 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5488 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
5489 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5489 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
5490 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5490 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
5491 spa_version(spa)) == 0); 5491 spa_version(spa)) == 0);
5492 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5492 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
5493 spa->spa_config_txg) == 0); 5493 spa->spa_config_txg) == 0);
5494 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5494 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
5495 spa_generate_guid(NULL)) == 0); 5495 spa_generate_guid(NULL)) == 0);
5496 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 5496 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
5497 (void) nvlist_lookup_string(props, 5497 (void) nvlist_lookup_string(props,
5498 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5498 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
5499 5499
5500 /* add the new pool to the namespace */ 5500 /* add the new pool to the namespace */
5501 newspa = spa_add(newname, config, altroot); 5501 newspa = spa_add(newname, config, altroot);
5502 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 5502 newspa->spa_avz_action = AVZ_ACTION_REBUILD;
5503 newspa->spa_config_txg = spa->spa_config_txg; 5503 newspa->spa_config_txg = spa->spa_config_txg;
5504 spa_set_log_state(newspa, SPA_LOG_CLEAR); 5504 spa_set_log_state(newspa, SPA_LOG_CLEAR);
5505 5505
5506 /* release the spa config lock, retaining the namespace lock */ 5506 /* release the spa config lock, retaining the namespace lock */
5507 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5507 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5508 5508
5509 if (zio_injection_enabled) 5509 if (zio_injection_enabled)
5510 zio_handle_panic_injection(spa, FTAG, 1); 5510 zio_handle_panic_injection(spa, FTAG, 1);
5511 5511
5512 spa_activate(newspa, spa_mode_global); 5512 spa_activate(newspa, spa_mode_global);
5513 spa_async_suspend(newspa); 5513 spa_async_suspend(newspa);
5514 5514
5515#ifndef illumos 5515#ifndef illumos
5516 /* mark that we are creating new spa by splitting */ 5516 /* mark that we are creating new spa by splitting */
5517 newspa->spa_splitting_newspa = B_TRUE; 5517 newspa->spa_splitting_newspa = B_TRUE;
5518#endif 5518#endif
5519 /* create the new pool from the disks of the original pool */ 5519 /* create the new pool from the disks of the original pool */
5520 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5520 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
5521#ifndef illumos 5521#ifndef illumos
5522 newspa->spa_splitting_newspa = B_FALSE; 5522 newspa->spa_splitting_newspa = B_FALSE;
5523#endif 5523#endif
5524 if (error) 5524 if (error)
5525 goto out; 5525 goto out;
5526 5526
5527 /* if that worked, generate a real config for the new pool */ 5527 /* if that worked, generate a real config for the new pool */
5528 if (newspa->spa_root_vdev != NULL) { 5528 if (newspa->spa_root_vdev != NULL) {
5529 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5529 VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
5530 NV_UNIQUE_NAME, KM_SLEEP) == 0); 5530 NV_UNIQUE_NAME, KM_SLEEP) == 0);
5531 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5531 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
5532 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5532 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
5533 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5533 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
5534 B_TRUE)); 5534 B_TRUE));
5535 } 5535 }
5536 5536
5537 /* set the props */ 5537 /* set the props */
5538 if (props != NULL) { 5538 if (props != NULL) {
5539 spa_configfile_set(newspa, props, B_FALSE); 5539 spa_configfile_set(newspa, props, B_FALSE);
5540 error = spa_prop_set(newspa, props); 5540 error = spa_prop_set(newspa, props);
5541 if (error) 5541 if (error)
5542 goto out; 5542 goto out;
5543 } 5543 }
5544 5544
5545 /* flush everything */ 5545 /* flush everything */
5546 txg = spa_vdev_config_enter(newspa); 5546 txg = spa_vdev_config_enter(newspa);
5547 vdev_config_dirty(newspa->spa_root_vdev); 5547 vdev_config_dirty(newspa->spa_root_vdev);
5548 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5548 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
5549 5549
5550 if (zio_injection_enabled) 5550 if (zio_injection_enabled)
5551 zio_handle_panic_injection(spa, FTAG, 2); 5551 zio_handle_panic_injection(spa, FTAG, 2);
5552 5552
5553 spa_async_resume(newspa); 5553 spa_async_resume(newspa);
5554 5554
5555 /* finally, update the original pool's config */ 5555 /* finally, update the original pool's config */
5556 txg = spa_vdev_config_enter(spa); 5556 txg = spa_vdev_config_enter(spa);
5557 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5557 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
5558 error = dmu_tx_assign(tx, TXG_WAIT); 5558 error = dmu_tx_assign(tx, TXG_WAIT);
5559 if (error != 0) 5559 if (error != 0)
5560 dmu_tx_abort(tx); 5560 dmu_tx_abort(tx);
5561 for (c = 0; c < children; c++) { 5561 for (c = 0; c < children; c++) {
5562 if (vml[c] != NULL) { 5562 if (vml[c] != NULL) {
5563 vdev_split(vml[c]); 5563 vdev_split(vml[c]);
5564 if (error == 0) 5564 if (error == 0)
5565 spa_history_log_internal(spa, "detach", tx, 5565 spa_history_log_internal(spa, "detach", tx,
5566 "vdev=%s", vml[c]->vdev_path); 5566 "vdev=%s", vml[c]->vdev_path);
5567 5567
5568 vdev_free(vml[c]); 5568 vdev_free(vml[c]);
5569 } 5569 }
5570 } 5570 }
5571 spa->spa_avz_action = AVZ_ACTION_REBUILD; 5571 spa->spa_avz_action = AVZ_ACTION_REBUILD;
5572 vdev_config_dirty(spa->spa_root_vdev); 5572 vdev_config_dirty(spa->spa_root_vdev);
5573 spa->spa_config_splitting = NULL; 5573 spa->spa_config_splitting = NULL;
5574 nvlist_free(nvl); 5574 nvlist_free(nvl);
5575 if (error == 0) 5575 if (error == 0)
5576 dmu_tx_commit(tx); 5576 dmu_tx_commit(tx);
5577 (void) spa_vdev_exit(spa, NULL, txg, 0); 5577 (void) spa_vdev_exit(spa, NULL, txg, 0);
5578 5578
5579 if (zio_injection_enabled) 5579 if (zio_injection_enabled)
5580 zio_handle_panic_injection(spa, FTAG, 3); 5580 zio_handle_panic_injection(spa, FTAG, 3);
5581 5581
5582 /* split is complete; log a history record */ 5582 /* split is complete; log a history record */
5583 spa_history_log_internal(newspa, "split", NULL, 5583 spa_history_log_internal(newspa, "split", NULL,
5584 "from pool %s", spa_name(spa)); 5584 "from pool %s", spa_name(spa));
5585 5585
5586 kmem_free(vml, children * sizeof (vdev_t *)); 5586 kmem_free(vml, children * sizeof (vdev_t *));
5587 5587
5588 /* if we're not going to mount the filesystems in userland, export */ 5588 /* if we're not going to mount the filesystems in userland, export */
5589 if (exp) 5589 if (exp)
5590 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5590 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
5591 B_FALSE, B_FALSE); 5591 B_FALSE, B_FALSE);
5592 5592
5593 return (error); 5593 return (error);
5594 5594
5595out: 5595out:
5596 spa_unload(newspa); 5596 spa_unload(newspa);
5597 spa_deactivate(newspa); 5597 spa_deactivate(newspa);
5598 spa_remove(newspa); 5598 spa_remove(newspa);
5599 5599
5600 txg = spa_vdev_config_enter(spa); 5600 txg = spa_vdev_config_enter(spa);
5601 5601
5602 /* re-online all offlined disks */ 5602 /* re-online all offlined disks */
5603 for (c = 0; c < children; c++) { 5603 for (c = 0; c < children; c++) {
5604 if (vml[c] != NULL) 5604 if (vml[c] != NULL)
5605 vml[c]->vdev_offline = B_FALSE; 5605 vml[c]->vdev_offline = B_FALSE;
5606 } 5606 }
5607 vdev_reopen(spa->spa_root_vdev); 5607 vdev_reopen(spa->spa_root_vdev);
5608 5608
5609 nvlist_free(spa->spa_config_splitting); 5609 nvlist_free(spa->spa_config_splitting);
5610 spa->spa_config_splitting = NULL; 5610 spa->spa_config_splitting = NULL;
5611 (void) spa_vdev_exit(spa, NULL, txg, error); 5611 (void) spa_vdev_exit(spa, NULL, txg, error);
5612 5612
5613 kmem_free(vml, children * sizeof (vdev_t *)); 5613 kmem_free(vml, children * sizeof (vdev_t *));
5614 return (error); 5614 return (error);
5615} 5615}
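
The child-count check near the top of spa_vdev_split_mirror() (the lastlog bookkeeping) determines how many entries the split request must supply: trailing log and hole vdevs are not counted, but any regular vdev after them resets the run. A standalone sketch of just that rule, using a plain flag array in place of the vdev_islog/vdev_ishole fields:

/*
 * Illustrative only: given which top-level vdevs are logs or holes,
 * return how many entries a split request must provide.  Mirrors the
 * lastlog accounting at the top of spa_vdev_split_mirror().
 */
static unsigned
expected_split_children(const int *is_log_or_hole, unsigned nchildren)
{
	unsigned c, lastlog = 0;

	for (c = 0; c < nchildren; c++) {
		if (is_log_or_hole[c]) {
			if (lastlog == 0)
				lastlog = c;	/* possible start of a trailing run */
			continue;
		}
		lastlog = 0;	/* a regular vdev resets the run */
	}
	return (lastlog != 0 ? lastlog : nchildren);
}

For instance, {mirror, mirror, log} yields 2, while {mirror, log, mirror} yields 3.
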
5616 5616
5617static nvlist_t * 5617static nvlist_t *
5618spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5618spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
5619{ 5619{
5620 for (int i = 0; i < count; i++) { 5620 for (int i = 0; i < count; i++) {
5621 uint64_t guid; 5621 uint64_t guid;
5622 5622
5623 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5623 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
5624 &guid) == 0); 5624 &guid) == 0);
5625 5625
5626 if (guid == target_guid) 5626 if (guid == target_guid)
5627 return (nvpp[i]); 5627 return (nvpp[i]);
5628 } 5628 }
5629 5629
5630 return (NULL); 5630 return (NULL);
5631} 5631}
5632 5632
5633static void 5633static void
5634spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5634spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
5635 nvlist_t *dev_to_remove) 5635 nvlist_t *dev_to_remove)
5636{ 5636{
5637 nvlist_t **newdev = NULL; 5637 nvlist_t **newdev = NULL;
5638 5638
5639 if (count > 1) 5639 if (count > 1)
5640 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5640 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
5641 5641
5642 for (int i = 0, j = 0; i < count; i++) { 5642 for (int i = 0, j = 0; i < count; i++) {
5643 if (dev[i] == dev_to_remove) 5643 if (dev[i] == dev_to_remove)
5644 continue; 5644 continue;
5645 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5645 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
5646 } 5646 }
5647 5647
5648 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5648 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
5649 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5649 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
5650 5650
5651 for (int i = 0; i < count - 1; i++) 5651 for (int i = 0; i < count - 1; i++)
5652 nvlist_free(newdev[i]); 5652 nvlist_free(newdev[i]);
5653 5653
5654 if (count > 1) 5654 if (count > 1)
5655 kmem_free(newdev, (count - 1) * sizeof (void *)); 5655 kmem_free(newdev, (count - 1) * sizeof (void *));
5656} 5656}
5657 5657
5658/* 5658/*
5659 * Evacuate the device. 5659 * Evacuate the device.
5660 */ 5660 */
5661static int 5661static int
5662spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5662spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
5663{ 5663{
5664 uint64_t txg; 5664 uint64_t txg;
5665 int error = 0; 5665 int error = 0;
5666 5666
5667 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5667 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5668 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5668 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5669 ASSERT(vd == vd->vdev_top); 5669 ASSERT(vd == vd->vdev_top);
5670 5670
5671 /* 5671 /*
5672 * Evacuate the device. We don't hold the config lock as writer 5672 * Evacuate the device. We don't hold the config lock as writer
5673 * since we need to do I/O but we do keep the 5673 * since we need to do I/O but we do keep the
5674 * spa_namespace_lock held. Once this completes the device 5674 * spa_namespace_lock held. Once this completes the device
5675 * should no longer have any blocks allocated on it. 5675 * should no longer have any blocks allocated on it.
5676 */ 5676 */
5677 if (vd->vdev_islog) { 5677 if (vd->vdev_islog) {
5678 if (vd->vdev_stat.vs_alloc != 0) 5678 if (vd->vdev_stat.vs_alloc != 0)
5679 error = spa_offline_log(spa); 5679 error = spa_offline_log(spa);
5680 } else { 5680 } else {
5681 error = SET_ERROR(ENOTSUP); 5681 error = SET_ERROR(ENOTSUP);
5682 } 5682 }
5683 5683
5684 if (error) 5684 if (error)
5685 return (error); 5685 return (error);
5686 5686
5687 /* 5687 /*
5688 * The evacuation succeeded. Remove any remaining MOS metadata 5688 * The evacuation succeeded. Remove any remaining MOS metadata
5689 * associated with this vdev, and wait for these changes to sync. 5689 * associated with this vdev, and wait for these changes to sync.
5690 */ 5690 */
5691 ASSERT0(vd->vdev_stat.vs_alloc); 5691 ASSERT0(vd->vdev_stat.vs_alloc);
5692 txg = spa_vdev_config_enter(spa); 5692 txg = spa_vdev_config_enter(spa);
5693 vd->vdev_removing = B_TRUE; 5693 vd->vdev_removing = B_TRUE;
5694 vdev_dirty_leaves(vd, VDD_DTL, txg); 5694 vdev_dirty_leaves(vd, VDD_DTL, txg);
5695 vdev_config_dirty(vd); 5695 vdev_config_dirty(vd);
5696 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5696 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5697 5697
5698 return (0); 5698 return (0);
5699} 5699}
5700 5700
5701/* 5701/*
5702 * Complete the removal by cleaning up the namespace. 5702 * Complete the removal by cleaning up the namespace.
5703 */ 5703 */
5704static void 5704static void
5705spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5705spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
5706{ 5706{
5707 vdev_t *rvd = spa->spa_root_vdev; 5707 vdev_t *rvd = spa->spa_root_vdev;
5708 uint64_t id = vd->vdev_id; 5708 uint64_t id = vd->vdev_id;
5709 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5709 boolean_t last_vdev = (id == (rvd->vdev_children - 1));
5710 5710
5711 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5711 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5712 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5712 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5713 ASSERT(vd == vd->vdev_top); 5713 ASSERT(vd == vd->vdev_top);
5714 5714
5715 /* 5715 /*
5716 * Only remove any devices which are empty. 5716 * Only remove any devices which are empty.
5717 */ 5717 */
5718 if (vd->vdev_stat.vs_alloc != 0) 5718 if (vd->vdev_stat.vs_alloc != 0)
5719 return; 5719 return;
5720 5720
5721 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5721 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
5722 5722
5723 if (list_link_active(&vd->vdev_state_dirty_node)) 5723 if (list_link_active(&vd->vdev_state_dirty_node))
5724 vdev_state_clean(vd); 5724 vdev_state_clean(vd);
5725 if (list_link_active(&vd->vdev_config_dirty_node)) 5725 if (list_link_active(&vd->vdev_config_dirty_node))
5726 vdev_config_clean(vd); 5726 vdev_config_clean(vd);
5727 5727
5728 vdev_free(vd); 5728 vdev_free(vd);
5729 5729
5730 if (last_vdev) { 5730 if (last_vdev) {
5731 vdev_compact_children(rvd); 5731 vdev_compact_children(rvd);
5732 } else { 5732 } else {
5733 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5733 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
5734 vdev_add_child(rvd, vd); 5734 vdev_add_child(rvd, vd);
5735 } 5735 }
5736 vdev_config_dirty(rvd); 5736 vdev_config_dirty(rvd);
5737 5737
5738 /* 5738 /*
5739 * Reassess the health of our root vdev. 5739 * Reassess the health of our root vdev.
5740 */ 5740 */
5741 vdev_reopen(rvd); 5741 vdev_reopen(rvd);
5742} 5742}
5743 5743
5744/* 5744/*
5745 * Remove a device from the pool - 5745 * Remove a device from the pool -
5746 * 5746 *
5747 * Removing a device from the vdev namespace requires several steps 5747 * Removing a device from the vdev namespace requires several steps
5748 * and can take a significant amount of time. As a result we use 5748 * and can take a significant amount of time. As a result we use
5749 * the spa_vdev_config_[enter/exit] functions which allow us to 5749 * the spa_vdev_config_[enter/exit] functions which allow us to
5750 * grab and release the spa_config_lock while still holding the namespace 5750 * grab and release the spa_config_lock while still holding the namespace
5751 * lock. During each step the configuration is synced out. 5751 * lock. During each step the configuration is synced out.
5752 * 5752 *
5753 * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5753 * Currently, this supports removing only hot spares, slogs, and level 2 ARC
5754 * devices. 5754 * devices.
5755 */ 5755 */
5756int 5756int
5757spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5757spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
5758{ 5758{
5759 vdev_t *vd; 5759 vdev_t *vd;
5760 sysevent_t *ev = NULL; 5760 sysevent_t *ev = NULL;
5761 metaslab_group_t *mg; 5761 metaslab_group_t *mg;
5762 nvlist_t **spares, **l2cache, *nv; 5762 nvlist_t **spares, **l2cache, *nv;
5763 uint64_t txg = 0; 5763 uint64_t txg = 0;
5764 uint_t nspares, nl2cache; 5764 uint_t nspares, nl2cache;
5765 int error = 0; 5765 int error = 0;
5766 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5766 boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
5767 5767
5768 ASSERT(spa_writeable(spa)); 5768 ASSERT(spa_writeable(spa));
5769 5769
5770 if (!locked) 5770 if (!locked)
5771 txg = spa_vdev_enter(spa); 5771 txg = spa_vdev_enter(spa);
5772 5772
5773 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5773 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
5774 5774
5775 if (spa->spa_spares.sav_vdevs != NULL && 5775 if (spa->spa_spares.sav_vdevs != NULL &&
5776 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5776 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
5777 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5777 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
5778 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5778 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
5779 /* 5779 /*
5780 * Only remove the hot spare if it's not currently in use 5780 * Only remove the hot spare if it's not currently in use
5781 * in this pool. 5781 * in this pool.
5782 */ 5782 */
5783 if (vd == NULL || unspare) { 5783 if (vd == NULL || unspare) {
5784 if (vd == NULL) 5784 if (vd == NULL)
5785 vd = spa_lookup_by_guid(spa, guid, B_TRUE); 5785 vd = spa_lookup_by_guid(spa, guid, B_TRUE);
5786 ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX); 5786 ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX);
5787 spa_vdev_remove_aux(spa->spa_spares.sav_config, 5787 spa_vdev_remove_aux(spa->spa_spares.sav_config,
5788 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5788 ZPOOL_CONFIG_SPARES, spares, nspares, nv);
5789 spa_load_spares(spa); 5789 spa_load_spares(spa);
5790 spa->spa_spares.sav_sync = B_TRUE; 5790 spa->spa_spares.sav_sync = B_TRUE;
5791 } else { 5791 } else {
5792 error = SET_ERROR(EBUSY); 5792 error = SET_ERROR(EBUSY);
5793 } 5793 }
5794 } else if (spa->spa_l2cache.sav_vdevs != NULL && 5794 } else if (spa->spa_l2cache.sav_vdevs != NULL &&
5795 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5795 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
5796 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5796 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
5797 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5797 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
5798 /* 5798 /*
5799 * Cache devices can always be removed. 5799 * Cache devices can always be removed.
5800 */ 5800 */
5801 vd = spa_lookup_by_guid(spa, guid, B_TRUE); 5801 vd = spa_lookup_by_guid(spa, guid, B_TRUE);
5802 ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX); 5802 ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX);
5803 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5803 spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
5804 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5804 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
5805 spa_load_l2cache(spa); 5805 spa_load_l2cache(spa);
5806 spa->spa_l2cache.sav_sync = B_TRUE; 5806 spa->spa_l2cache.sav_sync = B_TRUE;
5807 } else if (vd != NULL && vd->vdev_islog) { 5807 } else if (vd != NULL && vd->vdev_islog) {
5808 ASSERT(!locked); 5808 ASSERT(!locked);
5809 ASSERT(vd == vd->vdev_top); 5809 ASSERT(vd == vd->vdev_top);
5810 5810
5811 mg = vd->vdev_mg; 5811 mg = vd->vdev_mg;
5812 5812
5813 /* 5813 /*
5814 * Stop allocating from this vdev. 5814 * Stop allocating from this vdev.
5815 */ 5815 */
5816 metaslab_group_passivate(mg); 5816 metaslab_group_passivate(mg);
5817 5817
5818 /* 5818 /*
5819 * Wait for the youngest allocations and frees to sync, 5819 * Wait for the youngest allocations and frees to sync,
5820 * and then wait for the deferral of those frees to finish. 5820 * and then wait for the deferral of those frees to finish.
5821 */ 5821 */
5822 spa_vdev_config_exit(spa, NULL, 5822 spa_vdev_config_exit(spa, NULL,
5823 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5823 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
5824 5824
5825 /* 5825 /*
5826 * Attempt to evacuate the vdev. 5826 * Attempt to evacuate the vdev.
5827 */ 5827 */
5828 error = spa_vdev_remove_evacuate(spa, vd); 5828 error = spa_vdev_remove_evacuate(spa, vd);
5829 5829
5830 txg = spa_vdev_config_enter(spa); 5830 txg = spa_vdev_config_enter(spa);
5831 5831
5832 /* 5832 /*
5833 * If we couldn't evacuate the vdev, unwind. 5833 * If we couldn't evacuate the vdev, unwind.
5834 */ 5834 */
5835 if (error) { 5835 if (error) {
5836 metaslab_group_activate(mg); 5836 metaslab_group_activate(mg);
5837 return (spa_vdev_exit(spa, NULL, txg, error)); 5837 return (spa_vdev_exit(spa, NULL, txg, error));
5838 } 5838 }
5839 5839
5840 /* 5840 /*
5841 * Clean up the vdev namespace. 5841 * Clean up the vdev namespace.
5842 */ 5842 */
5843 ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_DEV); 5843 ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_DEV);
5844 spa_vdev_remove_from_namespace(spa, vd); 5844 spa_vdev_remove_from_namespace(spa, vd);
5845 5845
5846 } else if (vd != NULL) { 5846 } else if (vd != NULL) {
5847 /* 5847 /*
5848 * Normal vdevs cannot be removed (yet). 5848 * Normal vdevs cannot be removed (yet).
5849 */ 5849 */
5850 error = SET_ERROR(ENOTSUP); 5850 error = SET_ERROR(ENOTSUP);
5851 } else { 5851 } else {
5852 /* 5852 /*
5853 * There is no vdev of any kind with the specified guid. 5853 * There is no vdev of any kind with the specified guid.
5854 */ 5854 */
5855 error = SET_ERROR(ENOENT); 5855 error = SET_ERROR(ENOENT);
5856 } 5856 }
5857 5857
5858 if (!locked) 5858 if (!locked)
5859 error = spa_vdev_exit(spa, NULL, txg, error); 5859 error = spa_vdev_exit(spa, NULL, txg, error);
5860 5860
5861 if (ev) 5861 if (ev)
5862 spa_event_post(ev); 5862 spa_event_post(ev);
5863 5863
5864 return (error); 5864 return (error);
5865} 5865}
5866 5866
5867/* 5867/*
5868 * Find any device that's done replacing, or a vdev marked 'unspare' that's 5868 * Find any device that's done replacing, or a vdev marked 'unspare' that's
5869 * currently spared, so we can detach it. 5869 * currently spared, so we can detach it.
5870 */ 5870 */
5871static vdev_t * 5871static vdev_t *
5872spa_vdev_resilver_done_hunt(vdev_t *vd) 5872spa_vdev_resilver_done_hunt(vdev_t *vd)
5873{ 5873{
5874 vdev_t *newvd, *oldvd; 5874 vdev_t *newvd, *oldvd;
5875 5875
5876 for (int c = 0; c < vd->vdev_children; c++) { 5876 for (int c = 0; c < vd->vdev_children; c++) {
5877 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5877 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
5878 if (oldvd != NULL) 5878 if (oldvd != NULL)
5879 return (oldvd); 5879 return (oldvd);
5880 } 5880 }
5881 5881
5882 /* 5882 /*
5883 * Check for a completed replacement. We always consider the first 5883 * Check for a completed replacement. We always consider the first
5884 * vdev in the list to be the oldest vdev, and the last one to be 5884 * vdev in the list to be the oldest vdev, and the last one to be
5885 * the newest (see spa_vdev_attach() for how that works). In 5885 * the newest (see spa_vdev_attach() for how that works). In
5886 * the case where the newest vdev is faulted, we will not automatically 5886 * the case where the newest vdev is faulted, we will not automatically
5887 * remove it after a resilver completes. This is OK as it will require 5887 * remove it after a resilver completes. This is OK as it will require
5888 * user intervention to determine which disk the admin wishes to keep. 5888 * user intervention to determine which disk the admin wishes to keep.
5889 */ 5889 */
5890 if (vd->vdev_ops == &vdev_replacing_ops) { 5890 if (vd->vdev_ops == &vdev_replacing_ops) {
5891 ASSERT(vd->vdev_children > 1); 5891 ASSERT(vd->vdev_children > 1);
5892 5892
5893 newvd = vd->vdev_child[vd->vdev_children - 1]; 5893 newvd = vd->vdev_child[vd->vdev_children - 1];
5894 oldvd = vd->vdev_child[0]; 5894 oldvd = vd->vdev_child[0];
5895 5895
5896 if (vdev_dtl_empty(newvd, DTL_MISSING) && 5896 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
5897 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5897 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5898 !vdev_dtl_required(oldvd)) 5898 !vdev_dtl_required(oldvd))
5899 return (oldvd); 5899 return (oldvd);
5900 } 5900 }
5901 5901
5902 /* 5902 /*
5903 * Check for a completed resilver with the 'unspare' flag set. 5903 * Check for a completed resilver with the 'unspare' flag set.
5904 */ 5904 */
5905 if (vd->vdev_ops == &vdev_spare_ops) { 5905 if (vd->vdev_ops == &vdev_spare_ops) {
5906 vdev_t *first = vd->vdev_child[0]; 5906 vdev_t *first = vd->vdev_child[0];
5907 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5907 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
5908 5908
5909 if (last->vdev_unspare) { 5909 if (last->vdev_unspare) {
5910 oldvd = first; 5910 oldvd = first;
5911 newvd = last; 5911 newvd = last;
5912 } else if (first->vdev_unspare) { 5912 } else if (first->vdev_unspare) {
5913 oldvd = last; 5913 oldvd = last;
5914 newvd = first; 5914 newvd = first;
5915 } else { 5915 } else {
5916 oldvd = NULL; 5916 oldvd = NULL;
5917 } 5917 }
5918 5918
5919 if (oldvd != NULL && 5919 if (oldvd != NULL &&
5920 vdev_dtl_empty(newvd, DTL_MISSING) && 5920 vdev_dtl_empty(newvd, DTL_MISSING) &&
5921 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5921 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5922 !vdev_dtl_required(oldvd)) 5922 !vdev_dtl_required(oldvd))
5923 return (oldvd); 5923 return (oldvd);
5924 5924
5925 /* 5925 /*
5926 * If there are more than two spares attached to a disk, 5926 * If there are more than two spares attached to a disk,
5927 * and those spares are not required, then we want to 5927 * and those spares are not required, then we want to
5928 * attempt to free them up now so that they can be used 5928 * attempt to free them up now so that they can be used
5929 * by other pools. Once we're back down to a single 5929 * by other pools. Once we're back down to a single
5930 * disk+spare, we stop removing them. 5930 * disk+spare, we stop removing them.
5931 */ 5931 */
5932 if (vd->vdev_children > 2) { 5932 if (vd->vdev_children > 2) {
5933 newvd = vd->vdev_child[1]; 5933 newvd = vd->vdev_child[1];
5934 5934
5935 if (newvd->vdev_isspare && last->vdev_isspare && 5935 if (newvd->vdev_isspare && last->vdev_isspare &&
5936 vdev_dtl_empty(last, DTL_MISSING) && 5936 vdev_dtl_empty(last, DTL_MISSING) &&
5937 vdev_dtl_empty(last, DTL_OUTAGE) && 5937 vdev_dtl_empty(last, DTL_OUTAGE) &&
5938 !vdev_dtl_required(newvd)) 5938 !vdev_dtl_required(newvd))
5939 return (newvd); 5939 return (newvd);
5940 } 5940 }
5941 } 5941 }
5942 5942
5943 return (NULL); 5943 return (NULL);
5944} 5944}
5945 5945
5946static void 5946static void
5947spa_vdev_resilver_done(spa_t *spa) 5947spa_vdev_resilver_done(spa_t *spa)
5948{ 5948{
5949 vdev_t *vd, *pvd, *ppvd; 5949 vdev_t *vd, *pvd, *ppvd;
5950 uint64_t guid, sguid, pguid, ppguid; 5950 uint64_t guid, sguid, pguid, ppguid;
5951 5951
5952 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5952 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5953 5953
5954 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5954 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
5955 pvd = vd->vdev_parent; 5955 pvd = vd->vdev_parent;
5956 ppvd = pvd->vdev_parent; 5956 ppvd = pvd->vdev_parent;
5957 guid = vd->vdev_guid; 5957 guid = vd->vdev_guid;
5958 pguid = pvd->vdev_guid; 5958 pguid = pvd->vdev_guid;
5959 ppguid = ppvd->vdev_guid; 5959 ppguid = ppvd->vdev_guid;
5960 sguid = 0; 5960 sguid = 0;
5961 /* 5961 /*
5962 * If we have just finished replacing a hot spared device, then 5962 * If we have just finished replacing a hot spared device, then
5963 * we need to detach the parent's first child (the original hot 5963 * we need to detach the parent's first child (the original hot
5964 * spare) as well. 5964 * spare) as well.
5965 */ 5965 */
5966 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5966 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
5967 ppvd->vdev_children == 2) { 5967 ppvd->vdev_children == 2) {
5968 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5968 ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
5969 sguid = ppvd->vdev_child[1]->vdev_guid; 5969 sguid = ppvd->vdev_child[1]->vdev_guid;
5970 } 5970 }
5971 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5971 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
5972 5972
5973 spa_config_exit(spa, SCL_ALL, FTAG); 5973 spa_config_exit(spa, SCL_ALL, FTAG);
5974 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5974 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
5975 return; 5975 return;
5976 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5976 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
5977 return; 5977 return;
5978 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5978 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5979 } 5979 }
5980 5980
5981 spa_config_exit(spa, SCL_ALL, FTAG); 5981 spa_config_exit(spa, SCL_ALL, FTAG);
5982} 5982}
5983 5983
5984/* 5984/*
5985 * Update the stored path or FRU for this vdev. 5985 * Update the stored path or FRU for this vdev.
5986 */ 5986 */
5987int 5987int
5988spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5988spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
5989 boolean_t ispath) 5989 boolean_t ispath)
5990{ 5990{
5991 vdev_t *vd; 5991 vdev_t *vd;
5992 boolean_t sync = B_FALSE; 5992 boolean_t sync = B_FALSE;
5993 5993
5994 ASSERT(spa_writeable(spa)); 5994 ASSERT(spa_writeable(spa));
5995 5995
5996 spa_vdev_state_enter(spa, SCL_ALL); 5996 spa_vdev_state_enter(spa, SCL_ALL);
5997 5997
5998 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5998 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
5999 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5999 return (spa_vdev_state_exit(spa, NULL, ENOENT));
6000 6000
6001 if (!vd->vdev_ops->vdev_op_leaf) 6001 if (!vd->vdev_ops->vdev_op_leaf)
6002 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 6002 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
6003 6003
6004 if (ispath) { 6004 if (ispath) {
6005 if (strcmp(value, vd->vdev_path) != 0) { 6005 if (strcmp(value, vd->vdev_path) != 0) {
6006 spa_strfree(vd->vdev_path); 6006 spa_strfree(vd->vdev_path);
6007 vd->vdev_path = spa_strdup(value); 6007 vd->vdev_path = spa_strdup(value);
6008 sync = B_TRUE; 6008 sync = B_TRUE;
6009 } 6009 }
6010 } else { 6010 } else {
6011 if (vd->vdev_fru == NULL) { 6011 if (vd->vdev_fru == NULL) {
6012 vd->vdev_fru = spa_strdup(value); 6012 vd->vdev_fru = spa_strdup(value);
6013 sync = B_TRUE; 6013 sync = B_TRUE;
6014 } else if (strcmp(value, vd->vdev_fru) != 0) { 6014 } else if (strcmp(value, vd->vdev_fru) != 0) {
6015 spa_strfree(vd->vdev_fru); 6015 spa_strfree(vd->vdev_fru);
6016 vd->vdev_fru = spa_strdup(value); 6016 vd->vdev_fru = spa_strdup(value);
6017 sync = B_TRUE; 6017 sync = B_TRUE;
6018 } 6018 }
6019 } 6019 }
6020 6020
6021 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 6021 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
6022} 6022}
6023 6023
6024int 6024int
6025spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 6025spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
6026{ 6026{
6027 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 6027 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
6028} 6028}
6029 6029
6030int 6030int
6031spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 6031spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
6032{ 6032{
6033 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 6033 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
6034} 6034}
6035 6035
6036/* 6036/*
6037 * ========================================================================== 6037 * ==========================================================================
6038 * SPA Scanning 6038 * SPA Scanning
6039 * ========================================================================== 6039 * ==========================================================================
6040 */ 6040 */
6041 6041
6042int 6042int
6043spa_scan_stop(spa_t *spa) 6043spa_scan_stop(spa_t *spa)
6044{ 6044{
6045 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6045 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
6046 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 6046 if (dsl_scan_resilvering(spa->spa_dsl_pool))
6047 return (SET_ERROR(EBUSY)); 6047 return (SET_ERROR(EBUSY));
6048 return (dsl_scan_cancel(spa->spa_dsl_pool)); 6048 return (dsl_scan_cancel(spa->spa_dsl_pool));
6049} 6049}
6050 6050
6051int 6051int
6052spa_scan(spa_t *spa, pool_scan_func_t func) 6052spa_scan(spa_t *spa, pool_scan_func_t func)
6053{ 6053{
6054 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6054 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
6055 6055
6056 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 6056 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
6057 return (SET_ERROR(ENOTSUP)); 6057 return (SET_ERROR(ENOTSUP));
6058 6058
6059 /* 6059 /*
6060 * If a resilver was requested, but there is no DTL on a 6060 * If a resilver was requested, but there is no DTL on a
6061 * writeable leaf device, we have nothing to do. 6061 * writeable leaf device, we have nothing to do.
6062 */ 6062 */
6063 if (func == POOL_SCAN_RESILVER && 6063 if (func == POOL_SCAN_RESILVER &&
6064 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 6064 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
6065 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 6065 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
6066 return (0); 6066 return (0);
6067 } 6067 }
6068 6068
6069 return (dsl_scan(spa->spa_dsl_pool, func)); 6069 return (dsl_scan(spa->spa_dsl_pool, func));
6070} 6070}
6071 6071
6072/* 6072/*
6073 * ========================================================================== 6073 * ==========================================================================
6074 * SPA async task processing 6074 * SPA async task processing
6075 * ========================================================================== 6075 * ==========================================================================
6076 */ 6076 */
6077 6077
6078static void 6078static void
6079spa_async_remove(spa_t *spa, vdev_t *vd) 6079spa_async_remove(spa_t *spa, vdev_t *vd)
6080{ 6080{
6081 if (vd->vdev_remove_wanted) { 6081 if (vd->vdev_remove_wanted) {
6082 vd->vdev_remove_wanted = B_FALSE; 6082 vd->vdev_remove_wanted = B_FALSE;
6083 vd->vdev_delayed_close = B_FALSE; 6083 vd->vdev_delayed_close = B_FALSE;
6084 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 6084 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
6085 6085
6086 /* 6086 /*
6087 * We want to clear the stats, but we don't want to do a full 6087 * We want to clear the stats, but we don't want to do a full
6088 * vdev_clear() as that will cause us to throw away 6088 * vdev_clear() as that will cause us to throw away
6089 * degraded/faulted state as well as attempt to reopen the 6089 * degraded/faulted state as well as attempt to reopen the
6090 * device, all of which is a waste. 6090 * device, all of which is a waste.
6091 */ 6091 */
6092 vd->vdev_stat.vs_read_errors = 0; 6092 vd->vdev_stat.vs_read_errors = 0;
6093 vd->vdev_stat.vs_write_errors = 0; 6093 vd->vdev_stat.vs_write_errors = 0;
6094 vd->vdev_stat.vs_checksum_errors = 0; 6094 vd->vdev_stat.vs_checksum_errors = 0;
6095 6095
6096 vdev_state_dirty(vd->vdev_top); 6096 vdev_state_dirty(vd->vdev_top);
6097 /* Tell userspace that the vdev is gone. */ 6097 /* Tell userspace that the vdev is gone. */
6098 zfs_post_remove(spa, vd); 6098 zfs_post_remove(spa, vd);
6099 } 6099 }
6100 6100
6101 for (int c = 0; c < vd->vdev_children; c++) 6101 for (int c = 0; c < vd->vdev_children; c++)
6102 spa_async_remove(spa, vd->vdev_child[c]); 6102 spa_async_remove(spa, vd->vdev_child[c]);
6103} 6103}
6104 6104
6105static void 6105static void
6106spa_async_probe(spa_t *spa, vdev_t *vd) 6106spa_async_probe(spa_t *spa, vdev_t *vd)
6107{ 6107{
6108 if (vd->vdev_probe_wanted) { 6108 if (vd->vdev_probe_wanted) {
6109 vd->vdev_probe_wanted = B_FALSE; 6109 vd->vdev_probe_wanted = B_FALSE;
6110 vdev_reopen(vd); /* vdev_open() does the actual probe */ 6110 vdev_reopen(vd); /* vdev_open() does the actual probe */
6111 } 6111 }
6112 6112
6113 for (int c = 0; c < vd->vdev_children; c++) 6113 for (int c = 0; c < vd->vdev_children; c++)
6114 spa_async_probe(spa, vd->vdev_child[c]); 6114 spa_async_probe(spa, vd->vdev_child[c]);
6115} 6115}
6116 6116
6117static void 6117static void
6118spa_async_autoexpand(spa_t *spa, vdev_t *vd) 6118spa_async_autoexpand(spa_t *spa, vdev_t *vd)
6119{ 6119{
6120 sysevent_id_t eid; 6120 sysevent_id_t eid;
6121 nvlist_t *attr; 6121 nvlist_t *attr;
6122 char *physpath; 6122 char *physpath;
6123 6123
6124 if (!spa->spa_autoexpand) 6124 if (!spa->spa_autoexpand)
6125 return; 6125 return;
6126 6126
6127 for (int c = 0; c < vd->vdev_children; c++) { 6127 for (int c = 0; c < vd->vdev_children; c++) {
6128 vdev_t *cvd = vd->vdev_child[c]; 6128 vdev_t *cvd = vd->vdev_child[c];
6129 spa_async_autoexpand(spa, cvd); 6129 spa_async_autoexpand(spa, cvd);
6130 } 6130 }
6131 6131
6132 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 6132 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
6133 return; 6133 return;
6134 6134
6135 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 6135 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
6136 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 6136 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
6137 6137
6138 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6138 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
6139 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 6139 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
6140 6140
6141 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 6141 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
6142 ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 6142 ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);
6143 6143
6144 nvlist_free(attr); 6144 nvlist_free(attr);
6145 kmem_free(physpath, MAXPATHLEN); 6145 kmem_free(physpath, MAXPATHLEN);
6146} 6146}
6147 6147
6148static void 6148static void
6149spa_async_thread(void *arg) 6149spa_async_thread(void *arg)
6150{ 6150{
6151 spa_t *spa = arg; 6151 spa_t *spa = arg;
6152 int tasks; 6152 int tasks;
6153 6153
6154 ASSERT(spa->spa_sync_on); 6154 ASSERT(spa->spa_sync_on);
6155 6155
6156 mutex_enter(&spa->spa_async_lock); 6156 mutex_enter(&spa->spa_async_lock);
6157 tasks = spa->spa_async_tasks; 6157 tasks = spa->spa_async_tasks;
6158 spa->spa_async_tasks &= SPA_ASYNC_REMOVE; 6158 spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
6159 mutex_exit(&spa->spa_async_lock); 6159 mutex_exit(&spa->spa_async_lock);
6160 6160
6161 /* 6161 /*
6162 * See if the config needs to be updated. 6162 * See if the config needs to be updated.
6163 */ 6163 */
6164 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 6164 if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
6165 uint64_t old_space, new_space; 6165 uint64_t old_space, new_space;
6166 6166
6167 mutex_enter(&spa_namespace_lock); 6167 mutex_enter(&spa_namespace_lock);
6168 old_space = metaslab_class_get_space(spa_normal_class(spa)); 6168 old_space = metaslab_class_get_space(spa_normal_class(spa));
6169 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6169 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
6170 new_space = metaslab_class_get_space(spa_normal_class(spa)); 6170 new_space = metaslab_class_get_space(spa_normal_class(spa));
6171 mutex_exit(&spa_namespace_lock); 6171 mutex_exit(&spa_namespace_lock);
6172 6172
6173 /* 6173 /*
6174 * If the pool grew as a result of the config update, 6174 * If the pool grew as a result of the config update,
6175 * then log an internal history event. 6175 * then log an internal history event.
6176 */ 6176 */
6177 if (new_space != old_space) { 6177 if (new_space != old_space) {
6178 spa_history_log_internal(spa, "vdev online", NULL, 6178 spa_history_log_internal(spa, "vdev online", NULL,
6179 "pool '%s' size: %llu(+%llu)", 6179 "pool '%s' size: %llu(+%llu)",
6180 spa_name(spa), new_space, new_space - old_space); 6180 spa_name(spa), new_space, new_space - old_space);
6181 } 6181 }
6182 } 6182 }
6183 6183
6184 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 6184 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
6185 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6185 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6186 spa_async_autoexpand(spa, spa->spa_root_vdev); 6186 spa_async_autoexpand(spa, spa->spa_root_vdev);
6187 spa_config_exit(spa, SCL_CONFIG, FTAG); 6187 spa_config_exit(spa, SCL_CONFIG, FTAG);
6188 } 6188 }
6189 6189
6190 /* 6190 /*
6191 * See if any devices need to be probed. 6191 * See if any devices need to be probed.
6192 */ 6192 */
6193 if (tasks & SPA_ASYNC_PROBE) { 6193 if (tasks & SPA_ASYNC_PROBE) {
6194 spa_vdev_state_enter(spa, SCL_NONE); 6194 spa_vdev_state_enter(spa, SCL_NONE);
6195 spa_async_probe(spa, spa->spa_root_vdev); 6195 spa_async_probe(spa, spa->spa_root_vdev);
6196 (void) spa_vdev_state_exit(spa, NULL, 0); 6196 (void) spa_vdev_state_exit(spa, NULL, 0);
6197 } 6197 }
6198 6198
6199 /* 6199 /*
6200 * If any devices are done replacing, detach them. 6200 * If any devices are done replacing, detach them.
6201 */ 6201 */
6202 if (tasks & SPA_ASYNC_RESILVER_DONE) 6202 if (tasks & SPA_ASYNC_RESILVER_DONE)
6203 spa_vdev_resilver_done(spa); 6203 spa_vdev_resilver_done(spa);
6204 6204
6205 /* 6205 /*
6206 * Kick off a resilver. 6206 * Kick off a resilver.
6207 */ 6207 */
6208 if (tasks & SPA_ASYNC_RESILVER) 6208 if (tasks & SPA_ASYNC_RESILVER)
6209 dsl_resilver_restart(spa->spa_dsl_pool, 0); 6209 dsl_resilver_restart(spa->spa_dsl_pool, 0);
6210 6210
6211 /* 6211 /*
6212 * Let the world know that we're done. 6212 * Let the world know that we're done.
6213 */ 6213 */
6214 mutex_enter(&spa->spa_async_lock); 6214 mutex_enter(&spa->spa_async_lock);
6215 spa->spa_async_thread = NULL; 6215 spa->spa_async_thread = NULL;
6216 cv_broadcast(&spa->spa_async_cv); 6216 cv_broadcast(&spa->spa_async_cv);
6217 mutex_exit(&spa->spa_async_lock); 6217 mutex_exit(&spa->spa_async_lock);
6218 thread_exit(); 6218 thread_exit();
6219} 6219}
6220 6220
6221static void 6221static void
6222spa_async_thread_vd(void *arg) 6222spa_async_thread_vd(void *arg)
6223{ 6223{
6224 spa_t *spa = arg; 6224 spa_t *spa = arg;
6225 int tasks; 6225 int tasks;
6226 6226
6227 ASSERT(spa->spa_sync_on); 6227 ASSERT(spa->spa_sync_on);
6228 6228
6229 mutex_enter(&spa->spa_async_lock); 6229 mutex_enter(&spa->spa_async_lock);
6230 tasks = spa->spa_async_tasks; 6230 tasks = spa->spa_async_tasks;
6231retry: 6231retry:
6232 spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; 6232 spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
6233 mutex_exit(&spa->spa_async_lock); 6233 mutex_exit(&spa->spa_async_lock);
6234 6234
6235 /* 6235 /*
6236 * See if any devices need to be marked REMOVED. 6236 * See if any devices need to be marked REMOVED.
6237 */ 6237 */
6238 if (tasks & SPA_ASYNC_REMOVE) { 6238 if (tasks & SPA_ASYNC_REMOVE) {
6239 spa_vdev_state_enter(spa, SCL_NONE); 6239 spa_vdev_state_enter(spa, SCL_NONE);
6240 spa_async_remove(spa, spa->spa_root_vdev); 6240 spa_async_remove(spa, spa->spa_root_vdev);
6241 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 6241 for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
6242 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 6242 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
6243 for (int i = 0; i < spa->spa_spares.sav_count; i++) 6243 for (int i = 0; i < spa->spa_spares.sav_count; i++)
6244 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 6244 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
6245 (void) spa_vdev_state_exit(spa, NULL, 0); 6245 (void) spa_vdev_state_exit(spa, NULL, 0);
6246 } 6246 }
6247 6247
6248 /* 6248 /*
6249 * Let the world know that we're done. 6249 * Let the world know that we're done.
6250 */ 6250 */
6251 mutex_enter(&spa->spa_async_lock); 6251 mutex_enter(&spa->spa_async_lock);
6252 tasks = spa->spa_async_tasks; 6252 tasks = spa->spa_async_tasks;
6253 if ((tasks & SPA_ASYNC_REMOVE) != 0) 6253 if ((tasks & SPA_ASYNC_REMOVE) != 0)
6254 goto retry; 6254 goto retry;
6255 spa->spa_async_thread_vd = NULL; 6255 spa->spa_async_thread_vd = NULL;
6256 cv_broadcast(&spa->spa_async_cv); 6256 cv_broadcast(&spa->spa_async_cv);
6257 mutex_exit(&spa->spa_async_lock); 6257 mutex_exit(&spa->spa_async_lock);
6258 thread_exit(); 6258 thread_exit();
6259} 6259}
6260 6260
6261void 6261void
6262spa_async_suspend(spa_t *spa) 6262spa_async_suspend(spa_t *spa)
6263{ 6263{
6264 mutex_enter(&spa->spa_async_lock); 6264 mutex_enter(&spa->spa_async_lock);
6265 spa->spa_async_suspended++; 6265 spa->spa_async_suspended++;
6266 while (spa->spa_async_thread != NULL && 6266 while (spa->spa_async_thread != NULL &&
6267 spa->spa_async_thread_vd != NULL) 6267 spa->spa_async_thread_vd != NULL)
6268 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 6268 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
6269 mutex_exit(&spa->spa_async_lock); 6269 mutex_exit(&spa->spa_async_lock);
6270} 6270}
6271 6271
6272void 6272void
6273spa_async_resume(spa_t *spa) 6273spa_async_resume(spa_t *spa)
6274{ 6274{
6275 mutex_enter(&spa->spa_async_lock); 6275 mutex_enter(&spa->spa_async_lock);
6276 ASSERT(spa->spa_async_suspended != 0); 6276 ASSERT(spa->spa_async_suspended != 0);
6277 spa->spa_async_suspended--; 6277 spa->spa_async_suspended--;
6278 mutex_exit(&spa->spa_async_lock); 6278 mutex_exit(&spa->spa_async_lock);
6279} 6279}
6280 6280
6281static boolean_t 6281static boolean_t
6282spa_async_tasks_pending(spa_t *spa) 6282spa_async_tasks_pending(spa_t *spa)
6283{ 6283{
6284 uint_t non_config_tasks; 6284 uint_t non_config_tasks;
6285 uint_t config_task; 6285 uint_t config_task;
6286 boolean_t config_task_suspended; 6286 boolean_t config_task_suspended;
6287 6287
6288 non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | 6288 non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
6289 SPA_ASYNC_REMOVE); 6289 SPA_ASYNC_REMOVE);
6290 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 6290 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
6291 if (spa->spa_ccw_fail_time == 0) { 6291 if (spa->spa_ccw_fail_time == 0) {
6292 config_task_suspended = B_FALSE; 6292 config_task_suspended = B_FALSE;
6293 } else { 6293 } else {
6294 config_task_suspended = 6294 config_task_suspended =
6295 (gethrtime() - spa->spa_ccw_fail_time) < 6295 (gethrtime() - spa->spa_ccw_fail_time) <
6296 (zfs_ccw_retry_interval * NANOSEC); 6296 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC);
6297 } 6297 }
6298 6298
6299 return (non_config_tasks || (config_task && !config_task_suspended)); 6299 return (non_config_tasks || (config_task && !config_task_suspended));
6300} 6300}
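[Editorial aside, not part of the diff] A minimal sketch of why the (hrtime_t) cast added in spa_async_tasks_pending() above matters. It assumes zfs_ccw_retry_interval is a 32-bit int (upstream it defaults to 300 seconds) and that NANOSEC expands to the int-typed constant 1000000000; with both operands int, the multiplication is performed in 32-bit arithmetic and overflows before the result is widened for the comparison against the 64-bit hrtime_t difference. Casting one operand first forces a 64-bit multiply. The names, types, and default value below are assumptions for illustration only, not taken from this diff.

	#include <stdio.h>

	typedef long long hrtime_t;			/* stand-in for the illumos type */
	#define	NANOSEC	1000000000			/* assumed int-typed constant */

	static int zfs_ccw_retry_interval = 300;	/* assumed default, in seconds */

	int
	main(void)
	{
		/*
		 * Buggy form: both operands are int, so the multiply is done
		 * in 32-bit arithmetic and 300 * 1000000000 overflows (signed
		 * overflow is undefined behavior) before the result is
		 * widened to hrtime_t:
		 *
		 *	hrtime_t window = zfs_ccw_retry_interval * NANOSEC;
		 */

		/* Fixed form: widen one operand first; the multiply is 64-bit. */
		hrtime_t window = (hrtime_t)zfs_ccw_retry_interval * NANOSEC;

		printf("%lld\n", window);	/* 300000000000 */
		return (0);
	}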
6301 6301
6302static void 6302static void
6303spa_async_dispatch(spa_t *spa) 6303spa_async_dispatch(spa_t *spa)
6304{ 6304{
6305 mutex_enter(&spa->spa_async_lock); 6305 mutex_enter(&spa->spa_async_lock);
6306 if (spa_async_tasks_pending(spa) && 6306 if (spa_async_tasks_pending(spa) &&
6307 !spa->spa_async_suspended && 6307 !spa->spa_async_suspended &&
6308 spa->spa_async_thread == NULL && 6308 spa->spa_async_thread == NULL &&
6309 rootdir != NULL) 6309 rootdir != NULL)
6310 spa->spa_async_thread = thread_create(NULL, 0, 6310 spa->spa_async_thread = thread_create(NULL, 0,
6311 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 6311 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
6312 mutex_exit(&spa->spa_async_lock); 6312 mutex_exit(&spa->spa_async_lock);
6313} 6313}
6314 6314
6315static void 6315static void
6316spa_async_dispatch_vd(spa_t *spa) 6316spa_async_dispatch_vd(spa_t *spa)
6317{ 6317{
6318 mutex_enter(&spa->spa_async_lock); 6318 mutex_enter(&spa->spa_async_lock);
6319 if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && 6319 if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
6320 !spa->spa_async_suspended && 6320 !spa->spa_async_suspended &&
6321 spa->spa_async_thread_vd == NULL && 6321 spa->spa_async_thread_vd == NULL &&
6322 rootdir != NULL) 6322 rootdir != NULL)
6323 spa->spa_async_thread_vd = thread_create(NULL, 0, 6323 spa->spa_async_thread_vd = thread_create(NULL, 0,
6324 spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); 6324 spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
6325 mutex_exit(&spa->spa_async_lock); 6325 mutex_exit(&spa->spa_async_lock);
6326} 6326}
6327 6327
6328void 6328void
6329spa_async_request(spa_t *spa, int task) 6329spa_async_request(spa_t *spa, int task)
6330{ 6330{
6331 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 6331 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
6332 mutex_enter(&spa->spa_async_lock); 6332 mutex_enter(&spa->spa_async_lock);
6333 spa->spa_async_tasks |= task; 6333 spa->spa_async_tasks |= task;
6334 mutex_exit(&spa->spa_async_lock); 6334 mutex_exit(&spa->spa_async_lock);
6335 spa_async_dispatch_vd(spa); 6335 spa_async_dispatch_vd(spa);
6336} 6336}
6337 6337
6338/* 6338/*
6339 * ========================================================================== 6339 * ==========================================================================
6340 * SPA syncing routines 6340 * SPA syncing routines
6341 * ========================================================================== 6341 * ==========================================================================
6342 */ 6342 */
6343 6343
6344static int 6344static int
6345bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6345bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
6346{ 6346{
6347 bpobj_t *bpo = arg; 6347 bpobj_t *bpo = arg;
6348 bpobj_enqueue(bpo, bp, tx); 6348 bpobj_enqueue(bpo, bp, tx);
6349 return (0); 6349 return (0);
6350} 6350}
6351 6351
6352static int 6352static int
6353spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6353spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
6354{ 6354{
6355 zio_t *zio = arg; 6355 zio_t *zio = arg;
6356 6356
6357 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 6357 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
6358 BP_GET_PSIZE(bp), zio->io_flags)); 6358 BP_GET_PSIZE(bp), zio->io_flags));
6359 return (0); 6359 return (0);
6360} 6360}
6361 6361
6362/* 6362/*
6363 * Note: this simple function is not inlined to make it easier to dtrace the 6363 * Note: this simple function is not inlined to make it easier to dtrace the
6364 * amount of time spent syncing frees. 6364 * amount of time spent syncing frees.
6365 */ 6365 */
6366static void 6366static void
6367spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 6367spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
6368{ 6368{
6369 zio_t *zio = zio_root(spa, NULL, NULL, 0); 6369 zio_t *zio = zio_root(spa, NULL, NULL, 0);
6370 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 6370 bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
6371 VERIFY(zio_wait(zio) == 0); 6371 VERIFY(zio_wait(zio) == 0);
6372} 6372}
6373 6373
6374/* 6374/*
6375 * Note: this simple function is not inlined to make it easier to dtrace the 6375 * Note: this simple function is not inlined to make it easier to dtrace the
6376 * amount of time spent syncing deferred frees. 6376 * amount of time spent syncing deferred frees.
6377 */ 6377 */
6378static void 6378static void
6379spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 6379spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
6380{ 6380{
6381 zio_t *zio = zio_root(spa, NULL, NULL, 0); 6381 zio_t *zio = zio_root(spa, NULL, NULL, 0);
6382 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 6382 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
6383 spa_free_sync_cb, zio, tx), ==, 0); 6383 spa_free_sync_cb, zio, tx), ==, 0);
6384 VERIFY0(zio_wait(zio)); 6384 VERIFY0(zio_wait(zio));
6385} 6385}
6386 6386
6387 6387
6388static void 6388static void
6389spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 6389spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
6390{ 6390{
6391 char *packed = NULL; 6391 char *packed = NULL;
6392 size_t bufsize; 6392 size_t bufsize;
6393 size_t nvsize = 0; 6393 size_t nvsize = 0;
6394 dmu_buf_t *db; 6394 dmu_buf_t *db;
6395 6395
6396 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 6396 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
6397 6397
6398 /* 6398 /*
6399 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 6399 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
6400 * information. This avoids the dmu_buf_will_dirty() path and 6400 * information. This avoids the dmu_buf_will_dirty() path and
6401 * saves us a pre-read to get data we don't actually care about. 6401 * saves us a pre-read to get data we don't actually care about.
6402 */ 6402 */
6403 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 6403 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
6404 packed = kmem_alloc(bufsize, KM_SLEEP); 6404 packed = kmem_alloc(bufsize, KM_SLEEP);
6405 6405
6406 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 6406 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
6407 KM_SLEEP) == 0); 6407 KM_SLEEP) == 0);
6408 bzero(packed + nvsize, bufsize - nvsize); 6408 bzero(packed + nvsize, bufsize - nvsize);
6409 6409
6410 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6410 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
6411 6411
6412 kmem_free(packed, bufsize); 6412 kmem_free(packed, bufsize);
6413 6413
6414 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6414 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
6415 dmu_buf_will_dirty(db, tx); 6415 dmu_buf_will_dirty(db, tx);
6416 *(uint64_t *)db->db_data = nvsize; 6416 *(uint64_t *)db->db_data = nvsize;
6417 dmu_buf_rele(db, FTAG); 6417 dmu_buf_rele(db, FTAG);
6418} 6418}
6419 6419
6420static void 6420static void
6421spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6421spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
6422 const char *config, const char *entry) 6422 const char *config, const char *entry)
6423{ 6423{
6424 nvlist_t *nvroot; 6424 nvlist_t *nvroot;
6425 nvlist_t **list; 6425 nvlist_t **list;
6426 int i; 6426 int i;
6427 6427
6428 if (!sav->sav_sync) 6428 if (!sav->sav_sync)
6429 return; 6429 return;
6430 6430
6431 /* 6431 /*
6432 * Update the MOS nvlist describing the list of available devices. 6432 * Update the MOS nvlist describing the list of available devices.
6433 * spa_validate_aux() will have already made sure this nvlist is 6433 * spa_validate_aux() will have already made sure this nvlist is
6434 * valid and the vdevs are labeled appropriately. 6434 * valid and the vdevs are labeled appropriately.
6435 */ 6435 */
6436 if (sav->sav_object == 0) { 6436 if (sav->sav_object == 0) {
6437 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6437 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
6438 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6438 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
6439 sizeof (uint64_t), tx); 6439 sizeof (uint64_t), tx);
6440 VERIFY(zap_update(spa->spa_meta_objset, 6440 VERIFY(zap_update(spa->spa_meta_objset,
6441 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6441 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
6442 &sav->sav_object, tx) == 0); 6442 &sav->sav_object, tx) == 0);
6443 } 6443 }
6444 6444
6445 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6445 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
6446 if (sav->sav_count == 0) { 6446 if (sav->sav_count == 0) {
6447 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6447 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
6448 } else { 6448 } else {
6449 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6449 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
6450 for (i = 0; i < sav->sav_count; i++) 6450 for (i = 0; i < sav->sav_count; i++)
6451 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6451 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
6452 B_FALSE, VDEV_CONFIG_L2CACHE); 6452 B_FALSE, VDEV_CONFIG_L2CACHE);
6453 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6453 VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
6454 sav->sav_count) == 0); 6454 sav->sav_count) == 0);
6455 for (i = 0; i < sav->sav_count; i++) 6455 for (i = 0; i < sav->sav_count; i++)
6456 nvlist_free(list[i]); 6456 nvlist_free(list[i]);
6457 kmem_free(list, sav->sav_count * sizeof (void *)); 6457 kmem_free(list, sav->sav_count * sizeof (void *));
6458 } 6458 }
6459 6459
6460 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6460 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
6461 nvlist_free(nvroot); 6461 nvlist_free(nvroot);
6462 6462
6463 sav->sav_sync = B_FALSE; 6463 sav->sav_sync = B_FALSE;
6464} 6464}
6465 6465
6466/* 6466/*
6467 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 6467 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
6468 * The all-vdev ZAP must be empty. 6468 * The all-vdev ZAP must be empty.
6469 */ 6469 */
6470static void 6470static void
6471spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 6471spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
6472{ 6472{
6473 spa_t *spa = vd->vdev_spa; 6473 spa_t *spa = vd->vdev_spa;
6474 if (vd->vdev_top_zap != 0) { 6474 if (vd->vdev_top_zap != 0) {
6475 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 6475 VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
6476 vd->vdev_top_zap, tx)); 6476 vd->vdev_top_zap, tx));
6477 } 6477 }
6478 if (vd->vdev_leaf_zap != 0) { 6478 if (vd->vdev_leaf_zap != 0) {
6479 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 6479 VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
6480 vd->vdev_leaf_zap, tx)); 6480 vd->vdev_leaf_zap, tx));
6481 } 6481 }
6482 for (uint64_t i = 0; i < vd->vdev_children; i++) { 6482 for (uint64_t i = 0; i < vd->vdev_children; i++) {
6483 spa_avz_build(vd->vdev_child[i], avz, tx); 6483 spa_avz_build(vd->vdev_child[i], avz, tx);
6484 } 6484 }
6485} 6485}
6486 6486
6487static void 6487static void
6488spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6488spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
6489{ 6489{
6490 nvlist_t *config; 6490 nvlist_t *config;
6491 6491
6492 /* 6492 /*
6493 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 6493 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
6494 * its config may not be dirty but we still need to build per-vdev ZAPs. 6494 * its config may not be dirty but we still need to build per-vdev ZAPs.
6495 * Similarly, if the pool is being assembled (e.g. after a split), we 6495 * Similarly, if the pool is being assembled (e.g. after a split), we
6496 * need to rebuild the AVZ although the config may not be dirty. 6496 * need to rebuild the AVZ although the config may not be dirty.
6497 */ 6497 */
6498 if (list_is_empty(&spa->spa_config_dirty_list) && 6498 if (list_is_empty(&spa->spa_config_dirty_list) &&
6499 spa->spa_avz_action == AVZ_ACTION_NONE) 6499 spa->spa_avz_action == AVZ_ACTION_NONE)
6500 return; 6500 return;
6501 6501
6502 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6502 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6503 6503
6504 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 6504 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
6505 spa->spa_all_vdev_zaps != 0); 6505 spa->spa_all_vdev_zaps != 0);
6506 6506
6507 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 6507 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
6508 /* Make and build the new AVZ */ 6508 /* Make and build the new AVZ */
6509 uint64_t new_avz = zap_create(spa->spa_meta_objset, 6509 uint64_t new_avz = zap_create(spa->spa_meta_objset,
6510 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 6510 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
6511 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 6511 spa_avz_build(spa->spa_root_vdev, new_avz, tx);
6512 6512
6513 /* Diff old AVZ with new one */ 6513 /* Diff old AVZ with new one */
6514 zap_cursor_t zc; 6514 zap_cursor_t zc;
6515 zap_attribute_t za; 6515 zap_attribute_t za;
6516 6516
6517 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6517 for (zap_cursor_init(&zc, spa->spa_meta_objset,
6518 spa->spa_all_vdev_zaps); 6518 spa->spa_all_vdev_zaps);
6519 zap_cursor_retrieve(&zc, &za) == 0; 6519 zap_cursor_retrieve(&zc, &za) == 0;
6520 zap_cursor_advance(&zc)) { 6520 zap_cursor_advance(&zc)) {
6521 uint64_t vdzap = za.za_first_integer; 6521 uint64_t vdzap = za.za_first_integer;
6522 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 6522 if (zap_lookup_int(spa->spa_meta_objset, new_avz,
6523 vdzap) == ENOENT) { 6523 vdzap) == ENOENT) {
6524 /* 6524 /*
6525 * ZAP is listed in old AVZ but not in new one; 6525 * ZAP is listed in old AVZ but not in new one;
6526 * destroy it 6526 * destroy it
6527 */ 6527 */
6528 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 6528 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
6529 tx)); 6529 tx));
6530 } 6530 }
6531 } 6531 }
6532 6532
6533 zap_cursor_fini(&zc); 6533 zap_cursor_fini(&zc);
6534 6534
6535 /* Destroy the old AVZ */ 6535 /* Destroy the old AVZ */
6536 VERIFY0(zap_destroy(spa->spa_meta_objset, 6536 VERIFY0(zap_destroy(spa->spa_meta_objset,
6537 spa->spa_all_vdev_zaps, tx)); 6537 spa->spa_all_vdev_zaps, tx));
6538 6538
6539 /* Replace the old AVZ in the dir obj with the new one */ 6539 /* Replace the old AVZ in the dir obj with the new one */
6540 VERIFY0(zap_update(spa->spa_meta_objset, 6540 VERIFY0(zap_update(spa->spa_meta_objset,
6541 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 6541 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
6542 sizeof (new_avz), 1, &new_avz, tx)); 6542 sizeof (new_avz), 1, &new_avz, tx));
6543 6543
6544 spa->spa_all_vdev_zaps = new_avz; 6544 spa->spa_all_vdev_zaps = new_avz;
6545 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 6545 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
6546 zap_cursor_t zc; 6546 zap_cursor_t zc;
6547 zap_attribute_t za; 6547 zap_attribute_t za;
6548 6548
6549 /* Walk through the AVZ and destroy all listed ZAPs */ 6549 /* Walk through the AVZ and destroy all listed ZAPs */
6550 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6550 for (zap_cursor_init(&zc, spa->spa_meta_objset,
6551 spa->spa_all_vdev_zaps); 6551 spa->spa_all_vdev_zaps);
6552 zap_cursor_retrieve(&zc, &za) == 0; 6552 zap_cursor_retrieve(&zc, &za) == 0;
6553 zap_cursor_advance(&zc)) { 6553 zap_cursor_advance(&zc)) {
6554 uint64_t zap = za.za_first_integer; 6554 uint64_t zap = za.za_first_integer;
6555 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 6555 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
6556 } 6556 }
6557 6557
6558 zap_cursor_fini(&zc); 6558 zap_cursor_fini(&zc);
6559 6559
6560 /* Destroy and unlink the AVZ itself */ 6560 /* Destroy and unlink the AVZ itself */
6561 VERIFY0(zap_destroy(spa->spa_meta_objset, 6561 VERIFY0(zap_destroy(spa->spa_meta_objset,
6562 spa->spa_all_vdev_zaps, tx)); 6562 spa->spa_all_vdev_zaps, tx));
6563 VERIFY0(zap_remove(spa->spa_meta_objset, 6563 VERIFY0(zap_remove(spa->spa_meta_objset,
6564 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 6564 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
6565 spa->spa_all_vdev_zaps = 0; 6565 spa->spa_all_vdev_zaps = 0;
6566 } 6566 }
6567 6567
6568 if (spa->spa_all_vdev_zaps == 0) { 6568 if (spa->spa_all_vdev_zaps == 0) {
6569 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 6569 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
6570 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 6570 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
6571 DMU_POOL_VDEV_ZAP_MAP, tx); 6571 DMU_POOL_VDEV_ZAP_MAP, tx);
6572 } 6572 }
6573 spa->spa_avz_action = AVZ_ACTION_NONE; 6573 spa->spa_avz_action = AVZ_ACTION_NONE;
6574 6574
6575 /* Create ZAPs for vdevs that don't have them. */ 6575 /* Create ZAPs for vdevs that don't have them. */
6576 vdev_construct_zaps(spa->spa_root_vdev, tx); 6576 vdev_construct_zaps(spa->spa_root_vdev, tx);
6577 6577
6578 config = spa_config_generate(spa, spa->spa_root_vdev, 6578 config = spa_config_generate(spa, spa->spa_root_vdev,
6579 dmu_tx_get_txg(tx), B_FALSE); 6579 dmu_tx_get_txg(tx), B_FALSE);
6580 6580
6581 /* 6581 /*
6582 * If we're upgrading the spa version then make sure that 6582 * If we're upgrading the spa version then make sure that
6583 * the config object gets updated with the correct version. 6583 * the config object gets updated with the correct version.
6584 */ 6584 */
6585 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6585 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
6586 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6586 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
6587 spa->spa_uberblock.ub_version); 6587 spa->spa_uberblock.ub_version);
6588 6588
6589 spa_config_exit(spa, SCL_STATE, FTAG); 6589 spa_config_exit(spa, SCL_STATE, FTAG);
6590 6590
6591 nvlist_free(spa->spa_config_syncing); 6591 nvlist_free(spa->spa_config_syncing);
6592 spa->spa_config_syncing = config; 6592 spa->spa_config_syncing = config;
6593 6593
6594 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6594 spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
6595} 6595}
6596 6596
6597static void 6597static void
6598spa_sync_version(void *arg, dmu_tx_t *tx) 6598spa_sync_version(void *arg, dmu_tx_t *tx)
6599{ 6599{
6600 uint64_t *versionp = arg; 6600 uint64_t *versionp = arg;
6601 uint64_t version = *versionp; 6601 uint64_t version = *versionp;
6602 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6602 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6603 6603
6604 /* 6604 /*
6605 * Setting the version is special cased when first creating the pool. 6605 * Setting the version is special cased when first creating the pool.
6606 */ 6606 */
6607 ASSERT(tx->tx_txg != TXG_INITIAL); 6607 ASSERT(tx->tx_txg != TXG_INITIAL);
6608 6608
6609 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6609 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
6610 ASSERT(version >= spa_version(spa)); 6610 ASSERT(version >= spa_version(spa));
6611 6611
6612 spa->spa_uberblock.ub_version = version; 6612 spa->spa_uberblock.ub_version = version;
6613 vdev_config_dirty(spa->spa_root_vdev); 6613 vdev_config_dirty(spa->spa_root_vdev);
6614 spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6614 spa_history_log_internal(spa, "set", tx, "version=%lld", version);
6615} 6615}
6616 6616
6617/* 6617/*
6618 * Set zpool properties. 6618 * Set zpool properties.
6619 */ 6619 */
6620static void 6620static void
6621spa_sync_props(void *arg, dmu_tx_t *tx) 6621spa_sync_props(void *arg, dmu_tx_t *tx)
6622{ 6622{
6623 nvlist_t *nvp = arg; 6623 nvlist_t *nvp = arg;
6624 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6624 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6625 objset_t *mos = spa->spa_meta_objset; 6625 objset_t *mos = spa->spa_meta_objset;
6626 nvpair_t *elem = NULL; 6626 nvpair_t *elem = NULL;
6627 6627
6628 mutex_enter(&spa->spa_props_lock); 6628 mutex_enter(&spa->spa_props_lock);
6629 6629
6630 while ((elem = nvlist_next_nvpair(nvp, elem))) { 6630 while ((elem = nvlist_next_nvpair(nvp, elem))) {
6631 uint64_t intval; 6631 uint64_t intval;
6632 char *strval, *fname; 6632 char *strval, *fname;
6633 zpool_prop_t prop; 6633 zpool_prop_t prop;
6634 const char *propname; 6634 const char *propname;
6635 zprop_type_t proptype; 6635 zprop_type_t proptype;
6636 spa_feature_t fid; 6636 spa_feature_t fid;
6637 6637
6638 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6638 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
6639 case ZPROP_INVAL: 6639 case ZPROP_INVAL:
6640 /* 6640 /*
6641 * We checked this earlier in spa_prop_validate(). 6641 * We checked this earlier in spa_prop_validate().
6642 */ 6642 */
6643 ASSERT(zpool_prop_feature(nvpair_name(elem))); 6643 ASSERT(zpool_prop_feature(nvpair_name(elem)));
6644 6644
6645 fname = strchr(nvpair_name(elem), '@') + 1; 6645 fname = strchr(nvpair_name(elem), '@') + 1;
6646 VERIFY0(zfeature_lookup_name(fname, &fid)); 6646 VERIFY0(zfeature_lookup_name(fname, &fid));
6647 6647
6648 spa_feature_enable(spa, fid, tx); 6648 spa_feature_enable(spa, fid, tx);
6649 spa_history_log_internal(spa, "set", tx, 6649 spa_history_log_internal(spa, "set", tx,
6650 "%s=enabled", nvpair_name(elem)); 6650 "%s=enabled", nvpair_name(elem));
6651 break; 6651 break;
6652 6652
6653 case ZPOOL_PROP_VERSION: 6653 case ZPOOL_PROP_VERSION:
6654 intval = fnvpair_value_uint64(elem); 6654 intval = fnvpair_value_uint64(elem);
6655 /* 6655 /*
6656 * The version is synced separately before other 6656 * The version is synced separately before other
6657 * properties and should be correct by now. 6657 * properties and should be correct by now.
6658 */ 6658 */
6659 ASSERT3U(spa_version(spa), >=, intval); 6659 ASSERT3U(spa_version(spa), >=, intval);
6660 break; 6660 break;
6661 6661
6662 case ZPOOL_PROP_ALTROOT: 6662 case ZPOOL_PROP_ALTROOT:
6663 /* 6663 /*
6664 * 'altroot' is a non-persistent property. It should 6664 * 'altroot' is a non-persistent property. It should
6665 * have been set temporarily at creation or import time. 6665 * have been set temporarily at creation or import time.
6666 */ 6666 */
6667 ASSERT(spa->spa_root != NULL); 6667 ASSERT(spa->spa_root != NULL);
6668 break; 6668 break;
6669 6669
6670 case ZPOOL_PROP_READONLY: 6670 case ZPOOL_PROP_READONLY:
6671 case ZPOOL_PROP_CACHEFILE: 6671 case ZPOOL_PROP_CACHEFILE:
6672 /* 6672 /*
6673 * 'readonly' and 'cachefile' are also non-persistent 6673 * 'readonly' and 'cachefile' are also non-persistent
6674 * properties. 6674 * properties.
6675 */ 6675 */
6676 break; 6676 break;
6677 case ZPOOL_PROP_COMMENT: 6677 case ZPOOL_PROP_COMMENT:
6678 strval = fnvpair_value_string(elem); 6678 strval = fnvpair_value_string(elem);
6679 if (spa->spa_comment != NULL) 6679 if (spa->spa_comment != NULL)
6680 spa_strfree(spa->spa_comment); 6680 spa_strfree(spa->spa_comment);
6681 spa->spa_comment = spa_strdup(strval); 6681 spa->spa_comment = spa_strdup(strval);
6682 /* 6682 /*
6683 * We need to dirty the configuration on all the vdevs 6683 * We need to dirty the configuration on all the vdevs
6684 * so that their labels get updated. It's unnecessary 6684 * so that their labels get updated. It's unnecessary
6685 * to do this for pool creation since the vdev's 6685 * to do this for pool creation since the vdev's
6686 * configuration has already been dirtied. 6686 * configuration has already been dirtied.
6687 */ 6687 */
6688 if (tx->tx_txg != TXG_INITIAL) 6688 if (tx->tx_txg != TXG_INITIAL)
6689 vdev_config_dirty(spa->spa_root_vdev); 6689 vdev_config_dirty(spa->spa_root_vdev);
6690 spa_history_log_internal(spa, "set", tx, 6690 spa_history_log_internal(spa, "set", tx,
6691 "%s=%s", nvpair_name(elem), strval); 6691 "%s=%s", nvpair_name(elem), strval);
6692 break; 6692 break;
6693 default: 6693 default:
6694 /* 6694 /*
6695 * Set pool property values in the poolprops mos object. 6695 * Set pool property values in the poolprops mos object.
6696 */ 6696 */
6697 if (spa->spa_pool_props_object == 0) { 6697 if (spa->spa_pool_props_object == 0) {
6698 spa->spa_pool_props_object = 6698 spa->spa_pool_props_object =
6699 zap_create_link(mos, DMU_OT_POOL_PROPS, 6699 zap_create_link(mos, DMU_OT_POOL_PROPS,
6700 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 6700 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
6701 tx); 6701 tx);
6702 } 6702 }
6703 6703
6704 /* normalize the property name */ 6704 /* normalize the property name */
6705 propname = zpool_prop_to_name(prop); 6705 propname = zpool_prop_to_name(prop);
6706 proptype = zpool_prop_get_type(prop); 6706 proptype = zpool_prop_get_type(prop);
6707 6707
6708 if (nvpair_type(elem) == DATA_TYPE_STRING) { 6708 if (nvpair_type(elem) == DATA_TYPE_STRING) {
6709 ASSERT(proptype == PROP_TYPE_STRING); 6709 ASSERT(proptype == PROP_TYPE_STRING);
6710 strval = fnvpair_value_string(elem); 6710 strval = fnvpair_value_string(elem);
6711 VERIFY0(zap_update(mos, 6711 VERIFY0(zap_update(mos,
6712 spa->spa_pool_props_object, propname, 6712 spa->spa_pool_props_object, propname,
6713 1, strlen(strval) + 1, strval, tx)); 6713 1, strlen(strval) + 1, strval, tx));
6714 spa_history_log_internal(spa, "set", tx, 6714 spa_history_log_internal(spa, "set", tx,
6715 "%s=%s", nvpair_name(elem), strval); 6715 "%s=%s", nvpair_name(elem), strval);
6716 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 6716 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
6717 intval = fnvpair_value_uint64(elem); 6717 intval = fnvpair_value_uint64(elem);
6718 6718
6719 if (proptype == PROP_TYPE_INDEX) { 6719 if (proptype == PROP_TYPE_INDEX) {
6720 const char *unused; 6720 const char *unused;
6721 VERIFY0(zpool_prop_index_to_string( 6721 VERIFY0(zpool_prop_index_to_string(
6722 prop, intval, &unused)); 6722 prop, intval, &unused));
6723 } 6723 }
6724 VERIFY0(zap_update(mos, 6724 VERIFY0(zap_update(mos,
6725 spa->spa_pool_props_object, propname, 6725 spa->spa_pool_props_object, propname,
6726 8, 1, &intval, tx)); 6726 8, 1, &intval, tx));
6727 spa_history_log_internal(spa, "set", tx, 6727 spa_history_log_internal(spa, "set", tx,
6728 "%s=%lld", nvpair_name(elem), intval); 6728 "%s=%lld", nvpair_name(elem), intval);
6729 } else { 6729 } else {
6730 ASSERT(0); /* not allowed */ 6730 ASSERT(0); /* not allowed */
6731 } 6731 }
6732 6732
6733 switch (prop) { 6733 switch (prop) {
6734 case ZPOOL_PROP_DELEGATION: 6734 case ZPOOL_PROP_DELEGATION:
6735 spa->spa_delegation = intval; 6735 spa->spa_delegation = intval;
6736 break; 6736 break;
6737 case ZPOOL_PROP_BOOTFS: 6737 case ZPOOL_PROP_BOOTFS:
6738 spa->spa_bootfs = intval; 6738 spa->spa_bootfs = intval;
6739 break; 6739 break;
6740 case ZPOOL_PROP_FAILUREMODE: 6740 case ZPOOL_PROP_FAILUREMODE:
6741 spa->spa_failmode = intval; 6741 spa->spa_failmode = intval;
6742 break; 6742 break;
6743 case ZPOOL_PROP_AUTOEXPAND: 6743 case ZPOOL_PROP_AUTOEXPAND:
6744 spa->spa_autoexpand = intval; 6744 spa->spa_autoexpand = intval;
6745 if (tx->tx_txg != TXG_INITIAL) 6745 if (tx->tx_txg != TXG_INITIAL)
6746 spa_async_request(spa, 6746 spa_async_request(spa,
6747 SPA_ASYNC_AUTOEXPAND); 6747 SPA_ASYNC_AUTOEXPAND);
6748 break; 6748 break;
6749 case ZPOOL_PROP_DEDUPDITTO: 6749 case ZPOOL_PROP_DEDUPDITTO:
6750 spa->spa_dedup_ditto = intval; 6750 spa->spa_dedup_ditto = intval;
6751 break; 6751 break;
6752 default: 6752 default:
6753 break; 6753 break;
6754 } 6754 }
6755 } 6755 }
6756 6756
6757 } 6757 }
6758 6758
6759 mutex_exit(&spa->spa_props_lock); 6759 mutex_exit(&spa->spa_props_lock);
6760} 6760}
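
In the property path above, string values are written to the pool-props object as byte arrays (entry size 1, count strlen(strval) + 1) while numeric values are written as a single 8-byte integer (entry size 8, count 1). What follows is a minimal, self-contained sketch of that encoding convention only; toy_zap_update() and the toy_prop_set_* helpers are hypothetical stand-ins, not the real ZAP interfaces.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Hypothetical stand-in for a ZAP-style update: name -> (entry size,
 * entry count, raw bytes).  Not the real zap_update() interface.
 */
static void
toy_zap_update(const char *name, int entsz, uint64_t cnt, const void *val)
{
        (void)val;
        printf("update %-12s entsz=%d count=%llu (%llu bytes)\n", name,
            entsz, (unsigned long long)cnt,
            (unsigned long long)entsz * cnt);
}

/* Strings go in as byte arrays: entry size 1, count strlen() + 1. */
static void
toy_prop_set_string(const char *name, const char *strval)
{
        toy_zap_update(name, 1, strlen(strval) + 1, strval);
}

/* Numbers go in as a single 8-byte integer: entry size 8, count 1. */
static void
toy_prop_set_uint64(const char *name, uint64_t intval)
{
        toy_zap_update(name, 8, 1, &intval);
}

int
main(void)
{
        toy_prop_set_string("comment", "scratch pool");
        toy_prop_set_uint64("autoexpand", 1);
        return (0);
}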
6761 6761
6762/* 6762/*
6763 * Perform one-time upgrade on-disk changes. spa_version() does not 6763 * Perform one-time upgrade on-disk changes. spa_version() does not
6764 * reflect the new version this txg, so there must be no changes this 6764 * reflect the new version this txg, so there must be no changes this
6765 * txg to anything that the upgrade code depends on after it executes. 6765 * txg to anything that the upgrade code depends on after it executes.
6766 * Therefore this must be called after dsl_pool_sync() does the sync 6766 * Therefore this must be called after dsl_pool_sync() does the sync
6767 * tasks. 6767 * tasks.
6768 */ 6768 */
6769static void 6769static void
6770spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 6770spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
6771{ 6771{
6772 dsl_pool_t *dp = spa->spa_dsl_pool; 6772 dsl_pool_t *dp = spa->spa_dsl_pool;
6773 6773
6774 ASSERT(spa->spa_sync_pass == 1); 6774 ASSERT(spa->spa_sync_pass == 1);
6775 6775
6776 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 6776 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
6777 6777
6778 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 6778 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
6779 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 6779 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
6780 dsl_pool_create_origin(dp, tx); 6780 dsl_pool_create_origin(dp, tx);
6781 6781
6782 /* Keeping the origin open increases spa_minref */ 6782 /* Keeping the origin open increases spa_minref */
6783 spa->spa_minref += 3; 6783 spa->spa_minref += 3;
6784 } 6784 }
6785 6785
6786 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 6786 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
6787 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 6787 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
6788 dsl_pool_upgrade_clones(dp, tx); 6788 dsl_pool_upgrade_clones(dp, tx);
6789 } 6789 }
6790 6790
6791 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 6791 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
6792 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 6792 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
6793 dsl_pool_upgrade_dir_clones(dp, tx); 6793 dsl_pool_upgrade_dir_clones(dp, tx);
6794 6794
6795 /* Keeping the freedir open increases spa_minref */ 6795 /* Keeping the freedir open increases spa_minref */
6796 spa->spa_minref += 3; 6796 spa->spa_minref += 3;
6797 } 6797 }
6798 6798
6799 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 6799 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
6800 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6800 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6801 spa_feature_create_zap_objects(spa, tx); 6801 spa_feature_create_zap_objects(spa, tx);
6802 } 6802 }
6803 6803
6804 /* 6804 /*
6805 * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 6805 * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
6806 * when the possibility to use lz4 compression for metadata was added. 6806 * when the possibility to use lz4 compression for metadata was added.
6807 * Old pools that have this feature enabled must be upgraded to have 6807 * Old pools that have this feature enabled must be upgraded to have
6808 * this feature active. 6808 * this feature active.
6809 */ 6809 */
6810 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6810 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6811 boolean_t lz4_en = spa_feature_is_enabled(spa, 6811 boolean_t lz4_en = spa_feature_is_enabled(spa,
6812 SPA_FEATURE_LZ4_COMPRESS); 6812 SPA_FEATURE_LZ4_COMPRESS);
6813 boolean_t lz4_ac = spa_feature_is_active(spa, 6813 boolean_t lz4_ac = spa_feature_is_active(spa,
6814 SPA_FEATURE_LZ4_COMPRESS); 6814 SPA_FEATURE_LZ4_COMPRESS);
6815 6815
6816 if (lz4_en && !lz4_ac) 6816 if (lz4_en && !lz4_ac)
6817 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 6817 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
6818 } 6818 }
6819 6819
6820 /* 6820 /*
6821 * If we haven't written the salt, do so now. Note that the 6821 * If we haven't written the salt, do so now. Note that the
6822 * feature may not be activated yet, but that's fine since 6822 * feature may not be activated yet, but that's fine since
6823 * the presence of this ZAP entry is backwards compatible. 6823 * the presence of this ZAP entry is backwards compatible.
6824 */ 6824 */
6825 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 6825 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
6826 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 6826 DMU_POOL_CHECKSUM_SALT) == ENOENT) {
6827 VERIFY0(zap_add(spa->spa_meta_objset, 6827 VERIFY0(zap_add(spa->spa_meta_objset,
6828 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 6828 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
6829 sizeof (spa->spa_cksum_salt.zcs_bytes), 6829 sizeof (spa->spa_cksum_salt.zcs_bytes),
6830 spa->spa_cksum_salt.zcs_bytes, tx)); 6830 spa->spa_cksum_salt.zcs_bytes, tx));
6831 } 6831 }
6832 6832
6833 rrw_exit(&dp->dp_config_rwlock, FTAG); 6833 rrw_exit(&dp->dp_config_rwlock, FTAG);
6834} 6834}
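
Each upgrade hook above is edge-triggered: it runs only in the txg where the last synced uberblock version is still below a feature's threshold while the in-core version has already crossed it, so the one-time work cannot repeat. A minimal sketch of that check follows, using a made-up TOY_VERSION_ORIGIN constant rather than the real SPA_VERSION_* values.

#include <stdint.h>
#include <stdio.h>

#define TOY_VERSION_ORIGIN      11      /* made-up threshold, not the real SPA_VERSION_ORIGIN */

/*
 * One-time upgrades are edge-triggered: run only in the txg where the
 * last synced version is still old but the in-core version is new.
 */
static void
toy_sync_upgrades(uint64_t last_synced_version, uint64_t current_version)
{
        if (last_synced_version < TOY_VERSION_ORIGIN &&
            current_version >= TOY_VERSION_ORIGIN)
                printf("running one-time origin upgrade\n");
        else
                printf("no upgrade work this txg\n");
}

int
main(void)
{
        toy_sync_upgrades(10, 11);      /* crossing txg: the hook fires once */
        toy_sync_upgrades(11, 11);      /* subsequent txgs: nothing to do */
        return (0);
}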
6835 6835
6836/* 6836/*
6837 * Sync the specified transaction group. New blocks may be dirtied as 6837 * Sync the specified transaction group. New blocks may be dirtied as
6838 * part of the process, so we iterate until it converges. 6838 * part of the process, so we iterate until it converges.
6839 */ 6839 */
6840 6840
6841void 6841void
6842spa_sync(spa_t *spa, uint64_t txg) 6842spa_sync(spa_t *spa, uint64_t txg)
6843{ 6843{
6844 dsl_pool_t *dp = spa->spa_dsl_pool; 6844 dsl_pool_t *dp = spa->spa_dsl_pool;
6845 objset_t *mos = spa->spa_meta_objset; 6845 objset_t *mos = spa->spa_meta_objset;
6846 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 6846 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
6847 vdev_t *rvd = spa->spa_root_vdev; 6847 vdev_t *rvd = spa->spa_root_vdev;
6848 vdev_t *vd; 6848 vdev_t *vd;
6849 dmu_tx_t *tx; 6849 dmu_tx_t *tx;
6850 int error; 6850 int error;
6851 uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 6851 uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
6852 zfs_vdev_queue_depth_pct / 100; 6852 zfs_vdev_queue_depth_pct / 100;
6853 6853
6854 VERIFY(spa_writeable(spa)); 6854 VERIFY(spa_writeable(spa));
6855 6855
6856 /* 6856 /*
6857 * Lock out configuration changes. 6857 * Lock out configuration changes.
6858 */ 6858 */
6859 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6859 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6860 6860
6861 spa->spa_syncing_txg = txg; 6861 spa->spa_syncing_txg = txg;
6862 spa->spa_sync_pass = 0; 6862 spa->spa_sync_pass = 0;
6863 6863
6864 mutex_enter(&spa->spa_alloc_lock); 6864 mutex_enter(&spa->spa_alloc_lock);
6865 VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); 6865 VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
6866 mutex_exit(&spa->spa_alloc_lock); 6866 mutex_exit(&spa->spa_alloc_lock);
6867 6867
6868 /* 6868 /*
6869 * If there are any pending vdev state changes, convert them 6869 * If there are any pending vdev state changes, convert them
6870 * into config changes that go out with this transaction group. 6870 * into config changes that go out with this transaction group.
6871 */ 6871 */
6872 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6872 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6873 while (list_head(&spa->spa_state_dirty_list) != NULL) { 6873 while (list_head(&spa->spa_state_dirty_list) != NULL) {
6874 /* 6874 /*
6875 * We need the write lock here because, for aux vdevs, 6875 * We need the write lock here because, for aux vdevs,
6876 * calling vdev_config_dirty() modifies sav_config. 6876 * calling vdev_config_dirty() modifies sav_config.
6877 * This is ugly and will become unnecessary when we 6877 * This is ugly and will become unnecessary when we
6878 * eliminate the aux vdev wart by integrating all vdevs 6878 * eliminate the aux vdev wart by integrating all vdevs
6879 * into the root vdev tree. 6879 * into the root vdev tree.
6880 */ 6880 */
6881 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6881 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6882 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 6882 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
6883 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 6883 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
6884 vdev_state_clean(vd); 6884 vdev_state_clean(vd);
6885 vdev_config_dirty(vd); 6885 vdev_config_dirty(vd);
6886 } 6886 }
6887 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6887 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6888 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6888 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
6889 } 6889 }
6890 spa_config_exit(spa, SCL_STATE, FTAG); 6890 spa_config_exit(spa, SCL_STATE, FTAG);
6891 6891
6892 tx = dmu_tx_create_assigned(dp, txg); 6892 tx = dmu_tx_create_assigned(dp, txg);
6893 6893
6894 spa->spa_sync_starttime = gethrtime(); 6894 spa->spa_sync_starttime = gethrtime();
6895#ifdef illumos 6895#ifdef illumos
6896 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 6896 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
6897 spa->spa_sync_starttime + spa->spa_deadman_synctime)); 6897 spa->spa_sync_starttime + spa->spa_deadman_synctime));
6898#endif /* illumos */ 6898#endif /* illumos */
6899#ifdef __FreeBSD__ 6899#ifdef __FreeBSD__
6900#ifdef _KERNEL 6900#ifdef _KERNEL
6901 callout_schedule(&spa->spa_deadman_cycid, 6901 callout_schedule(&spa->spa_deadman_cycid,
6902 hz * spa->spa_deadman_synctime / NANOSEC); 6902 hz * spa->spa_deadman_synctime / NANOSEC);
6903#endif 6903#endif
6904#endif /* __FreeBSD__ */ 6904#endif /* __FreeBSD__ */
6905#ifdef __NetBSD__ 6905#ifdef __NetBSD__
6906#ifdef _KERNEL 6906#ifdef _KERNEL
6907 callout_schedule(&spa->spa_deadman_cycid, 6907 callout_schedule(&spa->spa_deadman_cycid,
6908 hz * spa->spa_deadman_synctime / NANOSEC); 6908 hz * spa->spa_deadman_synctime / NANOSEC);
6909#endif 6909#endif
6910#endif 6910#endif
6911 6911
6912 /* 6912 /*
6913 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 6913 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
6914 * set spa_deflate if we have no raid-z vdevs. 6914 * set spa_deflate if we have no raid-z vdevs.
6915 */ 6915 */
6916 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 6916 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
6917 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 6917 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
6918 int i; 6918 int i;
6919 6919
6920 for (i = 0; i < rvd->vdev_children; i++) { 6920 for (i = 0; i < rvd->vdev_children; i++) {
6921 vd = rvd->vdev_child[i]; 6921 vd = rvd->vdev_child[i];
6922 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6922 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
6923 break; 6923 break;
6924 } 6924 }
6925 if (i == rvd->vdev_children) { 6925 if (i == rvd->vdev_children) {
6926 spa->spa_deflate = TRUE; 6926 spa->spa_deflate = TRUE;
6927 VERIFY(0 == zap_add(spa->spa_meta_objset, 6927 VERIFY(0 == zap_add(spa->spa_meta_objset,
6928 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6928 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
6929 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6929 sizeof (uint64_t), 1, &spa->spa_deflate, tx));
6930 } 6930 }
6931 } 6931 }
6932 6932
6933 /* 6933 /*
6934 * Set the top-level vdev's max queue depth. Evaluate each 6934 * Set the top-level vdev's max queue depth. Evaluate each
6935 * top-level's async write queue depth in case it changed. 6935 * top-level's async write queue depth in case it changed.
6936 * The max queue depth will not change in the middle of syncing 6936 * The max queue depth will not change in the middle of syncing
6937 * out this txg. 6937 * out this txg.
6938 */ 6938 */
6939 uint64_t queue_depth_total = 0; 6939 uint64_t queue_depth_total = 0;
6940 for (int c = 0; c < rvd->vdev_children; c++) { 6940 for (int c = 0; c < rvd->vdev_children; c++) {
6941 vdev_t *tvd = rvd->vdev_child[c]; 6941 vdev_t *tvd = rvd->vdev_child[c];
6942 metaslab_group_t *mg = tvd->vdev_mg; 6942 metaslab_group_t *mg = tvd->vdev_mg;
6943 6943
6944 if (mg == NULL || mg->mg_class != spa_normal_class(spa) || 6944 if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
6945 !metaslab_group_initialized(mg)) 6945 !metaslab_group_initialized(mg))
6946 continue; 6946 continue;
6947 6947
6948 /* 6948 /*
6949 * It is safe to do a lock-free check here because only async 6949 * It is safe to do a lock-free check here because only async
6950 * allocations look at mg_max_alloc_queue_depth, and async 6950 * allocations look at mg_max_alloc_queue_depth, and async
6951 * allocations all happen from spa_sync(). 6951 * allocations all happen from spa_sync().
6952 */ 6952 */
6953 ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); 6953 ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
6954 mg->mg_max_alloc_queue_depth = max_queue_depth; 6954 mg->mg_max_alloc_queue_depth = max_queue_depth;
6955 queue_depth_total += mg->mg_max_alloc_queue_depth; 6955 queue_depth_total += mg->mg_max_alloc_queue_depth;
6956 } 6956 }
6957 metaslab_class_t *mc = spa_normal_class(spa); 6957 metaslab_class_t *mc = spa_normal_class(spa);
6958 ASSERT0(refcount_count(&mc->mc_alloc_slots)); 6958 ASSERT0(refcount_count(&mc->mc_alloc_slots));
6959 mc->mc_alloc_max_slots = queue_depth_total; 6959 mc->mc_alloc_max_slots = queue_depth_total;
6960 mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 6960 mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
6961 6961
6962 ASSERT3U(mc->mc_alloc_max_slots, <=, 6962 ASSERT3U(mc->mc_alloc_max_slots, <=,
6963 max_queue_depth * rvd->vdev_children); 6963 max_queue_depth * rvd->vdev_children);
6964 6964
6965 /* 6965 /*
6966 * Iterate to convergence. 6966 * Iterate to convergence.
6967 */ 6967 */
6968 do { 6968 do {
6969 int pass = ++spa->spa_sync_pass; 6969 int pass = ++spa->spa_sync_pass;
6970 6970
6971 spa_sync_config_object(spa, tx); 6971 spa_sync_config_object(spa, tx);
6972 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6972 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
6973 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6973 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
6974 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6974 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
6975 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6975 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
6976 spa_errlog_sync(spa, txg); 6976 spa_errlog_sync(spa, txg);
6977 dsl_pool_sync(dp, txg); 6977 dsl_pool_sync(dp, txg);
6978 6978
6979 if (pass < zfs_sync_pass_deferred_free) { 6979 if (pass < zfs_sync_pass_deferred_free) {
6980 spa_sync_frees(spa, free_bpl, tx); 6980 spa_sync_frees(spa, free_bpl, tx);
6981 } else { 6981 } else {
6982 /* 6982 /*
6983 * We can not defer frees in pass 1, because 6983 * We can not defer frees in pass 1, because
6984 * we sync the deferred frees later in pass 1. 6984 * we sync the deferred frees later in pass 1.
6985 */ 6985 */
6986 ASSERT3U(pass, >, 1); 6986 ASSERT3U(pass, >, 1);
6987 bplist_iterate(free_bpl, bpobj_enqueue_cb, 6987 bplist_iterate(free_bpl, bpobj_enqueue_cb,
6988 &spa->spa_deferred_bpobj, tx); 6988 &spa->spa_deferred_bpobj, tx);
6989 } 6989 }
6990 6990
6991 ddt_sync(spa, txg); 6991 ddt_sync(spa, txg);
6992 dsl_scan_sync(dp, tx); 6992 dsl_scan_sync(dp, tx);
6993 6993
6994 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) != NULL) 6994 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) != NULL)
6995 vdev_sync(vd, txg); 6995 vdev_sync(vd, txg);
6996 6996
6997 if (pass == 1) { 6997 if (pass == 1) {
6998 spa_sync_upgrades(spa, tx); 6998 spa_sync_upgrades(spa, tx);
6999 ASSERT3U(txg, >=, 6999 ASSERT3U(txg, >=,
7000 spa->spa_uberblock.ub_rootbp.blk_birth); 7000 spa->spa_uberblock.ub_rootbp.blk_birth);
7001 /* 7001 /*
7002 * Note: We need to check if the MOS is dirty 7002 * Note: We need to check if the MOS is dirty
7003 * because we could have marked the MOS dirty 7003 * because we could have marked the MOS dirty
7004 * without updating the uberblock (e.g. if we 7004 * without updating the uberblock (e.g. if we
7005 * have sync tasks but no dirty user data). We 7005 * have sync tasks but no dirty user data). We
7006 * need to check the uberblock's rootbp because 7006 * need to check the uberblock's rootbp because
7007 * it is updated if we have synced out dirty 7007 * it is updated if we have synced out dirty
7008 * data (though in this case the MOS will most 7008 * data (though in this case the MOS will most
7009 * likely also be dirty due to second order 7009 * likely also be dirty due to second order
7010 * effects, we don't want to rely on that here). 7010 * effects, we don't want to rely on that here).
7011 */ 7011 */
7012 if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && 7012 if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
7013 !dmu_objset_is_dirty(mos, txg)) { 7013 !dmu_objset_is_dirty(mos, txg)) {
7014 /* 7014 /*
7015 * Nothing changed on the first pass, 7015 * Nothing changed on the first pass,
7016 * therefore this TXG is a no-op. Avoid 7016 * therefore this TXG is a no-op. Avoid
7017 * syncing deferred frees, so that we 7017 * syncing deferred frees, so that we
7018 * can keep this TXG as a no-op. 7018 * can keep this TXG as a no-op.
7019 */ 7019 */
7020 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, 7020 ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
7021 txg)); 7021 txg));
7022 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 7022 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
7023 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 7023 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
7024 break; 7024 break;
7025 } 7025 }
7026 spa_sync_deferred_frees(spa, tx); 7026 spa_sync_deferred_frees(spa, tx);
7027 } 7027 }
7028 7028
7029 } while (dmu_objset_is_dirty(mos, txg)); 7029 } while (dmu_objset_is_dirty(mos, txg));
7030 7030
7031 if (!list_is_empty(&spa->spa_config_dirty_list)) { 7031 if (!list_is_empty(&spa->spa_config_dirty_list)) {
7032 /* 7032 /*
7033 * Make sure that the number of ZAPs for all the vdevs matches 7033 * Make sure that the number of ZAPs for all the vdevs matches
7034 * the number of ZAPs in the per-vdev ZAP list. This only gets 7034 * the number of ZAPs in the per-vdev ZAP list. This only gets
7035 * called if the config is dirty; otherwise there may be 7035 * called if the config is dirty; otherwise there may be
7036 * outstanding AVZ operations that weren't completed in 7036 * outstanding AVZ operations that weren't completed in
7037 * spa_sync_config_object. 7037 * spa_sync_config_object.
7038 */ 7038 */
7039 uint64_t all_vdev_zap_entry_count; 7039 uint64_t all_vdev_zap_entry_count;
7040 ASSERT0(zap_count(spa->spa_meta_objset, 7040 ASSERT0(zap_count(spa->spa_meta_objset,
7041 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 7041 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
7042 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 7042 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
7043 all_vdev_zap_entry_count); 7043 all_vdev_zap_entry_count);
7044 } 7044 }
7045 7045
7046 /* 7046 /*
7047 * Rewrite the vdev configuration (which includes the uberblock) 7047 * Rewrite the vdev configuration (which includes the uberblock)
7048 * to commit the transaction group. 7048 * to commit the transaction group.
7049 * 7049 *
7050 * If there are no dirty vdevs, we sync the uberblock to a few 7050 * If there are no dirty vdevs, we sync the uberblock to a few
7051 * random top-level vdevs that are known to be visible in the 7051 * random top-level vdevs that are known to be visible in the
7052 * config cache (see spa_vdev_add() for a complete description). 7052 * config cache (see spa_vdev_add() for a complete description).
7053 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 7053 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
7054 */ 7054 */
7055 for (;;) { 7055 for (;;) {
7056 /* 7056 /*
7057 * We hold SCL_STATE to prevent vdev open/close/etc. 7057 * We hold SCL_STATE to prevent vdev open/close/etc.
7058 * while we're attempting to write the vdev labels. 7058 * while we're attempting to write the vdev labels.
7059 */ 7059 */
7060 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 7060 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
7061 7061
7062 if (list_is_empty(&spa->spa_config_dirty_list)) { 7062 if (list_is_empty(&spa->spa_config_dirty_list)) {
7063 vdev_t *svd[SPA_DVAS_PER_BP]; 7063 vdev_t *svd[SPA_DVAS_PER_BP];
7064 int svdcount = 0; 7064 int svdcount = 0;
7065 int children = rvd->vdev_children; 7065 int children = rvd->vdev_children;
7066 int c0 = spa_get_random(children); 7066 int c0 = spa_get_random(children);
7067 7067
7068 for (int c = 0; c < children; c++) { 7068 for (int c = 0; c < children; c++) {
7069 vd = rvd->vdev_child[(c0 + c) % children]; 7069 vd = rvd->vdev_child[(c0 + c) % children];
7070 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 7070 if (vd->vdev_ms_array == 0 || vd->vdev_islog)
7071 continue; 7071 continue;
7072 svd[svdcount++] = vd; 7072 svd[svdcount++] = vd;
7073 if (svdcount == SPA_DVAS_PER_BP) 7073 if (svdcount == SPA_DVAS_PER_BP)
7074 break; 7074 break;
7075 } 7075 }
7076 error = vdev_config_sync(svd, svdcount, txg); 7076 error = vdev_config_sync(svd, svdcount, txg);
7077 } else { 7077 } else {
7078 error = vdev_config_sync(rvd->vdev_child, 7078 error = vdev_config_sync(rvd->vdev_child,
7079 rvd->vdev_children, txg); 7079 rvd->vdev_children, txg);
7080 } 7080 }
7081 7081
7082 if (error == 0) 7082 if (error == 0)
7083 spa->spa_last_synced_guid = rvd->vdev_guid; 7083 spa->spa_last_synced_guid = rvd->vdev_guid;
7084 7084
7085 spa_config_exit(spa, SCL_STATE, FTAG); 7085 spa_config_exit(spa, SCL_STATE, FTAG);
7086 7086
7087 if (error == 0) 7087 if (error == 0)
7088 break; 7088 break;
7089 zio_suspend(spa, NULL); 7089 zio_suspend(spa, NULL);
7090 zio_resume_wait(spa); 7090 zio_resume_wait(spa);
7091 } 7091 }
7092 dmu_tx_commit(tx); 7092 dmu_tx_commit(tx);
7093 7093
7094#ifdef illumos 7094#ifdef illumos
7095 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 7095 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
7096#endif /* illumos */ 7096#endif /* illumos */
7097#ifdef __FreeBSD__ 7097#ifdef __FreeBSD__
7098#ifdef _KERNEL 7098#ifdef _KERNEL
7099 callout_drain(&spa->spa_deadman_cycid); 7099 callout_drain(&spa->spa_deadman_cycid);
7100#endif 7100#endif
7101#endif /* __FreeBSD__ */ 7101#endif /* __FreeBSD__ */
7102#ifdef __NetBSD__ 7102#ifdef __NetBSD__
7103#ifdef _KERNEL 7103#ifdef _KERNEL
7104 callout_drain(&spa->spa_deadman_cycid); 7104 callout_drain(&spa->spa_deadman_cycid);
7105#endif 7105#endif
7106#endif /* __NetBSD__ */ 7106#endif /* __NetBSD__ */
7107 7107
7108 /* 7108 /*
7109 * Clear the dirty config list. 7109 * Clear the dirty config list.
7110 */ 7110 */
7111 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 7111 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
7112 vdev_config_clean(vd); 7112 vdev_config_clean(vd);
7113 7113
7114 /* 7114 /*
7115 * Now that the new config has synced transactionally, 7115 * Now that the new config has synced transactionally,
7116 * let it become visible to the config cache. 7116 * let it become visible to the config cache.
7117 */ 7117 */
7118 if (spa->spa_config_syncing != NULL) { 7118 if (spa->spa_config_syncing != NULL) {
7119 spa_config_set(spa, spa->spa_config_syncing); 7119 spa_config_set(spa, spa->spa_config_syncing);
7120 spa->spa_config_txg = txg; 7120 spa->spa_config_txg = txg;
7121 spa->spa_config_syncing = NULL; 7121 spa->spa_config_syncing = NULL;
7122 } 7122 }
7123 7123
7124 dsl_pool_sync_done(dp, txg); 7124 dsl_pool_sync_done(dp, txg);
7125 7125
7126 mutex_enter(&spa->spa_alloc_lock); 7126 mutex_enter(&spa->spa_alloc_lock);
7127 VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); 7127 VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
7128 mutex_exit(&spa->spa_alloc_lock); 7128 mutex_exit(&spa->spa_alloc_lock);
7129 7129
7130 /* 7130 /*
7131 * Update usable space statistics. 7131 * Update usable space statistics.
7132 */ 7132 */
7133 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL) 7133 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL)
7134 vdev_sync_done(vd, txg); 7134 vdev_sync_done(vd, txg);
7135 7135
7136 spa_update_dspace(spa); 7136 spa_update_dspace(spa);
7137 7137
7138 /* 7138 /*
7139 * It had better be the case that we didn't dirty anything 7139 * It had better be the case that we didn't dirty anything
7140 * since vdev_config_sync(). 7140 * since vdev_config_sync().
7141 */ 7141 */
7142 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 7142 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
7143 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 7143 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
7144 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 7144 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
7145 7145
7146 spa->spa_sync_pass = 0; 7146 spa->spa_sync_pass = 0;
7147 7147
7148 /* 7148 /*
7149 * Update the last synced uberblock here. We want to do this at 7149 * Update the last synced uberblock here. We want to do this at
7150 * the end of spa_sync() so that consumers of spa_last_synced_txg() 7150 * the end of spa_sync() so that consumers of spa_last_synced_txg()
7151 * will be guaranteed that all the processing associated with 7151 * will be guaranteed that all the processing associated with
7152 * that txg has been completed. 7152 * that txg has been completed.
7153 */ 7153 */
7154 spa->spa_ubsync = spa->spa_uberblock; 7154 spa->spa_ubsync = spa->spa_uberblock;
7155 spa_config_exit(spa, SCL_CONFIG, FTAG); 7155 spa_config_exit(spa, SCL_CONFIG, FTAG);
7156 7156
7157 spa_handle_ignored_writes(spa); 7157 spa_handle_ignored_writes(spa);
7158 7158
7159 /* 7159 /*
7160 * If any async tasks have been requested, kick them off. 7160 * If any async tasks have been requested, kick them off.
7161 */ 7161 */
7162 spa_async_dispatch(spa); 7162 spa_async_dispatch(spa);
7163 spa_async_dispatch_vd(spa); 7163 spa_async_dispatch_vd(spa);
7164} 7164}
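
spa_sync() keeps taking passes until nothing in the MOS is dirty any more, and once the pass count reaches zfs_sync_pass_deferred_free it stops freeing blocks synchronously and defers them instead so the txg can still converge. Below is a minimal, self-contained sketch of that convergence pattern; the toy_* names and the fake dirty-work counter are illustrative stand-ins, not the real DMU state.

#include <stdbool.h>
#include <stdio.h>

#define TOY_SYNC_PASS_DEFERRED_FREE     2       /* plays the role of zfs_sync_pass_deferred_free */

static int dirty_work = 5;      /* pretend units of dirty MOS state */
static int deferred_frees;

/*
 * One sync pass: write out the dirty state, but writing dirties a
 * little more (second-order effects), so the caller iterates until
 * nothing is left.  Early passes free blocks synchronously; later
 * passes defer them to keep the txg converging.
 */
static bool
toy_sync_pass(int pass)
{
        int newly_dirtied = dirty_work / 2;

        if (pass < TOY_SYNC_PASS_DEFERRED_FREE)
                printf("pass %d: freeing blocks now\n", pass);
        else
                deferred_frees++;

        dirty_work = newly_dirtied;
        return (dirty_work > 0);
}

int
main(void)
{
        int pass = 0;

        do {
                pass++;
        } while (toy_sync_pass(pass));

        printf("converged after %d passes, %d deferred frees\n",
            pass, deferred_frees);
        return (0);
}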
7165 7165
7166/* 7166/*
7167 * Sync all pools. We don't want to hold the namespace lock across these 7167 * Sync all pools. We don't want to hold the namespace lock across these
7168 * operations, so we take a reference on the spa_t and drop the lock during the 7168 * operations, so we take a reference on the spa_t and drop the lock during the
7169 * sync. 7169 * sync.
7170 */ 7170 */
7171void 7171void
7172spa_sync_allpools(void) 7172spa_sync_allpools(void)
7173{ 7173{
7174 spa_t *spa = NULL; 7174 spa_t *spa = NULL;
7175 mutex_enter(&spa_namespace_lock); 7175 mutex_enter(&spa_namespace_lock);
7176 while ((spa = spa_next(spa)) != NULL) { 7176 while ((spa = spa_next(spa)) != NULL) {
7177 if (spa_state(spa) != POOL_STATE_ACTIVE || 7177 if (spa_state(spa) != POOL_STATE_ACTIVE ||
7178 !spa_writeable(spa) || spa_suspended(spa)) 7178 !spa_writeable(spa) || spa_suspended(spa))
7179 continue; 7179 continue;
7180 spa_open_ref(spa, FTAG); 7180 spa_open_ref(spa, FTAG);
7181 mutex_exit(&spa_namespace_lock); 7181 mutex_exit(&spa_namespace_lock);
7182 txg_wait_synced(spa_get_dsl(spa), 0); 7182 txg_wait_synced(spa_get_dsl(spa), 0);
7183 mutex_enter(&spa_namespace_lock); 7183 mutex_enter(&spa_namespace_lock);
7184 spa_close(spa, FTAG); 7184 spa_close(spa, FTAG);
7185 } 7185 }
7186 mutex_exit(&spa_namespace_lock); 7186 mutex_exit(&spa_namespace_lock);
7187} 7187}
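
spa_sync_allpools() shows a common locking pattern: hold spa_namespace_lock only long enough to pick the next pool and take a reference on it, drop the lock across the slow txg_wait_synced() call, then re-take it to release the reference. A minimal pthread sketch of the same ref-then-drop-lock iteration follows, with a hypothetical toy_pool array standing in for the spa namespace.

#include <pthread.h>
#include <stdio.h>

/* Toy stand-in for the spa namespace: a fixed array instead of an AVL tree. */
struct toy_pool {
        const char *name;
        int refcount;
};

static pthread_mutex_t namespace_lock = PTHREAD_MUTEX_INITIALIZER;
static struct toy_pool pools[] = { { "tank", 0 }, { "dozer", 0 } };

static void
slow_sync(struct toy_pool *p)
{
        /* Stands in for txg_wait_synced(); must not hold namespace_lock here. */
        printf("syncing %s\n", p->name);
}

static void
toy_sync_allpools(void)
{
        for (size_t i = 0; i < sizeof(pools) / sizeof(pools[0]); i++) {
                struct toy_pool *p;

                pthread_mutex_lock(&namespace_lock);
                p = &pools[i];
                p->refcount++;                  /* keep the pool alive */
                pthread_mutex_unlock(&namespace_lock);

                slow_sync(p);                   /* long operation, lock dropped */

                pthread_mutex_lock(&namespace_lock);
                p->refcount--;
                pthread_mutex_unlock(&namespace_lock);
        }
}

int
main(void)
{
        toy_sync_allpools();
        return (0);
}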
7188 7188
7189/* 7189/*
7190 * ========================================================================== 7190 * ==========================================================================
7191 * Miscellaneous routines 7191 * Miscellaneous routines
7192 * ========================================================================== 7192 * ==========================================================================
7193 */ 7193 */
7194 7194
7195/* 7195/*
7196 * Remove all pools in the system. 7196 * Remove all pools in the system.
7197 */ 7197 */
7198void 7198void
7199spa_evict_all(void) 7199spa_evict_all(void)
7200{ 7200{
7201 spa_t *spa; 7201 spa_t *spa;
7202 7202
7203 /* 7203 /*
7204 * Remove all cached state. All pools should be closed now, 7204 * Remove all cached state. All pools should be closed now,
7205 * so every spa in the AVL tree should be unreferenced. 7205 * so every spa in the AVL tree should be unreferenced.
7206 */ 7206 */
7207 mutex_enter(&spa_namespace_lock); 7207 mutex_enter(&spa_namespace_lock);
7208 while ((spa = spa_next(NULL)) != NULL) { 7208 while ((spa = spa_next(NULL)) != NULL) {
7209 /* 7209 /*
7210 * Stop async tasks. The async thread may need to detach 7210 * Stop async tasks. The async thread may need to detach
7211 * a device that's been replaced, which requires grabbing 7211 * a device that's been replaced, which requires grabbing
7212 * spa_namespace_lock, so we must drop it here. 7212 * spa_namespace_lock, so we must drop it here.
7213 */ 7213 */
7214 spa_open_ref(spa, FTAG); 7214 spa_open_ref(spa, FTAG);
7215 mutex_exit(&spa_namespace_lock); 7215 mutex_exit(&spa_namespace_lock);
7216 spa_async_suspend(spa); 7216 spa_async_suspend(spa);
7217 mutex_enter(&spa_namespace_lock); 7217 mutex_enter(&spa_namespace_lock);
7218 spa_close(spa, FTAG); 7218 spa_close(spa, FTAG);
7219 7219
7220 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 7220 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
7221 spa_unload(spa); 7221 spa_unload(spa);
7222 spa_deactivate(spa); 7222 spa_deactivate(spa);
7223 } 7223 }
7224 spa_remove(spa); 7224 spa_remove(spa);
7225 } 7225 }
7226 mutex_exit(&spa_namespace_lock); 7226 mutex_exit(&spa_namespace_lock);
7227} 7227}
7228 7228
7229vdev_t * 7229vdev_t *
7230spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 7230spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
7231{ 7231{
7232 vdev_t *vd; 7232 vdev_t *vd;
7233 int i; 7233 int i;
7234 7234
7235 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 7235 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
7236 return (vd); 7236 return (vd);
7237 7237
7238 if (aux) { 7238 if (aux) {
7239 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 7239 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
7240 vd = spa->spa_l2cache.sav_vdevs[i]; 7240 vd = spa->spa_l2cache.sav_vdevs[i];
7241 if (vd->vdev_guid == guid) 7241 if (vd->vdev_guid == guid)
7242 return (vd); 7242 return (vd);
7243 } 7243 }
7244 7244
7245 for (i = 0; i < spa->spa_spares.sav_count; i++) { 7245 for (i = 0; i < spa->spa_spares.sav_count; i++) {
7246 vd = spa->spa_spares.sav_vdevs[i]; 7246 vd = spa->spa_spares.sav_vdevs[i];
7247 if (vd->vdev_guid == guid) 7247 if (vd->vdev_guid == guid)
7248 return (vd); 7248 return (vd);
7249 } 7249 }
7250 } 7250 }
7251 7251
7252 return (NULL); 7252 return (NULL);
7253} 7253}
7254 7254
7255void 7255void
7256spa_upgrade(spa_t *spa, uint64_t version) 7256spa_upgrade(spa_t *spa, uint64_t version)
7257{ 7257{
7258 ASSERT(spa_writeable(spa)); 7258 ASSERT(spa_writeable(spa));
7259 7259
7260 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7260 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
7261 7261
7262 /* 7262 /*
7263 * This should only be called for a non-faulted pool, and since a 7263 * This should only be called for a non-faulted pool, and since a
7264 * future version would result in an unopenable pool, this shouldn't be 7264 * future version would result in an unopenable pool, this shouldn't be
7265 * possible. 7265 * possible.
7266 */ 7266 */
7267 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 7267 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
7268 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 7268 ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
7269 7269
7270 spa->spa_uberblock.ub_version = version; 7270 spa->spa_uberblock.ub_version = version;
7271 vdev_config_dirty(spa->spa_root_vdev); 7271 vdev_config_dirty(spa->spa_root_vdev);
7272 7272
7273 spa_config_exit(spa, SCL_ALL, FTAG); 7273 spa_config_exit(spa, SCL_ALL, FTAG);
7274 7274
7275 txg_wait_synced(spa_get_dsl(spa), 0); 7275 txg_wait_synced(spa_get_dsl(spa), 0);
7276} 7276}
7277 7277
7278boolean_t 7278boolean_t
7279spa_has_spare(spa_t *spa, uint64_t guid) 7279spa_has_spare(spa_t *spa, uint64_t guid)
7280{ 7280{
7281 int i; 7281 int i;
7282 uint64_t spareguid; 7282 uint64_t spareguid;
7283 spa_aux_vdev_t *sav = &spa->spa_spares; 7283 spa_aux_vdev_t *sav = &spa->spa_spares;
7284 7284
7285 for (i = 0; i < sav->sav_count; i++) 7285 for (i = 0; i < sav->sav_count; i++)
7286 if (sav->sav_vdevs[i]->vdev_guid == guid) 7286 if (sav->sav_vdevs[i]->vdev_guid == guid)
7287 return (B_TRUE); 7287 return (B_TRUE);
7288 7288
7289 for (i = 0; i < sav->sav_npending; i++) { 7289 for (i = 0; i < sav->sav_npending; i++) {
7290 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 7290 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
7291 &spareguid) == 0 && spareguid == guid) 7291 &spareguid) == 0 && spareguid == guid)
7292 return (B_TRUE); 7292 return (B_TRUE);
7293 } 7293 }
7294 7294
7295 return (B_FALSE); 7295 return (B_FALSE);