- * Get the replaceable or attachable device size.
- * If the parent is a mirror or raidz, the replaceable size is the minimum
- * psize of all its children. For the rest, just return our own psize.
+ * Get the minimum allocatable size. We define the allocatable size as
+ * the vdev's asize rounded to the nearest metaslab. This allows us to
+ * replace or attach devices which don't have the same physical size but
+ * can still satisfy the same number of allocations.

-vdev_get_rsize(vdev_t *vd)
-    pvd = vd->vdev_parent;
-     * If our parent is NULL or the root, just return our own psize.
-    if (pvd == NULL || pvd->vdev_parent == NULL)
-        return (vd->vdev_psize);
-    for (c = 0; c < pvd->vdev_children; c++) {
-        cvd = pvd->vdev_child[c];
-        rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;

+vdev_get_min_asize(vdev_t *vd)
+    vdev_t *pvd = vd->vdev_parent;
+     * If our parent is NULL (inactive spare or cache) or is the root,
+     * just return our own asize.
+        return (vd->vdev_asize);
+     * The top-level vdev just returns the allocatable size rounded
+     * to the nearest metaslab.
+    if (vd == vd->vdev_top)
+        return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
+     * The allocatable space for a raidz vdev is N * sizeof(smallest child),
+     * so each child must provide at least 1/Nth of its asize.
+    if (pvd->vdev_ops == &vdev_raidz_ops)
+        return (pvd->vdev_min_asize / pvd->vdev_children);
+    return (pvd->vdev_min_asize);

+vdev_set_min_asize(vdev_t *vd)
+    vd->vdev_min_asize = vdev_get_min_asize(vd);
+    for (int c = 0; c < vd->vdev_children; c++)
+        vdev_set_min_asize(vd->vdev_child[c]);
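The two sizing rules above are easy to sanity-check outside the kernel: a top-level vdev rounds its asize down to a whole metaslab, and each raidz child owes 1/Nth of the parent's minimum. A standalone sketch, with a local stand-in for the kernel's P2ALIGN() macro and made-up sizes:

#include <stdio.h>
#include <stdint.h>

/* Local stand-in for sysmacros.h P2ALIGN(): round x down to a power-of-2 boundary. */
#define P2ALIGN(x, align)	((x) & -(align))

int
main(void)
{
	uint64_t ms_shift = 30;					/* hypothetical 1 GB metaslabs */
	uint64_t asize = (100ULL << 30) + (123ULL << 20);	/* a ~100.1 GB child */

	/* Top-level vdev: allocatable size rounds down to a whole metaslab. */
	uint64_t top_min = P2ALIGN(asize, 1ULL << ms_shift);
	printf("top-level min_asize = %llu GB\n",
	    (unsigned long long)(top_min >> 30));		/* prints 100 */

	/* raidz child: must supply 1/Nth of the parent's min_asize. */
	uint64_t children = 5;
	printf("raidz child min_asize = %llu GB\n",
	    (unsigned long long)((top_min / children) >> 30));	/* prints 20 */
	return (0);
}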
     vdev_t **newchild, *cvd;
     int oldc = pvd->vdev_children;

     ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

-    for (c = newc = 0; c < oldc; c++)
+    for (int c = newc = 0; c < oldc; c++)
         if (pvd->vdev_child[c])

     newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

-    for (c = newc = 0; c < oldc; c++) {
+    for (int c = newc = 0; c < oldc; c++) {
         if ((cvd = pvd->vdev_child[c]) != NULL) {
             newchild[newc] = cvd;
             cvd->vdev_id = newc++;
     spa->spa_root_vdev = vd;

+    if (guid == 0 && ops != &vdev_hole_ops) {
         if (spa->spa_root_vdev == vd) {
              * The root vdev's guid will also be the pool guid,
              * which must be unique among all pools.
-            while (guid == 0 || spa_guid_exists(guid, 0))
-                guid = spa_get_random(-1ULL);
+            guid = spa_generate_guid(NULL);

              * Any other vdev's guid must be unique within the pool.
-                spa_guid_exists(spa_guid(spa), guid))
-                guid = spa_get_random(-1ULL);
+            guid = spa_generate_guid(spa);

     ASSERT(!spa_guid_exists(spa_guid(spa), guid));

     vd->vdev_guid_sum = guid;
     vd->vdev_ops = ops;
     vd->vdev_state = VDEV_STATE_CLOSED;
+    vd->vdev_ishole = (ops == &vdev_hole_ops);

     mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
     mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
     if (ops == &vdev_raidz_ops) {
         if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
             &nparity) == 0) {
-             * Currently, we can only support 2 parity devices.
-            if (nparity == 0 || nparity > 2)
+            if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
-             * Older versions can only support 1 parity device.
+             * Previous versions could only support 1 or 2 parity
-                spa_version(spa) < SPA_VERSION_RAID6)
+                spa_version(spa) < SPA_VERSION_RAIDZ2)
+                spa_version(spa) < SPA_VERSION_RAIDZ3)
                 return (ENOTSUP);
              * We require the parity to be specified for SPAs that
              * support multiple parity levels.
-            if (spa_version(spa) >= SPA_VERSION_RAID6)
+            if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
              * Otherwise, we default to 1 parity device for RAID-Z.
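The version gating above reduces to a small decision table. A hedged restatement in plain C, not from vdev.c itself; the SPA_VERSION_* values are placeholders rather than the real on-disk version numbers, while VDEV_RAIDZ_MAXPARITY is 3 in this code:

#include <errno.h>
#include <stdint.h>

#define	VDEV_RAIDZ_MAXPARITY	3
#define	SPA_VERSION_RAIDZ2	3	/* placeholder version numbers */
#define	SPA_VERSION_RAIDZ3	17

/* Returns 0 and sets *nparityp, or an errno for an unsupported request. */
static int
raidz_pick_nparity(uint64_t spa_version, int nparity_specified,
    uint64_t requested, uint64_t *nparityp)
{
	if (nparity_specified) {
		if (requested == 0 || requested > VDEV_RAIDZ_MAXPARITY)
			return (EINVAL);
		/* Previous versions could only support 1 or 2 parity devices. */
		if (requested > 1 && spa_version < SPA_VERSION_RAIDZ2)
			return (ENOTSUP);
		if (requested > 2 && spa_version < SPA_VERSION_RAIDZ3)
			return (ENOTSUP);
		*nparityp = requested;
	} else {
		/* Parity must be specified on pools that support several levels. */
		if (spa_version >= SPA_VERSION_RAIDZ2)
			return (EINVAL);
		*nparityp = 1;	/* otherwise default to single-parity RAID-Z */
	}
	return (0);
}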
     (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

+     * Retrieve the vdev creation time.
+    (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,

      * If we're a top-level vdev, try to load the allocation parameters.
-    if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
+    if (parent && !parent->vdev_parent &&
+        (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
             &vd->vdev_ms_array);
         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
             &vd->vdev_asize);
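For reference, the keys being read here are ordinary nvlist pairs. A userland sketch using libnvpair; the literal key strings such as "ashift" and "metaslab_shift" are the usual ZPOOL_CONFIG_* names from sys/fs/zfs.h, and the values are invented:

#include <assert.h>
#include <libnvpair.h>

/* Build a toy top-level vdev config carrying the allocation parameters. */
nvlist_t *
example_toplevel_config(void)
{
	nvlist_t *nv;

	assert(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0);
	assert(nvlist_add_uint64(nv, "ashift", 9) == 0);	   /* 512B sectors */
	assert(nvlist_add_uint64(nv, "metaslab_array", 23) == 0); /* MOS object # */
	assert(nvlist_add_uint64(nv, "metaslab_shift", 30) == 0); /* 1 GB slabs */
	assert(nvlist_add_uint64(nv, "asize", 100ULL << 30) == 0);
	return (nv);
}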
+    if (parent && !parent->vdev_parent) {
+        ASSERT(alloctype == VDEV_ALLOC_LOAD ||
+            alloctype == VDEV_ALLOC_ADD ||
+            alloctype == VDEV_ALLOC_SPLIT ||
+            alloctype == VDEV_ALLOC_ROOTPOOL);
+        vd->vdev_mg = metaslab_group_create(islog ?
+            spa_log_class(spa) : spa_normal_class(spa), vd);
      * If we're a leaf vdev, try to load the DTL object and other state.
     if (vd->vdev_ops->vdev_op_leaf &&
-        (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) {
+        (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
+        alloctype == VDEV_ALLOC_ROOTPOOL)) {
         if (alloctype == VDEV_ALLOC_LOAD) {
             (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
                 &vd->vdev_dtl_smo.smo_object);
             (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
                 &vd->vdev_unspare);

+        if (alloctype == VDEV_ALLOC_ROOTPOOL) {
+            if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
+                &spare) == 0 && spare)

         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
             &vd->vdev_offline);

          * When importing a pool, we want to ignore the persistent fault
          * state, as the diagnosis made on another system may not be
-         * valid in the current context.
+         * valid in the current context. Local vdevs will
+         * remain in the faulted state.
-        if (spa->spa_load_state == SPA_LOAD_OPEN) {
+        if (spa_load_state(spa) == SPA_LOAD_OPEN) {
             (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
                 &vd->vdev_faulted);
             (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
                 &vd->vdev_degraded);
             (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
                 &vd->vdev_removed);

+            if (vd->vdev_faulted || vd->vdev_degraded) {
+                    VDEV_AUX_ERR_EXCEEDED;
+                if (nvlist_lookup_string(nv,
+                    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
+                    strcmp(aux, "external") == 0)
+                    vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
     mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

     mvd->vdev_asize = cvd->vdev_asize;
+    mvd->vdev_min_asize = cvd->vdev_min_asize;
     mvd->vdev_ashift = cvd->vdev_ashift;
     mvd->vdev_state = cvd->vdev_state;
+    mvd->vdev_crtxg = cvd->vdev_crtxg;

     vdev_remove_child(pvd, cvd);
     vdev_add_child(pvd, mvd);
     spa_t *spa = vd->vdev_spa;
     objset_t *mos = spa->spa_meta_objset;
-    metaslab_class_t *mc;
     uint64_t oldc = vd->vdev_ms_count;
     uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
     metaslab_t **mspp;

-    if (vd->vdev_ms_shift == 0)    /* not being allocated from yet */
+    ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+     * This vdev is not being allocated from yet or is a hole.
+    if (vd->vdev_ms_shift == 0)
+    ASSERT(!vd->vdev_ishole);

      * Compute the raidz-deflation ratio. Note, we hard-code
      * in 128k (1 << 17) because it is the current "typical" blocksize.
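To see where the hard-coded 128K matters, here is the arithmetic for one plausible layout: a 5-wide raidz1 with 512-byte sectors. These are illustrative numbers only, not taken from the source:

#include <stdio.h>

/* A 128K block needs 256 data sectors plus 256/4 = 64 parity sectors,
 * so its allocated size on this layout is 320 sectors (160K). */
int
main(void)
{
	unsigned long long lsize = 1ULL << 17;		/* 128K logical */
	unsigned long long asize_sectors = 256 + 64;	/* data + parity */

	/* Ratio of logical bytes to allocated 512-byte sectors. */
	printf("deflate_ratio = %llu\n", lsize / asize_sectors);	/* 409 */
	return (0);
}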
+vdev_open_child(void *arg)
+    vd->vdev_open_thread = curthread;
+    vd->vdev_open_error = vdev_open(vd);
+    vd->vdev_open_thread = NULL;

+vdev_uses_zvols(vdev_t *vd)
+    if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
+        strlen(ZVOL_DIR)) == 0)
+    for (int c = 0; c < vd->vdev_children; c++)
+        if (vdev_uses_zvols(vd->vdev_child[c]))

+vdev_open_children(vdev_t *vd)
+    int children = vd->vdev_children;
+     * in order to handle pools on top of zvols, do the opens
+     * in a single thread so that the same thread holds the
+     * spa_namespace_lock
+    if (vdev_uses_zvols(vd)) {
+        for (int c = 0; c < children; c++)
+            vd->vdev_child[c]->vdev_open_error =
+                vdev_open(vd->vdev_child[c]);
+    tq = taskq_create("vdev_open", children, minclsyspri,
+        children, children, TASKQ_PREPOPULATE);
+    for (int c = 0; c < children; c++)
+        VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
 * Prepare a virtual device for access.
     spa_t *spa = vd->vdev_spa;
     uint64_t osize = 0;
     uint64_t asize, psize;
     uint64_t ashift = 0;

-    ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+    ASSERT(vd->vdev_open_thread == curthread ||
+        spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
     ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
         vd->vdev_state == VDEV_STATE_CANT_OPEN ||
         vd->vdev_state == VDEV_STATE_OFFLINE);

     vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
     vd->vdev_cant_read = B_FALSE;
     vd->vdev_cant_write = B_FALSE;
+    vd->vdev_min_asize = vdev_get_min_asize(vd);

+     * If this vdev is not removed, check its fault status. If it's
+     * faulted, bail out of the open.
     if (!vd->vdev_removed && vd->vdev_faulted) {
         ASSERT(vd->vdev_children == 0);
+        ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+            vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
         vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
-            VDEV_AUX_ERR_EXCEEDED);
+            vd->vdev_label_aux);
         return (ENXIO);
     } else if (vd->vdev_offline) {
-        dprintf("vdev_open(): vd->vdev_offline\n");
         ASSERT(vd->vdev_children == 0);
         vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
         return (ENXIO);
     vd->vdev_removed = B_FALSE;

+     * Recheck the faulted flag now that we have confirmed that
+     * the vdev is accessible. If we're faulted, bail.
+    if (vd->vdev_faulted) {
+        ASSERT(vd->vdev_children == 0);
+        ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+            vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
+        vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+            vd->vdev_label_aux);

     if (vd->vdev_degraded) {
         ASSERT(vd->vdev_children == 0);
         vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
             VDEV_AUX_ERR_EXCEEDED);
-        vd->vdev_state = VDEV_STATE_HEALTHY;
+        vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);

-    for (c = 0; c < vd->vdev_children; c++)
+     * For hole or missing vdevs we just return success.
+    if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
+    for (int c = 0; c < vd->vdev_children; c++) {
         if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
             vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
                 VDEV_AUX_NONE);

     osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
             VDEV_AUX_BAD_LABEL);
         return (EINVAL);

-     * Make sure the device hasn't shrunk.
-    if (asize < vd->vdev_asize) {
-        vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
-            VDEV_AUX_BAD_LABEL);

-     * If all children are healthy and the asize has increased,
-     * then we've experienced dynamic LUN growth.
-    if (vd->vdev_state == VDEV_STATE_HEALTHY &&
-        asize > vd->vdev_asize) {
-        vd->vdev_asize = asize;
+     * If all children are healthy and the asize has increased,
+     * then we've experienced dynamic LUN growth. If automatic
+     * expansion is enabled then use the additional space.
+    /* Force spa_autoexpand = 1 here - it's not initialised at this
+     * point in Linux, and we want it initialised to be able to update
+     * the vdev size here while importing a pool */
+    spa->spa_autoexpand = 1;
+    if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
+        (vd->vdev_expanding || spa->spa_autoexpand))
+        vd->vdev_asize = asize;

+    vdev_set_min_asize(vd);
      * Ensure we can issue some IO before declaring the
      * vdev open for business.

+     * Determine if this vdev has been split off into another
+     * pool. If so, then refuse to open it.
+    if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
+        &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
+        vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+            VDEV_AUX_SPLIT_POOL);

     if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
         &guid) != 0 || guid != spa_guid(spa)) {
         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,

      * Fortunately, either version of the label will have the
      * same top guid, so if we're a top-level vdev, we can
      * safely compare to that instead.
+     * If we split this vdev off instead, then we also check the
+     * original pool's guid. We don't want to consider the vdev
+     * corrupt if it is partway through a split operation.

     if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
         nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
         &top_guid) != 0 ||
-        (vd->vdev_guid != guid &&
+        ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
         (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
             VDEV_AUX_CORRUPT_DATA);
 vdev_close(vdev_t *vd)
     spa_t *spa = vd->vdev_spa;
+    vdev_t *pvd = vd->vdev_parent;

     ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

+     * If our parent is reopening, then we are as well, unless we are
+    if (pvd != NULL && pvd->vdev_reopening)
+        vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);

     vd->vdev_ops->vdev_op_close(vd);

     vdev_cache_purge(vd);

      * We record the previous state before we close it, so that if we are
      * doing a reopen(), we don't generate FMA ereports if we notice that
      * it's still faulted.
- * This is the latter half of vdev_create(). It is distinct because it
- * involves initiating transactions in order to do metaslab creation.
- * For creation, we want to try to create all vdevs at once and then undo it
- * if anything fails; this is much harder if we have pending transactions.

-vdev_init(vdev_t *vd, uint64_t txg)
+vdev_metaslab_set_size(vdev_t *vd)
      * Aim for roughly 200 metaslabs per vdev.
     vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
     vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);

-     * Initialize the vdev's metaslabs. This can't fail because
-     * there's nothing to read when creating all new metaslabs.
-    VERIFY(vdev_metaslab_init(vd, txg) == 0);
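The "roughly 200" target is approximate because highbit() rounds the metaslab size up to a power of two. A standalone sketch with a local highbit() (1-based index of the highest set bit, mirroring the kernel helper) and an invented vdev size:

#include <stdio.h>
#include <stdint.h>

#define	SPA_MAXBLOCKSHIFT	17	/* 128K, as in the comment above */

static int
highbit(uint64_t i)
{
	int h = 0;

	while (i != 0) {	/* 1-based index of the highest set bit */
		h++;
		i >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t asize = 1000ULL << 30;		/* a ~1 TB vdev */
	int ms_shift = highbit(asize / 200);	/* aim for ~200 metaslabs */

	if (ms_shift < SPA_MAXBLOCKSHIFT)	/* the MAX() in the source */
		ms_shift = SPA_MAXBLOCKSHIFT;
	/* Prints: ms_shift=33 -> 125 metaslabs of 8192 MB */
	printf("ms_shift=%d -> %llu metaslabs of %llu MB\n", ms_shift,
	    (unsigned long long)(asize >> ms_shift),
	    (unsigned long long)(1ULL << (ms_shift - 20)));
	return (0);
}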
 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
     ASSERT(vd == vd->vdev_top);
+    ASSERT(!vd->vdev_ishole);
     ASSERT(ISP2(flags));

     if (flags & VDD_METASLAB)
         vdev_dtl_reassess(vd->vdev_child[c], txg,
             scrub_txg, scrub_done);

-    if (vd == spa->spa_root_vdev)
+    if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)

     if (vd->vdev_ops->vdev_op_leaf) {
         mutex_enter(&vd->vdev_dtl_lock);
         if (scrub_txg != 0 &&
             (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
-            /* XXX should check scrub_done? */
              * We completed a scrub up to scrub_txg. If we
              * did it without rebooting, then the scrub dtl
         for (int c = 0; c < vd->vdev_children; c++) {
             vdev_t *cvd = vd->vdev_child[c];
             mutex_enter(&cvd->vdev_dtl_lock);
-            space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1);
+            space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
             mutex_exit(&cvd->vdev_dtl_lock);
         space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
      * If this is a top-level vdev, initialize its metaslabs.
-    if (vd == vd->vdev_top &&
+    if (vd == vd->vdev_top && !vd->vdev_ishole &&
         (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
         vdev_metaslab_init(vd, 0) != 0))
         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+vdev_remove(vdev_t *vd, uint64_t txg)
+    spa_t *spa = vd->vdev_spa;
+    objset_t *mos = spa->spa_meta_objset;

+    tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

+    if (vd->vdev_dtl_smo.smo_object) {
+        ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
+        (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
+        vd->vdev_dtl_smo.smo_object = 0;

+    if (vd->vdev_ms != NULL) {
+        for (int m = 0; m < vd->vdev_ms_count; m++) {
+            metaslab_t *msp = vd->vdev_ms[m];

+            if (msp == NULL || msp->ms_smo.smo_object == 0)
+            ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
+            (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
+            msp->ms_smo.smo_object = 0;

+    if (vd->vdev_ms_array) {
+        (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
+        vd->vdev_ms_array = 0;
+        vd->vdev_ms_shift = 0;
 vdev_sync_done(vdev_t *vd, uint64_t txg)
     metaslab_t *msp;
+    boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));

+    ASSERT(!vd->vdev_ishole);

     while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
         metaslab_sync_done(msp, txg);

+    metaslab_sync_reassess(vd->vdev_mg);
 * not be opened, and no I/O is attempted.
-vdev_fault(spa_t *spa, uint64_t guid)
+vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)

-    spa_vdev_state_enter(spa);
+    spa_vdev_state_enter(spa, SCL_NONE);

     if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
         return (spa_vdev_state_exit(spa, NULL, ENODEV));
         return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

+     * We don't directly use the aux state here, but if we do a
+     * vdev_reopen(), we need this value to be present to remember why we
+    vd->vdev_label_aux = aux;

      * Faulted state takes precedence over degraded.
     vd->vdev_faulted = 1ULL;
     vd->vdev_degraded = 0ULL;
-    vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED);
+    vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);

      * If marking the vdev as faulted causes the top-level vdev to become
      * unavailable, then back off and simply mark the vdev as degraded
-    if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
+    if (vdev_is_dead(vd->vdev_top) && !vd->vdev_islog &&
+        vd->vdev_aux == NULL) {
         vd->vdev_degraded = 1ULL;
         vd->vdev_faulted = 0ULL;
 * as I/O is concerned.
-vdev_degrade(spa_t *spa, uint64_t guid)
+vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)

-    spa_vdev_state_enter(spa);
+    spa_vdev_state_enter(spa, SCL_NONE);

     if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
         return (spa_vdev_state_exit(spa, NULL, ENODEV));
 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
+    vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;

-    spa_vdev_state_enter(spa);
+    spa_vdev_state_enter(spa, SCL_NONE);

     if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
         return (spa_vdev_state_exit(spa, NULL, ENODEV));

     if (!vd->vdev_ops->vdev_op_leaf)
         return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

     vd->vdev_offline = B_FALSE;
     vd->vdev_tmpoffline = B_FALSE;
     vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
     vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);

-    vdev_reopen(vd->vdev_top);
+    /* XXX - L2ARC 1.0 does not support expansion */
+    if (!vd->vdev_aux) {
+        for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+            pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);

     vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;

+    if (!vd->vdev_aux) {
+        for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+            pvd->vdev_expanding = B_FALSE;

     *newstate = vd->vdev_state;
     if ((flags & ZFS_ONLINE_UNSPARE) &&
         vd->vdev_parent->vdev_child[0] == vd)
         vd->vdev_unspare = B_TRUE;

+    if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
+        /* XXX - L2ARC 1.0 does not support expansion */
+            return (spa_vdev_state_exit(spa, vd, ENOTSUP));
+        spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

     return (spa_vdev_state_exit(spa, vd, 0));
-vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
     vdev_t *vd, *tvd;
+    uint64_t generation;
+    metaslab_group_t *mg;

-    spa_vdev_state_enter(spa);
+    spa_vdev_state_enter(spa, SCL_ALLOC);

     if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
         return (spa_vdev_state_exit(spa, NULL, ENODEV));
         return (spa_vdev_state_exit(spa, NULL, EBUSY));

+     * If the top-level is a slog and it has had allocations
+     * then proceed. We check that the vdev's metaslab group
+     * is not NULL since it's possible that we may have just
+     * added this vdev but not yet initialized its metaslabs.
+    if (tvd->vdev_islog && mg != NULL) {
+         * Prevent any future allocations.
+        metaslab_group_passivate(mg);
+        (void) spa_vdev_state_exit(spa, vd, 0);

+        error = spa_offline_log(spa);

+        spa_vdev_state_enter(spa, SCL_ALLOC);

+         * Check to see if the config has changed.
+        if (error || generation != spa->spa_config_generation) {
+            metaslab_group_activate(mg);
+            return (spa_vdev_state_exit(spa,
+        (void) spa_vdev_state_exit(spa, vd, 0);

+    ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0);

      * Offline this device and reopen its top-level vdev.
      * If the top-level vdev is a log device then just offline
      * it. Otherwise, if this action results in the top-level
      * vdev becoming unusable, undo it and fail the request.
     vd->vdev_offline = B_TRUE;
+    /* Explicitly call vdev_close before vdev_reopen because
+     * otherwise the reopen flag forbids vdev_close */
     vdev_reopen(tvd);

     if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
         vdev_reopen(tvd);
         return (spa_vdev_state_exit(spa, NULL, EBUSY));

+     * Add the device back into the metaslab rotor so that
+     * once we online the device it's open for business.
+    if (tvd->vdev_islog && mg != NULL)
+        metaslab_group_activate(mg);

     vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);

-    if (!tvd->vdev_islog || !vdev_is_dead(tvd))
-        return (spa_vdev_state_exit(spa, vd, 0));

-    (void) spa_vdev_state_exit(spa, vd, 0);

-    error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
-        NULL, DS_FIND_CHILDREN);
-    (void) vdev_online(spa, guid, 0, NULL);

-     * If we successfully offlined the log device then we need to
-     * sync out the current txg so that the "stubby" block can be
-     * removed by zil_sync().
-    txg_wait_synced(spa->spa_dsl_pool, 0);

+    return (spa_vdev_state_exit(spa, vd, 0));

+vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+    mutex_enter(&spa->spa_vdev_top_lock);
+    error = vdev_offline_locked(spa, guid, flags);
+    mutex_exit(&spa->spa_vdev_top_lock);
     if (vd->vdev_faulted || vd->vdev_degraded ||
         !vdev_readable(vd) || !vdev_writeable(vd)) {
+         * When reopening in response to a clear event, it may be due to
+         * a fmadm repair request. In this case, if the device is
+         * still broken, we want to still post the ereport again.
+        vd->vdev_forcefault = B_TRUE;

         vd->vdev_faulted = vd->vdev_degraded = 0;
         vd->vdev_cant_read = B_FALSE;
         vd->vdev_cant_write = B_FALSE;

         vdev_reopen(vd);

+        vd->vdev_forcefault = B_FALSE;

         vdev_state_dirty(vd->vdev_top);

     spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);

+     * When clearing a FMA-diagnosed fault, we always want to
+     * unspare the device, as we assume that the original spare was
+     * done in response to the FMA fault.
+    if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
+        vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+        vd->vdev_parent->vdev_child[0] == vd)
+        vd->vdev_unspare = B_TRUE;
 vdev_is_dead(vdev_t *vd)
-    return (vd->vdev_state < VDEV_STATE_DEGRADED);
+     * Holes and missing devices are always considered "dead".
+     * This simplifies the code since we don't have to check for
+     * these types of devices in the various code paths.
+     * Instead we rely on the fact that we skip over dead devices
+     * before issuing I/O to them.
+    return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
+        vd->vdev_ops == &vdev_missing_ops);
     vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
     vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
     vs->vs_state = vd->vdev_state;
-    vs->vs_rsize = vdev_get_rsize(vd);
+    vs->vs_rsize = vdev_get_min_asize(vd);
+    if (vd->vdev_ops->vdev_op_leaf)
+        vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
     mutex_exit(&vd->vdev_stat_lock);
     if (type == ZIO_TYPE_WRITE && txg != 0 &&
         (!(flags & ZIO_FLAG_IO_REPAIR) ||
-        (flags & ZIO_FLAG_SCRUB_THREAD))) {
+        (flags & ZIO_FLAG_SCRUB_THREAD) ||
+        spa->spa_claiming)) {
-         * This is either a normal write (not a repair), or it's a
-         * repair induced by the scrub thread. In the normal case,
-         * we commit the DTL change in the same txg as the block
-         * was born. In the scrub-induced repair case, we know that
-         * scrubs run in first-pass syncing context, so we commit
-         * the DTL change in spa->spa_syncing_txg.
+         * This is either a normal write (not a repair), or it's
+         * a repair induced by the scrub thread, or it's a repair
+         * made by zil_claim() during spa_load() in the first txg.
+         * In the normal case, we commit the DTL change in the same
+         * txg as the block was born. In the scrub-induced repair
+         * case, we know that scrubs run in first-pass syncing context,
+         * so we commit the DTL change in spa_syncing_txg(spa).
+         * In the zil_claim() case, we commit in spa_first_txg(spa).
          * We currently do not make DTL entries for failed spontaneous
          * self-healing writes triggered by normal (non-scrubbing)
             ASSERT(flags & ZIO_FLAG_IO_REPAIR);
             ASSERT(spa_sync_pass(spa) == 1);
             vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
-            commit_txg = spa->spa_syncing_txg;
+            commit_txg = spa_syncing_txg(spa);
+        } else if (spa->spa_claiming) {
+            ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+            commit_txg = spa_first_txg(spa);
-        ASSERT(commit_txg >= spa->spa_syncing_txg);
+        ASSERT(commit_txg >= spa_syncing_txg(spa));
         if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
         for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
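The three commit targets described in the new comment can be condensed into one decision, shown here as a hypothetical helper; the syncing_txg and first_txg parameters stand for the spa_syncing_txg()/spa_first_txg() accessors used above, and no such function exists in vdev.c:

#include <stdint.h>

/* Which txg a DTL change is committed in, per the rules above. */
static uint64_t
dtl_commit_txg(int scrub_repair, int claim_repair,
    uint64_t birth_txg, uint64_t syncing_txg, uint64_t first_txg)
{
	if (scrub_repair)
		return (syncing_txg);	/* scrubs run in first-pass syncing context */
	if (claim_repair)
		return (first_txg);	/* zil_claim() repairs happen during spa_load() */
	return (birth_txg);		/* a normal write commits in its birth txg */
}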
 vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
     vdev_stat_t *vs = &vd->vdev_stat;

-    for (c = 0; c < vd->vdev_children; c++)
+    for (int c = 0; c < vd->vdev_children; c++)
         vdev_scrub_stat_update(vd->vdev_child[c], type, complete);

     mutex_enter(&vd->vdev_stat_lock);
- * Update the in-core space usage stats for this vdev and the root vdev.
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev.

-vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
-    boolean_t update_root)
+vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
+    int64_t space_delta)
     int64_t dspace_delta = space_delta;
     spa_t *spa = vd->vdev_spa;
     vdev_t *rvd = spa->spa_root_vdev;
+    metaslab_group_t *mg = vd->vdev_mg;
+    metaslab_class_t *mc = mg ? mg->mg_class : NULL;

     ASSERT(vd == vd->vdev_top);

         vd->vdev_deflate_ratio;

     mutex_enter(&vd->vdev_stat_lock);
+    vd->vdev_stat.vs_alloc += alloc_delta;
     vd->vdev_stat.vs_space += space_delta;
-    vd->vdev_stat.vs_alloc += alloc_delta;
     vd->vdev_stat.vs_dspace += dspace_delta;
     mutex_exit(&vd->vdev_stat_lock);

-    ASSERT(rvd == vd->vdev_parent);
-    ASSERT(vd->vdev_ms_count != 0);

-     * Don't count non-normal (e.g. intent log) space as part of
-     * the pool's capacity.
-    if (vd->vdev_mg->mg_class != spa->spa_normal_class)
+    if (mc == spa_normal_class(spa)) {
         mutex_enter(&rvd->vdev_stat_lock);
+        rvd->vdev_stat.vs_alloc += alloc_delta;
         rvd->vdev_stat.vs_space += space_delta;
-        rvd->vdev_stat.vs_alloc += alloc_delta;
         rvd->vdev_stat.vs_dspace += dspace_delta;
         mutex_exit(&rvd->vdev_stat_lock);

+    ASSERT(rvd == vd->vdev_parent);
+    ASSERT(vd->vdev_ms_count != 0);

+    metaslab_class_space_update(mc,
+        alloc_delta, defer_delta, space_delta, dspace_delta);
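Plugging in the deflate ratio from the earlier raidz1 sketch shows what the dspace bookkeeping does. These are illustrative numbers, not taken from the source:

/*
 * With vdev_deflate_ratio = 409 (the 5-wide raidz1 example), allocating one
 * 128K block consumes space_delta = 160K of raw raidz space (320 sectors).
 * The deflated charge then comes out to roughly
 *
 *	dspace_delta = (160K >> 9) * 409 = 320 * 409 = 130880 bytes ~ 128K
 *
 * so vs_dspace tracks usable (deflated) capacity, not raw capacity.
 */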
     vdev_t *rvd = spa->spa_root_vdev;
     int degraded = 0, faulted = 0;
     int corrupted = 0;

     if (vd->vdev_children > 0) {
-        for (c = 0; c < vd->vdev_children; c++) {
+        for (int c = 0; c < vd->vdev_children; c++) {
             child = vd->vdev_child[c];

+             * Don't factor holes into the decision.
+            if (child->vdev_ishole)

             if (!vdev_readable(child) ||
                 (!vdev_writeable(child) && spa_writeable(spa))) {

     vdev_propagate_state(vd->vdev_parent);
+static char old_name[MAXNAMELEN];
+static time_t old_time;
+static vdev_state_t old_state;

+/* This zpool_state_to_name() is a copy of the one in libzfs, taken from
+ * userland; since zfs-fuse itself runs in userland, sharing it here poses
+ * none of the usual kernel/userland communication problems. */
+zpool_state_to_name(vdev_state_t state, vdev_aux_t aux)
+    case VDEV_STATE_CLOSED:
+    case VDEV_STATE_OFFLINE:
+        return (gettext("OFFLINE"));
+    case VDEV_STATE_REMOVED:
+        return (gettext("REMOVED"));
+    case VDEV_STATE_CANT_OPEN:
+        if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
+            return (gettext("FAULTED"));
+        else if (aux == VDEV_AUX_SPLIT_POOL)
+            return (gettext("SPLIT"));
+            return (gettext("UNAVAIL"));
+    case VDEV_STATE_FAULTED:
+        return (gettext("FAULTED"));
+    case VDEV_STATE_DEGRADED:
+        return (gettext("DEGRADED"));
+    case VDEV_STATE_HEALTHY:
+        return (gettext("ONLINE"));

+    return (gettext("UNKNOWN"));

+static int vdev_check_children(vdev_t *vd) {
+    /* First check that it's not just because of an offline vdev:
+     * walk the children looking for one whose state is neither
+     * ONLINE nor OFFLINE, recursing into interior vdevs. */
+    for (n = 0; n < vd->vdev_children; n++) {
+        if (vd->vdev_child[n]->vdev_children) {
+            found = vdev_check_children(vd->vdev_child[n]);
+        vdev_state_t st = vd->vdev_child[n]->vdev_state;
+        if (st != VDEV_STATE_HEALTHY && st != VDEV_STATE_OFFLINE) {
 * Set a vdev's state. If this is during an open, we don't update the parent
 * state, because we're in the process of opening children depth-first.

     if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
         vd->vdev_ops->vdev_op_close(vd);

+     * If we have brought this vdev back into service, we need
+     * to notify fmd so that it can gracefully repair any outstanding
+     * cases due to a missing device. We do this in all cases, even those
+     * that probably don't correlate to a repaired fault. This is sure to
+     * catch all cases, and we let the zfs-retire agent sort it out. If
+     * this is a transient state it's OK, as the retire agent will
+     * double-check the state of the vdev before repairing it.
+    if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
+        vd->vdev_prevstate != state)
+        zfs_post_state_change(spa, vd);
     if (vd->vdev_removed &&
         state == VDEV_STATE_CANT_OPEN &&
         (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
         vd->vdev_state = VDEV_STATE_REMOVED;
         vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
     } else if (state == VDEV_STATE_REMOVED) {
-         * Indicate to the ZFS DE that this device has been removed, and
-         * any recent errors should be ignored.
-        zfs_post_remove(spa, vd);
         vd->vdev_removed = B_TRUE;
     } else if (state == VDEV_STATE_CANT_OPEN) {
     if (!isopen && vd->vdev_parent)
         vdev_propagate_state(vd->vdev_parent);

+    if ((state == VDEV_STATE_HEALTHY && save_state == VDEV_STATE_DEGRADED)
+        || (state == VDEV_STATE_DEGRADED &&
+        save_state == VDEV_STATE_HEALTHY)) {
+        top = vd->vdev_top->vdev_spa;

+        time_t mytime = time(NULL);
+        if (mytime - old_time < 30 && !strcmp(top->spa_name, old_name) &&
+            old_state == state) {
+            /* Already got the same alert for this pool less than 30s ago */

+        if (state == VDEV_STATE_DEGRADED) {
+            int found = vdev_check_children(vd);
+                return; /* nothing of interest here */

+        if (strcasecmp(top->spa_name, "$import")) {
+            snprintf(cmd, 2048, "/etc/zfs/zfs_pool_alert %s &", top->spa_name);
+            syslog(LOG_WARNING, "running zfs_pool_alert for pool %s, "
+                "status %s prev status %s", top->spa_name,
+                zpool_state_to_name(state, save_state),
+                zpool_state_to_name(save_state, state));
+            int ret = system(cmd);
+                syslog(LOG_WARNING, "fork failed for zfs_pool_alert");
+            /* We won't get the return code of the actual command since
+             * it's executed in the background. So if the fork worked,
+             * then it's the job of zfs_pool_alert to track
+             * error conditions. */

+        strcpy(old_name, top->spa_name);
         return (B_FALSE);

-    for (c = 0; c < vd->vdev_children; c++) {
+    for (int c = 0; c < vd->vdev_children; c++) {
         if (!vdev_is_bootable(vd->vdev_child[c]))
             return (B_FALSE);

     return (B_TRUE);
+ * Load the state from the original vdev tree (ovd) which
+ * we've retrieved from the MOS config object. If the original
+ * vdev was offline then we transfer that state to the device
+ * in the current vdev tree (nvd).

-vdev_load_log_state(vdev_t *vd, nvlist_t *nv)
+vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)

-    spa_t *spa = vd->vdev_spa;

-    if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
-        &child, &children) == 0) {
-        for (c = 0; c < children; c++)
-            vdev_load_log_state(vd->vdev_child[c], child[c]);

-    if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv,
-        ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) {

+    spa_t *spa = nvd->vdev_spa;

+    ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+    ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);

+    for (int c = 0; c < nvd->vdev_children; c++)
+        vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);

+    if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) {

          * It would be nice to call vdev_offline()
          * directly but the pool isn't fully loaded and
          * the txg threads have not been started yet.

-        spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER);
-        vd->vdev_offline = val;
-        vdev_reopen(vd->vdev_top);
-        spa_config_exit(spa, SCL_STATE_ALL, FTAG);

+        nvd->vdev_offline = ovd->vdev_offline;
+        vdev_reopen(nvd->vdev_top);
+ * Expand a vdev if possible.
+vdev_expand(vdev_t *vd, uint64_t txg)
+    ASSERT(vd->vdev_top == vd);
+    ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

+    if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
+        VERIFY(vdev_metaslab_init(vd, txg) == 0);
+        vdev_config_dirty(vd);
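The expansion test is just a metaslab-count comparison. With the invented numbers from the earlier sketches (illustrative only):

/*
 * Example: a vdev with 1 GB metaslabs (ms_shift = 30) that grew from
 * 100 GB to 150 GB has
 *
 *	vdev_asize >> ms_shift = 150  >  vdev_ms_count = 100
 *
 * so vdev_metaslab_init() runs again and creates the 50 new metaslabs;
 * vdev_config_dirty() then records the new count in the config.
 */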
+vdev_split(vdev_t *vd)
+    vdev_t *cvd, *pvd = vd->vdev_parent;

+    vdev_remove_child(pvd, vd);
+    vdev_compact_children(pvd);

+    cvd = pvd->vdev_child[0];
+    if (pvd->vdev_children == 1) {
+        vdev_remove_parent(cvd);
+        cvd->vdev_splitting = B_TRUE;
+    vdev_propagate_state(cvd);