533
(chunksize || level!= UnSet || layout_str || raid_disks)) {
534
fprintf(stderr, Name ": cannot change component size at the same time "
535
"as other changes.\n"
536
" Change size first, then check data is intact before "
537
"making other changes.\n");
541
if (raid_disks && raid_disks < array.raid_disks && array.level > 1 &&
542
get_linux_version() < 2006032 &&
543
!check_env("MDADM_FORCE_FEWER")) {
544
fprintf(stderr, Name ": reducing the number of devices is not safe before Linux 2.6.32\n"
545
" Please use a newer kernel\n");
548
sra = sysfs_read(fd, 0, GET_LEVEL);
550
frozen = freeze_array(sra);
552
fprintf(stderr, Name ": failed to read sysfs parameters for %s\n",
557
fprintf(stderr, Name ": %s is performing resync/recovery and cannot"
558
" be reshaped\n", devname);
562
/* ========= set size =============== */
563
if (size >= 0 && (size == 0 || size != array.size)) {
565
if (array.size != size) {
566
/* got truncated to 32bit, write to
567
* component_size instead
570
rv = sysfs_set_num(sra, NULL,
571
"component_size", size);
575
rv = ioctl(fd, SET_ARRAY_INFO, &array);
578
fprintf(stderr, Name ": Cannot set device size for %s: %s\n",
579
devname, strerror(err));
581
(array.state & (1<<MD_SB_BITMAP_PRESENT)))
582
fprintf(stderr, " Bitmap must be removed before size can be changed\n");
586
ioctl(fd, GET_ARRAY_INFO, &array);
587
size = get_component_size(fd)/2;
591
fprintf(stderr, Name ": component size of %s has been set to %lluK\n",
595
size = get_component_size(fd)/2;
600
/* ======= set level =========== */
601
if (level != UnSet && level != array.level) {
602
/* Trying to change the level.
603
* We might need to change layout first and schedule a
604
* level change for later.
605
* Level changes that can happen immediately are:
606
* 0->4,5,6 1->5 4->5,6 5->1,6
607
* Level changes that need a layout change first are:
608
* 6->5,4,0 : need a -6 layout, or parity-last
609
* 5->4,0 : need parity-last
611
if ((array.level == 6 || array.level == 5) &&
612
(level == 5 || level == 4 || level == 0)) {
613
/* Don't change level yet, but choose intermediate
617
if (layout_str == NULL)
618
switch (array.layout) {
619
case ALGORITHM_LEFT_ASYMMETRIC:
620
case ALGORITHM_LEFT_ASYMMETRIC_6:
621
case ALGORITHM_ROTATING_N_RESTART:
622
layout_str = "left-asymmetric-6";
624
case ALGORITHM_LEFT_SYMMETRIC:
625
case ALGORITHM_LEFT_SYMMETRIC_6:
626
case ALGORITHM_ROTATING_N_CONTINUE:
627
layout_str = "left-symmetric-6";
629
case ALGORITHM_RIGHT_ASYMMETRIC:
630
case ALGORITHM_RIGHT_ASYMMETRIC_6:
631
case ALGORITHM_ROTATING_ZERO_RESTART:
632
layout_str = "right-asymmetric-6";
634
case ALGORITHM_RIGHT_SYMMETRIC:
635
case ALGORITHM_RIGHT_SYMMETRIC_6:
636
layout_str = "right-symmetric-6";
638
case ALGORITHM_PARITY_0:
639
case ALGORITHM_PARITY_0_6:
640
layout_str = "parity-first-6";
642
case ALGORITHM_PARITY_N:
643
layout_str = "parity-last";
646
fprintf(stderr, Name ": %s: cannot"
647
"convert layout to RAID5 equivalent\n",
653
int l = map_name(r5layout, layout_str);
655
fprintf(stderr, Name ": %s: layout '%s' not recognised\n",
656
devname, layout_str);
660
if (l != ALGORITHM_PARITY_N) {
661
/* need the -6 version */
662
char *ls = map_num(r5layout, l);
663
strcat(strcpy(alt_layout, ls),
665
layout_str = alt_layout;
669
/* The final raid6->raid5 conversion
670
* will reduce the number of disks,
671
* so now we need to aim higher
675
layout_str = "parity-last";
677
c = map_num(pers, level);
679
rv = 1;/* not possible */
682
err = sysfs_set_str(sra, NULL, "level", c);
685
fprintf(stderr, Name ": %s: could not set level to %s\n",
688
(array.state & (1<<MD_SB_BITMAP_PRESENT)))
689
fprintf(stderr, " Bitmap must be removed before level can be changed\n");
694
orig_level = orig.level;
695
ioctl(fd, GET_ARRAY_INFO, &array);
696
if (layout_str == NULL &&
697
orig.level == 5 && level == 6 &&
698
array.layout != orig.layout)
699
layout_str = map_num(r5layout, orig.layout);
701
fprintf(stderr, Name " level of %s changed to %s\n",
707
/* ========= set shape (chunk_size / layout / ndisks) ============== */
708
/* Check if layout change is a no-op */
709
if (layout_str) switch(array.level) {
711
if (array.layout == map_name(r5layout, layout_str))
715
if (layout_str == NULL &&
716
((chunksize && chunksize * 1024 != array.chunk_size) ||
717
(raid_disks && raid_disks != array.raid_disks)) &&
718
array.layout >= 16) {
720
": %s has a non-standard layout. If you wish to preserve this\n"
721
" during the reshape, please specify --layout=preserve\n"
722
" If you want to change it, specify a layout or use --layout=normalise\n",
727
if (strcmp(layout_str, "normalise") == 0 ||
728
strcmp(layout_str, "normalize") == 0) {
730
strcpy(alt_layout, map_num(r6layout, array.layout));
731
hyphen = strrchr(alt_layout, '-');
732
if (hyphen && strcmp(hyphen, "-6") == 0) {
734
layout_str = alt_layout;
738
if (array.layout == map_name(r6layout, layout_str))
740
if (layout_str && strcmp(layout_str, "preserve") == 0)
744
if (layout_str == NULL
745
&& (chunksize == 0 || chunksize*1024 == array.chunk_size)
746
&& (raid_disks == 0 || raid_disks == array.raid_disks)) {
748
if (level != UnSet && level != array.level) {
749
/* Looks like this level change doesn't need
750
* a reshape after all.
752
c = map_num(pers, level);
754
rv = sysfs_set_str(sra, NULL, "level", c);
757
fprintf(stderr, Name ": %s: could not set level to %s\n",
760
(array.state & (1<<MD_SB_BITMAP_PRESENT)))
761
fprintf(stderr, " Bitmap must be removed before level can be changed\n");
765
} else if (!changed && !quiet)
766
fprintf(stderr, Name ": %s: no change requested\n",
446
771
c = map_num(pers, array.level);
447
772
if (c == NULL) c = "-unknown-";
448
773
switch(array.level) {
449
774
default: /* raid0, linear, multipath cannot be reconfigured */
450
775
fprintf(stderr, Name ": %s array %s cannot be reshaped.\n",
454
780
case LEVEL_FAULTY: /* only 'layout' change is permitted */
457
fprintf(stderr, Name ": %s: Cannot change size of a 'faulty' array\n",
461
if (level != UnSet && level != LEVEL_FAULTY) {
462
fprintf(stderr, Name ": %s: Cannot change RAID level of a 'faulty' array\n",
466
782
if (chunksize || raid_disks) {
467
783
fprintf(stderr, Name ": %s: Cannot change chunksize or disks of a 'faulty' array\n",
472
return 0; /* nothing to do.... */
788
if (layout_str == NULL)
789
break; /* nothing to do.... */
474
array.layout = layout;
791
array.layout = parse_layout_faulty(layout_str);
792
if (array.layout < 0) {
793
fprintf(stderr, Name ": %s: layout %s not understood for 'faulty' array\n",
794
devname, layout_str);
475
798
if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
476
799
fprintf(stderr, Name ": Cannot set layout for %s: %s\n",
477
800
devname, strerror(errno));
481
803
printf("layout for %s set to %d\n", devname, array.layout);
484
case 1: /* raid_disks and size can each be changed. They are independant */
486
if (level != UnSet && level != 1) {
487
fprintf(stderr, Name ": %s: Cannot change RAID level of a RAID1 array.\n",
491
if (chunksize || layout != UnSet) {
492
fprintf(stderr, Name ": %s: Cannot change chunk size of layout for a RAID1 array.\n",
497
/* Each can trigger a resync/recovery which will block the
498
* other from happening. Later we could block
499
* resync for the duration via 'sync_action'...
806
case 1: /* only raid_disks can each be changed. */
808
if (chunksize || layout_str != NULL) {
809
fprintf(stderr, Name ": %s: Cannot change chunk size or layout for a RAID1 array.\n",
501
814
if (raid_disks > 0) {
502
815
array.raid_disks = raid_disks;
503
816
if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
504
817
fprintf(stderr, Name ": Cannot set raid-devices for %s: %s\n",
505
818
devname, strerror(errno));
511
if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
512
fprintf(stderr, Name ": Cannot set device size for %s: %s\n",
513
devname, strerror(errno));
829
* layout/chunksize/raid_disks can be changed
830
* though the kernel may not support it all.
522
832
st = super_by_fd(fd);
524
/* size can be changed independently.
525
* layout/chunksize/raid_disks/level can be changed
526
* though the kernel may not support it all.
527
* If 'suspend_lo' is not present in devfs, then
528
* these cannot be changed.
531
/* Cannot change other details as well.. */
532
if (layout != UnSet ||
536
fprintf(stderr, Name ": %s: Cannot change shape as well as size of a %s array.\n",
541
if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
542
fprintf(stderr, Name ": Cannot set device size/shape for %s: %s\n",
543
devname, strerror(errno));
548
/* Ok, just change the shape. This can be awkward.
549
* There are three possibilities.
550
* 1/ The array will shrink. We don't support this
551
* possibility. Maybe one day...
552
* 2/ The array will not change size. This is easy enough
553
* to do, but not reliably. If the process is aborted
554
* the array *will* be corrupted. So maybe we can allow
555
* this but only if the user is really certain. e.g.
556
* --really-risk-everything
557
* 3/ The array will grow. This can be reliably achieved.
835
* There are three possibilities.
836
* 1/ The array will shrink.
837
* We need to ensure the reshape will pause before reaching
838
* the 'critical section'. We also need to fork and wait for
839
* that to happen. When it does we
840
* suspend/backup/complete/unfreeze
842
* 2/ The array will not change size.
843
* This requires that we keep a backup of a sliding window
844
* so that we can restore data after a crash. So we need
845
* to fork and monitor progress.
847
* 3/ The array will grow. This is relatively easy.
558
848
* However the kernel's restripe routines will cheerfully
559
849
* overwrite some early data before it is safe. So we
560
850
* need to make a backup of the early parts of the array
561
851
* and be ready to restore it if rebuild aborts very early.
563
* We backup data by writing it to all spares (there must be
564
* at least 1, so even raid6->raid5 requires a spare to be
853
* We backup data by writing it to one spare, or to a
854
* file which was given on command line.
856
* [FOLLOWING IS OLD AND PARTLY WRONG]
567
857
* So: we enumerate the devices in the array and
568
858
* make sure we can open all of them.
569
859
* Then we freeze the early part of the array and
657
1000
sd->disk.minor, 1);
658
1001
fdlist[sd->disk.raid_disk]
659
1002
= dev_open(dn, O_RDONLY);
660
offsets[sd->disk.raid_disk] = sd->data_offset;
1003
offsets[sd->disk.raid_disk] = sd->data_offset*512;
661
1004
if (fdlist[sd->disk.raid_disk] < 0) {
662
1005
fprintf(stderr, Name ": %s: cannot open component %s\n",
663
1006
devname, dn?dn:"-unknown-");
1010
} else if (backup_file == NULL) {
668
1012
char *dn = map_dev(sd->disk.major,
669
1013
sd->disk.minor, 1);
670
1014
fdlist[d] = dev_open(dn, O_RDWR);
671
offsets[d] = sd->data_offset;
1015
offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512;
672
1016
if (fdlist[d]<0) {
673
1017
fprintf(stderr, Name ": %s: cannot open component %s\n",
674
1018
devname, dn?dn:"-unknown");
680
for (i=0 ; i<array.raid_disks; i++)
682
fprintf(stderr, Name ": %s: failed to find device %d. Array might be degraded.\n"
683
" --grow aborted\n", devname, i);
686
spares = sra->array.spare_disks;
688
fdlist[d] = open(backup_file, O_RDWR|O_CREAT|O_EXCL, 0600);
1025
if (backup_file == NULL) {
1026
if (ndata <= odata) {
1027
fprintf(stderr, Name ": %s: Cannot grow - need backup-file\n",
1031
} else if (sra->array.spare_disks == 0) {
1032
fprintf(stderr, Name ": %s: Cannot grow - need a spare or "
1033
"backup-file to backup critical section\n",
1038
if (d == array.raid_disks) {
1039
fprintf(stderr, Name ": %s: No spare device for backup\n",
1045
/* need to check backup file is large enough */
1047
fdlist[d] = open(backup_file, O_RDWR|O_CREAT|O_EXCL,
1049
offsets[d] = 8 * 512;
689
1050
if (fdlist[d] < 0) {
690
1051
fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
691
1052
devname, backup_file, strerror(errno));
1056
memset(buf, 0, 512);
1057
for (i=0; i < (signed)blocks + 1 ; i++) {
1058
if (write(fdlist[d], buf, 512) != 512) {
1059
fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
1060
devname, backup_file, strerror(errno));
1065
if (fsync(fdlist[d]) != 0) {
1066
fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
1067
devname, backup_file, strerror(errno));
698
if (fdlist[array.raid_disks] < 0) {
699
fprintf(stderr, Name ": %s: failed to find a spare and no backup-file given - --grow aborted\n",
1074
/* lastly, check that the internal stripe cache is
1075
* large enough, or it won't work.
1078
cache = (nchunk < ochunk) ? ochunk : nchunk;
1079
cache = cache * 4 / 4096;
1080
if (cache < blocks / 8 / odisks + 16)
1081
/* Make it big enough to hold 'blocks' */
1082
cache = blocks / 8 / odisks + 16;
1083
if (sra->cache_size < cache)
1084
sysfs_set_num(sra, NULL, "stripe_cache_size",
1086
/* Right, everything seems fine. Let's kick things off.
1087
* If only changing raid_disks, use ioctl, else use
1090
if (ochunk == nchunk && olayout == nlayout) {
1091
array.raid_disks = ndisks;
1092
if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
1095
fprintf(stderr, Name ": Cannot set device shape for %s: %s\n",
1096
devname, strerror(errno));
1097
if (ndisks < odisks &&
1098
get_linux_version() < 2006030)
1099
fprintf(stderr, Name ": linux 2.6.30 or later required\n");
1101
(array.state & (1<<MD_SB_BITMAP_PRESENT)))
1102
fprintf(stderr, " Bitmap must be removed before shape can be changed\n");
1107
/* set them all just in case some old 'new_*' value
1108
* persists from some earlier problem
1110
int err = err; /* only used if rv==1, and always set if
1111
* rv==1, so initialisation not needed,
1112
* despite gcc warning
1114
if (sysfs_set_num(sra, NULL, "chunk_size", nchunk) < 0)
1115
rv = 1, err = errno;
1116
if (!rv && sysfs_set_num(sra, NULL, "layout", nlayout) < 0)
1117
rv = 1, err = errno;
1118
if (!rv && sysfs_set_num(sra, NULL, "raid_disks", ndisks) < 0)
1119
rv = 1, err = errno;
1121
fprintf(stderr, Name ": Cannot set device shape for %s\n",
1123
if (get_linux_version() < 2006030)
1124
fprintf(stderr, Name ": linux 2.6.30 or later required\n");
1126
(array.state & (1<<MD_SB_BITMAP_PRESENT)))
1127
fprintf(stderr, " Bitmap must be removed before shape can be changed\n");
1132
if (ndisks == 2 && odisks == 2) {
1133
/* No reshape is needed in this trivial case */
1138
/* set up the backup-super-block. This requires the
1139
* uuid from the array.
704
1141
/* Find a superblock */
705
if (st->ss->load_super(st, fdlist[0], NULL)) {
1142
for (sd = sra->devs; sd; sd = sd->next) {
1146
if (sd->disk.state & (1<<MD_DISK_FAULTY))
1148
dn = map_dev(sd->disk.major, sd->disk.minor, 1);
1149
devfd = dev_open(dn, O_RDONLY);
1152
ok = st->ss->load_super(st, devfd, NULL);
706
1158
fprintf(stderr, Name ": %s: Cannot find a superblock\n",
1164
memset(&bsb, 0, 512);
712
1165
memcpy(bsb.magic, "md_backup_data-1", 16);
713
1166
st->ss->uuid_from_super(st, (int*)&bsb.set_uuid);
714
1167
bsb.mtime = __cpu_to_le64(time(0));
716
bsb.length = __cpu_to_le64(last_block);
1168
bsb.devstart2 = blocks;
1169
stripes = blocks / (ochunk/512) / odata;
1170
/* Now we just need to kick off the reshape and watch, while
1171
* handling backups of the data...
1172
* This is all done by a forked background process.
1177
if (check_env("MDADM_GROW_VERIFY"))
1178
fd = open(devname, O_RDONLY | O_DIRECT);
1181
mlockall(MCL_FUTURE);
718
/* Decide offset for the backup, llseek the spares, and write
719
* a leading superblock 4K earlier.
721
for (i=array.raid_disks; i<d; i++) {
723
if (i==d-1 && backup_file) {
724
/* This is the backup file */
727
offsets[i] += sra->component_size - last_block - 8;
728
if (lseek64(fdlist[i], (offsets[i]<<9) - 4096, 0)
729
!= (offsets[i]<<9) - 4096) {
730
fprintf(stderr, Name ": could not seek...\n");
733
memset(buf, 0, sizeof(buf));
734
bsb.devstart = __cpu_to_le64(offsets[i]);
735
bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
736
memcpy(buf, &bsb, sizeof(bsb));
737
if (write(fdlist[i], buf, 4096) != 4096) {
738
fprintf(stderr, Name ": could not write leading superblock\n");
742
array.level = nlevel;
743
array.raid_disks = ndisks;
744
array.chunk_size = nchunk;
745
array.layout = nlayout;
746
if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
747
if (errno == ENOSPC) {
748
/* stripe cache is not big enough.
749
* It needs to be 4 times chunksize_size,
750
* and we assume pagesize is 4K
1184
done = child_grow(fd, sra, stripes,
1186
odisks, ochunk, array.level, olayout, odata,
1187
d - odisks, fdlist+odisks, offsets+odisks);
1188
else if (odata > ndata)
1189
done = child_shrink(fd, sra, stripes,
1191
odisks, ochunk, array.level, olayout, odata,
1192
d - odisks, fdlist+odisks, offsets+odisks);
1194
done = child_same_size(fd, sra, stripes,
1197
odisks, ochunk, array.level, olayout, odata,
1198
d - odisks, fdlist+odisks, offsets+odisks);
1199
if (backup_file && done)
1200
unlink(backup_file);
1201
if (level != UnSet && level != array.level) {
1202
/* We need to wait for the reshape to finish
1203
* (which will have happened unless odata < ndata)
1204
* and then set the level
752
if (sra->cache_size < 4 * (nchunk/4096)) {
753
sysfs_set_num(sra, NULL,
755
4 * (nchunk/4096) +1);
756
if (ioctl(fd, SET_ARRAY_INFO,
761
fprintf(stderr, Name ": Cannot set device size/shape for %s: %s\n",
762
devname, strerror(errno));
767
/* suspend the relevant region */
768
sysfs_set_num(sra, NULL, "suspend_hi", 0); /* just in case */
769
if (sysfs_set_num(sra, NULL, "suspend_lo", 0) < 0 ||
770
sysfs_set_num(sra, NULL, "suspend_hi", last_block) < 0) {
771
fprintf(stderr, Name ": %s: failed to suspend device.\n",
777
err = save_stripes(fdlist, offsets,
778
odisks, ochunk, olevel, olayout,
779
spares, fdlist+odisks,
780
0ULL, last_block*512);
782
/* abort if there was an error */
784
fprintf(stderr, Name ": %s: failed to save critical region\n",
789
for (i=odisks; i<d ; i++) {
790
bsb.devstart = __cpu_to_le64(offsets[i]);
791
bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
792
if (lseek64(fdlist[i], (offsets[i]+last_block)<<9, 0) < 0 ||
793
write(fdlist[i], &bsb, sizeof(bsb)) != sizeof(bsb) ||
794
fsync(fdlist[i]) != 0) {
795
fprintf(stderr, Name ": %s: fail to save metadata for critical region backups.\n",
801
/* start the reshape happening */
802
if (sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) {
803
fprintf(stderr, Name ": %s: failed to initiate reshape\n",
807
/* wait for reshape to pass the critical region */
809
unsigned long long comp;
810
if (sysfs_get_ll(sra, NULL, "sync_completed", &comp)<0) {
819
/* invalidate superblocks */
820
memset(&bsb, 0, sizeof(bsb));
821
for (i=odisks; i<d ; i++) {
822
lseek64(fdlist[i], (offsets[i]+last_block)<<9, 0);
823
if (write(fdlist[i], &bsb, sizeof(bsb)) < 0) {
824
fprintf(stderr, Name ": %s: failed to invalidate metadata for raid disk %d\n",
830
sysfs_set_num(sra, NULL, "suspend_lo", last_block);
840
printf(Name ": ... critical section passed.\n");
1207
c = map_num(pers, level);
1209
exit(0);/* not possible */
1213
err = sysfs_set_str(sra, NULL, "level", c);
1215
fprintf(stderr, Name ": %s: could not set level to %s\n",
1220
fprintf(stderr, Name ": Cannot run child to monitor reshape: %s\n",
1225
/* The child will take care of unfreezing the array */
847
sysfs_set_num(sra, NULL, "suspend_lo", last_block);
849
for (i=0; i<array.nr_disks; i++)
1234
if (rv && orig_level != UnSet && sra) {
1235
c = map_num(pers, orig_level);
1236
if (c && sysfs_set_str(sra, NULL, "level", c) == 0)
1237
fprintf(stderr, Name ": aborting level change\n");
1240
unfreeze_array(sra, frozen);
1245
* We run a child process in the background which performs the following
1247
* - wait for resync to reach a certain point
1248
* - suspend io to the following section
1249
* - backup that section
1250
* - allow resync to proceed further
1252
* - discard the backup.
1254
* These are combined in slightly different ways in the three cases.
1256
* - suspend/backup/allow/wait/resume/discard
1258
* - allow/wait/suspend/backup/allow/wait/resume/discard
1260
* - wait/resume/discard/suspend/backup/allow
1262
* suspend/backup/allow always come together
1263
* wait/resume/discard do too.
1264
* For the same-size case we have two backups to improve flow.
1268
/* FIXME return status is never checked */
1269
int grow_backup(struct mdinfo *sra,
1270
unsigned long long offset, /* per device */
1271
unsigned long stripes, /* per device */
1272
int *sources, unsigned long long *offsets,
1273
int disks, int chunk, int level, int layout,
1274
int dests, int *destfd, unsigned long long *destoffsets,
1275
int part, int *degraded,
1278
/* Backup 'blocks' sectors at 'offset' on each device of the array,
1279
* to storage 'destfd' (offset 'destoffsets'), after first
1280
* suspending IO. Then allow resync to continue
1281
* over the suspended section.
1282
* Use part 'part' of the backup-super-block.
1287
unsigned long long ll;
1289
//printf("offset %llu\n", offset);
1294
sysfs_set_num(sra, NULL, "suspend_hi", (offset + stripes * (chunk/512)) * odata);
1295
/* Check that array hasn't become degraded, else we might backup the wrong data */
1296
sysfs_get_ll(sra, NULL, "degraded", &ll);
1297
new_degraded = (int)ll;
1298
if (new_degraded != *degraded) {
1299
/* check each device to ensure it is still working */
1301
for (sd = sra->devs ; sd ; sd = sd->next) {
1302
if (sd->disk.state & (1<<MD_DISK_FAULTY))
1304
if (sd->disk.state & (1<<MD_DISK_SYNC)) {
1306
if (sysfs_get_str(sra, sd, "state", sbuf, 20) < 0 ||
1307
strstr(sbuf, "faulty") ||
1308
strstr(sbuf, "in_sync") == NULL) {
1309
/* this device is dead */
1310
sd->disk.state = (1<<MD_DISK_FAULTY);
1311
if (sd->disk.raid_disk >= 0 &&
1312
sources[sd->disk.raid_disk] >= 0) {
1313
close(sources[sd->disk.raid_disk]);
1314
sources[sd->disk.raid_disk] = -1;
1319
*degraded = new_degraded;
1322
bsb.arraystart2 = __cpu_to_le64(offset * odata);
1323
bsb.length2 = __cpu_to_le64(stripes * (chunk/512) * odata);
1325
bsb.arraystart = __cpu_to_le64(offset * odata);
1326
bsb.length = __cpu_to_le64(stripes * (chunk/512) * odata);
1329
bsb.magic[15] = '2';
1330
for (i = 0; i < dests; i++)
1332
lseek64(destfd[i], destoffsets[i] + __le64_to_cpu(bsb.devstart2)*512, 0);
1334
lseek64(destfd[i], destoffsets[i], 0);
1336
rv = save_stripes(sources, offsets,
1337
disks, chunk, level, layout,
1339
offset*512*odata, stripes * chunk * odata,
1344
bsb.mtime = __cpu_to_le64(time(0));
1345
for (i = 0; i < dests; i++) {
1346
bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
1348
bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
1349
if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
1350
bsb.sb_csum2 = bsb_csum((char*)&bsb,
1351
((char*)&bsb.sb_csum2)-((char*)&bsb));
1354
if ((unsigned long long)lseek64(destfd[i], destoffsets[i] - 4096, 0)
1355
!= destoffsets[i] - 4096)
1357
if (write(destfd[i], &bsb, 512) != 512)
1359
if (destoffsets[i] > 4096) {
1360
if ((unsigned long long)lseek64(destfd[i], destoffsets[i]+stripes*chunk*odata, 0) !=
1361
destoffsets[i]+stripes*chunk*odata)
1363
if (write(destfd[i], &bsb, 512) != 512)
1373
/* in 2.6.30, the value reported by sync_completed can be
1374
* less than it should be by one stripe.
1375
* This only happens when reshape hits sync_max and pauses.
1376
* So allow wait_backup to either extend sync_max further
1377
* than strictly necessary, or return before the
1378
* sync has got quite as far as we would really like.
1379
* This is what 'blocks2' is for.
1380
* The various callers give appropriate values so that
1383
/* FIXME return value is often ignored */
1384
int wait_backup(struct mdinfo *sra,
1385
unsigned long long offset, /* per device */
1386
unsigned long long blocks, /* per device */
1387
unsigned long long blocks2, /* per device - hack */
1388
int dests, int *destfd, unsigned long long *destoffsets,
1391
/* Wait for resync to pass the section that was backed up
1392
* then erase the backup and allow IO
1394
int fd = sysfs_get_fd(sra, NULL, "sync_completed");
1395
unsigned long long completed;
1401
sysfs_set_num(sra, NULL, "sync_max", offset + blocks + blocks2);
1403
sysfs_set_str(sra, NULL, "sync_action", "reshape");
1409
select(fd+1, NULL, NULL, &rfds, NULL);
1410
if (sysfs_fd_get_ll(fd, &completed) < 0) {
1414
if (sysfs_get_str(sra, NULL, "sync_action",
1416
strncmp(action, "reshape", 7) != 0)
1418
} while (completed < offset + blocks);
1422
bsb.arraystart2 = __cpu_to_le64(0);
1423
bsb.length2 = __cpu_to_le64(0);
1425
bsb.arraystart = __cpu_to_le64(0);
1426
bsb.length = __cpu_to_le64(0);
1428
bsb.mtime = __cpu_to_le64(time(0));
1430
for (i = 0; i < dests; i++) {
1431
bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
1432
bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
1433
if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
1434
bsb.sb_csum2 = bsb_csum((char*)&bsb,
1435
((char*)&bsb.sb_csum2)-((char*)&bsb));
1436
if ((unsigned long long)lseek64(destfd[i], destoffsets[i]-4096, 0) !=
1437
destoffsets[i]-4096)
1440
write(destfd[i], &bsb, 512) != 512)
1447
static void fail(char *msg)
1450
rv = (write(2, msg, strlen(msg)) != (int)strlen(msg));
1451
rv |= (write(2, "\n", 1) != 1);
1455
static char *abuf, *bbuf;
1456
static unsigned long long abuflen;
1457
static void validate(int afd, int bfd, unsigned long long offset)
1459
/* check that the data in the backup against the array.
1460
* This is only used for regression testing and should not
1461
* be used while the array is active
1465
lseek64(bfd, offset - 4096, 0);
1466
if (read(bfd, &bsb2, 512) != 512)
1467
fail("cannot read bsb");
1468
if (bsb2.sb_csum != bsb_csum((char*)&bsb2,
1469
((char*)&bsb2.sb_csum)-((char*)&bsb2)))
1470
fail("first csum bad");
1471
if (memcmp(bsb2.magic, "md_backup_data", 14) != 0)
1472
fail("magic is bad");
1473
if (memcmp(bsb2.magic, "md_backup_data-2", 16) == 0 &&
1474
bsb2.sb_csum2 != bsb_csum((char*)&bsb2,
1475
((char*)&bsb2.sb_csum2)-((char*)&bsb2)))
1476
fail("second csum bad");
1478
if (__le64_to_cpu(bsb2.devstart)*512 != offset)
1479
fail("devstart is wrong");
1482
unsigned long long len = __le64_to_cpu(bsb2.length)*512;
1484
if (abuflen < len) {
1488
if (posix_memalign((void**)&abuf, 4096, abuflen) ||
1489
posix_memalign((void**)&bbuf, 4096, abuflen)) {
1491
/* just stop validating on mem-alloc failure */
1496
lseek64(bfd, offset, 0);
1497
if ((unsigned long long)read(bfd, bbuf, len) != len) {
1498
//printf("len %llu\n", len);
1499
fail("read first backup failed");
1501
lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0);
1502
if ((unsigned long long)read(afd, abuf, len) != len)
1503
fail("read first from array failed");
1504
if (memcmp(bbuf, abuf, len) != 0) {
1507
printf("offset=%llu len=%llu\n",
1508
(unsigned long long)__le64_to_cpu(bsb2.arraystart)*512, len);
1509
for (i=0; i<len; i++)
1510
if (bbuf[i] != abuf[i]) {
1511
printf("first diff byte %d\n", i);
1515
fail("data1 compare failed");
1519
unsigned long long len = __le64_to_cpu(bsb2.length2)*512;
1521
if (abuflen < len) {
1525
abuf = malloc(abuflen);
1526
bbuf = malloc(abuflen);
1529
lseek64(bfd, offset+__le64_to_cpu(bsb2.devstart2)*512, 0);
1530
if ((unsigned long long)read(bfd, bbuf, len) != len)
1531
fail("read second backup failed");
1532
lseek64(afd, __le64_to_cpu(bsb2.arraystart2)*512, 0);
1533
if ((unsigned long long)read(afd, abuf, len) != len)
1534
fail("read second from array failed");
1535
if (memcmp(bbuf, abuf, len) != 0)
1536
fail("data2 compare failed");
1540
static int child_grow(int afd, struct mdinfo *sra, unsigned long stripes,
1541
int *fds, unsigned long long *offsets,
1542
int disks, int chunk, int level, int layout, int data,
1543
int dests, int *destfd, unsigned long long *destoffsets)
1548
if (posix_memalign((void**)&buf, 4096, disks * chunk))
1549
/* Don't start the 'reshape' */
1551
sysfs_set_num(sra, NULL, "suspend_hi", 0);
1552
sysfs_set_num(sra, NULL, "suspend_lo", 0);
1553
grow_backup(sra, 0, stripes,
1554
fds, offsets, disks, chunk, level, layout,
1555
dests, destfd, destoffsets,
1557
validate(afd, destfd[0], destoffsets[0]);
1558
wait_backup(sra, 0, stripes * (chunk / 512), stripes * (chunk / 512),
1559
dests, destfd, destoffsets,
1561
sysfs_set_num(sra, NULL, "suspend_lo", (stripes * (chunk/512)) * data);
1563
/* FIXME this should probably be numeric */
1564
sysfs_set_str(sra, NULL, "sync_max", "max");
1568
static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes,
1569
int *fds, unsigned long long *offsets,
1570
int disks, int chunk, int level, int layout, int data,
1571
int dests, int *destfd, unsigned long long *destoffsets)
1574
unsigned long long start;
1578
if (posix_memalign((void**)&buf, 4096, disks * chunk))
1580
start = sra->component_size - stripes * (chunk/512);
1581
sysfs_set_num(sra, NULL, "sync_max", start);
1582
sysfs_set_str(sra, NULL, "sync_action", "reshape");
1583
sysfs_set_num(sra, NULL, "suspend_lo", 0);
1584
sysfs_set_num(sra, NULL, "suspend_hi", 0);
1585
rv = wait_backup(sra, 0, start - stripes * (chunk/512), stripes * (chunk/512),
1586
dests, destfd, destoffsets, 0);
1589
grow_backup(sra, 0, stripes,
1591
disks, chunk, level, layout,
1592
dests, destfd, destoffsets,
1594
validate(afd, destfd[0], destoffsets[0]);
1595
wait_backup(sra, start, stripes*(chunk/512), 0,
1596
dests, destfd, destoffsets, 0);
1597
sysfs_set_num(sra, NULL, "suspend_lo", (stripes * (chunk/512)) * data);
1599
/* FIXME this should probably be numeric */
1600
sysfs_set_str(sra, NULL, "sync_max", "max");
1604
static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
1605
int *fds, unsigned long long *offsets,
1606
unsigned long long start,
1607
int disks, int chunk, int level, int layout, int data,
1608
int dests, int *destfd, unsigned long long *destoffsets)
1610
unsigned long long size;
1611
unsigned long tailstripes = stripes;
1614
unsigned long long speed;
1618
if (posix_memalign((void**)&buf, 4096, disks * chunk))
1621
sysfs_set_num(sra, NULL, "suspend_lo", 0);
1622
sysfs_set_num(sra, NULL, "suspend_hi", 0);
1624
sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
1625
sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
1627
grow_backup(sra, start, stripes,
1629
disks, chunk, level, layout,
1630
dests, destfd, destoffsets,
1632
grow_backup(sra, (start + stripes) * (chunk/512), stripes,
1634
disks, chunk, level, layout,
1635
dests, destfd, destoffsets,
1637
validate(afd, destfd[0], destoffsets[0]);
1639
start += stripes * 2; /* where to read next */
1640
size = sra->component_size / (chunk/512);
1641
while (start < size) {
1642
if (wait_backup(sra, (start-stripes*2)*(chunk/512),
1643
stripes*(chunk/512), 0,
1644
dests, destfd, destoffsets,
1647
sysfs_set_num(sra, NULL, "suspend_lo", start*(chunk/512) * data);
1648
if (start + stripes > size)
1649
tailstripes = (size - start);
1651
grow_backup(sra, start*(chunk/512), tailstripes,
1653
disks, chunk, level, layout,
1654
dests, destfd, destoffsets,
1655
part, °raded, buf);
1658
validate(afd, destfd[0], destoffsets[0]);
1660
if (wait_backup(sra, (start-stripes*2) * (chunk/512), stripes * (chunk/512), 0,
1661
dests, destfd, destoffsets,
1664
sysfs_set_num(sra, NULL, "suspend_lo", ((start-stripes)*(chunk/512)) * data);
1665
wait_backup(sra, (start-stripes) * (chunk/512), tailstripes * (chunk/512), 0,
1666
dests, destfd, destoffsets,
1668
sysfs_set_num(sra, NULL, "suspend_lo", (size*(chunk/512)) * data);
1669
sysfs_set_num(sra, NULL, "sync_speed_min", speed);
958
1858
info->new_level,
959
1859
info->new_layout,
960
1860
fd, __le64_to_cpu(bsb.devstart)*512,
961
0, __le64_to_cpu(bsb.length)*512)) {
962
/* didn't succeed, so giveup */
1861
__le64_to_cpu(bsb.arraystart)*512,
1862
__le64_to_cpu(bsb.length)*512)) {
1863
/* didn't succeed, so giveup */
1865
fprintf(stderr, Name ": Error restoring backup from %s\n",
1870
if (bsb.magic[15] == '2' &&
1871
restore_stripes(fdlist, offsets,
1872
info->array.raid_disks,
1876
fd, __le64_to_cpu(bsb.devstart)*512 +
1877
__le64_to_cpu(bsb.devstart2)*512,
1878
__le64_to_cpu(bsb.arraystart2)*512,
1879
__le64_to_cpu(bsb.length2)*512)) {
1880
/* didn't succeed, so giveup */
1882
fprintf(stderr, Name ": Error restoring second backup from %s\n",
966
1888
/* Ok, so the data is restored. Let's update those superblocks. */
1890
if (info->delta_disks >= 0) {
1891
info->reshape_progress = __le64_to_cpu(bsb.arraystart) +
1892
__le64_to_cpu(bsb.length);
1893
if (bsb.magic[15] == '2') {
1894
unsigned long long p2 = __le64_to_cpu(bsb.arraystart2) +
1895
__le64_to_cpu(bsb.length2);
1896
if (p2 > info->reshape_progress)
1897
info->reshape_progress = p2;
1900
info->reshape_progress = __le64_to_cpu(bsb.arraystart);
1901
if (bsb.magic[15] == '2') {
1902
unsigned long long p2 = __le64_to_cpu(bsb.arraystart2);
1903
if (p2 < info->reshape_progress)
1904
info->reshape_progress = p2;
968
1907
for (j=0; j<info->array.raid_disks; j++) {
969
1908
if (fdlist[j] < 0) continue;
970
1909
if (st->ss->load_super(st, fdlist[j], NULL))
972
1911
st->ss->getinfo_super(st, &dinfo);
973
dinfo.reshape_progress = __le64_to_cpu(bsb.length);
1912
dinfo.reshape_progress = info->reshape_progress;
974
1913
st->ss->update_super(st, &dinfo,
975
1914
"_reshape_progress",
976
1915
NULL,0, 0, NULL);
977
1916
st->ss->store_super(st, fdlist[j]);
978
1917
st->ss->free_super(st);
981
/* And we are done! */
984
1921
/* Didn't find any backup data, try to see if any
987
nstripe = ostripe = 0;
988
odata = info->array.raid_disks - info->delta_disks - 1;
989
if (info->array.level == 6) odata--; /* number of data disks */
990
ndata = info->array.raid_disks - 1;
991
if (info->new_level == 6) ndata--;
993
while (nstripe >= ostripe) {
994
nstripe += info->new_chunk / 512;
995
last_block = nstripe * ndata;
996
ostripe = last_block / odata / (info->array.chunk_size/512) *
997
(info->array.chunk_size/512);
1000
if (info->reshape_progress >= last_block)
1924
if (info->delta_disks < 0) {
1925
/* When shrinking, the critical section is at the end.
1926
* So see if we are before the critical section.
1928
unsigned long long first_block;
1929
nstripe = ostripe = 0;
1931
while (ostripe >= nstripe) {
1932
ostripe += info->array.chunk_size / 512;
1933
first_block = ostripe * odata;
1934
nstripe = first_block / ndata / (info->new_chunk/512) *
1935
(info->new_chunk/512);
1938
if (info->reshape_progress >= first_block)
1941
if (info->delta_disks > 0) {
1942
/* See if we are beyond the critical section. */
1943
unsigned long long last_block;
1944
nstripe = ostripe = 0;
1946
while (nstripe >= ostripe) {
1947
nstripe += info->new_chunk / 512;
1948
last_block = nstripe * ndata;
1949
ostripe = last_block / odata / (info->array.chunk_size/512) *
1950
(info->array.chunk_size/512);
1953
if (info->reshape_progress >= last_block)
1002
1956
/* needed to recover critical section! */
1958
fprintf(stderr, Name ": Failed to find backup of critical section\n");
1962
int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
1965
/* Array is assembled and ready to be started, but
1966
* monitoring is probably required.
1969
* - set upper bound for resync
1970
* - initialise the 'suspend' boundaries
1971
* - switch to read-write
1972
* - fork and continue monitoring
1976
unsigned long long backup_offsets[1];
1977
int odisks, ndisks, ochunk, nchunk,odata,ndata;
1978
unsigned long a,b,blocks,stripes;
1981
unsigned long long *offsets;
1983
struct mdinfo *sra, *sd;
1985
unsigned long cache;
1988
err = sysfs_set_str(info, NULL, "array_state", "readonly");
1992
/* make sure reshape doesn't progress until we are ready */
1993
sysfs_set_str(info, NULL, "sync_max", "0");
1994
sysfs_set_str(info, NULL, "array_state", "active"); /* FIXME or clean */
1996
sra = sysfs_read(-1, devname2devnum(info->sys_name),
1997
GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
2002
/* ndisks is not growing, so raid_disks is old and +delta is new */
2003
odisks = info->array.raid_disks;
2004
ndisks = odisks + info->delta_disks;
2007
if (info->array.level == 6) {
2011
ochunk = info->array.chunk_size;
2012
nchunk = info->new_chunk;
2014
a = (ochunk/512) * odata;
2015
b = (nchunk/512) * ndata;
2023
/* LCM == product / GCD */
2024
blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a;
2027
while (blocks * 32 < sra->component_size &&
2030
stripes = blocks / (info->array.chunk_size/512) / odata;
2032
/* check that the internal stripe cache is
2033
* large enough, or it won't work.
2035
cache = (nchunk < ochunk) ? ochunk : nchunk;
2036
cache = cache * 4 / 4096;
2037
if (cache < blocks / 8 / odisks + 16)
2038
/* Make it big enough to hold 'blocks' */
2039
cache = blocks / 8 / odisks + 16;
2040
if (sra->cache_size < cache)
2041
sysfs_set_num(sra, NULL, "stripe_cache_size",
2044
memset(&bsb, 0, 512);
2045
memcpy(bsb.magic, "md_backup_data-1", 16);
2046
memcpy(&bsb.set_uuid, info->uuid, 16);
2047
bsb.mtime = __cpu_to_le64(time(0));
2048
bsb.devstart2 = blocks;
2050
backup_fd = open(backup_file, O_RDWR|O_CREAT, S_IRUSR | S_IWUSR);
2051
backup_list[0] = backup_fd;
2052
backup_offsets[0] = 8 * 512;
2053
fds = malloc(odisks * sizeof(fds[0]));
2054
offsets = malloc(odisks * sizeof(offsets[0]));
2055
for (d=0; d<odisks; d++)
2058
for (sd = sra->devs; sd; sd = sd->next) {
2059
if (sd->disk.state & (1<<MD_DISK_FAULTY))
2061
if (sd->disk.state & (1<<MD_DISK_SYNC)) {
2062
char *dn = map_dev(sd->disk.major,
2064
fds[sd->disk.raid_disk]
2065
= dev_open(dn, O_RDONLY);
2066
offsets[sd->disk.raid_disk] = sd->data_offset*512;
2067
if (fds[sd->disk.raid_disk] < 0) {
2068
fprintf(stderr, Name ": %s: cannot open component %s\n",
2069
info->sys_name, dn?dn:"-unknown-");
2080
mlockall(MCL_FUTURE);
2081
if (info->delta_disks < 0)
2082
done = child_shrink(-1, info, stripes,
2084
info->array.raid_disks,
2085
info->array.chunk_size,
2086
info->array.level, info->array.layout,
2088
1, backup_list, backup_offsets);
2089
else if (info->delta_disks == 0) {
2090
/* The 'start' is a per-device stripe number.
2091
* reshape_progress is a per-array sector number.
2092
* So divide by ndata * chunk_size
2094
unsigned long long start = info->reshape_progress / ndata;
2095
start /= (info->array.chunk_size/512);
2096
done = child_same_size(-1, info, stripes,
2099
info->array.raid_disks,
2100
info->array.chunk_size,
2101
info->array.level, info->array.layout,
2103
1, backup_list, backup_offsets);
2105
if (backup_file && done)
2106
unlink(backup_file);
2107
/* FIXME should I intuit a level change */
2110
fprintf(stderr, Name ": Cannot run child to continue monitoring reshape: %s\n",