~ubuntu-branches/ubuntu/hardy/mdadm/hardy-updates

« back to all changes in this revision

Viewing changes to {arch}/++pristine-trees/unlocked/mdadm/mdadm--upstream/mdadm--upstream--1.12.0/pkg-mdadm-devel@lists.alioth.debian.org--2005/mdadm--upstream--1.12.0--patch-1/Monitor.c

  • Committer: Package Import Robot
  • Author(s): Scott James Remnant
  • Date: 2006-07-11 17:23:21 UTC
  • mfrom: (1.1.4)
  • Revision ID: package-import@ubuntu.com-20060711172321-070tz7lox9adujtw
Tags: 2.4.1-6ubuntu1
* Merge from debian unstable, remaining changes:
  - integration with initramfs-tools,
  - autocreate devices when udev is in use,
  - use lstat in mdopen.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
/*
2
 
 * mdadm - manage Linux "md" devices aka RAID arrays.
3
 
 *
4
 
 * Copyright (C) 2001-2002 Neil Brown <neilb@cse.unsw.edu.au>
5
 
 *
6
 
 *
7
 
 *    This program is free software; you can redistribute it and/or modify
8
 
 *    it under the terms of the GNU General Public License as published by
9
 
 *    the Free Software Foundation; either version 2 of the License, or
10
 
 *    (at your option) any later version.
11
 
 *
12
 
 *    This program is distributed in the hope that it will be useful,
13
 
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 
 *    GNU General Public License for more details.
16
 
 *
17
 
 *    You should have received a copy of the GNU General Public License
18
 
 *    along with this program; if not, write to the Free Software
19
 
 *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20
 
 *
21
 
 *    Author: Neil Brown
22
 
 *    Email: <neilb@cse.unsw.edu.au>
23
 
 *    Paper: Neil Brown
24
 
 *           School of Computer Science and Engineering
25
 
 *           The University of New South Wales
26
 
 *           Sydney, 2052
27
 
 *           Australia
28
 
 */
29
 
 
30
 
#include        "mdadm.h"
31
 
#include        "md_p.h"
32
 
#include        "md_u.h"
33
 
#include        <sys/wait.h>
34
 
#include        <sys/signal.h>
35
 
#include        <values.h>
36
 
 
37
 
static void alert(char *event, char *dev, char *disc, char *mailaddr, char *cmd);
38
 
 
39
 
/*
 * Event names for rebuild-progress alerts.  Monitor() indexes this table
 * with (rebuild percent / 20), so entry N is the alert raised when the
 * rebuild first reaches N*20 percent.
 */
static char *percentalerts[] = {
        "RebuildStarted",       /*  0% - 19% */
        "Rebuild20",            /* 20% - 39% */
        "Rebuild40",            /* 40% - 59% */
        "Rebuild60",            /* 60% - 79% */
        "Rebuild80",            /* 80% - 99% */
};
46
 
 
47
 
/*
 * Monitor - watch md arrays and raise alerts when their state changes.
 *
 *   devlist    arrays to watch; when NULL the list is taken from the
 *              config file via conf_get_ident()
 *   mailaddr   address to mail alerts to; when NULL it is looked up with
 *              conf_get_mailaddr()
 *   alert_cmd  program to run for each alert; when NULL it is looked up
 *              with conf_get_program()
 *   period     passed to mdstat_wait() between polls
 *   daemonise  non-zero: fork, report the child pid (stdout or pidfile)
 *              and carry on in the child detached from the terminal
 *   scan       non-zero: also pick up arrays that appear in the mdstat
 *              data but were not in the initial list
 *   oneshot    non-zero: stop after a pass that finds no new arrays
 *   config     config file handle/name handed to the conf_get_*() helpers
 *   test       non-zero: send a "TestMessage" alert for every array on
 *              the first pass
 *   pidfile    where the parent writes the daemon pid; removed again when
 *              the monitoring loop ends
 *
 * Returns 0 on normal completion (and in the parent after a successful
 * fork), 1 when there is nothing to alert with under --scan or when
 * fork() fails.
 */
int Monitor(mddev_dev_t devlist,
            char *mailaddr, char *alert_cmd,
            int period, int daemonise, int scan, int oneshot,
            char *config, int test, char* pidfile)
{
        /*
         * Every few seconds, scan every md device looking for changes
         * When a change is found, log it, possibly run the alert command,
         * and possibly send Email
         *
         * For each array, we record:
         *   Update time
         *   active/working/failed/spare drives
         *   State of each device.
         *   %rebuilt if rebuilding
         *
         * If the update time changes, check out all the data again
         * It is possible that we cannot get the state of each device
         * due to bugs in the md kernel module.
         * We also read /proc/mdstat to get rebuild percent,
         * and to get state on all active devices in case of kernel bug.
         *
         * Events are:
         *    Fail
         *      An active device had Faulty set or Active/Sync removed
         *    FailSpare
         *      A spare device had Faulty set
         *    SpareActive
         *      An active device had a reverse transition
         *    RebuildStarted
         *      percent went from -1 to +ve
         *    Rebuild20 Rebuild40 Rebuild60 Rebuild80
         *      percent went from below to not-below that number
         *    DeviceDisappeared
         *      Couldn't access a device which was previously visible
         *
         * if we detect an array with active<raid and spare==0
         * we look at other arrays that have same spare-group
         * If we find one with active==raid and spare>0,
         *  and if we can get_disk_info and find a name
         *  Then we hot-remove and hot-add to the other array
         *
         * If devlist is NULL, then we can monitor everything because --scan
         * was given.  We get an initial list from config file and add anything
         * that appears in /proc/mdstat
         */

        struct state {
                char *devname;
                int devnum;     /* to sync with mdstat info */
                long utime;
                int err;
                char *spare_group;
                int active, working, failed, spare, raid;
                int expected_spares;
                int devstate[MD_SB_DISKS];
                int devid[MD_SB_DISKS];
                int percent;
                struct state *next;
        } *statelist = NULL;
        struct mdstat_ent *mdstat = NULL;

        if (!mailaddr) {
                mailaddr = conf_get_mailaddr(config);
                if (mailaddr && ! scan)
                        fprintf(stderr, Name ": Monitor using email address \"%s\" from config file\n",
                               mailaddr);
        }
        if (!alert_cmd) {
                alert_cmd = conf_get_program(config);
                if (alert_cmd && ! scan)
                        fprintf(stderr, Name ": Monitor using program \"%s\" from config file\n",
                               alert_cmd);
        }
        if (scan && !mailaddr && !alert_cmd) {
                fprintf(stderr, Name ": No mail address or alert command - not monitoring.\n");
                return 1;
        }

        if (daemonise) {
                int pid = fork();
                if (pid > 0) {
                        /* parent: report the child's pid and leave */
                        if (!pidfile)
                                printf("%d\n", pid);
                        else {
                                FILE *pid_file;
                                pid_file=fopen(pidfile, "w");
                                if (!pid_file)
                                        perror("cannot create pid file");
                                else {
                                        fprintf(pid_file,"%d\n", pid);
                                        fclose(pid_file);
                                }
                        }
                        return 0;
                }
                if (pid < 0) {
                        perror("daemonise");
                        return 1;
                }
                /*
                 * child: point stdin/stdout/stderr at /dev/null.  The
                 * descriptor is duplicated onto stdout and stderr, so it
                 * has to be opened read-write.
                 */
                close(0);
                open("/dev/null", O_RDWR);
                dup2(0,1);
                dup2(0,2);
                setsid();
        }

        /*
         * Build the initial list of arrays to watch.  Entries are
         * calloc()ed so that the disk counters and the per-slot
         * devstate/devid tables start out as zero: they are consulted
         * (by the change detection and by the spare-migration pass)
         * before the first successful poll has filled them in.
         */
        if (devlist == NULL) {
                mddev_ident_t mdlist = conf_get_ident(config, NULL);
                for (; mdlist; mdlist=mdlist->next) {
                        struct state *st = calloc(1, sizeof *st);
                        if (st == NULL)
                                continue;
                        st->devname = strdup(mdlist->devname);
                        if (st->devname == NULL) {
                                free(st);
                                continue;
                        }
                        st->utime = 0;
                        st->next = statelist;
                        st->err = 0;
                        st->devnum = MAXINT;
                        st->percent = -2;
                        st->expected_spares = mdlist->spare_disks;
                        if (mdlist->spare_group)
                                st->spare_group = strdup(mdlist->spare_group);
                        else
                                st->spare_group = NULL;
                        statelist = st;
                }
        } else {
                mddev_dev_t dv;
                for (dv=devlist ; dv; dv=dv->next) {
                        mddev_ident_t mdlist = conf_get_ident(config, dv->devname);
                        struct state *st = calloc(1, sizeof *st);
                        if (st == NULL)
                                continue;
                        st->devname = strdup(dv->devname);
                        if (st->devname == NULL) {
                                free(st);
                                continue;
                        }
                        st->utime = 0;
                        st->next = statelist;
                        st->err = 0;
                        st->devnum = MAXINT;
                        st->percent = -2;
                        st->expected_spares = -1;
                        st->spare_group = NULL;
                        if (mdlist) {
                                st->expected_spares = mdlist->spare_disks;
                                if (mdlist->spare_group)
                                        st->spare_group = strdup(mdlist->spare_group);
                        }
                        statelist = st;
                }
        }

        /* Poll until a oneshot pass completes without finding new arrays. */
        for (;;) {
                int new_found = 0;
                struct state *st;

                if (mdstat)
                        free_mdstat(mdstat);
                mdstat = mdstat_read(oneshot?0:1);

                for (st=statelist; st; st=st->next) {
                        mdu_array_info_t array;
                        struct mdstat_ent *mse = NULL, *mse2;
                        char *dev = st->devname;
                        int fd;
                        unsigned int i;

                        if (test)
                                alert("TestMessage", dev, NULL, mailaddr, alert_cmd);
                        fd = open(dev, O_RDONLY);
                        if (fd < 0) {
                                /* only report the first failure in a row */
                                if (!st->err)
                                        alert("DeviceDisappeared", dev, NULL,
                                              mailaddr, alert_cmd);
                                st->err=1;
                                continue;
                        }
                        if (ioctl(fd, GET_ARRAY_INFO, &array)<0) {
                                if (!st->err)
                                        alert("DeviceDisappeared", dev, NULL,
                                              mailaddr, alert_cmd);
                                st->err=1;
                                close(fd);
                                continue;
                        }
                        /*
                         * Only these levels are monitored (-4 is presumably
                         * multipath, going by the "multipath" entry in the
                         * scan filter further down).
                         */
                        if (array.level != 1 && array.level != 5 && array.level != -4 &&
                                array.level != 6 && array.level != 10) {
                                if (!st->err)
                                        alert("DeviceDisappeared", dev, "Wrong-Level",
                                              mailaddr, alert_cmd);
                                st->err = 1;
                                close(fd);
                                continue;
                        }
                        if (st->devnum == MAXINT) {
                                /* first contact: work out the mdstat device number */
                                struct stat stb;
                                if (fstat(fd, &stb) == 0 &&
                                    (S_IFMT&stb.st_mode)==S_IFBLK) {
                                        if (major(stb.st_rdev) == MD_MAJOR)
                                                st->devnum = minor(stb.st_rdev);
                                        else
                                                st->devnum = -1- (minor(stb.st_rdev)>>6);
                                }
                        }

                        for (mse2 = mdstat ; mse2 ; mse2=mse2->next)
                                if (mse2->devnum == st->devnum) {
                                        mse2->devnum = MAXINT; /* flag it as "used" */
                                        mse = mse2;
                                }

                        if (st->utime == array.utime &&
                            st->failed == array.failed_disks &&
                            st->working == array.working_disks &&
                            st->spare == array.spare_disks &&
                            (mse == NULL  || (
                                    mse->percent == st->percent
                                    ))) {
                                /* nothing changed since the last poll */
                                close(fd);
                                st->err = 0;
                                continue;
                        }
                        if (st->utime == 0 && /* new array */
                            mse &&      /* is in /proc/mdstat */
                            mse->pattern && strchr(mse->pattern, '_') /* degraded */
                                )
                                alert("DegradedArray", dev, NULL, mailaddr, alert_cmd);

                        if (st->utime == 0 && /* new array */
                            st->expected_spares > 0 &&
                            array.spare_disks < st->expected_spares)
                                alert("SparesMissing", dev, NULL, mailaddr, alert_cmd);
                        if (mse &&
                            st->percent == -1 &&
                            mse->percent >= 0)
                                alert("RebuildStarted", dev, NULL, mailaddr, alert_cmd);
                        if (mse &&
                            st->percent >= 0 &&
                            mse->percent >= 0 &&
                            (mse->percent / 20) > (st->percent / 20)) {
                                /* never index past the end of the table (percent == 100) */
                                size_t step = (size_t)(mse->percent / 20);
                                if (step < sizeof(percentalerts) / sizeof(percentalerts[0]))
                                        alert(percentalerts[step],
                                              dev, NULL, mailaddr, alert_cmd);
                        }

                        if (mse &&
                            mse->percent == -1 &&
                            st->percent >= 0)
                                alert("RebuildFinished", dev, NULL, mailaddr, alert_cmd);

                        if (mse)
                                st->percent = mse->percent;

                        for (i=0; i<MD_SB_DISKS; i++) {
                                mdu_disk_info_t disc;
                                int newstate=0;
                                int change;
                                char *dv = NULL;
                                disc.number = i;
                                if (ioctl(fd, GET_DISK_INFO, &disc)>= 0) {
                                        newstate = disc.state;
                                        dv = map_dev(disc.major, disc.minor);
                                } else {
                                        /*
                                         * The kernel told us nothing about this
                                         * slot: make sure no stale device number
                                         * is recorded, and fall back to the
                                         * mdstat pattern for the state.
                                         */
                                        disc.major = 0;
                                        disc.minor = 0;
                                        if (mse &&  mse->pattern && i < strlen(mse->pattern))
                                                switch(mse->pattern[i]) {
                                                case 'U': newstate = 6 /* ACTIVE/SYNC */; break;
                                                case '_': newstate = 0; break;
                                                default: break;
                                                }
                                }
                                change = newstate ^ st->devstate[i];
                                if (st->utime && change && !st->err) {
                                        if (i < (unsigned)array.raid_disks &&
                                            (((newstate&change)&(1<<MD_DISK_FAULTY)) ||
                                             ((st->devstate[i]&change)&(1<<MD_DISK_ACTIVE)) ||
                                             ((st->devstate[i]&change)&(1<<MD_DISK_SYNC)))
                                                )
                                                alert("Fail", dev, dv, mailaddr, alert_cmd);
                                        else if (i >= (unsigned)array.raid_disks &&
                                                 (disc.major || disc.minor) &&
                                                 st->devid[i] == makedev(disc.major, disc.minor) &&
                                                 ((newstate&change)&(1<<MD_DISK_FAULTY))
                                                )
                                                alert("FailSpare", dev, dv, mailaddr, alert_cmd);
                                        else if (i < (unsigned)array.raid_disks &&
                                                 (((st->devstate[i]&change)&(1<<MD_DISK_FAULTY)) ||
                                                  ((newstate&change)&(1<<MD_DISK_ACTIVE)) ||
                                                  ((newstate&change)&(1<<MD_DISK_SYNC)))
                                                )
                                                alert("SpareActive", dev, dv, mailaddr, alert_cmd);
                                }
                                /*
                                 * Remember the state we actually compared
                                 * against; disc.state is not valid when the
                                 * ioctl failed.
                                 */
                                st->devstate[i] = newstate;
                                st->devid[i] = makedev(disc.major, disc.minor);
                        }
                        close(fd);
                        st->active = array.active_disks;
                        st->working = array.working_disks;
                        st->spare = array.spare_disks;
                        st->failed = array.failed_disks;
                        st->utime = array.utime;
                        st->raid = array.raid_disks;
                        st->err = 0;
                }
                /* now check if there are any new devices found in mdstat */
                if (scan) {
                        struct mdstat_ent *mse;
                        for (mse=mdstat; mse; mse=mse->next) {
                                struct state *st;
                                mdu_array_info_t array;
                                char *name;
                                int fd;

                                /* MAXINT marks entries already matched above */
                                if (mse->devnum == MAXINT || mse->level == NULL)
                                        continue;
                                /* same set of levels the poll loop accepts */
                                if (strcmp(mse->level, "raid1") != 0 &&
                                    strcmp(mse->level, "raid5") != 0 &&
                                    strcmp(mse->level, "raid6") != 0 &&
                                    strcmp(mse->level, "raid10") != 0 &&
                                    strcmp(mse->level, "multipath") != 0)
                                        continue;

                                name = get_md_name(mse->devnum);
                                if (name == NULL)
                                        continue;
                                st = calloc(1, sizeof *st);
                                if (st == NULL)
                                        continue;
                                st->devname = strdup(name);
                                if (st->devname == NULL) {
                                        free(st);
                                        continue;
                                }
                                if ((fd = open(st->devname, O_RDONLY)) < 0 ||
                                    ioctl(fd, GET_ARRAY_INFO, &array)< 0) {
                                        /* no such array */
                                        if (fd >=0) close(fd);
                                        free(st->devname);
                                        free(st);
                                        continue;
                                }
                                close(fd);
                                st->utime = 0;
                                st->next = statelist;
                                st->err = 1;
                                st->devnum = mse->devnum;
                                st->percent = -2;
                                st->spare_group = NULL;
                                st->expected_spares = -1;
                                statelist = st;
                                alert("NewArray", st->devname, NULL, mailaddr, alert_cmd);
                                new_found = 1;
                        }
                }
                /* If an array has active < raid && spare == 0 && spare_group != NULL
                 * Look for another array with spare > 0 and active == raid and same spare_group
                 *  if found, choose a device and hotremove/hotadd
                 */
                for (st = statelist; st; st=st->next) {
                        struct state *st2;

                        if (st->active >= st->raid ||
                            st->spare != 0 ||
                            st->spare_group == NULL)
                                continue;

                        for (st2=statelist ; st2 ; st2=st2->next) {
                                int fd1, fd2;
                                int dev = -1;
                                int moved = 0;
                                int d;

                                if (st2 == st ||
                                    st2->spare <= 0 ||
                                    st2->active != st2->raid ||
                                    st2->spare_group == NULL ||
                                    strcmp(st->spare_group, st2->spare_group) != 0)
                                        continue;

                                /* try to remove and add */
                                fd1 = open(st->devname, O_RDONLY);
                                fd2 = open(st2->devname, O_RDONLY);
                                if (fd1 < 0 || fd2 < 0) {
                                        if (fd1>=0) close(fd1);
                                        if (fd2>=0) close(fd2);
                                        continue;
                                }
                                /* pick the first idle spare slot of the donor */
                                for (d=st2->raid; d<MD_SB_DISKS; d++) {
                                        if (st2->devid[d] > 0 &&
                                            st2->devstate[d] == 0) {
                                                dev = st2->devid[d];
                                                break;
                                        }
                                }
                                if (dev > 0 &&
                                    ioctl(fd2, HOT_REMOVE_DISK,
                                          (unsigned long)dev) == 0) {
                                        if (ioctl(fd1, HOT_ADD_DISK,
                                                  (unsigned long)dev) == 0) {
                                                alert("MoveSpare", st->devname, st2->devname, mailaddr, alert_cmd);
                                                moved = 1;
                                        } else {
                                                /* could not add it: give it back to the donor */
                                                ioctl(fd2, HOT_ADD_DISK, (unsigned long) dev);
                                        }
                                }
                                close(fd1);
                                close(fd2);
                                if (moved)
                                        break;
                        }
                }
                if (!new_found) {
                        if (oneshot)
                                break;
                        else
                                mdstat_wait(period);
                }
                /* the test message is only sent on the first pass */
                test = 0;
        }
        if (pidfile)
                unlink(pidfile);
        return 0;
}
446
 
 
447
 
 
448
 
static void alert(char *event, char *dev, char *disc, char *mailaddr, char *cmd)
449
 
{
450
 
        if (!cmd && !mailaddr) {
451
 
                time_t now = time(0);
452
 
               
453
 
                printf("%1.15s: %s on %s %s\n", ctime(&now)+4, event, dev, disc?disc:"unknown device");
454
 
        }
455
 
        if (cmd) {
456
 
                int pid = fork();
457
 
                switch(pid) {
458
 
                default:
459
 
                        waitpid(pid, NULL, 0);
460
 
                        break;
461
 
                case -1:
462
 
                        break;
463
 
                case 0:
464
 
                        execl(cmd, cmd, event, dev, disc, NULL);
465
 
                        exit(2);
466
 
                }
467
 
        }
468
 
        if (mailaddr && 
469
 
            (strncmp(event, "Fail", 4)==0 || 
470
 
             strncmp(event, "Test", 4)==0 ||
471
 
             strncmp(event, "Degrade", 7)==0)) {
472
 
                FILE *mp = popen(Sendmail, "w");
473
 
                if (mp) {
474
 
                        char hname[256];
475
 
                        gethostname(hname, sizeof(hname));
476
 
                        signal(SIGPIPE, SIG_IGN);
477
 
                        fprintf(mp, "From: " Name " monitoring <root>\n");
478
 
                        fprintf(mp, "To: %s\n", mailaddr);
479
 
                        fprintf(mp, "Subject: %s event on %s:%s\n\n", event, dev, hname);
480
 
 
481
 
                        fprintf(mp, "This is an automatically generated mail message from " Name "\n");
482
 
                        fprintf(mp, "running on %s\n\n", hname);
483
 
 
484
 
                        fprintf(mp, "A %s event had been detected on md device %s.\n\n", event, dev);
485
 
 
486
 
                        if (disc)
487
 
                                fprintf(mp, "It could be related to component device %s.\n\n", disc);
488
 
 
489
 
                        fprintf(mp, "Faithfully yours, etc.\n");
490
 
                        fclose(mp);
491
 
                }
492
 
 
493
 
        }
494
 
        /* FIXME log the event to syslog maybe */
495
 
}