# ~cthier/swift/s3_bucket_date

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python
# Copyright (c) 2010-2011 OpenStack, LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import os
import re
import subprocess
import sys
from ConfigParser import ConfigParser

from swift.common.utils import get_logger

# Regular expressions that get_errors() applies, via re.findall, to each
# recent kernel log line. Each pattern should contain exactly one capture
# group holding the device name (e.g. "sda"): every captured value is used
# as a key in the per-device error count.
# To search for more types of errors, add the regex to the list below
error_re = [
    'error.*(sd[a-z])',
    '(sd[a-z]).*error',
]

def get_devices(device_dir, logger):
    """
    Find the devices currently mounted under ``device_dir``.

    Reads /proc/mounts for mount points starting with ``device_dir``, stats
    each backing block device to obtain its major/minor numbers, and then
    uses /proc/partitions to map those numbers to the kernel's device name.

    :param device_dir: path prefix of the mount points to report
    :param logger: logger used to report block devices that cannot be
                   stat'ed; such devices are left out of the result
    :returns: list of dicts with the keys 'mount_point', 'block_device',
              'major' and 'minor' (numbers as strings) and, only when the
              device number is listed in /proc/partitions, 'kernel_device'
    """
    devices = []
    with open('/proc/mounts') as mounts:
        for line in mounts:
            data = line.split()
            if len(data) < 2:
                # Blank or malformed line: no mount point to inspect
                continue
            block_device, mount_point = data[:2]
            if not mount_point.startswith(device_dir):
                continue
            try:
                device_num = os.stat(block_device).st_rdev
            except OSError:
                # If we can't stat the device, then something weird is going on
                logger.error("Error: Could not stat %s!" % block_device)
                continue
            devices.append({
                'mount_point': mount_point,
                'block_device': block_device,
                'major': str(os.major(device_num)),
                'minor': str(os.minor(device_num)),
            })

    # Index by device number so each /proc/partitions line is one lookup.
    # Several mount points may share a device number; all of them get the
    # kernel device name.
    by_number = {}
    for device in devices:
        key = (device['major'], device['minor'])
        by_number.setdefault(key, []).append(device)

    with open('/proc/partitions') as partitions:
        # The first two lines are the column header and a blank separator
        for line in partitions.readlines()[2:]:
            fields = line.split()
            if len(fields) != 4:
                continue
            major, minor, _blocks, kernel_device = fields
            for device in by_number.get((major, minor), ()):
                device['kernel_device'] = kernel_device
    return devices

def _parse_log_time(line, now):
    """
    Return the timestamp that starts a syslog-style ``line``, or None.

    The first three whitespace-separated fields are parsed as
    "Mon DD HH:MM:SS". Such timestamps carry no year, so the current year is
    tried first; if that is invalid or lands in the future (an entry written
    before a year rollover) the previous year is used instead.

    :param line: one line of the log file
    :param now: datetime the audit is running at
    :returns: a datetime, or None when the line has no parseable timestamp
    """
    stamp = ' '.join(line.split()[:3])
    # Allow some slack so entries written while the audit runs still count
    latest = now + datetime.timedelta(days=1)
    for year in (now.year, now.year - 1):
        try:
            log_time = datetime.datetime.strptime(
                '%s %s' % (year, stamp), '%Y %b %d %H:%M:%S')
        except ValueError:
            continue
        if log_time <= latest:
            return log_time
    return None

def get_errors(minutes, log_file='/var/log/kern.log', patterns=None):
    """
    Count recent device errors reported in the kernel log.

    Only entries logged within the last ``minutes`` minutes and after the
    most recent boot marker ("[    0.000000]") are considered. Lines without
    a parseable timestamp are skipped.

    :param minutes: how far back, in minutes, to look for errors
    :param log_file: path of the kernel log to scan
    :param patterns: regular expressions with one capture group naming the
                     device; defaults to the module-level ``error_re``
    :returns: dict mapping each captured device name to its match count
    """
    if patterns is None:
        patterns = error_re
    # Compile once rather than handing the raw pattern to re for every line
    compiled = [re.compile(pattern) for pattern in patterns]
    now = datetime.datetime.now()
    start_time = now - datetime.timedelta(minutes=minutes)
    errors = {}
    with open(log_file) as log:
        for line in log:
            if '[    0.000000]' in line:
                # Ignore anything before the last boot
                errors = {}
                continue
            log_time = _parse_log_time(line, now)
            if log_time is None or log_time <= start_time:
                continue
            for regex in compiled:
                for device in regex.findall(line):
                    errors[device] = errors.get(device, 0) + 1
    return errors

def comment_fstab(mount_point, fstab_path='/etc/fstab'):
    """
    Comment out the fstab entry for ``mount_point``.

    The updated table is written to ``fstab_path`` + '.new' and then renamed
    over ``fstab_path``, so the original is replaced in a single step rather
    than edited in place.

    :param mount_point: mount point (second fstab field) to disable
    :param fstab_path: path of the fstab file to rewrite
    """
    new_path = fstab_path + '.new'
    with open(fstab_path, 'r') as fstab:
        with open(new_path, 'w') as new_fstab:
            for line in fstab:
                parts = line.split()
                # Entries that are already commented out are left alone so
                # that repeated runs do not keep stacking '#' characters.
                if (len(parts) > 2 and parts[1] == mount_point
                        and not parts[0].startswith('#')):
                    new_fstab.write('#' + line)
                else:
                    new_fstab.write(line)
    os.rename(new_path, fstab_path)

# Script entry point: read the [drive-audit] section of the config file given
# on the command line, count recent kernel-log errors per device, and unmount
# (and comment out of /etc/fstab) every audited drive that reached the limit.
if __name__ == '__main__':
    c = ConfigParser()
    try:
        conf_path = sys.argv[1]
    except IndexError:
        # Single parenthesised argument: prints the same on Python 2 and 3
        print("Usage: %s CONF_FILE" % sys.argv[0].split('/')[-1])
        sys.exit(1)
    if not c.read(conf_path):
        print("Unable to read config file %s" % conf_path)
        sys.exit(1)
    conf = dict(c.items('drive-audit'))
    device_dir = conf.get('device_dir', '/srv/node')
    minutes = int(conf.get('minutes', 60))
    error_limit = int(conf.get('error_limit', 1))
    conf['log_name'] = conf.get('log_name', 'drive-audit')
    logger = get_logger(conf, log_route='drive-audit')
    devices = get_devices(device_dir, logger)
    logger.debug("Devices found: %s" % str(devices))
    if not devices:
        logger.error("Error: No devices found!")
    errors = get_errors(minutes)
    logger.debug("Errors found: %s" % str(errors))
    unmounts = 0
    for kernel_device, count in errors.items():
        if count < error_limit:
            continue
        # The log names whole drives (e.g. "sdb") while mounted devices are
        # usually partitions (e.g. "sdb1"), hence the prefix match. Devices
        # without a known kernel name can never match.
        device = [d for d in devices
                  if d.get('kernel_device', '').startswith(kernel_device)]
        if not device:
            continue
        mount_point = device[0]['mount_point']
        # Only ever touch mounts below the configured device directory
        if mount_point.startswith(device_dir):
            logger.info("Unmounting %s with %d errors" %
                        (mount_point, count))
            subprocess.call(['umount', '-fl', mount_point])
            logger.info("Commenting out %s from /etc/fstab" %
                        (mount_point))
            comment_fstab(mount_point)
            unmounts += 1
    if unmounts == 0:
        logger.info("No drives were unmounted")