1
# Copyright (c) 2010 OpenStack, LLC.
3
# Licensed under the Apache License, Version 2.0 (the "License");
4
# you may not use this file except in compliance with the License.
5
# You may obtain a copy of the License at
7
# http://www.apache.org/licenses/LICENSE-2.0
9
# Unless required by applicable law or agreed to in writing, software
10
# distributed under the License is distributed on an "AS IS" BASIS,
11
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
17
from urllib import unquote
20
from swift.common.utils import split_path, get_logger
22
# Month abbreviations arranged so that each name's list index equals its
# month number; the '_' placeholder fills index 0.
month_map = ['_', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
25
class AccessLogProcessor(object):
26
"""Transform proxy server access logs"""
28
def __init__(self, conf):
# Read the processor settings from ``conf`` (accessed through ``conf.get``),
# using the defaults given below when an option is absent.
29
# Server name that log_line_parser compares each line's server field with.
self.server_name = conf.get('server_name', 'proxy')
30
# 'lb_private_ips' is a comma-separated option; entries are whitespace-stripped.
self.lb_private_ips = [x.strip() for x in \
31
conf.get('lb_private_ips', '').split(',')\
33
# 'service_ips' is parsed the same way as 'lb_private_ips'.
self.service_ips = [x.strip() for x in \
34
conf.get('service_ips', '').split(',')\
36
# Fraction of the total line count; process() logs a warning when the
# number of bad lines exceeds total_lines * warn_percent.
self.warn_percent = float(conf.get('warn_percent', '0.8'))
37
# Logger used for the debug and warning messages emitted by this class.
self.logger = get_logger(conf)
39
def log_line_parser(self, raw_log):
40
'''given a raw access log line, return a dict of the good parts'''
# ``raw_log`` is one raw line of text. The parsed fields are collected in
# the dict ``d``, keyed by field name.
60
# The first 16 characters of the raw line are skipped; the remainder is
# split on single spaces and every field is URL-unquoted before unpacking.
processing_time) = (unquote(x) for x in raw_log[16:].split(' '))
62
# The offending line is reported (as its repr) at debug level.
self.logger.debug('Bad line data: %s' % repr(raw_log))
64
# The server field of the line must match the configured server_name.
if server != self.server_name:
65
# incorrect server name in log line
66
self.logger.debug('Bad server name: found "%s" expected "%s"' \
67
% (server, self.server_name))
72
# presumably breaks the request path into its account / container / object
# components -- confirm against swift.common.utils.split_path
object_name) = split_path(request, 2, 4, True)
73
# Keep only the text before the first '?' so that a query string never
# ends up inside the container, object or account name.
if container_name is not None:
74
container_name = container_name.split('?', 1)[0]
75
if object_name is not None:
76
object_name = object_name.split('?', 1)[0]
77
account = account.split('?', 1)[0]
80
# Separate the query string from the request path at the first '?'.
request, query = request.split('?', 1)
81
# Individual query arguments are '&'-separated.
args = query.split('&')
82
# Count each query argument. This is used later to aggregate
83
# the number of format, prefix, etc. queries.
86
# Split one argument into key and value at the first '='.
k, v = q.split('=', 1)
89
# Certain keys will get summed in stats reporting
90
# (format, path, delimiter, etc.). Save a "1" here
91
# to indicate that this request is 1 request for
94
# Copy the parsed fields into the result dict under their own names.
d['client_ip'] = client_ip
97
d['request'] = request
100
d['http_version'] = http_version
102
d['referrer'] = referrer
103
d['user_agent'] = user_agent
104
d['auth_token'] = auth_token
105
d['bytes_in'] = bytes_in
106
d['bytes_out'] = bytes_out
108
d['trans_id'] = trans_id
109
d['processing_time'] = processing_time
110
# The timestamp field must consist of exactly six '/'-separated parts.
day, month, year, hour, minute, second = timestamp.split('/')
112
# Turn the month abbreviation into a two-character number using its index
# in month_map; '%02s' pads with a space, which is then replaced by '0'.
month = ('%02s' % month_map.index(month)).replace(' ', '0')
119
d['account'] = account
120
d['container_name'] = container_name
121
d['object_name'] = object_name
122
# Any '-' in the byte counts is replaced by '0' before the int conversion.
d['bytes_out'] = int(d['bytes_out'].replace('-', '0'))
123
d['bytes_in'] = int(d['bytes_in'].replace('-', '0'))
124
# The status code is stored as an int.
d['code'] = int(d['code'])
127
def process(self, obj_stream, account, container, object_name):
128
'''generate hourly groupings of data from one access log file'''
# ``obj_stream`` is iterated to obtain the log lines. ``account``,
# ``container`` and ``object_name`` are joined with '/' to name the log in
# the warning message at the end of this method.
# Returns a dict mapping (account, year, month, day, hour) to a dict of
# counters for that hour.
129
hourly_aggr_info = {}
132
# Each item yielded by obj_stream is parsed as one log line.
for line in obj_stream:
133
line_data = self.log_line_parser(line)
138
# NOTE(review): this rebinds the ``account`` parameter (and, further down,
# ``object_name``) to the values parsed from the current line.
account = line_data['account']
139
container_name = line_data['container_name']
140
year = line_data['year']
141
month = line_data['month']
142
day = line_data['day']
143
hour = line_data['hour']
144
bytes_out = line_data['bytes_out']
145
bytes_in = line_data['bytes_in']
146
method = line_data['method']
147
code = int(line_data['code'])
148
object_name = line_data['object_name']
149
client_ip = line_data['client_ip']
152
# Classify the request by which path components are present: a container
# name without an object name is a container-level operation.
if not container_name:
154
elif container_name and not object_name:
155
op_level = 'container'
159
# Counters are grouped per account and per hour.
aggr_key = (account, year, month, day, hour)
160
d = hourly_aggr_info.get(aggr_key, {})
161
# Membership tests against the configured IP lists; presumably these
# decide the ``source`` label used in the keys below -- confirm.
if line_data['lb_ip'] in self.lb_private_ips:
166
if line_data['client_ip'] in self.service_ips:
169
# Accumulate the transferred byte totals per source.
d[(source, 'bytes_out')] = d.setdefault((
170
source, 'bytes_out'), 0) + bytes_out
171
d[(source, 'bytes_in')] = d.setdefault((source, 'bytes_in'), 0) + \
174
# Add this line's per-query-argument counts; an argument that is absent
# from line_data contributes 0.
d['format_query'] = d.setdefault('format_query', 0) + \
175
line_data.get('format', 0)
176
d['marker_query'] = d.setdefault('marker_query', 0) + \
177
line_data.get('marker', 0)
178
d['prefix_query'] = d.setdefault('prefix_query', 0) + \
179
line_data.get('prefix', 0)
180
d['delimiter_query'] = d.setdefault('delimiter_query', 0) + \
181
line_data.get('delimiter', 0)
182
path = line_data.get('path', 0)
183
d['path_query'] = d.setdefault('path_query', 0) + path
185
# Collapse the numeric status code to its class string, e.g. 404 -> '4xx'.
code = '%dxx' % (code / 100)
186
# Count one request under (source, op_level, method, status class).
key = (source, op_level, method, code)
187
d[key] = d.setdefault(key, 0) + 1
189
hourly_aggr_info[aggr_key] = d
190
# Warn when the bad-line count exceeds the configured fraction of all lines.
if bad_lines > (total_lines * self.warn_percent):
191
# NOTE(review): ``account`` and ``object_name`` may have been rebound by the
# loop above, so this name may be built from the last parsed line rather
# than from the arguments passed in -- confirm whether that is intended.
name = '/'.join([account, container, object_name])
192
self.logger.warning('I found a bunch of bad lines in %s '\
193
'(%d bad, %d total)' % (name, bad_lines, total_lines))
194
return hourly_aggr_info
196
def keylist_mapping(self):
# Build and return a dict that maps each reporting ("db") key to either a
# single row key or a set of row keys. The row keys have the same shapes as
# the counter keys written by process(): (source, 'bytes_in'/'bytes_out')
# and (source, level, verb, code class).
197
source_keys = 'service public'.split()
198
level_keys = 'account container object'.split()
199
verb_keys = 'GET PUT POST DELETE HEAD COPY'.split()
200
code_keys = '2xx 4xx 5xx'.split()
203
# <db key> : <row key> or <set of row keys>
204
'service_bw_in': ('service', 'bytes_in'),
205
'service_bw_out': ('service', 'bytes_out'),
206
'public_bw_in': ('public', 'bytes_in'),
207
'public_bw_out': ('public', 'bytes_out'),
208
'account_requests': set(),
209
'container_requests': set(),
210
'object_requests': set(),
211
'service_request': set(),
212
'public_request': set(),
215
# One initially empty set per verb and per status-code class.
for verb in verb_keys:
216
keylist_mapping[verb] = set()
217
for code in code_keys:
218
keylist_mapping[code] = set()
219
# Enumerate every (source, level, verb, code) combination and add it to
# each bucket it belongs to.
for source in source_keys:
220
for level in level_keys:
221
for verb in verb_keys:
222
for code in code_keys:
223
# Per-level buckets use a fixed level name, so the same tuples are added
# again on every ``level`` iteration; the sets ignore the duplicates.
keylist_mapping['account_requests'].add(
224
(source, 'account', verb, code))
225
keylist_mapping['container_requests'].add(
226
(source, 'container', verb, code))
227
keylist_mapping['object_requests'].add(
228
(source, 'object', verb, code))
229
# Per-source buckets use a fixed source name and are likewise re-added on
# every ``source`` iteration.
keylist_mapping['service_request'].add(
230
('service', level, verb, code))
231
keylist_mapping['public_request'].add(
232
('public', level, verb, code))
233
# Per-verb, per-status-class and overall buckets take the full combination.
keylist_mapping[verb].add(
234
(source, level, verb, code))
235
keylist_mapping[code].add(
236
(source, level, verb, code))
237
keylist_mapping['ops_count'].add(
238
(source, level, verb, code))
239
return keylist_mapping