# Copyright 2009 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Provides data sources to use for benchmarking."""
import ConfigParser
import glob
import os
import os.path
import random
import re
import sys
import time

# See if a third_party library exists -- use it if so.
try:
  from . import third_party
except ImportError:
  pass

from . import selectors
from . import util
# Pick the most accurate timer for a platform. Stolen from timeit.py:
if sys.platform[:3] == 'win':
  DEFAULT_TIMER = time.clock
else:
  DEFAULT_TIMER = time.time

# Shared per-process cache of parsed source data; all DataSources instances
# point at this dict so data files are only parsed once.
GLOBAL_DATA_CACHE = {}

# Hosts matching this are considered internal/private and filtered out.
INTERNAL_RE = re.compile(r'^0|\.pro[md]\.|\.corp|\.bor|\.hot$|internal|dmz|intra|\.\w$|\.\w{5,}$')

# ^.*[\w-]+\.[\w-]+\.[\w-]+\.[a-zA-Z]+\.$|^[\w-]+\.[\w-]{3,}\.[a-zA-Z]+\.$
FQDN_RE = re.compile(r'^.*\..*\..*\..*\.$|^.*\.[\w-]*\.\w{3,4}\.$|^[\w-]+\.[\w-]{4,}\.\w+\.')

# Bare IP addresses (digits and dots only).
IP_RE = re.compile(r'^[0-9.]+$')

DEFAULT_CONFIG_PATH = "config/data_sources.cfg"
MAX_NON_UNIQUE_RECORD_COUNT = 500000
MAX_FILE_MTIME_AGE_DAYS = 60
# NOTE(review): restored constant — referenced by ListSourcesWithDetails but
# missing from this garbled copy; confirm the value against upstream.
MIN_FILE_SIZE = 10000
MIN_RECOMMENDED_RECORD_COUNT = 200
# Below this percentage of FQDN-looking records, input is treated as bare
# domains and hostnames are synthesized for it.
MAX_FQDN_SYNTHESIZE_PERCENT = 4
class DataSources(object):
  """Discovers, parses, caches, and selects benchmark test data.

  Sources are described by an ini-style config file: each section names a
  source and lists glob search paths for its data files. Parsed host lists
  are cached in the module-level GLOBAL_DATA_CACHE so repeated reads are
  cheap across instances.

  NOTE(review): this file was reconstructed from a partially garbled copy;
  gap-filled branches are marked below and should be confirmed upstream.
  """

  def __init__(self, config_path=DEFAULT_CONFIG_PATH, status_callback=None):
    """Load source configuration and attach the shared data cache.

    Args:
      config_path: Path to the data-sources configuration file.
      status_callback: Optional callable(msg, **kwargs) for status output.
    """
    global GLOBAL_DATA_CACHE
    # All instances share one cache so each data file is parsed only once.
    self.source_cache = GLOBAL_DATA_CACHE
    self.source_config = {}
    self.status_callback = status_callback
    self._LoadConfigFromPath(config_path)

  def msg(self, msg, **kwargs):
    """Emit a status message via the callback, if one was supplied."""
    if self.status_callback:
      self.status_callback(msg, **kwargs)

  def _LoadConfigFromPath(self, path):
    """Populate self.source_config from the ini-style file at path.

    BUG FIX: previously ignored the path argument and always read the
    hard-coded 'config/data_sources.cfg'.
    """
    conf_file = util.FindDataFile(path)
    config = ConfigParser.ConfigParser()
    config.read(conf_file)
    for section in config.sections():
      if section not in self.source_config:
        self.source_config[section] = {
            'name': section,
            'search_paths': set(),
            # Store whether or not this data source contains personal data
            'full_hostnames': True,
        }

      for (key, value) in config.items(section):
        if key == 'name':
          self.source_config[section]['name'] = value
        elif key == 'full_hostnames' and int(value) == 0:
          self.source_config[section]['full_hostnames'] = False
        else:
          # Any other key is treated as a search-path glob for data files.
          self.source_config[section]['search_paths'].add(value)

  def ListSourceTypes(self):
    """Get a list of all data sources we know about."""
    return sorted(self.source_config.keys())

  def ListSourcesWithDetails(self):
    """Get a list of all data sources found with total counts.

    Returns:
      List of tuples in form of (short_name, full_name, full_hosts,
      # of entries), best sources first.
    """
    # Force each configured source to be read (and cached) before reporting.
    for source in self.ListSourceTypes():
      self._GetHostsFromSource(source, min_file_size=MIN_FILE_SIZE,
                               max_mtime_age_days=MAX_FILE_MTIME_AGE_DAYS)

    details = []
    for source in self.source_cache:
      details.append((source,
                      self.source_config[source]['name'],
                      self.source_config[source]['full_hostnames'],
                      len(self.source_cache[source])))
    # Full-hostname sources with the most entries sort first.
    return sorted(details, key=lambda x: (x[2], x[3]), reverse=True)

  def ListSourceTitles(self):
    """Return a list of sources in title + count format."""
    titles = []
    for (source_type, name, full_hostnames, count) in self.ListSourcesWithDetails():
      titles.append("%s (%s)" % (name, count))
    return titles

  def ConvertSourceTitleToType(self, detail):
    """Convert a detail name to a source type (None if nothing matches)."""
    for source_type in self.source_config:
      if detail.startswith(self.source_config[source_type]['name']):
        return source_type

  def GetBestSourceDetails(self):
    """Return the details tuple of the highest-ranked source."""
    return self.ListSourcesWithDetails()[0]

  def GetNameForSource(self, source):
    """Return the human-readable name for a source, or the source itself."""
    if source in self.source_config:
      return self.source_config[source]['name']
    else:
      # Most likely a custom file path
      return source

  def GetCachedRecordCountForSource(self, source):
    """Return how many records are cached for source (KeyError if unread)."""
    return len(self.source_cache[source])

  def _CreateRecordsFromHostEntries(self, entries):
    """Create records from hosts, removing duplicate entries and IP's.

    Args:
      entries: A list of test-data entries ('host' or 'record_type host').

    Returns:
      A tuple of (filtered records, synthesize_fqdns (Boolean)). The flag
      is True when fewer than MAX_FQDN_SYNTHESIZE_PERCENT of the surviving
      records look fully qualified, i.e. the input is mostly bare domains.
    """
    last_entry = None
    records = []
    full_host_count = 0
    for entry in entries:
      # Input is frequently pre-sorted; skipping adjacent dupes is cheap.
      if entry == last_entry:
        continue
      else:
        last_entry = entry

      if ' ' in entry:
        (record_type, host) = entry.split(' ')
      else:
        # Bare hostname: assume an A record.
        record_type = 'A'
        host = entry

      # Drop raw IP addresses and internal-looking hostnames.
      if not IP_RE.match(host) and not INTERNAL_RE.search(host):
        if not host.endswith('.'):
          # For a short string like this, simple addition is 54% faster than formatting
          host = host + '.'
        records.append((record_type, host))

        if FQDN_RE.match(host):
          full_host_count += 1

    # ROBUSTNESS FIX: avoid ZeroDivisionError on empty input.
    if not records:
      return (records, False)

    # Now that we've read everything, are we dealing with domains or full hostnames?
    full_host_percent = full_host_count / float(len(records)) * 100
    if full_host_percent < MAX_FQDN_SYNTHESIZE_PERCENT:
      synthesize_fqdns = True
    else:
      synthesize_fqdns = False
    return (records, synthesize_fqdns)

  def GetTestsFromSource(self, source, count=50, select_mode=None):
    """Parse records from source, and return request tuples to use for testing.

    This is tricky because we support 3 types of input data:

    - List of domains
    - List of full hostnames
    - List of record_type + hosts

    Args:
      source: Short name (or custom file path) of the data source.
      count: Number of records to select.
      select_mode: 'weighted', 'random', 'chunk', 'automatic', or None.

    Returns:
      A list of (record_type, hostname) tuples.
    """
    # Convert entries into tuples, determine if we are using full hostnames
    records = self._GetHostsFromSource(source)
    self.msg('Generating tests from %s (%s records, selecting %s %s)' % (self.GetNameForSource(source), len(records), count, select_mode))
    (records, synthesize_fqdns) = self._CreateRecordsFromHostEntries(records)

    # First try to resolve whether to use weighted or random.
    if select_mode in ('weighted', 'automatic', None):
      if len(records) != len(set(records)):
        if select_mode == 'weighted':
          self.msg('%s data contains duplicates, switching select_mode to random' % source)
        select_mode = 'random'
      else:
        select_mode = 'weighted'

    self.msg('Selecting %s out of %s %s records.' % (count, len(records), select_mode))
    # Now make the real selection.
    if select_mode == 'weighted':
      records = selectors.WeightedDistribution(records, count)
    elif select_mode == 'chunk':
      records = selectors.ChunkSelect(records, count)
    elif select_mode == 'random':
      records = selectors.RandomSelect(records, count)

    # NOTE(review): gap-filled branch — input that is predominantly bare
    # domains gets random prefixes so tests exercise realistic hostnames.
    if synthesize_fqdns:
      self.source_config[source]['full_hostnames'] = False
      self.msg('%s input appears to be predominantly domain names. Synthesizing FQDNs' % source)
      synthesized = []
      for (req_type, hostname) in records:
        if not FQDN_RE.match(hostname):
          hostname = self._GenerateRandomHostname(hostname)
        synthesized.append((req_type, hostname))
      records = synthesized

    return records

  def _GenerateRandomHostname(self, domain):
    """Generate a random hostname for a given domain."""
    # NOTE(review): gap-filled weights — roughly 70% www., 25% unchanged,
    # 3% static., 2% cache-N.; confirm thresholds against upstream.
    oracle = random.randint(0, 100)
    if oracle < 70:
      return 'www.%s' % domain
    elif oracle < 95:
      return domain
    elif oracle < 98:
      return 'static.%s' % domain
    else:
      return 'cache-%s.%s' % (random.randint(0, 10), domain)

  def _GetHostsFromSource(self, source, min_file_size=None, max_mtime_age_days=None):
    """Get data for a particular source. This needs to be fast.

    We support 3 styles of files:

    * One-per line list in form of record-type: host
    * One-per line list of unique domains
    * Any form with URL's.

    Args:
      source: Short name of the source (or a custom file path/glob).
      min_file_size: Ignore files smaller than this many bytes.
      max_mtime_age_days: Ignore files older than this many days.

    Returns:
      A list of host strings, or None when no usable file is found.

    The results of this function get cached.
    """
    if source in self.source_cache:
      return self.source_cache[source]
    filename = self._FindBestFileForSource(source, min_file_size=min_file_size,
                                           max_mtime_age_days=max_mtime_age_days)
    if not filename:
      return None

    size_mb = os.path.getsize(filename) / 1024.0 / 1024.0
    self.msg('Reading %s: %s (%0.1fMB)' % (self.GetNameForSource(source), filename, size_mb))
    start_clock = DEFAULT_TIMER()
    # Try URL-history extraction first; fall back to line-based parsing.
    hosts = self._ExtractHostsFromHistoryFile(filename)
    if not hosts:
      hosts = self._ReadDataFile(filename)
    duration = DEFAULT_TIMER() - start_clock
    if duration > 5:
      self.msg('%s data took %1.1fs to read!' % (self.GetNameForSource(source), duration))
    self.source_cache[source] = hosts
    return hosts

  def _ExtractHostsFromHistoryFile(self, path):
    """Get a list of sanitized records from a history file containing URLs."""
    # This regexp is fairly general (no ip filtering), since we need speed more
    # than precision at this stage.
    parse_re = re.compile(r'https*://([\-\w]+\.[\-\w\.]+)')
    # Binary mode: history files may contain arbitrary bytes. The with-block
    # fixes a file-handle leak in the original.
    with open(path, 'rb') as history_file:
      return parse_re.findall(history_file.read())

  def _ReadDataFile(self, path):
    """Read a line-based datafile, skipping comment lines."""
    records = []
    # with-block fixes a file-handle leak; iterating the file avoids
    # materializing all lines via readlines().
    with open(path) as datafile:
      for line in datafile:
        if not line.startswith('#'):
          records.append(line.rstrip())
    return records

  def _GetSourceSearchPaths(self, source):
    """Get a list of possible search paths (globs) for a given source."""

    # This is likely a custom file path
    if source not in self.source_config:
      return [source]

    search_paths = []
    environment_re = re.compile(r'%(\w+)%')

    # First get through resolving environment variables
    for path in self.source_config[source]['search_paths']:
      env_vars = set(environment_re.findall(path))
      for variable in env_vars:
        env_var = os.getenv(variable, False)
        if env_var:
          path = path.replace('%%%s%%' % variable, env_var)
        else:
          # An unset variable makes the whole path unusable.
          path = None
          break

      if path:
        # If everything is good, replace all '/' chars with the os path variable.
        path = path.replace('/', os.sep)
        search_paths.append(path)

        # This moment of weirdness brought to you by Windows XP(tm). If we find
        # a Local or Roaming keyword in path, add the other forms to the search
        # path as well.
        if sys.platform[:3] == 'win':
          keywords = ('Local', 'Roaming')
          for keyword in keywords:
            if keyword in path:
              # index-1 flips between the two keywords (0 -> -1 -> 'Roaming').
              replacement = keywords[keywords.index(keyword)-1]
              search_paths.append(path.replace('\\%s' % keyword, '\\%s' % replacement))
              search_paths.append(path.replace('\\%s' % keyword, ''))

    return search_paths

  def _FindBestFileForSource(self, source, min_file_size=None,
                             max_mtime_age_days=None):
    """Find the best file (newest over X size) to use for a given source type.

    Args:
      source: Short name of the source type (or custom path/glob).
      min_file_size: Ignore files smaller than this many bytes.
      max_mtime_age_days: Ignore files older than this many days.

    Returns:
      A file path, or None if no suitable file was found.
    """
    found = []
    for path in self._GetSourceSearchPaths(source):
      if not os.path.isabs(path):
        path = util.FindDataFile(path)

      for filename in glob.glob(path):
        if min_file_size and os.path.getsize(filename) < min_file_size:
          self.msg('Ignoring %s (only %s bytes)' % (filename, os.path.getsize(filename)))
        else:
          found.append(filename)

    if not found:
      return None

    # Prefer the most recently modified candidate.
    newest = sorted(found, key=os.path.getmtime)[-1]
    age_days = (time.time() - os.path.getmtime(newest)) / 86400
    if max_mtime_age_days and age_days > max_mtime_age_days:
      self.msg('Ignoring %s from %s (%2.0f days old)' % (newest, source, age_days))
      return None
    return newest
if __name__ == '__main__':
  # Smoke test: load the configured sources and show what was found.
  # Single-argument print() calls behave identically under Python 2 and 3.
  parser = DataSources()
  print(parser.ListSourceTypes())
  print(parser.ListSourcesWithDetails())
  best = parser.ListSourcesWithDetails()[0][0]
  # BUG FIX: was GetRecordsFromSource(), which does not exist; the
  # record-fetching API on DataSources is GetTestsFromSource().
  print(len(parser.GetTestsFromSource(best)))