# Copyright 2009 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Provides data sources to use for benchmarking."""
import ConfigParser
import glob
import os
import os.path
import random
import re
import sys
import time

# See if a third_party library exists -- use it if so.
try:
  from . import third_party
except ImportError:
  pass

from . import selectors
from . import util
# Pick the most accurate timer for a platform. Stolen from timeit.py:
if sys.platform[:3] == 'win':
  DEFAULT_TIMER = time.clock
else:
  DEFAULT_TIMER = time.time

# Shared per-process cache of parsed source data; all DataSources instances
# point at this dict so data files are only parsed once.
GLOBAL_DATA_CACHE = {}

# Hosts matching this are considered internal/private and filtered out.
INTERNAL_RE = re.compile(r'^0|\.pro[md]\.|\.corp|\.bor|\.hot$|internal|dmz|intra|\.\w$|\.\w{5,}$')

# ^.*[\w-]+\.[\w-]+\.[\w-]+\.[a-zA-Z]+\.$|^[\w-]+\.[\w-]{3,}\.[a-zA-Z]+\.$
FQDN_RE = re.compile(r'^.*\..*\..*\..*\.$|^.*\.[\w-]*\.\w{3,4}\.$|^[\w-]+\.[\w-]{4,}\.\w+\.')

# Bare IP addresses (digits and dots only).
IP_RE = re.compile(r'^[0-9.]+$')

DEFAULT_CONFIG_PATH = "config/data_sources.cfg"
MAX_NON_UNIQUE_RECORD_COUNT = 500000
MAX_FILE_MTIME_AGE_DAYS = 60
# NOTE(review): restored constant — referenced by ListSourcesWithDetails but
# missing from this garbled copy; confirm the value against upstream.
MIN_FILE_SIZE = 10000
MIN_RECOMMENDED_RECORD_COUNT = 200
# Below this percentage of FQDN-looking records, input is treated as bare
# domains and hostnames are synthesized for it.
MAX_FQDN_SYNTHESIZE_PERCENT = 4
class DataSources(object):
  """Discovers, parses, caches, and selects benchmark test data.

  Sources are described by an ini-style config file: each section names a
  source and lists glob search paths for its data files. Parsed host lists
  are cached in the module-level GLOBAL_DATA_CACHE so repeated reads are
  cheap across instances.

  NOTE(review): this file was reconstructed from a partially garbled copy;
  gap-filled branches are marked below and should be confirmed upstream.
  """

  def __init__(self, config_path=DEFAULT_CONFIG_PATH, status_callback=None):
    """Load source configuration and attach the shared data cache.

    Args:
      config_path: Path to the data-sources configuration file.
      status_callback: Optional callable(msg, **kwargs) for status output.
    """
    global GLOBAL_DATA_CACHE
    # All instances share one cache so each data file is parsed only once.
    self.source_cache = GLOBAL_DATA_CACHE
    self.source_config = {}
    self.status_callback = status_callback
    self._LoadConfigFromPath(config_path)

  def msg(self, msg, **kwargs):
    """Emit a status message via the callback, if one was supplied."""
    if self.status_callback:
      self.status_callback(msg, **kwargs)

  def _LoadConfigFromPath(self, path):
    """Populate self.source_config from the ini-style file at path.

    BUG FIX: previously ignored the path argument and always read the
    hard-coded 'config/data_sources.cfg'.
    """
    conf_file = util.FindDataFile(path)
    config = ConfigParser.ConfigParser()
    config.read(conf_file)
    for section in config.sections():
      if section not in self.source_config:
        self.source_config[section] = {
            'name': section,
            'search_paths': set(),
            # Store whether or not this data source contains personal data
            'full_hostnames': True,
        }

      for (key, value) in config.items(section):
        if key == 'name':
          self.source_config[section]['name'] = value
        elif key == 'full_hostnames' and int(value) == 0:
          self.source_config[section]['full_hostnames'] = False
        else:
          # Any other key is treated as a search-path glob for data files.
          self.source_config[section]['search_paths'].add(value)

  def ListSourceTypes(self):
    """Get a list of all data sources we know about."""
    return sorted(self.source_config.keys())

  def ListSourcesWithDetails(self):
    """Get a list of all data sources found with total counts.

    Returns:
      List of tuples in form of (short_name, full_name, full_hosts,
      # of entries), best sources first.
    """
    # Force each configured source to be read (and cached) before reporting.
    for source in self.ListSourceTypes():
      self._GetHostsFromSource(source, min_file_size=MIN_FILE_SIZE,
                               max_mtime_age_days=MAX_FILE_MTIME_AGE_DAYS)

    details = []
    for source in self.source_cache:
      details.append((source,
                      self.source_config[source]['name'],
                      self.source_config[source]['full_hostnames'],
                      len(self.source_cache[source])))
    # Full-hostname sources with the most entries sort first.
    return sorted(details, key=lambda x: (x[2], x[3]), reverse=True)

  def ListSourceTitles(self):
    """Return a list of sources in title + count format."""
    titles = []
    for (source_type, name, full_hostnames, count) in self.ListSourcesWithDetails():
      titles.append("%s (%s)" % (name, count))
    return titles

  def ConvertSourceTitleToType(self, detail):
    """Convert a detail name to a source type (None if nothing matches)."""
    for source_type in self.source_config:
      if detail.startswith(self.source_config[source_type]['name']):
        return source_type

  def GetBestSourceDetails(self):
    """Return the details tuple of the highest-ranked source."""
    return self.ListSourcesWithDetails()[0]

  def GetNameForSource(self, source):
    """Return the human-readable name for a source, or the source itself."""
    if source in self.source_config:
      return self.source_config[source]['name']
    else:
      # Most likely a custom file path
      return source

  def GetCachedRecordCountForSource(self, source):
    """Return how many records are cached for source (KeyError if unread)."""
    return len(self.source_cache[source])

  def _CreateRecordsFromHostEntries(self, entries):
    """Create records from hosts, removing duplicate entries and IP's.

    Args:
      entries: A list of test-data entries ('host' or 'record_type host').

    Returns:
      A tuple of (filtered records, synthesize_fqdns (Boolean)). The flag
      is True when fewer than MAX_FQDN_SYNTHESIZE_PERCENT of the surviving
      records look fully qualified, i.e. the input is mostly bare domains.
    """
    last_entry = None
    records = []
    full_host_count = 0
    for entry in entries:
      # Input is frequently pre-sorted; skipping adjacent dupes is cheap.
      if entry == last_entry:
        continue
      else:
        last_entry = entry

      if ' ' in entry:
        (record_type, host) = entry.split(' ')
      else:
        # Bare hostname: assume an A record.
        record_type = 'A'
        host = entry

      # Drop raw IP addresses and internal-looking hostnames.
      if not IP_RE.match(host) and not INTERNAL_RE.search(host):
        if not host.endswith('.'):
          # For a short string like this, simple addition is 54% faster than formatting
          host = host + '.'
        records.append((record_type, host))

        if FQDN_RE.match(host):
          full_host_count += 1

    # ROBUSTNESS FIX: avoid ZeroDivisionError on empty input.
    if not records:
      return (records, False)

    # Now that we've read everything, are we dealing with domains or full hostnames?
    full_host_percent = full_host_count / float(len(records)) * 100
    if full_host_percent < MAX_FQDN_SYNTHESIZE_PERCENT:
      synthesize_fqdns = True
    else:
      synthesize_fqdns = False
    return (records, synthesize_fqdns)

  def GetTestsFromSource(self, source, count=50, select_mode=None):
    """Parse records from source, and return request tuples to use for testing.

    This is tricky because we support 3 types of input data:

    - List of domains
    - List of full hostnames
    - List of record_type + hosts

    Args:
      source: Short name (or custom file path) of the data source.
      count: Number of records to select.
      select_mode: 'weighted', 'random', 'chunk', 'automatic', or None.

    Returns:
      A list of (record_type, hostname) tuples.
    """
    # Convert entries into tuples, determine if we are using full hostnames
    records = self._GetHostsFromSource(source)
    self.msg('Generating tests from %s (%s records, selecting %s %s)' % (self.GetNameForSource(source), len(records), count, select_mode))
    (records, synthesize_fqdns) = self._CreateRecordsFromHostEntries(records)

    # First try to resolve whether to use weighted or random.
    if select_mode in ('weighted', 'automatic', None):
      if len(records) != len(set(records)):
        if select_mode == 'weighted':
          self.msg('%s data contains duplicates, switching select_mode to random' % source)
        select_mode = 'random'
      else:
        select_mode = 'weighted'

    self.msg('Selecting %s out of %s %s records.' % (count, len(records), select_mode))
    # Now make the real selection.
    if select_mode == 'weighted':
      records = selectors.WeightedDistribution(records, count)
    elif select_mode == 'chunk':
      records = selectors.ChunkSelect(records, count)
    elif select_mode == 'random':
      records = selectors.RandomSelect(records, count)

    # NOTE(review): gap-filled branch — input that is predominantly bare
    # domains gets random prefixes so tests exercise realistic hostnames.
    if synthesize_fqdns:
      self.source_config[source]['full_hostnames'] = False
      self.msg('%s input appears to be predominantly domain names. Synthesizing FQDNs' % source)
      synthesized = []
      for (req_type, hostname) in records:
        if not FQDN_RE.match(hostname):
          hostname = self._GenerateRandomHostname(hostname)
        synthesized.append((req_type, hostname))
      records = synthesized

    return records

  def _GenerateRandomHostname(self, domain):
    """Generate a random hostname for a given domain."""
    # NOTE(review): gap-filled weights — roughly 70% www., 25% unchanged,
    # 3% static., 2% cache-N.; confirm thresholds against upstream.
    oracle = random.randint(0, 100)
    if oracle < 70:
      return 'www.%s' % domain
    elif oracle < 95:
      return domain
    elif oracle < 98:
      return 'static.%s' % domain
    else:
      return 'cache-%s.%s' % (random.randint(0, 10), domain)

  def _GetHostsFromSource(self, source, min_file_size=None, max_mtime_age_days=None):
    """Get data for a particular source. This needs to be fast.

    We support 3 styles of files:

    * One-per line list in form of record-type: host
    * One-per line list of unique domains
    * Any form with URL's.

    Args:
      source: Short name of the source (or a custom file path/glob).
      min_file_size: Ignore files smaller than this many bytes.
      max_mtime_age_days: Ignore files older than this many days.

    Returns:
      A list of host strings, or None when no usable file is found.

    The results of this function get cached.
    """
    if source in self.source_cache:
      return self.source_cache[source]
    filename = self._FindBestFileForSource(source, min_file_size=min_file_size,
                                           max_mtime_age_days=max_mtime_age_days)
    if not filename:
      return None

    size_mb = os.path.getsize(filename) / 1024.0 / 1024.0
    self.msg('Reading %s: %s (%0.1fMB)' % (self.GetNameForSource(source), filename, size_mb))
    start_clock = DEFAULT_TIMER()
    # Try URL-history extraction first; fall back to line-based parsing.
    hosts = self._ExtractHostsFromHistoryFile(filename)
    if not hosts:
      hosts = self._ReadDataFile(filename)
    duration = DEFAULT_TIMER() - start_clock
    if duration > 5:
      self.msg('%s data took %1.1fs to read!' % (self.GetNameForSource(source), duration))
    self.source_cache[source] = hosts
    return hosts

  def _ExtractHostsFromHistoryFile(self, path):
    """Get a list of sanitized records from a history file containing URLs."""
    # This regexp is fairly general (no ip filtering), since we need speed more
    # than precision at this stage.
    parse_re = re.compile(r'https*://([\-\w]+\.[\-\w\.]+)')
    # Binary mode: history files may contain arbitrary bytes. The with-block
    # fixes a file-handle leak in the original.
    with open(path, 'rb') as history_file:
      return parse_re.findall(history_file.read())

  def _ReadDataFile(self, path):
    """Read a line-based datafile, skipping comment lines."""
    records = []
    # with-block fixes a file-handle leak; iterating the file avoids
    # materializing all lines via readlines().
    with open(path) as datafile:
      for line in datafile:
        if not line.startswith('#'):
          records.append(line.rstrip())
    return records

  def _GetSourceSearchPaths(self, source):
    """Get a list of possible search paths (globs) for a given source."""

    # This is likely a custom file path
    if source not in self.source_config:
      return [source]

    search_paths = []
    environment_re = re.compile(r'%(\w+)%')

    # First get through resolving environment variables
    for path in self.source_config[source]['search_paths']:
      env_vars = set(environment_re.findall(path))
      for variable in env_vars:
        env_var = os.getenv(variable, False)
        if env_var:
          path = path.replace('%%%s%%' % variable, env_var)
        else:
          # An unset variable makes the whole path unusable.
          path = None
          break

      if path:
        # If everything is good, replace all '/' chars with the os path variable.
        path = path.replace('/', os.sep)
        search_paths.append(path)

        # This moment of weirdness brought to you by Windows XP(tm). If we find
        # a Local or Roaming keyword in path, add the other forms to the search
        # path as well.
        if sys.platform[:3] == 'win':
          keywords = ('Local', 'Roaming')
          for keyword in keywords:
            if keyword in path:
              # index-1 flips between the two keywords (0 -> -1 -> 'Roaming').
              replacement = keywords[keywords.index(keyword)-1]
              search_paths.append(path.replace('\\%s' % keyword, '\\%s' % replacement))
              search_paths.append(path.replace('\\%s' % keyword, ''))

    return search_paths

  def _FindBestFileForSource(self, source, min_file_size=None,
                             max_mtime_age_days=None):
    """Find the best file (newest over X size) to use for a given source type.

    Args:
      source: Short name of the source type (or custom path/glob).
      min_file_size: Ignore files smaller than this many bytes.
      max_mtime_age_days: Ignore files older than this many days.

    Returns:
      A file path, or None if no suitable file was found.
    """
    found = []
    for path in self._GetSourceSearchPaths(source):
      if not os.path.isabs(path):
        path = util.FindDataFile(path)

      for filename in glob.glob(path):
        if min_file_size and os.path.getsize(filename) < min_file_size:
          self.msg('Ignoring %s (only %s bytes)' % (filename, os.path.getsize(filename)))
        else:
          found.append(filename)

    if not found:
      return None

    # Prefer the most recently modified candidate.
    newest = sorted(found, key=os.path.getmtime)[-1]
    age_days = (time.time() - os.path.getmtime(newest)) / 86400
    if max_mtime_age_days and age_days > max_mtime_age_days:
      self.msg('Ignoring %s from %s (%2.0f days old)' % (newest, source, age_days))
      return None
    return newest
if __name__ == '__main__':
  # Smoke test: load the configured sources and show what was found.
  # Single-argument print() calls behave identically under Python 2 and 3.
  parser = DataSources()
  print(parser.ListSourceTypes())
  print(parser.ListSourcesWithDetails())
  best = parser.ListSourcesWithDetails()[0][0]
  # BUG FIX: was GetRecordsFromSource(), which does not exist; the
  # record-fetching API on DataSources is GetTestsFromSource().
  print(len(parser.GetTestsFromSource(best)))