~dpm/+junk/langpacks-by-inst

« back to all changes in this revision

Viewing changes to get_data.py

Committer: David Planella
Date: 2010-06-06 00:59:13 UTC
Revision ID: dpm@lillypilly-20100606005913-145mwdgcpg38yz6n

Ported all code to Python, moved modules to a local library location

files added:
data

get_data.py

files renamed:
generate_langpack_graphs.py => generate_report.py

Show diffs side-by-side

added added

removed removed

get_data.py

#!/usr/bin/python

# -*- coding: utf-8 -*-

### BEGIN LICENSE

#This program is free software: you can redistribute it and/or modify it

#under the terms of the GNU General Public License version 3, as published

#by the Free Software Foundation.

#This program is distributed in the hope that it will be useful, but

#WITHOUT ANY WARRANTY; without even the implied warranties of

#MERCHANTABILITY, SATISFACTORY QUALITY, or FITNESS FOR A PARTICULAR

#PURPOSE. See the GNU General Public License for more details.

#You should have received a copy of the GNU General Public License along

#with this program. If not, see <http://www.gnu.org/licenses/>.

### END LICENSE

import datetime

import urllib2

import urllib

import time

import os

import gzip

import csv

import re

from LocaleInfo import LocaleInfo

import simplejson as json

def get_http_timestamp(url):
    """Return the Last-Modified time reported for *url*.

    The header is parsed with the fixed format
    "%a, %d %b %Y %H:%M:%S GMT", so the result is a naive
    datetime.datetime carrying the GMT wall-clock values of the header.

    Returns None when the URL cannot be opened, when the response has no
    (or an empty) Last-Modified header, or when the header does not match
    the expected format.
    """
    try:
        sock = urllib2.urlopen(url)
    except Exception:
        # Best effort: any failure to open the URL means "no timestamp".
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        return None

    # Always release the connection, even if reading the headers fails.
    try:
        lm_string = sock.info().get("Last-Modified")
    finally:
        sock.close()

    if not lm_string:
        return None

    # Parse straight into a datetime. Going through time.mktime() and
    # datetime.fromtimestamp() would push the GMT value through the local
    # timezone, which can shift it by an hour around DST changes.
    try:
        return datetime.datetime.strptime(lm_string,
                                          "%a, %d %b %Y %H:%M:%S GMT")
    except ValueError:
        return None

def get_file_timestamp(filename):
    """Return the modification time of *filename* as a naive UTC datetime.

    Returns None when the modification time cannot be read from the file
    system (for example because the file does not exist).
    """
    try:
        mtime = os.path.getmtime(filename)
    except OSError:
        # Only file system errors mean "no timestamp"; anything else
        # (including KeyboardInterrupt) is allowed to propagate.
        return None
    return datetime.datetime.utcfromtimestamp(mtime)

def get_langpack_code(langpackname):
    """Extract the language code from a language pack package name.

    The name is split on '-'. A name made of exactly five parts yields
    the third and fourth parts joined by '-' (e.g.
    'language-pack-zh-hans-base' -> 'zh-hans'); any other name yields
    the third part alone (e.g. 'language-pack-de-base' -> 'de').

    Raises IndexError when the name has fewer than three parts.
    """
    parts = langpackname.split('-')
    if len(parts) != 5:
        return parts[2]
    return '-'.join(parts[2:4])

def get_language_name(langcode, li):
    """Return a display name for *langcode*, or the code itself on failure.

    *li* must provide ``translate(langcode, True)``; the text before the
    first ';' of its result is returned.  (Presumably *li* is a LocaleInfo
    instance and the result is a ';'-separated list of names -- confirm
    against the LocaleInfo module.)

    If the lookup or the split fails, *langcode* is returned unchanged.
    """
    try:
        return li.translate(langcode, True).split(';')[0]
    except Exception:
        # Best effort: fall back to the raw code for unknown languages.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        return langcode

# Directory holding the downloaded data and the files generated from it.
DATA_DIR = '/home/dpm/langpacks-by-inst/data'

# Not referenced in the visible part of this script.
PUBLIC_DIR = '/home/dpm/public_html'

# Gzipped installation data that the script downloads and parses.
URL = "http://popcon.ubuntu.com/by_inst.gz"

# Matches base language pack package names, e.g. 'language-pack-de-base'
# or 'language-pack-zh-hans-base'.
REGEXP = re.compile(r"^language-pack-[a-z]{2,3}(-han[st])?-base")

# Generated output files.
CSV_FILE = os.path.join(DATA_DIR, 'langpacks_by_inst.csv')
JSON_FILE = os.path.join(DATA_DIR, 'langpacks_by_inst.json')

# Local copy of the download, named after the last component of URL.
data_file = os.path.join(DATA_DIR, os.path.basename(URL))

# Download the data again only when the server's copy differs from ours.
http_timestamp = get_http_timestamp(URL)

if http_timestamp != get_file_timestamp(data_file):
    urllib.urlretrieve(URL, data_file)
    if http_timestamp is not None:
        # Stamp the download with the server's Last-Modified time (UTC).
        # Without this the file's mtime is the download time, the
        # comparison above never matches, and every run downloads again.
        since_epoch = http_timestamp - datetime.datetime(1970, 1, 1)
        mtime = since_epoch.days * 86400 + since_epoch.seconds
        os.utime(data_file, (mtime, mtime))

# Extract and put the language pack data in a CSV file
data_gz = gzip.open(data_file)
try:
    data_csv = csv.reader(data_gz, delimiter=' ', skipinitialspace=True)
    csv_out = open(CSV_FILE, "wb")
    try:
        writer = csv.writer(csv_out)
        # One header name per column written for each row below.
        writer.writerow(['# Language code', 'Installations',
                         'Language name', 'Timestamp'])
        li = LocaleInfo.LocaleInfo()
        data_json = []
        try:
            for row in data_csv:
                # Skip short rows (comments, separators) and every package
                # that is not a base language pack.
                if len(row) < 3 or not REGEXP.search(row[1]):
                    continue
                try:
                    lang_code = get_langpack_code(row[1])
                    installs = row[2]
                    lang_name = get_language_name(lang_code, li)
                    writer.writerow([lang_code, installs, lang_name,
                                     http_timestamp])
                    data_json.append({'installs': installs,
                                      'langpack': lang_name})
                except Exception:
                    # Best effort: a row that cannot be processed is
                    # skipped, but interrupts are no longer swallowed.
                    continue
        except csv.Error:
            # Best effort: stop at the first line the CSV reader cannot
            # parse and keep what has been collected so far.
            pass
    finally:
        csv_out.close()
finally:
    data_gz.close()

# Write the same data as JSON, making sure the file is flushed and closed.
f = open(JSON_FILE, mode='w')
try:
    json.dump(data_json, f)
finally:
    f.close()

Older »