~jjed/archive-crawler/near-rewrite

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/env python3
import logging
import os.path
import sys

from optparse import OptionParser, Option
from src.AppMetadata import extract_metadata

# Command-line options understood by this script.  Each option's ``dest``
# doubles as a keyword-argument name: the parsed values are forwarded
# to ``extract_metadata`` as ``**kwargs`` by the ``__main__`` block.
_OPTIONS = (
    # Base URL of the package archive to read from.
    Option("-u", "--url", dest="archive_url",
           default="http://archive.ubuntu.com/ubuntu",
           help="url of an archive with 'dists/SUITE/Release'"),
    # Suite (release name) inside the archive.
    Option("-s", "--suite", dest="suite", default="oneiric",
           help="distro suite in the archive (eg 'oneiric')"),
    # Whitespace-separated architecture names; split into a list before use.
    Option("-a", "--arches", dest="arches", default="i386 amd64",
           help="architectures to search (eg 'i386 amd64')"),
    # Output directory; the run log is written here as well.
    Option("-o", "--out-dir", dest="out_dir", default="output/",
           help="directory where extracted metadata will output"),
    # Memory budget, in megabytes, parsed as an integer.
    Option("-m", "--memory", type="int", dest="memory", default=500,
           help="how many megabytes of data to load into memory"))


if __name__ == "__main__":
    # Parse the command line.  The script accepts options only, so any
    # leftover positional argument is reported and treated as an error.
    parser = OptionParser(option_list=_OPTIONS)
    opts, args = parser.parse_args()
    if args:
        parser.print_usage()
        print("extraneous arguments: {0}".format(" ".join(args)))
        sys.exit(1)

    # Copy the parsed options into a plain dict of keyword arguments and
    # split the whitespace-separated architecture string into a list.
    kwargs = dict(vars(opts))
    kwargs["arches"] = kwargs["arches"].split()

    # Make sure the output directory exists.  ``exist_ok=True`` avoids the
    # race between checking for the directory and creating it.
    out_dir = kwargs["out_dir"]
    os.makedirs(out_dir, exist_ok=True)

    # Log everything at DEBUG level to a file in the output directory;
    # filemode "w" starts a fresh log on every run.
    out_log = os.path.join(out_dir, "extract.log")
    logging.basicConfig(level=logging.DEBUG, filename=out_log, filemode="w",
                        format="%(asctime)s %(levelname)s %(message)s")

    # Run the extraction with the options supplied on the command line.
    extract_metadata(**kwargs)