~t-w-/+junk/zuluction

« back to all changes in this revision

Viewing changes to get_links_from_gallery_pages.py

  • Committer: Thorsten Wilms
  • Date: 2010-01-16 16:57:02 UTC
  • Revision ID: t_w_@freenet.de-20100116165702-meykox1tg77hensk
Make get_links script cleverer in case of added items (hopefully).

Show diffs side-by-side

added added

removed removed

Lines of Context:
6
6
import socket
7
7
import urllib2
8
8
 
9
 
pool_base_url = 'http://www.flickr.com/groups/ubuntu-artwork/pool/page'
 
9
base_url = 'http://www.flickr.com/groups/ubuntu-artwork/pool/page'
10
10
 
11
11
# Strings before and after the snippets to be extracted:
12
12
match_before_items_total = '<div class="Results">\('
26
26
 
27
27
# Return total number of items:
def get_total(string):
        """Extract the group's total item count from a gallery page.

        string -- HTML source of one Flickr pool page.
        Returns the count as an int (thousands separators stripped,
        e.g. '1,234' -> 1234).
        Raises IndexError if the count marker is not present in the page.
        """
        # find_between() is defined elsewhere in this file; it returns a
        # list of the snippets found between the two marker strings.
        matches = find_between(match_before_items_total, match_after_items_total, string)
        # Flickr formats large counts with commas; strip them before int().
        return int(re.sub(',', '', matches[0]))
33
33
 
34
 
 
35
 
# Preconditions for fetching gallery pages:
36
 
page_nr = 1
37
 
items_total = 0
38
 
success = False
39
 
items_gotten = 0
40
 
items_list = []
41
 
 
42
 
# Fetch gallery pages, extract shortened item links, write to stream:
43
 
while True:
44
 
        try:
45
 
                response = urllib2.urlopen(pool_base_url + str(page_nr))
46
 
        except IOError:
47
 
                print 'Failed to connect to the web'
48
 
                break
49
 
        
50
 
        s = response.read()
51
 
        response.close()
52
 
 
53
 
        new_items_total = get_total(s)
54
 
        if items_total != 0:
55
 
                if items_total != new_items_total:
56
 
                        # Total number of items changed, start from scratch!
 
34
# Fetch gallery pages, extract shortened item links, write to stream.
 
35
# Recurse if new items have been added during run:
 
36
def get_links(base_url, number=-1):
 
37
        # Preconditions for fetching gallery pages:
 
38
        page_nr = 1
 
39
        items_total = 0
 
40
        items_gotten = 0
 
41
        items_list = []
 
42
        additions = 0 # number of items added during run
 
43
        offset = 0 # offset resulting through new items, valid for only the current page
 
44
 
 
45
        while True:
 
46
                try:
 
47
                        response = urllib2.urlopen(base_url + str(page_nr))
 
48
                except IOError:
 
49
                        #response.close()
 
50
                        print 'Failed to connect to the web'
 
51
                        break
 
52
                
 
53
                s = response.read()
 
54
                response.close()
 
55
 
 
56
                new_items_total = get_total(s)
 
57
                if items_total != 0: # Not the first run
 
58
                        if items_total > new_items_total:
 
59
                                # At least one item has been deleted, we need to start from scratch!
 
60
                                items_list = []
 
61
                                page_nr = 1
 
62
                                print 'Reset!'
 
63
                        elif items_total < new_items_total:
 
64
                                # At least one item has been added, we need to avoid duplicates
 
65
                                # and fetch them afterwards
 
66
                                offset = new_items_total - items_total
 
67
                                additions += offset
 
68
                items_total = new_items_total
 
69
 
 
70
                items_of_the_page = find_between(match_before_item_link, match_after_item_link, s)
 
71
                
 
72
                #for item in items_of_the_page:
 
73
                #       items_list.append(item)
 
74
                        
 
75
                for i in range(0 + offset, len(items_of_the_page)):
 
76
                        items_list.append(items_of_the_page[i])
 
77
                        
 
78
                print 'Page ' + str(page_nr) + ' done, ' + str(len(items_of_the_page)) + ' items'
 
79
                print 'Total now: ' + str(len(items_list))
 
80
                
 
81
                
 
82
                # If this is the first call, the total number of items is the maximum.
 
83
                # Otherwise it's the number of items added during the previous run:
 
84
                if number == -1:
 
85
                        max = items_total
 
86
                else:
 
87
                        max = number
 
88
                
 
89
                if len(items_list) == max:
 
90
                        print 'Page ' + str(page_nr) + ' is the last'
 
91
                        if additions > 0:
 
92
                                for item in get_links(base_url, additions):
 
93
                                        items_list.append(item)
 
94
                        break
 
95
                elif len(items_list) > items_total:
 
96
                        print 'More items than possible!?'
57
97
                        items_list = []
58
 
                        page_nr = 1
59
 
                        print 'Reset!'
60
 
        items_total = new_items_total
61
 
 
62
 
        items_of_the_page = find_between(match_before_item_link, match_after_item_link, s)
63
 
        
64
 
        for item in items_of_the_page:
65
 
                items_list.append(item)
66
 
        
67
 
        print 'Page ' + str(page_nr) + ' done, ' + str(len(items_of_the_page)) + ' items'
68
 
        print 'Total now: ' + str(len(items_list)
69
 
        
70
 
        if len(items_list) == items_total:
71
 
                print 'Page ' + str(page_nr) + ' is the last'
72
 
                success = True
73
 
                break
74
 
        elif len(items_list) > items_total:
75
 
                print 'More items than possible!?'
76
 
                break
77
 
                
78
 
                
79
 
        page_nr += 1
80
 
                
81
 
print 'Final number of items:' + str(items_total)
 
98
                        break
 
99
                                                
 
100
                page_nr += 1
 
101
                
 
102
        print 'Final number of items:' + str(items_total)
 
103
 
 
104
        return items_list
 
105
 
82
106
 
83
107
# Write stream to file:
links = get_links(base_url)
# NOTE(review): the previous condition was len(links) > 1, which silently
# dropped the single-link case; any non-empty result should be written.
if len(links) > 0:
        f = open('links.txt', 'w')
        for link in links:
                f.write(link + '\n')
        f.close()