1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
|
// This code (and its parent process in changes.js) is a Node.JS listener
// listening to CouchDB's _changes feed, and is derived from
// https://github.com/mikeal/node.couch.js and
// http://dominicbarnes.us/node-couchdb-api/api/database/changes.html
// It monitors the feed for documents whose status flags request work:
// - (when configuring a directory's settings) fetch the contents of a single URL
// - (when a directory is already configured) download all congregation data for a directory
// TODO: Could we use backbone-couch.js here instead of cradle, in order to use our
// Backbone model here?
var buffer = '',
http = require('http'),
https = require('https'),
cwd = process.cwd(),
config = require(cwd + '/config'),
db = config.db,
log = require(cwd + '/lib').log;
//$ = require('jquery');
//var model = require('model.js').model
//stdin = process.openStdin();
if (config.debug)
var longjohn = require(cwd + '/node_modules/longjohn')
//stdin.setEncoding('utf8');
// Declare utility functions
// Fetch the contents of doc[from_url] over HTTP or HTTPS, store the response
// body in doc[to_html], set doc[status_flag] to 'gotten', and save the doc
// back to CouchDB. If options.success is provided it is invoked after the
// save round-trips.
//   doc         - the CouchDB document being updated
//   from_url    - name of the doc attribute holding the URL to fetch
//   to_html     - name of the doc attribute to write the response body into
//   status_flag - name of the doc attribute tracking this task's state
//   options     - optional; { success: callback } fired after the db.save
function get_url(doc, from_url, to_html, status_flag, options){
    // Switch to the https client when the URL scheme requires it
    var http_lib = doc[from_url].indexOf('https') === 0 ? https : http
    http_lib.get(doc[from_url], function(res){
        var pageData = ''
        res.on('data', function(chunk){
            pageData += chunk
        })
        res.on('end', function(){
            // TODO: Check to see if we got a 404 response
            // Write the contents of the html variable back to the database
            doc[to_html] = pageData
            doc[status_flag] = 'gotten'
            console.log(new Date().getTime() + '\t n: ' + status_flag + ': ' + doc[status_flag] + ' ' + doc[from_url])
            // TODO: Use Backbone here instead of cradle
            db.save(doc._id, doc._rev, doc, function(err, res){
                if (err){
                    // Previously this error was silently dropped; log it so
                    // stuck 'requested' flags can be diagnosed.
                    console.error('Failed to save ' + status_flag + ' for doc ' + doc._id, err)
                }
                if (options && options.success){
                    options.success()
                }
            });
        })
    }).on('error', function(err){
        // Without this handler a network failure emits an unhandled 'error'
        // event and crashes the whole listener process.
        console.error('Failed to fetch ' + doc[from_url], err)
    });
}
// Persist the downloaded HTML array onto a freshly-fetched copy of the doc
// (re-fetching first so we save against the latest _rev). On a save conflict
// it retries up to 5 times before giving up; on any terminal outcome the
// options.status_flag entry is removed from currently_getting so the task
// lock is never leaked.
//   options.doc               - the CouchDB document (re-fetched here)
//   options.to_html           - attribute to receive options.output_array
//   options.status_flag       - attribute marking this task's state
//   options.number_downloaded - attribute deleted once the set is complete
//   options.save_attempts     - retry counter, initialized by the caller
function save(options){
    db.get(options.doc._id, function(err, doc){
        if (err || !doc || typeof doc._id === 'undefined'){
            // Previously this failure was silently ignored, leaving the task
            // lock held forever; log it and release the lock instead.
            console.error('Failed to fetch doc before saving', err)
            currently_getting.splice(currently_getting.indexOf(options.status_flag), 1)
            return
        }
        options.doc = doc
        // Save to the db all the HTML we've gotten
        // TODO: This is running several times in series
        options.doc[options.to_html] = options.output_array
        options.doc[options.status_flag] = 'gotten';
        // Deletes number downloaded since it's not needed anymore
        delete options.doc[options.number_downloaded]
        db.save(options.doc._id, options.doc._rev, options.doc, function(err, response){
            if (err !== null){
                console.error(err)
                // Recurse to try saving again.
                // Only recurse a certain number of times, then fail, to avoid a memory leak
                if (options.save_attempts <= 5){
                    options.save_attempts++;
                    console.log('options.save_attempts: ' + options.save_attempts)
                    save(options)
                }else{
                    // TODO: This is where we get an error. For some reason sometimes,
                    // but not always, we have the wrong revision here, and this causes get_state_url_html
                    // to never == 'gotten', (so the state details page doesn't display?)
                    console.error('Failed to save doc: ' + options.doc._id, options.doc._rev)
                    // Release the task lock so a later change can retry this task
                    currently_getting.splice(currently_getting.indexOf(options.status_flag), 1)
                }
            }else{
                console.log('Succeeded at saving all the states\' HTML pages')
                options.output_array_saved = true
                // Remove this options.status_flag from the list of tasks
                currently_getting.splice(currently_getting.indexOf(options.status_flag),1)
                // Clean up some memory
                options.output_array = []
            }
        })
    })
}
// After URL i has been downloaded: if the whole URL set is now complete and
// not yet persisted, kick off save(); then continue with the next URL.
//   i       - index of the URL that just finished downloading
//   options - shared task state (output_array, doc, from_urls, ...)
function recurse_then_save(i, options){
    // If we've downloaded all the HTML, and haven't saved to the db yet.
    // (The original re-checked output_array_saved a second time inside this
    // branch; the outer condition already guarantees it, so once is enough.)
    if (options.output_array.length == options.doc[options.from_urls].length && options.output_array_saved !== true){
        options.save_attempts = 0
        save(options)
    }
    // Call the parent function recursively to enable throttling the rate of
    // web-scraping requests: the next URL is handled only after this one ended.
    recurse_urls(i+1, options)
}
// Download options.doc[options.from_urls][i], append the HTML to
// options.output_array, then advance (via recurse_then_save) to the next URL.
// Requests run one at a time, which throttles the scraping rate.
//
// BUG FIX: the original both called recurse_then_save from the db.save
// callback AND immediately afterwards via a synchronously-set flag_set check,
// so the first URL forked the chain and every subsequent URL was processed
// twice. Now exactly one continuation path runs per response.
function recurse_urls(i, options){
    if (typeof options.doc[options.from_urls] == 'undefined'){
        console.log(options.doc[options.from_urls])
    }
    // Stop running if we have reached the end of the list of URLs,
    if (options.doc[options.from_urls][i] !== '' && typeof options.doc[options.from_urls][i] !== 'undefined' &&
        // and don't run if we've already downloaded the HTML for this URL
        // NOTE(review): indexing the doc by integer i looks suspicious -- was
        // options.output_array[i] intended? Preserved as-is; confirm.
        typeof options.doc[i] == 'undefined'){
        // TODO: Make this handle options.doc[options.method] == 'post'
        http.get(options.doc[options.from_urls][i], function(res){
            var pageData = ''
            res.on('data', function(chunk){
                pageData += chunk
            })
            res.on('end', function(){
                // TODO: Check to see if we got a 404 response
                // Append result to options.output_array
                options.output_array[i] = pageData
                if (options.doc[options.status_flag] !== 'getting'){
                    // First completed download: report to the db that we are
                    // getting the HTML, and continue only once that save finishes.
                    options.doc[options.status_flag] = 'getting'
                    db.save(options.doc._id, options.doc._rev, options.doc, function(err, response){
                        recurse_then_save(i, options)
                    })
                }else{
                    // Status already reported; continue with the next URL.
                    recurse_then_save(i, options)
                }
            })
        })
    }else{
        // End of the URL set: release this task's slot in currently_getting
        currently_getting.splice(currently_getting.indexOf(options.status_flag),1)
    }
}
// List of status_flag names for download tasks currently in flight; used by
// get_url_set / save / recurse_urls to avoid running the same task twice.
// Declared with var: the original assignment created an implicit global,
// which would throw under strict mode.
var currently_getting = []
// Begin downloading the set of URLs named by options.from_urls.
// Refuses to start if a task for the same status_flag is already running;
// recurse_urls then throttles the actual requests one at a time.
function get_url_set(options){
    // Don't run more than one copy of this task at a time
    if (currently_getting.indexOf(options.status_flag) !== -1){
        return
    }
    // Add this options.status_flag to the list of tasks
    currently_getting.push(options.status_flag)
    options.output_array = []
    options.output_array_saved = false
    // Use a recursive function to allow throttling the rate of web-scraping requests
    // per second to avoid getting banned by some servers.
    recurse_urls(0, options)
}
// Handle all changes.
// Each message from the parent process (changes.js) is one changed CouchDB
// document from the _changes feed; we dispatch on whichever get_* status
// flag has been set to 'requested'. Note the branches are not exclusive: one
// doc can trigger several fetches.
process.on('message', function(doc){
    // Watch for requests to get the contents of a URL for a church directory
    // TODO: Check to see if the URL is valid
    if (doc.collection == 'directory' && doc.get_url_html=='requested' && doc.url){
        // E.g., when a user enters "opc.org/locator.html" into the church directory configuration page,
        // then go get the contents of that URL.
        get_url(doc, 'url', 'url_html', 'get_url_html')
    }
    if (doc.collection == 'directory' && doc.get_cong_url_html=='requested' && doc.cong_url){
        // NOTE(review): the guard checks doc.cong_url but the fetch reads
        // doc.cong_url_raw -- confirm these two attributes are meant to differ.
        get_url(doc, 'cong_url_raw', 'cong_url_html', 'get_cong_url_html', {success:function(){
            // Iterate state pages' HTML
            for (var i=0; i<doc.state_url_html.length; i++){
                // TODO: Get each cong's URL
                var state_html = doc.state_url_html[i]  // currently unused; placeholder for the TODOs
                // TODO: Get each cong page's HTML & write to database
            }
        }})
    }
    // Watch for requests to get the contents of a state page URL
    if (doc.collection == 'directory' && doc.get_state_url_html=='requested' && doc.state_url){
        // Interpolate state names into URLs: doc.state_url is a template
        // containing the literal token '{state_name}'.
        var state_page_urls = []
        console.log('before interpolating state names into URLs')
        for (var i=0; i<doc.state_page_values.length; i++){
            if (doc.state_page_values[i] !== ''){
                state_page_urls.push(doc.state_url.replace('{state_name}', doc.state_page_values[i]))
            }
        }
        console.log('about to get_url_set')
        doc.state_page_urls = state_page_urls
        get_url_set({
            doc: doc,
            from_urls: 'state_page_urls',
            method: 'state_url_method',
            to_html: 'state_url_html',
            status_flag: 'get_state_url_html',
            number_downloaded: 'state_urls_gotten',
            success:function(){
                // TODO: Cleanup unnecessary doc attributes here? Probably that should be done in
                // ImportDirectoryView.js instead.
            }})
    }
    // Watch for requests to get the contents of a batchgeo map URL
    if (doc.collection == 'directory' && doc.get_batchgeo_map_html=='requested' && doc.batchgeo_map_url){
        get_url(doc, 'batchgeo_map_url', 'batchgeo_map_html', 'get_batchgeo_map_html')
    }
    // Watch for requests to get the contents of a JSON feed
    if (doc.collection == 'directory' && doc.get_json=='requested' && doc.json_url){
        get_url(doc, 'json_url', 'json', 'get_json')
    }
});
|