~caneypuggies/reformedchurcheslocator/couchapp-backbone

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
// This code (and its parent process in changes.js) is a Node.JS listener
//	listening to CouchDB's _changes feed, and is derived from
//	https://github.com/mikeal/node.couch.js and
//	http://dominicbarnes.us/node-couchdb-api/api/database/changes.html
// It watches for documents that request one of the following:
//	(when configuring a directory's settings) get the contents of a single URL
//	(when a directory is already configured) download all cong data for a directory
// TODO: Could we use backbone-couch.js here instead of cradle, in order to use our
//	Backbone model here?

var buffer = '',
	http = require('http'),
    https = require('https'),
	cwd = process.cwd(),
	config = require(cwd + '/config'),
	db = config.db,
	log = require(cwd + '/lib').log;
	//$ = require('jquery');
//var model = require('model.js').model
	//stdin = process.openStdin();
if (config.debug)
	var longjohn = require(cwd + '/node_modules/longjohn')

//stdin.setEncoding('utf8');

// Declare utility functions
// Download the page at doc[from_url], store its body in doc[to_html], set
//  doc[status_flag] to 'gotten', and save the doc back to the database.
//  doc:          document to update; its _id and _rev are passed to db.save
//  from_url:     name of the doc attribute that holds the URL to download
//  to_html:      name of the doc attribute that receives the downloaded body
//  status_flag:  name of the doc attribute that is set to 'gotten'
//  options:      (optional) object; options.success() is called with no arguments
//                once the doc has been saved without error
// Failures (missing URL, request error, save error) are logged with console.error;
//  options.success is not called in those cases.
function get_url(doc, from_url, to_html, status_flag, options){
    var url = doc[from_url]
    // Without this guard a missing attribute throws inside the message handler
    if (typeof url !== 'string' || url === ''){
        console.error('get_url: doc.' + from_url + ' is not a URL; skipping ' + status_flag)
        return
    }
    // Switch to using https if necessary
    var http_lib = (url.indexOf('https') === 0) ? https : http
    var request
    try {
        request = http_lib.get(url, function(res){
            // Decode as UTF-8 at the stream level so that a multi-byte character
            //  split across two chunks is not corrupted by the concatenation below
            res.setEncoding('utf8')
            var pageData = ''
            res.on('data', function(chunk){
                pageData += chunk
            })
            res.on('error', function(err){
                console.error('get_url: response failed for ' + url, err)
            })
            res.on('end', function(){
                // TODO: Decide how a 404 (or other error status) should be reported
                //  to the requester; for now it is logged and the body is still saved
                if (res.statusCode >= 400){
                    console.error('get_url: HTTP ' + res.statusCode + ' for ' + url)
                }
                // Write the contents of the html variable back to the database
                doc[to_html] = pageData
                doc[status_flag] = 'gotten'
                console.log(new Date().getTime() + '\t n: ' + status_flag + ': ' + doc[status_flag] + ' ' + doc[from_url])
                // TODO: Use Backbone here instead of cradle
                db.save(doc._id, doc._rev, doc, function(err, response){
                    if (err){
                        console.error('get_url: failed to save doc ' + doc._id, err)
                        return
                    }
                    // TODO: Do anything more that needs to be done here
                    if (options && options.success){
                        options.success()
                    }
                })
            })
        })
    } catch (err) {
        // http.get throws synchronously on a URL it cannot use
        console.error('get_url: could not request ' + url, err)
        return
    }
    // An unhandled 'error' event on the request would crash the listener process
    request.on('error', function(err){
        console.error('get_url: request failed for ' + url, err)
    })
}
// Write the downloaded pages in options.output_array to the database.
//  Re-reads the doc named by options.doc._id first, so the save uses the _rev
//  currently in the database, then stores options.output_array in
//  doc[options.to_html], sets doc[options.status_flag] to 'gotten', and removes
//  doc[options.number_downloaded].
//  If db.save fails, the whole get-and-save is retried while
//  options.save_attempts <= 5 (the caller is expected to initialise save_attempts).
//  On success: sets options.output_array_saved = true, removes options.status_flag
//  from currently_getting, and empties options.output_array.
function save(options){
    db.get(options.doc._id, function(err, doc){
        if (err || !doc || !doc._id){
            // Keep the in-memory doc rather than replacing it with a failed lookup
            console.error('Failed to reload doc before saving: ' + options.doc._id, err)
            return
        }
        options.doc = doc
        // Save to the db all the HTML we've gotten
        options.doc[options.to_html] = options.output_array
        options.doc[options.status_flag] = 'gotten';
        // Deletes number downloaded since it's not needed anymore
        delete options.doc[options.number_downloaded]
        db.save(options.doc._id, options.doc._rev, options.doc, function(err, response){
            if (err !== null){
                console.error(err)
                // Recurse to try saving again
                // Only recurse a certain number of times, then fail, to avoid a memory leak
                if (options.save_attempts <= 5){
                    options.save_attempts++;
                    console.log('options.save_attempts: ' + options.save_attempts)
                    save(options)
                }else{
                    // TODO: This is where we get an error.  For some reason sometimes,
                    //  but not always, we have the wrong revision here, and this causes get_state_url_html
                    //  to never == 'gotten', (so the state details page doesn't display?)
                    console.error('Failed to save doc: ' + options.doc._id, options.doc._rev)
                }
                return
            }
            console.log('Succeeded at saving all the states\' HTML pages')
            options.output_array_saved = true
            // Remove this options.status_flag from the list of tasks.  The flag may
            //  already have been removed; splice(-1, 1) would then delete the last
            //  entry of the list, which belongs to some other task.
            var position = currently_getting.indexOf(options.status_flag)
            if (position !== -1){
                currently_getting.splice(position, 1)
            }
            // Clean up some memory
            options.output_array = []
        })
    })
}
// Called after the page for URL number i has been stored in options.output_array.
//  Starts the database save once the output array is as long as the URL list
//  (and has not been saved yet), then continues with URL number i+1.
function recurse_then_save(i, options){
    var everything_downloaded = options.output_array.length == options.doc[options.from_urls].length
    var not_saved_yet = options.output_array_saved !== true
    if (everything_downloaded && not_saved_yet){
        // Reset the retry counter that save() uses
        options.save_attempts = 0
        save(options)
    }
    // Going back through recurse_urls (instead of looping) leaves room for
    //  throttling the rate of web-scraping requests
    var next_index = i + 1
    recurse_urls(next_index, options)
}
// Download options.doc[options.from_urls][i], store its body in
//  options.output_array[i], and hand over to recurse_then_save(i, options),
//  which continues with the next URL.
//  Before the first hand-over, options.doc[options.status_flag] is set to
//  'getting' and the doc is saved so the database reflects that the download
//  is in progress; later URLs hand over directly.
//  When there is no (further) URL to download, or a request fails,
//  options.status_flag is removed from currently_getting.
function recurse_urls(i, options){
    // Remove this task's status_flag from currently_getting.  indexOf() returns
    //  -1 when the flag is already gone, and splice(-1, 1) would then remove
    //  the last entry of the list, which belongs to some other task.
    function release_task(){
        var position = currently_getting.indexOf(options.status_flag)
        if (position !== -1){
            currently_getting.splice(position, 1)
        }
    }
    var urls = options.doc[options.from_urls]
    if (typeof urls == 'undefined' || urls === null){
        console.error('recurse_urls: doc.' + options.from_urls + ' is missing; nothing to download')
        release_task()
        return
    }
    var url = urls[i]
    // Stop running if we have reached the end of the list of URLs,
    //  and don't run if we've already downloaded the HTML for this URL
    if (typeof url !== 'string' || url === '' || typeof options.output_array[i] !== 'undefined'){
        release_task()
        return
    }
    // Switch to using https if necessary (http.get rejects https URLs)
    var http_lib = (url.indexOf('https') === 0) ? https : http
    var request
    try {
        // TODO: Make this handle options.doc[options.method] == 'post'
        request = http_lib.get(url, function(res){
            // Decode as UTF-8 at the stream level so that a multi-byte character
            //  split across two chunks is not corrupted by the concatenation below
            res.setEncoding('utf8')
            var pageData = ''
            res.on('data', function(chunk){
                pageData += chunk
            })
            res.on('error', function(err){
                console.error('recurse_urls: response failed for ' + url, err)
                release_task()
            })
            res.on('end', function(){
                // TODO: Check to see if we got a 404 response
                // Append result to options.output_array
                options.output_array[i] = pageData
                if (options.doc[options.status_flag] !== 'getting'){
                    options.doc[options.status_flag] = 'getting'
                    // Set flag to indicate that we just reset the status_flag
                    options.flag_set = true
                    // report to the db the fact we are getting the HTML, and only
                    //  continue once that save has returned
                    db.save(options.doc._id, options.doc._rev, options.doc, function(err, response){
                        if (err){
                            console.error('recurse_urls: failed to save status for doc ' + options.doc._id, err)
                        }
                        recurse_then_save(i, options)
                    })
                }else{
                    // The status_flag is already set, so continue right away.  This
                    //  must be an else-branch: continuing here as well as from the
                    //  db.save callback above starts a second, parallel download chain.
                    recurse_then_save(i, options)
                }
            })
        })
    } catch (err) {
        // http.get throws synchronously on a URL it cannot use
        console.error('recurse_urls: could not request ' + url, err)
        release_task()
        return
    }
    // An unhandled 'error' event on the request would crash the listener process
    request.on('error', function(err){
        console.error('recurse_urls: request failed for ' + url, err)
        release_task()
    })
}
// status_flag names of the URL-set downloads that are currently running.
//  Declared with var so it is a module-level variable rather than an implicit
//  global (assigning to an undeclared name throws in strict mode).
var currently_getting = []
// Start downloading the list of URLs described by options, unless a download
//  with the same options.status_flag is already running.
//  Resets options.output_array and options.output_array_saved, then starts
//  recurse_urls at the first URL.
function get_url_set(options){
    // Don't run more than one copy of this task at a time
    if (currently_getting.indexOf(options.status_flag) !== -1){
        return
    }
    // Add this options.status_flag to the list of tasks
    currently_getting.push(options.status_flag)
    options.output_array = []
    options.output_array_saved = false
    // Use a recursive function to allow throttling the rate of web-scraping requests
    //  per second to avoid getting banned by some servers.
    recurse_urls(0, options)
}

// Handle all changes
process.on('message', function(doc){
    // Every request handled here is made on a 'directory' document; the doc
    //  attributes named get_* are set to 'requested' to ask for a download.
    if (!doc || doc.collection !== 'directory'){
        return
    }
    // Watch for requests to get the contents of a URL for a church directory
    // TODO: Check to see if the URL is valid
    if (doc.get_url_html === 'requested' && doc.url){
        // E.g., when a user enters "opc.org/locator.html" into the church directory configuration page,
        //  then go get the contents of that URL.
        get_url(doc, 'url', 'url_html', 'get_url_html')
    }
    // NOTE(review): this request is gated on doc.cong_url but downloads
    //  doc.cong_url_raw -- confirm that both attributes are meant to be involved.
    if (doc.get_cong_url_html === 'requested' && doc.cong_url){
        if (!doc.cong_url_raw){
            console.error('get_cong_url_html requested but doc.cong_url_raw is missing: ' + doc._id)
        }else{
            get_url(doc, 'cong_url_raw', 'cong_url_html', 'get_cong_url_html', {success:function(){
                // The state pages may not have been downloaded (yet); an absent
                //  list must not throw inside the database callback
                var state_pages = Array.isArray(doc.state_url_html) ? doc.state_url_html : []
                // Iterate state pages' HTML
                for (var page_index=0; page_index<state_pages.length; page_index++){
                    // TODO: Get each cong's URL
                    var state_html = state_pages[page_index]

                    // TODO: Get each cong page's HTML & write to database
                }
            }})
        }
    }
    // Watch for requests to get the contents of a state page URL
    if (doc.get_state_url_html === 'requested' && doc.state_url){
        if (!Array.isArray(doc.state_page_values)){
            console.error('get_state_url_html requested but doc.state_page_values is missing: ' + doc._id)
        }else{
            // Interpolate state names into URLs, skipping empty values
            console.log('before interpolating state names into URLs')
            var state_page_urls = doc.state_page_values.filter(function(value){
                return value !== ''
            }).map(function(value){
                return doc.state_url.replace('{state_name}', value)
            })
            console.log('about to get_url_set')
            doc.state_page_urls = state_page_urls
            get_url_set({
                doc:               doc,
                from_urls:          'state_page_urls',
                method:             'state_url_method',
                to_html:            'state_url_html',
                status_flag:        'get_state_url_html',
                number_downloaded:  'state_urls_gotten',
                success:function(){
                // TODO: Cleanup unnecessary doc attributes here?  Probably that should be done in
                //  ImportDirectoryView.js instead.
            }})
        }
    }
    // Watch for requests to get the contents of a batchgeo map URL
    if (doc.get_batchgeo_map_html === 'requested' && doc.batchgeo_map_url){
        get_url(doc, 'batchgeo_map_url', 'batchgeo_map_html', 'get_batchgeo_map_html')
    }
    // Watch for requests to get the contents of a JSON feed
    if (doc.get_json === 'requested' && doc.json_url){
        get_url(doc, 'json_url', 'json', 'get_json')
    }
});