"""ImageRetreivalDaemon"""
# Licensed under MOZILLA PUBLIC LICENSE Version 1.1
# Author: Keith Hughitt <keith.hughitt@nasa.gov>
# Author: Jack Ireland <jack.ireland@nasa.gov>
# pylint: disable=E1121
import sys
import datetime
import time
import logging
import os
import redis
import sunpy
import Queue


class ImageRetrievalDaemon:
"""Retrieves images from the server as specified"""

    def __init__(self, server, browse_method, download_method, conf):
        """Reads the database settings, loads the data server, browser and
        downloader components, starts the download threads, and opens the
        database connection."""
self._db = None
# Database info
self.dbhost = conf.get('redis', 'host')
        self.dbnum = int(conf.get('redis', 'database'))
self.dbport = int(conf.get('redis', 'port'))
# Maximum number of simultaneous downloads
self.queue = Queue.Queue()
self.max_downloads = 4
# Filepaths
self.working_dir = os.path.expanduser(conf.get('directories',
'working_dir'))
self.image_archive = os.path.expanduser(conf.get('directories',
'image_archive'))
# Check directory permission
self._check_permissions()
# Load data server, browser, and downloader
self.server = self._load_server(server)
self.browser = self._load_browser(browse_method, self.server.get_uri())
# Start downloaders
self.downloaders = []
for i in range(self.max_downloads):
self.downloaders.append(self._load_downloader(download_method))
# Shutdown switch
self.shutdown_requested = False
# Initialize database
self._init_db()

    def _init_db(self):
"""Initialise the database"""
try:
self._db = redis.StrictRedis(host=self.dbhost, port=self.dbport,
db=self.dbnum)
self._db.ping()
except redis.ConnectionError:
logging.error('Unable to connect to Redis. Is redis running?')
print("Please start redis and try again...")
sys.exit()

    def _check_permissions(self):
"""Checks to make sure we have write permissions to directories"""
for d in [self.working_dir, self.image_archive]:
if not (os.path.isdir(d) and os.access(d, os.W_OK)):
print("Unable to write to specified directories. "
"Please check permissions for locations listed in "
"settings.cfg and try again...")
sys.exit()

    def _load_server(self, server):
"""Loads a data server"""
cls = self._load_class('hvpull.servers',
server, self.get_servers().get(server))
return cls()

    def _load_browser(self, browse_method, uri):
"""Loads a data browser"""
cls = self._load_class('hvpull.browser', browse_method,
self.get_browsers().get(browse_method))
return cls(uri)

    def _load_downloader(self, download_method):
"""Loads a data downloader"""
cls = self._load_class('hvpull.downloader', download_method,
self.get_downloaders().get(download_method))
downloader = cls(self.image_archive, self.working_dir,
self.server.get_uri(), self.queue)
downloader.setDaemon(True)
downloader.start()
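        # Downloaders run as daemon threads consuming URLs from
        # self.queue, so they will not block interpreter exit.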
return downloader

    def start(self, starttime=None, endtime=None):
"""Start daemon operation."""
logging.info("Initializing HVPull")
date_fmt = "%Y-%m-%d %H:%M:%S"
        # TODO: Process urls in batches of ~1-500; this way images start
        # appearing more quickly when filling in large gaps, etc.
# Determine starttime to use
if starttime is not None:
starttime = datetime.datetime.strptime(starttime, date_fmt)
else:
starttime = self.server.get_starttime()
# If end time is specified, fill in data from start to end
if endtime is not None:
endtime = datetime.datetime.strptime(endtime, date_fmt)
urls = self.query(starttime, endtime)
self.acquire(urls)
self.ingest(urls)
return None
else:
# Otherwise, first query from start -> now
now = datetime.datetime.utcnow()
urls = self.query(starttime, now)
self.acquire(urls)
self.ingest(urls)
# Begin main loop
while not self.shutdown_requested:
now = datetime.datetime.utcnow()
starttime = self.server.get_starttime()
# get a list of files available
urls = self.query(starttime, now)
# acquire the data files
self.acquire(urls)
self.ingest(urls)
                time.sleep(self.server.pause.seconds)
# Shutdown
self.stop()

    def stop(self):
        """Logs the shutdown and exits."""
        logging.info("Exiting HVPull")
        sys.exit()

    def ingest(self, urls):
        """Add images to the helioviewer images db.

        (1) Make sure the file exists.
        (2) Make sure the file is 'good', and quarantine if it is not.
        (3) Apply the ESA JPIP encoding.
        (4) Ingest.
        (5) Update database to say that the file has been successfully
            'ingested'.
        """
base_url = self.server.get_uri()
# Get filepaths
filepaths = []
for url in urls:
p = os.path.join(self.image_archive, url.replace(base_url, ""))
if os.path.isfile(p):
filepaths.append(p)
# Add to database
for filepath in filepaths:
info = sunpy.read_header(filepath)
img_counter = self._db.incr('counter:img_id')
img_id = 'img:%s' % os.path.basename(filepath)
params = {
"id": img_counter,
"timestamp": datetime.datetime.utcnow(),
"observatory": info['observatory'],
"instrument": info['instrument'],
"detector": info['detector'],
"measurement": info['measurement'],
"date_obs": info['date']
}
self._db.hmset(img_id, params)

    def acquire(self, urls):
"""Acquires all the available files."""
# If no new files are available do nothing
if not urls:
return
print("Found %d new files" % len(urls))
# Download files
while len(urls) > 0:
# Download files 20 at a time to avoid blocking shutdown requests
for i in range(20): #pylint: disable=W0612
if len(urls) > 0:
url = urls.pop()
self.queue.put(url)
self.queue.join()
if self.shutdown_requested:
self.stop()

    def shutdown(self):
        """Requests a shutdown and stops the download threads."""
        print("Stopping HVPull...")
self.shutdown_requested = True
for downloader in self.downloaders:
downloader.stop()

    def query(self, starttime, endtime):
        """Query and retrieve data within the specified range.

        Checks for data in the specified range and retrieves any new
        files. After execution is completed, the same range is checked
        again to see if any new files have appeared since the first
        execution. This continues until no new files are found (for xxx
        minutes?).
        """
# Get the nickname subdirectory list present at the server
root_url = self.server.get_uri()
nicknames = self.browser.get_directories(root_url)
# No nicknames found.
        if not nicknames:
return None
logging.info("Querying time range %s - %s", starttime, endtime)
# Get the list of dates
fmt = "%Y/%m/%d"
dates = [starttime.strftime(fmt)]
date = starttime.date()
while date < endtime.date():
date = date + datetime.timedelta(days=1)
dates.append(date.strftime(fmt))
        # Ensure the dates are most recent first
        dates.sort(reverse=True)
# Get the measurement subdirectories present at the server
measurements = []
for nickname in nicknames:
for date in dates:
location = os.path.join(nickname, date)
measurement = self.browser.get_directories(location)
measurements.extend(measurement)
# No measurements found
        if not measurements:
return None
# Get all the unique measurements
measurements = list(set(measurements))
# Get a sorted list of available JP2 files via browser
files = []
# Check each remote directory for new files
for measurement in measurements:
if self.shutdown_requested:
return
            logging.info("Scanning %s", measurement)
matches = self.browser.get_files(measurement, "jp2")
files.extend(matches)
# Remove any duplicates
files = list(set(files))
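        # Under Python 2 (this module imports Queue), filter() returns a
        # list, so the caller gets either a list of unseen URLs or None.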
return filter(self._filter_new, files) or None

    def _load_class(self, base_package, packagename, classname):
"""Dynamically loads a class given a set of strings indicating its
location"""
# Import module
modname = "%s.%s" % (base_package, packagename)
__import__(modname)
# Instantiate class and return
return getattr(sys.modules[modname], classname)
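
    # A hypothetical example: with the mapping returned by get_servers()
    # below, self._load_class('hvpull.servers', 'soho', 'SOHODataServer')
    # behaves roughly like `from hvpull.servers.soho import SOHODataServer`.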

    def _filter_new(self, url):
        """Returns True if the file at the given URL has not yet been
        acquired, i.e. has no entry in the database."""
filename = os.path.basename(url)
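        # Keys mirror the 'img:<filename>' entries written by ingest().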
return not self._db.exists("img:%s" % filename)

    @classmethod
    def get_servers(cls):
        """Returns a dict mapping server nicknames to data server class
        names"""
return {
"lmsal": "LMSALDataServer",
"soho": "SOHODataServer",
"stereo": "STEREODataServer"
}

    @classmethod
    def get_browsers(cls):
        """Returns a dict mapping browser names to data browser class
        names"""
return {
"httpbrowser": "HTTPDataBrowser",
"localbrowser": "LocalDataBrowser"
}

    @classmethod
    def get_downloaders(cls):
        """Returns a dict mapping downloader names to data downloader class
        names"""
return {
"urllib": "URLLibDownloader"
}
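

# A minimal usage sketch (an assumption, not part of the original module):
# given a ConfigParser instance providing the 'redis' and 'directories'
# sections referenced in __init__ (e.g. loaded from settings.cfg), the
# daemon might be constructed and started roughly as follows:
#
#     import ConfigParser
#
#     conf = ConfigParser.ConfigParser()
#     conf.read('settings.cfg')
#     daemon = ImageRetrievalDaemon('soho', 'httpbrowser', 'urllib', conf)
#     daemon.start('2011-01-01 00:00:00', '2011-01-02 00:00:00')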