"""ImageRetreivalDaemon"""
# Licensed under MOZILLA PUBLIC LICENSE Version 1.1
# Author: Keith Hughitt <keith.hughitt@nasa.gov>
# Author: Jack Ireland <jack.ireland@nasa.gov>
# pylint: disable=E1121
import sys
import datetime
import time
import logging
import os
import redis
import sunpy
import Queue

class ImageRetrievalDaemon:
    """Retrieves images from the server as specified"""
    def __init__(self, server, browse_method, download_method, conf):
        """Explain."""
        self._db = None
        
        # Database info
        self.dbhost = conf.get('redis', 'host')
        self.dbnum = int(conf.get('redis', 'database'))
        self.dbport = int(conf.get('redis', 'port'))
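        # (A hypothetical [redis] section in settings.cfg matching the keys
        #  read above; actual values depend on the local deployment:
        #    [redis]
        #    host = localhost
        #    port = 6379
        #    database = 0
        # )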
        
        # Maximum number of simultaneous downloads
        self.queue = Queue.Queue()
        self.max_downloads = 4
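        # Each downloader created below is a daemon thread that consumes URLs
        # from this shared queue (see _load_downloader).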
        
        # Filepaths
        self.working_dir = os.path.expanduser(conf.get('directories',
                                                       'working_dir'))
        self.image_archive = os.path.expanduser(conf.get('directories',
                                                         'image_archive'))
        # Check directory permission
        self._check_permissions() 
        
        # Load data server, browser, and downloader
        self.server = self._load_server(server)
        self.browser = self._load_browser(browse_method, self.server.get_uri())
        
        # Start downloaders
        self.downloaders = []
        
        for i in range(self.max_downloads):
            self.downloaders.append(self._load_downloader(download_method))

        # Shutdown switch
        self.shutdown_requested = False
        
        # Initialize database
        self._init_db()

    def _init_db(self):
        """Initialise the database"""
        try:
            self._db = redis.StrictRedis(host=self.dbhost, port=self.dbport, 
                                         db=self.dbnum)
            self._db.ping()
        except redis.ConnectionError:
            logging.error('Unable to connect to Redis. Is redis running?')
            print("Please start redis and try again...")
            sys.exit()

    def _check_permissions(self):
        """Checks to make sure we have write permissions to directories"""
        for d in [self.working_dir, self.image_archive]:
            if not (os.path.isdir(d) and os.access(d, os.W_OK)):
                print("Unable to write to specified directories. "
                      "Please check permissions for locations listed in "
                      "settings.cfg and try again...")
                sys.exit()

    def _load_server(self, server):
        """Loads a data server"""
        cls = self._load_class('downloader.servers', 
                               server, self.get_servers().get(server))
        return cls()
            
    def _load_browser(self, browse_method, uri):
        """Loads a data browser"""
        cls = self._load_class('downloader.browser', browse_method, 
                               self.get_browsers().get(browse_method))
        return cls(uri)
    
    def _load_downloader(self, download_method):
        """Loads a data downloader"""
        cls = self._load_class('downloader.downloader', download_method, 
                               self.get_downloaders().get(download_method))
        downloader = cls(self.image_archive, self.working_dir, 
                         self.server.get_uri(), self.queue)
        
        downloader.setDaemon(True)
        downloader.start()

        return downloader
        
    def start(self, starttime=None, endtime=None):
        """Start daemon operation."""
        logging.info("Initializing HVPull")
        
        date_fmt = "%Y-%m-%d %H:%M:%S"
        
        # TODO: Process urls in batches of ~1-500 so that images start
        # appearing more quickly when filling in large gaps, etc.
        
        # Determine starttime to use
        if starttime is not None:
            starttime = datetime.datetime.strptime(starttime, date_fmt)
        else:
            starttime = self.server.get_starttime()
            
        # If end time is specified, fill in data from start to end
        if endtime is not None:
            endtime = datetime.datetime.strptime(endtime, date_fmt)
            urls = self.query(starttime, endtime)
            self.acquire(urls)
            self.ingest(urls)
            return None
        else:
            # Otherwise, first query from start -> now
            now = datetime.datetime.utcnow()
            urls = self.query(starttime, now)
            self.acquire(urls)
            self.ingest(urls)
        
        # Begin main loop
        while not self.shutdown_requested:
            now = datetime.datetime.utcnow()
            starttime = self.server.get_starttime()
            
            # get a list of files available
            urls = self.query(starttime, now)
            
            # acquire the data files
            self.acquire(urls)
            self.ingest(urls)
            
            time.sleep(self.server.pause.seconds)
        
        # Shutdown
        self.stop()
        
    def stop(self):
        """Logs the exit and terminates the process."""
        logging.info("Exiting HVPull")
        sys.exit()
        
    def ingest(self, urls):
        """
        Add images to helioviewer images db.
          (1) Make sure the file exists
          (2) Make sure the file is 'good', and quarantine if it is not.
          (3) Apply the ESA JPIP encoding.
          (4) Ingest
          (5) Update database to say that the file has been successfully 
              'ingested'.
              
        """
        base_url = self.server.get_uri()
        
        # Get filepaths
        filepaths = []
        
        for url in urls:
            p = os.path.join(self.image_archive, url.replace(base_url, ""))
            if os.path.isfile(p):
                filepaths.append(p)
            
        # Add to hvpull/Helioviewer.org databases
        for filepath in filepaths:
            info = sunpy.read_header(filepath)
            
            img_counter = self._db.incr('counter:img_id')
            img_id = 'img:%s' % os.path.basename(filepath)
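            # "img:<basename>" is the same key that _filter_new() checks to
            # skip files that have already been ingested.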
            
            params = {
                "id": img_counter,
                "timestamp": datetime.datetime.utcnow(),
                "observatory": info['observatory'],
                "instrument": info['instrument'],
                "detector": info['detector'],
                "measurement": info['measurement'],
                "date_obs": info['date']
            }
            self._db.hmset(img_id, params)


    def acquire(self, urls):
        """Acquires all the available files."""
        # If no new files are available do nothing
        if not urls:
            return
        
        print("Found %d new files" % len(urls))
        
        # Download files
        while len(urls) > 0:
            # Download files 20 at a time to avoid blocking shutdown requests
            for i in range(20): #pylint: disable=W0612
                if len(urls) > 0:
                    url = urls.pop()
                    self.queue.put(url)
                
            self.queue.join()
            
            if self.shutdown_requested:
                self.stop()

    def shutdown(self):
        """Requests a graceful shutdown: flags the main loop to exit and
        stops each downloader thread."""
        print("Stopping HVPull...")
        self.shutdown_requested = True
        
        for downloader in self.downloaders:
            downloader.stop()
        
    def query(self, starttime, endtime):
        """Queries the server for new files within the specified time range.

        Scans each nickname/date/measurement directory on the server for JP2
        files and returns those that have not yet been acquired, or None if
        none are found. The main loop repeats this check until no new files
        are found (for xxx minutes?).
        """
        # Get the nickname subdirectory list present at the server
        root_url = self.server.get_uri()
        nicknames = self.browser.get_directories(root_url)

        # No nicknames found.
        if nicknames == []:
            return None
        
        logging.info("Querying time range %s - %s", starttime, endtime)
                
        # Get the list of dates
        fmt = "%Y/%m/%d"
        dates = [starttime.strftime(fmt)]

        date = starttime.date()
        while date < endtime.date():
            date = date + datetime.timedelta(days=1)
            dates.append(date.strftime(fmt))
        
        # Ensure the dates are most recent first
        dates.sort(reverse=True)
        
        # Get the measurement subdirectories present at the server        
        measurements = []
        for nickname in nicknames:
            for date in dates:
                location = os.path.join(nickname, date)
                measurement = self.browser.get_directories(location)
                measurements.extend(measurement)

        # No measurements found
        if measurements == []:
            return None

        # Get all the unique measurements
        measurements = list(set(measurements))

        # Get a sorted list of available JP2 files via browser
        files = []
        
        # Check each remote directory for new files
        for measurement in measurements:
            if self.shutdown_requested:
                return None
            logging.info("Scanning %s", measurement)
            matches = self.browser.get_files(measurement, "jp2")
            files.extend(matches)

        # Remove any duplicates
        files = list(set(files))
        
        return filter(self._filter_new, files) or None
    
    def _load_class(self, base_package, packagename, classname):
        """Dynamically loads a class given a set of strings indicating its 
        location"""
        # Import module
        modname = "%s.%s" % (base_package, packagename)
        __import__(modname)
    
        # Instantiate class and return
        return getattr(sys.modules[modname], classname)
    
    def _filter_new(self, url):
        """Returns True if the file at the given URL has not yet been
        acquired."""
        filename = os.path.basename(url)
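        # "img:<basename>" keys are written by ingest(), so their presence in
        # Redis marks a file as already ingested.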
        return not self._db.exists("img:%s" % filename)
    
    @classmethod
    def get_servers(cls):
        """Returns a list of valid servers to interact with"""
        return {
            "lmsal": "LMSALDataServer",
            "soho": "SOHODataServer",
            "stereo": "STEREODataServer"
        }
        
    @classmethod
    def get_browsers(cls):
        """Returns a list of valid data browsers to interact with"""
        return {
        "httpbrowser": "HTTPDataBrowser",
        "localbrowser": "LocalDataBrowser"
        }

    @classmethod
    def get_downloaders(cls):
        """Returns a list of valid data downloaders to interact with"""
        return {
            "urllib": "URLLibDownloader"
        }
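
# A minimal usage sketch (assumptions: a ConfigParser-style settings.cfg with
# the [redis] and [directories] sections read above, and a "downloader"
# package providing the server/browser/downloader classes named in the
# registries; the actual HVPull entry point may differ):
#
#     import ConfigParser
#     conf = ConfigParser.ConfigParser()
#     conf.read("settings.cfg")
#     daemon = ImageRetrievalDaemon("soho", "httpbrowser", "urllib", conf)
#     daemon.start(starttime="2011-11-01 00:00:00")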