#!/usr/bin/env python
from __future__ import division

__author__ = "Jai Ram Rideout"
__copyright__ = "Copyright 2012, The QIIME project"
__credits__ = ["Jai Ram Rideout"]
__maintainer__ = "Jai Ram Rideout"
__email__ = "jai.rideout@gmail.com"
__status__ = "Release"

"""Contains functionality to interact with remote services."""

from collections import defaultdict
from csv import writer
from re import sub
from socket import gaierror
from StringIO import StringIO
from cogent.app.util import ApplicationNotFoundError

def raise_gdata_not_found_error(*args, **kwargs):
    raise ApplicationNotFoundError("gdata cannot be found.\nIs it installed? "
            "Is it in your $PYTHONPATH?\nThis is an optional QIIME "
            "dependency, but is required if you plan to use QIIME's remote "
            "mapping file features. For more information, please see "
            "http://qiime.org/install/install.html.")

# Load gdata if it's available. If it's not, skip it but set up to raise
# errors if the user tries to use it.
try:
    from gdata.spreadsheet import SpreadsheetsCellsFeedFromString
    from gdata.spreadsheet import SpreadsheetsListFeedFromString
    from gdata.spreadsheet.service import CellQuery
    from gdata.spreadsheet.service import SpreadsheetsService
except ImportError:
    # Set functions which cannot be imported to raise_gdata_not_found_error.
    SpreadsheetsCellsFeedFromString = SpreadsheetsListFeedFromString = \
            CellQuery = SpreadsheetsService = raise_gdata_not_found_error

class GoogleSpreadsheetError(Exception):
    pass

class GoogleSpreadsheetConnectionError(Exception):
    pass

def load_google_spreadsheet(spreadsheet_key, worksheet_name=None):
    """Downloads and exports a Google Spreadsheet in TSV format.

    Returns a string containing the spreadsheet contents in TSV format (e.g.
    for writing out to a file or parsing).

    The first line is assumed to be the spreadsheet header (i.e. containing
    column names), which can optionally be followed by one or more comment
    lines (starting with '#'). Only the first cell of a comment line will be
    parsed (to keep exported spreadsheets consistent with QIIME mapping files'
    comments). The (optional) comments section is then followed by the
    spreadsheet data.

    Some of this code is based on the following websites, as well as the
    gdata.spreadsheet.text_db module:
        http://www.payne.org/index.php/Reading_Google_Spreadsheets_in_Python
        http://stackoverflow.com/a/12031835

    Arguments:
        spreadsheet_key - the key used to identify the spreadsheet (a string).
            Can either be a key or a URL containing the key
        worksheet_name - the name of the worksheet to load data from (a
            string). If not supplied, the first worksheet in the spreadsheet
            will be used
    """
    spreadsheet_key = _extract_spreadsheet_key_from_url(spreadsheet_key)
    gd_client = SpreadsheetsService()

    try:
        worksheets_feed = gd_client.GetWorksheetsFeed(spreadsheet_key,
                                                      visibility='public',
                                                      projection='values')
    except gaierror:
        raise GoogleSpreadsheetConnectionError("Could not establish "
                                               "connection with server. Do "
                                               "you have an active Internet "
                                               "connection?")

    if len(worksheets_feed.entry) < 1:
        raise GoogleSpreadsheetError("The Google Spreadsheet with key '%s' "
                                     "does not have any worksheets associated "
                                     "with it." % spreadsheet_key)

    # Find the worksheet that will be exported. If a name has not been
    # provided, use the first worksheet.
    worksheet = None
    if worksheet_name is not None:
        for sheet in worksheets_feed.entry:
            if sheet.title.text == worksheet_name:
                worksheet = sheet

        if worksheet is None:
            raise GoogleSpreadsheetError("The worksheet name '%s' could not "
                                         "be found in the Google Spreadsheet "
                                         "with key '%s'."
                                         % (worksheet_name, spreadsheet_key))
    else:
        # Choose the first one.
        worksheet = worksheets_feed.entry[0]

    # Extract the ID of the worksheet.
    worksheet_id = worksheet.id.text.split('/')[-1]

    # Now that we have a spreadsheet key and worksheet ID, we can read the
    # data. First get the headers (first row). We need this in order to grab
    # the rest of the actual data in the correct order (it is returned
    # unordered).
    headers = _get_spreadsheet_headers(gd_client, spreadsheet_key,
                                       worksheet_id)
    if not headers:
        raise GoogleSpreadsheetError("Could not load spreadsheet header (it "
                                     "appears to be empty). Is your Google "
                                     "Spreadsheet with key '%s' empty?"
                                     % spreadsheet_key)

    # Loop through the rest of the rows and build up a list of data (in the
    # same row/col order found in the spreadsheet).
    spreadsheet_lines = _export_spreadsheet(gd_client, spreadsheet_key,
                                            worksheet_id, headers)

    out_lines = StringIO()
    tsv_writer = writer(out_lines, delimiter='\t', lineterminator='\n')
    tsv_writer.writerows(spreadsheet_lines)
    return out_lines.getvalue()
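
# A minimal usage sketch (hypothetical key and worksheet name; requires the
# optional gdata dependency and an active Internet connection):
#
#     tsv_data = load_google_spreadsheet('your-spreadsheet-key',
#                                        worksheet_name='Sheet1')
#     with open('mapping_file.txt', 'w') as f:
#         f.write(tsv_data)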

def _extract_spreadsheet_key_from_url(url):
    """Extracts a key from a URL in the form '...key=some_key&foo=42...'.

    If the URL doesn't look valid, assumes the URL is the key and returns it
    unmodified.
    """
    result = url

    if 'key=' in url:
        result = url.split('key=')[-1].split('#')[0].split('&')[0]

    return result
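
# For example (hypothetical URLs; anything without 'key=' is assumed to
# already be a key):
#
#     _extract_spreadsheet_key_from_url(
#         'http://spreadsheets.google.com/ccc?key=some_key&hl=en#gid=0')
#     # -> 'some_key'
#
#     _extract_spreadsheet_key_from_url('some_key')
#     # -> 'some_key'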

def _get_spreadsheet_headers(client, spreadsheet_key, worksheet_id):
    """Returns a list of headers (the first line of the spreadsheet).

    Will be in the order they appear in the spreadsheet.
    """
    headers = []

    # Restrict the cells feed to the first row only.
    query = CellQuery()
    query.min_row = '1'
    query.max_row = '1'
    feed = client.GetCellsFeed(spreadsheet_key, worksheet_id, query=query,
                               visibility='public', projection='values')

    # Wish python had a do-while...
    while True:
        for entry in feed.entry:
            headers.append(entry.content.text)

        # Get the next set of cells if needed.
        next_link = feed.GetNextLink()

        if next_link:
            feed = client.Get(next_link.href,
                              converter=SpreadsheetsCellsFeedFromString)
        else:
            break

    return headers
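
# For a worksheet whose first row contains '#SampleID', 'BarcodeSequence',
# and 'Description' (hypothetical headers mirroring a QIIME mapping file;
# 'od6' is a placeholder worksheet ID), this would return:
#
#     _get_spreadsheet_headers(gd_client, 'your-spreadsheet-key', 'od6')
#     # -> ['#SampleID', 'BarcodeSequence', 'Description']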

def _export_spreadsheet(client, spreadsheet_key, worksheet_id, headers):
    """Returns a list of lists containing the entire spreadsheet.

    This will include the header, any comment lines, and the spreadsheet
    data. Blank cells are represented as None. Data will only be read up to
    the first blank line that is encountered (this is a limitation of the
    Google Spreadsheet API).

    Comments are only supported after the header and before any real data is
    encountered. The lines must start with [optional whitespace] '#' and only
    the first cell is kept in that case (to avoid many empty cells after the
    comment cell, which mimics QIIME's mapping file format).

    Only cell data that falls under the supplied headers will be included.
    """
    # Convert the headers into Google's internal "cleaned" representation.
    # These will be used as lookups to pull out cell data.
    cleaned_headers = _get_cleaned_headers(headers)

    # The list feed skips the header and returns rows in the order they
    # appear in the spreadsheet.
    spreadsheet_lines = [headers]
    rows_feed = client.GetListFeed(spreadsheet_key, worksheet_id,
                                   visibility='public', projection='values')
    while True:
        found_data = False

        for row in rows_feed.entry:
            line = []

            # Loop through our headers and use the cleaned version to look up
            # the cell data. In certain cases (if the original header was
            # blank or only contained special characters) we will not be able
            # to map our header, so the best we can do is tell the user to
            # change the name of their header to be something
            # simple/alphanumeric.
            for header_idx, (header, cleaned_header) in \
                    enumerate(zip(headers, cleaned_headers)):
                try:
                    cell_data = row.custom[cleaned_header].text
                except KeyError:
                    raise GoogleSpreadsheetError("Could not map header '%s' "
                            "to Google Spreadsheet's internal representation "
                            "of the header. We suggest changing the name of "
                            "the header in your Google Spreadsheet to be "
                            "alphanumeric if possible, as this will likely "
                            "solve the issue. Note that the name isn't "
                            "*required* to be alphanumeric, but it may fix "
                            "issues with converting to Google Spreadsheet's "
                            "internal format in some cases." % header)

                # Special handling of comments (if it's a comment, only keep
                # that cell to avoid several blank cells following it).
                if not found_data and header_idx == 0 and \
                   cell_data.lstrip().startswith('#'):
                    line.append(cell_data)
                    break
                else:
                    line.append(cell_data)
                    found_data = True

            spreadsheet_lines.append(line)

        # Get the next set of rows if necessary.
        next_link = rows_feed.GetNextLink()

        if next_link:
            rows_feed = client.Get(next_link.href,
                                   converter=SpreadsheetsListFeedFromString)
        else:
            break

    return spreadsheet_lines
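
# The returned structure for a small worksheet might look like this
# (hypothetical data; the header row is always the first element):
#
#     _export_spreadsheet(gd_client, 'your-spreadsheet-key', 'od6',
#                         ['#SampleID', 'Description'])
#     # -> [['#SampleID', 'Description'],
#     #     ['# a comment kept as a single cell'],
#     #     ['Sample1', 'control'],
#     #     ['Sample2', 'treatment']]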

def _get_cleaned_headers(headers):
    """Creates a list of "cleaned" headers which spreadsheets accept.

    A Google Spreadsheet converts the header names into a "cleaned" internal
    representation, which must be used to reference a cell at a particular
    header/column. They are all lower case and contain no spaces or special
    characters. If two columns have the same name after being sanitized, the
    columns further to the right have _2, _3, _4, etc. appended to them.

    If there are column names which consist of all special characters, or if
    the column header is blank, an obfuscated value will be used for a column
    name. This method does not handle blank column names or column names with
    only special characters; it raises a GoogleSpreadsheetError instead.

    Taken from gdata.spreadsheet.text_db.ConvertStringsToColumnHeaders and
    modified to handle headers with pound signs or that start with numbers, as
    well as correctly handle duplicate cleaned headers.
    """
    cleaned_headers = []

    for header in headers:
        # Google strips special characters, whitespace, and underscores
        # first, and then strips any *leading* digits. This order is
        # extremely important!
        sanitized = sub(r'^\d+', '', sub(r'[\W_]', '', header.lower()))

        if len(sanitized) > 0:
            cleaned_headers.append(sanitized)
        else:
            raise GoogleSpreadsheetError("Encountered a header '%s' that was "
                    "either blank or consisted only of special characters. "
                    "Could not map the header to the internal representation "
                    "used by the Google Spreadsheet. Please change the header "
                    "to consist of at least one alphanumeric character."
                    % header)

    # When the same sanitized header appears multiple times in the first row
    # of a spreadsheet, _n is appended to the name to make it unique.
    header_count = defaultdict(int)
    results = []

    for header, cleaned_header in zip(headers, cleaned_headers):
        new_header = cleaned_header

        if header_count[cleaned_header] > 0:
            # Google's numbering starts from _2, hence the +1.
            new_header = '%s_%d' % (cleaned_header,
                                    header_count[cleaned_header] + 1)

        header_count[cleaned_header] += 1
        results.append(new_header)

    return results
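
# A worked example of the cleaning rules above (hypothetical headers):
#
#     _get_cleaned_headers(['#SampleID', 'Foo_Bar', '42Baz', 'foobar'])
#     # -> ['sampleid', 'foobar', 'baz', 'foobar_2']
#
# '#SampleID' drops its pound sign, 'Foo_Bar' drops its underscore, '42Baz'
# drops only its *leading* digits, and the duplicate 'foobar' becomes
# 'foobar_2' to match Google's numbering.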