3
csv.py - read/write/investigate CSV files
7
from _csv import Error, __version__, writer, reader, register_dialect, \
8
unregister_dialect, get_dialect, list_dialects, \
9
QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
13
from cStringIO import StringIO
15
from StringIO import StringIO
17
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
18
"Error", "Dialect", "excel", "excel_tab", "reader", "writer",
19
"register_dialect", "get_dialect", "list_dialects", "Sniffer",
20
"unregister_dialect", "__version__", "DictReader", "DictWriter" ]
30
skipinitialspace = None
35
if self.__class__ != Dialect:
37
errors = self._validate()
39
raise Error, "Dialect did not validate: %s" % ", ".join(errors)
44
errors.append("can't directly instantiate Dialect class")
46
if self.delimiter is None:
47
errors.append("delimiter character not set")
48
elif (not isinstance(self.delimiter, str) or
49
len(self.delimiter) > 1):
50
errors.append("delimiter must be one-character string")
52
if self.quotechar is None:
53
if self.quoting != QUOTE_NONE:
54
errors.append("quotechar not set")
55
elif (not isinstance(self.quotechar, str) or
56
len(self.quotechar) > 1):
57
errors.append("quotechar must be one-character string")
59
if self.lineterminator is None:
60
errors.append("lineterminator not set")
61
elif not isinstance(self.lineterminator, str):
62
errors.append("lineterminator must be a string")
64
if self.doublequote not in (True, False):
65
errors.append("doublequote parameter must be True or False")
67
if self.skipinitialspace not in (True, False):
68
errors.append("skipinitialspace parameter must be True or False")
70
if self.quoting is None:
71
errors.append("quoting parameter not set")
73
if self.quoting is QUOTE_NONE:
74
if (not isinstance(self.escapechar, (unicode, str)) or
75
len(self.escapechar) > 1):
76
errors.append("escapechar must be a one-character string or unicode object")
84
skipinitialspace = False
85
lineterminator = '\r\n'
86
quoting = QUOTE_MINIMAL
87
# Register the Excel dialect under the name "excel" so callers can
# request it by string (it is the default dialect for reader/writer).
register_dialect("excel", excel)
89
class excel_tab(excel):
91
# Register the tab-delimited Excel variant under the name "excel-tab".
register_dialect("excel-tab", excel_tab)
95
def __init__(self, f, fieldnames=None, restkey=None, restval=None,
             dialect="excel", *args, **kwds):
    """Read CSV rows from f as dictionaries keyed by fieldnames.

    f: any iterable of CSV text lines (handed to the low-level reader).
    fieldnames: keys for each row dict; if None, it is filled in later
        from the first data row (see the fieldnames handling in next()).
    restkey: key under which values from over-long rows are collected.
    restval: value used to pad out rows shorter than fieldnames.
    Remaining arguments are forwarded to the low-level reader().
    """
    self.fieldnames = fieldnames    # list of keys for the dict
    self.restkey = restkey          # key to catch long rows
    self.restval = restval          # default value for short rows
    self.reader = reader(f, dialect, *args, **kwds)
106
row = self.reader.next()
107
if self.fieldnames is None:
108
self.fieldnames = row
109
row = self.reader.next()
111
# unlike the basic reader, we prefer not to return blanks,
112
# because we will typically wind up with a dict full of None
115
row = self.reader.next()
116
d = dict(zip(self.fieldnames, row))
117
lf = len(self.fieldnames)
120
d[self.restkey] = row[lf:]
122
for key in self.fieldnames[lr:]:
123
d[key] = self.restval
128
def __init__(self, f, fieldnames, restval="", extrasaction="raise",
             dialect="excel", *args, **kwds):
    """Write dictionaries to f as CSV rows ordered by fieldnames.

    restval: value written for fieldnames missing from a row dict.
    extrasaction: what to do when a row dict has keys not listed in
        fieldnames -- "raise" (ValueError at write time) or "ignore"
        (silently drop them).
    Remaining arguments are forwarded to the low-level writer().

    Raises ValueError if extrasaction is not "raise" or "ignore".
    """
    self.fieldnames = fieldnames    # list of keys for the dict
    self.restval = restval          # for writing short dicts
    # The mangled source dropped the `raise` that this message string
    # belongs to; restore the validation using the call-form raise,
    # which is valid in both Python 2 and 3 (PEP 8 preferred form).
    if extrasaction.lower() not in ("raise", "ignore"):
        raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                         % extrasaction)
    self.extrasaction = extrasaction
    self.writer = writer(f, dialect, *args, **kwds)
139
def _dict_to_list(self, rowdict):
    """Return rowdict's values as a list ordered by self.fieldnames.

    Keys missing from rowdict are filled in with self.restval.  When
    self.extrasaction is "raise", any key of rowdict that is not in
    self.fieldnames raises ValueError; otherwise extra keys are
    silently dropped.
    """
    if self.extrasaction == "raise":
        # Iterate the dict directly (no .keys() list copy needed);
        # use the call-form raise, valid in both Python 2 and 3.
        for k in rowdict:
            if k not in self.fieldnames:
                raise ValueError("dict contains fields not in fieldnames")
    return [rowdict.get(key, self.restval) for key in self.fieldnames]
146
def writerow(self, rowdict):
    """Convert rowdict to a field list and emit it as one CSV record."""
    row = self._dict_to_list(rowdict)
    return self.writer.writerow(row)
149
def writerows(self, rowdicts):
    """Write every dict in rowdicts as a CSV row.

    Each dict is converted with _dict_to_list() first; the whole batch
    is then handed to the low-level writer in one writerows() call.
    """
    # Bug fix: the visible code appended to `rows` without ever
    # initialising it (NameError); build the list in one pass instead.
    rows = [self._dict_to_list(rowdict) for rowdict in rowdicts]
    return self.writer.writerows(rows)
155
# Guard Sniffer's type checking against builds that exclude complex()
163
"Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
164
Returns a Dialect object.
167
# in case there is more than one possible delimiter
168
self.preferred = [',', '\t', ';', ' ', ':']
171
def sniff(self, sample, delimiters=None):
173
Returns a dialect (or None) corresponding to the sample
176
quotechar, delimiter, skipinitialspace = \
177
self._guess_quote_and_delimiter(sample, delimiters)
178
if delimiter is None:
179
delimiter, skipinitialspace = self._guess_delimiter(sample,
182
class dialect(Dialect):
184
lineterminator = '\r\n'
185
quoting = QUOTE_MINIMAL
189
dialect.delimiter = delimiter
190
# _csv.reader won't accept a quotechar of ''
191
dialect.quotechar = quotechar or '"'
192
dialect.skipinitialspace = skipinitialspace
197
def _guess_quote_and_delimiter(self, data, delimiters):
199
Looks for text enclosed between two identical quotes
200
(the probable quotechar) which are preceded and followed
201
by the same character (the probable delimiter).
204
The quote with the most wins, same with the delimiter.
205
If there is no quotechar the delimiter can't be determined
210
for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
211
'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
212
'(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
213
'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
214
regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
215
matches = regexp.findall(data)
220
return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
226
n = regexp.groupindex['quote'] - 1
229
quotes[key] = quotes.get(key, 0) + 1
231
n = regexp.groupindex['delim'] - 1
235
if key and (delimiters is None or key in delimiters):
236
delims[key] = delims.get(key, 0) + 1
238
n = regexp.groupindex['space'] - 1
244
quotechar = reduce(lambda a, b, quotes = quotes:
245
(quotes[a] > quotes[b]) and a or b, quotes.keys())
248
delim = reduce(lambda a, b, delims = delims:
249
(delims[a] > delims[b]) and a or b, delims.keys())
250
skipinitialspace = delims[delim] == spaces
251
if delim == '\n': # most likely a file with a single column
254
# there is *no* delimiter, it's a single column of quoted data
258
return (quotechar, delim, skipinitialspace)
261
def _guess_delimiter(self, data, delimiters):
263
The delimiter /should/ occur the same number of times on
264
each row. However, due to malformed data, it may not. We don't want
265
an all or nothing approach, so we allow for small variations in this
267
1) build a table of the frequency of each character on every line.
268
2) build a table of freqencies of this frequency (meta-frequency?),
269
e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
271
3) use the mode of the meta-frequency to determine the /expected/
272
frequency for that character
273
4) find out how often the character actually meets that goal
274
5) the character that best meets its goal is the delimiter
275
For performance reasons, the data is evaluated in chunks, so it can
276
try and evaluate the smallest portion of the data possible, evaluating
277
additional chunks as necessary.
280
data = filter(None, data.split('\n'))
282
ascii = [chr(c) for c in range(127)] # 7-bit ASCII
284
# build frequency tables
285
chunkLength = min(10, len(data))
290
start, end = 0, min(chunkLength, len(data))
291
while start < len(data):
293
for line in data[start:end]:
295
metaFrequency = charFrequency.get(char, {})
296
# must count even if frequency is 0
297
freq = line.strip().count(char)
299
metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
300
charFrequency[char] = metaFrequency
302
for char in charFrequency.keys():
303
items = charFrequency[char].items()
304
if len(items) == 1 and items[0][0] == 0:
306
# get the mode of the frequencies
308
modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
310
# adjust the mode - subtract the sum of all
312
items.remove(modes[char])
313
modes[char] = (modes[char][0], modes[char][1]
314
- reduce(lambda a, b: (0, a[1] + b[1]),
317
modes[char] = items[0]
319
# build a list of possible delimiters
320
modeList = modes.items()
321
total = float(chunkLength * iteration)
322
# (rows of consistent data) / (number of rows) = 100%
324
# minimum consistency threshold
326
while len(delims) == 0 and consistency >= threshold:
327
for k, v in modeList:
328
if v[0] > 0 and v[1] > 0:
329
if ((v[1]/total) >= consistency and
330
(delimiters is None or k in delimiters)):
335
delim = delims.keys()[0]
336
skipinitialspace = (data[0].count(delim) ==
337
data[0].count("%c " % delim))
338
return (delim, skipinitialspace)
340
# analyze another chunkLength lines
347
# if there's more than one, fall back to a 'preferred' list
349
for d in self.preferred:
350
if d in delims.keys():
351
skipinitialspace = (data[0].count(d) ==
352
data[0].count("%c " % d))
353
return (d, skipinitialspace)
355
# finally, just return the first damn character in the list
356
delim = delims.keys()[0]
357
skipinitialspace = (data[0].count(delim) ==
358
data[0].count("%c " % delim))
359
return (delim, skipinitialspace)
362
def has_header(self, sample):
363
# Creates a dictionary of types of data in each column. If any
364
# column is of a single type (say, integers), *except* for the first
365
# row, then the first row is presumed to be labels. If the type
366
# can't be determined, it is assumed to be a string in which case
367
# the length of the string is the determining factor: if all of the
368
# rows except for the first are the same length, it's a header.
369
# Finally, a 'vote' is taken at the end for each column, adding or
370
# subtracting from the likelihood of the first row being a header.
372
rdr = reader(StringIO(sample), self.sniff(sample))
374
header = rdr.next() # assume first row is header
376
columns = len(header)
378
for i in range(columns): columnTypes[i] = None
382
# arbitrary number of rows to check, to keep it sane
387
if len(row) != columns:
388
continue # skip rows that have irregular number of columns
390
for col in columnTypes.keys():
392
for thisType in [int, long, float, complex]:
396
except (ValueError, OverflowError):
399
# fallback to length of string
400
thisType = len(row[col])
402
# treat longs as ints
406
if thisType != columnTypes[col]:
407
if columnTypes[col] is None: # add new column type
408
columnTypes[col] = thisType
410
# type is inconsistent, remove column from
414
# finally, compare results against first row and "vote"
415
# on whether it's a header
417
for col, colType in columnTypes.items():
418
if type(colType) == type(0): # it's a length
419
if len(header[col]) != colType:
423
else: # attempt typecast
426
except (ValueError, TypeError):