2
Module contains tools for processing files into DataFrames or other objects
5
from pandas.core.index import Index
6
from pandas.core.frame import DataFrame
7
from pandas.core.matrix import DataMatrix
8
from pandas.core.series import Series
10
from datetime import datetime, timedelta
13
from dateutil import parser
15
# just a little hack for now
20
return datetime.strptime(val, '%m/%d/%Y')
24
from itertools import izip
28
def simpleParser(nestedList, forceFloat=True, colNames=None,
29
header=0, indexCol=0):
31
Workhorse function for processing nested list into DataFrame
33
naValues = set(['-1.#IND', '1.#QNAN', '1.#IND',
34
'-1.#QNAN','1.#INF','-1.#INF', '1.#INF000000',
35
'NA', 'NULL', 'NaN', 'nan', ''])
38
if header is not None:
39
columns = lines[header]
40
columns = [c if c != '' else 'Unnamed: ' + string.ascii_uppercase[i]
41
for i, c in enumerate(columns)]
42
content = lines[header+1:]
44
colCounts = dict(((col, 0) for col in columns))
45
for i, col in enumerate(columns):
46
if columns.count(col) > 1:
47
columns[i] = col + str(colCounts[col])
51
columns = string.ascii_uppercase[:len(lines[0])]
56
for i, (c, col) in enumerate(izip(columns, izip(*content))):
74
if header is not None:
75
if 'date' in columns[0].lower() or 'Unnamed' in columns[0]:
77
for s in data[columns[0]]:
79
dates.append(parser.parse(s))
82
data[columns[0]] = dates
83
for c, values in data.iteritems():
85
data[c] = np.array(values, dtype = np.float64)
87
data[c] = np.array(values, dtype = np.object_)
88
if indexCol is not None:
89
index = Index(data[columns[indexCol]])
90
frameData = dict([(col, data[col]) for col in columns \
91
if col != columns[indexCol]])
92
return DataFrame(data=frameData, index=index)
94
index = np.arange(len(data.values()[0]))
95
frameData = dict([(col, data[col]) for col in columns])
96
return DataFrame(data=frameData, index=index)
98
def parseCSV(filepath, header=0, indexCol=0):
    """
    Parse CSV file into a DataFrame object. Try to parse dates if possible.

    Parameters
    ----------
    filepath : path of the CSV file to read; it is opened in 'rb' mode
    header : forwarded unchanged to simpleParser as ``header``
    indexCol : forwarded unchanged to simpleParser as ``indexCol``

    Returns
    -------
    The result of calling simpleParser on the list of rows read from the file.
    """
    # Function-scope import: the block then works whether or not the module
    # already imports csv at the top.
    import csv

    f = open(filepath, 'rb')
    try:
        # csv.reader pulls rows lazily from the open handle, so collect all
        # of them before the file is closed.
        lines = list(csv.reader(f, dialect='excel'))
    finally:
        # Release the handle even if the csv module raises on malformed
        # input. try/finally (rather than `with`) keeps the syntax valid on
        # any Python 2 interpreter this module may run under.
        f.close()

    return simpleParser(lines, header=header, indexCol=indexCol)
109
def parseText(filepath, sep='\t', header=0, indexCol=0, colNames = None):
111
Parse a delimited text file (tab-separated by default) into a DataFrame object.
112
Try to parse dates if possible.
114
lines = [l.rstrip().split(sep) for l in open(filepath,'rb').readlines()]
115
return simpleParser(lines, header=header, indexCol=indexCol,
118
#===============================================================================
120
#===============================================================================
122
# Day zero of the serial-date scale that ole2datetime counts from.
OLE_TIME_ZERO = datetime(year=1899, month=12, day=30)


def ole2datetime(oledt):
    """
    Convert an Excel date value into a regular datetime.

    ``oledt`` is coerced with ``float()`` and interpreted as a number of
    days (fractional part included) elapsed since ``OLE_TIME_ZERO``.
    """
    elapsed = timedelta(days=float(oledt))
    return OLE_TIME_ZERO + elapsed
127
def parseExcel(filepath, header = None, indexCol = 0, dateCol = 0,
132
raise ImportError('Sorry, you do not have xlrd.')
133
book = xlrd.open_workbook(filepath)
134
sheet = book.sheet_by_name(sheetname)
135
data = [sheet.row_values(i) for i in range(sheet.nrows)]
138
row[0] = ole2datetime(row[0])
141
return simpleParser(data, header = header, indexCol = indexCol)