Skip to content

Commit 297b70b

Browse files
committed
ENH: Initial support for reading Open Document Format ods spreadsheet (GH2311)
1 parent 722fc77 commit 297b70b

File tree

2 files changed

+241
-8
lines changed

2 files changed

+241
-8
lines changed

pandas/io/api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pandas.io.parsers import read_csv, read_table, read_fwf
66
from pandas.io.clipboard import read_clipboard
77
from pandas.io.excel import ExcelFile, ExcelWriter, read_excel
8+
from pandas.io.ods import OdsFile, read_ods
89
from pandas.io.pytables import HDFStore, Term, get_store, read_hdf
910
from pandas.io.json import read_json
1011
from pandas.io.html import read_html

pandas/io/excel.py

Lines changed: 240 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from warnings import warn
2323
from distutils.version import LooseVersion
2424

25-
__all__ = ["read_excel", "ExcelWriter", "ExcelFile"]
25+
__all__ = ["read_excel", "ExcelWriter", "ExcelFile", "OdsFile"]
2626

2727
_writer_extensions = ["xlsx", "xls", "xlsm"]
2828
_writers = {}
@@ -67,16 +67,17 @@ def get_writer(engine_name):
6767

6868

6969
def read_excel(io, sheetname=0, **kwds):
70-
"""Read an Excel table into a pandas DataFrame
70+
"""Read an Excel/ods table into a pandas DataFrame
7171
7272
Parameters
7373
----------
74-
io : string, file-like object, or xlrd workbook.
74+
io : string, file-like object, or xlrd workbook for MS Excel files. For an
75+
ods file (Open Document Format), string or ezodf workbook is required.
7576
The string could be a URL. Valid URL schemes include http, ftp, s3,
7677
and file. For file URLs, a host is expected. For instance, a local
7778
file could be file://localhost/path/to/workbook.xlsx
7879
sheetname : string or int, default 0
79-
Name of Excel sheet or the page number of the sheet
80+
Name of Excel/ods sheet or the page number of the sheet
8081
header : int, default 0
8182
Row to use for the column labels of the parsed DataFrame
8283
skiprows : list-like
@@ -86,7 +87,7 @@ def read_excel(io, sheetname=0, **kwds):
8687
converters : dict, default None
8788
Dict of functions for converting values in certain columns. Keys can
8889
either be integers or column labels, values are functions that take one
89-
input argument, the Excel cell content, and return the transformed
90+
input argument, the Excel/ods cell content, and return the transformed
9091
content.
9192
index_col : int, default None
9293
Column to use as the row labels of the DataFrame. Pass None if
@@ -106,10 +107,10 @@ def read_excel(io, sheetname=0, **kwds):
106107
Indicate number of NA values placed in non-numeric columns
107108
engine: string, default None
108109
If io is not a buffer or path, this must be set to identify io.
109-
Acceptable values are None or xlrd
110+
Acceptable values are None, xlrd, or ezodf
110111
convert_float : boolean, default True
111112
convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
112-
data will be read in as floats: Excel stores all numbers as floats
113+
data will be read in as floats: Excel/ods stores all numbers as floats
113114
internally
114115
has_index_names : boolean, default False
115116
True if the cols defined in index_col have an index name and are
@@ -119,7 +120,7 @@ def read_excel(io, sheetname=0, **kwds):
119120
Returns
120121
-------
121122
parsed : DataFrame
122-
DataFrame from the passed in Excel file
123+
DataFrame of the given workbook in the Excel/ods file.
123124
124125
"""
125126
if 'kind' in kwds:
@@ -129,9 +130,240 @@ def read_excel(io, sheetname=0, **kwds):
129130

130131
engine = kwds.pop('engine', None)
131132

133+
if engine == 'ezodf':
134+
return OdsFile(io).parse(sheetname=sheetname, **kwds)
135+
136+
# figure out if the file is an MS Excel or ODF ODS type
137+
# code is doubled here: it is very similar to OdsFile.__init__. Is there a
138+
# better way?
139+
if isinstance(io, compat.string_types):
140+
if io[-4:] == '.ods':
141+
try:
142+
return OdsFile(io).parse(sheetname=sheetname, **kwds)
143+
except Exception as e:
144+
print('ods support requires ezodf, please install ezodf first')
145+
raise e
146+
elif io[-4:] in ['xls', 'xlsx', 'xlsm']:
147+
return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds)
148+
try:
149+
import ezodf
150+
if isinstance(io, ezodf.document.PackagedDocument):
151+
return OdsFile(io).parse(sheetname=sheetname, **kwds)
152+
except ImportError:
153+
pass
132154
return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds)
133155

134156

157+
class OdsFile(object):
    """
    Class for parsing tabular ods sheets into DataFrame objects.
    Uses ezodf. See OdsFile.parse for more documentation

    Parameters
    ----------
    io : string or ezodf workbook
        If a string, expected to be a path or URL to an ods file
    """

    def __init__(self, io, **kwds):
        # Imported lazily so ezodf stays an optional dependency; an
        # ImportError is raised here when it is not installed.
        # ezodf does not have a __version__ or similar attribute
        import ezodf

        self.io = io

        if isinstance(io, compat.string_types):
            if _is_url(io):
                data = _urlopen(io).read()
                self.book = ezodf.opendoc(data)
            else:
                self.book = ezodf.opendoc(io)
        elif isinstance(io, ezodf.document.PackagedDocument):
            # io already is the corresponding ezodf instance of a workbook
            self.book = io
        else:
            raise ValueError('IO must be a path or ods workbook')

    def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
              index_col=None, parse_cols=None, parse_dates=False,
              date_parser=None, na_values=None, thousands=None,
              chunksize=None, convert_float=True, has_index_names=False,
              converters=None, **kwds):
        """Read an ods table into DataFrame

        Parameters
        ----------
        sheetname : string or integer
            Name of ods sheet or the page number of the sheet
        header : int, default 0
            Row to use for the column labels of the parsed DataFrame
        skiprows : list-like
            Rows to skip at the beginning (0-indexed)
        skip_footer : int, default 0
            Rows at the end to skip (0-indexed). The keyword ``skipfooter``
            is accepted as an alias and takes precedence when given.
        converters : dict, default None
            Dict of functions for converting values in certain columns. Keys
            can either be integers or column labels
        index_col : int, default None
            Column to use as the row labels of the DataFrame. Pass None if
            there is no such column
        parse_cols : int or list, default None
            * If None then parse all columns
            * If int then indicates last column to be parsed
            * If list of ints then indicates list of column numbers to be
              parsed
            * If string then indicates comma separated list of column names
              and column ranges (e.g. "A:E" or "A,C,E:F")

            NOTE: the ods reader accepts this argument but does not apply
            it yet; all columns are always parsed.
        parse_dates : boolean, default False
            Parse date ods values,
        date_parser : function default None
            Date parsing function
        na_values : list-like, default None
            List of additional strings to recognize as NA/NaN
        thousands : str, default None
            Thousands separator
        chunksize : int, default None
            Size of file chunk to read for lazy evaluation.
        convert_float : boolean, default True
            convert integral floats to int (i.e., 1.0 --> 1). If False, all
            numeric data will be read in as floats: ods stores all numbers
            as floats internally.
        has_index_names : boolean, default False
            True if the cols defined in index_col have an index name and are
            not in the header

        Returns
        -------
        parsed : DataFrame
            DataFrame parsed from the ods file
        """
        skipfooter = kwds.pop('skipfooter', None)
        if skipfooter is not None:
            skip_footer = skipfooter

        return self._parse_ods(sheetname=sheetname, header=header,
                               skiprows=skiprows,
                               index_col=index_col,
                               has_index_names=has_index_names,
                               parse_cols=parse_cols,
                               parse_dates=parse_dates,
                               date_parser=date_parser, na_values=na_values,
                               thousands=thousands, chunksize=chunksize,
                               skip_footer=skip_footer,
                               convert_float=convert_float,
                               converters=converters,
                               **kwds)

    def _print_cellinfo(self, cell):
        """Debug helper: print the attributes of an ezodf cell."""
        print(' plaintext:', cell.plaintext())  # no formatting
        # formatted, but what is difference with value?
        print('display_form:', cell.display_form)  # format, ?=plaintext
        print(' value:', cell.value)  # data handled
        print(' value_type:', cell.value_type)  # data type
        print(' formula:', cell.formula)

    def _parse_datetime(self, cell):
        """
        Parse a date or time cell to a datetime-like object.

        Parameters
        ----------
        cell : ezodf cell
            Only ``value_type``, ``value`` and ``formula`` are read.

        Returns
        -------
        datetime.datetime, datetime.timedelta, the raw cell value when it
        cannot be interpreted, or None when the cell is neither a date nor
        a time cell.
        """
        kind = cell.value_type
        if kind == 'time':
            return self._parse_time(cell)
        if kind == 'date':
            return self._parse_date(cell)
        return None

    def _parse_time(self, cell):
        """Convert a 'time' cell; durations of 24 hours or more become
        timedelta objects, anything shorter a datetime on 1900-01-01."""
        if cell.formula is not None:
            try:
                return datetime.datetime.strptime(cell.formula,
                                                  'of:=TIME(%H;%M;%S)')
            except (ValueError, TypeError):
                # e.g. hours can be more than 23, or the formula is not a
                # plain TIME() call: fall back on the stored value
                pass

        raw = cell.value
        try:
            # the stored value looks like 'PT<hours>H<minutes>M<seconds>S'
            hours, rest = raw[2:].split('H')
            minutes, rest = rest.split('M')
            hours, minutes, seconds = int(hours), int(minutes), int(rest[:-1])
            if hours > 23:
                # TODO: now timedelta objects will be mixed with normal time
                return datetime.timedelta(hours=hours, minutes=minutes,
                                          seconds=seconds)
            # TODO: should return a time object, not datetime?
            # same result as strptime(raw, 'PT%HH%MM%SS')
            return datetime.datetime(1900, 1, 1, hours, minutes, seconds)
        except (ValueError, TypeError, AttributeError):
            # TODO: this does not cover all scenarios; hand back the raw
            # value instead of losing it
            return raw

    def _parse_date(self, cell):
        """Convert a 'date' cell, trying the DATE() formula first and the
        stored ISO formatted value second."""
        attempts = ((cell.formula, 'of:=DATE(%Y;%m;%d)'),
                    (cell.value, '%Y-%m-%d'),
                    (cell.value, '%Y-%m-%dT%H:%M:%S'))
        for text, fmt in attempts:
            if text is None:
                continue
            try:
                return datetime.datetime.strptime(text, fmt)
            except (ValueError, TypeError):
                continue
        # TODO: parsing other scenarios
        return cell.value

    def _cell_value(self, cell, convert_float=True):
        """
        Translate a single ezodf cell into the Python value handed to the
        parser.

        Floats are optionally narrowed to int, strings and booleans are
        passed through, dates/times go through ``_parse_datetime`` and
        everything else (including empty cells) becomes NaN.
        """
        raw = cell.value
        kind = cell.value_type

        if isinstance(raw, float):
            # GH5394 - Excel and ODS 'numbers' are always floats
            # it's a minimal perf hit and less surprising.
            # is_integer() is False for nan/inf, which int() cannot convert.
            if convert_float and raw.is_integer():
                return int(raw)
            return raw
        if kind == 'string' or kind == 'boolean':
            return raw
        if kind == 'date' or kind == 'time':
            return self._parse_datetime(cell)
        # empty cells (value_type None) and unsupported types
        return np.nan

    def _parse_ods(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
                   index_col=None, has_index_names=None, parse_cols=None,
                   parse_dates=False, date_parser=None, na_values=None,
                   thousands=None, chunksize=None, convert_float=True,
                   **kwds):
        """
        Collect the cell values of one sheet row by row and feed them to
        TextParser. See ``parse`` for the meaning of the arguments;
        ``parse_cols`` is currently not applied.
        """
        # sheetname can be index or string
        sheet = self.book.sheets[sheetname]

        data = [[self._cell_value(cell, convert_float)
                 for cell in sheet.row(i)]
                for i in range(sheet.nrows())]

        parser = TextParser(data, header=header, index_col=index_col,
                            has_index_names=has_index_names,
                            na_values=na_values,
                            thousands=thousands,
                            parse_dates=parse_dates,
                            date_parser=date_parser,
                            skiprows=skiprows,
                            skip_footer=skip_footer,
                            chunksize=chunksize,
                            **kwds)

        return parser.read()

    @property
    def sheet_names(self):
        """List with the names of all sheets in the workbook."""
        # book.sheets.names() is a generator
        return list(self.book.sheets.names())

    def close(self):
        """close io if necessary"""
        if hasattr(self.io, 'close'):
            self.io.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
365+
366+
135367
class ExcelFile(object):
136368
"""
137369
Class for parsing tabular excel sheets into DataFrame objects.

0 commit comments

Comments
 (0)