22
22
from warnings import warn
23
23
from distutils .version import LooseVersion
24
24
25
- __all__ = ["read_excel" , "ExcelWriter" , "ExcelFile" ]
25
+ __all__ = ["read_excel" , "ExcelWriter" , "ExcelFile" , "OdsFile" ]
26
26
27
27
_writer_extensions = ["xlsx" , "xls" , "xlsm" ]
28
28
_writers = {}
@@ -67,16 +67,17 @@ def get_writer(engine_name):
67
67
68
68
69
69
def read_excel (io , sheetname = 0 , ** kwds ):
70
- """Read an Excel table into a pandas DataFrame
70
+ """Read an Excel/ods table into a pandas DataFrame
71
71
72
72
Parameters
73
73
----------
74
- io : string, file-like object, or xlrd workbook.
74
+ io : string, file-like object, or xlrd workbook for MS Excel files. For an
75
+ ods file (Open Document Format), string or ezodf workbook is required.
75
76
The string could be a URL. Valid URL schemes include http, ftp, s3,
76
77
and file. For file URLs, a host is expected. For instance, a local
77
78
file could be file://localhost/path/to/workbook.xlsx
78
79
sheetname : string or int, default 0
79
- Name of Excel sheet or the page number of the sheet
80
+ Name of Excel/ods sheet or the page number of the sheet
80
81
header : int, default 0
81
82
Row to use for the column labels of the parsed DataFrame
82
83
skiprows : list-like
@@ -86,7 +87,7 @@ def read_excel(io, sheetname=0, **kwds):
86
87
converters : dict, default None
87
88
Dict of functions for converting values in certain columns. Keys can
88
89
either be integers or column labels, values are functions that take one
89
- input argument, the Excel cell content, and return the transformed
90
+ input argument, the Excel/ods cell content, and return the transformed
90
91
content.
91
92
index_col : int, default None
92
93
Column to use as the row labels of the DataFrame. Pass None if
@@ -106,10 +107,10 @@ def read_excel(io, sheetname=0, **kwds):
106
107
Indicate number of NA values placed in non-numeric columns
107
108
engine: string, default None
108
109
If io is not a buffer or path, this must be set to identify io.
109
- Acceptable values are None or xlrd
110
+ Acceptable values are None, xlrd, or ezodf
110
111
convert_float : boolean, default True
111
112
convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
112
- data will be read in as floats: Excel stores all numbers as floats
113
+ data will be read in as floats: Excel/ods stores all numbers as floats
113
114
internally
114
115
has_index_names : boolean, default False
115
116
True if the cols defined in index_col have an index name and are
@@ -119,7 +120,7 @@ def read_excel(io, sheetname=0, **kwds):
119
120
Returns
120
121
-------
121
122
parsed : DataFrame
122
- DataFrame from the passed in Excel file
123
+ DataFrame of the given workbook in the Excel/ods file.
123
124
124
125
"""
125
126
if 'kind' in kwds :
@@ -129,9 +130,240 @@ def read_excel(io, sheetname=0, **kwds):
129
130
130
131
engine = kwds .pop ('engine' , None )
131
132
133
+ if engine == 'ezodf' :
134
+ return OdsFile (io ).parse (sheetname = sheetname , ** kwds )
135
+
136
+ # figure out if the file is an MS Excel or ODF ODS type
137
+ # code is doubled here: it is very similar to OdsFile.__init__. Is there a
138
+ # better way?
139
+ if isinstance (io , compat .string_types ):
140
+ if io [- 4 :] == '.ods' :
141
+ try :
142
+ return OdsFile (io ).parse (sheetname = sheetname , ** kwds )
143
+ except Exception as e :
144
+ print ('ods support requires ezodf, please install ezodf first' )
145
+ raise e
146
+ elif io [- 4 :] in ['xls' , 'xlsx' , 'xlsm' ]:
147
+ return ExcelFile (io , engine = engine ).parse (sheetname = sheetname , ** kwds )
148
+ try :
149
+ import ezodf
150
+ if isinstance (io , ezodf .document .PackagedDocument ):
151
+ return OdsFile (io ).parse (sheetname = sheetname , ** kwds )
152
+ except ImportError :
153
+ pass
132
154
return ExcelFile (io , engine = engine ).parse (sheetname = sheetname , ** kwds )
133
155
134
156
157
class OdsFile(object):
    """
    Class for parsing tabular ods sheets into DataFrame objects.
    Uses ezodf. See OdsFile.parse for more documentation

    Parameters
    ----------
    io : string or ezodf workbook
        If a string, expected to be a path or URL to an ods file
    """

    # strptime layouts tried, in order, on the stored value of a date cell
    _ODS_DATE_FORMATS = ('%Y-%m-%d',
                         '%Y-%m-%dT%H:%M:%S',
                         '%Y-%m-%dT%H:%M:%S.%f')

    def __init__(self, io, **kwds):
        import ezodf  # throw an ImportError if we need to
        # ezodf does not have a __version__ or similar attribute

        self.io = io

        if isinstance(io, compat.string_types):
            if _is_url(io):
                # URLs are downloaded first; the raw payload is handed to
                # ezodf
                data = _urlopen(io).read()
                self.book = ezodf.opendoc(data)
            else:
                self.book = ezodf.opendoc(io)
        elif isinstance(io, ezodf.document.PackagedDocument):
            # io already is the ezodf representation of a workbook
            self.book = io
        else:
            raise ValueError('IO must be a path or ods workbook')

    def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
              index_col=None, parse_cols=None, parse_dates=False,
              date_parser=None, na_values=None, thousands=None,
              chunksize=None, convert_float=True, has_index_names=False,
              converters=None, **kwds):
        """Read an ods table into DataFrame

        Parameters
        ----------
        sheetname : string or integer
            Name of ods sheet or the page number of the sheet
        header : int, default 0
            Row to use for the column labels of the parsed DataFrame
        skiprows : list-like
            Rows to skip at the beginning (0-indexed)
        skip_footer : int, default 0
            Rows at the end to skip (0-indexed). The keyword ``skipfooter``
            is accepted as an alias and takes precedence when given.
        converters : dict, default None
            Dict of functions for converting values in certain columns. Keys
            can either be integers or column labels
        index_col : int, default None
            Column to use as the row labels of the DataFrame. Pass None if
            there is no such column
        parse_cols : int or list, default None
            * If None then parse all columns
            * If int then indicates last column to be parsed
            * If list of ints then indicates list of column numbers to be
              parsed
            * If string then indicates comma separated list of column names
              and column ranges (e.g. "A:E" or "A,C,E:F")
        parse_dates : boolean, default False
            Parse date ods values,
        date_parser : function default None
            Date parsing function
        na_values : list-like, default None
            List of additional strings to recognize as NA/NaN
        thousands : str, default None
            Thousands separator
        chunksize : int, default None
            Size of file chunk to read for lazy evaluation.
        convert_float : boolean, default True
            convert integral floats to int (i.e., 1.0 --> 1). If False, all
            numeric data will be read in as floats: ods stores all numbers as
            floats internally.
        has_index_names : boolean, default False
            True if the cols defined in index_col have an index name and are
            not in the header

        Returns
        -------
        parsed : DataFrame
            DataFrame parsed from the ods file
        """
        # 'skipfooter' is an alternative spelling of skip_footer
        skipfooter = kwds.pop('skipfooter', None)
        if skipfooter is not None:
            skip_footer = skipfooter

        return self._parse_ods(sheetname=sheetname, header=header,
                               skiprows=skiprows,
                               index_col=index_col,
                               has_index_names=has_index_names,
                               parse_cols=parse_cols,
                               parse_dates=parse_dates,
                               date_parser=date_parser, na_values=na_values,
                               thousands=thousands, chunksize=chunksize,
                               skip_footer=skip_footer,
                               convert_float=convert_float,
                               converters=converters,
                               **kwds)

    def _print_cellinfo(self, cell):
        """Debugging aid: print the attributes of an ezodf cell."""
        print(' plaintext:', cell.plaintext())  # no formatting
        # formatted, but what is difference with value?
        print('display_form:', cell.display_form)  # format, ?=plaintext
        print(' value:', cell.value)  # data handled
        print(' value_type:', cell.value_type)  # data type
        print(' formula:', cell.formula)

    def _parse_datetime(self, cell):
        """
        Parse the date or time of a cell to a datetime object.

        Returns a ``datetime.datetime`` for dates and for times below 24
        hours, a ``datetime.timedelta`` for times of 24 hours and more, the
        unparsed cell value when no known layout matches, and None when the
        cell is neither of type 'date' nor 'time'.
        """
        if cell.value_type == 'time':
            return self._parse_time_cell(cell)
        if cell.value_type == 'date':
            return self._parse_date_cell(cell)
        return None

    def _parse_time_cell(self, cell):
        """Convert a cell of type 'time' (see _parse_datetime)."""
        if cell.formula is not None:
            try:
                return datetime.datetime.strptime(cell.formula,
                                                  'of:=TIME(%H;%M;%S)')
            except ValueError:
                # not a plain TIME(h;m;s) formula, or hours above 23
                pass

        # Cells without a usable formula still carry their content in
        # cell.value; the layout handled here is 'PT<h>H<m>M<s>S'.
        try:
            hours_txt, rest = cell.value[2:].split('H', 1)
            minutes_txt, rest = rest.split('M', 1)
            hours = int(hours_txt)
            minutes = int(minutes_txt)
            seconds = int(rest[:-1])
            if hours > 23:
                # hours can be more than 23, which no datetime can hold
                # TODO: timedelta objects end up mixed with datetimes
                return datetime.timedelta(hours=hours, minutes=minutes,
                                          seconds=seconds)
            # TODO: should return a time object, not datetime?
            return datetime.datetime.strptime(cell.value, 'PT%HH%MM%SS')
        except (ValueError, TypeError):
            # TODO: this does not cover all scenarios; hand back the raw
            # value instead of failing the whole sheet
            return cell.value

    def _parse_date_cell(self, cell):
        """Convert a cell of type 'date' (see _parse_datetime)."""
        if cell.formula is not None:
            try:
                return datetime.datetime.strptime(cell.formula,
                                                  'of:=DATE(%Y;%m;%d)')
            except (ValueError, TypeError):
                pass

        # No (matching) formula: interpret the stored value itself.
        for fmt in self._ODS_DATE_FORMATS:
            try:
                return datetime.datetime.strptime(cell.value, fmt)
            except (ValueError, TypeError):
                continue
        # TODO: parsing other scenarios
        return cell.value

    def _cell_value(self, cell, convert_float=True):
        """Return the Python value to store for one ezodf cell."""
        typ = cell.value_type
        raw = cell.value

        if typ is None or raw is None:
            # empty cell
            return np.nan
        if typ == 'date' or typ == 'time':
            return self._parse_datetime(cell)
        if isinstance(raw, float):
            if convert_float:
                # GH5394 - Excel and ODS 'numbers' are always floats
                # it's a minimal perf hit and less surprising
                # FIXME: this goes wrong when int(raw) returns
                # a long (>1e18)
                try:
                    as_int = int(raw)
                except (ValueError, OverflowError):
                    # nan and inf have no integer form
                    return raw
                if as_int == raw:
                    return as_int
            return raw
        # strings, booleans and any other typed content are passed through
        return raw

    def _parse_ods(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
                   index_col=None, has_index_names=None, parse_cols=None,
                   parse_dates=False, date_parser=None, na_values=None,
                   thousands=None, chunksize=None, convert_float=True,
                   **kwds):
        """
        Collect the cells of one sheet and feed them to TextParser.

        The arguments are those of ``parse``. ``parse_cols`` is accepted but
        not used here. Remaining keywords (``converters`` among them) are
        forwarded to TextParser unchanged.
        """
        # sheetname can be index or string
        sheet = self.book.sheets[sheetname]

        data = [[self._cell_value(cell, convert_float)
                 for cell in sheet.row(i)]
                for i in range(sheet.nrows())]

        parser = TextParser(data, header=header, index_col=index_col,
                            has_index_names=has_index_names,
                            na_values=na_values,
                            thousands=thousands,
                            parse_dates=parse_dates,
                            date_parser=date_parser,
                            skiprows=skiprows,
                            skip_footer=skip_footer,
                            chunksize=chunksize,
                            **kwds)

        return parser.read()

    @property
    def sheet_names(self):
        """List of the names of the sheets in the workbook."""
        # book.sheets.names() is a generator
        return list(self.book.sheets.names())

    def close(self):
        """close io if necessary"""
        if hasattr(self.io, 'close'):
            self.io.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
365
+
366
+
135
367
class ExcelFile (object ):
136
368
"""
137
369
Class for parsing tabular excel sheets into DataFrame objects.
0 commit comments