ENH: Initial support for reading Open Document Format ods spreadsheet (GH2311)

davidovitch · davidovitch · commit 297b70b43f35 · 2014-12-12T23:54:10.000+01:00
diff --git a/pandas/io/api.py b/pandas/io/api.py
@@ -5,6 +5,7 @@
 from pandas.io.parsers import read_csv, read_table, read_fwf
 from pandas.io.clipboard import read_clipboard
 from pandas.io.excel import ExcelFile, ExcelWriter, read_excel
+from pandas.io.ods import OdsFile, read_ods
 from pandas.io.pytables import HDFStore, Term, get_store, read_hdf
 from pandas.io.json import read_json
 from pandas.io.html import read_html
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -22,7 +22,7 @@
 from warnings import warn
 from distutils.version import LooseVersion
 
-__all__ = ["read_excel", "ExcelWriter", "ExcelFile"]
+__all__ = ["read_excel", "ExcelWriter", "ExcelFile", "OdsFile"]
 
 _writer_extensions = ["xlsx", "xls", "xlsm"]
 _writers = {}
@@ -67,16 +67,17 @@ def get_writer(engine_name):
 
 
 def read_excel(io, sheetname=0, **kwds):
-    """Read an Excel table into a pandas DataFrame
+    """Read an Excel/ods table into a pandas DataFrame
 
     Parameters
     ----------
-    io : string, file-like object, or xlrd workbook.
+    io : string, file-like object, or xlrd workbook for MS Excel files. For an
+        ods file (Open Document Format), string or ezodf workbook is required.
         The string could be a URL. Valid URL schemes include http, ftp, s3,
         and file. For file URLs, a host is expected. For instance, a local
         file could be file://localhost/path/to/workbook.xlsx
     sheetname : string or int, default 0
-        Name of Excel sheet or the page number of the sheet
+        Name of Excel/ods sheet or the page number of the sheet
     header : int, default 0
         Row to use for the column labels of the parsed DataFrame
     skiprows : list-like
@@ -86,7 +87,7 @@ def read_excel(io, sheetname=0, **kwds):
     converters : dict, default None
         Dict of functions for converting values in certain columns. Keys can
         either be integers or column labels, values are functions that take one
-        input argument, the Excel cell content, and return the transformed
+        input argument, the Excel/ods cell content, and return the transformed
         content.
     index_col : int, default None
         Column to use as the row labels of the DataFrame. Pass None if
@@ -106,10 +107,10 @@ def read_excel(io, sheetname=0, **kwds):
         Indicate number of NA values placed in non-numeric columns
     engine: string, default None
         If io is not a buffer or path, this must be set to identify io.
-        Acceptable values are None or xlrd
+        Acceptable values are None, xlrd, or ezodf
     convert_float : boolean, default True
         convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
-        data will be read in as floats: Excel stores all numbers as floats
+        data will be read in as floats: Excel/ods stores all numbers as floats
         internally
     has_index_names : boolean, default False
         True if the cols defined in index_col have an index name and are
@@ -119,7 +120,7 @@ def read_excel(io, sheetname=0, **kwds):
     Returns
     -------
     parsed : DataFrame
-        DataFrame from the passed in Excel file
+        DataFrame of the given workbook in the Excel/ods file.
 
     """
     if 'kind' in kwds:
@@ -129,9 +130,240 @@ def read_excel(io, sheetname=0, **kwds):
 
     engine = kwds.pop('engine', None)
 
+    if engine == 'ezodf':
+        return OdsFile(io).parse(sheetname=sheetname, **kwds)
+
+    # figure out if the file is an MS Excel or ODF ODS type
+    # code is doubled here: it is very similar to OdsFile.__init__. Is there a
+    # better way?
+    if isinstance(io, compat.string_types):
+        if io[-4:] == '.ods':
+            try:
+                return OdsFile(io).parse(sheetname=sheetname, **kwds)
+            except Exception as e:
+                print('ods support requires ezodf, please install ezodf first')
+                raise e
+        elif io[-4:] in ['xls', 'xlsx', 'xlsm']:
+            return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds)
+    try:
+        import ezodf
+        if isinstance(io, ezodf.document.PackagedDocument):
+            return OdsFile(io).parse(sheetname=sheetname, **kwds)
+    except ImportError:
+        pass
     return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds)
 
 
+class OdsFile(object):
+    """
+    Class for parsing tabular ods sheets into DataFrame objects.
+    Uses ezodf. See OdsFile.parse for more documentation
+
+    Parameters
+    ----------
+    io : string or ezodf workbook
+        If a string, expected to be a path to ods file
+    """
+    def __init__(self, io, **kwds):
+        """Open io (path, URL or ezodf document) and keep it as self.book."""
+        import ezodf  # throw an ImportError if we need to
+        # ezodf does not have a __version__ or similar attribute
+
+        self.io = io
+
+        if isinstance(io, compat.string_types):
+            if _is_url(io):
+                data = _urlopen(io).read()
+                self.book = ezodf.opendoc(data)
+            else:
+                self.book = ezodf.opendoc(io)
+        # an already opened ezodf document is used as the workbook as-is
+        elif isinstance(io, ezodf.document.PackagedDocument):
+            self.book = io
+        else:
+            raise ValueError('IO must be a path or ods workbook')
+
+    def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
+              index_col=None, parse_cols=None, parse_dates=False,
+              date_parser=None, na_values=None, thousands=None, chunksize=None,
+              convert_float=True, has_index_names=False, converters=None, **kwds):
+        """Read an ods table into DataFrame
+
+        Parameters
+        ----------
+        sheetname : string or integer
+            Name of ods sheet or the page number of the sheet
+        header : int, default 0
+            Row to use for the column labels of the parsed DataFrame
+        skiprows : list-like
+            Rows to skip at the beginning (0-indexed)
+        skip_footer : int, default 0
+            Rows at the end to skip (0-indexed)
+        converters : dict, default None
+            Dict of functions for converting values in certain columns. Keys can
+            either be integers or column labels
+        index_col : int, default None
+            Column to use as the row labels of the DataFrame. Pass None if
+            there is no such column
+        parse_cols : int, list or string, default None
+            * If None then parse all columns
+            * If int then indicates last column to be parsed
+            * If list of ints then indicates list of column numbers to be
+              parsed
+            * If string then indicates comma separated list of column names and
+              column ranges (e.g. "A:E" or "A,C,E:F")
+        parse_dates : boolean, default False
+            Parse date ods values
+        date_parser : function, default None
+            Date parsing function
+        na_values : list-like, default None
+            List of additional strings to recognize as NA/NaN
+        thousands : str, default None
+            Thousands separator
+        chunksize : int, default None
+            Size of file chunk to read for lazy evaluation.
+        convert_float : boolean, default True
+            convert integral floats to int (i.e., 1.0 --> 1). If False, all
+            numeric data will be read in as floats: ods stores all numbers as
+            floats internally.
+        has_index_names : boolean, default False
+            True if the cols defined in index_col have an index name and are
+            not in the header
+
+        Returns
+        -------
+        parsed : DataFrame
+            DataFrame parsed from the ods file
+        """
+        skipfooter = kwds.pop('skipfooter', None)  # alias for skip_footer
+        if skipfooter is not None:
+            skip_footer = skipfooter
+
+        return self._parse_ods(sheetname=sheetname, header=header,
+                               skiprows=skiprows,
+                               index_col=index_col,
+                               has_index_names=has_index_names,
+                               parse_cols=parse_cols,
+                               parse_dates=parse_dates,
+                               date_parser=date_parser, na_values=na_values,
+                               thousands=thousands, chunksize=chunksize,
+                               skip_footer=skip_footer,
+                               convert_float=convert_float,
+                               converters=converters,
+                               **kwds)
+
+    def _print_cellinfo(self, cell):
+        print('   plaintext:', cell.plaintext()) # no formatting
+        # debug aid; TODO: how does display_form differ from value?
+        print('display_form:', cell.display_form) # format, ?=plaintext
+        print('       value:', cell.value)       # data handled
+        print('  value_type:', cell.value_type)  # data type
+        print('     formula:', cell.formula)
+
+    def _parse_datetime(self, cell):
+        """
+        Parse a 'time' or 'date' cell; None is returned if it has no formula
+        """
+        if cell.value_type == 'time' and cell.formula is not None:
+            try:
+                value = datetime.datetime.strptime(cell.formula,
+                                                   'of:=TIME(%H;%M;%S)')
+            except ValueError:
+                # parse cell.value instead; hours can be more than 23
+                hours = int(cell.value[2:].split('H')[0])
+                minutes = int(cell.value[2:].split('M')[0][-2:])
+                seconds = int(cell.value[2:].split('M')[1][:-1])
+                if hours > 23:
+                    value = datetime.timedelta(hours=hours, minutes=minutes,
+                                               seconds=seconds)
+                else:
+                    # TODO: should return a time object, not datetime?
+                    value = datetime.datetime.strptime(cell.value,
+                                                       'PT%HH%MM%SS')
+                    # TODO: this does not cover all scenarios
+                    # TODO: now timedelta objects will be mixed with normal time
+        elif cell.value_type == 'date' and cell.formula is not None:
+            try:
+                value = datetime.datetime.strptime(cell.formula,
+                                                   'of:=DATE(%Y;%m;%d)')
+            except (ValueError, TypeError):
+                # TODO: parsing other scenarios
+                value = cell.value
+        else:
+            value = None
+        return value
+
+    def _parse_ods(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
+                   index_col=None, has_index_names=None, parse_cols=None,
+                   parse_dates=False, date_parser=None, na_values=None,
+                   thousands=None, chunksize=None, convert_float=True,
+                   **kwds):
+        """Collect the sheet's cell values row by row and parse them."""
+        # sheetname can be index or string
+        sheet = self.book.sheets[sheetname]
+        # NOTE: parse_cols is accepted but not applied; all columns are read
+        data = []
+
+        for i in range(sheet.nrows()):
+            row = []
+            for cell in sheet.row(i):
+                typ = cell.value_type
+                if isinstance(cell.value, float):
+                    value = cell.value
+                    if convert_float:
+                        # GH5394 - Excel and ODS 'numbers' are always floats
+                        # it's a minimal perf hit and less surprising
+                        # FIXME: this goes wrong when int(cell.value) returns
+                        # a long (>1e18)
+                        val = int(cell.value)
+                        if val == cell.value:
+                            value = val
+                elif typ == 'string':
+                    value = cell.value
+                elif typ in ('date', 'time'):
+                    value = self._parse_datetime(cell)
+                elif isinstance(cell.value, bool):
+                    value = cell.value
+                else:
+                    # Empty cells (value_type None) and unsupported types
+                    # become NaN; always assigning here keeps a stale value
+                    # from the previous cell out of this one.
+                    value = np.nan
+
+                row.append(value)
+
+            data.append(row)
+
+        parser = TextParser(data, header=header, index_col=index_col,
+                            has_index_names=has_index_names,
+                            na_values=na_values,
+                            thousands=thousands,
+                            parse_dates=parse_dates,
+                            date_parser=date_parser,
+                            skiprows=skiprows,
+                            skip_footer=skip_footer,
+                            chunksize=chunksize,
+                            **kwds)
+
+        return parser.read()
+
+    @property
+    def sheet_names(self):
+        # book.sheets.names() is a generator; return the names as a list
+        return list(self.book.sheets.names())
+
+    def close(self):
+        """Close self.io if it has a close() method."""
+        if hasattr(self.io, 'close'):
+            self.io.close()
+
+    def __enter__(self):
+        return self  # the with-statement binds the OdsFile itself
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()  # runs on normal exit and on exceptions alike
+
+
 class ExcelFile(object):
     """
     Class for parsing tabular excel sheets into DataFrame objects.