pandas-dev · jreback · Jul 3, 2019 · Feb 27, 2019 · Feb 28, 2019 · Feb 28, 2019
diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml
@@ -15,6 +15,7 @@ dependencies:
   - nomkl
   - numexpr
   - numpy=1.15.*
+  - odfpy
   - openpyxl
   - pandas-gbq
   # https://github.com/pydata/pandas-gbq/issues/271

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -160,6 +160,7 @@ Other enhancements
 - Added new option ``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. Use ``pandas.set_option('plotting.backend', '<backend-module>')`` where ``<backend-module`` is a library implementing the pandas plotting API (:issue:`14130`)
 - :class:`pandas.offsets.BusinessHour` supports multiple opening hours intervals (:issue:`15481`)
 - :func:`read_excel` can now use ``openpyxl`` to read Excel files via the ``engine='openpyxl'`` argument. This will become the default in a future release (:issue:`11499`)
+- :func:`read_excel` can now read OpenDocument tables. Specify ``engine='odf'`` to enable (:issue:`9070`)
 
 .. _whatsnew_0250.api_breaking:
 

diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
@@ -13,6 +13,7 @@
     "lxml.etree": "3.8.0",
     "matplotlib": "2.2.2",
     "numexpr": "2.6.2",
+    "odfpy": "1.3.0",
     "openpyxl": "2.4.8",
     "pandas_gbq": "0.8.0",
     "pyarrow": "0.9.0",

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -768,12 +768,14 @@ class ExcelFile:
         Acceptable values are None or ``xlrd``.
     """
 
-    from pandas.io.excel._xlrd import _XlrdReader
+    from pandas.io.excel._odfreader import _ODFReader
     from pandas.io.excel._openpyxl import _OpenpyxlReader
+    from pandas.io.excel._xlrd import _XlrdReader
 
     _engines = {
         'xlrd': _XlrdReader,
         'openpyxl': _OpenpyxlReader,
+        'odf': _ODFReader,
     }
 
     def __init__(self, io, engine=None):

diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
@@ -0,0 +1,257 @@
+from typing import Dict, List
+
+from pandas.compat._optional import import_optional_dependency
+
+import pandas as pd
+from pandas._typing import FilePathOrBuffer, Scalar
+
+from pandas.io.excel._base import _BaseExcelReader
+
+
+class _ODFReader(_BaseExcelReader):
+    """
+    Read tables out of OpenDocument formatted files.
+
+    Parameters
+    ----------
+    filepath_or_buffer : str, path object or file-like object
+        Path of the document to parse, or an open readable stream.
+    """
+
+    def __init__(self, filepath_or_buffer: FilePathOrBuffer):
+        # Check that the optional odfpy package is importable first.
+        import_optional_dependency("odf")
+        super().__init__(filepath_or_buffer)
+
+    @property
+    def _workbook_class(self):
+        """Return odfpy's ``OpenDocument`` class."""
+        from odf.opendocument import OpenDocument
+
+        return OpenDocument
+
+    def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
+        """Load ``filepath_or_buffer`` with odfpy and return the result."""
+        from odf.opendocument import load
+
+        return load(filepath_or_buffer)
+
+    @property
+    def sheet_names(self) -> List[str]:
+        """Return a list of sheet names present in the document."""
+        from odf.table import Table
+
+        tables = self.book.getElementsByType(Table)
+        return [t.getAttribute("name") for t in tables]
+
+    def get_sheet_by_index(self, index: int):
+        """Return the table found at position ``index``."""
+        from odf.table import Table
+
+        tables = self.book.getElementsByType(Table)
+        return tables[index]
+
+    def get_sheet_by_name(self, name: str):
+        """
+        Return the first table whose ``name`` attribute equals ``name``.
+
+        Raises
+        ------
+        ValueError
+            If no table in the document carries that name.
+        """
+        from odf.table import Table
+
+        tables = self.book.getElementsByType(Table)
+
+        for table in tables:
+            if table.getAttribute("name") == name:
+                return table
+
+        # The placeholder is named, so ``name`` has to be passed by keyword;
+        # passing it positionally makes ``str.format`` raise KeyError.
+        raise ValueError("sheet {name} not found".format(name=name))
+
+    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
+        """
+        Parse an ODF Table into a list of lists.
+
+        Parameters
+        ----------
+        sheet : odfpy table element
+            Table whose rows and cells are read.
+        convert_float : bool
+            Forwarded to ``_get_cell_value`` for every cell.
+
+        Returns
+        -------
+        list of list of scalar
+            One independent list per row, padded with ``''`` so that all
+            rows are as long as the longest one.  Empty rows after the
+            last non-empty row are not included.
+
+        Raises
+        ------
+        NotImplementedError
+            If a cell outside the first column spans more than one row.
+        """
+        from odf.table import TableCell, TableRow
+
+        sheet_rows = sheet.getElementsByType(TableRow)
+        table = []  # type: List[List[Scalar]]
+        empty_rows = 0
+        max_row_len = 0
+        row_spans = {}  # type: Dict[int, int]
+
+        for sheet_row in sheet_rows:
+            sheet_cells = sheet_row.getElementsByType(TableCell)
+            empty_cells = 0
+            table_row = []  # type: List[Scalar]
+
+            for j, sheet_cell in enumerate(sheet_cells):
+                # Handle vertically merged cells; only works with first column
+                if row_spans.get(j, 0) > 1:
+                    table_row.append('')
+                    row_spans[j] = row_spans[j] - 1
+
+                value = self._get_cell_value(sheet_cell, convert_float)
+                column_repeat = self._get_column_repeat(sheet_cell)
+                column_span = self._get_column_span(sheet_cell)
+                row_span = self._get_row_span(sheet_cell)
+
+                if row_span > 1:
+                    if j > 0:
+                        # Adjacent literals are concatenated as written, so
+                        # the space between "cell" and "merging" is explicit.
+                        raise NotImplementedError(
+                            "The odf reader only supports vertical cell "
+                            "merging in the initial column")
+                    else:
+                        row_spans[j] = row_span
+
+                if len(sheet_cell.childNodes) == 0:
+                    # Held back so trailing empty cells are never emitted.
+                    empty_cells += column_repeat
+                else:
+                    if empty_cells > 0:
+                        table_row.extend([''] * empty_cells)
+                        empty_cells = 0
+                    table_row.extend([value] * column_repeat)
+
+                    # horizontally merged cells should only show first value
+                    if column_span > 1:
+                        table_row.extend([''] * (column_span - 1))
+
+            max_row_len = max(max_row_len, len(table_row))
+
+            row_repeat = self._get_row_repeat(sheet_row)
+            if self._is_empty_row(sheet_row):
+                empty_rows += row_repeat
+            else:
+                if empty_rows > 0:
+                    # add blank rows to our table; each one is a fresh
+                    # list so that rows never alias one another
+                    table.extend([''] for _ in range(empty_rows))
+                    empty_rows = 0
+                # copy per repeat for the same reason
+                table.extend(list(table_row) for _ in range(row_repeat))
+
+        # Make our table square
+        for row in table:
+            if len(row) < max_row_len:
+                row.extend([''] * (max_row_len - len(row)))
+
+        return table
+
+    def _get_row_repeat(self, row) -> int:
+        """
+        Return number of times this row was repeated.
+
+        Repeating an empty row appeared to be a common way
+        of representing sparse rows in the table.
+        """
+        from odf.namespaces import TABLENS
+
+        return int(row.attributes.get((TABLENS, 'number-rows-repeated'), 1))
+
+    def _get_column_repeat(self, cell) -> int:
+        """Return number of times this cell was repeated along its row."""
+        from odf.namespaces import TABLENS
+
+        return int(cell.attributes.get(
+            (TABLENS, 'number-columns-repeated'), 1))
+
+    def _get_row_span(self, cell) -> int:
+        """For handling cells merged vertically."""
+        from odf.namespaces import TABLENS
+
+        return int(cell.attributes.get((TABLENS, 'number-rows-spanned'), 1))
+
+    def _get_column_span(self, cell) -> int:
+        """For handling cells merged horizontally."""
+        from odf.namespaces import TABLENS
+
+        return int(cell.attributes.get((TABLENS, 'number-columns-spanned'), 1))
+
+    def _is_empty_row(self, row) -> bool:
+        """Return True if no child node of ``row`` has children itself."""
+        return all(len(column.childNodes) == 0 for column in row.childNodes)
+
+    def _get_cell_value(self, cell, convert_float: bool) -> Scalar:
+        """
+        Convert a table cell into a Python scalar.
+
+        Parameters
+        ----------
+        cell : odfpy table-cell element
+            Cell to convert.
+        convert_float : bool
+            If True, floats holding an integral value are returned as int.
+
+        Returns
+        -------
+        scalar
+            ``''`` when the cell declares no value type, otherwise a value
+            chosen according to the cell's ``value-type`` attribute.
+
+        Raises
+        ------
+        ValueError
+            If the ``value-type`` attribute holds an unrecognized type.
+        """
+        from odf.namespaces import OFFICENS
+
+        cell_type = cell.attributes.get((OFFICENS, 'value-type'))
+        if cell_type == 'boolean':
+            # NOTE(review): decided from the cell's text; the ODF
+            # ``boolean-value`` attribute may be more robust -- confirm.
+            return str(cell) == "TRUE"
+        if cell_type is None:
+            return ''  # compat with xlrd
+        elif cell_type == 'float':
+            # GH5394
+            cell_value = float(cell.attributes.get((OFFICENS, 'value')))
+
+            # NA handling: a zero-valued cell is returned as its text.  The
+            # former extra test ``str(cell) != cell_value`` compared a str
+            # with a float, which is always True, so it has been dropped.
+            if cell_value == 0.:
+                return str(cell)
+
+            if convert_float:
+                val = int(cell_value)
+                if val == cell_value:
+                    return val
+            return cell_value
+        elif cell_type in ('percentage', 'currency'):
+            cell_value = cell.attributes.get((OFFICENS, 'value'))
+            return float(cell_value)
+        elif cell_type == 'string':
+            return str(cell)
+        elif cell_type == 'date':
+            cell_value = cell.attributes.get((OFFICENS, 'date-value'))
+            return pd.to_datetime(cell_value)
+        elif cell_type == 'time':
+            return pd.to_datetime(str(cell)).time()
+        else:
+            raise ValueError('Unrecognized type {}'.format(cell_type))
diff --git a/pandas/tests/io/data/blank-row-repeat.ods b/pandas/tests/io/data/blank-row-repeat.ods
diff --git a/pandas/tests/io/data/blank.ods b/pandas/tests/io/data/blank.ods
diff --git a/pandas/tests/io/data/blank_with_header.ods b/pandas/tests/io/data/blank_with_header.ods
diff --git a/pandas/tests/io/data/invalid_value_type.ods b/pandas/tests/io/data/invalid_value_type.ods
diff --git a/pandas/tests/io/data/lowerdiagonal.ods b/pandas/tests/io/data/lowerdiagonal.ods
diff --git a/pandas/tests/io/data/raising_repeats.ods b/pandas/tests/io/data/raising_repeats.ods
diff --git a/pandas/tests/io/data/runlengthencoding.ods b/pandas/tests/io/data/runlengthencoding.ods
diff --git a/pandas/tests/io/data/test1.ods b/pandas/tests/io/data/test1.ods
diff --git a/pandas/tests/io/data/test2.ods b/pandas/tests/io/data/test2.ods
diff --git a/pandas/tests/io/data/test3.ods b/pandas/tests/io/data/test3.ods
diff --git a/pandas/tests/io/data/test4.ods b/pandas/tests/io/data/test4.ods
diff --git a/pandas/tests/io/data/test5.ods b/pandas/tests/io/data/test5.ods
diff --git a/pandas/tests/io/data/test_converters.ods b/pandas/tests/io/data/test_converters.ods
diff --git a/pandas/tests/io/data/test_index_name_pre17.ods b/pandas/tests/io/data/test_index_name_pre17.ods
diff --git a/pandas/tests/io/data/test_multisheet.ods b/pandas/tests/io/data/test_multisheet.ods
diff --git a/pandas/tests/io/data/test_squeeze.ods b/pandas/tests/io/data/test_squeeze.ods
diff --git a/pandas/tests/io/data/test_types.ods b/pandas/tests/io/data/test_types.ods
diff --git a/pandas/tests/io/data/testdateoverflow.ods b/pandas/tests/io/data/testdateoverflow.ods
diff --git a/pandas/tests/io/data/testdtype.ods b/pandas/tests/io/data/testdtype.ods
diff --git a/pandas/tests/io/data/testmultiindex.ods b/pandas/tests/io/data/testmultiindex.ods
diff --git a/pandas/tests/io/data/testskiprows.ods b/pandas/tests/io/data/testskiprows.ods
diff --git a/pandas/tests/io/data/times_1900.ods b/pandas/tests/io/data/times_1900.ods
diff --git a/pandas/tests/io/data/times_1904.ods b/pandas/tests/io/data/times_1904.ods
diff --git a/pandas/tests/io/data/writertable.odt b/pandas/tests/io/data/writertable.odt
diff --git a/pandas/tests/io/excel/conftest.py b/pandas/tests/io/excel/conftest.py
@@ -30,7 +30,7 @@ def df_ref():
     return df_ref
 
 
-@pytest.fixture(params=['.xls', '.xlsx', '.xlsm'])
+@pytest.fixture(params=['.xls', '.xlsx', '.xlsm', '.ods'])
 def read_ext(request):
     """
     Valid extensions for reading Excel files.

diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py
@@ -0,0 +1,91 @@
+import functools
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+pytest.importorskip("odf")
+
+
+@pytest.fixture(autouse=True)
+def cd_and_set_engine(monkeypatch, datapath):
+    """
+    Change directory and set engine for odf reading tests.
+    """
+    monkeypatch.setattr(
+        pd, 'read_excel', functools.partial(pd.read_excel, engine="odf"))
+    monkeypatch.chdir(datapath("io", "data"))
+
+
+def test_read_invalid_types_raises():
+    """Reading a cell with an unknown value type raises ValueError."""
+    # invalid_value_type.ods was produced by manually editing
+    # the content.xml file it contains
+    expected_message = "Unrecognized type awesome_new_type"
+    with pytest.raises(ValueError, match=expected_message):
+        pd.read_excel("invalid_value_type.ods", header=None)
+
+
+def test_read_lower_diagonal():
+    """A lower-triangular sheet is read as a full 4 x 4 frame."""
+    # Make sure we can parse:
+    # 1
+    # 2 3
+    # 4 5 6
+    # 7 8 9 10
+
+    result = pd.read_excel("lowerdiagonal.ods", 'Sheet1',
+                           index_col=None, header=None)
+
+    assert result.shape == (4, 4)
+
+
+def test_read_writer_table():
+    """Tables can also be read from a text OpenDocument file (.odt)."""
+    header = pd.Index(["Row 1", "Row 2", "Row 3"], name="Header")
+    rows = [
+        [1, np.nan, 7],
+        [2, np.nan, 8],
+        [3, np.nan, 9],
+    ]
+    expected = pd.DataFrame(
+        rows, index=header, columns=["Column 1", "Unnamed: 2", "Column 3"])
+
+    result = pd.read_excel("writertable.odt", 'Table1', index_col=0)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_blank_row_repeat():
+    """Check shape and selected values of blank-row-repeat.ods."""
+    result = pd.read_excel("blank-row-repeat.ods", 'Value')
+
+    assert result.shape == (14, 2)
+
+    values = result['value']
+    assert values[7] == 9.0
+    assert pd.isnull(values[8])
+    assert not pd.isnull(values[11])
+
+
+def test_runlengthencoding():
+    """Check the values read from runlengthencoding.ods column by column."""
+    result = pd.read_excel("runlengthencoding.ods", 'Sheet1', header=None)
+    assert result.shape == (5, 3)
+    # check by column, not by row.
+    expected_columns = [
+        [1.0, 1.0, 2.0, 2.0, 2.0],
+        [1.0, 2.0, 2.0, 2.0, 2.0],
+        [1.0, 2.0, 2.0, 2.0, 2.0],
+    ]
+    for label, expected in enumerate(expected_columns):
+        assert list(result[label]) == expected
+
+
+def test_raises_repeated_rows_not_in_col_0():
+    """Check the NotImplementedError raised for raising_repeats.ods."""
+    expected_message = "merging in the initial column"
+    with pytest.raises(NotImplementedError, match=expected_message):
+        pd.read_excel("raising_repeats.ods")
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -36,6 +36,7 @@ def ignore_xlrd_time_clock_warning():
     pytest.param('xlrd', marks=td.skip_if_no('xlrd')),
     pytest.param('openpyxl', marks=td.skip_if_no('openpyxl')),
     pytest.param(None, marks=td.skip_if_no('xlrd')),
+    pytest.param("odf", marks=td.skip_if_no("odf")),
 ])
 def engine(request):
     """
@@ -53,6 +54,11 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext):
         """
         if engine == 'openpyxl' and read_ext == '.xls':
             pytest.skip()
+        if engine == 'odf' and read_ext != '.ods':
+            pytest.skip()
+        if read_ext == ".ods" and engine != "odf":
+            pytest.skip()
+
         func = partial(pd.read_excel, engine=engine)
         monkeypatch.chdir(datapath("io", "data"))
         monkeypatch.setattr(pd, 'read_excel', func)
@@ -439,6 +445,9 @@ def test_bad_engine_raises(self, read_ext):
 
     @tm.network
     def test_read_from_http_url(self, read_ext):
+        if read_ext == '.ods':  # TODO: remove once on master
+            pytest.skip()
+
         url = ('https://raw.github.com/pandas-dev/pandas/master/'
                'pandas/tests/io/data/test1' + read_ext)
         url_table = pd.read_excel(url)
@@ -736,6 +745,10 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext):
         """
         Change directory and set engine for ExcelFile objects.
         """
+        if engine == 'odf' and read_ext != '.ods':
+            pytest.skip()
+        if read_ext == ".ods" and engine != "odf":
+            pytest.skip()
         if engine == 'openpyxl' and read_ext == '.xls':
             pytest.skip()
 

diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py
@@ -10,6 +10,12 @@
 xlwt = pytest.importorskip("xlwt")
 
 
+@pytest.fixture(autouse=True)
+def skip_ods_files(read_ext):
+    if read_ext == ".ods":  # autouse: skips each test here for ".ods"
+        pytest.skip("Not valid for xlrd")
+
+
 def test_read_xlrd_book(read_ext, frame):
     df = frame