-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Openpyxl engine for reading excel files #25092
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
e29b4c0
e0199a8
ce4eb01
b25877e
821fa4d
4694668
712f1ef
1d49a0e
1473c0e
6e8ffba
d57dfc1
e984f6b
44f7af2
98d3865
d0188ba
205d52b
7b550bf
875de8d
12ad6d8
dfd6a36
fef7233
eaafd5f
8d2db02
13e7793
b053cce
fe4dd73
64e5f2d
99b2cad
ce5ac05
c7895ea
2ca9368
5fb1aef
537dd0c
44cddc5
e4c8f23
daff364
1224918
1bfc030
747311e
a77a4c7
ddcaad8
757235d
cdd627f
0b58109
45f21f8
e97d029
1edae5e
a69e104
f5f40e4
22e24bb
903b188
1b3ae99
02e19a8
3e18f97
d11956c
61d7a3f
13d41b2
97c85f5
614d972
d87d9c0
7348b0c
c1a1792
d72ca5a
0bba345
8dd8bf6
eaaa680
6bf5183
a06bf9b
f43e90f
8fabe0a
0ff5ce3
fb73692
17b1d73
3d248ed
c369fd8
70b15a4
a3a3bca
fcd43f0
d9c1fa6
3c239a4
4a25a5a
6258e59
00f34b1
a1fba90
88ee325
837ce26
dddc8c5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,16 @@ | ||
from pandas.io.excel._base import ExcelWriter | ||
from collections import OrderedDict | ||
from io import BytesIO | ||
|
||
import pandas.compat as compat | ||
from pandas.core.dtypes.common import is_integer, is_list_like | ||
from pandas.core.frame import DataFrame | ||
from pandas.io.common import (_is_url, _urlopen, _validate_header_arg, | ||
get_filepath_or_buffer) | ||
from pandas.io.excel._base import (ExcelFile, ExcelWriter, _BaseExcelReader, | ||
_fill_mi_header, _maybe_convert_to_string, | ||
_maybe_convert_usecols, _pop_header_name) | ||
from pandas.io.excel._util import _validate_freeze_panes | ||
from pandas.io.parsers import _validate_usecols_arg, _validate_usecols_names | ||
|
||
|
||
class _OpenpyxlWriter(ExcelWriter): | ||
|
@@ -451,3 +462,256 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, | |
xcell = wks.cell(column=col, row=row) | ||
for k, v in style_kwargs.items(): | ||
setattr(xcell, k, v) | ||
|
||
|
||
class _OpenpyxlReader(_BaseExcelReader): | ||
|
||
def __init__(self, filepath_or_buffer):
    """Reader using openpyxl engine.

    Parameters
    ----------
    filepath_or_buffer : string, path object or Workbook
        Object to be parsed.

    Raises
    ------
    ImportError
        If openpyxl cannot be imported.
    ValueError
        If the (possibly converted) input is neither an openpyxl
        Workbook, an object with a ``read`` attribute, nor a string.
    """
    # This reader imports openpyxl, so the install hint has to name
    # openpyxl; the previous text pointed users at xlrd instead.
    err_msg = "Install openpyxl for Excel support"

    try:
        import openpyxl
    except ImportError:
        raise ImportError(err_msg)

    # If filepath_or_buffer is a url, want to keep the data as bytes so
    # can't pass to get_filepath_or_buffer()
    if _is_url(filepath_or_buffer):
        filepath_or_buffer = BytesIO(_urlopen(filepath_or_buffer).read())
    elif not isinstance(filepath_or_buffer,
                        (ExcelFile, openpyxl.Workbook)):
        filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
            filepath_or_buffer)

    if isinstance(filepath_or_buffer, openpyxl.Workbook):
        # An already-loaded workbook is used as-is.
        self.book = filepath_or_buffer
    elif hasattr(filepath_or_buffer, "read"):
        # Rewind seekable buffers so the workbook is read from the start
        # even if the caller has already consumed part of the stream.
        if hasattr(filepath_or_buffer, 'seek'):
            filepath_or_buffer.seek(0)
        self.book = openpyxl.load_workbook(
            filepath_or_buffer, data_only=True)
    elif isinstance(filepath_or_buffer, compat.string_types):
        self.book = openpyxl.load_workbook(
            filepath_or_buffer, data_only=True)
    else:
        raise ValueError('Must explicitly set engine if not passing in'
                         ' buffer or path for io.')
|
||
@property
def sheet_names(self):
    """Return the ``sheetnames`` attribute of the loaded workbook."""
    workbook = self.book
    return workbook.sheetnames
|
||
def get_sheet_by_name(self, name):
    """Return the sheet obtained by indexing the workbook with ``name``."""
    workbook = self.book
    return workbook[name]
|
||
def get_sheet_by_index(self, index):
    """Return the entry at position ``index`` of the workbook's worksheets."""
    all_sheets = self.book.worksheets
    return all_sheets[index]
|
||
@staticmethod | ||
def _replace_type_error_with_nan(rows): | ||
nan = float('nan') | ||
WillAyd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for row in rows: | ||
yield [nan | ||
WillAyd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if cell.data_type == cell.TYPE_ERROR | ||
else cell.value | ||
for cell in row] | ||
|
||
def get_sheet_data(self, sheet, convert_float):
    """Return the rows of ``sheet`` as a list, with error cells as NaN.

    The rows come from ``sheet.rows`` and are passed through
    ``_replace_type_error_with_nan``. ``convert_float`` is accepted but
    not used by this implementation.
    """
    # TODO: support using iterator
    # TODO: don't make strings out of data
    return list(self._replace_type_error_with_nan(sheet.rows))
|
||
def parse(self, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So similar comment as above it would be preferable not to override this and just leave in the base class. I've noticed a vast majority of this is simply copy / paste. Rather indifferent but if we go the route of cleanup in a follow up issue then for sure need to consolidate this as well. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The _BaseExcelReader.parse function is closely coupled to the later call of the TextParser which makes no sense for an openpyxl based reader as it already outputs structured, properly parsed data. Effectively the parse function for openpyxl does almost the reverse of the base parser. The base parser applies keywords to overcome limitations of xlrd and then converts the data to a dataframe. This parse function first makes a dataframe, and then reverse applies the many keywords that the excel read supports to mimic the behaviour and pass all the tests. These fundamental differences in approach make it very difficult to keep the functions generic. The same applies to the init function. This function e.g. needs the specified engine to be imported, and becomes very ugly when that is made generic, see discussion above. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So this is probably related to my earlier question about |
||
sheet_name=0, | ||
header=0, | ||
names=None, | ||
index_col=None, | ||
usecols=None, | ||
squeeze=False, | ||
converters=None, | ||
dtype=None, | ||
true_values=None, | ||
false_values=None, | ||
skiprows=None, | ||
nrows=None, | ||
na_values=None, | ||
verbose=False, | ||
parse_dates=False, | ||
date_parser=None, | ||
thousands=None, | ||
comment=None, | ||
skipfooter=0, | ||
convert_float=True, | ||
mangle_dupe_cols=True, | ||
**kwds): | ||
|
||
_validate_header_arg(header) | ||
|
||
ret_dict = False | ||
|
||
# Keep sheetname to maintain backwards compatibility. | ||
if isinstance(sheet_name, list): | ||
sheets = sheet_name | ||
ret_dict = True | ||
elif sheet_name is None: | ||
sheets = self.sheet_names | ||
ret_dict = True | ||
else: | ||
sheets = [sheet_name] | ||
|
||
# handle same-type duplicates. | ||
sheets = list(OrderedDict.fromkeys(sheets).keys()) | ||
|
||
output = OrderedDict() | ||
|
||
for asheetname in sheets: | ||
if verbose: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you make this a function instead of inlining all of this? |
||
print("Reading sheet {sheet}".format(sheet=asheetname)) | ||
|
||
if isinstance(asheetname, compat.string_types): | ||
sheet = self.get_sheet_by_name(asheetname) | ||
else: # assume an integer if not a string | ||
sheet = self.get_sheet_by_index(asheetname) | ||
|
||
data = self.get_sheet_data(sheet, convert_float) | ||
if not data or data == [[None]]: | ||
output[asheetname] = DataFrame() | ||
continue | ||
|
||
usecols = _maybe_convert_usecols(usecols) | ||
|
||
if is_list_like(header) and len(header) == 1: | ||
header = header[0] | ||
|
||
# TODO: scrutinize what is going here | ||
# forward fill and pull out names for MultiIndex column | ||
header_names = None | ||
if header is not None and is_list_like(header): | ||
header_names = [] | ||
control_row = [True] * len(data[0]) | ||
|
||
for row in header: | ||
if is_integer(skiprows): | ||
row += skiprows | ||
|
||
data[row], control_row = _fill_mi_header(data[row], | ||
control_row) | ||
|
||
if index_col is not None: | ||
header_name, _ = _pop_header_name(data[row], index_col) | ||
header_names.append(header_name) | ||
|
||
# TODO: implement whatever this should do | ||
# has_index_names = is_list_like(header) and len(header) > 1 | ||
|
||
if skiprows: | ||
data = [row for i, row in enumerate(data) if i not in skiprows] | ||
|
||
if skipfooter: | ||
data = data[:-skipfooter] | ||
|
||
column_names = [cell for i, cell in enumerate(data.pop(0))] | ||
|
||
frame = DataFrame(data, columns=column_names) | ||
if usecols: | ||
_validate_usecols_arg(usecols) | ||
usecols = sorted(usecols) | ||
if any(isinstance(i, str) for i in usecols): | ||
_validate_usecols_names(usecols, column_names) | ||
frame = frame[usecols] | ||
else: | ||
frame = frame.iloc[:, usecols] | ||
|
||
if not converters: | ||
converters = dict() | ||
if not dtype: | ||
dtype = dict() | ||
|
||
# handle columns referenced by number so all references are by | ||
# column name | ||
handled_converters = {} | ||
for k, v in converters.items(): | ||
if k not in frame.columns and isinstance(k, int): | ||
k = frame.columns[k] | ||
handled_converters[k] = v | ||
converters = handled_converters | ||
|
||
# attempt to convert object columns to integer. Only because this | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please make helper functions for things; this function is getting too long. |
||
# is implicitly done when reading an excel file with xlrd | ||
# TODO: question if this should be default behaviour | ||
if len(frame) > 0: | ||
for column in set(frame) - set(dtype.keys()): | ||
if frame[column].dtype == object: | ||
try: | ||
frame[column] = frame[column].astype('int64') | ||
except (ValueError, TypeError): | ||
try: | ||
frame[column] = frame[column].astype('float64') | ||
except (ValueError, TypeError): | ||
continue | ||
elif (convert_float and | ||
frame[column].dtype == float and | ||
all(frame[column] % 1 == 0)): | ||
frame[column] = frame[column].astype('int64') | ||
elif not convert_float: | ||
if frame[column].dtype == int: | ||
frame[column] = frame[column].astype('float64') | ||
|
||
if converters: | ||
for k, v in converters.items(): | ||
# for compatibility reasons | ||
if frame[k].dtype == float and convert_float: | ||
frame[k] = frame[k].fillna('') | ||
frame[k] = frame[k].apply(v) | ||
|
||
if dtype: | ||
for k, v in dtype.items(): | ||
frame[k] = frame[k].astype(v) | ||
|
||
if index_col is not None: | ||
if is_list_like(index_col): | ||
if any(isinstance(i, str) for i in index_col): | ||
# TODO: see if there is already a method for this in | ||
# pandas.io.parsers | ||
frame = frame.set_index(index_col) | ||
if len(index_col) == 1: | ||
# TODO: understand why this is needed | ||
raise TypeError( | ||
"list indices must be integers.*, not str") | ||
else: | ||
frame = frame.set_index( | ||
[column_names[i] for i in index_col]) | ||
else: | ||
if isinstance(index_col, str): | ||
frame = frame.set_index(index_col) | ||
else: | ||
frame = frame.set_index(column_names[index_col]) | ||
|
||
output[asheetname] = frame | ||
if not squeeze or isinstance(output[asheetname], DataFrame): | ||
if header_names: | ||
output[asheetname].columns = output[ | ||
asheetname].columns.set_names(header_names) | ||
elif compat.PY2: | ||
output[asheetname].columns = _maybe_convert_to_string( | ||
output[asheetname].columns) | ||
|
||
# name unnamed columns | ||
unnamed = 0 | ||
for i, col_name in enumerate(frame.columns.values): | ||
if col_name is None: | ||
frame.columns.values[i] = "Unnamed: {n}".format(n=unnamed) | ||
unnamed += 1 | ||
|
||
if ret_dict: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are you not always returning a dict? |
||
return output | ||
else: | ||
return output[asheetname] |
Uh oh!
There was an error while loading. Please reload this page.