From d8a28938fab0ba53f300ee28c780b3d0a637ecf1 Mon Sep 17 00:00:00 2001 From: jnmclarty Date: Sun, 8 Feb 2015 20:44:55 -0500 Subject: [PATCH] ENH read multiple sheets in read_excel() --- doc/source/io.rst | 118 ++++++++--- doc/source/whatsnew/v0.16.0.txt | 8 + pandas/io/excel.py | 231 ++++++++++++++-------- pandas/io/tests/data/test_multisheet.xlsx | Bin 0 -> 10816 bytes pandas/io/tests/test_excel.py | 55 +++++- 5 files changed, 300 insertions(+), 112 deletions(-) create mode 100644 pandas/io/tests/data/test_multisheet.xlsx diff --git a/doc/source/io.rst b/doc/source/io.rst index f8fe6fc8a4c3a..ff031ccc88ddf 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1949,56 +1949,106 @@ module and use the same parsing code as the above to convert tabular data into a DataFrame. See the :ref:`cookbook` for some advanced strategies -Besides ``read_excel`` you can also read Excel files using the ``ExcelFile`` -class. The following two commands are equivalent: +Reading Excel Files +~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.16 + +``read_excel`` can read more than one sheet, by setting ``sheetname`` to either +a list of sheet names, a list of sheet positions, or ``None`` to read all sheets. + +.. versionadded:: 0.13 + +Sheets can be specified by sheet index or sheet name, using an integer or string, +respectively. + +.. versionadded:: 0.12 + +``ExcelFile`` has been moved to the top level namespace. + +There are two approaches to reading an Excel file: the ``read_excel`` function +and the ``ExcelFile`` class. ``read_excel`` is for reading one file +with file-specific arguments (i.e. identical data formats across sheets). +``ExcelFile`` is for reading one file with sheet-specific arguments (i.e. various data +formats across sheets). Choosing the approach is largely a question of +code readability and execution speed. + +Equivalent class and function approaches to read a single sheet: ..
code-block:: python # using the ExcelFile class xls = pd.ExcelFile('path_to_file.xls') - xls.parse('Sheet1', index_col=None, na_values=['NA']) + data = xls.parse('Sheet1', index_col=None, na_values=['NA']) # using the read_excel function - read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) + data = read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) -The class based approach can be used to read multiple sheets or to introspect -the sheet names using the ``sheet_names`` attribute. +Equivalent class and function approaches to read multiple sheets: -.. note:: +.. code-block:: python - The prior method of accessing ``ExcelFile`` has been moved from - ``pandas.io.parsers`` to the top level namespace starting from pandas - 0.12.0. + data = {} + # For when Sheet1's format differs from Sheet2 + xls = pd.ExcelFile('path_to_file.xls') + data['Sheet1'] = xls.parse('Sheet1', index_col=None, na_values=['NA']) + data['Sheet2'] = xls.parse('Sheet2', index_col=1) + + # For when Sheet1's format is identical to Sheet2 + data = read_excel('path_to_file.xls', ['Sheet1','Sheet2'], index_col=None, na_values=['NA']) + +Specifying Sheets ++++++++++++++++++ +.. _io.specifying_sheets: -.. versionadded:: 0.13 +.. note:: The second argument is ``sheetname``, not to be confused with ``ExcelFile.sheet_names``. -There are now two ways to read in sheets from an Excel file. You can provide -either the index of a sheet or its name to by passing different values for -``sheet_name``. +.. note:: An ExcelFile's attribute ``sheet_names`` provides access to a list of sheets. +- The argument ``sheetname`` allows specifying the sheet or sheets to read. +- The default value for ``sheetname`` is 0, indicating to read the first sheet. - Pass a string to refer to the name of a particular sheet in the workbook. - Pass an integer to refer to the index of a sheet. Indices follow Python convention, beginning at 0. -- The default value is ``sheet_name=0``.
This reads the first sheet. - -Using the sheet name: +- Pass a list of either strings or integers, to return a dictionary of specified sheets. +- Pass a ``None`` to return a dictionary of all available sheets. .. code-block:: python + # Returns a DataFrame read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) Using the sheet index: .. code-block:: python - read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA']) + # Returns a DataFrame + read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA']) Using all default values: .. code-block:: python + # Returns a DataFrame read_excel('path_to_file.xls') +Using None to get all sheets: + +.. code-block:: python + + # Returns a dictionary of DataFrames + read_excel('path_to_file.xls',sheetname=None) + +Using a list to get multiple sheets: + +.. code-block:: python + + # Returns the 1st and 4th sheet, as a dictionary of DataFrames. + read_excel('path_to_file.xls',sheetname=['Sheet1',3]) + +Parsing Specific Columns +++++++++++++++++++++++++ + It is often the case that users will insert columns to do temporary computations in Excel and you may not want to read in those columns. `read_excel` takes a `parse_cols` keyword to allow you to specify a subset of columns to parse. @@ -2017,26 +2067,30 @@ indices to be parsed. read_excel('path_to_file.xls', 'Sheet1', parse_cols=[0, 2, 3]) -.. note:: +Cell Converters ++++++++++++++++ - It is possible to transform the contents of Excel cells via the `converters` - option. For instance, to convert a column to boolean: +It is possible to transform the contents of Excel cells via the `converters` +option. For instance, to convert a column to boolean: - .. code-block:: python +.. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool}) + read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool}) - This options handles missing values and treats exceptions in the converters - as missing data. 
Transformations are applied cell by cell rather than to the - column as a whole, so the array dtype is not guaranteed. For instance, a - column of integers with missing values cannot be transformed to an array - with integer dtype, because NaN is strictly a float. You can manually mask - missing data to recover integer dtype: +This option handles missing values and treats exceptions in the converters +as missing data. Transformations are applied cell by cell rather than to the +column as a whole, so the array dtype is not guaranteed. For instance, a +column of integers with missing values cannot be transformed to an array +with integer dtype, because NaN is strictly a float. You can manually mask +missing data to recover integer dtype: - .. code-block:: python +.. code-block:: python - cfun = lambda x: int(x) if x else -1 - read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) + cfun = lambda x: int(x) if x else -1 + read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) + +Writing Excel Files +~~~~~~~~~~~~~~~~~~~ To write a DataFrame object to a sheet of an Excel file, you can use the ``to_excel`` instance method. The arguments are largely the same as ``to_csv`` diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index c12513e087619..bee77d35674f3 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -190,6 +190,14 @@ Enhancements - Added ``StringMethods.find()`` and ``rfind()`` which behave as the same as standard ``str`` (:issue:`9386`) - Added ``StringMethods.isnumeric`` and ``isdecimal`` which behave as the same as standard ``str`` (:issue:`9439`) +- The ``read_excel()`` function's :ref:`sheetname <io.specifying_sheets>` argument now accepts a list and ``None``, to get multiple or all sheets respectively. If more than one sheet is specified, a dictionary is returned. (:issue:`9450`) + +..
code-block:: python + + # Returns the 1st and 4th sheet, as a dictionary of DataFrames. + pd.read_excel('path_to_file.xls',sheetname=['Sheet1',3]) + +- A ``verbose`` argument has been added to ``io.read_excel()``; it defaults to False. Set to True to print sheet names as they are parsed. (:issue:`9450`) - Added ``StringMethods.ljust()`` and ``rjust()`` which behave as the same as standard ``str`` (:issue:`9352`) - ``StringMethods.pad()`` and ``center()`` now accept ``fillchar`` option to specify filling character (:issue:`9352`) - Added ``StringMethods.zfill()`` which behave as the same as standard ``str`` (:issue:`9387`) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index acec411a2e546..cab342dc339f4 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -75,8 +75,26 @@ def read_excel(io, sheetname=0, **kwds): The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. For instance, a local file could be file://localhost/path/to/workbook.xlsx - sheetname : string or int, default 0 - Name of Excel sheet or the page number of the sheet + sheetname : string, int, mixed list of strings/ints, or None, default 0 + + Strings are used for sheet names, Integers are used in zero-indexed sheet + positions. + + Lists of strings/integers are used to request multiple sheets. + + Specify None to get all sheets. + + str|int -> DataFrame is returned. + list|None -> Dict of DataFrames is returned, with keys representing sheets.
+ + Available Cases + + * Defaults to 0 -> 1st sheet as a DataFrame + * 1 -> 2nd sheet as a DataFrame + * "Sheet1" -> 1st sheet as a DataFrame + * [0,1,"Sheet5"] -> 1st, 2nd & 5th sheet as a dictionary of DataFrames + * None -> All sheets as a dictionary of DataFrames + header : int, default 0 Row to use for the column labels of the parsed DataFrame skiprows : list-like @@ -118,8 +136,9 @@ def read_excel(io, sheetname=0, **kwds): Returns ------- - parsed : DataFrame - DataFrame from the passed in Excel file + parsed : DataFrame or Dict of DataFrames + DataFrame from the passed in Excel file. See notes in sheetname argument + for more information on when a Dict of Dataframes is returned. """ if 'kind' in kwds: @@ -185,8 +204,25 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, Parameters ---------- - sheetname : string or integer - Name of Excel sheet or the page number of the sheet + sheetname : string, int, mixed list of strings/ints, or None, default 0 + + Strings are used for sheet names, Integers are used in zero-indexed sheet + positions. + + Lists of strings/integers are used to request multiple sheets. + + Specify None to get all sheets. + + str|int -> DataFrame is returned. + list|None -> Dict of DataFrames is returned, with keys representing sheets. + + Available Cases + + * Defaults to 0 -> 1st sheet as a DataFrame + * 1 -> 2nd sheet as a DataFrame + * "Sheet1" -> 1st sheet as a DataFrame + * [0,1,"Sheet5"] -> 1st, 2nd & 5th sheet as a dictionary of DataFrames + * None -> All sheets as a dictionary of DataFrames header : int, default 0 Row to use for the column labels of the parsed DataFrame skiprows : list-like @@ -223,11 +259,15 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, has_index_names : boolean, default False True if the cols defined in index_col have an index name and are not in the header + verbose : boolean, default False + Set to True to print a single statement when reading each + excel sheet. 
Returns ------- - parsed : DataFrame - DataFrame parsed from the Excel file + parsed : DataFrame or Dict of DataFrames + DataFrame from the passed in Excel file. See notes in sheetname argument + for more information on when a Dict of Dataframes is returned. """ skipfooter = kwds.pop('skipfooter', None) if skipfooter is not None: @@ -283,7 +323,7 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, has_index_names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None, convert_float=True, - **kwds): + verbose=False, **kwds): import xlrd from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN, @@ -291,81 +331,114 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0, epoch1904 = self.book.datemode + def _parse_cell(cell_contents,cell_typ): + """converts the contents of the cell into a pandas + appropriate object""" + + if cell_typ == XL_CELL_DATE: + if xlrd_0_9_3: + # Use the newer xlrd datetime handling. + cell_contents = xldate.xldate_as_datetime(cell_contents, + epoch1904) + + # Excel doesn't distinguish between dates and time, + # so we treat dates on the epoch as times only. + # Also, Excel supports 1900 and 1904 epochs. + year = (cell_contents.timetuple())[0:3] + if ((not epoch1904 and year == (1899, 12, 31)) + or (epoch1904 and year == (1904, 1, 1))): + cell_contents = datetime.time(cell_contents.hour, + cell_contents.minute, + cell_contents.second, + cell_contents.microsecond) + else: + # Use the xlrd <= 0.9.2 date handling. 
+ dt = xldate.xldate_as_tuple(cell_contents, epoch1904) + + if dt[0] < datetime.MINYEAR: + cell_contents = datetime.time(*dt[3:]) + else: + cell_contents = datetime.datetime(*dt) + + elif cell_typ == XL_CELL_ERROR: + cell_contents = np.nan + elif cell_typ == XL_CELL_BOOLEAN: + cell_contents = bool(cell_contents) + elif convert_float and cell_typ == XL_CELL_NUMBER: + # GH5394 - Excel 'numbers' are always floats + # it's a minimal perf hit and less suprising + val = int(cell_contents) + if val == cell_contents: + cell_contents = val + return cell_contents + # xlrd >= 0.9.3 can return datetime objects directly. if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): xlrd_0_9_3 = True else: xlrd_0_9_3 = False - - if isinstance(sheetname, compat.string_types): - sheet = self.book.sheet_by_name(sheetname) - else: # assume an integer if not a string - sheet = self.book.sheet_by_index(sheetname) - - data = [] - should_parse = {} - for i in range(sheet.nrows): - row = [] - for j, (value, typ) in enumerate(zip(sheet.row_values(i), - sheet.row_types(i))): - if parse_cols is not None and j not in should_parse: - should_parse[j] = self._should_parse(j, parse_cols) - - if parse_cols is None or should_parse[j]: - if typ == XL_CELL_DATE: - if xlrd_0_9_3: - # Use the newer xlrd datetime handling. - value = xldate.xldate_as_datetime(value, epoch1904) - - # Excel doesn't distinguish between dates and time, - # so we treat dates on the epoch as times only. - # Also, Excel supports 1900 and 1904 epochs. - year = (value.timetuple())[0:3] - if ((not epoch1904 and year == (1899, 12, 31)) - or (epoch1904 and year == (1904, 1, 1))): - value = datetime.time(value.hour, - value.minute, - value.second, - value.microsecond) - else: - # Use the xlrd <= 0.9.2 date handling. 
- dt = xldate.xldate_as_tuple(value, epoch1904) - - if dt[0] < datetime.MINYEAR: - value = datetime.time(*dt[3:]) - else: - value = datetime.datetime(*dt) - - elif typ == XL_CELL_ERROR: - value = np.nan - elif typ == XL_CELL_BOOLEAN: - value = bool(value) - elif convert_float and typ == XL_CELL_NUMBER: - # GH5394 - Excel 'numbers' are always floats - # it's a minimal perf hit and less suprising - val = int(value) - if val == value: - value = val - - row.append(value) - - data.append(row) - - if header is not None: - data[header] = _trim_excel_header(data[header]) - - parser = TextParser(data, header=header, index_col=index_col, - has_index_names=has_index_names, - na_values=na_values, - thousands=thousands, - parse_dates=parse_dates, - date_parser=date_parser, - skiprows=skiprows, - skip_footer=skip_footer, - chunksize=chunksize, - **kwds) - - return parser.read() + + ret_dict = False + + #Keep sheetname to maintain backwards compatibility. + if isinstance(sheetname, list): + sheets = sheetname + ret_dict = True + elif sheetname is None: + sheets = self.sheet_names + ret_dict = True + else: + sheets = [sheetname] + + #handle same-type duplicates. 
+ sheets = list(set(sheets)) + + output = {} + + for asheetname in sheets: + if verbose: + print("Reading sheet %s" % asheetname) + + if isinstance(asheetname, compat.string_types): + sheet = self.book.sheet_by_name(asheetname) + else: # assume an integer if not a string + sheet = self.book.sheet_by_index(asheetname) + + data = [] + should_parse = {} + + for i in range(sheet.nrows): + row = [] + for j, (value, typ) in enumerate(zip(sheet.row_values(i), + sheet.row_types(i))): + if parse_cols is not None and j not in should_parse: + should_parse[j] = self._should_parse(j, parse_cols) + + if parse_cols is None or should_parse[j]: + row.append(_parse_cell(value,typ)) + data.append(row) + + if header is not None: + data[header] = _trim_excel_header(data[header]) + + parser = TextParser(data, header=header, index_col=index_col, + has_index_names=has_index_names, + na_values=na_values, + thousands=thousands, + parse_dates=parse_dates, + date_parser=date_parser, + skiprows=skiprows, + skip_footer=skip_footer, + chunksize=chunksize, + **kwds) + + output[asheetname] = parser.read() + + if ret_dict: + return output + else: + return output[asheetname] + @property def sheet_names(self): diff --git a/pandas/io/tests/data/test_multisheet.xlsx b/pandas/io/tests/data/test_multisheet.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..5de07772b276aece2a8fef0fca7fc8018e1e4c2a GIT binary patch literal 10816 zcmeHN1zQ|h)^6NAI0^19L4vzWuwcOn4vo88aEIXT?(PJ4cXua&;P7>3XTO=r%>IJi zdY-E4?pxoIIOzKSZU4nHFcSAsx`*ji;5pQb@L*Z(BE{!{=8rOMGd|y8 zdUx#UrDLOFM^D$?l0AqNIr`vTrzgnvJu=+edM=yi2H|;B=XOdnuGPTBG*psY#@39- z4|%1TU~~NIp>6BI47Vq*XMy!L-f$}TBkH)mZW{zfaCMy{@0i#W)pSAyIy}eHq7ay2 z_ztem8fz)M;Y>}~7EEqo1JsV9c*=_0q_+e~r74z!!#|Aq+cn3Kpi=ALUiXl%av3V) z?(3>^wGt!tX}(U5UPp3tsHlD0t{5c~fWnwff!IFxIDP*$s4n!RQ34^4K19m5+E#rU z+3d+q=a17^ia z;zAq3P(c%BZjUhHQazy`5M|eB6zJD@_8txp0DzYl2!O(0*jlH;M0y2+pbUtN2q3oV z*cw^bF*5u-{*R^q#nk-MqnF0W%J(oK2A)ei1`XWIu128>NjnKjv=A$MdP}aL)P?7e 
z;jea35}+#M`9X+#wRt`cEwA!~9}N)y*kUUUMSsOjTJKaAn0RmF07FAzn;>RWy4j27 zID0+&BS~D!h03ullBT4wFiU!Hom6c4QltuXoZ$l=97Y~NAQo?uk4B%Y`kMZ2IoPa_ z(rH;>Wdld%QS5k%*L*_$Ark**E}4_5B+LOjedEP)_W=vy>pMIZB@-^wO8s=(cO))4 zh8CUQMN&F2p4{o>QU?@C*wAlT#>M-|Gq1fgYS}J_(_DHv;Cf01E(U!=2@BUihU@Pn zxw?ko#sH0|LjnMJ09Y_*3&wx(#L3#$T;JN-{HIa-H)p^=mJM|6fA`iJH*N;POy9X* zo8P2|O^nq?TnE<$oU3LB6kQ8)WZ4s47q4$WSh>g$;!epaNC7J}ygj_8(qb!G!tNYOs%Gd%uB0{(3t4jD(N7Mb6?NPFJjkt}Aqq&Pi!h z86Ve(Y*`vN*1*y`@u4;Lh>mv0CE2R!r3CD&Ozi7AfY%JZZb5tK*o%2fbtz^$=k;PX z5;om!!*^qr&%_~8I(n!~`JYkZWzmCubOK$pFvJGMc=6Sj;XcXG3+{Y4lW2#G{fJAU z>AP4CCoJYVbKcdjl4&YZbA>+AQ_PfQy|ou!kr3!S`0OeBfO>;Vc|*L@!!_nN&gZ{{ z{ZEkMO@sZa4_b4fpa1|A5LQ5t`ZJhHl?SbsnNZuX5BO1Axi#pF(8!=J71>1JL;A#O zxn~tce@!M)iHWd$^>h=FTda~I9rA<3DyN_0e3Yq;Z!U(%uS%#S4BD8FJhPF(f9cbe z-nEsjWUzw_ri>XBF0W7C{M|Cgp4K`o5PwqZ-c2SBhWy-S6VS# z2{u-%LJD+H7UQ@c+%x4spT+DAx1b3`TKF+yY0gDXfyT+)AUXja=4ZUoSG7r1-(o?) zUGXIvTx??c{cY_wJL;B9Z?J~u5il9 zT@}uzu>4KKH}u^`?p0cJ#+MB*%Mxl@gBMtJbj-{REMeytP%S~WeRS-q;`74k2e+L| z_|HQf5dQ?nP+u#xSdhaWf%nsC{|%1zCPtPf@pK9Ix83@A(n!oF2_T zSukEgyIB}7pHL7V=k65N8AVnnN8=8*FcT}p7-T6VCuqaBT`Md>qaTnznb*sjDh+f@ zP>wE;mhD<4TM^N#_ps#LU0ofvajeI(VKc6V7uoqM$*D~_mSEGA`^BQ+#~LmCrZExF`5GH49J0ghuLzjQD&%^)goC%55zA=P{!L zQ{;xXU%mEQ{gPlFFQpKHCaFa!$o!>^Jt+#!F>orkrHyT^0GI;HfG@_jnwVcTDXiBY zq!PS=@Mda;OQ`m=o72FNr`IXqSfLk+$D{@GdOx9oF=0}5EjR?pqs|lJI`L+hI}F9# zY{cHR5^v_aJL5s=2`vEgxY09*@X9Zq7fK)pZG>$*d-vgfu z{K@=0r<3~~>jiw=;?3NJpCGI@8|d?;jA(foO2+OFEu1#siG|_+R38?>O_g-?TF3oiF+<1e=Djf$_mdS}<`G$!F2e|v zDX%#L9Af<|0knnsFK3|x0PT-NW<3s7hpn2_uwh8fE}s1x&4hV&f^?)h!TJ!5{Wz!9 zGe_REwsfOCJuWPDDCuJkD@gtk76w67%tFf<;{xizqlP$oetQlf=sx+JUER^+3&ed1 zi>nm97sqdK^TKL^fmRp=p`T~Tg>7}o6emrCdQlSt!ioHsE=Jer3x*3V=Xl)>Nj3un zW`sOH7j5W~nBS3V9U;u=D2cW*G#R5o7sYOe(93I1~^ z?}?i|TFQWOlyJ)E$KEdyCsjR6LYE^$No3IxPu_$=Atd|nFxYS3u&$%Jpx6iu_5s&P z7GQ4{g@73LDh0}9pWRvH>8o19lJ58z4j}dNSco11ll#fIl$PX!n_uK+YKrYOX1J-V z+BLWfa@{e`_^CE{+2qR&oQ>xY7Py~03)atGv^zN3hW9I$0u=lLu}d8493wdxRw@0b zJgPFC6H8{fgTAr75y)9mEo|F0qvZ(QU|FZaX*3N{CZsglA|=Uwf*sWj)=|Nqq#l=* 
zZ(=ndP<<{|w+z0dUN(DSio99iT&_|rv2{ugh7r#;skkk1T(JC%_?j)`jHG?riE&M&Y`!tszVSOH52Pey++RvzGS6 zG|0|;`1d>sHbclz36t?%T=NpvwiL6h04$7QL@JWi7%dJt`7s9Rx(}ag1%;$JV*L{W z(eSo;xUX|!zlS-Lz!nM0H|yWAzhc81W8C@?!8gff6R|0o z6F^hWLU4o}~-FvL+dEoTU+{j0* zoK|J4M4s?|9?lU6)0+dc-)TqF2v6?rCV@tGUmd$6!@hypMEmV6i5A!QxY|`W%#Q7V zxBPS}$(|iMvu#pEhfYU%5$E$I9vvEo4(2ymm7l4~?-z{|!BWL1MV~K4>nc3FgA>KA zu*K*YSg(EZNpJBrHxa+320oEj z%4a4hwK_hT!N`&aSBZA-Izv* zAhpsk(eB~Vn8)^xwszu#sahEm2B~_oahd3f=YEtmxdE7X=6re})ohHR#s`FDRhvHe zV(4sr^3q%(Eaje}m4)J%Fp9lwl7^rN);feCX+>_j9r8CMvMwVeXx0d-M@u5rsb&Ke zEZRkWnrJ%u9~AB?%gPPjE9+M6am|nS#i@z5HZI2rO*tU1$e(@bgR!==)z`UDgC|$y z1#q_@yAJU*?6Y&;X?ObohOHzM7zT$vim>WGq!*17jyV=PSm-s)3-N4pf2<}$=A=rf z!Rexh*NLPvbZIB^L2xU4##Q2hmVP)`7Lc$4KjnX8qDM=FSF@_*UsYA&0zbt86O! zwgo%MyDl*p_r0u#d`KD;5}29Fdyw`9N{FK`Oq3xt4*cexTze}b*N^rSbb;9{Kb z{*nA(OK(12@_ny{tDg;IoxO(IQ4 z`0?$v-zK&iQzHAF3~#muD!Xq!6(d0G98BO6Om|kz;AHH{ZxSC&n3Tya;r$@GB0tFX%>Hb>b*j;>N7JJ2T!2t*C>o^FL z^_`gv?9Y-pfo&S!Uo~VMA@LQYU10;>d>uyc-bx3rog%eVGK;18lA3jD(xZKvUQb+T zI&NC@np%3h z*P~@g>CWCWnJzV-E;3W7L5)~Yn!Y@h%fj|bPG6Dx-U8jnny__zTlPtu*p(&vhH<~S zy|FcsbnYISMSSvX?PwQ_P;Qg~j!C~0W>n5@x!4ZU9sU|SLZyPt6lWXAl2=@GEEO{e zKK!P_1df&(8%h;Ot6E+P`8C+3aA384KYSIu27MM$$p#KaUP;VkWUx6iuUHatvic_(b=g(n|{4^5H5A7*8=zx3YuE_5%BDp?>YDrZ~A z_Bq!F_B4ZJPK@Y;IL-?k7@pSFgNe~b{dCQ^P^V=!M zkZO{zr8uhhm|k*guQECnz+&Y%l8ttgOQEwuiD^|5mB_O^uUf~g>DDzuq8eScHzO8M2&u(mHmGzj=ra0q-9iB&S&lhIE-AFzdtKxW+ zr(o-nbA0)BdaakVz|d!G1IN1r+?`7jyzq}gsQdoYGCZd3J!h3(N-aOz!Q#x611roER}P!Ny% zS{EvzJ1lSeSa`UM7rvpUK@z$Z>KxZ>u-_~Jc{h#*-)~|*MSKm3pBgkZgU0%TaV_E( z4XNb&!8I(YsTXr3dym1d0uw%`_qYo!(Y8DppI78&oqyBoA9f2~^>MtH;0AgC z;nDq1#3I|8Sn>#3#C+LE<-{>cs>EQ@_0`TjOhYsI#kE??=^B_Mbw#-Dj&Lxeb}?K2 z`N#0{La|I*Bw+V0f_Uu)B~#J|G`;C_qYhFf&{PSzWLu)q$zt*MC?&}(wRBxKHQQPC z{bg8`SD2ExyKLB`nKiW&O5+W4Wl%DDQD{J-X)C7G@zH>7O4ez3i1>^SNb9?7HdcRJWG zx}A+YS)OUGIfKb{1(YO>v>kO496A*u&MMvnn)e=3%+v1enx=+;e=h?75oIGPS(X#I zGU{Y^{FXC$u^FC(+LEnnU2)K(|_?bAHRAntDw96N|HyvI!mi;ZZ<*5}u@zR^= ztkP{%Gyv>C1*Z5AS;L%`;|W?kIR5V4qPVT46+# 
zQjsG2!{>q6V9u=1Jp5+VnTpmAVe<7T_cL$#gVGT9A?5aWxuQ3GQYAzqcm|t&9ty0d zK&H3CalKwjC-t% zygoiY$8FrK{vZ%QxJiD&Q3bccgVs<8awnm0Z z_O_;0#&$oAkl~mhd^Zy+)I{6H>+TcFB^r{Ld;~!r?>9uXIY?}=OGzPXYoC2HhhO-5 z+?CdaE37!HBb5cCcQd1s7hIc_f$z?+@fGMIH0H&-;ol4>EZ>bVg-SK_V%n4RsG3{7 zLyV~5B+A<(=-(jwdYSElGg-GnEOp?Jv~T`8l~O@ARbpx#eMaQ@-R2n*ZDnBFTYZ#( zN&RroyvR7O>c{vwI8WJvsvjW!|J^+m=H9r;24(!9L1`AG-yhE2#lq;fr^;2f1esRU zXFk)Hu5V65(!|mvnHr!J-%_#UX|BgxUGx%9!TQ4(*@ts(2x8^5daDQD^N8=;`{~@= zZSP!NinRi$B>WZjPe_H1rOrk(hVK=i>4y`c-^uuvRpPOkF#tu7f4Y z9lTguay)&=Us4hr2{0@2=MSIqRRdm+ag^ofvRlP&c)o%}&_SiR^sdC3H5Y)KjQn`4 zB30X49b@w;k{3%XTVwq!cPe*B;2O0r_nqF&7=m*bO<9=P2~_7@v8kGT#LT`NjJxPb289G`?f!7ar&KG&PfO1j86 zMc$sTOs*%Ls(*Bz4~{v??jYYbxytp~lL1ny0Qheyv8Tj>T3@OY4vUP3V+7Y@MlIP4bko#NPRo(|R?i_rNCuN4J;lvme%o_V2>x z1ACC>4XO-+Z}!y!jmHE!2;DPz_uE6Tx*o_7TenzX+nfax)@*H(oJG;amPK2eBts1Q zc<(sQ^frPvF9hCw8(d-$HWFva#_j<458h4(>m`>j78H2hbyExmGCh%S=bPPP%%~4l$`ld z8tu#hmvZt7d?c=;&MYGvk9fEq5rfS|cIeWIPEnuS*BBJy+~-I=3&F3c&SEwu1nKZg z8d2n1z6OTMw?n0(+xlTTGc@ft!t{M2$%ar2firEl;05G%D)@n)zRrjHN?8F@vXX)1 zXS=5ye=f6dhxR*3@;Noi{l_zNK{ti1?xT~l{5bQ)jUKZ+~VW$TK z{ttDaDf@j(jqL)Jk3c0`h#$y-XW=WK@B;r>}TWTn#s}$7HMdl=aT;*8K zI5ej_lHOr-@JH7+^VPQ>5aopG(KBG@Q4uYuH38K^AD$XQ$Fl`kvRL?_)j}~c8|Ti~ zi!3AHShzZU>A}lIh2vBevn?~AzaNr`K%3A~NJ531qPFD<%tt=))E4GlST)Xk{QiDigS zEif|<4?@B}UUG&c*MUDmmf^20v%#%9GMV+s`+{Ka0*3>gDFgRd&a!?bHRCm_qelp&mi3%pe{R@_%QDUg6!YH9PSS6 z3H~_(DgmN`UL45VG0?Lx5H`^>wfYHW`?yUxKhV+NF-=9eGCw_k3}QQI1AWs&5sp^`476M)zF!S^O$e4*MCuPR~rq;uF`Z|7*&! 
z^7={FGj5xObNOs1a3217iy>#5jndKE=$bL|O%h!g9!Cl^&+JRqaCtV*tSWiiPF#X~ z7P19niOTUP-3+j87A@{8g(>WWtU6%1hS`cCRNJ*&W{`)VH7|_oEy2dMAPpfWI3Ezw zoe}LKT%^;f3Orbp94lT;S^s^~=leia>_QT7LHma_qbM;>9HOuqY)&`93&-y6g&;{tKa;ptK^EIJqIcK;b$)GBCa_7~gV%tH zhNVsvQl0xA`b7Gw(@5<%nBjVKIdNA~;5Ag_ZG@c^hb(kSMz6^<#pz18LCrG1zcw&% zdXPi<&mC_6=G?!}|IqKIApKVbe{H(?x8TonIp~&uYQgzc@UM*se-@kud6oaUE#X%^ zzgD0B&_n_9U4N-X|0?`zt?&=wI#7!dNch*v;japQP1OFOzyod{C}+QtLR^Su|EU>fL%fW;BTJUui}5TsecyNBKec}?-uq~>0cMIKjf*&fBWBm ZS<)1wp+GqO8UG*wq(D&Lp!oUi{{Upfs{8-| literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 5909f8af0e5dd..95f072835f2b6 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -80,6 +80,7 @@ def setUp(self): self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') self.xlsx1 = os.path.join(self.dirpath, 'test.xlsx') + self.multisheet = os.path.join(self.dirpath, 'test_multisheet.xlsx') self.frame = _frame.copy() self.frame2 = _frame2.copy() self.tsframe = _tsframe.copy() @@ -423,7 +424,59 @@ def test_reader_converters(self): for path in (xls_path, xlsx_path): actual = read_excel(path, 'Sheet1', converters=converters) tm.assert_frame_equal(actual, expected) - + + def test_reading_all_sheets(self): + # Test reading all sheetnames by setting sheetname to None, + # Ensure a dict is returned. + # See PR #9450 + + _skip_if_no_xlrd() + + dfs = read_excel(self.multisheet,sheetname=None) + expected_keys = ['Alpha','Beta','Charlie'] + tm.assert_contains_all(expected_keys,dfs.keys()) + + def test_reading_multiple_specific_sheets(self): + # Test reading specific sheetnames by specifying a mixed list + # of integers and strings, and confirm that duplicated sheet + # references (positions/names) are removed properly. + + # Ensure a dict is returned + # See PR #9450 + _skip_if_no_xlrd() + + #Explicitly request duplicates. Only the set should be returned. 
+ expected_keys = [2,'Charlie','Charlie'] + dfs = read_excel(self.multisheet,sheetname=expected_keys) + expected_keys = list(set(expected_keys)) + tm.assert_contains_all(expected_keys,dfs.keys()) + assert len(expected_keys) == len(dfs.keys()) + + def test_creating_and_reading_multiple_sheets(self): + # Test reading multiple sheets, from a runtime created excel file + # with multiple sheets. + # See PR #9450 + + _skip_if_no_xlrd() + _skip_if_no_xlwt() + + def tdf(sheetname): + d, i = [11,22,33], [1,2,3] + return DataFrame(d,i,columns=[sheetname]) + + sheets = ['AAA','BBB','CCC'] + + dfs = [tdf(s) for s in sheets] + dfs = dict(zip(sheets,dfs)) + + with ensure_clean('.xlsx') as pth: + with ExcelWriter(pth) as ew: + for sheetname, df in dfs.iteritems(): + df.to_excel(ew,sheetname) + dfs_returned = pd.read_excel(pth,sheetname=sheets) + for s in sheets: + tm.assert_frame_equal(dfs[s],dfs_returned[s]) + def test_reader_seconds(self): # Test reading times with and without milliseconds. GH5945. _skip_if_no_xlrd()