diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index c3a0e3599a0f9..32b548e5f32f1 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -135,7 +135,7 @@ Other Enhancements - Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`) - :func:`DataFrame.corrwith` now silently drops non-numeric columns when passed a Series. Before, an exception was raised (:issue:`18570`). - :class:`IntervalIndex` now supports time zone aware ``Interval`` objects (:issue:`18537`, :issue:`18538`) - +- :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`) .. _whatsnew_0220.api_breaking: @@ -188,6 +188,7 @@ Other API Changes - :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) - The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`) - Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`) +- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`) .. _whatsnew_0220.deprecations: diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 882130bedcbf0..a1dcd52b61270 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -70,6 +70,7 @@ * None -> All sheets as a dictionary of DataFrames sheetname : string, int, mixed list of strings/ints, or None, default 0 + .. deprecated:: 0.21.0 Use `sheet_name` instead @@ -77,24 +78,29 @@ Row (0-indexed) to use for the column labels of the parsed DataFrame. If a list of integers is passed those row positions will be combined into a ``MultiIndex``. Use None if there is no header.
-skiprows : list-like - Rows to skip at the beginning (0-indexed) -skip_footer : int, default 0 - Rows at the end to skip (0-indexed) +names : array-like, default None + List of column names to use. If file contains no header row, + then you should explicitly pass header=None index_col : int, list of ints, default None Column (0-indexed) to use as the row labels of the DataFrame. Pass None if there is no such column. If a list is passed, those columns will be combined into a ``MultiIndex``. If a subset of data is selected with ``usecols``, index_col is based on the subset. -names : array-like, default None - List of column names to use. If file contains no header row, - then you should explicitly pass header=None -converters : dict, default None - Dict of functions for converting values in certain columns. Keys can - either be integers or column labels, values are functions that take one - input argument, the Excel cell content, and return the transformed - content. +parse_cols : int or list, default None + + .. deprecated:: 0.21.0 + Pass in `usecols` instead. + +usecols : int, str or list, default None + * If None then parse all columns, + * If int then indicates last column to be parsed + * If list of ints then indicates list of column numbers to be parsed + * If string then indicates comma separated list of Excel column letters and + column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of + both sides. +squeeze : boolean, default False + If the parsed data only contains one column then return a Series dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} Use `object` to preserve data as stored in Excel and not interpret dtype. @@ -103,6 +109,14 @@ .. versionadded:: 0.20.0 +engine : string, default None + If io is not a buffer or path, this must be set to identify io.
+ Acceptable values are None or xlrd +converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the Excel cell content, and return the transformed + content. true_values : list, default None Values to consider as True @@ -113,36 +127,29 @@ .. versionadded:: 0.19.0 -parse_cols : int or list, default None - .. deprecated:: 0.21.0 - Pass in `usecols` instead. +skiprows : list-like + Rows to skip at the beginning (0-indexed) +nrows : int, default None + Number of rows to parse + + .. versionadded:: 0.22.0 -usecols : int or list, default None - * If None then parse all columns, - * If int then indicates last column to be parsed - * If list of ints then indicates list of column numbers to be parsed - * If string then indicates comma separated list of Excel column letters and - column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of - both sides. -squeeze : boolean, default False - If the parsed data only contains one column then return a Series na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70) + """'. -thousands : str, default None - Thousands separator for parsing string columns to numeric. Note that - this parameter is only necessary for columns stored as TEXT in Excel, - any numeric columns will automatically be parsed, regardless of display - format. keep_default_na : bool, default True If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they're appended to. verbose : boolean, default False Indicate number of NA values placed in non-numeric columns -engine: string, default None - If io is not a buffer or path, this must be set to identify io. 
- Acceptable values are None or xlrd +thousands : str, default None + Thousands separator for parsing string columns to numeric. Note that + this parameter is only necessary for columns stored as TEXT in Excel, + any numeric columns will automatically be parsed, regardless of display + format. +skip_footer : int, default 0 + Rows at the end to skip (0-indexed) convert_float : boolean, default True convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric data will be read in as floats: Excel stores all numbers as floats @@ -193,12 +200,27 @@ def get_writer(engine_name): @Appender(_read_excel_doc) @deprecate_kwarg("parse_cols", "usecols") -def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, - index_col=None, names=None, usecols=None, parse_dates=False, - date_parser=None, na_values=None, thousands=None, - convert_float=True, converters=None, dtype=None, - true_values=None, false_values=None, engine=None, - squeeze=False, **kwds): +def read_excel(io, + sheet_name=0, + header=0, + names=None, + index_col=None, + usecols=None, + squeeze=False, + dtype=None, + engine=None, + converters=None, + true_values=None, + false_values=None, + skiprows=None, + nrows=None, + na_values=None, + parse_dates=False, + date_parser=None, + thousands=None, + skip_footer=0, + convert_float=True, + **kwds): # Can't use _deprecate_kwarg since sheetname=None has a special meaning if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds: @@ -213,12 +235,25 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, io = ExcelFile(io, engine=engine) return io._parse_excel( - sheetname=sheet_name, header=header, skiprows=skiprows, names=names, - index_col=index_col, usecols=usecols, parse_dates=parse_dates, - date_parser=date_parser, na_values=na_values, thousands=thousands, - convert_float=convert_float, skip_footer=skip_footer, - converters=converters, dtype=dtype, true_values=true_values, - false_values=false_values, 
squeeze=squeeze, **kwds) + sheetname=sheet_name, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + squeeze=squeeze, + dtype=dtype, + converters=converters, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + thousands=thousands, + skip_footer=skip_footer, + convert_float=convert_float, + **kwds) class ExcelFile(object): @@ -282,11 +317,25 @@ def __init__(self, io, **kwds): def __fspath__(self): return self._io - def parse(self, sheet_name=0, header=0, skiprows=None, skip_footer=0, - names=None, index_col=None, usecols=None, parse_dates=False, - date_parser=None, na_values=None, thousands=None, - convert_float=True, converters=None, true_values=None, - false_values=None, squeeze=False, **kwds): + def parse(self, + sheet_name=0, + header=0, + names=None, + index_col=None, + usecols=None, + squeeze=False, + converters=None, + true_values=None, + false_values=None, + skiprows=None, + nrows=None, + na_values=None, + parse_dates=False, + date_parser=None, + thousands=None, + skip_footer=0, + convert_float=True, + **kwds): """ Parse specified sheet(s) into a DataFrame @@ -294,19 +343,23 @@ def parse(self, sheet_name=0, header=0, skiprows=None, skip_footer=0, docstring for more info on accepted parameters """ - return self._parse_excel(sheetname=sheet_name, header=header, - skiprows=skiprows, names=names, + return self._parse_excel(sheetname=sheet_name, + header=header, + names=names, index_col=index_col, usecols=usecols, + squeeze=squeeze, + converters=converters, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, parse_dates=parse_dates, - date_parser=date_parser, na_values=na_values, + date_parser=date_parser, thousands=thousands, skip_footer=skip_footer, convert_float=convert_float, - converters=converters, - true_values=true_values, - false_values=false_values, - 
squeeze=squeeze, **kwds) def _should_parse(self, i, usecols): @@ -342,12 +395,26 @@ def _excel2num(x): else: return i in usecols - def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None, - skip_footer=0, index_col=None, usecols=None, - parse_dates=False, date_parser=None, na_values=None, - thousands=None, convert_float=True, true_values=None, - false_values=None, verbose=False, dtype=None, - squeeze=False, **kwds): + def _parse_excel(self, + sheetname=0, + header=0, + names=None, + index_col=None, + usecols=None, + squeeze=False, + dtype=None, + true_values=None, + false_values=None, + skiprows=None, + nrows=None, + na_values=None, + verbose=False, + parse_dates=False, + date_parser=None, + thousands=None, + skip_footer=0, + convert_float=True, + **kwds): skipfooter = kwds.pop('skipfooter', None) if skipfooter is not None: @@ -509,21 +576,24 @@ def _parse_cell(cell_contents, cell_typ): # GH 12292 : error when read one empty column from excel file try: - parser = TextParser(data, header=header, index_col=index_col, + parser = TextParser(data, + header=header, + index_col=index_col, has_index_names=has_index_names, - na_values=na_values, - thousands=thousands, - parse_dates=parse_dates, - date_parser=date_parser, + squeeze=squeeze, + dtype=dtype, true_values=true_values, false_values=false_values, skiprows=skiprows, + nrows=nrows, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + thousands=thousands, skipfooter=skip_footer, - squeeze=squeeze, - dtype=dtype, **kwds) - output[asheetname] = parser.read() + output[asheetname] = parser.read(nrows=nrows) if names is not None: output[asheetname].columns = names if not squeeze or isinstance(output[asheetname], DataFrame): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 83b1d8ec1a070..a04d77de08950 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -440,7 +440,7 @@ def _read(filepath_or_buffer, kwds): # Extract some of the arguments (pass chunksize on). 
iterator = kwds.get('iterator', False) chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1) - nrows = _validate_integer('nrows', kwds.get('nrows', None)) + nrows = kwds.get('nrows', None) # Check for duplicates in names. _validate_names(kwds.get("names", None)) @@ -1062,6 +1062,8 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): + nrows = _validate_integer('nrows', nrows) + if nrows is not None: if self.options.get('skipfooter'): raise ValueError('skipfooter not supported for iteration') diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 96117b3c21a9b..3fd55bcad677a 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -1017,6 +1017,33 @@ def test_read_excel_skiprows_list(self): 'skiprows_list', skiprows=np.array([0, 2])) tm.assert_frame_equal(actual, expected) + def test_read_excel_nrows(self): + # GH 16645 + num_rows_to_pull = 5 + actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), + nrows=num_rows_to_pull) + expected = pd.read_excel(os.path.join(self.dirpath, + 'test1' + self.ext)) + expected = expected[:num_rows_to_pull] + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_greater_than_nrows_in_file(self): + # GH 16645 + expected = pd.read_excel(os.path.join(self.dirpath, + 'test1' + self.ext)) + num_records_in_file = len(expected) + num_rows_to_pull = num_records_in_file + 10 + actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), + nrows=num_rows_to_pull) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_non_integer_parameter(self): + # GH 16645 + msg = "'nrows' must be an integer >=0" + with tm.assert_raises_regex(ValueError, msg): + pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), + nrows='5') + def test_read_excel_squeeze(self): # GH 12157 f = os.path.join(self.dirpath, 'test_squeeze' + self.ext)