diff --git a/pandas/core/generic.py b/pandas/core/generic.py index eb14a26e75a9c..e7c03de879e8a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2008,45 +2008,45 @@ def _repr_data_resource_(self): Parameters ---------- - excel_writer : string or ExcelWriter object + excel_writer : str or ExcelWriter object File path or existing ExcelWriter. - sheet_name : string, default 'Sheet1' + sheet_name : str, default 'Sheet1' Name of sheet which will contain DataFrame. - na_rep : string, default '' + na_rep : str, default '' Missing data representation. - float_format : string, optional + float_format : str, optional Format string for floating point numbers. For example ``float_format="%%.2f"`` will format 0.1234 to 0.12. - columns : sequence or list of string, optional + columns : sequence or list of str, optional Columns to write. - header : boolean or list of string, default True - Write out the column names. If a list of strings is given it is + header : bool or list of str, default True + Write out the column names. If a list of str is given it is assumed to be aliases for the column names. - index : boolean, default True + index : bool, default True Write row names (index). - index_label : string or sequence, optional + index_label : str or sequence, optional Column label for index column(s) if desired. If not specified, and `header` and `index` are True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. - startrow : integer, default 0 + startrow : int, default 0 Upper left cell row to dump data frame. - startcol : integer, default 0 + startcol : int, default 0 Upper left cell column to dump data frame. - engine : string, optional + engine : str, optional Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and ``io.excel.xlsm.writer``.
- merge_cells : boolean, default True + merge_cells : bool, default True Write MultiIndex and Hierarchical Rows as merged cells. - encoding : string, optional + encoding : str, optional Encoding of the resulting excel file. Only necessary for xlwt, other writers support unicode natively. - inf_rep : string, default 'inf' + inf_rep : str, default 'inf' Representation for infinity (there is no native representation for infinity in Excel). - verbose : boolean, default True + verbose : bool, default True Display more information in the error logs. - freeze_panes : tuple of integer (length 2), optional + freeze_panes : tuple of int (length 2), optional Specifies the one-based bottommost row and rightmost column that is to be frozen. @@ -2054,8 +2054,10 @@ def _repr_data_resource_(self): See Also -------- - read_excel - ExcelWriter + to_csv : Write DataFrame to a comma-separated values (csv) file. + ExcelWriter : Class for writing DataFrame objects into excel sheets. + read_excel : Read an Excel file into a pandas DataFrame. + read_csv : Read a comma-separated values (csv) file into DataFrame. Notes ----- @@ -2071,8 +2073,8 @@ def _repr_data_resource_(self): Create, write to and save a workbook: >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']], - ... index=['row 1', 'row 2'], - ... columns=['col 1', 'col 2']) + ... index=['row 1', 'row 2'], + ... columns=['col 1', 'col 2']) >>> df1.to_excel("output.xlsx") # doctest: +SKIP To specify the sheet name: @@ -2166,7 +2168,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None, double_precision : int, default 10 The number of decimal places to use when encoding floating point values. - force_ascii : boolean, default True + force_ascii : bool, default True Force encoded string to be ASCII. 
date_unit : string, default 'ms' (milliseconds) The time unit to encode to, governs timestamp and ISO8601 @@ -2176,7 +2178,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None, Handler to call if object cannot otherwise be converted to a suitable format for JSON. Should receive a single argument which is the object to convert and return a serialisable object. - lines : boolean, default False + lines : bool, default False If 'orient' is 'records' write out line delimited json format. Will throw ValueError if incorrect 'orient' since others are not list like. @@ -2192,7 +2194,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None, .. versionadded:: 0.21.0 .. versionchanged:: 0.24.0 'infer' option added and set to default - index : boolean, default True + index : bool, default True Whether to include the index values in the JSON string. Not including the index (``index=False``) is only supported when orient is 'split' or 'table'. @@ -2375,7 +2377,7 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): ---------- path : string File path, buffer-like, or None if None, return generated string - append : boolean whether to append to an existing msgpack + append : bool whether to append to an existing msgpack (default is False) compress : type of compressor (zlib or blosc), default to None (no compression) @@ -2410,7 +2412,7 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True, * replace: Drop the table before inserting new values. * append: Insert new values to the existing table. - index : boolean, default True + index : bool, default True Write DataFrame index as a column. Uses `index_label` as the column name in the table. 
index_label : string or sequence, default None diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 9399f36072e5f..3a7c39ec65309 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -39,71 +39,63 @@ _writers = {} _read_excel_doc = """ -Read an Excel table into a pandas DataFrame +Read an Excel file into a pandas DataFrame. + +Support both `xls` and `xlsx` file extensions from a local filesystem or URL. +Support an option to read a single sheet or a list of sheets. Parameters ---------- -io : string, path object (pathlib.Path or py._path.local.LocalPath), - file-like object, pandas ExcelFile, or xlrd workbook. +io : str, file descriptor, pathlib.Path, ExcelFile or xlrd.Book The string could be a URL. Valid URL schemes include http, ftp, s3, gcs, and file. For file URLs, a host is expected. For instance, a local - file could be file://localhost/path/to/workbook.xlsx -sheet_name : string, int, mixed list of strings/ints, or None, default 0 - - Strings are used for sheet names, Integers are used in zero-indexed - sheet positions. - - Lists of strings/integers are used to request multiple sheets. - - Specify None to get all sheets. - - str|int -> DataFrame is returned. - list|None -> Dict of DataFrames is returned, with keys representing - sheets. - - Available Cases - - * Defaults to 0 -> 1st sheet as a DataFrame - * 1 -> 2nd sheet as a DataFrame - * "Sheet1" -> 1st sheet as a DataFrame - * [0,1,"Sheet5"] -> 1st, 2nd & 5th sheet as a dictionary of DataFrames - * None -> All sheets as a dictionary of DataFrames - -sheetname : string, int, mixed list of strings/ints, or None, default 0 - - .. deprecated:: 0.21.0 - Use `sheet_name` instead - -header : int, list of ints, default 0 + file could be file://localhost/path/to/workbook.xlsx. +sheet_name : str, int, list, or None, default 0 + Strings are used for sheet names. Integers are used in zero-indexed + sheet positions. Lists of strings/integers are used to request + multiple sheets. Specify None to get all sheets.
+ + Available cases: + + * Defaults to ``0``: 1st sheet as a `DataFrame` + * ``1``: 2nd sheet as a `DataFrame` + * ``"Sheet1"``: Load sheet with name "Sheet1" + * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" + as a dict of `DataFrame` + * None: All sheets. + +header : int, list of int, default 0 Row (0-indexed) to use for the column labels of the parsed DataFrame. If a list of integers is passed those row positions will be combined into a ``MultiIndex``. Use None if there is no header. names : array-like, default None List of column names to use. If file contains no header row, - then you should explicitly pass header=None -index_col : int, list of ints, default None + then you should explicitly pass header=None. +index_col : int, list of int, default None Column (0-indexed) to use as the row labels of the DataFrame. Pass None if there is no such column. If a list is passed, those columns will be combined into a ``MultiIndex``. If a subset of data is selected with ``usecols``, index_col is based on the subset. parse_cols : int or list, default None + Alias of `usecols`. .. deprecated:: 0.21.0 - Pass in `usecols` instead. + Use `usecols` instead. usecols : int, str, list-like, or callable default None - * If None, then parse all columns, - * If int, then indicates last column to be parsed + Return a subset of the columns. + * If None, then parse all columns. + * If int, then indicates last column to be parsed. .. deprecated:: 0.24.0 - Pass in a list of ints instead from 0 to `usecols` inclusive. + Pass in a list of int instead from 0 to `usecols` inclusive. - * If string, then indicates comma separated list of Excel column letters + * If str, then indicates comma separated list of Excel column letters and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of both sides. - * If list of ints, then indicates list of column numbers to be parsed. - * If list of strings, then indicates list of column names to be parsed. 
+ * If list of int, then indicates list of column numbers to be parsed. + * If list of str, then indicates list of column names to be parsed. .. versionadded:: 0.24.0 @@ -112,8 +104,8 @@ .. versionadded:: 0.24.0 -squeeze : boolean, default False - If the parsed data only contains one column then return a Series +squeeze : bool, default False + If the parsed data only contains one column then return a Series. dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} Use `object` to preserve data as stored in Excel and not interpret dtype. @@ -122,28 +114,28 @@ .. versionadded:: 0.20.0 -engine : string, default None +engine : str, default None If io is not a buffer or path, this must be set to identify io. - Acceptable values are None or xlrd + Acceptable values are None or xlrd. converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one input argument, the Excel cell content, and return the transformed content. true_values : list, default None - Values to consider as True + Values to consider as True. .. versionadded:: 0.19.0 false_values : list, default None - Values to consider as False + Values to consider as False. .. versionadded:: 0.19.0 skiprows : list-like - Rows to skip at the beginning (0-indexed) + Rows to skip at the beginning (0-indexed). nrows : int, default None - Number of rows to parse + Number of rows to parse. .. versionadded:: 0.23.0 @@ -154,8 +146,34 @@ keep_default_na : bool, default True If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they're appended to. -verbose : boolean, default False - Indicate number of NA values placed in non-numeric columns +verbose : bool, default False + Indicate number of NA values placed in non-numeric columns.
+parse_dates : bool, list-like, or dict, default False + The behavior is as follows: + + * bool. If True -> try parsing the index. + * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as + a single date column. + * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call + result 'foo'. + + If a column or index contains an unparseable date, the entire column or + index will be returned unaltered as an object data type. For non-standard + datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. + + Note: A fast-path exists for iso8601-formatted dates. +date_parser : function, optional + Function to use for converting a sequence of string columns to an array of + datetime instances. The default uses ``dateutil.parser.parser`` to do the + conversion. Pandas will try to call `date_parser` in three different ways, + advancing to the next if an exception occurs: 1) Pass one or more arrays + (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the + string values from the columns defined by `parse_dates` into a single array + and pass that; and 3) call `date_parser` once for each row using one or + more strings (corresponding to the columns defined by `parse_dates`) as + arguments. thousands : str, default None Thousands separator for parsing string columns to numeric. Note that this parameter is only necessary for columns stored as TEXT in Excel, @@ -166,96 +184,89 @@ argument to indicate comments in the input file. Any data between the comment string and the end of the current line is ignored. skip_footer : int, default 0 + Alias of `skipfooter`. .. deprecated:: 0.23.0 - Pass in `skipfooter` instead. + Use `skipfooter` instead. skipfooter : int, default 0 - Rows at the end to skip (0-indexed) -convert_float : boolean, default True - convert integral floats to int (i.e., 1.0 --> 1).
If False, all numeric + Rows at the end to skip (0-indexed). +convert_float : bool, default True + Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric data will be read in as floats: Excel stores all numbers as floats - internally -mangle_dupe_cols : boolean, default True + internally. +mangle_dupe_cols : bool, default True Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 'X'...'X'. Passing in False will cause data to be overwritten if there are duplicate names in the columns. +**kwds : optional + Optional keyword arguments can be passed to ``TextFileReader``. Returns ------- -parsed : DataFrame or Dict of DataFrames +DataFrame or dict of DataFrames DataFrame from the passed in Excel file. See notes in sheet_name argument for more information on when a dict of DataFrames is returned. -Examples +See Also -------- +to_excel : Write DataFrame to an Excel file. +to_csv : Write DataFrame to a comma-separated values (csv) file. +read_csv : Read a comma-separated values (csv) file into DataFrame. +read_fwf : Read a table of fixed-width formatted lines into DataFrame. -An example DataFrame written to a local file - ->>> df_out = pd.DataFrame([('string1', 1), -... ('string2', 2), -... ('string3', 3)], -... columns=['Name', 'Value']) ->>> df_out - Name Value -0 string1 1 -1 string2 2 -2 string3 3 ->>> df_out.to_excel('tmp.xlsx') - +Examples +-------- The file can be read using the file name as string or an open file object: ->>> pd.read_excel('tmp.xlsx') - Name Value -0 string1 1 -1 string2 2 -2 string3 3 +>>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP + Name Value +0 string1 1 +1 string2 2 +2 #Comment 3 ->>> pd.read_excel(open('tmp.xlsx','rb')) - Name Value -0 string1 1 -1 string2 2 -2 string3 3 +>>> pd.read_excel(open('tmp.xlsx', 'rb'), +... 
sheet_name='Sheet3') # doctest: +SKIP + Unnamed: 0 Name Value +0 0 string1 1 +1 1 string2 2 +2 2 #Comment 3 Index and header can be specified via the `index_col` and `header` arguments ->>> pd.read_excel('tmp.xlsx', index_col=None, header=None) - 0 1 2 -0 NaN Name Value -1 0.0 string1 1 -2 1.0 string2 2 -3 2.0 string3 3 +>>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP + 0 1 2 +0 NaN Name Value +1 0.0 string1 1 +2 1.0 string2 2 +3 2.0 #Comment 3 Column types are inferred but can be explicitly specified ->>> pd.read_excel('tmp.xlsx', dtype={'Name':str, 'Value':float}) - Name Value -0 string1 1.0 -1 string2 2.0 -2 string3 3.0 +>>> pd.read_excel('tmp.xlsx', index_col=0, +... dtype={'Name': str, 'Value': float}) # doctest: +SKIP + Name Value +0 string1 1.0 +1 string2 2.0 +2 #Comment 3.0 True, False, and NA values, and thousands separators have defaults, but can be explicitly specified, too. Supply the values you would like as strings or lists of strings! ->>> pd.read_excel('tmp.xlsx', -... na_values=['string1', 'string2']) - Name Value -0 NaN 1 -1 NaN 2 -2 string3 3 +>>> pd.read_excel('tmp.xlsx', index_col=0, +... 
na_values=['string1', 'string2']) # doctest: +SKIP + Name Value +0 NaN 1 +1 NaN 2 +2 #Comment 3 Comment lines in the excel input file can be skipped using the `comment` kwarg ->>> df = pd.DataFrame({'a': ['1', '#2'], 'b': ['2', '3']}) ->>> df.to_excel('tmp.xlsx', index=False) ->>> pd.read_excel('tmp.xlsx') - a b -0 1 2 -1 #2 3 - ->>> pd.read_excel('tmp.xlsx', comment='#') - a b -0 1 2 +>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP + Name Value +0 string1 1.0 +1 string2 2.0 +2 None NaN """ @@ -302,6 +313,7 @@ def read_excel(io, header=0, names=None, index_col=None, + parse_cols=None, usecols=None, squeeze=False, dtype=None, @@ -312,10 +324,13 @@ def read_excel(io, skiprows=None, nrows=None, na_values=None, + keep_default_na=True, + verbose=False, parse_dates=False, date_parser=None, thousands=None, comment=None, + skip_footer=0, skipfooter=0, convert_float=True, mangle_dupe_cols=True, @@ -348,6 +363,8 @@ def read_excel(io, skiprows=skiprows, nrows=nrows, na_values=na_values, + keep_default_na=keep_default_na, + verbose=verbose, parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, @@ -804,7 +821,7 @@ def _maybe_convert_usecols(usecols): if is_integer(usecols): warnings.warn(("Passing in an integer for `usecols` has been " - "deprecated. Please pass in a list of ints from " + "deprecated. Please pass in a list of int from " "0 to `usecols` inclusive instead."), FutureWarning, stacklevel=2) return lrange(usecols + 1) @@ -880,7 +897,7 @@ def _fill_mi_header(row, control_row): ---------- row : list List of items in a single row. - control_row : list of boolean + control_row : list of bool Helps to determine if particular column is in same parent index as the previous value. Used to stop propagation of empty cells between different indexes.