From 669942720baed986ea26b0e52161984ebe0d193e Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Mon, 20 Apr 2020 11:50:17 +0100 Subject: [PATCH 1/8] add subs in info.py --- pandas/io/formats/info.py | 117 ++++++-------------------------------- 1 file changed, 18 insertions(+), 99 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 7b5e553cf394e..b9befa3eec441 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,7 +1,10 @@ import sys +from typing import IO, Optional, Union from pandas._config import get_option +from pandas._typing import FrameOrSeries + from pandas.io.formats import format as fmt from pandas.io.formats.printing import pprint_thing @@ -11,18 +14,21 @@ def _put_str(s, space): def info( - data, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None + data: FrameOrSeries, + verbose: Optional[bool] = None, + buf: Optional[IO[str]] = None, + max_cols: Optional[int] = None, + memory_usage: Optional[Union[bool, str]] = None, + null_counts: Optional[bool] = None, ) -> None: """ - Print a concise summary of a DataFrame. + Print a concise summary of a %(klass)s. - This method prints information about a DataFrame including - the index dtype and column dtypes, non-null values and memory usage. + This method prints information about a %(klass)s including + the index dtype%(type_sub)s, non-null values and memory usage. Parameters ---------- - data : DataFrame - DataFrame to print information about. verbose : bool, optional Whether to print the full summary. By default, the setting in ``pandas.options.display.max_info_columns`` is followed. @@ -30,16 +36,11 @@ def info( Where to send the output. By default, the output is printed to sys.stdout. Pass a writable buffer if you need to further process the output. - max_cols : int, optional - When to switch from the verbose to the truncated output. If the - DataFrame has more than `max_cols` columns, the truncated output - is used. By default, the setting in - ``pandas.options.display.max_info_columns`` is used. + %(max_cols_sub)s memory_usage : bool, str, optional - Specifies whether total memory usage of the DataFrame + Specifies whether total memory usage of the %(klass)s elements (including the index) should be displayed. By default, this follows the ``pandas.options.display.memory_usage`` setting. - True always show memory usage. False never shows memory usage. A value of 'deep' is equivalent to "True with deep introspection". Memory usage is shown in human-readable units (base-2 @@ -50,7 +51,7 @@ def info( at the cost of computational resources. null_counts : bool, optional Whether to show the non-null counts. By default, this is shown - only if the frame is smaller than + only if the %(klass)s is smaller than ``pandas.options.display.max_info_rows`` and ``pandas.options.display.max_info_columns``. A value of True always shows the counts, and False never shows the counts. @@ -58,97 +59,15 @@ def info( Returns ------- None - This method prints a summary of a DataFrame and returns None. + This method prints a summary of a %(klass)s and returns None. See Also -------- - DataFrame.describe: Generate descriptive statistics of DataFrame - columns. - DataFrame.memory_usage: Memory usage of DataFrame columns. + %(see_also_sub)s Examples -------- - >>> int_values = [1, 2, 3, 4, 5] - >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] - >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] - >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, - ... "float_col": float_values}) - >>> df - int_col text_col float_col - 0 1 alpha 0.00 - 1 2 beta 0.25 - 2 3 gamma 0.50 - 3 4 delta 0.75 - 4 5 epsilon 1.00 - - Prints information of all columns: - - >>> df.info(verbose=True) - - RangeIndex: 5 entries, 0 to 4 - Data columns (total 3 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 int_col 5 non-null int64 - 1 text_col 5 non-null object - 2 float_col 5 non-null float64 - dtypes: float64(1), int64(1), object(1) - memory usage: 248.0+ bytes - - Prints a summary of columns count and its dtypes but not per column - information: - - >>> df.info(verbose=False) - - RangeIndex: 5 entries, 0 to 4 - Columns: 3 entries, int_col to float_col - dtypes: float64(1), int64(1), object(1) - memory usage: 248.0+ bytes - - Pipe output of DataFrame.info to buffer instead of sys.stdout, get - buffer content and writes to a text file: - - >>> import io - >>> buffer = io.StringIO() - >>> df.info(buf=buffer) - >>> s = buffer.getvalue() - >>> with open("df_info.txt", "w", - ... encoding="utf-8") as f: # doctest: +SKIP - ... f.write(s) - 260 - - The `memory_usage` parameter allows deep introspection mode, specially - useful for big DataFrames and fine-tune memory optimization: - - >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) - >>> df = pd.DataFrame({ - ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) - ... }) - >>> df.info() - - RangeIndex: 1000000 entries, 0 to 999999 - Data columns (total 3 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 column_1 1000000 non-null object - 1 column_2 1000000 non-null object - 2 column_3 1000000 non-null object - dtypes: object(3) - memory usage: 22.9+ MB - - >>> df.info(memory_usage='deep') - - RangeIndex: 1000000 entries, 0 to 999999 - Data columns (total 3 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 column_1 1000000 non-null object - 1 column_2 1000000 non-null object - 2 column_3 1000000 non-null object - dtypes: object(3) - memory usage: 188.8 MB + %(examples_sub)s """ if buf is None: # pragma: no cover buf = sys.stdout From 52ed24f700a1e5540d6915d1cd185db978a08b4b Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Mon, 20 Apr 2020 11:53:37 +0100 Subject: [PATCH 2/8] substitute docstring in frame.py --- pandas/core/frame.py | 97 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 85bb47485a2e7..5c7d78254ab15 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2326,6 +2326,103 @@ def to_html( ) # ---------------------------------------------------------------------- + @Substitution( + klass="DataFrame", + type_sub=" and columns", + max_cols_sub=""" +max_cols : int, optional + When to switch from the verbose to the truncated output. If the + DataFrame has more than `max_cols` columns, the truncated output + is used. By default, the setting in + ``pandas.options.display.max_info_columns`` is used. + """, + examples_sub=""" +>>> int_values = [1, 2, 3, 4, 5] +>>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] +>>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] +>>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, +... "float_col": float_values}) +>>> df + int_col text_col float_col +0 1 alpha 0.00 +1 2 beta 0.25 +2 3 gamma 0.50 +3 4 delta 0.75 +4 5 epsilon 1.00 + +Prints information of all columns: + +>>> df.info(verbose=True) + +RangeIndex: 5 entries, 0 to 4 +Data columns (total 3 columns): + # Column Non-Null Count Dtype +--- ------ -------------- ----- + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 +dtypes: float64(1), int64(1), object(1) +memory usage: 248.0+ bytes + +Prints a summary of columns count and its dtypes but not per column +information: + +>>> df.info(verbose=False) + +RangeIndex: 5 entries, 0 to 4 +Columns: 3 entries, int_col to float_col +dtypes: float64(1), int64(1), object(1) +memory usage: 248.0+ bytes + +Pipe output of DataFrame.info to buffer instead of sys.stdout, get +buffer content and writes to a text file: + +>>> import io +>>> buffer = io.StringIO() +>>> df.info(buf=buffer) +>>> s = buffer.getvalue() +>>> with open("df_info.txt", "w", +... encoding="utf-8") as f: # doctest: +SKIP +... f.write(s) +260 + +The `memory_usage` parameter allows deep introspection mode, specially +useful for big DataFrames and fine-tune memory optimization: + +>>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) +>>> df = pd.DataFrame({ +... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), +... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), +... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) +... }) +>>> df.info() + +RangeIndex: 1000000 entries, 0 to 999999 +Data columns (total 3 columns): + # Column Non-Null Count Dtype +--- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object +dtypes: object(3) +memory usage: 22.9+ MB + +>>> df.info(memory_usage='deep') + +RangeIndex: 1000000 entries, 0 to 999999 +Data columns (total 3 columns): + # Column Non-Null Count Dtype +--- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object +dtypes: object(3) +memory usage: 188.8 MB""", + see_also_sub=""" +DataFrame.describe: Generate descriptive statistics of DataFrame + columns. +DataFrame.memory_usage: Memory usage of DataFrame columns.""", + ) @doc(info) def info( self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None From 87a987453dce21baa703ae192f850361799bad8e Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Mon, 20 Apr 2020 11:54:42 +0100 Subject: [PATCH 3/8] Add type annotations --- pandas/core/frame.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5c7d78254ab15..3f5efc8fc2973 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2425,7 +2425,12 @@ def to_html( ) @doc(info) def info( - self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None + self, + verbose: Optional[bool] = None, + buf: Optional[IO[str]] = None, + max_cols: Optional[int] = None, + memory_usage: Optional[Union[bool, str]] = None, + null_counts: Optional[bool] = None, ) -> None: return info(self, verbose, buf, max_cols, memory_usage, null_counts) From 03dbee3f1f6166c103c04aca2d33a5c59a87d18b Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Mon, 20 Apr 2020 12:04:29 +0100 Subject: [PATCH 4/8] reinstate data parameter --- pandas/io/formats/info.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index b9befa3eec441..cb77afbad233a 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -29,6 +29,8 @@ def info( Parameters ---------- + data : %(klass)s + %(klass)s to print information about. verbose : bool, optional Whether to print the full summary. By default, the setting in ``pandas.options.display.max_info_columns`` is followed. From ffefba151d8498238481a99c446a2c2aeaa52d1b Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 20 Apr 2020 15:27:07 +0100 Subject: [PATCH 5/8] revert removed line --- pandas/io/formats/info.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index cb77afbad233a..d68a1fdde8da9 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -43,6 +43,7 @@ def info( Specifies whether total memory usage of the %(klass)s elements (including the index) should be displayed. By default, this follows the ``pandas.options.display.memory_usage`` setting. + True always show memory usage. False never shows memory usage. A value of 'deep' is equivalent to "True with deep introspection". Memory usage is shown in human-readable units (base-2 From d900b13d87ab972b3204152cc0762e950a35871c Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 21 Apr 2020 18:57:24 +0100 Subject: [PATCH 6/8] indent docstring --- pandas/core/frame.py | 194 ++++++++++++++++++++++--------------------- 1 file changed, 101 insertions(+), 93 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3f5efc8fc2973..ca0600f548efc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2329,99 +2329,107 @@ def to_html( @Substitution( klass="DataFrame", type_sub=" and columns", - max_cols_sub=""" -max_cols : int, optional - When to switch from the verbose to the truncated output. If the - DataFrame has more than `max_cols` columns, the truncated output - is used. By default, the setting in - ``pandas.options.display.max_info_columns`` is used. - """, - examples_sub=""" ->>> int_values = [1, 2, 3, 4, 5] ->>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] ->>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] ->>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, -... "float_col": float_values}) ->>> df - int_col text_col float_col -0 1 alpha 0.00 -1 2 beta 0.25 -2 3 gamma 0.50 -3 4 delta 0.75 -4 5 epsilon 1.00 - -Prints information of all columns: - ->>> df.info(verbose=True) - -RangeIndex: 5 entries, 0 to 4 -Data columns (total 3 columns): - # Column Non-Null Count Dtype ---- ------ -------------- ----- - 0 int_col 5 non-null int64 - 1 text_col 5 non-null object - 2 float_col 5 non-null float64 -dtypes: float64(1), int64(1), object(1) -memory usage: 248.0+ bytes - -Prints a summary of columns count and its dtypes but not per column -information: - ->>> df.info(verbose=False) - -RangeIndex: 5 entries, 0 to 4 -Columns: 3 entries, int_col to float_col -dtypes: float64(1), int64(1), object(1) -memory usage: 248.0+ bytes - -Pipe output of DataFrame.info to buffer instead of sys.stdout, get -buffer content and writes to a text file: - ->>> import io ->>> buffer = io.StringIO() ->>> df.info(buf=buffer) ->>> s = buffer.getvalue() ->>> with open("df_info.txt", "w", -... encoding="utf-8") as f: # doctest: +SKIP -... f.write(s) -260 - -The `memory_usage` parameter allows deep introspection mode, specially -useful for big DataFrames and fine-tune memory optimization: - ->>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) ->>> df = pd.DataFrame({ -... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), -... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), -... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) -... }) ->>> df.info() - -RangeIndex: 1000000 entries, 0 to 999999 -Data columns (total 3 columns): - # Column Non-Null Count Dtype ---- ------ -------------- ----- - 0 column_1 1000000 non-null object - 1 column_2 1000000 non-null object - 2 column_3 1000000 non-null object -dtypes: object(3) -memory usage: 22.9+ MB - ->>> df.info(memory_usage='deep') - -RangeIndex: 1000000 entries, 0 to 999999 -Data columns (total 3 columns): - # Column Non-Null Count Dtype ---- ------ -------------- ----- - 0 column_1 1000000 non-null object - 1 column_2 1000000 non-null object - 2 column_3 1000000 non-null object -dtypes: object(3) -memory usage: 188.8 MB""", - see_also_sub=""" -DataFrame.describe: Generate descriptive statistics of DataFrame - columns. -DataFrame.memory_usage: Memory usage of DataFrame columns.""", + max_cols_sub=( + """ + max_cols : int, optional + When to switch from the verbose to the truncated output. If the + DataFrame has more than `max_cols` columns, the truncated output + is used. By default, the setting in + ``pandas.options.display.max_info_columns`` is used. + """ + ), + examples_sub=( + """ + >>> int_values = [1, 2, 3, 4, 5] + >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] + >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] + >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, + ... "float_col": float_values}) + >>> df + int_col text_col float_col + 0 1 alpha 0.00 + 1 2 beta 0.25 + 2 3 gamma 0.50 + 3 4 delta 0.75 + 4 5 epsilon 1.00 + + Prints information of all columns: + + >>> df.info(verbose=True) + + RangeIndex: 5 entries, 0 to 4 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 248.0+ bytes + + Prints a summary of columns count and its dtypes but not per column + information: + + >>> df.info(verbose=False) + + RangeIndex: 5 entries, 0 to 4 + Columns: 3 entries, int_col to float_col + dtypes: float64(1), int64(1), object(1) + memory usage: 248.0+ bytes + + Pipe output of DataFrame.info to buffer instead of sys.stdout, get + buffer content and writes to a text file: + + >>> import io + >>> buffer = io.StringIO() + >>> df.info(buf=buffer) + >>> s = buffer.getvalue() + >>> with open("df_info.txt", "w", + ... encoding="utf-8") as f: # doctest: +SKIP + ... f.write(s) + 260 + + The `memory_usage` parameter allows deep introspection mode, specially + useful for big DataFrames and fine-tune memory optimization: + + >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) + >>> df = pd.DataFrame({ + ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) + ... }) + >>> df.info() + + RangeIndex: 1000000 entries, 0 to 999999 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object + dtypes: object(3) + memory usage: 22.9+ MB + + >>> df.info(memory_usage='deep') + + RangeIndex: 1000000 entries, 0 to 999999 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object + dtypes: object(3) + memory usage: 188.8 MB + """ + ), + see_also_sub=( + """ + DataFrame.describe: Generate descriptive statistics of DataFrame + columns. + DataFrame.memory_usage: Memory usage of DataFrame columns. + """ + ), ) @doc(info) def info( From c93775ed439c5d966c52d34a00410b9796438737 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 21 Apr 2020 19:20:48 +0100 Subject: [PATCH 7/8] fix validation of docstrings --- pandas/core/frame.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ca0600f548efc..20625601b3fd8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2335,8 +2335,7 @@ def to_html( When to switch from the verbose to the truncated output. If the DataFrame has more than `max_cols` columns, the truncated output is used. By default, the setting in - ``pandas.options.display.max_info_columns`` is used. - """ + ``pandas.options.display.max_info_columns`` is used.""" ), examples_sub=( """ @@ -2420,15 +2419,13 @@ def to_html( 1 column_2 1000000 non-null object 2 column_3 1000000 non-null object dtypes: object(3) - memory usage: 188.8 MB - """ + memory usage: 188.8 MB""" ), see_also_sub=( """ DataFrame.describe: Generate descriptive statistics of DataFrame columns. - DataFrame.memory_usage: Memory usage of DataFrame columns. - """ + DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), ) @doc(info) From 022dd6d1ca48ff9d0243dd84681a389b4bffcd55 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 21 Apr 2020 19:28:22 +0100 Subject: [PATCH 8/8] fix indentation of max_columns --- pandas/core/frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 20625601b3fd8..57c315af09e8d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2330,12 +2330,12 @@ def to_html( klass="DataFrame", type_sub=" and columns", max_cols_sub=( - """ - max_cols : int, optional + """max_cols : int, optional When to switch from the verbose to the truncated output. If the DataFrame has more than `max_cols` columns, the truncated output is used. By default, the setting in - ``pandas.options.display.max_info_columns`` is used.""" + ``pandas.options.display.max_info_columns`` is used. + """ ), examples_sub=( """