From c3568a217068e75e450ed0d03b64140e7c71bd1c Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sun, 20 Sep 2020 18:54:57 +0700 Subject: [PATCH 01/34] REF: drop TableFormatter Move all methods to DataFrameFormatter, inherit relevant classes from DataFrameFormatter. --- pandas/io/formats/format.py | 145 ++++++++++++++++-------------------- pandas/io/formats/html.py | 3 +- pandas/io/formats/latex.py | 4 +- 3 files changed, 66 insertions(+), 86 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 7eb31daa894c9..1557ee7860b5f 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -41,7 +41,6 @@ from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.nattype import NaTType from pandas._typing import FilePathOrBuffer, Label -from pandas.errors import AbstractMethodError from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -450,87 +449,7 @@ def get_adjustment() -> TextAdjustment: return TextAdjustment() -class TableFormatter: - - show_dimensions: Union[bool, str] - formatters: FormattersType - columns: Index - _is_truncated: bool - - @property - def is_truncated(self) -> bool: - return self._is_truncated - - @property - def should_show_dimensions(self) -> bool: - return self.show_dimensions is True or ( - self.show_dimensions == "truncate" and self.is_truncated - ) - - def _get_formatter(self, i: Union[str, int]) -> Optional[Callable]: - if isinstance(self.formatters, (list, tuple)): - if is_integer(i): - i = cast(int, i) - return self.formatters[i] - else: - return None - else: - if is_integer(i) and i not in self.columns: - i = self.columns[i] - return self.formatters.get(i, None) - - @contextmanager - def get_buffer( - self, buf: Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None - ): - """ - Context manager to open, yield and close buffer for filenames or Path-like - objects, otherwise yield buf unchanged. 
- """ - if buf is not None: - buf = stringify_path(buf) - else: - buf = StringIO() - - if encoding is None: - encoding = "utf-8" - elif not isinstance(buf, str): - raise ValueError("buf is not a file name and encoding is specified.") - - if hasattr(buf, "write"): - yield buf - elif isinstance(buf, str): - with open(buf, "w", encoding=encoding, newline="") as f: - # GH#30034 open instead of codecs.open prevents a file leak - # if we have an invalid encoding argument. - # newline="" is needed to roundtrip correctly on - # windows test_to_latex_filename - yield f - else: - raise TypeError("buf is not a file name and it has no write method") - - def write_result(self, buf: IO[str]) -> None: - """ - Write the result of serialization to buf. - """ - raise AbstractMethodError(self) - - def get_result( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - ) -> Optional[str]: - """ - Perform serialization. Write to buf or return as string if buf is None. - """ - with self.get_buffer(buf, encoding=encoding) as f: - self.write_result(buf=f) - if buf is None: - return f.getvalue() - return None - - -class DataFrameFormatter(TableFormatter): +class DataFrameFormatter: """ Render a DataFrame @@ -595,6 +514,68 @@ def __init__( self._truncate() self.adj = get_adjustment() + def get_result( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + ) -> Optional[str]: + """ + Perform serialization. Write to buf or return as string if buf is None. 
+ """ + with self.get_buffer(buf, encoding=encoding) as f: + self.write_result(buf=f) + if buf is None: + return f.getvalue() + return None + + @property + def should_show_dimensions(self) -> bool: + return self.show_dimensions is True or ( + self.show_dimensions == "truncate" and self.is_truncated + ) + + def _get_formatter(self, i: Union[str, int]) -> Optional[Callable]: + if isinstance(self.formatters, (list, tuple)): + if is_integer(i): + i = cast(int, i) + return self.formatters[i] + else: + return None + else: + if is_integer(i) and i not in self.columns: + i = self.columns[i] + return self.formatters.get(i, None) + + @contextmanager + def get_buffer( + self, buf: Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None + ): + """ + Context manager to open, yield and close buffer for filenames or Path-like + objects, otherwise yield buf unchanged. + """ + if buf is not None: + buf = stringify_path(buf) + else: + buf = StringIO() + + if encoding is None: + encoding = "utf-8" + elif not isinstance(buf, str): + raise ValueError("buf is not a file name and encoding is specified.") + + if hasattr(buf, "write"): + yield buf + elif isinstance(buf, str): + with open(buf, "w", encoding=encoding, newline="") as f: + # GH#30034 open instead of codecs.open prevents a file leak + # if we have an invalid encoding argument. 
+ # newline="" is needed to roundtrip correctly on + # windows test_to_latex_filename + yield f + else: + raise TypeError("buf is not a file name and it has no write method") + def _initialize_sparsify(self, sparsify: Optional[bool]) -> bool: if sparsify is None: return get_option("display.multi_sparse") diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index c8eb89afdd849..b8e2a5ec8d443 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -14,14 +14,13 @@ from pandas.io.common import is_url from pandas.io.formats.format import ( DataFrameFormatter, - TableFormatter, buffer_put_lines, get_level_lengths, ) from pandas.io.formats.printing import pprint_thing -class HTMLFormatter(TableFormatter): +class HTMLFormatter(DataFrameFormatter): """ Internal class for formatting output data in html. This class is intended for shared functionality between diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index eb35fff3a4f8e..39c7b4b34a131 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -8,7 +8,7 @@ from pandas.core.dtypes.generic import ABCMultiIndex -from pandas.io.formats.format import DataFrameFormatter, TableFormatter +from pandas.io.formats.format import DataFrameFormatter class RowStringConverter(ABC): @@ -595,7 +595,7 @@ def env_end(self) -> str: return "\\end{tabular}" -class LatexFormatter(TableFormatter): +class LatexFormatter(DataFrameFormatter): """ Used to render a DataFrame to a LaTeX tabular/longtable environment output. 
From 837858f55fdb885f1c65cb8306c026ec5e5c646d Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sun, 20 Sep 2020 19:29:52 +0700 Subject: [PATCH 02/34] REF: extract ConsoleFormatter --- pandas/core/frame.py | 2 +- pandas/io/formats/format.py | 88 ++++++++++++++++++++----------------- 2 files changed, 49 insertions(+), 41 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 36dfe43bfd708..cc8bc73ac87e6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -855,7 +855,7 @@ def to_string( from pandas import option_context with option_context("display.max_colwidth", max_colwidth): - formatter = fmt.DataFrameFormatter( + formatter = fmt.ConsoleFormatter( self, columns=columns, col_space=col_space, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 1557ee7860b5f..429c7266c735e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -69,6 +69,8 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.reshape.concat import concat +from pandas.errors import AbstractMethodError + from pandas.io.common import stringify_path from pandas.io.formats.printing import adjoin, justify, pprint_thing @@ -528,6 +530,9 @@ def get_result( return f.getvalue() return None + def write_result(self, buf: IO[str]) -> None: + raise AbstractMethodError + @property def should_show_dimensions(self) -> bool: return self.show_dimensions is True or ( @@ -882,43 +887,10 @@ def _insert_dot_separator_vertical( col.insert(row_num + n_header_rows, dot_str) return strcols - def write_result(self, buf: IO[str]) -> None: - """ - Render a DataFrame to a console-friendly tabular output. 
- """ - text = self._get_string_representation() - - buf.writelines(text) - - if self.should_show_dimensions: - buf.write(self._dimensions_info) - @property def _dimensions_info(self) -> str: return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]" - def _get_string_representation(self) -> str: - if self.frame.empty: - info_line = ( - f"Empty {type(self.frame).__name__}\n" - f"Columns: {pprint_thing(self.frame.columns)}\n" - f"Index: {pprint_thing(self.frame.index)}" - ) - return info_line - - strcols = self._to_str_columns() - - if self.line_width is None: - # no need to wrap around just print the whole frame - return self.adj.adjoin(1, *strcols) - - if self.max_cols is None or self.max_cols > 0: - # need to wrap around - return self._join_multiline(*strcols) - - # max_cols == 0. Try to fit frame to terminal - return self._fit_strcols_to_terminal_width(strcols) - def _fit_strcols_to_terminal_width(self, strcols) -> str: from pandas import Series @@ -992,13 +964,6 @@ def _join_multiline(self, *args) -> str: start = end return "\n\n".join(str_lst) - def to_string( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - ) -> Optional[str]: - return self.get_result(buf=buf, encoding=encoding) - def to_latex( self, buf: Optional[FilePathOrBuffer[str]] = None, @@ -1179,6 +1144,49 @@ def _get_column_name_list(self) -> List[str]: return names +class ConsoleFormatter(DataFrameFormatter): + + def write_result(self, buf: IO[str]) -> None: + """ + Render a DataFrame to a console-friendly tabular output. 
+ """ + text = self._get_string_representation() + + buf.writelines(text) + + if self.should_show_dimensions: + buf.write(self._dimensions_info) + + def to_string( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + ) -> Optional[str]: + return self.get_result(buf=buf, encoding=encoding) + + def _get_string_representation(self) -> str: + if self.frame.empty: + info_line = ( + f"Empty {type(self.frame).__name__}\n" + f"Columns: {pprint_thing(self.frame.columns)}\n" + f"Index: {pprint_thing(self.frame.index)}" + ) + return info_line + + strcols = self._to_str_columns() + + if self.line_width is None: + # no need to wrap around just print the whole frame + return self.adj.adjoin(1, *strcols) + + if self.max_cols is None or self.max_cols > 0: + # need to wrap around + return self._join_multiline(*strcols) + + # max_cols == 0. Try to fit frame to terminal + return self._fit_strcols_to_terminal_width(strcols) + + # ---------------------------------------------------------------------- # Array formatters From 08e899f4b4f576251be214a19c25b85b54bcf249 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sun, 20 Sep 2020 19:32:41 +0700 Subject: [PATCH 03/34] CLN: remove ConsoleFormatter to_string method Replace it with get_result method, which is going to become abstract method for the parent class. 
--- pandas/core/frame.py | 2 +- pandas/io/formats/format.py | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cc8bc73ac87e6..9cc23eba5aa00 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -874,7 +874,7 @@ def to_string( decimal=decimal, line_width=line_width, ) - return formatter.to_string(buf=buf, encoding=encoding) + return formatter.get_result(buf=buf, encoding=encoding) # ---------------------------------------------------------------------- diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 429c7266c735e..2140210130193 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1157,13 +1157,6 @@ def write_result(self, buf: IO[str]) -> None: if self.should_show_dimensions: buf.write(self._dimensions_info) - def to_string( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - ) -> Optional[str]: - return self.get_result(buf=buf, encoding=encoding) - def _get_string_representation(self) -> str: if self.frame.empty: info_line = ( From 602c98454bedfa8506f397aed0287cbfe3f92f05 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sun, 20 Sep 2020 19:52:32 +0700 Subject: [PATCH 04/34] REF: move table_id & render_links to HTMLFormatter --- pandas/core/frame.py | 14 ++++++-------- pandas/io/formats/format.py | 20 ++++++++++++++------ pandas/io/formats/html.py | 6 ++++-- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9cc23eba5aa00..d608e33009df3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -786,8 +786,6 @@ def _repr_html_(self) -> Optional[str]: max_cols=max_cols, show_dimensions=show_dimensions, decimal=".", - table_id=None, - render_links=False, ) return formatter.to_html(notebook=True) else: @@ -2467,21 +2465,19 @@ def to_html( columns=columns, col_space=col_space, na_rep=na_rep, + header=header, + index=index, 
formatters=formatters, float_format=float_format, + bold_rows=bold_rows, sparsify=sparsify, justify=justify, index_names=index_names, - header=header, - index=index, - bold_rows=bold_rows, escape=escape, + decimal=decimal, max_rows=max_rows, max_cols=max_cols, show_dimensions=show_dimensions, - decimal=decimal, - table_id=table_id, - render_links=render_links, ) # TODO: a generic formatter wld b in DataFrameFormatter return formatter.to_html( @@ -2490,6 +2486,8 @@ def to_html( notebook=notebook, border=border, encoding=encoding, + table_id=table_id, + render_links=render_links, ) # ---------------------------------------------------------------------- diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 2140210130193..e405b5e321317 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -483,8 +483,6 @@ def __init__( max_cols: Optional[int] = None, show_dimensions: Union[bool, str] = False, decimal: str = ".", - table_id: Optional[str] = None, - render_links: bool = False, bold_rows: bool = False, escape: bool = True, ): @@ -503,8 +501,6 @@ def __init__( self.min_rows = min_rows self.max_cols = max_cols self.show_dimensions = show_dimensions - self.table_id = table_id - self.render_links = render_links self.justify = self._initialize_justify(justify) self.bold_rows = bold_rows self.escape = escape @@ -1015,6 +1011,8 @@ def to_html( classes: Optional[Union[str, List, Tuple]] = None, notebook: bool = False, border: Optional[int] = None, + table_id: Optional[str] = None, + render_links: bool = False, ) -> Optional[str]: """ Render a DataFrame to a html table. @@ -1029,13 +1027,23 @@ def to_html( border : int A ``border=border`` attribute is included in the opening ``<table>`` tag. Default ``pd.options.display.html.border``. 
+ table_id + + render_links + """ from pandas.io.formats.html import HTMLFormatter, NotebookFormatter Klass = NotebookFormatter if notebook else HTMLFormatter - return Klass(self, classes=classes, border=border).get_result( - buf=buf, encoding=encoding + + instance = Klass( + self, + classes=classes, + border=border, + table_id=table_id, + render_links=render_links, ) + return instance.get_result(buf=buf, encoding=encoding) def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]: from pandas.core.indexes.multi import sparsify_labels diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index b8e2a5ec8d443..892ef3878876e 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -37,6 +37,8 @@ def __init__( formatter: DataFrameFormatter, classes: Optional[Union[str, List[str], Tuple[str, ...]]] = None, border: Optional[int] = None, + table_id: Optional[str] = None, + render_links: bool = False, ) -> None: self.fmt = formatter self.classes = classes @@ -50,8 +52,8 @@ def __init__( if border is None: border = cast(int, get_option("display.html.border")) self.border = border - self.table_id = self.fmt.table_id - self.render_links = self.fmt.render_links + self.table_id = table_id + self.render_links = render_links self.col_space = { column: f"{value}px" if isinstance(value, int) else value From bd5cb87f8b520571635ba977823d6026007176a4 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sun, 20 Sep 2020 19:56:41 +0700 Subject: [PATCH 05/34] REF: move _join_multiline to ConsoleFormatter --- pandas/io/formats/format.py | 76 ++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index e405b5e321317..9b5fa32ac9afe 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -922,44 +922,6 @@ def _fit_strcols_to_terminal_width(self, strcols) -> str: strcols = self._to_str_columns() return self.adj.adjoin(1, 
*strcols) - def _join_multiline(self, *args) -> str: - lwidth = self.line_width - adjoin_width = 1 - strcols = list(args) - if self.index: - idx = strcols.pop(0) - lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width - - col_widths = [ - np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 - for col in strcols - ] - - assert lwidth is not None - col_bins = _binify(col_widths, lwidth) - nbins = len(col_bins) - - if self.is_truncated_vertically: - assert self.max_rows_fitted is not None - nrows = self.max_rows_fitted + 1 - else: - nrows = len(self.frame) - - str_lst = [] - start = 0 - for i, end in enumerate(col_bins): - row = strcols[start:end] - if self.index: - row.insert(0, idx) - if nbins > 1: - if end <= len(strcols) and i < nbins - 1: - row.append([" \\"] + [" "] * (nrows - 1)) - else: - row.append([" "] * nrows) - str_lst.append(self.adj.adjoin(adjoin_width, *row)) - start = end - return "\n\n".join(str_lst) - def to_latex( self, buf: Optional[FilePathOrBuffer[str]] = None, @@ -1187,6 +1149,44 @@ def _get_string_representation(self) -> str: # max_cols == 0. 
Try to fit frame to terminal return self._fit_strcols_to_terminal_width(strcols) + def _join_multiline(self, *args) -> str: + lwidth = self.line_width + adjoin_width = 1 + strcols = list(args) + if self.index: + idx = strcols.pop(0) + lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width + + col_widths = [ + np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 + for col in strcols + ] + + assert lwidth is not None + col_bins = _binify(col_widths, lwidth) + nbins = len(col_bins) + + if self.is_truncated_vertically: + assert self.max_rows_fitted is not None + nrows = self.max_rows_fitted + 1 + else: + nrows = len(self.frame) + + str_lst = [] + start = 0 + for i, end in enumerate(col_bins): + row = strcols[start:end] + if self.index: + row.insert(0, idx) + if nbins > 1: + if end <= len(strcols) and i < nbins - 1: + row.append([" \\"] + [" "] * (nrows - 1)) + else: + row.append([" "] * nrows) + str_lst.append(self.adj.adjoin(adjoin_width, *row)) + start = end + return "\n\n".join(str_lst) + # ---------------------------------------------------------------------- # Array formatters From 6e8d4d88a0af54d587f3d4c1fec5b4d782147b31 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 21 Sep 2020 02:10:56 +0700 Subject: [PATCH 06/34] REF: separate dataframe formatting from rendering --- pandas/core/frame.py | 11 +- pandas/core/generic.py | 8 +- pandas/io/formats/format.py | 630 ++++++++++++++++++------------------ pandas/io/formats/html.py | 11 +- pandas/io/formats/latex.py | 7 +- 5 files changed, 345 insertions(+), 322 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d608e33009df3..b99e814ba4920 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -787,7 +787,7 @@ def _repr_html_(self) -> Optional[str]: show_dimensions=show_dimensions, decimal=".", ) - return formatter.to_html(notebook=True) + return fmt.DataFrameRenderer(formatter).to_html(notebook=True) else: return None @@ -853,7 +853,7 @@ def 
to_string( from pandas import option_context with option_context("display.max_colwidth", max_colwidth): - formatter = fmt.ConsoleFormatter( + formatter = fmt.DataFrameFormatter( self, columns=columns, col_space=col_space, @@ -872,7 +872,10 @@ def to_string( decimal=decimal, line_width=line_width, ) - return formatter.get_result(buf=buf, encoding=encoding) + return fmt.DataFrameRenderer(formatter).to_string( + buf=buf, + encoding=encoding, + ) # ---------------------------------------------------------------------- @@ -2480,7 +2483,7 @@ def to_html( show_dimensions=show_dimensions, ) # TODO: a generic formatter wld b in DataFrameFormatter - return formatter.to_html( + return fmt.DataFrameRenderer(formatter).to_html( buf=buf, classes=classes, notebook=notebook, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0b9021b094cd7..df509b6f71527 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -106,7 +106,11 @@ from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window from pandas.io.formats import format as fmt -from pandas.io.formats.format import DataFrameFormatter, format_percentiles +from pandas.io.formats.format import ( + DataFrameFormatter, + DataFrameRenderer, + format_percentiles, +) from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: @@ -3127,7 +3131,7 @@ def to_latex( escape=escape, decimal=decimal, ) - return formatter.to_latex( + return DataFrameRenderer(formatter).to_latex( buf=buf, column_format=column_format, longtable=longtable, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 9b5fa32ac9afe..c49ddcba7a276 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -69,8 +69,6 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.reshape.concat import concat -from pandas.errors import AbstractMethodError - from pandas.io.common import stringify_path from pandas.io.formats.printing import adjoin, justify, 
pprint_thing @@ -452,14 +450,6 @@ def get_adjustment() -> TextAdjustment: class DataFrameFormatter: - """ - Render a DataFrame - - self.to_string() : console-friendly tabular output - self.to_html() : html table - self.to_latex() : LaTeX tabular environment table - - """ __doc__ = __doc__ if __doc__ else "" __doc__ += common_docstring + return_docstring @@ -487,47 +477,42 @@ def __init__( escape: bool = True, ): self.frame = frame - self.show_index_names = index_names - self.sparsify = self._initialize_sparsify(sparsify) - self.float_format = float_format - self.formatters = self._initialize_formatters(formatters) - self.na_rep = na_rep - self.decimal = decimal + self.columns = self._initialize_columns(columns) self.col_space = self._initialize_colspace(col_space) self.header = header self.index = index + self.na_rep = na_rep + self.formatters = self._initialize_formatters(formatters) + self.justify = self._initialize_justify(justify) + self.float_format = float_format + self.sparsify = self._initialize_sparsify(sparsify) + self.show_index_names = index_names + self.decimal = decimal + self.bold_rows = bold_rows + self.escape = escape self.line_width = line_width self.max_rows = max_rows self.min_rows = min_rows self.max_cols = max_cols self.show_dimensions = show_dimensions - self.justify = self._initialize_justify(justify) - self.bold_rows = bold_rows - self.escape = escape - self.columns = self._initialize_columns(columns) self.max_cols_fitted = self._calc_max_cols_fitted() self.max_rows_fitted = self._calc_max_rows_fitted() - self._truncate() + self.truncate() self.adj = get_adjustment() - def get_result( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - ) -> Optional[str]: + def get_strcols(self) -> List[List[str]]: """ - Perform serialization. Write to buf or return as string if buf is None. + Render a DataFrame to a list of columns (as lists of strings). 
""" - with self.get_buffer(buf, encoding=encoding) as f: - self.write_result(buf=f) - if buf is None: - return f.getvalue() - return None + strcols = self._get_strcols_without_index() - def write_result(self, buf: IO[str]) -> None: - raise AbstractMethodError + if self.index: + str_index = self._get_formatted_index(self.tr_frame) + strcols.insert(0, str_index) + + return strcols @property def should_show_dimensions(self) -> bool: @@ -535,47 +520,53 @@ def should_show_dimensions(self) -> bool: self.show_dimensions == "truncate" and self.is_truncated ) - def _get_formatter(self, i: Union[str, int]) -> Optional[Callable]: - if isinstance(self.formatters, (list, tuple)): - if is_integer(i): - i = cast(int, i) - return self.formatters[i] - else: - return None - else: - if is_integer(i) and i not in self.columns: - i = self.columns[i] - return self.formatters.get(i, None) + @property + def is_truncated(self) -> bool: + return bool(self.is_truncated_horizontally or self.is_truncated_vertically) - @contextmanager - def get_buffer( - self, buf: Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None - ): - """ - Context manager to open, yield and close buffer for filenames or Path-like - objects, otherwise yield buf unchanged. - """ - if buf is not None: - buf = stringify_path(buf) - else: - buf = StringIO() + @property + def is_truncated_horizontally(self) -> bool: + return bool(self.max_cols_fitted and (len(self.columns) > self.max_cols_fitted)) - if encoding is None: - encoding = "utf-8" - elif not isinstance(buf, str): - raise ValueError("buf is not a file name and encoding is specified.") + @property + def is_truncated_vertically(self) -> bool: + return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted)) - if hasattr(buf, "write"): - yield buf - elif isinstance(buf, str): - with open(buf, "w", encoding=encoding, newline="") as f: - # GH#30034 open instead of codecs.open prevents a file leak - # if we have an invalid encoding argument. 
- # newline="" is needed to roundtrip correctly on - # windows test_to_latex_filename - yield f - else: - raise TypeError("buf is not a file name and it has no write method") + @property + def info_line(self): + return ( + f"Empty {type(self.frame).__name__}\n" + f"Columns: {pprint_thing(self.frame.columns)}\n" + f"Index: {pprint_thing(self.frame.index)}" + ) + + @property + def dimensions_info(self) -> str: + return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]" + + @property + def has_index_names(self) -> bool: + return _has_names(self.frame.index) + + @property + def has_column_names(self) -> bool: + return _has_names(self.frame.columns) + + @property + def show_row_idx_names(self) -> bool: + return all((self.has_index_names, self.index, self.show_index_names)) + + @property + def show_col_idx_names(self) -> bool: + return all((self.has_column_names, self.show_index_names, self.header)) + + @property + def max_rows_displayed(self) -> int: + return min(self.max_rows or len(self.frame), len(self.frame)) + + @property + def need_to_wrap_around(self) -> bool: + return bool(self.max_cols is None or self.max_cols > 0) def _initialize_sparsify(self, sparsify: Optional[bool]) -> bool: if sparsify is None: @@ -635,10 +626,6 @@ def _initialize_colspace( result = dict(zip(self.frame.columns, col_space)) return result - @property - def max_rows_displayed(self) -> int: - return min(self.max_rows or len(self.frame), len(self.frame)) - def _calc_max_cols_fitted(self) -> Optional[int]: """Number of columns fitting the screen.""" if not self._is_in_terminal(): @@ -689,26 +676,14 @@ def _get_number_of_auxillary_rows(self) -> int: num_rows = dot_row + prompt_row if self.show_dimensions: - num_rows += len(self._dimensions_info.splitlines()) + num_rows += len(self.dimensions_info.splitlines()) if self.header: num_rows += 1 return num_rows - @property - def is_truncated_horizontally(self) -> bool: - return bool(self.max_cols_fitted and (len(self.columns) > 
self.max_cols_fitted)) - - @property - def is_truncated_vertically(self) -> bool: - return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted)) - - @property - def is_truncated(self) -> bool: - return bool(self.is_truncated_horizontally or self.is_truncated_vertically) - - def _truncate(self) -> None: + def truncate(self) -> None: """ Check whether the frame should be truncated. If so, slice the frame up. """ @@ -773,7 +748,7 @@ def _get_strcols_without_index(self) -> List[List[str]]: if not is_list_like(self.header) and not self.header: for i, c in enumerate(self.tr_frame): - fmt_values = self._format_col(i) + fmt_values = self.format_col(i) fmt_values = _make_fixed_width( strings=fmt_values, justify=self.justify, @@ -804,7 +779,7 @@ def _get_strcols_without_index(self) -> List[List[str]]: header_colwidth = max( int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader) ) - fmt_values = self._format_col(i) + fmt_values = self.format_col(i) fmt_values = _make_fixed_width( fmt_values, self.justify, minimum=header_colwidth, adj=self.adj ) @@ -815,34 +790,165 @@ def _get_strcols_without_index(self) -> List[List[str]]: return strcols - def _get_strcols(self) -> List[List[str]]: - strcols = self._get_strcols_without_index() + def format_col(self, i: int) -> List[str]: + frame = self.tr_frame + formatter = self._get_formatter(i) + return format_array( + frame.iloc[:, i]._values, + formatter, + float_format=self.float_format, + na_rep=self.na_rep, + space=self.col_space.get(frame.columns[i]), + decimal=self.decimal, + leading_space=self.index, + ) - str_index = self._get_formatted_index(self.tr_frame) - if self.index: - strcols.insert(0, str_index) + def _get_formatter(self, i: Union[str, int]) -> Optional[Callable]: + if isinstance(self.formatters, (list, tuple)): + if is_integer(i): + i = cast(int, i) + return self.formatters[i] + else: + return None + else: + if is_integer(i) and i not in self.columns: + i = self.columns[i] + return 
self.formatters.get(i, None) + + def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]: + from pandas.core.indexes.multi import sparsify_labels + + columns = frame.columns + + if isinstance(columns, MultiIndex): + fmt_columns = columns.format(sparsify=False, adjoin=False) + fmt_columns = list(zip(*fmt_columns)) + dtypes = self.frame.dtypes._values + + # if we have a Float level, they don't use leading space at all + restrict_formatting = any(l.is_floating for l in columns.levels) + need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) + + def space_format(x, y): + if ( + y not in self.formatters + and need_leadsp[x] + and not restrict_formatting + ): + return " " + y + return y + + str_columns = list( + zip(*[[space_format(x, y) for y in x] for x in fmt_columns]) + ) + if self.sparsify and len(str_columns): + str_columns = sparsify_labels(str_columns) + + str_columns = [list(x) for x in zip(*str_columns)] + else: + fmt_columns = columns.format() + dtypes = self.frame.dtypes + need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) + str_columns = [ + [" " + x if not self._get_formatter(i) and need_leadsp[x] else x] + for i, (col, x) in enumerate(zip(columns, fmt_columns)) + ] + # self.str_columns = str_columns + return str_columns + + def _get_formatted_index(self, frame: "DataFrame") -> List[str]: + # Note: this is only used by to_string() and to_latex(), not by + # to_html(). so safe to cast col_space here. 
+ col_space = {k: cast(int, v) for k, v in self.col_space.items()} + index = frame.index + columns = frame.columns + fmt = self._get_formatter("__index__") + + if isinstance(index, MultiIndex): + fmt_index = index.format( + sparsify=self.sparsify, + adjoin=False, + names=self.show_row_idx_names, + formatter=fmt, + ) + else: + fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)] + + fmt_index = [ + tuple( + _make_fixed_width( + list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj + ) + ) + for x in fmt_index + ] + adjoined = self.adj.adjoin(1, *fmt_index).split("\n") + + # empty space for columns + if self.show_col_idx_names: + col_header = [str(x) for x in self._get_column_name_list()] + else: + col_header = [""] * columns.nlevels + + if self.header: + return col_header + adjoined + else: + return adjoined + + def _get_column_name_list(self) -> List[str]: + names: List[str] = [] + columns = self.frame.columns + if isinstance(columns, MultiIndex): + names.extend("" if name is None else name for name in columns.names) + else: + names.append("" if columns.name is None else columns.name) + return names + + +class StringFormatter: + + def __init__(self, fmt): + self.fmt = fmt + self.adj = fmt.adj + self.frame = fmt.frame + + def to_string(self) -> str: + text = self._get_string_representation() + if self.fmt.should_show_dimensions: + text = "".join([text, self.fmt.dimensions_info]) + return text + + def _get_strcols(self): + strcols = self.fmt.get_strcols() + if self.fmt.is_truncated: + strcols = self._insert_dot_separators(strcols) return strcols - def _to_str_columns(self) -> List[List[str]]: - """ - Render a DataFrame to a list of columns (as lists of strings). 
- """ + def _get_string_representation(self) -> str: + if self.fmt.frame.empty: + return self.fmt.info_line + strcols = self._get_strcols() - if self.is_truncated: - strcols = self._insert_dot_separators(strcols) + if self.fmt.line_width is None: + # no need to wrap around just print the whole frame + return self.adj.adjoin(1, *strcols) - return strcols + if self.fmt.need_to_wrap_around: + return self._join_multiline(*strcols) + + # max_cols == 0. Try to fit frame to terminal + return self._fit_strcols_to_terminal_width(strcols) def _insert_dot_separators(self, strcols: List[List[str]]) -> List[List[str]]: - str_index = self._get_formatted_index(self.tr_frame) + str_index = self.fmt._get_formatted_index(self.fmt.tr_frame) index_length = len(str_index) - if self.is_truncated_horizontally: + if self.fmt.is_truncated_horizontally: strcols = self._insert_dot_separator_horizontal(strcols, index_length) - if self.is_truncated_vertically: + if self.fmt.is_truncated_vertically: strcols = self._insert_dot_separator_vertical(strcols, index_length) return strcols @@ -850,19 +956,19 @@ def _insert_dot_separators(self, strcols: List[List[str]]) -> List[List[str]]: def _insert_dot_separator_horizontal( self, strcols: List[List[str]], index_length: int ) -> List[List[str]]: - strcols.insert(self.tr_col_num + 1, [" ..."] * index_length) + strcols.insert(self.fmt.tr_col_num + 1, [" ..."] * index_length) return strcols def _insert_dot_separator_vertical( self, strcols: List[List[str]], index_length: int ) -> List[List[str]]: - n_header_rows = index_length - len(self.tr_frame) - row_num = self.tr_row_num + n_header_rows = index_length - len(self.fmt.tr_frame) + row_num = self.fmt.tr_row_num for ix, col in enumerate(strcols): cwidth = self.adj.len(col[row_num]) - if self.is_truncated_horizontally: - is_dot_col = ix == self.tr_col_num + 1 + if self.fmt.is_truncated_horizontally: + is_dot_col = ix == self.fmt.tr_col_num + 1 else: is_dot_col = False @@ -883,9 +989,43 @@ def 
_insert_dot_separator_vertical( col.insert(row_num + n_header_rows, dot_str) return strcols - @property - def _dimensions_info(self) -> str: - return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]" + def _join_multiline(self, *args) -> str: + lwidth = self.fmt.line_width + adjoin_width = 1 + strcols = list(args) + if self.fmt.index: + idx = strcols.pop(0) + lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width + + col_widths = [ + np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 + for col in strcols + ] + + assert lwidth is not None + col_bins = _binify(col_widths, lwidth) + nbins = len(col_bins) + + if self.fmt.is_truncated_vertically: + assert self.fmt.max_rows_fitted is not None + nrows = self.fmt.max_rows_fitted + 1 + else: + nrows = len(self.frame) + + str_lst = [] + start = 0 + for i, end in enumerate(col_bins): + row = strcols[start:end] + if self.fmt.index: + row.insert(0, idx) + if nbins > 1: + if end <= len(strcols) and i < nbins - 1: + row.append([" \\"] + [" "] * (nrows - 1)) + else: + row.append([" "] * nrows) + str_lst.append(self.adj.adjoin(adjoin_width, *row)) + start = end + return "\n\n".join(str_lst) def _fit_strcols_to_terminal_width(self, strcols) -> str: from pandas import Series @@ -911,17 +1051,25 @@ def _fit_strcols_to_terminal_width(self, strcols) -> str: n_cols = len(col_lens) # subtract index column - max_cols_fitted = n_cols - self.index + max_cols_fitted = n_cols - self.fmt.index # GH-21180. Ensure that we print at least two. 
max_cols_fitted = max(max_cols_fitted, 2) - self.max_cols_fitted = max_cols_fitted + self.fmt.max_cols_fitted = max_cols_fitted # Call again _truncate to cut frame appropriately # and then generate string representation - self._truncate() - strcols = self._to_str_columns() + self.fmt.truncate() + strcols = self._get_strcols() return self.adj.adjoin(1, *strcols) + +class DataFrameRenderer: + def __init__( + self, + fmt: DataFrameFormatter, + ): + self.fmt = fmt + def to_latex( self, buf: Optional[FilePathOrBuffer[str]] = None, @@ -941,7 +1089,7 @@ def to_latex( from pandas.io.formats.latex import LatexFormatter latex_formatter = LatexFormatter( - self, + self.fmt, longtable=longtable, column_format=column_format, multicolumn=multicolumn, @@ -951,20 +1099,8 @@ def to_latex( label=label, position=position, ) - return latex_formatter.get_result(buf=buf, encoding=encoding) - - def _format_col(self, i: int) -> List[str]: - frame = self.tr_frame - formatter = self._get_formatter(i) - return format_array( - frame.iloc[:, i]._values, - formatter, - float_format=self.float_format, - na_rep=self.na_rep, - space=self.col_space.get(frame.columns[i]), - decimal=self.decimal, - leading_space=self.index, - ) + string = latex_formatter.to_string() + return self._get_result(string, buf=buf, encoding=encoding) def to_html( self, @@ -998,194 +1134,72 @@ def to_html( Klass = NotebookFormatter if notebook else HTMLFormatter - instance = Klass( - self, + html_formatter = Klass( + self.fmt, classes=classes, border=border, table_id=table_id, render_links=render_links, ) - return instance.get_result(buf=buf, encoding=encoding) - - def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]: - from pandas.core.indexes.multi import sparsify_labels - - columns = frame.columns - - if isinstance(columns, MultiIndex): - fmt_columns = columns.format(sparsify=False, adjoin=False) - fmt_columns = list(zip(*fmt_columns)) - dtypes = self.frame.dtypes._values - - # if we have a Float 
level, they don't use leading space at all - restrict_formatting = any(l.is_floating for l in columns.levels) - need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) - - def space_format(x, y): - if ( - y not in self.formatters - and need_leadsp[x] - and not restrict_formatting - ): - return " " + y - return y - - str_columns = list( - zip(*[[space_format(x, y) for y in x] for x in fmt_columns]) - ) - if self.sparsify and len(str_columns): - str_columns = sparsify_labels(str_columns) - - str_columns = [list(x) for x in zip(*str_columns)] - else: - fmt_columns = columns.format() - dtypes = self.frame.dtypes - need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) - str_columns = [ - [" " + x if not self._get_formatter(i) and need_leadsp[x] else x] - for i, (col, x) in enumerate(zip(columns, fmt_columns)) - ] - # self.str_columns = str_columns - return str_columns - - @property - def has_index_names(self) -> bool: - return _has_names(self.frame.index) - - @property - def has_column_names(self) -> bool: - return _has_names(self.frame.columns) - - @property - def show_row_idx_names(self) -> bool: - return all((self.has_index_names, self.index, self.show_index_names)) - - @property - def show_col_idx_names(self) -> bool: - return all((self.has_column_names, self.show_index_names, self.header)) - - def _get_formatted_index(self, frame: "DataFrame") -> List[str]: - # Note: this is only used by to_string() and to_latex(), not by - # to_html(). so safe to cast col_space here. 
- col_space = {k: cast(int, v) for k, v in self.col_space.items()} - index = frame.index - columns = frame.columns - fmt = self._get_formatter("__index__") + string = html_formatter.to_string() + return self._get_result(string, buf=buf, encoding=encoding) - if isinstance(index, MultiIndex): - fmt_index = index.format( - sparsify=self.sparsify, - adjoin=False, - names=self.show_row_idx_names, - formatter=fmt, - ) - else: - fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)] - - fmt_index = [ - tuple( - _make_fixed_width( - list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj - ) - ) - for x in fmt_index - ] - - adjoined = self.adj.adjoin(1, *fmt_index).split("\n") - - # empty space for columns - if self.show_col_idx_names: - col_header = [str(x) for x in self._get_column_name_list()] - else: - col_header = [""] * columns.nlevels - - if self.header: - return col_header + adjoined - else: - return adjoined - - def _get_column_name_list(self) -> List[str]: - names: List[str] = [] - columns = self.frame.columns - if isinstance(columns, MultiIndex): - names.extend("" if name is None else name for name in columns.names) - else: - names.append("" if columns.name is None else columns.name) - return names - - -class ConsoleFormatter(DataFrameFormatter): - - def write_result(self, buf: IO[str]) -> None: + def to_string( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + ) -> Optional[str]: """ Render a DataFrame to a console-friendly tabular output. 
""" - text = self._get_string_representation() + string_formatter = StringFormatter(self.fmt) + string = string_formatter.to_string() + return self._get_result(string, buf=buf, encoding=encoding) - buf.writelines(text) - - if self.should_show_dimensions: - buf.write(self._dimensions_info) - - def _get_string_representation(self) -> str: - if self.frame.empty: - info_line = ( - f"Empty {type(self.frame).__name__}\n" - f"Columns: {pprint_thing(self.frame.columns)}\n" - f"Index: {pprint_thing(self.frame.index)}" - ) - return info_line - - strcols = self._to_str_columns() - - if self.line_width is None: - # no need to wrap around just print the whole frame - return self.adj.adjoin(1, *strcols) - - if self.max_cols is None or self.max_cols > 0: - # need to wrap around - return self._join_multiline(*strcols) - - # max_cols == 0. Try to fit frame to terminal - return self._fit_strcols_to_terminal_width(strcols) - - def _join_multiline(self, *args) -> str: - lwidth = self.line_width - adjoin_width = 1 - strcols = list(args) - if self.index: - idx = strcols.pop(0) - lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width + def _get_result( + self, + string: str, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + ) -> Optional[str]: + """ + Perform serialization. Write to buf or return as string if buf is None. + """ + with self.get_buffer(buf, encoding=encoding) as f: + f.write(string) + if buf is None: + return f.getvalue() + return None - col_widths = [ - np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 - for col in strcols - ] + @contextmanager + def get_buffer( + self, buf: Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None + ): + """ + Context manager to open, yield and close buffer for filenames or Path-like + objects, otherwise yield buf unchanged. 
+ """ + if buf is not None: + buf = stringify_path(buf) + else: + buf = StringIO() - assert lwidth is not None - col_bins = _binify(col_widths, lwidth) - nbins = len(col_bins) + if encoding is None: + encoding = "utf-8" + elif not isinstance(buf, str): + raise ValueError("buf is not a file name and encoding is specified.") - if self.is_truncated_vertically: - assert self.max_rows_fitted is not None - nrows = self.max_rows_fitted + 1 + if hasattr(buf, "write"): + yield buf + elif isinstance(buf, str): + with open(buf, "w", encoding=encoding, newline="") as f: + # GH#30034 open instead of codecs.open prevents a file leak + # if we have an invalid encoding argument. + # newline="" is needed to roundtrip correctly on + # windows test_to_latex_filename + yield f else: - nrows = len(self.frame) - - str_lst = [] - start = 0 - for i, end in enumerate(col_bins): - row = strcols[start:end] - if self.index: - row.insert(0, idx) - if nbins > 1: - if end <= len(strcols) and i < nbins - 1: - row.append([" \\"] + [" "] * (nrows - 1)) - else: - row.append([" "] * nrows) - str_lst.append(self.adj.adjoin(adjoin_width, *row)) - start = end - return "\n\n".join(str_lst) + raise TypeError("buf is not a file name and it has no write method") # ---------------------------------------------------------------------- diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 892ef3878876e..5fa1411617d20 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -199,8 +199,11 @@ def render(self) -> List[str]: return self.elements - def write_result(self, buf: IO[str]) -> None: - buffer_put_lines(buf, self.render()) + def to_string(self) -> str: + lines = self.render() + if any(isinstance(x, str) for x in lines): + lines = [str(x) for x in lines] + return "\n".join(lines) def _write_table(self, indent: int = 0) -> None: _classes = ["dataframe"] # Default class. 
@@ -371,7 +374,7 @@ def _write_header(self, indent: int) -> None: def _get_formatted_values(self) -> Dict[int, List[str]]: with option_context("display.max_colwidth", None): - fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)} + fmt_values = {i: self.fmt.format_col(i) for i in range(self.ncols)} return fmt_values def _write_body(self, indent: int) -> None: @@ -566,7 +569,7 @@ class NotebookFormatter(HTMLFormatter): """ def _get_formatted_values(self) -> Dict[int, List[str]]: - return {i: self.fmt._format_col(i) for i in range(self.ncols)} + return {i: self.fmt.format_col(i) for i in range(self.ncols)} def _get_columns_formatted_values(self) -> List[str]: return self.columns.format() diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 39c7b4b34a131..f0e7ad67f1f98 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -108,7 +108,7 @@ def _get_strcols(self) -> List[List[str]]: ) strcols = [[info_line]] else: - strcols = self.fmt._to_str_columns() + strcols = self.fmt.get_strcols() # reestablish the MultiIndex that has been joined by _to_str_column if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): @@ -634,13 +634,12 @@ def __init__( self.label = label self.position = position - def write_result(self, buf: IO[str]) -> None: + def to_string(self) -> str: """ Render a DataFrame to a LaTeX tabular, longtable, or table/tabular environment output. 
""" - table_string = self.builder.get_result() - buf.write(table_string) + return self.builder.get_result() @property def builder(self) -> TableBuilderAbstract: From cbd3c76b9a6cc6e1c1a35e9eaaa8b7eec71e5e14 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 21 Sep 2020 02:34:36 +0700 Subject: [PATCH 07/34] REF: extract _empty_info_line property in latex --- pandas/io/formats/format.py | 18 +++++++++--------- pandas/io/formats/latex.py | 21 ++++++++++++--------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index c49ddcba7a276..c4da2589aae76 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -532,14 +532,6 @@ def is_truncated_horizontally(self) -> bool: def is_truncated_vertically(self) -> bool: return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted)) - @property - def info_line(self): - return ( - f"Empty {type(self.frame).__name__}\n" - f"Columns: {pprint_thing(self.frame.columns)}\n" - f"Index: {pprint_thing(self.frame.index)}" - ) - @property def dimensions_info(self) -> str: return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]" @@ -927,7 +919,7 @@ def _get_strcols(self): def _get_string_representation(self) -> str: if self.fmt.frame.empty: - return self.fmt.info_line + return self._empty_info_line strcols = self._get_strcols() @@ -941,6 +933,14 @@ def _get_string_representation(self) -> str: # max_cols == 0. 
Try to fit frame to terminal return self._fit_strcols_to_terminal_width(strcols) + @property + def _empty_info_line(self): + return ( + f"Empty {type(self.frame).__name__}\n" + f"Columns: {pprint_thing(self.frame.columns)}\n" + f"Index: {pprint_thing(self.frame.index)}" + ) + def _insert_dot_separators(self, strcols: List[List[str]]) -> List[List[str]]: str_index = self.fmt._get_formatted_index(self.fmt.tr_frame) index_length = len(str_index) diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index f0e7ad67f1f98..e57a78171f77b 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -2,7 +2,7 @@ Module for formatting output data in Latex. """ from abc import ABC, abstractmethod -from typing import IO, Iterator, List, Optional, Type +from typing import Iterator, List, Optional, Type import numpy as np @@ -100,17 +100,12 @@ def header_levels(self) -> int: def _get_strcols(self) -> List[List[str]]: """String representation of the columns.""" - if len(self.frame.columns) == 0 or len(self.frame.index) == 0: - info_line = ( - f"Empty {type(self.frame).__name__}\n" - f"Columns: {self.frame.columns}\n" - f"Index: {self.frame.index}" - ) - strcols = [[info_line]] + if self.fmt.frame.empty: + strcols = [[self._empty_info_line]] else: strcols = self.fmt.get_strcols() - # reestablish the MultiIndex that has been joined by _to_str_column + # reestablish the MultiIndex that has been joined by get_strcols() if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): out = self.frame.index.format( adjoin=False, @@ -143,6 +138,14 @@ def pad_empties(x): strcols = out + strcols[1:] return strcols + @property + def _empty_info_line(self): + return ( + f"Empty {type(self.frame).__name__}\n" + f"Columns: {self.frame.columns}\n" + f"Index: {self.frame.index}" + ) + def _preprocess_row(self, row: List[str]) -> List[str]: """Preprocess elements of the row.""" if self.fmt.escape: From 5c309240794c8fbb824c42a04321247868b0330e Mon Sep 17 00:00:00 
2001 From: Maxim Ivanov Date: Mon, 21 Sep 2020 02:45:32 +0700 Subject: [PATCH 08/34] DOC: docstrings for DataFrame & String Formatters --- pandas/io/formats/format.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index c4da2589aae76..b00f40364e978 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -450,6 +450,7 @@ def get_adjustment() -> TextAdjustment: class DataFrameFormatter: + """Class for processing dataframe formatting options and data.""" __doc__ = __doc__ if __doc__ else "" __doc__ += common_docstring + return_docstring @@ -899,6 +900,7 @@ def _get_column_name_list(self) -> List[str]: class StringFormatter: + """Formatter for string representation of a dataframe.""" def __init__(self, fmt): self.fmt = fmt From af8fe988db247abd4a8d3253c2e500f291c2e6f3 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 21 Sep 2020 02:47:34 +0700 Subject: [PATCH 09/34] REF: make _get_buffer private --- pandas/io/formats/format.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b00f40364e978..6f67ff7df6eff 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1066,10 +1066,7 @@ def _fit_strcols_to_terminal_width(self, strcols) -> str: class DataFrameRenderer: - def __init__( - self, - fmt: DataFrameFormatter, - ): + def __init__(self, fmt: DataFrameFormatter): self.fmt = fmt def to_latex( @@ -1167,14 +1164,14 @@ def _get_result( """ Perform serialization. Write to buf or return as string if buf is None. 
""" - with self.get_buffer(buf, encoding=encoding) as f: + with self._get_buffer(buf, encoding=encoding) as f: f.write(string) if buf is None: return f.getvalue() return None @contextmanager - def get_buffer( + def _get_buffer( self, buf: Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None ): """ From 6e9fb3c944a74e4a801cc98286b2489ee42f4d28 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 21 Sep 2020 03:05:24 +0700 Subject: [PATCH 10/34] REF: pass DataFrameFormatter to CSVFormatter --- pandas/core/generic.py | 25 +++++++++++++++---------- pandas/io/formats/csvs.py | 24 +++++++++++------------- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index df509b6f71527..bac15f690ce55 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3324,8 +3324,17 @@ def to_csv( from pandas.io.formats.csvs import CSVFormatter - formatter = CSVFormatter( - df, + formatter = DataFrameFormatter( + frame=df, + columns=columns, + header=header, + index=index, + na_rep=na_rep, + float_format=float_format, + decimal=decimal, + ) + + csv_formatter = CSVFormatter( path_or_buf, line_terminator=line_terminator, sep=sep, @@ -3333,11 +3342,7 @@ def to_csv( errors=errors, compression=compression, quoting=quoting, - na_rep=na_rep, - float_format=float_format, cols=columns, - header=header, - index=index, index_label=index_label, mode=mode, chunksize=chunksize, @@ -3345,14 +3350,14 @@ def to_csv( date_format=date_format, doublequote=doublequote, escapechar=escapechar, - decimal=decimal, storage_options=storage_options, + formatter=formatter, ) - formatter.save() + csv_formatter.save() if path_or_buf is None: - assert isinstance(formatter.path_or_buf, StringIO) - return formatter.path_or_buf.getvalue() + assert isinstance(csv_formatter.path_or_buf, StringIO) + return csv_formatter.path_or_buf.getvalue() return None diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 
d0e9163fc5f11..106471d53e6a1 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,7 +5,7 @@ import csv as csvlib from io import StringIO, TextIOWrapper import os -from typing import Any, Dict, Hashable, Iterator, List, Optional, Sequence, Union +from typing import Any, Dict, Iterator, List, Optional, Sequence import numpy as np @@ -29,19 +29,15 @@ from pandas.core.indexes.api import Index from pandas.io.common import get_filepath_or_buffer, get_handle +from pandas.io.formats.format import DataFrameFormatter class CSVFormatter: def __init__( self, - obj, path_or_buf: Optional[FilePathOrBuffer[str]] = None, sep: str = ",", - na_rep: str = "", - float_format: Optional[str] = None, cols: Optional[Sequence[Label]] = None, - header: Union[bool, Sequence[Hashable]] = True, - index: bool = True, index_label: Optional[IndexLabel] = None, mode: str = "w", encoding: Optional[str] = None, @@ -54,10 +50,12 @@ def __init__( date_format: Optional[str] = None, doublequote: bool = True, escapechar: Optional[str] = None, - decimal=".", storage_options: StorageOptions = None, + formatter: DataFrameFormatter = None, ): - self.obj = obj + self.fmt = formatter + + self.obj = self.fmt.frame self.encoding = encoding or "utf-8" @@ -79,11 +77,11 @@ def __init__( self.mode = ioargs.mode self.sep = sep - self.na_rep = na_rep - self.float_format = float_format - self.decimal = decimal - self.header = header - self.index = index + self.na_rep = self.fmt.na_rep + self.float_format = self.fmt.float_format + self.decimal = self.fmt.decimal + self.header = self.fmt.header + self.index = self.fmt.index self.index_label = index_label self.errors = errors self.quoting = quoting or csvlib.QUOTE_MINIMAL From 878eed2542fff5ec3f1ee7e607d5d926c49bad53 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 21 Sep 2020 03:25:46 +0700 Subject: [PATCH 11/34] REF: create to_csv in DataFrameRenderer --- pandas/core/generic.py | 15 ++-------- pandas/io/formats/format.py | 56 
++++++++++++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 14 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bac15f690ce55..942e17f3c722b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4,7 +4,6 @@ from datetime import timedelta import functools import gc -from io import StringIO import json import operator import pickle @@ -3322,8 +3321,6 @@ def to_csv( """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() - from pandas.io.formats.csvs import CSVFormatter - formatter = DataFrameFormatter( frame=df, columns=columns, @@ -3334,7 +3331,7 @@ def to_csv( decimal=decimal, ) - csv_formatter = CSVFormatter( + return DataFrameRenderer(formatter).to_csv( path_or_buf, line_terminator=line_terminator, sep=sep, @@ -3342,7 +3339,7 @@ def to_csv( errors=errors, compression=compression, quoting=quoting, - cols=columns, + columns=columns, index_label=index_label, mode=mode, chunksize=chunksize, @@ -3351,15 +3348,7 @@ def to_csv( doublequote=doublequote, escapechar=escapechar, storage_options=storage_options, - formatter=formatter, ) - csv_formatter.save() - - if path_or_buf is None: - assert isinstance(csv_formatter.path_or_buf, StringIO) - return csv_formatter.path_or_buf.getvalue() - - return None # ---------------------------------------------------------------------- # Lookup Caching diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 6f67ff7df6eff..cf4e6d1ba6789 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -40,7 +40,13 @@ from pandas._libs.tslib import format_array_from_datetime from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.nattype import NaTType -from pandas._typing import FilePathOrBuffer, Label +from pandas._typing import ( + CompressionOptions, + FilePathOrBuffer, + IndexLabel, + Label, + StorageOptions, +) from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -1155,6 +1161,54 @@ 
def to_string( string = string_formatter.to_string() return self._get_result(string, buf=buf, encoding=encoding) + def to_csv( + self, + path_or_buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + sep: str = ",", + columns: Optional[Sequence[Label]] = None, + index_label: Optional[IndexLabel] = None, + mode: str = "w", + compression: CompressionOptions = "infer", + quoting: Optional[int] = None, + quotechar: str = '"', + line_terminator: Optional[str] = None, + chunksize: Optional[int] = None, + date_format: Optional[str] = None, + doublequote: bool = True, + escapechar: Optional[str] = None, + errors: str = "strict", + storage_options: StorageOptions = None, + ) -> Optional[str]: + from pandas.io.formats.csvs import CSVFormatter + + csv_formatter = CSVFormatter( + path_or_buf=path_or_buf, + line_terminator=line_terminator, + sep=sep, + encoding=encoding, + errors=errors, + compression=compression, + quoting=quoting, + cols=columns, + index_label=index_label, + mode=mode, + chunksize=chunksize, + quotechar=quotechar, + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar, + storage_options=storage_options, + formatter=self.fmt, + ) + csv_formatter.save() + + if path_or_buf is None: + assert isinstance(csv_formatter.path_or_buf, StringIO) + return csv_formatter.path_or_buf.getvalue() + + return None + def _get_result( self, string: str, From d87638b2784022d4da66dcf2336c6e12e38c15af Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 21 Sep 2020 03:33:31 +0700 Subject: [PATCH 12/34] LINT: imports and line breaks --- pandas/core/frame.py | 3 +-- pandas/io/formats/html.py | 8 ++------ 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b99e814ba4920..c5b218be089c6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -873,8 +873,7 @@ def to_string( line_width=line_width, ) return fmt.DataFrameRenderer(formatter).to_string( - buf=buf, - 
encoding=encoding, + buf=buf, encoding=encoding ) # ---------------------------------------------------------------------- diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 5fa1411617d20..65c7bb7ce8097 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -3,7 +3,7 @@ """ from textwrap import dedent -from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast +from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast from pandas._config import get_option @@ -12,11 +12,7 @@ from pandas import MultiIndex, option_context from pandas.io.common import is_url -from pandas.io.formats.format import ( - DataFrameFormatter, - buffer_put_lines, - get_level_lengths, -) +from pandas.io.formats.format import DataFrameFormatter, get_level_lengths from pandas.io.formats.printing import pprint_thing From 1292be5ee0c32486db1295eed3db5b2f058552b0 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 21 Sep 2020 03:41:23 +0700 Subject: [PATCH 13/34] REF: move StringFormatter to separate module New module suggested: pandas/io/formats/string.py --- pandas/io/formats/format.py | 188 +--------------------------------- pandas/io/formats/string.py | 195 ++++++++++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+), 186 deletions(-) create mode 100644 pandas/io/formats/string.py diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index cf4e6d1ba6789..daaf16d5710e4 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -905,172 +905,6 @@ def _get_column_name_list(self) -> List[str]: return names -class StringFormatter: - """Formatter for string representation of a dataframe.""" - - def __init__(self, fmt): - self.fmt = fmt - self.adj = fmt.adj - self.frame = fmt.frame - - def to_string(self) -> str: - text = self._get_string_representation() - if self.fmt.should_show_dimensions: - text = "".join([text, self.fmt.dimensions_info]) - return text - 
- def _get_strcols(self): - strcols = self.fmt.get_strcols() - if self.fmt.is_truncated: - strcols = self._insert_dot_separators(strcols) - return strcols - - def _get_string_representation(self) -> str: - if self.fmt.frame.empty: - return self._empty_info_line - - strcols = self._get_strcols() - - if self.fmt.line_width is None: - # no need to wrap around just print the whole frame - return self.adj.adjoin(1, *strcols) - - if self.fmt.need_to_wrap_around: - return self._join_multiline(*strcols) - - # max_cols == 0. Try to fit frame to terminal - return self._fit_strcols_to_terminal_width(strcols) - - @property - def _empty_info_line(self): - return ( - f"Empty {type(self.frame).__name__}\n" - f"Columns: {pprint_thing(self.frame.columns)}\n" - f"Index: {pprint_thing(self.frame.index)}" - ) - - def _insert_dot_separators(self, strcols: List[List[str]]) -> List[List[str]]: - str_index = self.fmt._get_formatted_index(self.fmt.tr_frame) - index_length = len(str_index) - - if self.fmt.is_truncated_horizontally: - strcols = self._insert_dot_separator_horizontal(strcols, index_length) - - if self.fmt.is_truncated_vertically: - strcols = self._insert_dot_separator_vertical(strcols, index_length) - - return strcols - - def _insert_dot_separator_horizontal( - self, strcols: List[List[str]], index_length: int - ) -> List[List[str]]: - strcols.insert(self.fmt.tr_col_num + 1, [" ..."] * index_length) - return strcols - - def _insert_dot_separator_vertical( - self, strcols: List[List[str]], index_length: int - ) -> List[List[str]]: - n_header_rows = index_length - len(self.fmt.tr_frame) - row_num = self.fmt.tr_row_num - for ix, col in enumerate(strcols): - cwidth = self.adj.len(col[row_num]) - - if self.fmt.is_truncated_horizontally: - is_dot_col = ix == self.fmt.tr_col_num + 1 - else: - is_dot_col = False - - if cwidth > 3 or is_dot_col: - dots = "..." - else: - dots = ".." 
- - if ix == 0: - dot_mode = "left" - elif is_dot_col: - cwidth = 4 - dot_mode = "right" - else: - dot_mode = "right" - - dot_str = self.adj.justify([dots], cwidth, mode=dot_mode)[0] - col.insert(row_num + n_header_rows, dot_str) - return strcols - - def _join_multiline(self, *args) -> str: - lwidth = self.fmt.line_width - adjoin_width = 1 - strcols = list(args) - if self.fmt.index: - idx = strcols.pop(0) - lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width - - col_widths = [ - np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 - for col in strcols - ] - - assert lwidth is not None - col_bins = _binify(col_widths, lwidth) - nbins = len(col_bins) - - if self.fmt.is_truncated_vertically: - assert self.fmt.max_rows_fitted is not None - nrows = self.fmt.max_rows_fitted + 1 - else: - nrows = len(self.frame) - - str_lst = [] - start = 0 - for i, end in enumerate(col_bins): - row = strcols[start:end] - if self.fmt.index: - row.insert(0, idx) - if nbins > 1: - if end <= len(strcols) and i < nbins - 1: - row.append([" \\"] + [" "] * (nrows - 1)) - else: - row.append([" "] * nrows) - str_lst.append(self.adj.adjoin(adjoin_width, *row)) - start = end - return "\n\n".join(str_lst) - - def _fit_strcols_to_terminal_width(self, strcols) -> str: - from pandas import Series - - lines = self.adj.adjoin(1, *strcols).split("\n") - max_len = Series(lines).str.len().max() - # plus truncate dot col - width, _ = get_terminal_size() - dif = max_len - width - # '+ 1' to avoid too wide repr (GH PR #17023) - adj_dif = dif + 1 - col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) - n_cols = len(col_lens) - counter = 0 - while adj_dif > 0 and n_cols > 1: - counter += 1 - mid = int(round(n_cols / 2.0)) - mid_ix = col_lens.index[mid] - col_len = col_lens[mid_ix] - # adjoin adds one - adj_dif -= col_len + 1 - col_lens = col_lens.drop(mid_ix) - n_cols = len(col_lens) - - # subtract index column - max_cols_fitted = n_cols - self.fmt.index - # 
GH-21180. Ensure that we print at least two. - max_cols_fitted = max(max_cols_fitted, 2) - self.fmt.max_cols_fitted = max_cols_fitted - - # Call again _truncate to cut frame appropriately - # and then generate string representation - self.fmt.truncate() - strcols = self._get_strcols() - return self.adj.adjoin(1, *strcols) - - class DataFrameRenderer: def __init__(self, fmt: DataFrameFormatter): self.fmt = fmt @@ -1157,6 +991,8 @@ def to_string( """ Render a DataFrame to a console-friendly tabular output. """ + from pandas.io.formats.string import StringFormatter + string_formatter = StringFormatter(self.fmt) string = string_formatter.to_string() return self._get_result(string, buf=buf, encoding=encoding) @@ -2080,26 +1916,6 @@ def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> Non set_option("display.column_space", max(12, accuracy + 9)) -def _binify(cols: List[int], line_width: int) -> List[int]: - adjoin_width = 1 - bins = [] - curr_width = 0 - i_last_column = len(cols) - 1 - for i, w in enumerate(cols): - w_adjoined = w + adjoin_width - curr_width += w_adjoined - if i_last_column == i: - wrap = curr_width + 1 > line_width and i > 0 - else: - wrap = curr_width + 2 > line_width and i > 0 - if wrap: - bins.append(i) - curr_width = w_adjoined - - bins.append(len(cols)) - return bins - - def get_level_lengths( levels: Any, sentinel: Union[bool, object, str] = "" ) -> List[Dict[int, int]]: diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py new file mode 100644 index 0000000000000..e09ef58fad146 --- /dev/null +++ b/pandas/io/formats/string.py @@ -0,0 +1,195 @@ +""" +Module for formatting output data in console (to string). 
+""" +from shutil import get_terminal_size +from typing import List + +import numpy as np + +from pandas.io.formats.printing import pprint_thing + + +class StringFormatter: + """Formatter for string representation of a dataframe.""" + + def __init__(self, fmt): + self.fmt = fmt + self.adj = fmt.adj + self.frame = fmt.frame + + def to_string(self) -> str: + text = self._get_string_representation() + if self.fmt.should_show_dimensions: + text = "".join([text, self.fmt.dimensions_info]) + return text + + def _get_strcols(self): + strcols = self.fmt.get_strcols() + if self.fmt.is_truncated: + strcols = self._insert_dot_separators(strcols) + return strcols + + def _get_string_representation(self) -> str: + if self.fmt.frame.empty: + return self._empty_info_line + + strcols = self._get_strcols() + + if self.fmt.line_width is None: + # no need to wrap around just print the whole frame + return self.adj.adjoin(1, *strcols) + + if self.fmt.need_to_wrap_around: + return self._join_multiline(*strcols) + + # max_cols == 0. 
Try to fit frame to terminal + return self._fit_strcols_to_terminal_width(strcols) + + @property + def _empty_info_line(self): + return ( + f"Empty {type(self.frame).__name__}\n" + f"Columns: {pprint_thing(self.frame.columns)}\n" + f"Index: {pprint_thing(self.frame.index)}" + ) + + def _insert_dot_separators(self, strcols: List[List[str]]) -> List[List[str]]: + str_index = self.fmt._get_formatted_index(self.fmt.tr_frame) + index_length = len(str_index) + + if self.fmt.is_truncated_horizontally: + strcols = self._insert_dot_separator_horizontal(strcols, index_length) + + if self.fmt.is_truncated_vertically: + strcols = self._insert_dot_separator_vertical(strcols, index_length) + + return strcols + + def _insert_dot_separator_horizontal( + self, strcols: List[List[str]], index_length: int + ) -> List[List[str]]: + strcols.insert(self.fmt.tr_col_num + 1, [" ..."] * index_length) + return strcols + + def _insert_dot_separator_vertical( + self, strcols: List[List[str]], index_length: int + ) -> List[List[str]]: + n_header_rows = index_length - len(self.fmt.tr_frame) + row_num = self.fmt.tr_row_num + for ix, col in enumerate(strcols): + cwidth = self.adj.len(col[row_num]) + + if self.fmt.is_truncated_horizontally: + is_dot_col = ix == self.fmt.tr_col_num + 1 + else: + is_dot_col = False + + if cwidth > 3 or is_dot_col: + dots = "..." + else: + dots = ".." 
+ + if ix == 0: + dot_mode = "left" + elif is_dot_col: + cwidth = 4 + dot_mode = "right" + else: + dot_mode = "right" + + dot_str = self.adj.justify([dots], cwidth, mode=dot_mode)[0] + col.insert(row_num + n_header_rows, dot_str) + return strcols + + def _join_multiline(self, *args) -> str: + lwidth = self.fmt.line_width + adjoin_width = 1 + strcols = list(args) + if self.fmt.index: + idx = strcols.pop(0) + lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width + + col_widths = [ + np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 + for col in strcols + ] + + assert lwidth is not None + col_bins = _binify(col_widths, lwidth) + nbins = len(col_bins) + + if self.fmt.is_truncated_vertically: + assert self.fmt.max_rows_fitted is not None + nrows = self.fmt.max_rows_fitted + 1 + else: + nrows = len(self.frame) + + str_lst = [] + start = 0 + for i, end in enumerate(col_bins): + row = strcols[start:end] + if self.fmt.index: + row.insert(0, idx) + if nbins > 1: + if end <= len(strcols) and i < nbins - 1: + row.append([" \\"] + [" "] * (nrows - 1)) + else: + row.append([" "] * nrows) + str_lst.append(self.adj.adjoin(adjoin_width, *row)) + start = end + return "\n\n".join(str_lst) + + def _fit_strcols_to_terminal_width(self, strcols) -> str: + from pandas import Series + + lines = self.adj.adjoin(1, *strcols).split("\n") + max_len = Series(lines).str.len().max() + # plus truncate dot col + width, _ = get_terminal_size() + dif = max_len - width + # '+ 1' to avoid too wide repr (GH PR #17023) + adj_dif = dif + 1 + col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) + n_cols = len(col_lens) + counter = 0 + while adj_dif > 0 and n_cols > 1: + counter += 1 + mid = int(round(n_cols / 2.0)) + mid_ix = col_lens.index[mid] + col_len = col_lens[mid_ix] + # adjoin adds one + adj_dif -= col_len + 1 + col_lens = col_lens.drop(mid_ix) + n_cols = len(col_lens) + + # subtract index column + max_cols_fitted = n_cols - self.fmt.index + # 
GH-21180. Ensure that we print at least two. + max_cols_fitted = max(max_cols_fitted, 2) + self.fmt.max_cols_fitted = max_cols_fitted + + # Call again _truncate to cut frame appropriately + # and then generate string representation + self.fmt.truncate() + strcols = self._get_strcols() + return self.adj.adjoin(1, *strcols) + + +def _binify(cols: List[int], line_width: int) -> List[int]: + adjoin_width = 1 + bins = [] + curr_width = 0 + i_last_column = len(cols) - 1 + for i, w in enumerate(cols): + w_adjoined = w + adjoin_width + curr_width += w_adjoined + if i_last_column == i: + wrap = curr_width + 1 > line_width and i > 0 + else: + wrap = curr_width + 2 > line_width and i > 0 + if wrap: + bins.append(i) + curr_width = w_adjoined + + bins.append(len(cols)) + return bins From 41553f687cd6e8c72571a923f460ca31b25ad975 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 21 Sep 2020 04:01:58 +0700 Subject: [PATCH 14/34] TYP: handle mypy errors after enabling composition --- pandas/core/generic.py | 3 +-- pandas/io/formats/csvs.py | 9 +++++++-- pandas/io/formats/string.py | 7 ++++--- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 942e17f3c722b..52cb96126265c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3163,7 +3163,7 @@ def to_csv( date_format: Optional[str] = None, doublequote: bool_t = True, escapechar: Optional[str] = None, - decimal: Optional[str] = ".", + decimal: str = ".", errors: str = "strict", storage_options: StorageOptions = None, ) -> Optional[str]: @@ -3323,7 +3323,6 @@ def to_csv( formatter = DataFrameFormatter( frame=df, - columns=columns, header=header, index=index, na_rep=na_rep, diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 106471d53e6a1..0ca680f243c32 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -35,6 +35,7 @@ class CSVFormatter: def __init__( self, + formatter: DataFrameFormatter, path_or_buf: 
Optional[FilePathOrBuffer[str]] = None, sep: str = ",", cols: Optional[Sequence[Label]] = None, @@ -51,7 +52,6 @@ def __init__( doublequote: bool = True, escapechar: Optional[str] = None, storage_options: StorageOptions = None, - formatter: DataFrameFormatter = None, ): self.fmt = formatter @@ -159,7 +159,12 @@ def _refine_cols(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: # update columns to include possible multiplicity of dupes # and make sure sure cols is just a list of labels - cols = self.obj.columns + + # Ignore mypy error + # Incompatible types in assignment + # (expression has type "Index", + # variable has type "Optional[Sequence[Optional[Hashable]]]") [assignment] + cols = self.obj.columns # type: ignore[assignment] if isinstance(cols, ABCIndexClass): return cols._format_native_types(**self._number_format) else: diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py index e09ef58fad146..83e818ae1cf94 100644 --- a/pandas/io/formats/string.py +++ b/pandas/io/formats/string.py @@ -6,13 +6,14 @@ import numpy as np +from pandas.io.formats.format import DataFrameFormatter from pandas.io.formats.printing import pprint_thing class StringFormatter: """Formatter for string representation of a dataframe.""" - def __init__(self, fmt): + def __init__(self, fmt: DataFrameFormatter): self.fmt = fmt self.adj = fmt.adj self.frame = fmt.frame @@ -23,7 +24,7 @@ def to_string(self) -> str: text = "".join([text, self.fmt.dimensions_info]) return text - def _get_strcols(self): + def _get_strcols(self) -> List[List[str]]: strcols = self.fmt.get_strcols() if self.fmt.is_truncated: strcols = self._insert_dot_separators(strcols) @@ -46,7 +47,7 @@ def _get_string_representation(self) -> str: return self._fit_strcols_to_terminal_width(strcols) @property - def _empty_info_line(self): + def _empty_info_line(self) -> str: return ( f"Empty {type(self.frame).__name__}\n" f"Columns: {pprint_thing(self.frame.columns)}\n" From 
a66ca5ef1942799399b5855a7ff2bdd80babf85c Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 21 Sep 2020 04:11:11 +0700 Subject: [PATCH 15/34] REF: remove non-existent parent in LatexFormatter --- pandas/io/formats/latex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index e57a78171f77b..2c9a9a5893b14 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -598,7 +598,7 @@ def env_end(self) -> str: return "\\end{tabular}" -class LatexFormatter(DataFrameFormatter): +class LatexFormatter: """ Used to render a DataFrame to a LaTeX tabular/longtable environment output. From 3fbe4ba3e9586df50841036e68e62aab42ef310c Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 21 Sep 2020 10:49:07 +0700 Subject: [PATCH 16/34] REF: move line_width to StringFormatter --- pandas/core/frame.py | 5 +++-- pandas/io/formats/format.py | 5 ++--- pandas/io/formats/string.py | 13 +++++++++---- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c5b218be089c6..c8dae9173da24 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -870,10 +870,11 @@ def to_string( max_cols=max_cols, show_dimensions=show_dimensions, decimal=decimal, - line_width=line_width, ) return fmt.DataFrameRenderer(formatter).to_string( - buf=buf, encoding=encoding + buf=buf, + encoding=encoding, + line_width=line_width, ) # ---------------------------------------------------------------------- diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index daaf16d5710e4..e97048986b05a 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -474,7 +474,6 @@ def __init__( float_format: Optional[FloatFormatType] = None, sparsify: Optional[bool] = None, index_names: bool = True, - line_width: Optional[int] = None, max_rows: Optional[int] = None, min_rows: Optional[int] = None, max_cols: Optional[int] = None, @@ -497,7 
+496,6 @@ def __init__( self.decimal = decimal self.bold_rows = bold_rows self.escape = escape - self.line_width = line_width self.max_rows = max_rows self.min_rows = min_rows self.max_cols = max_cols @@ -987,13 +985,14 @@ def to_string( self, buf: Optional[FilePathOrBuffer[str]] = None, encoding: Optional[str] = None, + line_width: Optional[int] = None, ) -> Optional[str]: """ Render a DataFrame to a console-friendly tabular output. """ from pandas.io.formats.string import StringFormatter - string_formatter = StringFormatter(self.fmt) + string_formatter = StringFormatter(self.fmt, line_width=line_width) string = string_formatter.to_string() return self._get_result(string, buf=buf, encoding=encoding) diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py index 83e818ae1cf94..e99f1649310e1 100644 --- a/pandas/io/formats/string.py +++ b/pandas/io/formats/string.py @@ -2,7 +2,7 @@ Module for formatting output data in console (to string). """ from shutil import get_terminal_size -from typing import List +from typing import List, Optional import numpy as np @@ -13,10 +13,15 @@ class StringFormatter: """Formatter for string representation of a dataframe.""" - def __init__(self, fmt: DataFrameFormatter): + def __init__( + self, + fmt: DataFrameFormatter, + line_width: Optional[int] = None, + ): self.fmt = fmt self.adj = fmt.adj self.frame = fmt.frame + self.line_width = line_width def to_string(self) -> str: text = self._get_string_representation() @@ -36,7 +41,7 @@ def _get_string_representation(self) -> str: strcols = self._get_strcols() - if self.fmt.line_width is None: + if self.line_width is None: # no need to wrap around just print the whole frame return self.adj.adjoin(1, *strcols) @@ -103,7 +108,7 @@ def _insert_dot_separator_vertical( return strcols def _join_multiline(self, *args) -> str: - lwidth = self.fmt.line_width + lwidth = self.line_width adjoin_width = 1 strcols = list(args) if self.fmt.index: From 733fa34a773d95495f5d6a6e89474ab8a0f32205 
Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 21 Sep 2020 11:53:01 +0700 Subject: [PATCH 17/34] REF: move need_to_wrap to StringFormatter --- pandas/io/formats/format.py | 4 ---- pandas/io/formats/string.py | 7 +++++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index e97048986b05a..7512b166b5f57 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -561,10 +561,6 @@ def show_col_idx_names(self) -> bool: def max_rows_displayed(self) -> int: return min(self.max_rows or len(self.frame), len(self.frame)) - @property - def need_to_wrap_around(self) -> bool: - return bool(self.max_cols is None or self.max_cols > 0) - def _initialize_sparsify(self, sparsify: Optional[bool]) -> bool: if sparsify is None: return get_option("display.multi_sparse") diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py index e99f1649310e1..9dae002ccc9e2 100644 --- a/pandas/io/formats/string.py +++ b/pandas/io/formats/string.py @@ -45,10 +45,9 @@ def _get_string_representation(self) -> str: # no need to wrap around just print the whole frame return self.adj.adjoin(1, *strcols) - if self.fmt.need_to_wrap_around: + if self._need_to_wrap_around: return self._join_multiline(*strcols) - # max_cols == 0. 
Try to fit frame to terminal return self._fit_strcols_to_terminal_width(strcols) @property @@ -59,6 +58,10 @@ def _empty_info_line(self) -> str: f"Index: {pprint_thing(self.frame.index)}" ) + @property + def _need_to_wrap_around(self) -> bool: + return bool(self.fmt.max_cols is None or self.fmt.max_cols > 0) + def _insert_dot_separators(self, strcols: List[List[str]]) -> List[List[str]]: str_index = self.fmt._get_formatted_index(self.fmt.tr_frame) index_length = len(str_index) From bfb37d7abd954d92ae1b4062387ebed13f00d064 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 21 Sep 2020 12:14:51 +0700 Subject: [PATCH 18/34] DOC: add docstrings to DataFrame.to_xxx methods --- pandas/io/formats/format.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 7512b166b5f57..685e94f423ad7 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -950,6 +950,10 @@ def to_html( Parameters ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + encoding : str, default “utf-8” + Set character encoding. classes : str or list-like classes to include in the `class` attribute of the opening ``
<table>`` tag, in addition to the default "dataframe". @@ -958,10 +962,10 @@ def to_html( border : int A ``border=border`` attribute is included in the opening ``<table>`` tag. Default ``pd.options.display.html.border``. - table_id - - render_links - + table_id : str, optional + A css id is included in the opening `<table>
` tag if specified. + render_links : bool, default False + Convert URLs to HTML links. """ from pandas.io.formats.html import HTMLFormatter, NotebookFormatter @@ -985,6 +989,15 @@ def to_string( ) -> Optional[str]: """ Render a DataFrame to a console-friendly tabular output. + + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + encoding: str, default “utf-8” + Set character encoding. + line_width : int, optional + Width to wrap a line in characters. """ from pandas.io.formats.string import StringFormatter @@ -1011,6 +1024,9 @@ def to_csv( errors: str = "strict", storage_options: StorageOptions = None, ) -> Optional[str]: + """ + Render dataframe as comma-separated file. + """ from pandas.io.formats.csvs import CSVFormatter csv_formatter = CSVFormatter( From f1b494e7d6194bde0b307bddefdbde7d7c279277 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 21 Sep 2020 15:59:50 +0700 Subject: [PATCH 19/34] CLN: to_string on top in HTMLFormatter --- pandas/io/formats/html.py | 34 +++++++++++++++++----------------- pandas/io/formats/string.py | 6 +----- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 65c7bb7ce8097..46e1d4951d43a 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -56,6 +56,23 @@ def __init__( for column, value in self.fmt.col_space.items() } + def to_string(self) -> str: + lines = self.render() + if any(isinstance(x, str) for x in lines): + lines = [str(x) for x in lines] + return "\n".join(lines) + + def render(self) -> List[str]: + self._write_table() + + if self.should_show_dimensions: + by = chr(215) # × + self.write( + f"

<p>{len(self.frame)} rows {by} {len(self.frame.columns)} columns</p>

" + ) + + return self.elements + @property def show_row_idx_names(self) -> bool: return self.fmt.show_row_idx_names @@ -184,23 +201,6 @@ def write_tr( indent -= indent_delta self.write("", indent) - def render(self) -> List[str]: - self._write_table() - - if self.should_show_dimensions: - by = chr(215) # × - self.write( - f"

<p>{len(self.frame)} rows {by} {len(self.frame.columns)} columns</p>

" - ) - - return self.elements - - def to_string(self) -> str: - lines = self.render() - if any(isinstance(x, str) for x in lines): - lines = [str(x) for x in lines] - return "\n".join(lines) - def _write_table(self, indent: int = 0) -> None: _classes = ["dataframe"] # Default class. use_mathjax = get_option("display.html.use_mathjax") diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py index 9dae002ccc9e2..03b3981874d04 100644 --- a/pandas/io/formats/string.py +++ b/pandas/io/formats/string.py @@ -13,11 +13,7 @@ class StringFormatter: """Formatter for string representation of a dataframe.""" - def __init__( - self, - fmt: DataFrameFormatter, - line_width: Optional[int] = None, - ): + def __init__(self, fmt: DataFrameFormatter, line_width: Optional[int] = None): self.fmt = fmt self.adj = fmt.adj self.frame = fmt.frame From 75daa74569ea41d61ce9ea6ed2e42b7112aae179 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 21 Sep 2020 18:14:00 +0700 Subject: [PATCH 20/34] LINT: black conv multiline -> oneline --- pandas/core/frame.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c8dae9173da24..eaf840ef20e04 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -872,9 +872,7 @@ def to_string( decimal=decimal, ) return fmt.DataFrameRenderer(formatter).to_string( - buf=buf, - encoding=encoding, - line_width=line_width, + buf=buf, encoding=encoding, line_width=line_width, ) # ---------------------------------------------------------------------- From 6e392779a96ccedea0651908657987d3424d4f7d Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 22 Sep 2020 19:06:22 +0700 Subject: [PATCH 21/34] DOC: add docstring to DataFrameRenderer --- pandas/io/formats/format.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 685e94f423ad7..568a0e1b53702 100644 --- a/pandas/io/formats/format.py +++ 
b/pandas/io/formats/format.py @@ -900,6 +900,22 @@ def _get_column_name_list(self) -> List[str]: class DataFrameRenderer: + """Class for creating dataframe output in multiple formats. + + Called in pandas.core.generic.NDFrame: + - to_csv + - to_latex + + Called in pandas.core.frame.DataFrame: + - to_html + - to_string + + Parameters + ---------- + fmt : DataFrameFormatter + Formatter with the formating options. + """ + def __init__(self, fmt: DataFrameFormatter): self.fmt = fmt From df3b5c6858b11db39ee68c2b0bd0540e0f55438c Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 22 Sep 2020 19:15:44 +0700 Subject: [PATCH 22/34] REF: move _get_result, _get_buffer to module level These methods do not depend on the object state, so were removed from the class to the module level. --- pandas/io/formats/format.py | 93 ++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 568a0e1b53702..4113098ea7847 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -949,7 +949,7 @@ def to_latex( position=position, ) string = latex_formatter.to_string() - return self._get_result(string, buf=buf, encoding=encoding) + return save_to_buffer(string, buf=buf, encoding=encoding) def to_html( self, @@ -995,7 +995,7 @@ def to_html( render_links=render_links, ) string = html_formatter.to_string() - return self._get_result(string, buf=buf, encoding=encoding) + return save_to_buffer(string, buf=buf, encoding=encoding) def to_string( self, @@ -1019,7 +1019,7 @@ def to_string( string_formatter = StringFormatter(self.fmt, line_width=line_width) string = string_formatter.to_string() - return self._get_result(string, buf=buf, encoding=encoding) + return save_to_buffer(string, buf=buf, encoding=encoding) def to_csv( self, @@ -1072,50 +1072,49 @@ def to_csv( return None - def _get_result( - self, - string: str, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: 
Optional[str] = None, - ) -> Optional[str]: - """ - Perform serialization. Write to buf or return as string if buf is None. - """ - with self._get_buffer(buf, encoding=encoding) as f: - f.write(string) - if buf is None: - return f.getvalue() - return None - - @contextmanager - def _get_buffer( - self, buf: Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None - ): - """ - Context manager to open, yield and close buffer for filenames or Path-like - objects, otherwise yield buf unchanged. - """ - if buf is not None: - buf = stringify_path(buf) - else: - buf = StringIO() - - if encoding is None: - encoding = "utf-8" - elif not isinstance(buf, str): - raise ValueError("buf is not a file name and encoding is specified.") - - if hasattr(buf, "write"): - yield buf - elif isinstance(buf, str): - with open(buf, "w", encoding=encoding, newline="") as f: - # GH#30034 open instead of codecs.open prevents a file leak - # if we have an invalid encoding argument. - # newline="" is needed to roundtrip correctly on - # windows test_to_latex_filename - yield f - else: - raise TypeError("buf is not a file name and it has no write method") + +def save_to_buffer( + string: str, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, +) -> Optional[str]: + """ + Perform serialization. Write to buf or return as string if buf is None. + """ + with get_buffer(buf, encoding=encoding) as f: + f.write(string) + if buf is None: + return f.getvalue() + return None + + +@contextmanager +def get_buffer(buf: Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None): + """ + Context manager to open, yield and close buffer for filenames or Path-like + objects, otherwise yield buf unchanged. 
+ """ + if buf is not None: + buf = stringify_path(buf) + else: + buf = StringIO() + + if encoding is None: + encoding = "utf-8" + elif not isinstance(buf, str): + raise ValueError("buf is not a file name and encoding is specified.") + + if hasattr(buf, "write"): + yield buf + elif isinstance(buf, str): + with open(buf, "w", encoding=encoding, newline="") as f: + # GH#30034 open instead of codecs.open prevents a file leak + # if we have an invalid encoding argument. + # newline="" is needed to roundtrip correctly on + # windows test_to_latex_filename + yield f + else: + raise TypeError("buf is not a file name and it has no write method") # ---------------------------------------------------------------------- From 5a18386bbba10e6e3de6d737042c8634ed1084bc Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 22 Sep 2020 19:41:19 +0700 Subject: [PATCH 23/34] REF: replace setters with initializer methods --- pandas/io/formats/csvs.py | 68 ++++++++++++--------------------------- 1 file changed, 21 insertions(+), 47 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 0ca680f243c32..ebd68085c19ab 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -83,29 +83,28 @@ def __init__( self.header = self.fmt.header self.index = self.fmt.index self.index_label = index_label + self.index_label = self._initialize_index_label(index_label) self.errors = errors self.quoting = quoting or csvlib.QUOTE_MINIMAL - self.quotechar = quotechar + self.quotechar = self._initialize_quotechar(quotechar) self.doublequote = doublequote self.escapechar = escapechar self.line_terminator = line_terminator or os.linesep self.date_format = date_format - self.cols = cols # type: ignore[assignment] - self.chunksize = chunksize # type: ignore[assignment] + self.cols = self._initialize_columns(cols) + self.chunksize = self._initialize_chunksize(chunksize) @property - def index_label(self) -> IndexLabel: - return self._index_label - @index_label.setter 
- def index_label(self, index_label: Optional[IndexLabel]) -> None: + + def _initialize_index_label(self, index_label: Optional[IndexLabel]) -> IndexLabel: if index_label is not False: if index_label is None: - index_label = self._get_index_label_from_obj() + return self._get_index_label_from_obj() elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndexClass)): # given a string for a DF with Index - index_label = [index_label] - self._index_label = index_label + return [index_label] + return index_label def _get_index_label_from_obj(self) -> List[str]: if isinstance(self.obj.index, ABCMultiIndex): @@ -120,30 +119,16 @@ def _get_index_label_flat(self) -> List[str]: index_label = self.obj.index.name return [""] if index_label is None else [index_label] - @property - def quotechar(self) -> Optional[str]: + def _initialize_quotechar(self, quotechar: Optional[str]) -> Optional[str]: if self.quoting != csvlib.QUOTE_NONE: # prevents crash in _csv - return self._quotechar - return None - - @quotechar.setter - def quotechar(self, quotechar: Optional[str]) -> None: - self._quotechar = quotechar + return quotechar @property def has_mi_columns(self) -> bool: return bool(isinstance(self.obj.columns, ABCMultiIndex)) - @property - def cols(self) -> Sequence[Label]: - return self._cols - - @cols.setter - def cols(self, cols: Optional[Sequence[Label]]) -> None: - self._cols = self._refine_cols(cols) - - def _refine_cols(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: + def _initialize_columns(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: # validate mi options if self.has_mi_columns: if cols is not None: @@ -159,17 +144,17 @@ def _refine_cols(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: # update columns to include possible multiplicity of dupes # and make sure sure cols is just a list of labels - - # Ignore mypy error - # Incompatible types in assignment - # (expression has type "Index", - # variable has type 
"Optional[Sequence[Optional[Hashable]]]") [assignment] - cols = self.obj.columns # type: ignore[assignment] - if isinstance(cols, ABCIndexClass): - return cols._format_native_types(**self._number_format) + new_cols = self.obj.columns + if isinstance(new_cols, ABCIndexClass): + return new_cols._format_native_types(**self._number_format) else: assert isinstance(cols, Sequence) - return list(cols) + return list(new_cols) + + def _initialize_chunksize(self, chunksize: Optional[int]) -> int: + if chunksize is None: + return (100000 // (len(self.cols) or 1)) or 1 + return int(chunksize) @property def _number_format(self) -> Dict[str, Any]: @@ -182,17 +167,6 @@ def _number_format(self) -> Dict[str, Any]: decimal=self.decimal, ) - @property - def chunksize(self) -> int: - return self._chunksize - - @chunksize.setter - def chunksize(self, chunksize: Optional[int]) -> None: - if chunksize is None: - chunksize = (100000 // (len(self.cols) or 1)) or 1 - assert chunksize is not None - self._chunksize = int(chunksize) - @property def data_index(self) -> Index: data_index = self.obj.index From fc68fa5cbd1f6dd0ddf4e6a1bbfbc4223f88f4b5 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 22 Sep 2020 19:42:21 +0700 Subject: [PATCH 24/34] REF: extract properties to make composition clear --- pandas/io/formats/csvs.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index ebd68085c19ab..2eb52117af324 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -77,12 +77,6 @@ def __init__( self.mode = ioargs.mode self.sep = sep - self.na_rep = self.fmt.na_rep - self.float_format = self.fmt.float_format - self.decimal = self.fmt.decimal - self.header = self.fmt.header - self.index = self.fmt.index - self.index_label = index_label self.index_label = self._initialize_index_label(index_label) self.errors = errors self.quoting = quoting or csvlib.QUOTE_MINIMAL @@ -95,7 +89,24 @@ 
def __init__( self.chunksize = self._initialize_chunksize(chunksize) @property + def na_rep(self): + return self.fmt.na_rep + @property + def float_format(self): + return self.fmt.float_format + + @property + def decimal(self): + return self.fmt.decimal + + @property + def header(self): + return self.fmt.header + + @property + def index(self): + return self.fmt.index def _initialize_index_label(self, index_label: Optional[IndexLabel]) -> IndexLabel: if index_label is not False: @@ -123,6 +134,7 @@ def _initialize_quotechar(self, quotechar: Optional[str]) -> Optional[str]: if self.quoting != csvlib.QUOTE_NONE: # prevents crash in _csv return quotechar + return None @property def has_mi_columns(self) -> bool: @@ -148,7 +160,6 @@ def _initialize_columns(self, cols: Optional[Sequence[Label]]) -> Sequence[Label if isinstance(new_cols, ABCIndexClass): return new_cols._format_native_types(**self._number_format) else: - assert isinstance(cols, Sequence) return list(new_cols) def _initialize_chunksize(self, chunksize: Optional[int]) -> int: From 19d21561ed16452fb88645fa02790391a4976a2b Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 22 Sep 2020 23:58:17 +0700 Subject: [PATCH 25/34] REF: eliminate inheritance for HTMLFormatter --- pandas/io/formats/html.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 46e1d4951d43a..b4f7e3922f02f 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -16,7 +16,7 @@ from pandas.io.formats.printing import pprint_thing -class HTMLFormatter(DataFrameFormatter): +class HTMLFormatter: """ Internal class for formatting output data in html. 
This class is intended for shared functionality between @@ -73,6 +73,10 @@ def render(self) -> List[str]: return self.elements + @property + def should_show_dimensions(self): + return self.fmt.should_show_dimensions + @property def show_row_idx_names(self) -> bool: return self.fmt.show_row_idx_names From 271ef5c96a57ca92b5b0a7d3614714167077aba6 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 22 Sep 2020 23:59:14 +0700 Subject: [PATCH 26/34] LINT: new black --- pandas/core/frame.py | 4 +++- pandas/io/formats/format.py | 4 ---- pandas/io/formats/latex.py | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index eaf840ef20e04..c8dae9173da24 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -872,7 +872,9 @@ def to_string( decimal=decimal, ) return fmt.DataFrameRenderer(formatter).to_string( - buf=buf, encoding=encoding, line_width=line_width, + buf=buf, + encoding=encoding, + line_width=line_width, ) # ---------------------------------------------------------------------- diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 4113098ea7847..db7bd9f8d417c 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1301,10 +1301,6 @@ def _format(x): class FloatArrayFormatter(GenericArrayFormatter): - """ - - """ - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 2c9a9a5893b14..5a927c96624c4 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -41,8 +41,8 @@ def __init__( self.multirow = multirow self.clinebuf: List[List[int]] = [] self.strcols = self._get_strcols() - self.strrows: List[List[str]] = ( - list(zip(*self.strcols)) # type: ignore[arg-type] + self.strrows: List[List[str]] = list( + zip(*self.strcols) # type: ignore[arg-type] ) def get_strrow(self, row_num: int) -> str: From 22d098259f39143bb186fb054aeb38ae3d9e49e9 Mon 
Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 23 Sep 2020 14:52:40 +0700 Subject: [PATCH 27/34] TYP: type properties in CSVFormatter --- pandas/io/formats/csvs.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 2eb52117af324..578e241aa7cc9 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,7 +5,7 @@ import csv as csvlib from io import StringIO, TextIOWrapper import os -from typing import Any, Dict, Iterator, List, Optional, Sequence +from typing import Any, Dict, Iterator, List, Optional, Sequence, Union import numpy as np @@ -29,7 +29,10 @@ from pandas.core.indexes.api import Index from pandas.io.common import get_filepath_or_buffer, get_handle -from pandas.io.formats.format import DataFrameFormatter +from pandas.io.formats.format import ( + DataFrameFormatter, + FloatFormatType, +) class CSVFormatter: @@ -89,23 +92,23 @@ def __init__( self.chunksize = self._initialize_chunksize(chunksize) @property - def na_rep(self): + def na_rep(self) -> str: return self.fmt.na_rep @property - def float_format(self): + def float_format(self) -> Optional[FloatFormatType]: return self.fmt.float_format @property - def decimal(self): + def decimal(self) -> str: return self.fmt.decimal @property - def header(self): + def header(self) -> Union[bool, Sequence[str]]: return self.fmt.header @property - def index(self): + def index(self) -> bool: return self.fmt.index def _initialize_index_label(self, index_label: Optional[IndexLabel]) -> IndexLabel: From 94dbadd12c00ca3748c95e095d642391658513f1 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 23 Sep 2020 15:02:24 +0700 Subject: [PATCH 28/34] TYP: type strcols --- pandas/io/formats/string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py index 03b3981874d04..366ab11099e8d 100644 --- a/pandas/io/formats/string.py +++ 
b/pandas/io/formats/string.py @@ -144,7 +144,7 @@ def _join_multiline(self, *args) -> str: start = end return "\n\n".join(str_lst) - def _fit_strcols_to_terminal_width(self, strcols) -> str: + def _fit_strcols_to_terminal_width(self, strcols: List[List[str]]) -> str: from pandas import Series lines = self.adj.adjoin(1, *strcols).split("\n") From 914981b550f8ae39d135977f41e1e542aeb9b49c Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 23 Sep 2020 15:06:10 +0700 Subject: [PATCH 29/34] TYP: _join_multiline --- pandas/io/formats/string.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py index 366ab11099e8d..64c0de6d7a9f5 100644 --- a/pandas/io/formats/string.py +++ b/pandas/io/formats/string.py @@ -106,10 +106,10 @@ def _insert_dot_separator_vertical( col.insert(row_num + n_header_rows, dot_str) return strcols - def _join_multiline(self, *args) -> str: + def _join_multiline(self, *strcols: List[str]) -> str: lwidth = self.line_width adjoin_width = 1 - strcols = list(args) + strcols = list(strcols) if self.fmt.index: idx = strcols.pop(0) lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width From 482ccd111ad6cae4c07bffe32c2f6f23f25033c1 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 23 Sep 2020 15:16:15 +0700 Subject: [PATCH 30/34] LINT: sort in one line --- pandas/io/formats/csvs.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 578e241aa7cc9..74069e56826c2 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -29,10 +29,7 @@ from pandas.core.indexes.api import Index from pandas.io.common import get_filepath_or_buffer, get_handle -from pandas.io.formats.format import ( - DataFrameFormatter, - FloatFormatType, -) +from pandas.io.formats.format import DataFrameFormatter, FloatFormatType class CSVFormatter: From 7b57fc8c2d5028fc4bbf5ef9d9f8e0908b35d510 Mon Sep 17 
00:00:00 2001 From: Maxim Ivanov Date: Wed, 23 Sep 2020 16:18:44 +0700 Subject: [PATCH 31/34] REF: _join_multiline to accept single arg --- pandas/io/formats/string.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py index 64c0de6d7a9f5..9cee064299824 100644 --- a/pandas/io/formats/string.py +++ b/pandas/io/formats/string.py @@ -42,7 +42,7 @@ def _get_string_representation(self) -> str: return self.adj.adjoin(1, *strcols) if self._need_to_wrap_around: - return self._join_multiline(*strcols) + return self._join_multiline(strcols) return self._fit_strcols_to_terminal_width(strcols) @@ -106,10 +106,9 @@ def _insert_dot_separator_vertical( col.insert(row_num + n_header_rows, dot_str) return strcols - def _join_multiline(self, *strcols: List[str]) -> str: + def _join_multiline(self, strcols: List[List[str]]) -> str: lwidth = self.line_width adjoin_width = 1 - strcols = list(strcols) if self.fmt.index: idx = strcols.pop(0) lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width From 1e2969f40c52b6a2459bbb8dea716b2f9efa2b73 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 23 Sep 2020 17:00:33 +0700 Subject: [PATCH 32/34] REF: eliminate mutation in _join_multiline --- pandas/io/formats/string.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py index 9cee064299824..4ebb78f29c739 100644 --- a/pandas/io/formats/string.py +++ b/pandas/io/formats/string.py @@ -2,7 +2,7 @@ Module for formatting output data in console (to string). 
""" from shutil import get_terminal_size -from typing import List, Optional +from typing import Iterable, List, Optional import numpy as np @@ -106,9 +106,11 @@ def _insert_dot_separator_vertical( col.insert(row_num + n_header_rows, dot_str) return strcols - def _join_multiline(self, strcols: List[List[str]]) -> str: + def _join_multiline(self, strcols_input: Iterable[List[str]]) -> str: lwidth = self.line_width adjoin_width = 1 + strcols = list(strcols_input) + if self.fmt.index: idx = strcols.pop(0) lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width From 1335a11c75faa2a1b761ce4fed5bd72ee8894d26 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 20 Oct 2020 21:27:23 +0700 Subject: [PATCH 33/34] TYP: extract imports for typing only in csvs --- pandas/io/formats/csvs.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 74069e56826c2..ff27dd72b89e2 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,7 +5,7 @@ import csv as csvlib from io import StringIO, TextIOWrapper import os -from typing import Any, Dict, Iterator, List, Optional, Sequence, Union +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Sequence, Union import numpy as np @@ -29,13 +29,15 @@ from pandas.core.indexes.api import Index from pandas.io.common import get_filepath_or_buffer, get_handle -from pandas.io.formats.format import DataFrameFormatter, FloatFormatType + +if TYPE_CHECKING: + from pandas.io.formats.format import DataFrameFormatter, FloatFormatType class CSVFormatter: def __init__( self, - formatter: DataFrameFormatter, + formatter: "DataFrameFormatter", path_or_buf: Optional[FilePathOrBuffer[str]] = None, sep: str = ",", cols: Optional[Sequence[Label]] = None, @@ -93,7 +95,7 @@ def na_rep(self) -> str: return self.fmt.na_rep @property - def float_format(self) -> Optional[FloatFormatType]: + def float_format(self) -> 
Optional["FloatFormatType"]: return self.fmt.float_format @property From b67b481b9a3df2feb415074ad68f2bd469d299c1 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 20 Oct 2020 21:31:23 +0700 Subject: [PATCH 34/34] TYP: move FloatFormatType alias to _typing --- pandas/_typing.py | 6 ++++++ pandas/io/formats/csvs.py | 3 ++- pandas/io/formats/format.py | 3 ++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 7678d1bf12d8b..a9177106535fc 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -38,6 +38,8 @@ from pandas.core.indexes.base import Index from pandas.core.series import Series + from pandas.io.formats.format import EngFormatter + # array-like AnyArrayLike = TypeVar("AnyArrayLike", "ExtensionArray", "Index", "Series", np.ndarray) @@ -127,6 +129,10 @@ EncodingVar = TypeVar("EncodingVar", str, None, Optional[str]) +# type of float formatter in DataFrameFormatter +FloatFormatType = Union[str, Callable, "EngFormatter"] + + @dataclass class IOargs(Generic[ModeVar, EncodingVar]): """ diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index ff27dd72b89e2..6c62d6825bc84 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -13,6 +13,7 @@ from pandas._typing import ( CompressionOptions, FilePathOrBuffer, + FloatFormatType, IndexLabel, Label, StorageOptions, @@ -31,7 +32,7 @@ from pandas.io.common import get_filepath_or_buffer, get_handle if TYPE_CHECKING: - from pandas.io.formats.format import DataFrameFormatter, FloatFormatType + from pandas.io.formats.format import DataFrameFormatter class CSVFormatter: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index cc9e590a440cf..6f4bd2ed8c73a 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -42,6 +42,7 @@ from pandas._typing import ( CompressionOptions, FilePathOrBuffer, + FloatFormatType, IndexLabel, Label, StorageOptions, @@ -80,10 +81,10 @@ if TYPE_CHECKING: from pandas 
import Categorical, DataFrame, Series + FormattersType = Union[ List[Callable], Tuple[Callable, ...], Mapping[Union[str, int], Callable] ] -FloatFormatType = Union[str, Callable, "EngFormatter"] ColspaceType = Mapping[Label, Union[str, int]] ColspaceArgType = Union[ str, int, Sequence[Union[str, int]], Mapping[Label, Union[str, int]]