From b21d198a956cd4b4568a995692c9ac3396dd61f5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 21 May 2024 13:16:46 -0700 Subject: [PATCH 1/5] Use iterator in internals.pyx --- pandas/_libs/internals.pyx | 16 ++++++---------- pandas/io/sql.py | 7 ++++--- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 05c4e7bd5e9dc..41a346c83a758 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -445,21 +445,18 @@ def get_concat_blkno_indexers(list blknos_list not None): # we have the blknos for each of several BlockManagers # list[np.ndarray[int64_t]] cdef: - Py_ssize_t i, j, k, start, ncols + Py_ssize_t i, j, k, ncols, start = 0 cnp.npy_intp n_mgrs ndarray[intp_t] blknos, cur_blknos, run_blknos BlockPlacement bp - list result = [] - - n_mgrs = len(blknos_list) - cur_blknos = cnp.PyArray_EMPTY(1, &n_mgrs, cnp.NPY_INTP, 0) blknos = blknos_list[0] ncols = len(blknos) if ncols == 0: - return [] + return - start = 0 + n_mgrs = len(blknos_list) + cur_blknos = cnp.PyArray_EMPTY(1, &n_mgrs, cnp.NPY_INTP, 0) for i in range(n_mgrs): blknos = blknos_list[i] cur_blknos[i] = blknos[0] @@ -476,7 +473,7 @@ def get_concat_blkno_indexers(list blknos_list not None): if blknos[i] != blknos[i - 1]: bp = BlockPlacement(slice(start, i)) run_blknos = cnp.PyArray_Copy(cur_blknos) - result.append((run_blknos, bp)) + yield run_blknos, bp start = i for j in range(n_mgrs): @@ -487,8 +484,7 @@ def get_concat_blkno_indexers(list blknos_list not None): if start != ncols: bp = BlockPlacement(slice(start, ncols)) run_blknos = cnp.PyArray_Copy(cur_blknos) - result.append((run_blknos, bp)) - return result + yield run_blknos, bp @cython.boundscheck(False) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index c0007c5e7d78c..399851790afc7 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -157,6 +157,7 @@ def _convert_arrays_to_dataframe( dtype_backend: 
DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame: content = lib.to_object_array_tuples(data) + idx_len = content.shape[1] arrays = convert_object_array( list(content.T), dtype=None, @@ -177,9 +178,9 @@ def _convert_arrays_to_dataframe( result_arrays.append(ArrowExtensionArray(pa_array)) arrays = result_arrays # type: ignore[assignment] if arrays: - df = DataFrame(dict(zip(range(len(columns)), arrays))) - df.columns = columns - return df + return DataFrame._from_arrays( + arrays, columns=columns, index=range(idx_len), verify_integrity=False + ) else: return DataFrame(columns=columns) From f7d67b9578ff1934e232b11c1015912f475c476a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 21 May 2024 14:50:57 -0700 Subject: [PATCH 2/5] Make list comps lazier --- pandas/core/apply.py | 2 +- pandas/core/generic.py | 10 +++++--- pandas/io/excel/_base.py | 19 +++++++------- pandas/io/excel/_odfreader.py | 36 ++++++++++++--------------- pandas/io/excel/_xlrd.py | 11 +++----- pandas/io/parsers/c_parser_wrapper.py | 2 +- 6 files changed, 38 insertions(+), 42 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 32e8aea7ea8ab..25836e967e948 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -664,7 +664,7 @@ def _apply_str(self, obj, func: str, *args, **kwargs): # people may aggregate on a non-callable attribute # but don't let them think they can pass args to it assert len(args) == 0 - assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0 + assert not any(kwarg != "axis" for kwarg in kwargs) return f elif hasattr(np, func) and hasattr(obj, "__array__"): # in particular exclude Window diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0e91aa23fcdb4..80e8bb343d852 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1750,11 +1750,13 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike if `key` matches multiple 
labels """ axis = self._get_axis_number(axis) - other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + first_other_axes = next( + (ax for ax in range(self._AXIS_LEN) if ax != axis), None + ) if self._is_label_reference(key, axis=axis): self._check_label_or_level_ambiguity(key, axis=axis) - values = self.xs(key, axis=other_axes[0])._values + values = self.xs(key, axis=first_other_axes)._values elif self._is_level_reference(key, axis=axis): values = self.axes[axis].get_level_values(key)._values else: @@ -1762,7 +1764,9 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike # Check for duplicates if values.ndim > 1: - if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex): + if first_other_axes is not None and isinstance( + self._get_axis(first_other_axes), MultiIndex + ): multi_message = ( "\n" "For a multi-index, the label must be a " diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index dd06c597c1857..1eb22d4ee9de7 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -857,24 +857,23 @@ def _parse_sheet( # a row containing just the index name(s) has_index_names = False if is_list_header and not is_len_one_list_header and index_col is not None: - index_col_list: Sequence[int] + index_col_set: set[int] if isinstance(index_col, int): - index_col_list = [index_col] + index_col_set = {index_col} else: assert isinstance(index_col, Sequence) - index_col_list = index_col + index_col_set = set(index_col) # We have to handle mi without names. 
If any of the entries in the data # columns are not empty, this is a regular row assert isinstance(header, Sequence) if len(header) < len(data): potential_index_names = data[len(header)] - potential_data = [ - x + has_index_names = all( + x == "" or x is None for i, x in enumerate(potential_index_names) - if not control_row[i] and i not in index_col_list - ] - has_index_names = all(x == "" or x is None for x in potential_data) + if not control_row[i] and i not in index_col_set + ) if is_list_like(index_col): # Forward fill values for MultiIndex index. @@ -1457,9 +1456,9 @@ def inspect_excel_format( with zipfile.ZipFile(stream) as zf: # Workaround for some third party files that use forward slashes and # lower case names. - component_names = [ + component_names = { name.replace("\\", "/").lower() for name in zf.namelist() - ] + } if "xl/workbook.xml" in component_names: return "xlsx" diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 69b514da32857..f79417d11080d 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -122,29 +122,25 @@ def get_sheet_data( table: list[list[Scalar | NaTType]] = [] for sheet_row in sheet_rows: - sheet_cells = [ - x - for x in sheet_row.childNodes - if hasattr(x, "qname") and x.qname in cell_names - ] empty_cells = 0 table_row: list[Scalar | NaTType] = [] - for sheet_cell in sheet_cells: - if sheet_cell.qname == table_cell_name: - value = self._get_cell_value(sheet_cell) - else: - value = self.empty_value - - column_repeat = self._get_column_repeat(sheet_cell) - - # Queue up empty values, writing only if content succeeds them - if value == self.empty_value: - empty_cells += column_repeat - else: - table_row.extend([self.empty_value] * empty_cells) - empty_cells = 0 - table_row.extend([value] * column_repeat) + for sheet_cell in sheet_row.childNodes: + if hasattr(sheet_cell, "qname") and sheet_cell.qname in cell_names: + if sheet_cell.qname == table_cell_name: + value = 
self._get_cell_value(sheet_cell) + else: + value = self.empty_value + + column_repeat = self._get_column_repeat(sheet_cell) + + # Queue up empty values, writing only if content succeeds them + if value == self.empty_value: + empty_cells += column_repeat + else: + table_row.extend([self.empty_value] * empty_cells) + empty_cells = 0 + table_row.extend([value] * column_repeat) if max_row_len < len(table_row): max_row_len = len(table_row) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index a444970792e6e..5d39a840336eb 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -128,16 +128,13 @@ def _parse_cell(cell_contents, cell_typ): cell_contents = val return cell_contents - data = [] - nrows = sheet.nrows if file_rows_needed is not None: nrows = min(nrows, file_rows_needed) - for i in range(nrows): - row = [ + return [ + [ _parse_cell(value, typ) for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) ] - data.append(row) - - return data + for i in range(nrows) + ] diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 4de626288aa41..b39a29b1190b5 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -360,7 +360,7 @@ def _concatenate_chunks( The tricky part is handling Categoricals, where different chunks may have different inferred categories. 
""" - names = list(chunks[0].keys()) + names = chunks[0].keys() warning_columns = [] result: dict = {} From ec4ce4442061d3edbd17f7d9837f3e2aba331dfc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 22 May 2024 11:15:09 -0700 Subject: [PATCH 3/5] fix some things --- pandas/_libs/internals.pyx | 16 ++++++++++------ pandas/io/parsers/c_parser_wrapper.py | 2 +- pandas/io/sql.py | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 41a346c83a758..05c4e7bd5e9dc 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -445,18 +445,21 @@ def get_concat_blkno_indexers(list blknos_list not None): # we have the blknos for each of several BlockManagers # list[np.ndarray[int64_t]] cdef: - Py_ssize_t i, j, k, ncols, start = 0 + Py_ssize_t i, j, k, start, ncols cnp.npy_intp n_mgrs ndarray[intp_t] blknos, cur_blknos, run_blknos BlockPlacement bp + list result = [] + + n_mgrs = len(blknos_list) + cur_blknos = cnp.PyArray_EMPTY(1, &n_mgrs, cnp.NPY_INTP, 0) blknos = blknos_list[0] ncols = len(blknos) if ncols == 0: - return + return [] - n_mgrs = len(blknos_list) - cur_blknos = cnp.PyArray_EMPTY(1, &n_mgrs, cnp.NPY_INTP, 0) + start = 0 for i in range(n_mgrs): blknos = blknos_list[i] cur_blknos[i] = blknos[0] @@ -473,7 +476,7 @@ def get_concat_blkno_indexers(list blknos_list not None): if blknos[i] != blknos[i - 1]: bp = BlockPlacement(slice(start, i)) run_blknos = cnp.PyArray_Copy(cur_blknos) - yield run_blknos, bp + result.append((run_blknos, bp)) start = i for j in range(n_mgrs): @@ -484,7 +487,8 @@ def get_concat_blkno_indexers(list blknos_list not None): if start != ncols: bp = BlockPlacement(slice(start, ncols)) run_blknos = cnp.PyArray_Copy(cur_blknos) - yield run_blknos, bp + result.append((run_blknos, bp)) + return result @cython.boundscheck(False) diff --git a/pandas/io/parsers/c_parser_wrapper.py 
b/pandas/io/parsers/c_parser_wrapper.py index b39a29b1190b5..4de626288aa41 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -360,7 +360,7 @@ def _concatenate_chunks( The tricky part is handling Categoricals, where different chunks may have different inferred categories. """ - names = chunks[0].keys() + names = list(chunks[0].keys()) warning_columns = [] result: dict = {} diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 399851790afc7..874320f08fb75 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -157,7 +157,7 @@ def _convert_arrays_to_dataframe( dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame: content = lib.to_object_array_tuples(data) - idx_len = content.shape[1] + idx_len = content.shape[0] arrays = convert_object_array( list(content.T), dtype=None, From 85413740ef5c9a19415c81bf32a78132ab3b586d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 22 May 2024 11:52:30 -0700 Subject: [PATCH 4/5] Add handling for mypy --- pandas/core/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 80e8bb343d852..46606c5f6003e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1753,6 +1753,8 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike first_other_axes = next( (ax for ax in range(self._AXIS_LEN) if ax != axis), None ) + if first_other_axes is None: + raise ValueError("axis matched all axes") if self._is_label_reference(key, axis=axis): self._check_label_or_level_ambiguity(key, axis=axis) @@ -1764,9 +1766,7 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike # Check for duplicates if values.ndim > 1: - if first_other_axes is not None and isinstance( - self._get_axis(first_other_axes), MultiIndex - ): + if isinstance(self._get_axis(first_other_axes), MultiIndex): 
multi_message = ( "\n" "For a multi-index, the label must be a " From 5af03ef607e7bd512f72639f33452ff32067d030 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 22 May 2024 13:08:16 -0700 Subject: [PATCH 5/5] Move check to more specific location --- pandas/core/generic.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 46606c5f6003e..ca60ca9b48a14 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1753,11 +1753,11 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike first_other_axes = next( (ax for ax in range(self._AXIS_LEN) if ax != axis), None ) - if first_other_axes is None: - raise ValueError("axis matched all axes") if self._is_label_reference(key, axis=axis): self._check_label_or_level_ambiguity(key, axis=axis) + if first_other_axes is None: + raise ValueError("axis matched all axes") values = self.xs(key, axis=first_other_axes)._values elif self._is_level_reference(key, axis=axis): values = self.axes[axis].get_level_values(key)._values @@ -1766,7 +1766,9 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike # Check for duplicates if values.ndim > 1: - if isinstance(self._get_axis(first_other_axes), MultiIndex): + if first_other_axes is not None and isinstance( + self._get_axis(first_other_axes), MultiIndex + ): multi_message = ( "\n" "For a multi-index, the label must be a "