From b21d198a956cd4b4568a995692c9ac3396dd61f5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 21 May 2024 13:16:46 -0700 Subject: [PATCH 1/5] Use iterator in internals.pyx --- pandas/_libs/internals.pyx | 16 ++++++---------- pandas/io/sql.py | 7 ++++--- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 05c4e7bd5e9dc..41a346c83a758 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -445,21 +445,18 @@ def get_concat_blkno_indexers(list blknos_list not None): # we have the blknos for each of several BlockManagers # list[np.ndarray[int64_t]] cdef: - Py_ssize_t i, j, k, start, ncols + Py_ssize_t i, j, k, ncols, start = 0 cnp.npy_intp n_mgrs ndarray[intp_t] blknos, cur_blknos, run_blknos BlockPlacement bp - list result = [] - - n_mgrs = len(blknos_list) - cur_blknos = cnp.PyArray_EMPTY(1, &n_mgrs, cnp.NPY_INTP, 0) blknos = blknos_list[0] ncols = len(blknos) if ncols == 0: - return [] + return - start = 0 + n_mgrs = len(blknos_list) + cur_blknos = cnp.PyArray_EMPTY(1, &n_mgrs, cnp.NPY_INTP, 0) for i in range(n_mgrs): blknos = blknos_list[i] cur_blknos[i] = blknos[0] @@ -476,7 +473,7 @@ def get_concat_blkno_indexers(list blknos_list not None): if blknos[i] != blknos[i - 1]: bp = BlockPlacement(slice(start, i)) run_blknos = cnp.PyArray_Copy(cur_blknos) - result.append((run_blknos, bp)) + yield run_blknos, bp start = i for j in range(n_mgrs): @@ -487,8 +484,7 @@ def get_concat_blkno_indexers(list blknos_list not None): if start != ncols: bp = BlockPlacement(slice(start, ncols)) run_blknos = cnp.PyArray_Copy(cur_blknos) - result.append((run_blknos, bp)) - return result + yield run_blknos, bp @cython.boundscheck(False) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index c0007c5e7d78c..399851790afc7 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -157,6 +157,7 @@ def _convert_arrays_to_dataframe( dtype_backend: 
DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame: content = lib.to_object_array_tuples(data) + idx_len = content.shape[1] arrays = convert_object_array( list(content.T), dtype=None, @@ -177,9 +178,9 @@ def _convert_arrays_to_dataframe( result_arrays.append(ArrowExtensionArray(pa_array)) arrays = result_arrays # type: ignore[assignment] if arrays: - df = DataFrame(dict(zip(range(len(columns)), arrays))) - df.columns = columns - return df + return DataFrame._from_arrays( + arrays, columns=columns, index=range(idx_len), verify_integrity=False + ) else: return DataFrame(columns=columns) From f7d67b9578ff1934e232b11c1015912f475c476a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 21 May 2024 14:50:57 -0700 Subject: [PATCH 2/5] Make list comps lazier --- pandas/core/apply.py | 2 +- pandas/core/generic.py | 10 +++++--- pandas/io/excel/_base.py | 19 +++++++------- pandas/io/excel/_odfreader.py | 36 ++++++++++++--------------- pandas/io/excel/_xlrd.py | 11 +++----- pandas/io/parsers/c_parser_wrapper.py | 2 +- 6 files changed, 38 insertions(+), 42 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 32e8aea7ea8ab..25836e967e948 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -664,7 +664,7 @@ def _apply_str(self, obj, func: str, *args, **kwargs): # people may aggregate on a non-callable attribute # but don't let them think they can pass args to it assert len(args) == 0 - assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0 + assert not any(kwarg != "axis" for kwarg in kwargs) return f elif hasattr(np, func) and hasattr(obj, "__array__"): # in particular exclude Window diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0e91aa23fcdb4..80e8bb343d852 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1750,11 +1750,13 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike if `key` matches multiple 
labels """ axis = self._get_axis_number(axis) - other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + first_other_axes = next( + (ax for ax in range(self._AXIS_LEN) if ax != axis), None + ) if self._is_label_reference(key, axis=axis): self._check_label_or_level_ambiguity(key, axis=axis) - values = self.xs(key, axis=other_axes[0])._values + values = self.xs(key, axis=first_other_axes)._values elif self._is_level_reference(key, axis=axis): values = self.axes[axis].get_level_values(key)._values else: @@ -1762,7 +1764,9 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike # Check for duplicates if values.ndim > 1: - if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex): + if first_other_axes is not None and isinstance( + self._get_axis(first_other_axes), MultiIndex + ): multi_message = ( "\n" "For a multi-index, the label must be a " diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index dd06c597c1857..1eb22d4ee9de7 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -857,24 +857,23 @@ def _parse_sheet( # a row containing just the index name(s) has_index_names = False if is_list_header and not is_len_one_list_header and index_col is not None: - index_col_list: Sequence[int] + index_col_set: set[int] if isinstance(index_col, int): - index_col_list = [index_col] + index_col_set = {index_col} else: assert isinstance(index_col, Sequence) - index_col_list = index_col + index_col_set = set(index_col) # We have to handle mi without names. 
If any of the entries in the data # columns are not empty, this is a regular row assert isinstance(header, Sequence) if len(header) < len(data): potential_index_names = data[len(header)] - potential_data = [ - x + has_index_names = all( + x == "" or x is None for i, x in enumerate(potential_index_names) - if not control_row[i] and i not in index_col_list - ] - has_index_names = all(x == "" or x is None for x in potential_data) + if not control_row[i] and i not in index_col_set + ) if is_list_like(index_col): # Forward fill values for MultiIndex index. @@ -1457,9 +1456,9 @@ def inspect_excel_format( with zipfile.ZipFile(stream) as zf: # Workaround for some third party files that use forward slashes and # lower case names. - component_names = [ + component_names = { name.replace("\\", "/").lower() for name in zf.namelist() - ] + } if "xl/workbook.xml" in component_names: return "xlsx" diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 69b514da32857..f79417d11080d 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -122,29 +122,25 @@ def get_sheet_data( table: list[list[Scalar | NaTType]] = [] for sheet_row in sheet_rows: - sheet_cells = [ - x - for x in sheet_row.childNodes - if hasattr(x, "qname") and x.qname in cell_names - ] empty_cells = 0 table_row: list[Scalar | NaTType] = [] - for sheet_cell in sheet_cells: - if sheet_cell.qname == table_cell_name: - value = self._get_cell_value(sheet_cell) - else: - value = self.empty_value - - column_repeat = self._get_column_repeat(sheet_cell) - - # Queue up empty values, writing only if content succeeds them - if value == self.empty_value: - empty_cells += column_repeat - else: - table_row.extend([self.empty_value] * empty_cells) - empty_cells = 0 - table_row.extend([value] * column_repeat) + for sheet_cell in sheet_row.childNodes: + if hasattr(sheet_cell, "qname") and sheet_cell.qname in cell_names: + if sheet_cell.qname == table_cell_name: + value = 
self._get_cell_value(sheet_cell) + else: + value = self.empty_value + + column_repeat = self._get_column_repeat(sheet_cell) + + # Queue up empty values, writing only if content succeeds them + if value == self.empty_value: + empty_cells += column_repeat + else: + table_row.extend([self.empty_value] * empty_cells) + empty_cells = 0 + table_row.extend([value] * column_repeat) if max_row_len < len(table_row): max_row_len = len(table_row) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index a444970792e6e..5d39a840336eb 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -128,16 +128,13 @@ def _parse_cell(cell_contents, cell_typ): cell_contents = val return cell_contents - data = [] - nrows = sheet.nrows if file_rows_needed is not None: nrows = min(nrows, file_rows_needed) - for i in range(nrows): - row = [ + return [ + [ _parse_cell(value, typ) for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) ] - data.append(row) - - return data + for i in range(nrows) + ] diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 4de626288aa41..b39a29b1190b5 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -360,7 +360,7 @@ def _concatenate_chunks( The tricky part is handling Categoricals, where different chunks may have different inferred categories. 
""" - names = list(chunks[0].keys()) + names = chunks[0].keys() warning_columns = [] result: dict = {} From ec4ce4442061d3edbd17f7d9837f3e2aba331dfc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 22 May 2024 11:15:09 -0700 Subject: [PATCH 3/5] fix some things --- pandas/_libs/internals.pyx | 16 ++++++++++------ pandas/io/parsers/c_parser_wrapper.py | 2 +- pandas/io/sql.py | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 41a346c83a758..05c4e7bd5e9dc 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -445,18 +445,21 @@ def get_concat_blkno_indexers(list blknos_list not None): # we have the blknos for each of several BlockManagers # list[np.ndarray[int64_t]] cdef: - Py_ssize_t i, j, k, ncols, start = 0 + Py_ssize_t i, j, k, start, ncols cnp.npy_intp n_mgrs ndarray[intp_t] blknos, cur_blknos, run_blknos BlockPlacement bp + list result = [] + + n_mgrs = len(blknos_list) + cur_blknos = cnp.PyArray_EMPTY(1, &n_mgrs, cnp.NPY_INTP, 0) blknos = blknos_list[0] ncols = len(blknos) if ncols == 0: - return + return [] - n_mgrs = len(blknos_list) - cur_blknos = cnp.PyArray_EMPTY(1, &n_mgrs, cnp.NPY_INTP, 0) + start = 0 for i in range(n_mgrs): blknos = blknos_list[i] cur_blknos[i] = blknos[0] @@ -473,7 +476,7 @@ def get_concat_blkno_indexers(list blknos_list not None): if blknos[i] != blknos[i - 1]: bp = BlockPlacement(slice(start, i)) run_blknos = cnp.PyArray_Copy(cur_blknos) - yield run_blknos, bp + result.append((run_blknos, bp)) start = i for j in range(n_mgrs): @@ -484,7 +487,8 @@ def get_concat_blkno_indexers(list blknos_list not None): if start != ncols: bp = BlockPlacement(slice(start, ncols)) run_blknos = cnp.PyArray_Copy(cur_blknos) - yield run_blknos, bp + result.append((run_blknos, bp)) + return result @cython.boundscheck(False) diff --git a/pandas/io/parsers/c_parser_wrapper.py 
b/pandas/io/parsers/c_parser_wrapper.py index b39a29b1190b5..4de626288aa41 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -360,7 +360,7 @@ def _concatenate_chunks( The tricky part is handling Categoricals, where different chunks may have different inferred categories. """ - names = chunks[0].keys() + names = list(chunks[0].keys()) warning_columns = [] result: dict = {} diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 399851790afc7..874320f08fb75 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -157,7 +157,7 @@ def _convert_arrays_to_dataframe( dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame: content = lib.to_object_array_tuples(data) - idx_len = content.shape[1] + idx_len = content.shape[0] arrays = convert_object_array( list(content.T), dtype=None, From 85413740ef5c9a19415c81bf32a78132ab3b586d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 22 May 2024 11:52:30 -0700 Subject: [PATCH 4/5] Add handling for mypy --- pandas/core/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 80e8bb343d852..46606c5f6003e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1753,6 +1753,8 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike first_other_axes = next( (ax for ax in range(self._AXIS_LEN) if ax != axis), None ) + if first_other_axes is None: + raise ValueError("axis matched all axes") if self._is_label_reference(key, axis=axis): self._check_label_or_level_ambiguity(key, axis=axis) @@ -1764,9 +1766,7 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike # Check for duplicates if values.ndim > 1: - if first_other_axes is not None and isinstance( - self._get_axis(first_other_axes), MultiIndex - ): + if isinstance(self._get_axis(first_other_axes), MultiIndex): 
multi_message = ( "\n" "For a multi-index, the label must be a " From 5af03ef607e7bd512f72639f33452ff32067d030 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 22 May 2024 13:08:16 -0700 Subject: [PATCH 5/5] Move check to more specific location --- pandas/core/generic.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 46606c5f6003e..ca60ca9b48a14 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1753,11 +1753,11 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike first_other_axes = next( (ax for ax in range(self._AXIS_LEN) if ax != axis), None ) - if first_other_axes is None: - raise ValueError("axis matched all axes") if self._is_label_reference(key, axis=axis): self._check_label_or_level_ambiguity(key, axis=axis) + if first_other_axes is None: + raise ValueError("axis matched all axes") values = self.xs(key, axis=first_other_axes)._values elif self._is_level_reference(key, axis=axis): values = self.axes[axis].get_level_values(key)._values @@ -1766,7 +1766,9 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike # Check for duplicates if values.ndim > 1: - if isinstance(self._get_axis(first_other_axes), MultiIndex): + if first_other_axes is not None and isinstance( + self._get_axis(first_other_axes), MultiIndex + ): multi_message = ( "\n" "For a multi-index, the label must be a "