Merge branch 'issue-index' of https://github.com/xaris96/pandas into issue-index

pelagiavlas · pelagiavlas · commit b1e014e3f5e2 · 2025-05-21T12:37:26.000+03:00
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -120,6 +120,7 @@ Timezones
 Numeric
 ^^^^^^^
 - Enabled :class:`Series.mode` and :class:`DataFrame.mode` with ``dropna=False`` to sort the result for all dtypes in the presence of NA values; previously only certain dtypes would sort (:issue:`60702`)
+- Bug in :meth:`Series.round` on object dtype now raises a ``TypeError`` with an informative message (``Expected numeric dtype, got object instead.``) (:issue:`61206`)
 -
 
 Conversion
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -779,6 +779,7 @@ I/O
 - Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`)
 - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
 - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`)
+- Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`)
 - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py
@@ -239,7 +239,8 @@ def stringify(value):
             if conv_val not in metadata:
                 result = -1
             else:
-                result = metadata.searchsorted(conv_val, side="left")
+                # Find the index of the first match of conv_val in metadata
+                result = np.flatnonzero(metadata == conv_val)[0]
             return TermValue(result, result, "integer")
         elif kind == "integer":
             try:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -9258,11 +9258,11 @@ def groupby(
 
         Parameters
         ----------%s
-        columns : str or object or a list of str
+        columns : Hashable or a sequence of the previous
             Column to use to make new frame's columns.
-        index : str or object or a list of str, optional
+        index : Hashable or a sequence of the previous, optional
             Column to use to make new frame's index. If not given, uses existing index.
-        values : str, object or a list of the previous, optional
+        values : Hashable or a sequence of the previous, optional
             Column(s) to use for populating new frame's values. If not
             specified, all remaining columns will be used and the result will
             have hierarchically indexed columns.
@@ -9401,12 +9401,12 @@ def pivot(
         ----------%s
         values : list-like or scalar, optional
             Column or columns to aggregate.
-        index : column, Grouper, array, or list of the previous
+        index : column, Grouper, array, or sequence of the previous
             Keys to group by on the pivot table index. If a list is passed,
             it can contain any of the other types (except list). If an array is
             passed, it must be the same length as the data and will be used in
             the same manner as column values.
-        columns : column, Grouper, array, or list of the previous
+        columns : column, Grouper, array, or sequence of the previous
             Keys to group by on the pivot table column. If a list is passed,
             it can contain any of the other types (except list). If an array is
             passed, it must be the same length as the data and will be used in
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -1731,10 +1731,16 @@ def name(self) -> Hashable:
         """
         Return Index or MultiIndex name.
 
+        Returns
+        -------
+        label (hashable object)
+            The name of the Index.
+
         See Also
         --------
         Index.set_names: Able to set new names partially and by level.
         Index.rename: Able to set new names partially and by level.
+        Series.name: Corresponding Series property.
 
         Examples
         --------
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
@@ -76,12 +76,12 @@ def pivot_table(
         Input pandas DataFrame object.
     values : list-like or scalar, optional
         Column or columns to aggregate.
-    index : column, Grouper, array, or list of the previous
+    index : column, Grouper, array, or sequence of the previous
         Keys to group by on the pivot table index. If a list is passed,
         it can contain any of the other types (except list). If an array is
         passed, it must be the same length as the data and will be used in
         the same manner as column values.
-    columns : column, Grouper, array, or list of the previous
+    columns : column, Grouper, array, or sequence of the previous
         Keys to group by on the pivot table column. If a list is passed,
         it can contain any of the other types (except list). If an array is
         passed, it must be the same length as the data and will be used in
@@ -708,11 +708,11 @@ def pivot(
     ----------
     data : DataFrame
         Input pandas DataFrame object.
-    columns : str or object or a list of str
+    columns : Hashable or a sequence of the previous
         Column to use to make new frame's columns.
-    index : str or object or a list of str, optional
+    index : Hashable or a sequence of the previous, optional
         Column to use to make new frame's index. If not given, uses existing index.
-    values : str, object or a list of the previous, optional
+    values : Hashable or a sequence of the previous, optional
         Column(s) to use for populating new frame's values. If not
         specified, all remaining columns will be used and the result will
         have hierarchically indexed columns.
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2514,6 +2514,8 @@ def round(self, decimals: int = 0, *args, **kwargs) -> Series:
         dtype: float64
         """
         nv.validate_round(args, kwargs)
+        if self.dtype == "object":
+            raise TypeError("Expected numeric dtype, got object instead.")
         new_mgr = self._mgr.round(decimals=decimals)
         return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(
             self, method="round"
diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py
@@ -464,7 +464,9 @@ def test_sort_values_invalid_na_position(
     reason="Sorting fails due to heterogeneous types in index (int vs str)",
     strict=False,
 )
-def test_sort_values_with_missing(index_with_missing, na_position, request, box_in_series):
+def test_sort_values_with_missing(
+    index_with_missing, na_position, request, box_in_series
+):
     # GH 35584. Test that sort_values works with missing values,
     # sort non-missing and place missing according to na_position
 
@@ -489,7 +491,6 @@ def test_sort_values_with_missing(index_with_missing, na_position, request, box_
     else:
         not_na_vals = index_with_missing[index_with_missing.notna()].values
 
-
     sorted_values = np.sort(not_na_vals)
     if na_position == "first":
         sorted_values = np.concatenate([[None] * missing_count, sorted_values])
@@ -500,7 +501,9 @@ def test_sort_values_with_missing(index_with_missing, na_position, request, box_
     if isinstance(index_with_missing, pd.Series):
         expected = pd.Series(sorted_values, dtype=index_with_missing.dtype)
     else:
-        expected = type(index_with_missing)(sorted_values, dtype=index_with_missing.dtype)
+        expected = type(index_with_missing)(
+            sorted_values, dtype=index_with_missing.dtype
+        )
 
     result = index_with_missing.sort_values(na_position=na_position)
     if isinstance(index_with_missing, pd.Series):
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
@@ -23,6 +23,9 @@
     timedelta_range,
 )
 import pandas._testing as tm
+from pandas.api.types import (
+    CategoricalDtype,
+)
 from pandas.tests.io.pytables.common import (
     _maybe_remove,
     ensure_clean_store,
@@ -1107,3 +1110,23 @@ def test_store_bool_index(tmp_path, setup_path):
     df.to_hdf(path, key="a")
     result = read_hdf(path, "a")
     tm.assert_frame_equal(expected, result)
+
+
+@pytest.mark.parametrize("model", ["name", "longname", "verylongname"])
+def test_select_categorical_string_columns(tmp_path, model):
+    # GH 57608: HDFStore.select on a categorical string data column
+    # returned unexpected results
+
+    path = tmp_path / "test.h5"
+
+    models = CategoricalDtype(categories=["name", "longname", "verylongname"])
+    df = DataFrame(
+        {"modelId": ["name", "longname", "longname"], "value": [1, 2, 3]}
+    ).astype({"modelId": models, "value": int})
+
+    with HDFStore(path, "w") as store:
+        store.append("df", df, data_columns=["modelId"])
+
+    with HDFStore(path, "r") as store:
+        result = store.select("df", "modelId == model")
+        expected = df[df["modelId"] == model]
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_round.py b/pandas/tests/series/methods/test_round.py
@@ -72,3 +72,10 @@ def test_round_ea_boolean(self):
         tm.assert_series_equal(result, expected)
         result.iloc[0] = False
         tm.assert_series_equal(ser, expected)
+
+    def test_round_dtype_object(self):
+        # GH#61206
+        ser = Series([0.2], dtype="object")
+        msg = "Expected numeric dtype, got object instead."
+        with pytest.raises(TypeError, match=msg):
+            ser.round()

Original file line number	Diff line number	Diff line change
`@@ -120,6 +120,7 @@ Timezones`
`120`	`120`	`Numeric`
`121`	`121`	`^^^^^^^`
`122`	`122`	- Enabled :class:`Series.mode` and :class:`DataFrame.mode` with ``dropna=False`` to sort the result for all dtypes in the presence of NA values; previously only certain dtypes would sort (:issue:`60702`)
	`123`	+- Bug in :meth:`Series.round` on object dtype now raises a ``TypeError`` with an informative message (``Expected numeric dtype, got object instead.``) (:issue:`61206`)
`123`	`124`	`-`
`124`	`125`
`125`	`126`	`Conversion`