diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 719afe160614f..7294efe843cce 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -4,6 +4,7 @@ from copy import copy import csv from enum import Enum +import itertools from typing import ( TYPE_CHECKING, Any, @@ -271,7 +272,7 @@ def _maybe_make_multi_index_columns( @final def _make_index( - self, data, alldata, columns, indexnamerow: list[Scalar] | None = None + self, alldata, columns, indexnamerow: list[Scalar] | None = None ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]: index: Index | None if isinstance(self.index_col, list) and len(self.index_col): @@ -326,7 +327,11 @@ def _agg_index(self, index) -> Index: converters = self._clean_mapping(self.converters) clean_dtypes = self._clean_mapping(self.dtype) - for i, arr in enumerate(index): + if self.index_names is not None: + names: Iterable = self.index_names + else: + names = itertools.cycle([None]) + for i, (arr, name) in enumerate(zip(index, names)): if self._should_parse_dates(i): arr = date_converter( arr, @@ -369,12 +374,17 @@ def _agg_index(self, index) -> Index: arr, _ = self._infer_types( arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool ) - arrays.append(arr) - - names = self.index_names - index = ensure_index_from_sequences(arrays, names) + if cast_type is not None: + # Don't perform RangeIndex inference + idx = Index(arr, name=name, dtype=cast_type) + else: + idx = ensure_index_from_sequences([arr], [name]) + arrays.append(idx) - return index + if len(arrays) == 1: + return arrays[0] + else: + return MultiIndex.from_arrays(arrays) @final def _set_noconvert_dtype_columns( @@ -704,12 +714,11 @@ def _get_empty_meta( dtype_dict: defaultdict[Hashable, Any] if not is_dict_like(dtype): # if dtype == None, default will be object. - default_dtype = dtype or object - dtype_dict = defaultdict(lambda: default_dtype) + dtype_dict = defaultdict(lambda: dtype) else: dtype = cast(dict, dtype) dtype_dict = defaultdict( - lambda: object, + lambda: None, {columns[k] if is_integer(k) else k: v for k, v in dtype.items()}, ) @@ -726,8 +735,14 @@ def _get_empty_meta( if (index_col is None or index_col is False) or index_names is None: index = default_index(0) else: - data = [Series([], dtype=dtype_dict[name]) for name in index_names] - index = ensure_index_from_sequences(data, names=index_names) + # TODO: We could return default_index(0) if dtype_dict[name] is None + data = [ + Index([], name=name, dtype=dtype_dict[name]) for name in index_names + ] + if len(data) == 1: + index = data[0] + else: + index = MultiIndex.from_arrays(data) index_col.sort() for i, n in enumerate(index_col): diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index f4198ac2a1443..818c9f5ff6b80 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -338,7 +338,7 @@ def read( data = {k: v for k, (i, v) in zip(names, data_tups)} date_data = self._do_date_conversions(names, data) - index, column_names = self._make_index(date_data, alldata, names) + index, column_names = self._make_index(alldata, names) return index, column_names, date_data diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index c445529a6db48..3a2a1c37f1879 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -312,9 +312,7 @@ def read( conv_data = self._convert_data(data) conv_data = self._do_date_conversions(columns, conv_data) - index, result_columns = self._make_index( - conv_data, alldata, columns, indexnamerow - ) + index, result_columns = self._make_index(alldata, columns, indexnamerow) return index, result_columns, conv_data diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index ba928abcb30ad..23a467b0bb952 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -27,6 +27,8 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @@ -607,6 +609,7 @@ def test_string_inference_object_dtype(all_parsers, dtype): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 data = """SYMBOL,MOMENT,ID,ID_DEAL @@ -617,7 +620,7 @@ def test_accurate_parsing_of_large_integers(all_parsers): AMZN,20230301181139587,2023552585717889759,2023552585717263360 MSFT,20230301181139587,2023552585717889863,2023552585717263361 NVDA,20230301181139587,2023552585717889827,2023552585717263361""" - orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) + orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1 assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1 assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2 @@ -639,3 +642,16 @@ def test_dtypes_with_usecols(all_parsers): values = ["1", "4"] expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]}) tm.assert_frame_equal(result, expected) + + +def test_index_col_with_dtype_no_rangeindex(all_parsers): + data = StringIO("345.5,519.5,0\n519.5,726.5,1") + result = all_parsers.read_csv( + data, + header=None, + names=["start", "stop", "bin_id"], + dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32}, + index_col="bin_id", + ).index + expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id") + tm.assert_index_equal(result, expected)