Skip to content

BUG: Correct read_stata #251

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits on Sep 27, 2022
46 changes: 0 additions & 46 deletions pandas-stubs/core/frame.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -633,37 +633,6 @@ class DataFrame(NDFrame, OpsMixin):
axis: AxisType = ...,
fill_value: Hashable | None = ...,
) -> DataFrame: ...
@overload
def set_index(
self,
keys: Label
| Series
| Index
| np.ndarray
| Iterator[HashableT]
| list[HashableT],
drop: _bool = ...,
append: _bool = ...,
verify_integrity: _bool = ...,
*,
inplace: Literal[True],
) -> None: ...
@overload
def set_index(
self,
keys: Label
| Series
| Index
| np.ndarray
| Iterator[HashableT]
| list[HashableT],
drop: _bool = ...,
append: _bool = ...,
verify_integrity: _bool = ...,
*,
inplace: Literal[False],
) -> DataFrame: ...
@overload
def set_index(
self,
keys: Label
Expand All @@ -674,24 +643,9 @@ class DataFrame(NDFrame, OpsMixin):
| list[HashableT],
drop: _bool = ...,
append: _bool = ...,
*,
verify_integrity: _bool = ...,
) -> DataFrame: ...
@overload
def set_index(
self,
keys: Label
| Series
| Index
| np.ndarray
| Iterator[HashableT]
| list[HashableT],
drop: _bool = ...,
append: _bool = ...,
inplace: _bool | None = ...,
verify_integrity: _bool = ...,
) -> DataFrame | None: ...
@overload
def reset_index(
self,
level: Level | Sequence[Level] = ...,
Expand Down
26 changes: 14 additions & 12 deletions pandas-stubs/io/stata.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ from pandas._typing import (

@overload
def read_stata(
path: FilePath | ReadBuffer[bytes],
filepath_or_buffer: FilePath | ReadBuffer[bytes],
*,
convert_dates: bool = ...,
convert_categoricals: bool = ...,
index_col: str | None = ...,
Expand All @@ -34,29 +35,30 @@ def read_stata(
columns: list[HashableT] | None = ...,
order_categoricals: bool = ...,
chunksize: int | None = ...,
*,
iterator: Literal[True],
compression: CompressionOptions = ...,
storage_options: StorageOptions = ...,
) -> StataReader: ...
@overload
def read_stata(
path: FilePath | ReadBuffer[bytes],
convert_dates: bool,
convert_categoricals: bool,
index_col: str | None,
convert_missing: bool,
preserve_dtypes: bool,
columns: list[HashableT] | None,
order_categoricals: bool,
chunksize: int | None,
iterator: Literal[True],
filepath_or_buffer: FilePath | ReadBuffer[bytes],
*,
convert_dates: bool = ...,
convert_categoricals: bool = ...,
index_col: str | None = ...,
convert_missing: bool = ...,
preserve_dtypes: bool = ...,
columns: list[HashableT] | None = ...,
order_categoricals: bool = ...,
chunksize: int,
iterator: bool = ...,
compression: CompressionOptions = ...,
storage_options: StorageOptions = ...,
) -> StataReader: ...
@overload
def read_stata(
path: FilePath | ReadBuffer[bytes],
*,
convert_dates: bool = ...,
convert_categoricals: bool = ...,
index_col: str | None = ...,
Expand Down
18 changes: 12 additions & 6 deletions tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,6 @@ def test_types_set_index() -> None:
res3: pd.DataFrame = df.set_index("col1", append=True)
res4: pd.DataFrame = df.set_index("col1", verify_integrity=True)
res5: pd.DataFrame = df.set_index(["col1", "col2"])
res6: None = df.set_index("col1", inplace=True)
# GH 140
res7: pd.DataFrame = df.set_index(pd.Index(["w", "x", "y", "z"]))

Expand Down Expand Up @@ -1505,27 +1504,31 @@ def test_frame_scalars_slice() -> None:

# Note: bool_ cannot be tested since the index is object and pandas does not
# support boolean access using loc except when the index is boolean
check(assert_type(df.loc[str_], pd.Series), pd.Series)
with pytest.warns(FutureWarning, match="Comparison of Timestamp with datetime"):
check(assert_type(df.loc[str_], pd.Series), pd.Series)
check(assert_type(df.loc[bytes_], pd.Series), pd.Series)
check(assert_type(df.loc[date], pd.Series), pd.Series)
check(assert_type(df.loc[datetime_], pd.Series), pd.Series)
check(assert_type(df.loc[timedelta], pd.Series), pd.Series)
check(assert_type(df.loc[int_], pd.Series), pd.Series)
check(assert_type(df.loc[float_], pd.Series), pd.Series)
check(assert_type(df.loc[complex_], pd.Series), pd.Series)
check(assert_type(df.loc[timestamp], pd.Series), pd.Series)
with pytest.warns(FutureWarning, match="Comparison of Timestamp with datetime"):
check(assert_type(df.loc[timestamp], pd.Series), pd.Series)
check(assert_type(df.loc[pd_timedelta], pd.Series), pd.Series)
check(assert_type(df.loc[none], pd.Series), pd.Series)

check(assert_type(df.loc[:, str_], pd.Series), pd.Series)
with pytest.warns(FutureWarning, match="Comparison of Timestamp with datetime"):
check(assert_type(df.loc[:, str_], pd.Series), pd.Series)
check(assert_type(df.loc[:, bytes_], pd.Series), pd.Series)
check(assert_type(df.loc[:, date], pd.Series), pd.Series)
check(assert_type(df.loc[:, datetime_], pd.Series), pd.Series)
check(assert_type(df.loc[:, timedelta], pd.Series), pd.Series)
check(assert_type(df.loc[:, int_], pd.Series), pd.Series)
check(assert_type(df.loc[:, float_], pd.Series), pd.Series)
check(assert_type(df.loc[:, complex_], pd.Series), pd.Series)
check(assert_type(df.loc[:, timestamp], pd.Series), pd.Series)
with pytest.warns(FutureWarning, match="Comparison of Timestamp with datetime"):
check(assert_type(df.loc[:, timestamp], pd.Series), pd.Series)
check(assert_type(df.loc[:, pd_timedelta], pd.Series), pd.Series)
check(assert_type(df.loc[:, none], pd.Series), pd.Series)

Expand Down Expand Up @@ -1635,7 +1638,10 @@ def sample_to_df(x: pd.DataFrame) -> pd.DataFrame:
return x.sample()

check(
assert_type(df.groupby("col1").apply(sample_to_df), pd.DataFrame), pd.DataFrame
assert_type(
df.groupby("col1", group_keys=True).apply(sample_to_df), pd.DataFrame
),
pd.DataFrame,
)


Expand Down
16 changes: 1 addition & 15 deletions tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,28 +109,14 @@ def test_read_stata_df():
check(assert_type(read_stata(path), pd.DataFrame), pd.DataFrame)


def test_read_stata_iterator_positional():
with ensure_clean() as path:
str_path = str(path)
DF.to_stata(str_path)
check(
assert_type(
read_stata(
str_path, False, False, None, False, False, None, False, 2, True
),
StataReader,
),
StataReader,
)


def test_read_stata_iterator():
with ensure_clean() as path:
str_path = str(path)
DF.to_stata(str_path)
check(
assert_type(read_stata(str_path, iterator=True), StataReader), StataReader
)
check(assert_type(read_stata(str_path, chunksize=1), StataReader), StataReader)


def test_clipboard():
Expand Down
3 changes: 2 additions & 1 deletion tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,8 @@ def test_types_rank() -> None:
s.rank(method="min", pct=True)
with pytest.warns(FutureWarning, match="Dropping of nuisance columns"):
s.rank(method="dense", ascending=True)
s.rank(method="first", numeric_only=True)
s2 = pd.Series([1, 1, 2, 5, 6, np.nan])
s2.rank(method="first", numeric_only=True)


def test_types_mean() -> None:
Expand Down