ENH: Synchronize io/stata with pandas master (#202)

bashtage · web-flow · commit a45600d8f78a · 2022-08-22T12:17:10.000-04:00
* ENH: Synchronize io/stata with pandas master

Synchronize and remove classes not part of the public API

* MAINT: Update frame stata io

* ENH: Add literals

Add literals for limited value inputs

* CLN: Remove non-public classes
diff --git a/pandas-stubs/_typing.pyi b/pandas-stubs/_typing.pyi
@@ -203,6 +203,23 @@ GroupByObjectNonScalar = Union[
 ]
 GroupByObject = Union[Scalar, GroupByObjectNonScalar]
 
+StataDateFormat = Literal[
+    "tc",
+    "%tc",
+    "td",
+    "%td",
+    "tw",
+    "%tw",
+    "tm",
+    "%tm",
+    "tq",
+    "%tq",
+    "th",
+    "%th",
+    "ty",
+    "%ty",
+]
+
 FillnaOptions = Literal["backfill", "bfill", "ffill", "pad"]
 ReplaceMethod = Literal["pad", "ffill", "bfill"]
 SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"]
diff --git a/pandas-stubs/core/frame.pyi b/pandas-stubs/core/frame.pyi
@@ -51,6 +51,7 @@ from pandas._typing import (
     CompressionOptions,
     Dtype,
     DtypeNp,
+    FilePath,
     FilePathOrBuffer,
     FilePathOrBytesBuffer,
     FillnaOptions,
@@ -74,9 +75,12 @@ from pandas._typing import (
     ScalarT,
     SeriesAxisType,
     SortKind,
+    StataDateFormat,
+    StorageOptions,
     StrLike,
     T as TType,
     TimestampConvention,
+    WriteBuffer,
     np_ndarray_bool,
     np_ndarray_str,
     num,
@@ -245,15 +249,19 @@ class DataFrame(NDFrame, OpsMixin):
     ) -> np.recarray: ...
     def to_stata(
         self,
-        path: FilePathOrBuffer,
-        convert_dates: dict | None = ...,
+        path: FilePath | WriteBuffer[bytes],
+        convert_dates: dict[HashableT, StataDateFormat] | None = ...,
         write_index: _bool = ...,
         byteorder: Literal["<", ">", "little", "big"] | None = ...,
-        time_stamp=...,
+        time_stamp: _dt.datetime | None = ...,
         data_label: _str | None = ...,
-        variable_labels: dict | None = ...,
-        version: int = ...,
-        convert_strl: list[_str] | None = ...,
+        variable_labels: dict[HashableT, str] | None = ...,
+        version: Literal[114, 117, 118, 119] | None = ...,
+        convert_strl: list[HashableT] | None = ...,
+        compression: CompressionOptions = ...,
+        storage_options: StorageOptions = ...,
+        *,
+        value_labels: dict[Hashable, dict[float, str]] | None = ...,
     ) -> None: ...
     def to_feather(self, path: FilePathOrBuffer, **kwargs) -> None: ...
     @overload
diff --git a/pandas-stubs/io/stata.pyi b/pandas-stubs/io/stata.pyi
@@ -3,9 +3,9 @@ import datetime
 from io import BytesIO
 from types import TracebackType
 from typing import (
-    Hashable,
     Literal,
     Sequence,
+    overload,
 )
 
 import numpy as np
@@ -18,10 +18,12 @@ from pandas._typing import (
     FilePath,
     HashableT,
     ReadBuffer,
+    StataDateFormat,
     StorageOptions,
     WriteBuffer,
 )
 
+@overload
 def read_stata(
     path: FilePath | ReadBuffer[bytes],
     convert_dates: bool = ...,
@@ -32,70 +34,47 @@ def read_stata(
     columns: list[HashableT] | None = ...,
     order_categoricals: bool = ...,
     chunksize: int | None = ...,
-    iterator: bool = ...,
+    *,
+    iterator: Literal[True],
     compression: CompressionOptions = ...,
     storage_options: StorageOptions = ...,
-) -> DataFrame | StataReader: ...
-
-stata_epoch: datetime.datetime = ...
-excessive_string_length_error: str
+) -> StataReader: ...
+@overload
+def read_stata(
+    path: FilePath | ReadBuffer[bytes],
+    convert_dates: bool,
+    convert_categoricals: bool,
+    index_col: str | None,
+    convert_missing: bool,
+    preserve_dtypes: bool,
+    columns: list[HashableT] | None,
+    order_categoricals: bool,
+    chunksize: int | None,
+    iterator: Literal[True],
+    compression: CompressionOptions = ...,
+    storage_options: StorageOptions = ...,
+) -> StataReader: ...
+@overload
+def read_stata(
+    path: FilePath | ReadBuffer[bytes],
+    convert_dates: bool = ...,
+    convert_categoricals: bool = ...,
+    index_col: str | None = ...,
+    convert_missing: bool = ...,
+    preserve_dtypes: bool = ...,
+    columns: list[HashableT] | None = ...,
+    order_categoricals: bool = ...,
+    chunksize: int | None = ...,
+    iterator: Literal[False] = ...,
+    compression: CompressionOptions = ...,
+    storage_options: StorageOptions = ...,
+) -> DataFrame: ...
 
 class PossiblePrecisionLoss(Warning): ...
-
-precision_loss_doc: str
-
 class ValueLabelTypeMismatch(Warning): ...
-
-value_label_mismatch_doc: str
-
 class InvalidColumnName(Warning): ...
 
-invalid_name_doc: str
-
-class StataValueLabel:
-    labname: Hashable = ...
-    value_labels: list[tuple[float, str]] = ...
-    text_len: int = ...
-    off: npt.NDArray[np.int32] = ...
-    val: npt.NDArray[np.int32] = ...
-    txt: list[bytes] = ...
-    n: int = ...
-    len: int = ...
-    def __init__(
-        self, catarray: pd.Series, encoding: Literal["latin-1", "utf-8"] = ...
-    ) -> None: ...
-    def generate_value_label(self, byteorder: str) -> bytes: ...
-
-class StataMissingValue:
-    MISSING_VALUES: dict[float, str] = ...
-    bases: tuple[int, int, int] = ...
-    float32_base: bytes = ...
-    increment: int = ...
-    int_value: int = ...
-    float64_base: bytes = ...
-    BASE_MISSING_VALUES: dict[str, int] = ...
-    def __init__(self, value: float) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    @property
-    def string(self) -> str: ...
-    @property
-    def value(self) -> float: ...
-    @classmethod
-    def get_base_missing_value(cls, dtype): ...
-
 class StataParser:
-    DTYPE_MAP: dict[int, np.dtype] = ...
-    DTYPE_MAP_XML: dict[int, np.dtype] = ...
-    TYPE_MAP: list[tuple[int | str, ...]] = ...
-    TYPE_MAP_XML: dict[int, str] = ...
-    VALID_RANGE: dict[
-        str,
-        tuple[int, int] | tuple[np.float32, np.float32] | tuple[np.float64, np.float64],
-    ] = ...
-    OLD_TYPE_MAPPING: dict[int, int] = ...
-    MISSING_VALUES: dict[str, int] = ...
-    NUMPY_TYPE_MAP: dict[str, str] = ...
-    RESERVED_WORDS: tuple[str, ...] = ...
     def __init__(self) -> None: ...
 
 class StataReader(StataParser, abc.Iterator):
@@ -142,70 +121,19 @@ class StataReader(StataParser, abc.Iterator):
     def value_labels(self) -> dict[str, dict[float, str]]: ...
 
 class StataWriter(StataParser):
-    type_converters: dict[str, type[np.dtype]] = ...
     def __init__(
         self,
         fname: FilePath | WriteBuffer[bytes],
         data: DataFrame,
-        convert_dates: dict[Hashable, str] | None = ...,
+        convert_dates: dict[HashableT, StataDateFormat] | None = ...,
         write_index: bool = ...,
         byteorder: str | None = ...,
         time_stamp: datetime.datetime | None = ...,
         data_label: str | None = ...,
-        variable_labels: dict[Hashable, str] | None = ...,
+        variable_labels: dict[HashableT, str] | None = ...,
         compression: CompressionOptions = ...,
         storage_options: StorageOptions = ...,
         *,
-        value_labels: dict[Hashable, dict[float, str]] | None = ...,
+        value_labels: dict[HashableT, dict[float, str]] | None = ...,
     ) -> None: ...
     def write_file(self) -> None: ...
-
-class StataStrLWriter:
-    df: DataFrame = ...
-    columns: Sequence[str] = ...
-    def __init__(
-        self,
-        df: DataFrame,
-        columns: Sequence[str],
-        version: int = ...,
-        byteorder: str | None = ...,
-    ) -> None: ...
-    def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]: ...
-    def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes: ...
-
-class StataWriter117(StataWriter):
-    def __init__(
-        self,
-        fname: FilePath | WriteBuffer[bytes],
-        data: DataFrame,
-        convert_dates: dict[Hashable, str] | None = ...,
-        write_index: bool = ...,
-        byteorder: str | None = ...,
-        time_stamp: datetime.datetime | None = ...,
-        data_label: str | None = ...,
-        variable_labels: dict[Hashable, str] | None = ...,
-        convert_strl: Sequence[Hashable] | None = ...,
-        compression: CompressionOptions = ...,
-        storage_options: StorageOptions = ...,
-        *,
-        value_labels: dict[Hashable, dict[float, str]] | None = ...,
-    ) -> None: ...
-
-class StataWriterUTF8(StataWriter117):
-    def __init__(
-        self,
-        fname: FilePath | WriteBuffer[bytes],
-        data: DataFrame,
-        convert_dates: dict[Hashable, str] | None = ...,
-        write_index: bool = ...,
-        byteorder: str | None = ...,
-        time_stamp: datetime.datetime | None = ...,
-        data_label: str | None = ...,
-        variable_labels: dict[Hashable, str] | None = ...,
-        convert_strl: Sequence[Hashable] | None = ...,
-        version: int | None = ...,
-        compression: CompressionOptions = ...,
-        storage_options: StorageOptions = ...,
-        *,
-        value_labels: dict[Hashable, dict[float, str]] | None = ...,
-    ) -> None: ...
diff --git a/tests/test_io.py b/tests/test_io.py
@@ -1,18 +1,52 @@
+import pandas as pd
 from pandas import (
     DataFrame,
     read_clipboard,
+    read_stata,
 )
+from pandas._testing import ensure_clean
 import pytest
 from typing_extensions import assert_type
 
 from tests import check
 
 from pandas.io.clipboard import PyperclipException
 from pandas.io.parsers import TextFileReader
+from pandas.io.stata import StataReader
 
 DF = DataFrame({"a": [1, 2, 3], "b": [0.0, 0.0, 0.0]})
 
 
+def test_read_stata_df():
+    with ensure_clean() as path:
+        DF.to_stata(path)
+        check(assert_type(read_stata(path), pd.DataFrame), pd.DataFrame)
+
+
+def test_read_stata_iterator_positional():
+    with ensure_clean() as path:
+        str_path = str(path)
+        DF.to_stata(str_path)
+        check(
+            assert_type(
+                read_stata(
+                    str_path, False, False, None, False, False, None, False, 2, True
+                ),
+                StataReader,
+            ),
+            StataReader,
+        )
+
+
+def test_read_stata_iterator():
+    with ensure_clean() as path:
+        str_path = str(path)
+        DF.to_stata(str_path)
+        check(
+            assert_type(read_stata(str_path, iterator=True), StataReader), StataReader
+        )
+
+
 def test_clipboard():
     try:
         DF.to_clipboard()