diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 0de7a2e745531..b398a197a4bc0 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -244,13 +244,13 @@ def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other):
         necessary.
     mask : boolean ndarray
     other : scalar
-        The source value
+        The source value.
 
     Returns
     -------
     result : ndarray
-    changed : boolean
-        Set to true if the result array was upcasted
+    changed : bool
+        Set to True if the result array was upcasted.
 
     Examples
     --------
@@ -337,6 +337,21 @@ def changeit():
 
 
 def maybe_promote(dtype, fill_value=np.nan):
+    """
+    Find the minimal dtype that can hold both the given dtype and fill_value.
+
+    Parameters
+    ----------
+    dtype : np.dtype or ExtensionDtype
+    fill_value : scalar, default np.nan
+
+    Returns
+    -------
+    dtype
+        Upcasted from dtype argument if necessary.
+    fill_value
+        Upcasted from fill_value argument if necessary.
+    """
     if not is_scalar(fill_value) and not is_object_dtype(dtype):
         # with object dtype there is nothing to promote, and the user can
         # pass pretty much any weird fill_value they like
@@ -592,11 +607,11 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False):
 
 def infer_dtype_from_array(arr, pandas_dtype: bool = False):
     """
-    Infer the dtype from a scalar or array.
+    Infer the dtype from an array.
 
     Parameters
     ----------
-    arr : scalar or array
+    arr : array
     pandas_dtype : bool, default False
         whether to infer dtype including pandas extension types.
         If False, array belongs to pandas extension types
@@ -622,7 +637,6 @@ def infer_dtype_from_array(arr, pandas_dtype: bool = False):
 
     >>> infer_dtype_from_array([1, '1'])
     (numpy.object_, [1, '1'])
-
     """
 
     if isinstance(arr, np.ndarray):
@@ -686,10 +700,12 @@ def maybe_upcast(values, fill_value=np.nan, dtype=None, copy: bool = False):
 
     Parameters
     ----------
-    values : the ndarray that we want to maybe upcast
+    values : ndarray or ExtensionArray
+        The array that we want to maybe upcast.
    fill_value : what we want to fill with
    dtype : if None, then use the dtype of the values, else coerce to this type
-    copy : if True always make a copy even if no upcast is required
+    copy : bool, default True
+        If True always make a copy even if no upcast is required.
     """
     if not is_scalar(fill_value) and not is_object_dtype(values.dtype):
         # We allow arbitrary fill values for object dtype
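Since `maybe_upcast_putmask` and `maybe_promote` are pandas-internal helpers, a concrete call makes the new docstrings easier to verify. A minimal sketch of the documented contract, assuming the private `pandas.core.dtypes.cast` module at the version this diff targets (internal APIs like these move between releases):

```python
import numpy as np
from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask

# int64 cannot hold NaN, so both the dtype and the fill_value are upcast.
dtype, fill_value = maybe_promote(np.dtype("int64"), fill_value=np.nan)
print(dtype, fill_value)  # float64 nan

# maybe_upcast_putmask applies the same promotion before writing through
# the mask; the second return value reports whether an upcast happened.
result, changed = maybe_upcast_putmask(
    np.arange(3), np.array([False, True, False]), np.nan
)
print(result, changed)  # [ 0. nan  2.] True
```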
""" if not is_scalar(fill_value) and not is_object_dtype(values.dtype): # We allow arbitrary fill values for object dtype diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 55dd91a8129b5..851d353a79eb1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4354,7 +4354,7 @@ def _maybe_casted_values(index, labels=None): values = values._data if mask.any(): - values, changed = maybe_upcast_putmask(values, mask, np.nan) + values, _ = maybe_upcast_putmask(values, mask, np.nan) if issubclass(values_type, DatetimeLikeArray): values = values_type(values, dtype=values_dtype) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index fc2412ceaca0e..470c3c76afe12 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -331,11 +331,12 @@ def __new__( # extension dtype elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype): - data = np.asarray(data) if not (dtype is None or is_object_dtype(dtype)): # coerce to the provided dtype ea_cls = dtype.construct_array_type() data = ea_cls._from_sequence(data, dtype=dtype, copy=False) + else: + data = np.asarray(data, dtype=object) # coerce to the object dtype data = data.astype(object) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 77e9f0b44c0cb..8de955e5ff2cf 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -314,7 +314,7 @@ def _get_values( # promote if needed else: - values, changed = maybe_upcast_putmask(values, mask, fill_value) + values, _ = maybe_upcast_putmask(values, mask, fill_value) # return a platform independent precision dtype dtype_max = dtype diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index de34258f863d0..40bf19c60e144 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -110,7 +110,7 @@ def masked_arith_op(x, y, op): with np.errstate(all="ignore"): result[mask] = op(xrav[mask], y) - result, changed = maybe_upcast_putmask(result, ~mask, np.nan) + result, _ = maybe_upcast_putmask(result, ~mask, np.nan) result = result.reshape(x.shape) # 2D compat return result diff --git a/pandas/core/strings.py b/pandas/core/strings.py index d4d8be90402b7..24e2e674f6ae3 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -74,10 +74,12 @@ def cat_core(list_of_columns: List, sep: str): """ if sep == "": # no need to interleave sep if it is empty - return np.sum(list_of_columns, axis=0) + arr_of_cols = np.asarray(list_of_columns, dtype=object) + return np.sum(arr_of_cols, axis=0) list_with_sep = [sep] * (2 * len(list_of_columns) - 1) list_with_sep[::2] = list_of_columns - return np.sum(list_with_sep, axis=0) + arr_with_sep = np.asarray(list_with_sep) + return np.sum(arr_with_sep, axis=0) def cat_safe(list_of_columns: List, sep: str): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f4c3955bedb70..62b6211e2d82d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1314,7 +1314,7 @@ def create_table_index( optlevel : int or None, default None Optimization level, if None, pytables defaults to 6. kind : str or None, default None - Kind of index, if None, pytables defaults to "medium" + Kind of index, if None, pytables defaults to "medium". 
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index f4c3955bedb70..62b6211e2d82d 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -1314,7 +1314,7 @@ def create_table_index(
         optlevel : int or None, default None
             Optimization level, if None, pytables defaults to 6.
         kind : str or None, default None
-            Kind of index, if None, pytables defaults to "medium"
+            Kind of index, if None, pytables defaults to "medium".
 
         Raises
         ------
@@ -1741,24 +1741,24 @@ def _read_group(self, group: "Node"):
 
 
 class TableIterator:
-    """ define the iteration interface on a table
-
-    Parameters
-    ----------
+    """
+    Define the iteration interface on a table.
 
-    store : the reference store
-    s : the referred storer
-    func : the function to execute the query
-    where : the where of the query
-    nrows : the rows to iterate on
-    start : the passed start value (default is None)
-    stop : the passed stop value (default is None)
-    iterator : bool, default False
-        Whether to use the default iterator.
-    chunksize : the passed chunking value (default is 100000)
-    auto_close : boolean, automatically close the store at the end of
-        iteration, default is False
-    """
+    Parameters
+    ----------
+    store : HDFStore
+    s : the referred storer
+    func : the function to execute the query
+    where : the where of the query
+    nrows : the rows to iterate on
+    start : the passed start value (default is None)
+    stop : the passed stop value (default is None)
+    iterator : bool, default False
+        Whether to use the default iterator.
+    chunksize : the passed chunking value (default is 100000)
+    auto_close : bool, default False
+        Whether to automatically close the store at the end of iteration.
+    """
 
     chunksize: Optional[int]
     store: HDFStore
@@ -1974,7 +1974,8 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
         if values.dtype.fields is not None:
             values = values[self.cname]
 
-        values = _maybe_convert(values, self.kind, encoding, errors)
+        val_kind = _ensure_decoded(self.kind)
+        values = _maybe_convert(values, val_kind, encoding, errors)
 
         kwargs = dict()
         kwargs["name"] = _ensure_decoded(self.index_name)
@@ -2500,13 +2501,18 @@ class Fixed:
     pandas_kind: str
     obj_type: Type[Union[DataFrame, Series]]
     ndim: int
+    encoding: str
     parent: HDFStore
     group: "Node"
     errors: str
     is_table = False
 
     def __init__(
-        self, parent: HDFStore, group: "Node", encoding=None, errors: str = "strict"
+        self,
+        parent: HDFStore,
+        group: "Node",
+        encoding: str = "UTF-8",
+        errors: str = "strict",
     ):
         assert isinstance(parent, HDFStore), type(parent)
         assert _table_mod is not None  # needed for mypy
@@ -2556,10 +2562,6 @@ def copy(self):
         new_self = copy.copy(self)
         return new_self
 
-    @property
-    def storage_obj_type(self):
-        return self.obj_type
-
     @property
     def shape(self):
         return self.nrows
@@ -2584,10 +2586,6 @@ def _complevel(self) -> int:
     def _fletcher32(self) -> bool:
         return self.parent._fletcher32
 
-    @property
-    def _complib(self):
-        return self.parent._complib
-
     @property
     def attrs(self):
         return self.group._v_attrs
@@ -3314,12 +3312,12 @@ def data_orientation(self):
     def queryables(self) -> Dict[str, Any]:
         """ return a dict of the kinds allowable columns for this object """
 
+        # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
+        axis_names = {0: "index", 1: "columns"}
+
         # compute the values_axes queryables
         d1 = [(a.cname, a) for a in self.index_axes]
-        d2 = [
-            (self.storage_obj_type._AXIS_NAMES[axis], None)
-            for axis, values in self.non_index_axes
-        ]
+        d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
         d3 = [
             (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
         ]
@@ -3342,9 +3340,9 @@ def _get_metadata_path(self, key: str) -> str:
         group = self.group._v_pathname
         return f"{group}/meta/{key}/meta"
 
-    def write_metadata(self, key: str, values):
+    def write_metadata(self, key: str, values: np.ndarray):
         """
-        write out a meta data array to the key as a fixed-format Series
+        Write out a metadata array to the key as a fixed-format Series.
 
         Parameters
         ----------
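For orientation, `TableIterator` is what the public `HDFStore.select` hands back when a `chunksize` or `iterator=True` is passed. A short usage sketch (requires PyTables; the file name is illustrative):

```python
import pandas as pd

df = pd.DataFrame({"a": range(10)})
with pd.HDFStore("example.h5") as store:
    store.put("df", df, format="table")
    # A chunksize routes the read through TableIterator, which yields
    # DataFrames and (with auto_close) can close the store when done.
    for chunk in store.select("df", chunksize=4):
        print(len(chunk))  # 4, 4, 2
```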
@@ -3498,9 +3496,7 @@ def f(i, c):
 
     def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None):
         """
-        Create a pytables index on the specified columns
-        note: cannot index Time64Col() or ComplexCol currently;
-        PyTables must be >= 3.0
+        Create a pytables index on the specified columns.
 
         Parameters
         ----------
@@ -3515,12 +3511,16 @@ def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None):
         optlevel : int or None, default None
             Optimization level, if None, pytables defaults to 6.
         kind : str or None, default None
-            Kind of index, if None, pytables defaults to "medium"
+            Kind of index, if None, pytables defaults to "medium".
 
         Raises
         ------
-        raises if the node is not a table
+        TypeError if trying to create an index on a complex-type column.
 
+        Notes
+        -----
+        Cannot index Time64Col or ComplexCol.
+        PyTables must be >= 3.0.
         """
 
         if not self.infer_axes():
@@ -3964,10 +3964,10 @@ def process_filter(field, filt):
 
     def create_description(
         self,
-        complib=None,
-        complevel: Optional[int] = None,
-        fletcher32: bool = False,
-        expectedrows: Optional[int] = None,
+        complib,
+        complevel: Optional[int],
+        fletcher32: bool,
+        expectedrows: Optional[int],
     ) -> Dict[str, Any]:
         """ create the description of the table from the axes & values """
 
@@ -4216,7 +4216,13 @@ def write_data(self, chunksize: Optional[int], dropna: bool = False):
                 values=[v[start_i:end_i] for v in bvalues],
             )
 
-    def write_data_chunk(self, rows, indexes, mask, values):
+    def write_data_chunk(
+        self,
+        rows: np.ndarray,
+        indexes: List[np.ndarray],
+        mask: Optional[np.ndarray],
+        values: List[np.ndarray],
+    ):
         """
         Parameters
         ----------
@@ -4424,7 +4430,6 @@ class AppendableSeriesTable(AppendableFrameTable):
     table_type = "appendable_series"
     ndim = 2
     obj_type = Series
-    storage_obj_type = DataFrame
 
     @property
     def is_transposed(self) -> bool:
@@ -4446,7 +4451,7 @@ def read(
         columns=None,
         start: Optional[int] = None,
         stop: Optional[int] = None,
-    ):
+    ) -> Series:
 
         is_multi_index = self.is_multi_index
         if columns is not None and is_multi_index:
@@ -4589,7 +4594,7 @@ def read(
         return df
 
 
-def _reindex_axis(obj, axis: int, labels: Index, other=None):
+def _reindex_axis(obj: DataFrame, axis: int, labels: Index, other=None) -> DataFrame:
     ax = obj._get_axis(axis)
     labels = ensure_index(labels)
@@ -4652,7 +4657,7 @@ def _set_tz(
     return values
 
 
-def _convert_index(name: str, index: Index, encoding=None, errors="strict"):
+def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
     assert isinstance(name, str)
 
     index_name = index.name
@@ -4711,7 +4716,9 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"):
     return IndexCol(name, converted, kind, atom, index_name=index_name,)
 
 
-def _unconvert_index(data, kind: str, encoding=None, errors="strict"):
+def _unconvert_index(
+    data, kind: str, encoding: str, errors: str
+) -> Union[np.ndarray, Index]:
     index: Union[Index, np.ndarray]
 
     if kind == "datetime64":
@@ -4802,61 +4809,59 @@ def _maybe_convert_for_string_atom(
     return data_converted
 
 
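`Table.create_index`, whose docstring is reworked above, is normally reached through the public `HDFStore.create_table_index`. A hedged usage sketch (requires PyTables; the file name is illustrative):

```python
import pandas as pd

df = pd.DataFrame({"a": range(3)})
with pd.HDFStore("example.h5") as store:
    # Only table-format nodes can be indexed.
    store.put("df", df, format="table")
    # kind=None would let PyTables default to a "medium" index, per the docstring.
    store.create_table_index("df", optlevel=6, kind="medium")
```

With this patch, asking for an index on a complex-typed column raises `TypeError` up front instead of failing deeper inside PyTables.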
-def _convert_string_array(data, encoding, errors, itemsize=None):
+def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
     """
-    we take a string-like that is object dtype and coerce to a fixed size
-    string type
+    Take a string-like that is object dtype and coerce to a fixed size string type.
 
     Parameters
     ----------
-    data : a numpy array of object dtype
-    encoding : None or string-encoding
-    errors : handler for encoding errors
-    itemsize : integer, optional, defaults to the max length of the strings
+    data : np.ndarray[object]
+    encoding : str
+    errors : str
+        Handler for encoding errors.
 
     Returns
     -------
-    data in a fixed-length string dtype, encoded to bytes if needed
+    np.ndarray[fixed-length-string]
     """
 
     # encode if needed
-    if encoding is not None and len(data):
+    if len(data):
         data = (
             Series(data.ravel()).str.encode(encoding, errors).values.reshape(data.shape)
         )
 
     # create the sized dtype
-    if itemsize is None:
-        ensured = ensure_object(data.ravel())
-        itemsize = max(1, libwriters.max_len_string_array(ensured))
+    ensured = ensure_object(data.ravel())
+    itemsize = max(1, libwriters.max_len_string_array(ensured))
 
     data = np.asarray(data, dtype=f"S{itemsize}")
     return data
 
 
-def _unconvert_string_array(data, nan_rep=None, encoding=None, errors="strict"):
+def _unconvert_string_array(
+    data: np.ndarray, nan_rep, encoding: str, errors: str
+) -> np.ndarray:
     """
-    inverse of _convert_string_array
+    Inverse of _convert_string_array.
 
     Parameters
     ----------
-    data : fixed length string dtyped array
-    nan_rep : the storage repr of NaN, optional
-    encoding : the encoding of the data, optional
-    errors : handler for encoding errors, default 'strict'
+    data : np.ndarray[fixed-length-string]
+    nan_rep : the storage repr of NaN
+    encoding : str
+    errors : str
+        Handler for encoding errors.
 
     Returns
     -------
-    an object array of the decoded data
-
+    np.ndarray[object]
+        Decoded data.
     """
     shape = data.shape
     data = np.asarray(data.ravel(), dtype=object)
 
-    # guard against a None encoding (because of a legacy
-    # where the passed encoding is actually None)
-    encoding = _ensure_encoding(encoding)
-    if encoding is not None and len(data):
+    if len(data):
         itemsize = libwriters.max_len_string_array(ensure_object(data))
         dtype = f"U{itemsize}"
@@ -4873,8 +4878,8 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None, errors="strict"):
     return data.reshape(shape)
 
 
-def _maybe_convert(values: np.ndarray, val_kind, encoding: str, errors: str):
-    val_kind = _ensure_decoded(val_kind)
+def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
+    assert isinstance(val_kind, str), type(val_kind)
     if _need_convert(val_kind):
         conv = _get_converter(val_kind, encoding, errors)
         values = conv(values)
@@ -4885,12 +4890,14 @@ def _get_converter(kind: str, encoding: str, errors: str):
     if kind == "datetime64":
         return lambda x: np.asarray(x, dtype="M8[ns]")
     elif kind == "string":
-        return lambda x: _unconvert_string_array(x, encoding=encoding, errors=errors)
+        return lambda x: _unconvert_string_array(
+            x, nan_rep=None, encoding=encoding, errors=errors
+        )
     else:  # pragma: no cover
         raise ValueError(f"invalid kind {kind}")
 
 
-def _need_convert(kind) -> bool:
+def _need_convert(kind: str) -> bool:
     if kind in ("datetime64", "string"):
         return True
     return False
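With `encoding` now a required `str` (and `Fixed` defaulting to `"UTF-8"`), the encode/size/decode path always runs. A standalone sketch of the round-trip that `_convert_string_array` and `_unconvert_string_array` implement, using plain NumPy rather than the pandas internals:

```python
import numpy as np

data = np.array(["foo", "barbaz"], dtype=object)

# _convert_string_array: encode, then size a fixed-width bytes dtype
# from the longest encoded element (at least 1).
encoded = [s.encode("utf-8") for s in data.ravel()]
itemsize = max(1, max(len(b) for b in encoded))
fixed = np.asarray(encoded, dtype=f"S{itemsize}").reshape(data.shape)

# _unconvert_string_array: decode back to an object array of str.
decoded = np.array([b.decode("utf-8") for b in fixed.ravel()], dtype=object)
decoded = decoded.reshape(fixed.shape)
print(fixed.dtype, decoded)  # |S6 ['foo' 'barbaz']
```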
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 3fb4e291d7d91..47ac4113d90ce 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -449,7 +449,7 @@ def test_scientific_no_exponent(self):
     def test_convert_non_hashable(self):
         # GH13324
         # make sure that we are handing non-hashables
-        arr = np.array([[10.0, 2], 1.0, "apple"])
+        arr = np.array([[10.0, 2], 1.0, "apple"], dtype=object)
         result = lib.maybe_convert_numeric(arr, set(), False, True)
         tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan]))
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 90ff7a585a323..ad6e0c963e730 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -1725,9 +1725,9 @@ def test_constructor_with_datetimes(self):
         expected = DataFrame({"a": i.to_series().reset_index(drop=True), "b": i_no_tz})
         tm.assert_frame_equal(df, expected)
 
-    def test_constructor_datetimes_with_nulls(self):
-        # gh-15869, GH#11220
-        for arr in [
+    @pytest.mark.parametrize(
+        "arr",
+        [
             np.array([None, None, None, None, datetime.now(), None]),
             np.array([None, None, datetime.now(), None]),
             [[np.datetime64("NaT")], [None]],
@@ -1736,10 +1736,13 @@ def test_constructor_datetimes_with_nulls(self):
             [[None], [pd.NaT]],
             [[pd.NaT], [np.datetime64("NaT")]],
             [[pd.NaT], [None]],
-        ]:
-            result = DataFrame(arr).dtypes
-            expected = Series([np.dtype("datetime64[ns]")])
-            tm.assert_series_equal(result, expected)
+        ],
+    )
+    def test_constructor_datetimes_with_nulls(self, arr):
+        # gh-15869, GH#11220
+        result = DataFrame(arr).dtypes
+        expected = Series([np.dtype("datetime64[ns]")])
+        tm.assert_series_equal(result, expected)
 
     def test_constructor_for_list_with_dtypes(self):
         # test list of lists/ndarrays
diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py
index 8dcc77fc2fbc1..bb150c5825650 100644
--- a/pandas/tests/io/json/test_ujson.py
+++ b/pandas/tests/io/json/test_ujson.py
@@ -761,8 +761,9 @@ def test_array_list(self):
             ["a", "b"],
             {"key": "val"},
         ]
-        arr = np.array(arr_list)
-        tm.assert_numpy_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr)
+        arr = np.array(arr_list, dtype=object)
+        result = np.array(ujson.decode(ujson.encode(arr)), dtype=object)
+        tm.assert_numpy_array_equal(result, arr)
 
     def test_array_float(self):
         dtype = np.float32
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index ae16d0fa651d2..5d9ee691edbf0 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -120,7 +120,8 @@ def test_append_index(self):
                     (1.2, tz.localize(datetime.datetime(2011, 1, 2)), "B"),
                     (1.3, tz.localize(datetime.datetime(2011, 1, 3)), "C"),
                 ]
-                + expected_tuples
+                + expected_tuples,
+                dtype=object,
             ),
             None,
         )
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index 584550d562b0d..2e651c0b35deb 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -2857,7 +2857,8 @@ def test_partition_index(self):
         result = values.str.partition("_", expand=False)
         exp = Index(
             np.array(
-                [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None]
+                [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None],
+                dtype=object,
             )
         )
         tm.assert_index_equal(result, exp)
@@ -2866,7 +2867,8 @@ def test_partition_index(self):
         result = values.str.rpartition("_", expand=False)
         exp = Index(
             np.array(
-                [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None]
+                [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None],
+                dtype=object,
             )
         )
         tm.assert_index_equal(result, exp)
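The test changes share one theme: NumPy deprecated, and newer releases reject, building an array from ragged or mixed-type nested sequences without an explicit `dtype=object`. A quick demonstration of why the tests now spell it out:

```python
import numpy as np

# Ragged input: on newer NumPy this raises ValueError without dtype=object.
arr = np.array([[10.0, 2], 1.0, "apple"], dtype=object)
print(arr.shape, arr.dtype)  # (3,) object
```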