Skip to content

Commit d980ebe

Browse files
committed
Merge remote-tracking branch 'upstream/main' into nonattype
2 parents 5abe369 + bb9a985 commit d980ebe

File tree

5 files changed

+36
-57
lines changed

5 files changed

+36
-57
lines changed

doc/source/getting_started/intro_tutorials/03_subset_data.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ I want to work with passenger data for which the age is known.
242242
age_no_na.head()
243243
244244
The :meth:`~Series.notna` conditional function returns a ``True`` for each row where the
245-
values are not an ``Null`` value. As such, this can be combined with the
245+
values are not a ``Null`` value. As such, this can be combined with the
246246
selection brackets ``[]`` to filter the data table.
247247

248248
.. raw:: html

pandas/_libs/groupby.pyx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1238,6 +1238,7 @@ def group_nth(
12381238
if nobs[i, j] < min_count:
12391239
if uses_mask:
12401240
result_mask[i, j] = True
1241+
out[i, j] = 0
12411242
elif iu_64_floating_obj_t is int64_t:
12421243
# TODO: only if datetimelike?
12431244
out[i, j] = NPY_NAT

pandas/core/groupby/ops.py

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,7 @@ def _reconstruct_ea_result(
367367
"""
368368
Construct an ExtensionArray result from an ndarray result.
369369
"""
370+
dtype: BaseMaskedDtype | StringDtype
370371

371372
if isinstance(values.dtype, StringDtype):
372373
dtype = values.dtype
@@ -375,19 +376,17 @@ def _reconstruct_ea_result(
375376

376377
elif isinstance(values.dtype, BaseMaskedDtype):
377378
new_dtype = self._get_result_dtype(values.dtype.numpy_dtype)
379+
dtype = BaseMaskedDtype.from_numpy_dtype(new_dtype)
378380
# error: Incompatible types in assignment (expression has type
379-
# "BaseMaskedDtype", variable has type "StringDtype")
380-
dtype = BaseMaskedDtype.from_numpy_dtype( # type: ignore[assignment]
381-
new_dtype
382-
)
383-
cls = dtype.construct_array_type()
381+
# "Type[BaseMaskedArray]", variable has type "Type[BaseStringArray]")
382+
cls = dtype.construct_array_type() # type: ignore[assignment]
384383
return cls._from_sequence(res_values, dtype=dtype)
385384

386-
elif needs_i8_conversion(values.dtype):
387-
assert res_values.dtype.kind != "f" # just to be on the safe side
388-
i8values = res_values.view("i8")
389-
# error: Too many arguments for "ExtensionArray"
390-
return type(values)(i8values, dtype=values.dtype) # type: ignore[call-arg]
385+
elif isinstance(values, (DatetimeArray, TimedeltaArray, PeriodArray)):
386+
# In to_cython_values we took a view as M8[ns]
387+
assert res_values.dtype == "M8[ns]"
388+
res_values = res_values.view(values._ndarray.dtype)
389+
return values._from_backing_data(res_values)
391390

392391
raise NotImplementedError
393392

@@ -425,12 +424,8 @@ def _masked_ea_wrap_cython_operation(
425424
**kwargs,
426425
)
427426

428-
new_dtype = self._get_result_dtype(orig_values.dtype.numpy_dtype)
429-
dtype = BaseMaskedDtype.from_numpy_dtype(new_dtype)
430-
# TODO: avoid cast as res_values *should* already have the right
431-
# dtype; last attempt ran into trouble on 32bit linux build
432-
res_values = res_values.astype(dtype.type, copy=False)
433-
427+
# res_values should already have the correct dtype, we just need to
428+
# wrap in a MaskedArray
434429
return orig_values._maybe_mask_result(res_values, result_mask)
435430

436431
@final

pandas/core/internals/concat.py

Lines changed: 16 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,8 @@ def concatenate_managers(
212212
for placement, join_units in concat_plan:
213213
unit = join_units[0]
214214
blk = unit.block
215+
# Assertion disabled for performance
216+
# assert len(join_units) == len(mgrs_indexers)
215217

216218
if len(join_units) == 1:
217219
values = blk.values
@@ -329,27 +331,20 @@ def _get_mgr_concatenation_plan(mgr: BlockManager):
329331
plan : list of (BlockPlacement, JoinUnit) tuples
330332
331333
"""
332-
# Calculate post-reindex shape , save for item axis which will be separate
333-
# for each block anyway.
334-
mgr_shape_list = list(mgr.shape)
335-
mgr_shape = tuple(mgr_shape_list)
336334

337335
if mgr.is_single_block:
338336
blk = mgr.blocks[0]
339-
return [(blk.mgr_locs, JoinUnit(blk, mgr_shape))]
337+
return [(blk.mgr_locs, JoinUnit(blk))]
340338

341339
blknos = mgr.blknos
342340
blklocs = mgr.blklocs
343341

344342
plan = []
345343
for blkno, placements in libinternals.get_blkno_placements(blknos, group=False):
346344

347-
assert placements.is_slice_like
348-
assert blkno != -1
349-
350-
shape_list = list(mgr_shape)
351-
shape_list[0] = len(placements)
352-
shape = tuple(shape_list)
345+
# Assertions disabled for performance; these should always hold
346+
# assert placements.is_slice_like
347+
# assert blkno != -1
353348

354349
blk = mgr.blocks[blkno]
355350
ax0_blk_indexer = blklocs[placements.indexer]
@@ -379,19 +374,16 @@ def _get_mgr_concatenation_plan(mgr: BlockManager):
379374

380375
# Assertions disabled for performance
381376
# assert blk._mgr_locs.as_slice == placements.as_slice
382-
# assert blk.shape[0] == shape[0]
383-
unit = JoinUnit(blk, shape)
377+
unit = JoinUnit(blk)
384378

385379
plan.append((placements, unit))
386380

387381
return plan
388382

389383

390384
class JoinUnit:
391-
def __init__(self, block: Block, shape: Shape):
392-
# Passing shape explicitly is required for cases when block is None.
385+
def __init__(self, block: Block):
393386
self.block = block
394-
self.shape = shape
395387

396388
def __repr__(self) -> str:
397389
return f"{type(self).__name__}({repr(self.block)})"
@@ -404,22 +396,11 @@ def is_na(self) -> bool:
404396
return False
405397

406398
def get_reindexed_values(self, empty_dtype: DtypeObj) -> ArrayLike:
407-
values: ArrayLike
408-
409399
if self.is_na:
410-
return make_na_array(empty_dtype, self.shape)
400+
return make_na_array(empty_dtype, self.block.shape)
411401

412402
else:
413-
414-
if not self.block._can_consolidate:
415-
# preserve these for validation in concat_compat
416-
return self.block.values
417-
418-
# No dtype upcasting is done here, it will be performed during
419-
# concatenation itself.
420-
values = self.block.values
421-
422-
return values
403+
return self.block.values
423404

424405

425406
def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike:
@@ -558,6 +539,9 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
558539
first = join_units[0].block
559540
if first.dtype.kind == "V":
560541
return False
542+
elif len(join_units) == 1:
543+
# only use this path when there is something to concatenate
544+
return False
561545
return (
562546
# exclude cases where a) ju.block is None or b) we have e.g. Int64+int64
563547
all(type(ju.block) is type(first) for ju in join_units)
@@ -570,13 +554,8 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
570554
or ju.block.dtype.kind in ["b", "i", "u"]
571555
for ju in join_units
572556
)
573-
and
574-
# no blocks that would get missing values (can lead to type upcasts)
575-
# unless we're an extension dtype.
576-
all(not ju.is_na or ju.block.is_extension for ju in join_units)
577-
and
578-
# only use this path when there is something to concatenate
579-
len(join_units) > 1
557+
# this also precludes any blocks with dtype.kind == "V", since
558+
# we excluded that case for `first` above.
580559
)
581560

582561

@@ -598,10 +577,7 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit:
598577
extra_block = join_unit.block.getitem_block(slice(length, None))
599578
join_unit.block = join_unit.block.getitem_block(slice(length))
600579

601-
extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
602-
join_unit.shape = (length,) + join_unit.shape[1:]
603-
604-
return JoinUnit(block=extra_block, shape=extra_shape)
580+
return JoinUnit(block=extra_block)
605581

606582

607583
def _combine_concat_plans(plans):

pandas/tests/frame/methods/test_dtypes.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,3 +131,10 @@ def test_dtypes_timedeltas(self):
131131
index=list("ABCD"),
132132
)
133133
tm.assert_series_equal(result, expected)
134+
135+
def test_frame_apply_np_array_return_type(self):
136+
# GH 35517
137+
df = DataFrame([["foo"]])
138+
result = df.apply(lambda col: np.array("bar"))
139+
expected = Series(["bar"])
140+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)