pandas-dev
diff --git a/‎.github/workflows/ci.yml
Lines changed: 9 additions & 0 deletions b/‎.github/workflows/ci.yml
Lines changed: 9 additions & 0 deletions
diff --git a/‎.github/workflows/database.yml
Lines changed: 8 additions & 0 deletions b/‎.github/workflows/database.yml
Lines changed: 8 additions & 0 deletions
diff --git a/‎.pre-commit-config.yaml
Lines changed: 6 additions & 0 deletions b/‎.pre-commit-config.yaml
Lines changed: 6 additions & 0 deletions
diff --git a/‎MANIFEST.in
Lines changed: 7 additions & 5 deletions b/‎MANIFEST.in
Lines changed: 7 additions & 5 deletions
diff --git a/‎asv_bench/benchmarks/arithmetic.py
Lines changed: 25 additions & 9 deletions b/‎asv_bench/benchmarks/arithmetic.py
Lines changed: 25 additions & 9 deletions
diff --git a/‎asv_bench/benchmarks/ctors.py
Lines changed: 7 additions & 1 deletion b/‎asv_bench/benchmarks/ctors.py
Lines changed: 7 additions & 1 deletion
diff --git a/‎asv_bench/benchmarks/dtypes.py
Lines changed: 4 additions & 1 deletion b/‎asv_bench/benchmarks/dtypes.py
Lines changed: 4 additions & 1 deletion
diff --git a/‎asv_bench/benchmarks/frame_ctor.py
Lines changed: 11 additions & 2 deletions b/‎asv_bench/benchmarks/frame_ctor.py
Lines changed: 11 additions & 2 deletions
diff --git a/‎asv_bench/benchmarks/frame_methods.py
Lines changed: 9 additions & 1 deletion b/‎asv_bench/benchmarks/frame_methods.py
Lines changed: 9 additions & 1 deletion
diff --git a/‎asv_bench/benchmarks/gil.py
Lines changed: 7 additions & 1 deletion b/‎asv_bench/benchmarks/gil.py
Lines changed: 7 additions & 1 deletion
diff --git a/‎asv_bench/benchmarks/groupby.py
Lines changed: 23 additions & 0 deletions b/‎asv_bench/benchmarks/groupby.py
Lines changed: 23 additions & 0 deletions
diff --git a/‎asv_bench/benchmarks/inference.py
Lines changed: 9 additions & 3 deletions b/‎asv_bench/benchmarks/inference.py
Lines changed: 9 additions & 3 deletions
diff --git a/‎asv_bench/benchmarks/io/csv.py
Lines changed: 16 additions & 4 deletions b/‎asv_bench/benchmarks/io/csv.py
Lines changed: 16 additions & 4 deletions
diff --git a/‎asv_bench/benchmarks/io/excel.py
Lines changed: 11 additions & 2 deletions b/‎asv_bench/benchmarks/io/excel.py
Lines changed: 11 additions & 2 deletions
diff --git a/‎asv_bench/benchmarks/io/hdf.py
Lines changed: 11 additions & 3 deletions b/‎asv_bench/benchmarks/io/hdf.py
Lines changed: 11 additions & 3 deletions
diff --git a/‎asv_bench/benchmarks/io/json.py
Lines changed: 12 additions & 3 deletions b/‎asv_bench/benchmarks/io/json.py
Lines changed: 12 additions & 3 deletions
diff --git a/‎asv_bench/benchmarks/io/pickle.py
Lines changed: 10 additions & 3 deletions b/‎asv_bench/benchmarks/io/pickle.py
Lines changed: 10 additions & 3 deletions
diff --git a/‎asv_bench/benchmarks/io/sql.py
Lines changed: 6 additions & 1 deletion b/‎asv_bench/benchmarks/io/sql.py
Lines changed: 6 additions & 1 deletion
@@ -153,3 +153,12 @@ jobs:
       run: |
         source activate pandas-dev
         pytest pandas/tests/frame/methods --array-manager
+        pytest pandas/tests/arithmetic/ --array-manager
+
+        # indexing subset (temporary since other tests don't pass yet)
+        pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_boolean --array-manager
+        pytest pandas/tests/frame/indexing/test_where.py --array-manager
+        pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_multi_index --array-manager
+        pytest pandas/tests/frame/indexing/test_setitem.py::TestDataFrameSetItem::test_setitem_listlike_indexer_duplicate_columns --array-manager
+        pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_astype_assignment_with_dups --array-manager
+        pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_frame_setitem_multi_column --array-manager
@@ -170,3 +170,11 @@ jobs:
 
     - name: Print skipped tests
       run: python ci/print_skipped.py
+
+    - name: Upload coverage to Codecov
+      uses: codecov/codecov-action@v1
+      with:
+        files: /tmp/test_coverage.xml
+        flags: unittests
+        name: codecov-pandas
+        fail_ci_if_error: true
@@ -180,6 +180,12 @@ repos:
         language: pygrep
         types: [python]
         files: ^pandas/tests/
+    -   id: title-capitalization
+        name: Validate correct capitalization among titles in documentation
+        entry: python scripts/validate_rst_title_capitalization.py
+        language: python
+        types: [rst]
+        files: ^doc/source/(development|reference)/
 -   repo: https://github.com/asottile/yesqa
     rev: v1.2.2
     hooks:
 
@@ -1,9 +1,4 @@
-include MANIFEST.in
-include LICENSE
 include RELEASE.md
-include README.md
-include setup.py
-include pyproject.toml
 
 graft doc
 prune doc/build
@@ -16,10 +11,12 @@ global-exclude *.bz2
 global-exclude *.csv
 global-exclude *.dta
 global-exclude *.feather
+global-exclude *.tar
 global-exclude *.gz
 global-exclude *.h5
 global-exclude *.html
 global-exclude *.json
+global-exclude *.jsonl
 global-exclude *.pickle
 global-exclude *.png
 global-exclude *.pyc
@@ -40,6 +37,11 @@ global-exclude .DS_Store
 global-exclude .git*
 global-exclude \#*
 
+# GH 39321
+# csv_dir_path fixture checks the existence of the directory
+# exclude the whole directory to avoid running related tests in sdist
+prune pandas/tests/io/parser/data
+
 include versioneer.py
 include pandas/_version.py
 include pandas/io/formats/templates/*.tpl
@@ -4,7 +4,13 @@
 import numpy as np
 
 import pandas as pd
-from pandas import DataFrame, Series, Timestamp, date_range, to_timedelta
+from pandas import (
+    DataFrame,
+    Series,
+    Timestamp,
+    date_range,
+    to_timedelta,
+)
 import pandas._testing as tm
 from pandas.core.algorithms import checked_add_with_arr
 
@@ -110,16 +116,26 @@ class FrameWithFrameWide:
             operator.add,
             operator.floordiv,
             operator.gt,
-        ]
+        ],
+        [
+            # (n_rows, n_columns)
+            (1_000_000, 10),
+            (100_000, 100),
+            (10_000, 1000),
+            (1000, 10_000),
+        ],
     ]
-    param_names = ["op"]
+    param_names = ["op", "shape"]
 
-    def setup(self, op):
+    def setup(self, op, shape):
         # we choose dtypes so as to make the blocks
         #  a) not perfectly match between right and left
         #  b) appreciably bigger than single columns
-        n_cols = 2000
-        n_rows = 500
+        n_rows, n_cols = shape
+
+        if op is operator.floordiv:
+            # floordiv is much slower than the other operations -> use less data
+            n_rows = n_rows // 10
 
         # construct dataframe with 2 blocks
         arr1 = np.random.randn(n_rows, n_cols // 2).astype("f8")
@@ -131,7 +147,7 @@ def setup(self, op):
         df._consolidate_inplace()
 
         # TODO: GH#33198 the setting here shoudlnt need two steps
-        arr1 = np.random.randn(n_rows, n_cols // 4).astype("f8")
+        arr1 = np.random.randn(n_rows, max(n_cols // 4, 3)).astype("f8")
         arr2 = np.random.randn(n_rows, n_cols // 2).astype("i8")
         arr3 = np.random.randn(n_rows, n_cols // 4).astype("f8")
         df2 = pd.concat(
@@ -145,11 +161,11 @@ def setup(self, op):
         self.left = df
         self.right = df2
 
-    def time_op_different_blocks(self, op):
+    def time_op_different_blocks(self, op, shape):
         # blocks (and dtypes) are not aligned
         op(self.left, self.right)
 
-    def time_op_same_blocks(self, op):
+    def time_op_same_blocks(self, op, shape):
         # blocks (and dtypes) are aligned
         op(self.left, self.left)
 
 
@@ -1,6 +1,12 @@
 import numpy as np
 
-from pandas import DatetimeIndex, Index, MultiIndex, Series, Timestamp
+from pandas import (
+    DatetimeIndex,
+    Index,
+    MultiIndex,
+    Series,
+    Timestamp,
+)
 
 from .pandas_vb_common import tm
 
 
@@ -5,7 +5,10 @@
 import pandas as pd
 from pandas import DataFrame
 import pandas._testing as tm
-from pandas.api.types import is_extension_array_dtype, pandas_dtype
+from pandas.api.types import (
+    is_extension_array_dtype,
+    pandas_dtype,
+)
 
 from .pandas_vb_common import (
     datetime_dtypes,
 
@@ -1,12 +1,21 @@
 import numpy as np
 
 import pandas as pd
-from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range
+from pandas import (
+    DataFrame,
+    MultiIndex,
+    Series,
+    Timestamp,
+    date_range,
+)
 
 from .pandas_vb_common import tm
 
 try:
-    from pandas.tseries.offsets import Hour, Nano
+    from pandas.tseries.offsets import (
+        Hour,
+        Nano,
+    )
 except ImportError:
     # For compatibility with older versions
     from pandas.core.datetools import *  # noqa
 
@@ -3,7 +3,15 @@
 
 import numpy as np
 
-from pandas import DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range
+from pandas import (
+    DataFrame,
+    MultiIndex,
+    NaT,
+    Series,
+    date_range,
+    isnull,
+    period_range,
+)
 
 from .pandas_vb_common import tm
 
 
@@ -1,6 +1,12 @@
 import numpy as np
 
-from pandas import DataFrame, Series, date_range, factorize, read_csv
+from pandas import (
+    DataFrame,
+    Series,
+    date_range,
+    factorize,
+    read_csv,
+)
 from pandas.core.algorithms import take_nd
 
 from .pandas_vb_common import tm
 
@@ -461,6 +461,29 @@ def time_dtype_as_field(self, dtype, method, application):
         self.as_field_method()
 
 
+class GroupByCythonAgg:
+    """
+    Benchmarks specifically targeting our cython aggregation algorithms
+    (using a big enough dataframe with simple key, so a large part of the
+    time is actually spent in the grouped aggregation).
+    """
+
+    param_names = ["dtype", "method"]
+    params = [
+        ["float64"],
+        ["sum", "prod", "min", "max", "mean", "median", "var", "first", "last"],
+    ]
+
+    def setup(self, dtype, method):
+        N = 1_000_000
+        df = DataFrame(np.random.randn(N, 10), columns=list("abcdefghij"))
+        df["key"] = np.random.randint(0, 100, size=N)
+        self.df = df
+
+    def time_frame_agg(self, dtype, method):
+        self.df.groupby("key").agg(method)
+
+
 class RankWithTies:
     # GH 21237
     param_names = ["dtype", "tie_method"]
 
@@ -1,8 +1,14 @@
 import numpy as np
 
-from pandas import Series, to_numeric
-
-from .pandas_vb_common import lib, tm
+from pandas import (
+    Series,
+    to_numeric,
+)
+
+from .pandas_vb_common import (
+    lib,
+    tm,
+)
 
 
 class ToNumeric:
 
@@ -1,12 +1,24 @@
-from io import BytesIO, StringIO
+from io import (
+    BytesIO,
+    StringIO,
+)
 import random
 import string
 
 import numpy as np
 
-from pandas import Categorical, DataFrame, date_range, read_csv, to_datetime
-
-from ..pandas_vb_common import BaseIO, tm
+from pandas import (
+    Categorical,
+    DataFrame,
+    date_range,
+    read_csv,
+    to_datetime,
+)
+
+from ..pandas_vb_common import (
+    BaseIO,
+    tm,
+)
 
 
 class ToCSV(BaseIO):
 
@@ -2,10 +2,19 @@
 
 import numpy as np
 from odf.opendocument import OpenDocumentSpreadsheet
-from odf.table import Table, TableCell, TableRow
+from odf.table import (
+    Table,
+    TableCell,
+    TableRow,
+)
 from odf.text import P
 
-from pandas import DataFrame, ExcelWriter, date_range, read_excel
+from pandas import (
+    DataFrame,
+    ExcelWriter,
+    date_range,
+    read_excel,
+)
 
 from ..pandas_vb_common import tm
 
 
@@ -1,8 +1,16 @@
 import numpy as np
 
-from pandas import DataFrame, HDFStore, date_range, read_hdf
-
-from ..pandas_vb_common import BaseIO, tm
+from pandas import (
+    DataFrame,
+    HDFStore,
+    date_range,
+    read_hdf,
+)
+
+from ..pandas_vb_common import (
+    BaseIO,
+    tm,
+)
 
 
 class HDFStoreDataFrame(BaseIO):
 
@@ -2,9 +2,18 @@
 
 import numpy as np
 
-from pandas import DataFrame, concat, date_range, read_json, timedelta_range
-
-from ..pandas_vb_common import BaseIO, tm
+from pandas import (
+    DataFrame,
+    concat,
+    date_range,
+    read_json,
+    timedelta_range,
+)
+
+from ..pandas_vb_common import (
+    BaseIO,
+    tm,
+)
 
 
 class ReadJSON(BaseIO):
 
@@ -1,8 +1,15 @@
 import numpy as np
 
-from pandas import DataFrame, date_range, read_pickle
-
-from ..pandas_vb_common import BaseIO, tm
+from pandas import (
+    DataFrame,
+    date_range,
+    read_pickle,
+)
+
+from ..pandas_vb_common import (
+    BaseIO,
+    tm,
+)
 
 
 class Pickle(BaseIO):
 
@@ -3,7 +3,12 @@
 import numpy as np
 from sqlalchemy import create_engine
 
-from pandas import DataFrame, date_range, read_sql_query, read_sql_table
+from pandas import (
+    DataFrame,
+    date_range,
+    read_sql_query,
+    read_sql_table,
+)
 
 from ..pandas_vb_common import tm