
Commit 3ffd35c

Merge remote-tracking branch 'upstream/master' into timedelta_rounding

2 parents: d86f26b + 0c4e611

262 files changed, with 20990 additions and 10608 deletions. Only a subset of the changed files is shown below.


.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 24 additions & 0 deletions
@@ -1,3 +1,27 @@
+Checklist for the pandas documentation sprint (ignore this if you are doing
+an unrelated PR):
+
+- [ ] PR title is "DOC: update the <your-function-or-method> docstring"
+- [ ] The validation script passes: `scripts/validate_docstrings.py <your-function-or-method>`
+- [ ] The PEP8 style check passes: `git diff upstream/master -u -- "*.py" | flake8 --diff`
+- [ ] The html version looks good: `python doc/make.py --single <your-function-or-method>`
+- [ ] It has been proofread on language by another sprint participant
+
+Please include the output of the validation script below between the "```" ticks:
+
+```
+# paste output of "scripts/validate_docstrings.py <your-function-or-method>" here
+# between the "```" (remove this comment, but keep the "```")
+
+```
+
+If the validation script still gives errors, but you think there is a good reason
+to deviate in this case (and there are certainly such cases), please state this
+explicitly.
+
+
+Checklist for other PRs (remove this part if you are doing a PR for the pandas documentation sprint):
+
 - [ ] closes #xxxx
 - [ ] tests added / passed
 - [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff`
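For orientation, the docstrings that `scripts/validate_docstrings.py` checks follow the numpydoc layout used across pandas. A minimal sketch of the shape the script expects (the `add` function is purely illustrative, not part of this commit):

```
def add(num1, num2):
    """
    Add two numbers.

    Parameters
    ----------
    num1 : int
        First number to add.
    num2 : int
        Second number to add.

    Returns
    -------
    int
        The sum of `num1` and `num2`.

    Examples
    --------
    >>> add(2, 2)
    4
    """
    return num1 + num2
```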

.gitignore

Lines changed: 4 additions & 2 deletions
@@ -61,6 +61,7 @@ dist
 .coverage
 coverage.xml
 coverage_html_report
+*.pytest_cache
 
 # OS generated files #
 ######################
@@ -88,8 +89,8 @@ scikits
 *.c
 *.cpp
 
-# Performance Testing #
-#######################
+# Unit / Performance Testing #
+##############################
 asv_bench/env/
 asv_bench/html/
 asv_bench/results/
@@ -108,3 +109,4 @@ doc/tmp.sv
 doc/source/styled.xlsx
 doc/source/templates/
 env/
+doc/source/savefig/

asv_bench/benchmarks/groupby.py

Lines changed: 58 additions & 79 deletions
@@ -14,7 +14,10 @@
 method_blacklist = {
     'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
                'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
-               'var', 'mad', 'describe', 'std'}
+               'var', 'mad', 'describe', 'std'},
+    'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
+                 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
+                 'std'}
 }
 
 
@@ -90,45 +93,6 @@ def time_series_groups(self, data, key):
         self.ser.groupby(self.ser).groups
 
 
-class FirstLast(object):
-
-    goal_time = 0.2
-
-    param_names = ['dtype']
-    params = ['float32', 'float64', 'datetime', 'object']
-
-    def setup(self, dtype):
-        N = 10**5
-        # with datetimes (GH7555)
-        if dtype == 'datetime':
-            self.df = DataFrame({'values': date_range('1/1/2011',
-                                                      periods=N,
-                                                      freq='s'),
-                                 'key': range(N)})
-        elif dtype == 'object':
-            self.df = DataFrame({'values': ['foo'] * N,
-                                 'key': range(N)})
-        else:
-            labels = np.arange(N / 10).repeat(10)
-            data = Series(np.random.randn(len(labels)), dtype=dtype)
-            data[::3] = np.nan
-            data[1::3] = np.nan
-            labels = labels.take(np.random.permutation(len(labels)))
-            self.df = DataFrame({'values': data, 'key': labels})
-
-    def time_groupby_first(self, dtype):
-        self.df.groupby('key').first()
-
-    def time_groupby_last(self, dtype):
-        self.df.groupby('key').last()
-
-    def time_groupby_nth_all(self, dtype):
-        self.df.groupby('key').nth(0, dropna='all')
-
-    def time_groupby_nth_none(self, dtype):
-        self.df.groupby('key').nth(0)
-
-
 class GroupManyLabels(object):
 
     goal_time = 0.2
@@ -149,39 +113,40 @@ class Nth(object):
 
     goal_time = 0.2
 
-    def setup_cache(self):
-        df = DataFrame(np.random.randint(1, 100, (10000, 2)))
-        df.iloc[1, 1] = np.nan
-        return df
-
-    def time_frame_nth_any(self, df):
-        df.groupby(0).nth(0, dropna='any')
-
-    def time_frame_nth(self, df):
-        df.groupby(0).nth(0)
-
+    param_names = ['dtype']
+    params = ['float32', 'float64', 'datetime', 'object']
 
-    def time_series_nth_any(self, df):
-        df[1].groupby(df[0]).nth(0, dropna='any')
+    def setup(self, dtype):
+        N = 10**5
+        # with datetimes (GH7555)
+        if dtype == 'datetime':
+            values = date_range('1/1/2011', periods=N, freq='s')
+        elif dtype == 'object':
+            values = ['foo'] * N
+        else:
+            values = np.arange(N).astype(dtype)
 
-    def time_series_nth(self, df):
-        df[1].groupby(df[0]).nth(0)
+        key = np.arange(N)
+        self.df = DataFrame({'key': key, 'values': values})
+        self.df.iloc[1, 1] = np.nan  # insert missing data
 
+    def time_frame_nth_any(self, dtype):
+        self.df.groupby('key').nth(0, dropna='any')
 
-class NthObject(object):
+    def time_groupby_nth_all(self, dtype):
+        self.df.groupby('key').nth(0, dropna='all')
 
-    goal_time = 0.2
+    def time_frame_nth(self, dtype):
+        self.df.groupby('key').nth(0)
 
-    def setup_cache(self):
-        df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g'])
-        df['obj'] = ['a'] * 5000 + ['b'] * 5000
-        return df
+    def time_series_nth_any(self, dtype):
+        self.df['values'].groupby(self.df['key']).nth(0, dropna='any')
 
-    def time_nth(self, df):
-        df.groupby('g').nth(5)
+    def time_groupby_nth_all(self, dtype):
+        self.df['values'].groupby(self.df['key']).nth(0, dropna='all')
 
-    def time_nth_last(self, df):
-        df.groupby('g').last()
+    def time_series_nth(self, dtype):
+        self.df['values'].groupby(self.df['key']).nth(0)
 
 
 class DateAttributes(object):
@@ -243,7 +208,7 @@ def time_multi_count(self, df):
         df.groupby(['key1', 'key2']).count()
 
 
-class CountInt(object):
+class CountMultiInt(object):
 
     goal_time = 0.2
 
@@ -255,18 +220,18 @@ def setup_cache(self):
                         'ints2': np.random.randint(0, 1000, size=n)})
         return df
 
-    def time_int_count(self, df):
+    def time_multi_int_count(self, df):
        df.groupby(['key1', 'key2']).count()
 
-    def time_int_nunique(self, df):
+    def time_multi_int_nunique(self, df):
         df.groupby(['key1', 'key2']).nunique()
 
 
 class AggFunctions(object):
 
     goal_time = 0.2
 
-    def setup_cache(self):
+    def setup_cache():
         N = 10**5
         fac1 = np.array(['A', 'B', 'C'], dtype='O')
         fac2 = np.array(['one', 'two'], dtype='O')
@@ -361,9 +326,6 @@ def setup(self):
     def time_multi_size(self):
         self.df.groupby(['key1', 'key2']).size()
 
-    def time_dt_size(self):
-        self.df.groupby(['dates']).size()
-
     def time_dt_timegrouper_size(self):
         with warnings.catch_warnings(record=True):
             self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
@@ -376,15 +338,16 @@ class GroupByMethods(object):
 
     goal_time = 0.2
 
-    param_names = ['dtype', 'method']
-    params = [['int', 'float', 'object'],
+    param_names = ['dtype', 'method', 'application']
+    params = [['int', 'float', 'object', 'datetime'],
              ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
               'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
               'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
               'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
-              'std', 'sum', 'tail', 'unique', 'value_counts', 'var']]
+              'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
+             ['direct', 'transformation']]
 
-    def setup(self, dtype, method):
+    def setup(self, dtype, method, application):
         if method in method_blacklist.get(dtype, {}):
             raise NotImplementedError  # skip benchmark
         ngroups = 1000
@@ -398,12 +361,28 @@ def setup(self, dtype, method):
                                np.random.random(ngroups) * 10.0])
         elif dtype == 'object':
             key = ['foo'] * size
+        elif dtype == 'datetime':
+            key = date_range('1/1/2011', periods=size, freq='s')
 
         df = DataFrame({'values': values, 'key': key})
-        self.df_groupby_method = getattr(df.groupby('key')['values'], method)
 
-    def time_method(self, dtype, method):
-        self.df_groupby_method()
+        if application == 'transform':
+            if method == 'describe':
+                raise NotImplementedError
+
+            self.as_group_method = lambda: df.groupby(
+                'key')['values'].transform(method)
+            self.as_field_method = lambda: df.groupby(
+                'values')['key'].transform(method)
+        else:
+            self.as_group_method = getattr(df.groupby('key')['values'], method)
+            self.as_field_method = getattr(df.groupby('values')['key'], method)
+
+    def time_dtype_as_group(self, dtype, method, application):
+        self.as_group_method()
+
+    def time_dtype_as_field(self, dtype, method, application):
+        self.as_field_method()
 
 
 class Float32(object):
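For readers skimming the benchmark changes, a short sketch of the `groupby(...).nth` semantics that the consolidated `Nth` class now times across dtypes (toy data rather than benchmark-scale; assumes pandas imported as `pd`):

```
import numpy as np
import pandas as pd

df = pd.DataFrame({'key': [1, 1, 2, 2],
                   'values': [np.nan, 20.0, 30.0, 40.0]})

# nth(0) picks the first row of each group positionally, NaNs included:
# group 1 -> NaN, group 2 -> 30.0
df.groupby('key').nth(0)

# dropna='any' drops rows with missing values before selecting,
# so group 1 yields 20.0 instead of NaN
df.groupby('key').nth(0, dropna='any')
```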

asv_bench/benchmarks/io/csv.py

Lines changed: 0 additions & 32 deletions
@@ -118,38 +118,6 @@ def time_read_uint64_na_values(self):
                  na_values=self.na_values)
 
 
-class S3(object):
-    # Make sure that we can read part of a file from S3 without
-    # needing to download the entire thing. Use the timeit.default_timer
-    # to measure wall time instead of CPU time -- we want to see
-    # how long it takes to download the data.
-    timer = timeit.default_timer
-    params = ([None, "gzip", "bz2"], ["python", "c"])
-    param_names = ["compression", "engine"]
-
-    def setup(self, compression, engine):
-        if compression == "bz2" and engine == "c" and PY2:
-            # The Python 2 C parser can't read bz2 from open files.
-            raise NotImplementedError
-        try:
-            import s3fs  # noqa
-        except ImportError:
-            # Skip these benchmarks if `boto` is not installed.
-            raise NotImplementedError
-
-        ext = ""
-        if compression == "gzip":
-            ext = ".gz"
-        elif compression == "bz2":
-            ext = ".bz2"
-        self.big_fname = "s3://pandas-test/large_random.csv" + ext
-
-    def time_read_csv_10_rows(self, compression, engine):
-        # Read a small number of rows from a huge (100,000 x 50) table.
-        read_csv(self.big_fname, nrows=10, compression=compression,
-                 engine=engine)
-
-
 class ReadCSVThousands(BaseIO):
 
     goal_time = 0.2
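The removed `S3` class timed partial reads of a large remote CSV. The underlying call pattern is easy to sketch locally (the filename here is illustrative; reading `s3://` URLs additionally requires the optional `s3fs` dependency):

```
import pandas as pd

# With nrows, the parser stops after the requested rows instead of
# materializing the whole (potentially huge) file.
df = pd.read_csv("large_random.csv.gz", nrows=10, compression="gzip")
```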

asv_bench/benchmarks/timeseries.py

Lines changed: 1 addition & 2 deletions
@@ -75,8 +75,7 @@ def setup(self):
                                            freq='S'))
 
     def time_infer_dst(self):
-        with warnings.catch_warnings(record=True):
-            self.index.tz_localize('US/Eastern', infer_dst=True)
+        self.index.tz_localize('US/Eastern', ambiguous='infer')
 
 
 class ResetIndex(object):
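The benchmark now passes `ambiguous='infer'`, the keyword that replaced the deprecated `infer_dst=True` in `tz_localize`. A minimal sketch of the call on times made ambiguous by a DST fall-back (the sample timestamps are illustrative):

```
import pandas as pd

# 01:00 occurs twice when US/Eastern falls back on 2011-11-06;
# ambiguous='infer' uses the ordering to assign EDT then EST.
idx = pd.DatetimeIndex(['2011-11-06 00:00', '2011-11-06 01:00',
                        '2011-11-06 01:00', '2011-11-06 02:00'])
localized = idx.tz_localize('US/Eastern', ambiguous='infer')
```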

ci/build_docs.sh

Lines changed: 5 additions & 5 deletions
@@ -10,11 +10,11 @@ echo "inside $0"
 
 git show --pretty="format:" --name-only HEAD~5.. --first-parent | grep -P "rst|txt|doc"
 
-if [ "$?" != "0" ]; then
-    echo "Skipping doc build, none were modified"
-    # nope, skip docs build
-    exit 0
-fi
+# if [ "$?" != "0" ]; then
+#     echo "Skipping doc build, none were modified"
+#     # nope, skip docs build
+#     exit 0
+# fi
 
 
 if [ "$DOC" ]; then

ci/environment-dev.yaml

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ channels:
 dependencies:
   - Cython
   - NumPy
+  - flake8
   - moto
   - pytest>=3.1
   - python-dateutil>=2.5.0

ci/requirements-2.7_COMPAT.pip

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 html5lib==1.0b2
-beautifulsoup4==4.2.0
+beautifulsoup4==4.2.1
 openpyxl
 argparse

ci/requirements-3.6_DOC.run

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ sphinx
 nbconvert
 nbformat
 notebook
-matplotlib
+matplotlib=2.1*
 seaborn
 scipy
 lxml

ci/requirements-3.6_NUMPY_DEV.build.sh

Lines changed: 1 addition & 2 deletions
@@ -12,8 +12,7 @@ PRE_WHEELS="https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf
 pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy
 
 # install dateutil from master
-# pip install -U git+git://github.com/dateutil/dateutil.git
-pip install dateutil
+pip install -U git+git://github.com/dateutil/dateutil.git
 
 # cython via pip
 pip install cython

ci/requirements-optional-conda.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-beautifulsoup4
+beautifulsoup4>=4.2.1
 blosc
 bottleneck
 fastparquet

ci/requirements-optional-pip.txt

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # This file was autogenerated by scripts/convert_deps.py
 # Do not modify directly
-beautifulsoup4
+beautifulsoup4>=4.2.1
 blosc
 bottleneck
 fastparquet

ci/requirements_dev.txt

Lines changed: 2 additions & 1 deletion
@@ -2,9 +2,10 @@
 # Do not modify directly
 Cython
 NumPy
+flake8
 moto
 pytest>=3.1
 python-dateutil>=2.5.0
 pytz
 setuptools>=3.3
-sphinx
\ No newline at end of file
+sphinx

doc/cheatsheet/Pandas_Cheat_Sheet.pdf

160 KB (−71.3 KB)
Binary file not shown.
