Skip to content

Commit dd443a8

Browse files
authored
Merge branch 'master' into ndarray_tolerance
2 parents fb0a647 + 3c964a4 commit dd443a8

File tree

258 files changed

+11851
-6340
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

258 files changed

+11851
-6340
lines changed

.travis.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,8 @@ script:
121121
- ci/script_single.sh
122122
- ci/script_multi.sh
123123
- ci/lint.sh
124+
- echo "checking imports"
125+
- source activate pandas && python ci/check_imports.py
124126
- echo "script done"
125127

126128
after_success:

MANIFEST.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
include MANIFEST.in
22
include LICENSE
33
include RELEASE.md
4-
include README.rst
4+
include README.md
55
include setup.py
66
include pyproject.toml
77

asv_bench/benchmarks/index_object.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,3 +199,23 @@ def time_datetime_level_values_full(self):
199199

200200
def time_datetime_level_values_sliced(self):
201201
self.mi[:10].values
202+
203+
204+
class Range(object):
205+
goal_time = 0.2
206+
207+
def setup(self):
208+
self.idx_inc = RangeIndex(start=0, stop=10**7, step=3)
209+
self.idx_dec = RangeIndex(start=10**7, stop=-1, step=-3)
210+
211+
def time_max(self):
212+
self.idx_inc.max()
213+
214+
def time_max_trivial(self):
215+
self.idx_dec.max()
216+
217+
def time_min(self):
218+
self.idx_dec.min()
219+
220+
def time_min_trivial(self):
221+
self.idx_inc.min()

asv_bench/benchmarks/io_bench.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
from .pandas_vb_common import *
23
from pandas import concat, Timestamp, compat
34
try:
@@ -192,3 +193,32 @@ def time_read_nrows(self, compression, engine):
192193
ext = ".bz2"
193194
pd.read_csv(self.big_fname + ext, nrows=10,
194195
compression=compression, engine=engine)
196+
197+
198+
class read_json_lines(object):
199+
goal_time = 0.2
200+
fname = "__test__.json"
201+
202+
def setup(self):
203+
self.N = 100000
204+
self.C = 5
205+
self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]))
206+
self.df.to_json(self.fname,orient="records",lines=True)
207+
208+
def teardown(self):
209+
try:
210+
os.remove(self.fname)
211+
except:
212+
pass
213+
214+
def time_read_json_lines(self):
215+
pd.read_json(self.fname, lines=True)
216+
217+
def time_read_json_lines_chunk(self):
218+
pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N//4))
219+
220+
def peakmem_read_json_lines(self):
221+
pd.read_json(self.fname, lines=True)
222+
223+
def peakmem_read_json_lines_chunk(self):
224+
pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N//4))

asv_bench/benchmarks/sparse.py

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
from itertools import repeat
1+
import itertools
22

33
from .pandas_vb_common import *
44
import scipy.sparse
5-
from pandas import SparseSeries, SparseDataFrame
5+
from pandas import SparseSeries, SparseDataFrame, SparseArray
66

77

88
class sparse_series_to_frame(object):
@@ -23,6 +23,69 @@ def time_sparse_series_to_frame(self):
2323
SparseDataFrame(self.series)
2424

2525

26+
class sparse_array_constructor(object):
27+
goal_time = 0.2
28+
29+
def setup(self):
30+
np.random.seed(1)
31+
self.int64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=0, dtype=np.int64)
32+
self.int64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=0, dtype=np.int64)
33+
34+
self.float64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=np.nan, dtype=np.float64)
35+
self.float64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=np.nan, dtype=np.float64)
36+
37+
self.object_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=np.nan)
38+
self.object_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=np.nan)
39+
40+
self.object_non_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=0)
41+
self.object_non_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=0)
42+
43+
def make_numeric_array(self, length, dense_size, fill_value, dtype):
44+
arr = np.array([fill_value] * length, dtype=dtype)
45+
indexer = np.unique(np.random.randint(0, length, dense_size))
46+
arr[indexer] = np.random.randint(0, 100, len(indexer))
47+
return (arr, fill_value, dtype)
48+
49+
def make_object_array(self, length, dense_size, fill_value):
50+
elems = np.array(['a', 0.0, False, 1, 2], dtype=np.object)
51+
arr = np.array([fill_value] * length, dtype=np.object)
52+
indexer = np.unique(np.random.randint(0, length, dense_size))
53+
arr[indexer] = np.random.choice(elems, len(indexer))
54+
return (arr, fill_value, np.object)
55+
56+
def time_sparse_array_constructor_int64_10percent(self):
57+
arr, fill_value, dtype = self.int64_10percent
58+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
59+
60+
def time_sparse_array_constructor_int64_1percent(self):
61+
arr, fill_value, dtype = self.int64_1percent
62+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
63+
64+
def time_sparse_array_constructor_float64_10percent(self):
65+
arr, fill_value, dtype = self.float64_10percent
66+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
67+
68+
def time_sparse_array_constructor_float64_1percent(self):
69+
arr, fill_value, dtype = self.float64_1percent
70+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
71+
72+
def time_sparse_array_constructor_object_nan_fill_value_10percent(self):
73+
arr, fill_value, dtype = self.object_nan_fill_value_10percent
74+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
75+
76+
def time_sparse_array_constructor_object_nan_fill_value_1percent(self):
77+
arr, fill_value, dtype = self.object_nan_fill_value_1percent
78+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
79+
80+
def time_sparse_array_constructor_object_non_nan_fill_value_10percent(self):
81+
arr, fill_value, dtype = self.object_non_nan_fill_value_10percent
82+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
83+
84+
def time_sparse_array_constructor_object_non_nan_fill_value_1percent(self):
85+
arr, fill_value, dtype = self.object_non_nan_fill_value_1percent
86+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
87+
88+
2689
class sparse_frame_constructor(object):
2790
goal_time = 0.2
2891

@@ -33,7 +96,7 @@ def time_sparse_from_scipy(self):
3396
SparseDataFrame(scipy.sparse.rand(1000, 1000, 0.005))
3497

3598
def time_sparse_from_dict(self):
36-
SparseDataFrame(dict(zip(range(1000), repeat([0]))))
99+
SparseDataFrame(dict(zip(range(1000), itertools.repeat([0]))))
37100

38101

39102
class sparse_series_from_coo(object):

asv_bench/benchmarks/timeseries.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def setup(self):
5656
self.no_freq = self.rng7[:50000].append(self.rng7[50002:])
5757
self.d_freq = self.rng7[:50000].append(self.rng7[50000:])
5858

59-
self.rng8 = date_range(start='1/1/1700', freq='B', periods=100000)
59+
self.rng8 = date_range(start='1/1/1700', freq='B', periods=75000)
6060
self.b_freq = self.rng8[:50000].append(self.rng8[50000:])
6161

6262
def time_add_timedelta(self):

asv_bench/benchmarks/timestamp.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,9 @@ def time_replace_across_dst(self):
8181

8282
def time_replace_None(self):
8383
self.ts_tz.replace(tzinfo=None)
84+
85+
def time_to_pydatetime(self):
86+
self.ts.to_pydatetime()
87+
88+
def time_to_pydatetime_tz(self):
89+
self.ts_tz.to_pydatetime()

ci/check_imports.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""
2+
Check that certain modules are not loaded by `import pandas`
3+
"""
4+
import sys
5+
6+
blacklist = {
7+
'bs4',
8+
'html5lib',
9+
'ipython',
10+
'jinja2',
11+
'lxml',
12+
'matplotlib',
13+
'numexpr',
14+
'openpyxl',
15+
'py',
16+
'pytest',
17+
's3fs',
18+
'scipy',
19+
'tables',
20+
'xlrd',
21+
'xlsxwriter',
22+
'xlwt',
23+
}
24+
25+
26+
def main():
27+
import pandas # noqa
28+
29+
modules = set(x.split('.')[0] for x in sys.modules)
30+
imported = modules & blacklist
31+
if modules & blacklist:
32+
sys.exit("Imported {}".format(imported))
33+
34+
35+
if __name__ == '__main__':
36+
main()

ci/requirements-2.7.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ matplotlib
88
openpyxl=1.6.2
99
xlrd=0.9.2
1010
sqlalchemy=0.9.6
11-
lxml=3.2.1
11+
lxml
1212
scipy
1313
xlsxwriter=0.5.2
1414
s3fs

ci/requirements-2.7_LOCALE.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@ xlrd=0.9.2
88
bottleneck=1.0.0
99
matplotlib=1.4.3
1010
sqlalchemy=0.8.1
11-
lxml=3.2.1
11+
lxml
1212
scipy

ci/requirements-2.7_SLOW.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,4 @@ s3fs
1616
psycopg2
1717
pymysql
1818
html5lib
19-
beautiful-soup
19+
beautifulsoup4

ci/requirements-2.7_WIN.run

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ matplotlib
88
openpyxl
99
xlrd
1010
sqlalchemy
11-
lxml=3.2.1
11+
lxml
1212
scipy
1313
xlsxwriter
1414
s3fs
1515
bottleneck
1616
html5lib
17-
beautiful-soup
17+
beautifulsoup4
1818
jinja2=2.8

ci/requirements-3.6_NUMPY_DEV.build

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
11
python=3.6*
22
pytz
3-
cython

ci/requirements-3.6_NUMPY_DEV.build.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,7 @@ pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy
1414
# install dateutil from master
1515
pip install -U git+git://github.com/dateutil/dateutil.git
1616

17+
# cython via pip
18+
pip install cython
19+
1720
true

ci/requirements_all.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ xlrd
1313
xlwt
1414
html5lib
1515
patsy
16-
beautiful-soup
16+
beautifulsoup4
1717
numpy
1818
cython
1919
scipy

doc/source/10min.rst

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
np.random.seed(123456)
1212
np.set_printoptions(precision=4, suppress=True)
1313
import matplotlib
14-
matplotlib.style.use('ggplot')
14+
# matplotlib.style.use('default')
1515
pd.options.display.max_rows = 15
1616
1717
#### portions of this were borrowed from the
@@ -95,17 +95,7 @@ will be completed:
9595
df2.append df2.combine_first
9696
df2.apply df2.compound
9797
df2.applymap df2.consolidate
98-
df2.as_blocks df2.convert_objects
99-
df2.asfreq df2.copy
100-
df2.as_matrix df2.corr
101-
df2.astype df2.corrwith
102-
df2.at df2.count
103-
df2.at_time df2.cov
104-
df2.axes df2.cummax
105-
df2.B df2.cummin
106-
df2.between_time df2.cumprod
107-
df2.bfill df2.cumsum
108-
df2.blocks df2.D
98+
df2.D
10999

110100
As you can see, the columns ``A``, ``B``, ``C``, and ``D`` are automatically
111101
tab completed. ``E`` is there as well; the rest of the attributes have been

doc/source/advanced.rst

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -638,9 +638,11 @@ and allows efficient indexing and storage of an index with a large number of dup
638638

639639
.. ipython:: python
640640
641+
from pandas.api.types import CategoricalDtype
642+
641643
df = pd.DataFrame({'A': np.arange(6),
642644
'B': list('aabbca')})
643-
df['B'] = df['B'].astype('category', categories=list('cab'))
645+
df['B'] = df['B'].astype(CategoricalDtype(list('cab')))
644646
df
645647
df.dtypes
646648
df.B.cat.categories
@@ -831,12 +833,21 @@ Of course if you need integer based selection, then use ``iloc``
831833
IntervalIndex
832834
~~~~~~~~~~~~~
833835
836+
:class:`IntervalIndex` together with its own dtype, ``interval`` as well as the
837+
:class:`Interval` scalar type, allow first-class support in pandas for interval
838+
notation.
839+
840+
The ``IntervalIndex`` allows some unique indexing and is also used as a
841+
return type for the categories in :func:`cut` and :func:`qcut`.
842+
834843
.. versionadded:: 0.20.0
835844
836845
.. warning::
837846
838847
These indexing behaviors are provisional and may change in a future version of pandas.
839848
849+
An ``IntervalIndex`` can be used in ``Series`` and in ``DataFrame`` as the index.
850+
840851
.. ipython:: python
841852
842853
df = pd.DataFrame({'A': [1, 2, 3, 4]},
@@ -858,6 +869,20 @@ If you select a label *contained* within an interval, this will also select the
858869
df.loc[2.5]
859870
df.loc[[2.5, 3.5]]
860871
872+
``Interval`` and ``IntervalIndex`` are used by ``cut`` and ``qcut``:
873+
874+
.. ipython:: python
875+
876+
c = pd.cut(range(4), bins=2)
877+
c
878+
c.categories
879+
880+
Furthermore, ``IntervalIndex`` allows one to bin *other* data with these same
881+
bins, with ``NaN`` representing a missing value similar to other dtypes.
882+
883+
.. ipython:: python
884+
885+
pd.cut([0, 3, 5, 1], bins=c.categories)
861886
862887
Miscellaneous indexing FAQ
863888
--------------------------
@@ -984,7 +1009,7 @@ The different indexing operation can potentially change the dtype of a ``Series`
9841009
9851010
series1 = pd.Series([1, 2, 3])
9861011
series1.dtype
987-
res = series1[[0,4]]
1012+
res = series1.reindex([0, 4])
9881013
res.dtype
9891014
res
9901015

0 commit comments

Comments
 (0)