From f2be8618e73f71133535a6a06e165c40bdf6242c Mon Sep 17 00:00:00 2001
From: Gianluca Rossi
Date: Sat, 8 Aug 2015 22:14:34 +0100
Subject: [PATCH 01/12] Add ability to 'read_sql_table' to read views and
 implement unit test to check behaviour

---
 pandas/io/sql.py            |  2 +-
 pandas/io/tests/test_sql.py | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 8eefe4ba98876..b587ec128c016 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -337,7 +337,7 @@ def read_sql_table(table_name, con, schema=None, index_col=None,
     from sqlalchemy.schema import MetaData
     meta = MetaData(con, schema=schema)
     try:
-        meta.reflect(only=[table_name])
+        meta.reflect(only=[table_name], views=True)
     except sqlalchemy.exc.InvalidRequestError:
         raise ValueError("Table %s not found" % table_name)

diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py
index 859c6d3250121..434f8c4b71e85 100644
--- a/pandas/io/tests/test_sql.py
+++ b/pandas/io/tests/test_sql.py
@@ -161,6 +161,20 @@
         SELECT * FROM iris WHERE
         "Name"=%(name)s AND "SepalLength"=%(length)s
         """
+    },
+    'create_view': {
+        'sqlite': """
+            CREATE VIEW iris_view AS
+            SELECT * FROM iris;
+            """,
+        'mysql': """
+            CREATE VIEW iris_view AS
+            SELECT * FROM iris;
+            """,
+        'postgresql': """
+            CREATE VIEW iris_view AS
+            SELECT * FROM iris;
+            """
     }
 }

@@ -244,6 +258,10 @@ def _load_iris_data(self):
             for row in r:
                 self._get_exec().execute(ins, row)

+    def _load_iris_view(self):
+        self.drop_table('iris_view')
+        self._get_exec().execute(SQL_STRINGS['create_view'][self.flavor])
+
     def _check_iris_loaded_frame(self, iris_frame):
         pytype = iris_frame.dtypes[0].type
         row = iris_frame.iloc[0]
@@ -482,6 +500,7 @@ class _TestSQLApi(PandasSQLTest):
     def setUp(self):
         self.conn = self.connect()
         self._load_iris_data()
+        self._load_iris_view()
         self._load_test1_data()
         self._load_test2_data()
         self._load_test3_data()
@@ -492,6 +511,11 @@ def test_read_sql_iris(self):
             "SELECT * FROM iris", self.conn)
         self._check_iris_loaded_frame(iris_frame)

+    def test_read_sql_view(self):
+        iris_frame = sql.read_sql_query(
+            "SELECT * FROM iris_view", self.conn)
+        self._check_iris_loaded_frame(iris_frame)
+
     def test_legacy_read_frame(self):
         with tm.assert_produces_warning(FutureWarning):
             iris_frame = sql.read_frame(
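The patch above passes ``views=True`` through to SQLAlchemy's ``MetaData.reflect``, so ``read_sql_table`` resolves views the same way it resolves tables. A minimal sketch of the resulting behaviour, assuming an in-memory SQLite engine and SQLAlchemy's pre-2.0 ``engine.execute`` API (names here are illustrative, not part of the patch):

.. code-block:: python

   import pandas as pd
   from sqlalchemy import create_engine

   engine = create_engine("sqlite:///:memory:")

   # seed a table, then expose it through a view
   pd.DataFrame({"SepalLength": [5.1, 4.9],
                 "Name": ["setosa", "setosa"]}).to_sql("iris", engine, index=False)
   engine.execute("CREATE VIEW iris_view AS SELECT * FROM iris")

   # before this patch, reflecting a view raised ValueError("Table iris_view not found");
   # with views=True the view is reflected like an ordinary table
   df = pd.read_sql_table("iris_view", engine)
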
From 69a28c764b61669f89a9ba9f9717c78d77ee983f Mon Sep 17 00:00:00 2001
From: Gianluca Rossi
Date: Sun, 2 Aug 2015 12:24:53 +0100
Subject: [PATCH 02/12] BUG: Fix bug preventing the inheritance of Series'
 names when only a few of them exist, and new column names are not provided
 via the 'keys' argument. Closes #10698

---
 doc/source/whatsnew/v0.17.0.txt  | 25 +++++++++++++++++++++++++
 pandas/tools/merge.py            | 12 +++++++++---
 pandas/tools/tests/test_merge.py | 13 +++++++++++--
 3 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 70d616ca72c1b..843cc32a8ab9b 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -137,6 +137,7 @@ Other enhancements

 - ``.as_blocks`` will now take a ``copy`` optional argument to return a copy of the data, default is to copy (no change in behavior from prior versions), (:issue:`9607`)
 - ``regex`` argument to ``DataFrame.filter`` now handles numeric column names instead of raising ``ValueError`` (:issue:`10384`).
+- ``pd.read_stata`` will now read Stata 118 type files. (:issue:`9882`)
 - ``pd.merge`` will now allow duplicate column names if they are not merged upon (:issue:`10639`).
@@ -152,6 +153,30 @@ Other enhancements

   s.drop_duplicates(keep=False)

+- ``concat`` will now inherit the existing series names (even when some are missing), if new ones are not provided through the ``keys`` argument (:issue:`10698`).
+
+  Previous Behavior:
+
+  .. code-block:: python
+
+     In [1]: foo = pd.Series([1,2], name='foo')
+     In [2]: bar = pd.Series([1,2])
+     In [3]: baz = pd.Series([4,5])
+     In [4]: pd.concat([foo, bar, baz], 1)
+     Out[4]:
+        0  1  2
+     0  1  1  4
+     1  2  2  5
+
+  New Behavior:
+
+  .. ipython:: python
+
+     foo = pd.Series([1,2], name='foo')
+     bar = pd.Series([1,2])
+     baz = pd.Series([4,5])
+     pd.concat([foo, bar, baz], 1)
+
 .. _whatsnew_0170.api:

 .. _whatsnew_0170.api_breaking:

diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
index 430828a3db31b..d04cc8c4a7754 100644
--- a/pandas/tools/merge.py
+++ b/pandas/tools/merge.py
@@ -16,7 +16,7 @@
 from pandas.core.internals import (items_overlap_with_suffix,
                                    concatenate_block_managers)
 from pandas.util.decorators import Appender, Substitution
-from pandas.core.common import ABCSeries
+from pandas.core.common import ABCSeries, isnull
 from pandas.io.parsers import TextFileReader
 import pandas.core.common as com

@@ -896,8 +896,14 @@ def get_result(self):
                 data = dict(zip(range(len(self.objs)), self.objs))
                 index, columns = self.new_axes
                 tmpdf = DataFrame(data, index=index)
-                if columns is not None:
-                    tmpdf.columns = columns
+                # check whether 'columns' already stores valid column names (set via
+                # the 'keys' argument of 'concat'); if not, use the series names
+                if columns.equals(Index(np.arange(len(self.objs)))):
+                    columns = np.array([data[i].name for i in range(len(data))], dtype='object')
+                    indexer = isnull(columns)
+                    if indexer.any():
+                        columns[indexer] = np.arange(len(indexer[indexer]))
+                tmpdf.columns = columns
                 return tmpdf.__finalize__(self, method='concat')

     # combine block managers

diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
index 8b1457e7fd490..3be283eff1bb4 100644
--- a/pandas/tools/tests/test_merge.py
+++ b/pandas/tools/tests/test_merge.py
@@ -1797,6 +1797,15 @@ def test_concat_dataframe_keys_bug(self):
         self.assertEqual(list(result.columns), [('t1', 'value'),
                                                 ('t2', 'value')])

+    def test_concat_series_partial_columns_names(self):
+        foo = pd.Series([1,2], name='foo')
+        bar = pd.Series([1,2])
+        baz = pd.Series([4,5])
+
+        result = pd.concat([foo, bar, baz], 1)
+        expected = DataFrame({'foo' : [1,2], 0 : [1,2], 1 : [4,5]}, columns=['foo',0,1])
+        tm.assert_frame_equal(result, expected)
+
     def test_concat_dict(self):
         frames = {'foo': DataFrame(np.random.randn(4, 3)),
                   'bar': DataFrame(np.random.randn(4, 3)),
@@ -2330,7 +2339,7 @@ def test_concat_series_axis1(self):
         s2.name = None

         result = concat([s, s2], axis=1)
-        self.assertTrue(np.array_equal(result.columns, lrange(2)))
+        self.assertTrue(np.array_equal(result.columns, Index(['A', 0], dtype='object')))

         # must reindex, #2603
         s = Series(randn(3), index=['c', 'a', 'b'], name='A')
@@ -2431,7 +2440,7 @@ def test_concat_series_axis1_same_names_ignore_index(self):
         s2 = Series(randn(len(dates)), index=dates, name='value')

         result = concat([s1, s2], axis=1, ignore_index=True)
-        self.assertTrue(np.array_equal(result.columns, [0, 1]))
+        self.assertTrue(np.array_equal(result.columns, ['value', 'value']))

     def test_concat_iterables(self):
         from collections import deque, Iterable
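For quick reference, a minimal sketch of the behaviour the patch above establishes, mirroring the whatsnew example; the expected columns come from the new ``test_concat_series_partial_columns_names`` test:

.. code-block:: python

   import pandas as pd

   foo = pd.Series([1, 2], name='foo')
   bar = pd.Series([1, 2])
   baz = pd.Series([4, 5])

   # named series keep their names; unnamed series fall back to their position
   result = pd.concat([foo, bar, baz], axis=1)
   print(list(result.columns))  # ['foo', 0, 1]
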
From 9493f002bbbb77b82dead6a5ffdd2fcd2b769c1c Mon Sep 17 00:00:00 2001
From: sinhrks
Date: Sat, 1 Aug 2015 23:06:52 +0900
Subject: [PATCH 03/12] BUG: Categorical doesn't show tzinfo properly

---
 doc/source/whatsnew/v0.17.0.txt  |   3 +
 pandas/core/categorical.py       |  33 +-
 pandas/core/format.py            |  37 +-
 pandas/core/index.py             |   9 +
 pandas/tests/test_categorical.py | 576 +++++++++++++++++++++++++++++++
 pandas/tests/test_index.py       |   9 +
 pandas/tseries/period.py         |   7 +
 7 files changed, 653 insertions(+), 21 deletions(-)

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 770ad8a268f11..86bb78f4066ab 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -606,6 +606,9 @@ Bug Fixes

 - Bug in ``read_stata`` when reading a file with a different order set in ``columns`` (:issue:`10757`)

+- Bug in ``Categorical`` may not be represented properly when categories contain ``tz`` or ``Period`` data (:issue:`10713`)
+- Bug in ``Categorical.__iter__`` may not return the correct ``datetime`` and ``Period`` values (:issue:`10713`)
+
 - Reading "famafrench" data via ``DataReader`` results in HTTP 404 error because of the website url is changed (:issue:`10591`).

 - Bug in ``read_msgpack`` where DataFrame to decode has duplicate column names (:issue:`9618`)

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index b0d564caa5826..c9e30ea31dab8 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -12,7 +12,7 @@
 import pandas.core.common as com
 from pandas.util.decorators import cache_readonly, deprecate_kwarg

-from pandas.core.common import (CategoricalDtype, ABCSeries, ABCIndexClass, ABCPeriodIndex, ABCCategoricalIndex,
+from pandas.core.common import (CategoricalDtype, ABCSeries, ABCIndexClass, ABCCategoricalIndex,
                                 isnull, notnull, is_dtype_equal,
                                 is_categorical_dtype, is_integer_dtype, is_object_dtype,
                                 _possibly_infer_to_datetimelike, get_dtype_kinds,
@@ -1053,15 +1053,12 @@ def get_values(self):

         Returns
         -------
         values : numpy array
-            A numpy array of the same dtype as categorical.categories.dtype or dtype string if
-            periods
+            A numpy array of the same dtype as categorical.categories.dtype or
+            Index if datetime / periods
         """
-
-        # if we are a period index, return a string repr
-        if isinstance(self.categories, ABCPeriodIndex):
-            return take_1d(np.array(self.categories.to_native_types(), dtype=object),
-                           self._codes)
-
+        # if we are a datetime or period index, return Index to keep metadata
+        if com.is_datetimelike(self.categories):
+            return self.categories.take(self._codes)
         return np.array(self)

     def check_for_ordered(self, op):
@@ -1308,7 +1305,7 @@ def __len__(self):

     def __iter__(self):
         """Returns an Iterator over the values of this Categorical."""
-        return iter(np.array(self))
+        return iter(self.get_values())

     def _tidy_repr(self, max_vals=10, footer=True):
         """ a short repr displaying only max_vals and an optional (but default footer) """
@@ -1328,7 +1325,7 @@ def _repr_categories(self):
             max_categories = (10 if get_option("display.max_categories") == 0
                               else get_option("display.max_categories"))
         from pandas.core import format as fmt
-        category_strs = fmt.format_array(self.categories.get_values(), None)
+        category_strs = fmt.format_array(self.categories, None)
         if len(category_strs) > max_categories:
             num = max_categories // 2
             head = category_strs[:num]
@@ -1343,8 +1340,9 @@ def _repr_categories_info(self):
         """ Returns a string representation of the footer."""

         category_strs = self._repr_categories()
-        levheader = "Categories (%d, %s): " %
(len(self.categories), - self.categories.dtype) + dtype = getattr(self.categories, 'dtype_str', str(self.categories.dtype)) + + levheader = "Categories (%d, %s): " % (len(self.categories), dtype) width, height = get_terminal_size() max_width = get_option("display.width") or width if com.in_ipython_frontend(): @@ -1352,13 +1350,14 @@ def _repr_categories_info(self): max_width = 0 levstring = "" start = True - cur_col_len = len(levheader) + cur_col_len = len(levheader) # header sep_len, sep = (3, " < ") if self.ordered else (2, ", ") + linesep = sep.rstrip() + "\n" # remove whitespace for val in category_strs: if max_width != 0 and cur_col_len + sep_len + len(val) > max_width: - levstring += "\n" + (" "* len(levheader)) - cur_col_len = len(levheader) - if not start: + levstring += linesep + (" " * (len(levheader) + 1)) + cur_col_len = len(levheader) + 1 # header + a whitespace + elif not start: levstring += sep cur_col_len += len(val) levstring += val diff --git a/pandas/core/format.py b/pandas/core/format.py index a18d0cfa6f195..4ec4375349764 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -207,7 +207,7 @@ def _get_formatted_index(self): return fmt_index, have_header def _get_formatted_values(self): - return format_array(self.tr_series.get_values(), None, + return format_array(self.tr_series.values, None, float_format=self.float_format, na_rep=self.na_rep) @@ -681,7 +681,7 @@ def _format_col(self, i): frame = self.tr_frame formatter = self._get_formatter(i) return format_array( - (frame.iloc[:, i]).get_values(), + frame.iloc[:, i].values, formatter, float_format=self.float_format, na_rep=self.na_rep, space=self.col_space ) @@ -1895,8 +1895,13 @@ def get_formatted_cells(self): def format_array(values, formatter, float_format=None, na_rep='NaN', digits=None, space=None, justify='right'): - if com.is_float_dtype(values.dtype): + + if com.is_categorical_dtype(values): + fmt_klass = CategoricalArrayFormatter + elif com.is_float_dtype(values.dtype): fmt_klass = FloatArrayFormatter + elif com.is_period_arraylike(values): + fmt_klass = PeriodArrayFormatter elif com.is_integer_dtype(values.dtype): fmt_klass = IntArrayFormatter elif com.is_datetime64_dtype(values.dtype): @@ -1963,6 +1968,8 @@ def _format(x): return '%s' % formatter(x) vals = self.values + if isinstance(vals, Index): + vals = vals.values is_float = lib.map_infer(vals, com.is_float) & notnull(vals) leading_space = is_float.any() @@ -2076,8 +2083,30 @@ def _format_strings(self): values = values.asobject is_dates_only = _is_dates_only(values) formatter = (self.formatter or _get_format_datetime64(is_dates_only, values, date_format=self.date_format)) - fmt_values = [ formatter(x) for x in self.values ] + fmt_values = [ formatter(x) for x in values ] + + return fmt_values + +class PeriodArrayFormatter(IntArrayFormatter): + + def _format_strings(self): + values = np.array(self.values.to_native_types(), dtype=object) + formatter = self.formatter or (lambda x: '%s' % x) + fmt_values = [formatter(x) for x in values] + return fmt_values + + +class CategoricalArrayFormatter(GenericArrayFormatter): + + def __init__(self, values, *args, **kwargs): + GenericArrayFormatter.__init__(self, values, *args, **kwargs) + + def _format_strings(self): + fmt_values = format_array(self.values.get_values(), self.formatter, + float_format=self.float_format, + na_rep=self.na_rep, digits=self.digits, + space=self.space, justify=self.justify) return fmt_values diff --git a/pandas/core/index.py b/pandas/core/index.py index 
a9878f493251b..a9631d7aabedd 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -276,6 +276,11 @@ def dtype(self): """ return the dtype object of the underlying data """ return self._data.dtype + @cache_readonly + def dtype_str(self): + """ return the dtype str of the underlying data """ + return str(self.dtype) + @property def values(self): """ return the underlying data as an ndarray """ @@ -2994,6 +2999,10 @@ def equals(self, other): return False + @property + def _formatter_func(self): + return self.categories._formatter_func + def _format_attrs(self): """ Return a list of tuples of the (attr,formatted_value) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index a065d03d4ad72..680b370cbca41 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1736,6 +1736,582 @@ def test_repr(self): "Categories (26, object): [a < b < c < d ... w < x < y < z]") self.assertEqual(exp,a.__unicode__()) + def test_categorical_repr(self): + c = pd.Categorical([1, 2 ,3]) + exp = """[1, 2, 3] +Categories (3, int64): [1, 2, 3]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical([1, 2 ,3, 1, 2 ,3], categories=[1, 2, 3]) + exp = """[1, 2, 3, 1, 2, 3] +Categories (3, int64): [1, 2, 3]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical([1, 2, 3, 4, 5] * 10) + exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5] +Length: 50 +Categories (5, int64): [1, 2, 3, 4, 5]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(np.arange(20)) + exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19] +Length: 20 +Categories (20, int64): [0, 1, 2, 3, ..., 16, 17, 18, 19]""" + self.assertEqual(repr(c), exp) + + def test_categorical_repr_ordered(self): + c = pd.Categorical([1, 2 ,3], ordered=True) + exp = """[1, 2, 3] +Categories (3, int64): [1 < 2 < 3]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical([1, 2 ,3, 1, 2 ,3], categories=[1, 2, 3], ordered=True) + exp = """[1, 2, 3, 1, 2, 3] +Categories (3, int64): [1 < 2 < 3]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical([1, 2, 3, 4, 5] * 10, ordered=True) + exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5] +Length: 50 +Categories (5, int64): [1 < 2 < 3 < 4 < 5]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(np.arange(20), ordered=True) + exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19] +Length: 20 +Categories (20, int64): [0 < 1 < 2 < 3 ... 
16 < 17 < 18 < 19]""" + self.assertEqual(repr(c), exp) + + def test_categorical_repr_datetime(self): + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5) + c = pd.Categorical(idx) + exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, + 2011-01-01 12:00:00, 2011-01-01 13:00:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx) + exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, + 2011-01-01 12:00:00, 2011-01-01 13:00:00]""" + self.assertEqual(repr(c), exp) + + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') + c = pd.Categorical(idx) + exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, + 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, + 2011-01-01 13:00:00-05:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx) + exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, + 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, + 2011-01-01 13:00:00-05:00]""" + self.assertEqual(repr(c), exp) + + def test_categorical_repr_datetime_ordered(self): + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5) + c = pd.Categorical(idx, ordered=True) + exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < + 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < + 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" + self.assertEqual(repr(c), exp) + + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') + c = pd.Categorical(idx, ordered=True) + exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < + 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < + 2011-01-01 13:00:00-05:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 
2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < + 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < + 2011-01-01 13:00:00-05:00]""" + self.assertEqual(repr(c), exp) + + def test_categorical_repr_period(self): + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) + c = pd.Categorical(idx) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, period): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, period): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" + self.assertEqual(repr(c), exp) + + idx = pd.period_range('2011-01', freq='M', periods=5) + c = pd.Categorical(idx) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" + self.assertEqual(repr(c), exp) + + def test_categorical_repr_period_ordered(self): + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) + c = pd.Categorical(idx, ordered=True) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" + self.assertEqual(repr(c), exp) + + idx = pd.period_range('2011-01', freq='M', periods=5) + c = pd.Categorical(idx, ordered=True) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" + self.assertEqual(repr(c), exp) + + def test_categorical_repr_timedelta(self): + idx = pd.timedelta_range('1 days', periods=5) + c = pd.Categorical(idx) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days, 2 days, 3 
days, 4 days, 5 days]""" + self.assertEqual(repr(c), exp) + + idx = pd.timedelta_range('1 hours', periods=20) + c = pd.Categorical(idx) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 20 +Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, + 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00, + 18 days 01:00:00, 19 days 01:00:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 40 +Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, + 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00, + 18 days 01:00:00, 19 days 01:00:00]""" + self.assertEqual(repr(c), exp) + + def test_categorical_repr_timedelta_ordered(self): + idx = pd.timedelta_range('1 days', periods=5) + c = pd.Categorical(idx, ordered=True) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" + self.assertEqual(repr(c), exp) + + idx = pd.timedelta_range('1 hours', periods=20) + c = pd.Categorical(idx, ordered=True) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 20 +Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < + 3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 < + 18 days 01:00:00 < 19 days 01:00:00]""" + self.assertEqual(repr(c), exp) + + c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 40 +Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < + 3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 < + 18 days 01:00:00 < 19 days 01:00:00]""" + self.assertEqual(repr(c), exp) + + def test_categorical_series_repr(self): + s = pd.Series(pd.Categorical([1, 2 ,3])) + exp = """0 1 +1 2 +2 3 +dtype: category +Categories (3, int64): [1, 2, 3]""" + self.assertEqual(repr(s), exp) + + s = pd.Series(pd.Categorical(np.arange(10))) + exp = """0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +dtype: category +Categories (10, int64): [0, 1, 2, 3, ..., 6, 7, 8, 9]""" + self.assertEqual(repr(s), exp) + + def test_categorical_series_repr_ordered(self): + s = pd.Series(pd.Categorical([1, 2 ,3], ordered=True)) + exp = """0 1 +1 2 +2 3 +dtype: category +Categories (3, int64): [1 < 2 < 3]""" + self.assertEqual(repr(s), exp) + + s = pd.Series(pd.Categorical(np.arange(10), ordered=True)) + exp = """0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +dtype: category +Categories (10, int64): [0 < 1 < 2 < 3 ... 
6 < 7 < 8 < 9]""" + self.assertEqual(repr(s), exp) + + def test_categorical_series_repr_datetime(self): + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5) + s = pd.Series(pd.Categorical(idx)) + exp = """0 2011-01-01 09:00:00 +1 2011-01-01 10:00:00 +2 2011-01-01 11:00:00 +3 2011-01-01 12:00:00 +4 2011-01-01 13:00:00 +dtype: category +Categories (5, datetime64[ns]): [2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, + 2011-01-01 12:00:00, 2011-01-01 13:00:00]""" + self.assertEqual(repr(s), exp) + + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') + s = pd.Series(pd.Categorical(idx)) + exp = """0 2011-01-01 09:00:00-05:00 +1 2011-01-01 10:00:00-05:00 +2 2011-01-01 11:00:00-05:00 +3 2011-01-01 12:00:00-05:00 +4 2011-01-01 13:00:00-05:00 +dtype: category +Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, + 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, + 2011-01-01 13:00:00-05:00]""" + self.assertEqual(repr(s), exp) + + def test_categorical_series_repr_datetime_ordered(self): + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5) + s = pd.Series(pd.Categorical(idx, ordered=True)) + exp = """0 2011-01-01 09:00:00 +1 2011-01-01 10:00:00 +2 2011-01-01 11:00:00 +3 2011-01-01 12:00:00 +4 2011-01-01 13:00:00 +dtype: category +Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < + 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" + self.assertEqual(repr(s), exp) + + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') + s = pd.Series(pd.Categorical(idx, ordered=True)) + exp = """0 2011-01-01 09:00:00-05:00 +1 2011-01-01 10:00:00-05:00 +2 2011-01-01 11:00:00-05:00 +3 2011-01-01 12:00:00-05:00 +4 2011-01-01 13:00:00-05:00 +dtype: category +Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < + 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < + 2011-01-01 13:00:00-05:00]""" + self.assertEqual(repr(s), exp) + + def test_categorical_series_repr_period(self): + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) + s = pd.Series(pd.Categorical(idx)) + exp = """0 2011-01-01 09:00 +1 2011-01-01 10:00 +2 2011-01-01 11:00 +3 2011-01-01 12:00 +4 2011-01-01 13:00 +dtype: category +Categories (5, period): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" + self.assertEqual(repr(s), exp) + + idx = pd.period_range('2011-01', freq='M', periods=5) + s = pd.Series(pd.Categorical(idx)) + exp = """0 2011-01 +1 2011-02 +2 2011-03 +3 2011-04 +4 2011-05 +dtype: category +Categories (5, period): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" + self.assertEqual(repr(s), exp) + + def test_categorical_series_repr_period_ordered(self): + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) + s = pd.Series(pd.Categorical(idx, ordered=True)) + exp = """0 2011-01-01 09:00 +1 2011-01-01 10:00 +2 2011-01-01 11:00 +3 2011-01-01 12:00 +4 2011-01-01 13:00 +dtype: category +Categories (5, period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" + self.assertEqual(repr(s), exp) + + idx = pd.period_range('2011-01', freq='M', periods=5) + s = pd.Series(pd.Categorical(idx, ordered=True)) + exp = """0 2011-01 +1 2011-02 +2 2011-03 +3 2011-04 +4 2011-05 +dtype: category +Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" + self.assertEqual(repr(s), exp) + + def 
test_categorical_series_repr_timedelta(self): + idx = pd.timedelta_range('1 days', periods=5) + s = pd.Series(pd.Categorical(idx)) + exp = """0 1 days +1 2 days +2 3 days +3 4 days +4 5 days +dtype: category +Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" + self.assertEqual(repr(s), exp) + + idx = pd.timedelta_range('1 hours', periods=10) + s = pd.Series(pd.Categorical(idx)) + exp = """0 0 days 01:00:00 +1 1 days 01:00:00 +2 2 days 01:00:00 +3 3 days 01:00:00 +4 4 days 01:00:00 +5 5 days 01:00:00 +6 6 days 01:00:00 +7 7 days 01:00:00 +8 8 days 01:00:00 +9 9 days 01:00:00 +dtype: category +Categories (10, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, + 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, + 8 days 01:00:00, 9 days 01:00:00]""" + self.assertEqual(repr(s), exp) + + def test_categorical_series_repr_timedelta_ordered(self): + idx = pd.timedelta_range('1 days', periods=5) + s = pd.Series(pd.Categorical(idx, ordered=True)) + exp = """0 1 days +1 2 days +2 3 days +3 4 days +4 5 days +dtype: category +Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" + self.assertEqual(repr(s), exp) + + idx = pd.timedelta_range('1 hours', periods=10) + s = pd.Series(pd.Categorical(idx, ordered=True)) + exp = """0 0 days 01:00:00 +1 1 days 01:00:00 +2 2 days 01:00:00 +3 3 days 01:00:00 +4 4 days 01:00:00 +5 5 days 01:00:00 +6 6 days 01:00:00 +7 7 days 01:00:00 +8 8 days 01:00:00 +9 9 days 01:00:00 +dtype: category +Categories (10, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < + 3 days 01:00:00 ... 6 days 01:00:00 < 7 days 01:00:00 < + 8 days 01:00:00 < 9 days 01:00:00]""" + self.assertEqual(repr(s), exp) + + def test_categorical_index_repr(self): + idx = pd.CategoricalIndex(pd.Categorical([1, 2 ,3])) + exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category')""" + self.assertEqual(repr(idx), exp) + + i = pd.CategoricalIndex(pd.Categorical(np.arange(10))) + exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_index_repr_ordered(self): + i = pd.CategoricalIndex(pd.Categorical([1, 2 ,3], ordered=True)) + exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + i = pd.CategoricalIndex(pd.Categorical(np.arange(10), ordered=True)) + exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_index_repr_datetime(self): + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', + '2011-01-01 11:00:00', '2011-01-01 12:00:00', + '2011-01-01 13:00:00'], + categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', + '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', + '2011-01-01 13:00:00-05:00'], + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 
10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_index_repr_datetime_ordered(self): + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5) + i = pd.CategoricalIndex(pd.Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', + '2011-01-01 11:00:00', '2011-01-01 12:00:00', + '2011-01-01 13:00:00'], + categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') + i = pd.CategoricalIndex(pd.Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', + '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', + '2011-01-01 13:00:00-05:00'], + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + i = pd.CategoricalIndex(pd.Categorical(idx.append(idx), ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', + '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', + '2011-01-01 13:00:00-05:00', '2011-01-01 09:00:00-05:00', + '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', + '2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'], + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_index_repr_period(self): + # test all length + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=1) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=2) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=3) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', + '2011-01-01 12:00', '2011-01-01 13:00'], + categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + i = pd.CategoricalIndex(pd.Categorical(idx.append(idx))) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', + '2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00', + '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', + '2011-01-01 13:00'], + 
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.period_range('2011-01', freq='M', periods=5) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_index_repr_period_ordered(self): + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) + i = pd.CategoricalIndex(pd.Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', + '2011-01-01 12:00', '2011-01-01 13:00'], + categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.period_range('2011-01', freq='M', periods=5) + i = pd.CategoricalIndex(pd.Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_index_repr_timedelta(self): + idx = pd.timedelta_range('1 days', periods=5) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.timedelta_range('1 hours', periods=10) + i = pd.CategoricalIndex(pd.Categorical(idx)) + exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00', + '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', + '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00', + '9 days 01:00:00'], + categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=False, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_index_repr_timedelta_ordered(self): + idx = pd.timedelta_range('1 days', periods=5) + i = pd.CategoricalIndex(pd.Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + idx = pd.timedelta_range('1 hours', periods=10) + i = pd.CategoricalIndex(pd.Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00', + '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', + '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00', + '9 days 01:00:00'], + categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=True, dtype='category')""" + self.assertEqual(repr(i), exp) + + def test_categorical_frame(self): + # normal DataFrame + dt = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') + p = pd.period_range('2011-01', freq='M', periods=5) + df = pd.DataFrame({'dt': dt, 'p': p}) + exp = """ dt p +0 2011-01-01 09:00:00-05:00 2011-01 +1 2011-01-01 10:00:00-05:00 2011-02 
+2 2011-01-01 11:00:00-05:00 2011-03 +3 2011-01-01 12:00:00-05:00 2011-04 +4 2011-01-01 13:00:00-05:00 2011-05""" + + df = pd.DataFrame({'dt': pd.Categorical(dt), 'p': pd.Categorical(p)}) + self.assertEqual(repr(df), exp) + def test_info(self): # make sure it works diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 15023b77694e6..c7418a5651ad7 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -132,6 +132,15 @@ def test_str(self): self.assertTrue("'foo'" in str(idx)) self.assertTrue(idx.__class__.__name__ in str(idx)) + def test_dtype_str(self): + for idx in self.indices.values(): + dtype = idx.dtype_str + self.assertIsInstance(dtype, compat.string_types) + if isinstance(idx, PeriodIndex): + self.assertEqual(dtype, 'period') + else: + self.assertEqual(dtype, str(idx.dtype)) + def test_repr_max_seq_item_setting(self): # GH10182 idx = self.create_index() diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index bb0eda8260704..e7b229e91cbc8 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -21,6 +21,8 @@ _values_from_object, ABCSeries, is_integer, is_float, is_object_dtype) from pandas import compat +from pandas.util.decorators import cache_readonly + from pandas.lib import Timestamp, Timedelta import pandas.lib as lib import pandas.tslib as tslib @@ -534,6 +536,11 @@ def shift(self, n): values[mask] = tslib.iNaT return PeriodIndex(data=values, name=self.name, freq=self.freq) + @cache_readonly + def dtype_str(self): + """ return the dtype str of the underlying data """ + return self.inferred_type + @property def inferred_type(self): # b/c data is represented as ints make sure we can't have ambiguous From b410381375c4c6d3ad5bd45c8db631d257e8deec Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 30 Mar 2014 21:18:22 +0900 Subject: [PATCH 04/12] ENH: duplicated and drop_duplicates now accept take=all kw --- doc/source/indexing.rst | 10 +- doc/source/whatsnew/v0.17.0.txt | 10 ++ pandas/core/base.py | 27 +++-- pandas/core/frame.py | 27 +++-- pandas/core/index.py | 22 ++-- pandas/core/series.py | 13 ++- pandas/hashtable.pyx | 28 ++++- pandas/lib.pyx | 26 +++-- pandas/tests/test_base.py | 69 ++++++++--- pandas/tests/test_frame.py | 197 +++++++++++++++++++++++++++++++- pandas/tests/test_index.py | 6 +- pandas/tests/test_multilevel.py | 15 +++ pandas/tests/test_series.py | 72 +++++++++--- pandas/tests/test_tseries.py | 16 ++- 14 files changed, 448 insertions(+), 90 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 9f58ee2f8b99b..251d94cbdd911 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1178,8 +1178,7 @@ takes as an argument the columns to use to identify duplicated rows. - ``drop_duplicates`` removes duplicate rows. By default, the first observed row of a duplicate set is considered unique, but -each method has a ``take_last`` parameter that indicates the last observed row -should be taken instead. +each method has a ``keep`` parameter to specify targets to be kept. .. ipython:: python @@ -1187,8 +1186,11 @@ should be taken instead. 
   df2 = pd.DataFrame({'a' : ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
                       'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
                       'c' : np.random.randn(7)})
    df2.duplicated(['a','b'])
+   df2.duplicated(['a','b'], keep='last')
+   df2.duplicated(['a','b'], keep=False)
    df2.drop_duplicates(['a','b'])
-   df2.drop_duplicates(['a','b'], take_last=True)
+   df2.drop_duplicates(['a','b'], keep='last')
+   df2.drop_duplicates(['a','b'], keep=False)

 An alternative way to drop duplicates on the index is ``.groupby(level=0)`` combined with ``first()`` or ``last()``.

@@ -1199,7 +1201,7 @@ An alternative way to drop duplicates on the index is ``.groupby(level=0)`` comb
    df3.groupby(level=0).first()

    # a bit more verbose
-   df3.reset_index().drop_duplicates(subset='b', take_last=False).set_index('b')
+   df3.reset_index().drop_duplicates(subset='b', keep='first').set_index('b')

 .. _indexing.dictionarylike:

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 86bb78f4066ab..70d616ca72c1b 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -142,6 +142,15 @@ Other enhancements
 - ``pd.merge`` will now allow duplicate column names if they are not merged upon (:issue:`10639`).
 - ``pd.pivot`` will now allow passing index as ``None`` (:issue:`3962`).

+- ``drop_duplicates`` and ``duplicated`` now accept a ``keep`` keyword to target first, last, and all duplicates. The ``take_last`` keyword is deprecated, see :ref:`deprecations <whatsnew_0170.deprecations>` (:issue:`6511`, :issue:`8505`)
+
+.. ipython:: python
+
+   s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])
+   s.drop_duplicates()
+   s.drop_duplicates(keep='last')
+   s.drop_duplicates(keep=False)
+
 .. _whatsnew_0170.api:

@@ -520,6 +529,7 @@ Deprecations
 =====================   =================================

 - ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
+- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`6511`, :issue:`8505`)

 .. _whatsnew_0170.prior_deprecations:

diff --git a/pandas/core/base.py b/pandas/core/base.py
index c3004aec60cc5..6d1c89a7a2f89 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -6,7 +6,7 @@
 from pandas.core import common as com
 import pandas.core.nanops as nanops
 import pandas.lib as lib
-from pandas.util.decorators import Appender, cache_readonly
+from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg
 from pandas.core.strings import StringMethods
 from pandas.core.common import AbstractMethodError

@@ -543,8 +543,12 @@ def _dir_deletions(self):

         Parameters
         ----------
-        take_last : boolean, default False
-            Take the last observed index in a group. Default first
+
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Drop duplicates except for the first occurrence.
+            - ``last`` : Drop duplicates except for the last occurrence.
+            - False : Drop all duplicates.
+ take_last : deprecated %(inplace)s Returns @@ -552,9 +556,10 @@ def _dir_deletions(self): deduplicated : %(klass)s """) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs) - def drop_duplicates(self, take_last=False, inplace=False): - duplicated = self.duplicated(take_last=take_last) + def drop_duplicates(self, keep='first', inplace=False): + duplicated = self.duplicated(keep=keep) result = self[np.logical_not(duplicated)] if inplace: return self._update_inplace(result) @@ -566,18 +571,22 @@ def drop_duplicates(self, take_last=False, inplace=False): Parameters ---------- - take_last : boolean, default False - Take the last observed index in a group. Default first + keep : {'first', 'last', False}, default 'first' + - ``first`` : Mark duplicates as ``True`` except for the first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last occurrence. + - False : Mark all duplicates as ``True``. + take_last : deprecated Returns ------- duplicated : %(duplicated)s """) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs) - def duplicated(self, take_last=False): + def duplicated(self, keep='first'): keys = com._ensure_object(self.values) - duplicated = lib.duplicated(keys, take_last=take_last) + duplicated = lib.duplicated(keys, keep=keep) try: return self._constructor(duplicated, index=self.index).__finalize__(self) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d8948bc82fe61..fe9c9bece1f79 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2866,8 +2866,9 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, else: return result + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @deprecate_kwarg(old_arg_name='cols', new_arg_name='subset') - def drop_duplicates(self, subset=None, take_last=False, inplace=False): + def drop_duplicates(self, subset=None, keep='first', inplace=False): """ Return DataFrame with duplicate rows removed, optionally only considering certain columns @@ -2877,8 +2878,11 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False): subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns - take_last : boolean, default False - Take the last observed row in a row. Defaults to the first row + keep : {'first', 'last', False}, default 'first' + - ``first`` : Drop duplicates except for the first occurrence. + - ``last`` : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. 
+ take_last : deprecated inplace : boolean, default False Whether to drop duplicates in place or to return a copy cols : kwargs only argument of subset [deprecated] @@ -2887,7 +2891,7 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False): ------- deduplicated : DataFrame """ - duplicated = self.duplicated(subset, take_last=take_last) + duplicated = self.duplicated(subset, keep=keep) if inplace: inds, = (-duplicated).nonzero() @@ -2896,8 +2900,9 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False): else: return self[-duplicated] + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @deprecate_kwarg(old_arg_name='cols', new_arg_name='subset') - def duplicated(self, subset=None, take_last=False): + def duplicated(self, subset=None, keep='first'): """ Return boolean Series denoting duplicate rows, optionally only considering certain columns @@ -2907,9 +2912,13 @@ def duplicated(self, subset=None, take_last=False): subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns - take_last : boolean, default False - For a set of distinct duplicate rows, flag all but the last row as - duplicated. Default is for all but the first row to be flagged + keep : {'first', 'last', False}, default 'first' + - ``first`` : Mark duplicates as ``True`` except for the + first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the + last occurrence. + - False : Mark all duplicates as ``True``. + take_last : deprecated cols : kwargs only argument of subset [deprecated] Returns @@ -2935,7 +2944,7 @@ def f(vals): labels, shape = map(list, zip( * map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) - return Series(duplicated_int64(ids, take_last), index=self.index) + return Series(duplicated_int64(ids, keep), index=self.index) #---------------------------------------------------------------------- # Sorting diff --git a/pandas/core/index.py b/pandas/core/index.py index a9631d7aabedd..febcfa37994a3 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -16,7 +16,7 @@ from pandas.lib import Timestamp, Timedelta, is_datetime_array from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate from pandas.util.decorators import (Appender, Substitution, cache_readonly, - deprecate) + deprecate, deprecate_kwarg) import pandas.core.common as com from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype, _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype, @@ -2628,13 +2628,15 @@ def drop(self, labels, errors='raise'): indexer = indexer[~mask] return self.delete(indexer) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs) - def drop_duplicates(self, take_last=False): - return super(Index, self).drop_duplicates(take_last=take_last) + def drop_duplicates(self, keep='first'): + return super(Index, self).drop_duplicates(keep=keep) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(_shared_docs['duplicated'] % _index_doc_kwargs) - def duplicated(self, take_last=False): - return super(Index, self).duplicated(take_last=take_last) + def duplicated(self, keep='first'): + return super(Index, self).duplicated(keep=keep) def _evaluate_with_timedelta_like(self, other, op, opstr): raise TypeError("can only 
perform ops with timedelta like values")

@@ -3065,10 +3067,11 @@ def _engine(self):
     def is_unique(self):
         return not self.duplicated().any()

+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
-    def duplicated(self, take_last=False):
+    def duplicated(self, keep='first'):
         from pandas.hashtable import duplicated_int64
-        return duplicated_int64(self.codes.astype('i8'), take_last)
+        return duplicated_int64(self.codes.astype('i8'), keep)

     def get_loc(self, key, method=None):
         """
@@ -4228,15 +4231,16 @@ def _has_complex_internals(self):
     def is_unique(self):
         return not self.duplicated().any()

+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
-    def duplicated(self, take_last=False):
+    def duplicated(self, keep='first'):
         from pandas.core.groupby import get_group_index
         from pandas.hashtable import duplicated_int64

         shape = map(len, self.levels)
         ids = get_group_index(self.labels, shape, sort=False, xnull=False)

-        return duplicated_int64(ids, take_last)
+        return duplicated_int64(ids, keep)

     def get_value(self, series, key):
         # somewhat broken encapsulation

diff --git a/pandas/core/series.py b/pandas/core/series.py
index 6586fa10935e6..87fde996aaa67 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -46,7 +46,7 @@
 import pandas.core.datetools as datetools
 import pandas.core.format as fmt
 import pandas.core.nanops as nanops
-from pandas.util.decorators import Appender, cache_readonly
+from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg
 import pandas.lib as lib
 import pandas.tslib as tslib

@@ -1155,14 +1155,15 @@ def mode(self):
         from pandas.core.algorithms import mode
         return mode(self)

+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)
-    def drop_duplicates(self, take_last=False, inplace=False):
-        return super(Series, self).drop_duplicates(take_last=take_last,
-                                                   inplace=inplace)
+    def drop_duplicates(self, keep='first', inplace=False):
+        return super(Series, self).drop_duplicates(keep=keep, inplace=inplace)

+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(base._shared_docs['duplicated'] % _shared_doc_kwargs)
-    def duplicated(self, take_last=False):
-        return super(Series, self).duplicated(take_last=take_last)
+    def duplicated(self, keep='first'):
+        return super(Series, self).duplicated(keep=keep)

     def idxmin(self, axis=None, out=None, skipna=True):
         """

diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
index 3b3ea9fa032f8..7dbd1b45c938f 100644
--- a/pandas/hashtable.pyx
+++ b/pandas/hashtable.pyx
@@ -1026,25 +1026,43 @@ def mode_int64(int64_t[:] values):

 @cython.wraparound(False)
 @cython.boundscheck(False)
-def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
+def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'):
     cdef:
         int ret = 0
+        int64_t value              # declared 64-bit so values are not truncated
+        Py_ssize_t k
         Py_ssize_t i, n = len(values)
         kh_int64_t * table = kh_init_int64()
         ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')

     kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))

-    with nogil:
-        if take_last:
+    if keep not in ('last', 'first', False):
+        raise ValueError('keep must be either "first", "last" or False')
+
+    if keep == 'last':
+        with nogil:
             for i from n > i >= 0:
                 kh_put_int64(table, values[i], &ret)
                 out[i] = ret == 0
-        else:
+    elif keep == 'first':
with nogil: for i from 0 <= i < n: kh_put_int64(table, values[i], &ret) out[i] = ret == 0 - + else: + with nogil: + for i from 0 <= i < n: + value = values[i] + k = kh_get_int64(table, value) + if k != table.n_buckets: + out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_int64(table, value, &ret) + table.keys[k] = value + table.vals[k] = i + out[i] = 0 kh_destroy_int64(table) return out diff --git a/pandas/lib.pyx b/pandas/lib.pyx index e839210fbbada..07f0c89535a77 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1348,35 +1348,47 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null): return result -def duplicated(ndarray[object] values, take_last=False): + +def duplicated(ndarray[object] values, object keep='first'): cdef: Py_ssize_t i, n - set seen = set() + dict seen = dict() object row n = len(values) cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8) - if take_last: + if keep == 'last': for i from n > i >= 0: row = values[i] - if row in seen: result[i] = 1 else: - seen.add(row) + seen[row] = i result[i] = 0 - else: + elif keep == 'first': for i from 0 <= i < n: row = values[i] if row in seen: result[i] = 1 else: - seen.add(row) + seen[row] = i result[i] = 0 + elif keep is False: + for i from 0 <= i < n: + row = values[i] + if row in seen: + result[i] = 1 + result[seen[row]] = 1 + else: + seen[row] = i + result[i] = 0 + else: + raise ValueError('keep must be either "first", "last" or False') return result.view(np.bool_) + def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): cdef: Py_ssize_t i, group_size, n, start diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index d47e7dbe751c7..066b359d72b5c 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -683,6 +683,10 @@ def test_factorize(self): def test_duplicated_drop_duplicates(self): # GH 4060 + + import warnings + warnings.simplefilter('always') + for original in self.objs: if isinstance(original, Index): @@ -714,15 +718,36 @@ def test_duplicated_drop_duplicates(self): self.assertTrue(duplicated.dtype == bool) tm.assert_index_equal(idx.drop_duplicates(), original) - last_base = [False] * len(idx) - last_base[3] = True - last_base[5] = True - expected = np.array(last_base) - duplicated = idx.duplicated(take_last=True) + base = [False] * len(idx) + base[3] = True + base[5] = True + expected = np.array(base) + + duplicated = idx.duplicated(keep='last') + tm.assert_numpy_array_equal(duplicated, expected) + self.assertTrue(duplicated.dtype == bool) + result = idx.drop_duplicates(keep='last') + tm.assert_index_equal(result, idx[~expected]) + + # deprecate take_last + with tm.assert_produces_warning(FutureWarning): + duplicated = idx.duplicated(take_last=True) + tm.assert_numpy_array_equal(duplicated, expected) + self.assertTrue(duplicated.dtype == bool) + with tm.assert_produces_warning(FutureWarning): + result = idx.drop_duplicates(take_last=True) + tm.assert_index_equal(result, idx[~expected]) + + base = [False] * len(original) + [True, True] + base[3] = True + base[5] = True + expected = np.array(base) + + duplicated = idx.duplicated(keep=False) tm.assert_numpy_array_equal(duplicated, expected) self.assertTrue(duplicated.dtype == bool) - tm.assert_index_equal(idx.drop_duplicates(take_last=True), - idx[~np.array(last_base)]) + result = idx.drop_duplicates(keep=False) + tm.assert_index_equal(result, idx[~expected]) with tm.assertRaisesRegexp(TypeError, "drop_duplicates\(\) got an unexpected keyword argument"): @@ -745,13 +770,29 @@ def 
test_duplicated_drop_duplicates(self): tm.assert_series_equal(s.duplicated(), expected) tm.assert_series_equal(s.drop_duplicates(), original) - last_base = [False] * len(idx) - last_base[3] = True - last_base[5] = True - expected = Series(last_base, index=idx, name='a') - tm.assert_series_equal(s.duplicated(take_last=True), expected) - tm.assert_series_equal(s.drop_duplicates(take_last=True), - s[~np.array(last_base)]) + base = [False] * len(idx) + base[3] = True + base[5] = True + expected = Series(base, index=idx, name='a') + + tm.assert_series_equal(s.duplicated(keep='last'), expected) + tm.assert_series_equal(s.drop_duplicates(keep='last'), + s[~np.array(base)]) + + # deprecate take_last + with tm.assert_produces_warning(FutureWarning): + tm.assert_series_equal(s.duplicated(take_last=True), expected) + with tm.assert_produces_warning(FutureWarning): + tm.assert_series_equal(s.drop_duplicates(take_last=True), + s[~np.array(base)]) + base = [False] * len(original) + [True, True] + base[3] = True + base[5] = True + expected = Series(base, index=idx, name='a') + + tm.assert_series_equal(s.duplicated(keep=False), expected) + tm.assert_series_equal(s.drop_duplicates(keep=False), + s[~np.array(base)]) s.drop_duplicates(inplace=True) tm.assert_series_equal(s, original) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 77ef5fecf22c9..72eea5162caa5 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -7848,7 +7848,7 @@ def test_dropna_multiple_axes(self): inp.dropna(how='all', axis=(0, 1), inplace=True) assert_frame_equal(inp, expected) - def test_drop_duplicates(self): + def test_aaa_drop_duplicates(self): df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], 'B': ['one', 'one', 'two', 'two', @@ -7861,10 +7861,21 @@ def test_drop_duplicates(self): expected = df[:2] assert_frame_equal(result, expected) - result = df.drop_duplicates('AAA', take_last=True) + result = df.drop_duplicates('AAA', keep='last') expected = df.ix[[6, 7]] assert_frame_equal(result, expected) + result = df.drop_duplicates('AAA', keep=False) + expected = df.ix[[]] + assert_frame_equal(result, expected) + self.assertEqual(len(result), 0) + + # deprecate take_last + with tm.assert_produces_warning(FutureWarning): + result = df.drop_duplicates('AAA', take_last=True) + expected = df.ix[[6, 7]] + assert_frame_equal(result, expected) + # multi column expected = df.ix[[0, 1, 2, 3]] result = df.drop_duplicates(np.array(['AAA', 'B'])) @@ -7872,6 +7883,15 @@ def test_drop_duplicates(self): result = df.drop_duplicates(['AAA', 'B']) assert_frame_equal(result, expected) + result = df.drop_duplicates(('AAA', 'B'), keep='last') + expected = df.ix[[0, 5, 6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AAA', 'B'), keep=False) + expected = df.ix[[0]] + assert_frame_equal(result, expected) + + # deprecate take_last result = df.drop_duplicates(('AAA', 'B'), take_last=True) expected = df.ix[[0, 5, 6, 7]] assert_frame_equal(result, expected) @@ -7884,10 +7904,53 @@ def test_drop_duplicates(self): expected = df2.drop_duplicates(['AAA', 'B']) assert_frame_equal(result, expected) + result = df2.drop_duplicates(keep='last') + expected = df2.drop_duplicates(['AAA', 'B'], keep='last') + assert_frame_equal(result, expected) + + result = df2.drop_duplicates(keep=False) + expected = df2.drop_duplicates(['AAA', 'B'], keep=False) + assert_frame_equal(result, expected) + + # deprecate take_last result = df2.drop_duplicates(take_last=True) expected = 
df2.drop_duplicates(['AAA', 'B'], take_last=True) assert_frame_equal(result, expected) + def test_drop_duplicates_for_take_all(self): + df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar', + 'foo', 'bar', 'qux', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('AAA') + expected = df.iloc[[0, 1, 2, 6]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep='last') + expected = df.iloc[[2, 5, 6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep=False) + expected = df.iloc[[2, 6]] + assert_frame_equal(result, expected) + + # multiple columns + result = df.drop_duplicates(['AAA', 'B']) + expected = df.iloc[[0, 1, 2, 3, 4, 6]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(['AAA', 'B'], keep='last') + expected = df.iloc[[0, 1, 2, 5, 6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(['AAA', 'B'], keep=False) + expected = df.iloc[[0, 1, 2, 6]] + assert_frame_equal(result, expected) + def test_drop_duplicates_deprecated_warning(self): df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], @@ -7914,6 +7977,14 @@ def test_drop_duplicates_deprecated_warning(self): self.assertRaises(TypeError, df.drop_duplicates, kwargs={'subset': 'AAA', 'bad_arg': True}) + # deprecate take_last + # Raises warning + with tm.assert_produces_warning(FutureWarning): + result = df.drop_duplicates(take_last=False, subset='AAA') + assert_frame_equal(result, expected) + + self.assertRaises(ValueError, df.drop_duplicates, keep='invalid_name') + def test_drop_duplicates_tuple(self): df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], @@ -7927,6 +7998,16 @@ def test_drop_duplicates_tuple(self): expected = df[:2] assert_frame_equal(result, expected) + result = df.drop_duplicates(('AA', 'AB'), keep='last') + expected = df.ix[[6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AA', 'AB'), keep=False) + expected = df.ix[[]] # empty df + self.assertEqual(len(result), 0) + assert_frame_equal(result, expected) + + # deprecate take_last result = df.drop_duplicates(('AA', 'AB'), take_last=True) expected = df.ix[[6, 7]] assert_frame_equal(result, expected) @@ -7950,6 +8031,16 @@ def test_drop_duplicates_NA(self): expected = df.ix[[0, 2, 3]] assert_frame_equal(result, expected) + result = df.drop_duplicates('A', keep='last') + expected = df.ix[[1, 6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep=False) + expected = df.ix[[]] # empty df + assert_frame_equal(result, expected) + self.assertEqual(len(result), 0) + + # deprecate take_last result = df.drop_duplicates('A', take_last=True) expected = df.ix[[1, 6, 7]] assert_frame_equal(result, expected) @@ -7959,6 +8050,15 @@ def test_drop_duplicates_NA(self): expected = df.ix[[0, 2, 3, 6]] assert_frame_equal(result, expected) + result = df.drop_duplicates(['A', 'B'], keep='last') + expected = df.ix[[1, 5, 6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(['A', 'B'], keep=False) + expected = df.ix[[6]] + assert_frame_equal(result, expected) + + # deprecate take_last result = df.drop_duplicates(['A', 'B'], take_last=True) expected = df.ix[[1, 5, 6, 7]] assert_frame_equal(result, expected) @@ -7976,6 +8076,16 @@ def test_drop_duplicates_NA(self): expected = df[:2] assert_frame_equal(result, expected) + 
result = df.drop_duplicates('C', keep='last') + expected = df.ix[[3, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep=False) + expected = df.ix[[]] # empty df + assert_frame_equal(result, expected) + self.assertEqual(len(result), 0) + + # deprecate take_last result = df.drop_duplicates('C', take_last=True) expected = df.ix[[3, 7]] assert_frame_equal(result, expected) @@ -7985,10 +8095,53 @@ def test_drop_duplicates_NA(self): expected = df.ix[[0, 1, 2, 4]] assert_frame_equal(result, expected) + result = df.drop_duplicates(['C', 'B'], keep='last') + expected = df.ix[[1, 3, 6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(['C', 'B'], keep=False) + expected = df.ix[[1]] + assert_frame_equal(result, expected) + + # deprecate take_last result = df.drop_duplicates(['C', 'B'], take_last=True) expected = df.ix[[1, 3, 6, 7]] assert_frame_equal(result, expected) + def test_drop_duplicates_NA_for_take_all(self): + # none + df = DataFrame({'A': [None, None, 'foo', 'bar', + 'foo', 'baz', 'bar', 'qux'], + 'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]}) + + # single column + result = df.drop_duplicates('A') + expected = df.iloc[[0, 2, 3, 5, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep='last') + expected = df.iloc[[1, 4, 5, 6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep=False) + expected = df.iloc[[5, 7]] + assert_frame_equal(result, expected) + + # nan + + # single column + result = df.drop_duplicates('C') + expected = df.iloc[[0, 1, 5, 6]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep='last') + expected = df.iloc[[3, 5, 6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep=False) + expected = df.iloc[[5, 6]] + assert_frame_equal(result, expected) + def test_drop_duplicates_inplace(self): orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], @@ -8004,6 +8157,20 @@ def test_drop_duplicates_inplace(self): result = df assert_frame_equal(result, expected) + df = orig.copy() + df.drop_duplicates('A', keep='last', inplace=True) + expected = orig.ix[[6, 7]] + result = df + assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates('A', keep=False, inplace=True) + expected = orig.ix[[]] + result = df + assert_frame_equal(result, expected) + self.assertEqual(len(df), 0) + + # deprecate take_last df = orig.copy() df.drop_duplicates('A', take_last=True, inplace=True) expected = orig.ix[[6, 7]] @@ -8017,6 +8184,19 @@ def test_drop_duplicates_inplace(self): result = df assert_frame_equal(result, expected) + df = orig.copy() + df.drop_duplicates(['A', 'B'], keep='last', inplace=True) + expected = orig.ix[[0, 5, 6, 7]] + result = df + assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates(['A', 'B'], keep=False, inplace=True) + expected = orig.ix[[0]] + result = df + assert_frame_equal(result, expected) + + # deprecate take_last df = orig.copy() df.drop_duplicates(['A', 'B'], take_last=True, inplace=True) expected = orig.ix[[0, 5, 6, 7]] @@ -8033,6 +8213,19 @@ def test_drop_duplicates_inplace(self): result = df2 assert_frame_equal(result, expected) + df2 = orig2.copy() + df2.drop_duplicates(keep='last', inplace=True) + expected = orig2.drop_duplicates(['A', 'B'], keep='last') + result = df2 + assert_frame_equal(result, expected) + + df2 = orig2.copy() + df2.drop_duplicates(keep=False, inplace=True) + expected = 
orig2.drop_duplicates(['A', 'B'], keep=False) + result = df2 + assert_frame_equal(result, expected) + + # deprecate take_last df2 = orig2.copy() df2.drop_duplicates(take_last=True, inplace=True) expected = orig2.drop_duplicates(['A', 'B'], take_last=True) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index c7418a5651ad7..d6e57e76d0ec9 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -4720,9 +4720,9 @@ def check(nlevels, with_nulls): labels = [np.random.choice(n, k * n) for lev in levels] mi = MultiIndex(levels=levels, labels=labels) - for take_last in [False, True]: - left = mi.duplicated(take_last=take_last) - right = pd.lib.duplicated(mi.values, take_last=take_last) + for keep in ['first', 'last', False]: + left = mi.duplicated(keep=keep) + right = pd.lib.duplicated(mi.values, keep=keep) tm.assert_numpy_array_equal(left, right) # GH5873 diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 65ba5fd036a35..fbe4eefabe02d 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2135,6 +2135,21 @@ def test_duplicated_drop_duplicates(self): expected = MultiIndex.from_arrays(([1, 2, 3, 2 ,3], [1, 1, 1, 2, 2])) tm.assert_index_equal(idx.drop_duplicates(), expected) + expected = np.array([True, False, False, False, False, False]) + duplicated = idx.duplicated(keep='last') + tm.assert_numpy_array_equal(duplicated, expected) + self.assertTrue(duplicated.dtype == bool) + expected = MultiIndex.from_arrays(([2, 3, 1, 2 ,3], [1, 1, 1, 2, 2])) + tm.assert_index_equal(idx.drop_duplicates(keep='last'), expected) + + expected = np.array([True, False, False, True, False, False]) + duplicated = idx.duplicated(keep=False) + tm.assert_numpy_array_equal(duplicated, expected) + self.assertTrue(duplicated.dtype == bool) + expected = MultiIndex.from_arrays(([2, 3, 2 ,3], [1, 1, 2, 2])) + tm.assert_index_equal(idx.drop_duplicates(keep=False), expected) + + # deprecate take_last expected = np.array([True, False, False, False, False, False]) duplicated = idx.duplicated(take_last=True) tm.assert_numpy_array_equal(duplicated, expected) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 66a38cd858846..31843616956f6 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -4782,29 +4782,63 @@ def test_axis_alias(self): self.assertEqual(s._get_axis_name('rows'), 'index') def test_drop_duplicates(self): - s = Series([1, 2, 3, 3]) + # check both int and object + for s in [Series([1, 2, 3, 3]), Series(['1', '2', '3', '3'])]: + expected = Series([False, False, False, True]) + assert_series_equal(s.duplicated(), expected) + assert_series_equal(s.drop_duplicates(), s[~expected]) + sc = s.copy() + sc.drop_duplicates(inplace=True) + assert_series_equal(sc, s[~expected]) - result = s.duplicated() - expected = Series([False, False, False, True]) - assert_series_equal(result, expected) + expected = Series([False, False, True, False]) + assert_series_equal(s.duplicated(keep='last'), expected) + assert_series_equal(s.drop_duplicates(keep='last'), s[~expected]) + sc = s.copy() + sc.drop_duplicates(keep='last', inplace=True) + assert_series_equal(sc, s[~expected]) + # deprecate take_last + assert_series_equal(s.duplicated(take_last=True), expected) + assert_series_equal(s.drop_duplicates(take_last=True), s[~expected]) + sc = s.copy() + sc.drop_duplicates(take_last=True, inplace=True) + assert_series_equal(sc, s[~expected]) - result = s.duplicated(take_last=True) - expected = 
Series([False, False, True, False]) - assert_series_equal(result, expected) + expected = Series([False, False, True, True]) + assert_series_equal(s.duplicated(keep=False), expected) + assert_series_equal(s.drop_duplicates(keep=False), s[~expected]) + sc = s.copy() + sc.drop_duplicates(keep=False, inplace=True) + assert_series_equal(sc, s[~expected]) + + for s in [Series([1, 2, 3, 5, 3, 2, 4]), + Series(['1', '2', '3', '5', '3', '2', '4'])]: + expected = Series([False, False, False, False, True, True, False]) + assert_series_equal(s.duplicated(), expected) + assert_series_equal(s.drop_duplicates(), s[~expected]) + sc = s.copy() + sc.drop_duplicates(inplace=True) + assert_series_equal(sc, s[~expected]) - result = s.drop_duplicates() - expected = s[[True, True, True, False]] - assert_series_equal(result, expected) - sc = s.copy() - sc.drop_duplicates(inplace=True) - assert_series_equal(sc, expected) + expected = Series([False, True, True, False, False, False, False]) + assert_series_equal(s.duplicated(keep='last'), expected) + assert_series_equal(s.drop_duplicates(keep='last'), s[~expected]) + sc = s.copy() + sc.drop_duplicates(keep='last', inplace=True) + assert_series_equal(sc, s[~expected]) + # deprecate take_last + assert_series_equal(s.duplicated(take_last=True), expected) + assert_series_equal(s.drop_duplicates(take_last=True), s[~expected]) + sc = s.copy() + sc.drop_duplicates(take_last=True, inplace=True) + assert_series_equal(sc, s[~expected]) - result = s.drop_duplicates(take_last=True) - expected = s[[True, True, False, True]] - assert_series_equal(result, expected) - sc = s.copy() - sc.drop_duplicates(take_last=True, inplace=True) - assert_series_equal(sc, expected) + expected = Series([False, True, True, False, True, True, False]) + assert_series_equal(s.duplicated(keep=False), expected) + assert_series_equal(s.drop_duplicates(keep=False), s[~expected]) + sc = s.copy() + sc.drop_duplicates(keep=False, inplace=True) + assert_series_equal(sc, s[~expected]) def test_sort(self): ts = self.ts.copy() diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py index 035b3ac07342d..f10d541a7e23b 100644 --- a/pandas/tests/test_tseries.py +++ b/pandas/tests/test_tseries.py @@ -275,10 +275,18 @@ def test_duplicated_with_nas(): expected = [False, False, False, True, False, True] assert(np.array_equal(result, expected)) - result = lib.duplicated(keys, take_last=True) + result = lib.duplicated(keys, keep='first') + expected = [False, False, False, True, False, True] + assert(np.array_equal(result, expected)) + + result = lib.duplicated(keys, keep='last') expected = [True, False, True, False, False, False] assert(np.array_equal(result, expected)) + result = lib.duplicated(keys, keep=False) + expected = [True, False, True, True, False, True] + assert(np.array_equal(result, expected)) + keys = np.empty(8, dtype=object) for i, t in enumerate(zip([0, 0, nan, nan] * 2, [0, nan, 0, nan] * 2)): keys[i] = t @@ -289,10 +297,14 @@ def test_duplicated_with_nas(): expected = falses + trues assert(np.array_equal(result, expected)) - result = lib.duplicated(keys, take_last=True) + result = lib.duplicated(keys, keep='last') expected = trues + falses assert(np.array_equal(result, expected)) + result = lib.duplicated(keys, keep=False) + expected = trues + trues + assert(np.array_equal(result, expected)) + def test_maybe_booleans_to_slice(): arr = np.array([0, 0, 1, 1, 1, 0, 1], dtype=np.uint8) From d44ecc3c8716883675cf76c9d1a3c5828b18047e Mon Sep 17 00:00:00 2001 From: sinhrks Date: Fri, 26 Jun 
2015 04:54:37 +0900 Subject: [PATCH 05/12] TST: make assertion messages more understandable --- pandas/io/tests/test_json/test_pandas.py | 28 +- pandas/src/testing.pyx | 83 +++++- pandas/tests/test_index.py | 35 ++- pandas/tests/test_testing.py | 353 +++++++++++++++++++++- pandas/util/testing.py | 357 ++++++++++++++++++----- 5 files changed, 763 insertions(+), 93 deletions(-) diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index c145c717df4c4..66c2bbde0b3f8 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -178,7 +178,10 @@ def _check_orient(df, orient, dtype=None, numpy=False, self.assertTrue(df.columns.equals(unser.columns)) elif orient == "values": # index and cols are not captured in this orientation - assert_almost_equal(df.values, unser.values) + if numpy is True and df.shape == (0, 0): + assert unser.shape[0] == 0 + else: + assert_almost_equal(df.values, unser.values) elif orient == "split": # index and col labels might not be strings unser.index = [str(i) for i in unser.index] @@ -670,15 +673,20 @@ def test_doc_example(self): def test_misc_example(self): # parsing unordered input fails - result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]',numpy=True) - expected = DataFrame([[1,2],[1,2]],columns=['a','b']) - with tm.assertRaisesRegexp(AssertionError, - '\[index\] left \[.+\], right \[.+\]'): + result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]', numpy=True) + expected = DataFrame([[1,2], [1,2]], columns=['a', 'b']) + + error_msg = """DataFrame\\.index are different + +DataFrame\\.index values are different \\(100\\.0 %\\) +\\[left\\]: Index\\(\\[u?'a', u?'b'\\], dtype='object'\\) +\\[right\\]: Int64Index\\(\\[0, 1\\], dtype='int64'\\)""" + with tm.assertRaisesRegexp(AssertionError, error_msg): assert_frame_equal(result, expected) result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]') - expected = DataFrame([[1,2],[1,2]],columns=['a','b']) - assert_frame_equal(result,expected) + expected = DataFrame([[1,2], [1,2]], columns=['a','b']) + assert_frame_equal(result, expected) @network def test_round_trip_exception_(self): @@ -739,3 +747,9 @@ def my_handler_raises(obj): raise TypeError("raisin") self.assertRaises(TypeError, DataFrame({'a': [1, 2, object()]}).to_json, default_handler=my_handler_raises) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', + '--pdb-failure', '-s'], exit=False) \ No newline at end of file diff --git a/pandas/src/testing.pyx b/pandas/src/testing.pyx index 4977a80acc936..1abc758559e70 100644 --- a/pandas/src/testing.pyx +++ b/pandas/src/testing.pyx @@ -55,11 +55,39 @@ cpdef assert_dict_equal(a, b, bint compare_keys=True): return True -cpdef assert_almost_equal(a, b, bint check_less_precise=False): +cpdef assert_almost_equal(a, b, bint check_less_precise=False, + obj=None, lobj=None, robj=None): + """Check that left and right objects are almost equal. + + Parameters + ---------- + a : object + b : object + check_less_precise : bool, default False + Specify comparison precision. + 5 digits (False) or 3 digits (True) after decimal points are compared. 
+ obj : str, default None + Specify object name being compared, internally used to show appropriate + assertion message + lobj : str, default None + Specify left object name being compared, internally used to show + appropriate assertion message + robj : str, default None + Specify right object name being compared, internally used to show + appropriate assertion message + """ + cdef: int decimal + double diff = 0.0 Py_ssize_t i, na, nb double fa, fb + bint is_unequal = False + + if lobj is None: + lobj = a + if robj is None: + robj = b if isinstance(a, dict) or isinstance(b, dict): return assert_dict_equal(a, b) @@ -70,33 +98,62 @@ cpdef assert_almost_equal(a, b, bint check_less_precise=False): return True if isiterable(a): - assert isiterable(b), ( - "First object is iterable, second isn't: %r != %r" % (a, b) - ) + + if not isiterable(b): + from pandas.util.testing import raise_assert_detail + if obj is None: + obj = 'Iterable' + msg = "First object is iterable, second isn't" + raise_assert_detail(obj, msg, a, b) + assert has_length(a) and has_length(b), ( "Can't compare objects without length, one or both is invalid: " "(%r, %r)" % (a, b) ) - na, nb = len(a), len(b) - assert na == nb, ( - "Length of two iterators not the same: %r != %r" % (na, nb) - ) if isinstance(a, np.ndarray) and isinstance(b, np.ndarray): + if obj is None: + obj = 'numpy array' + na, nb = a.size, b.size + if a.shape != b.shape: + from pandas.util.testing import raise_assert_detail + raise_assert_detail(obj, '{0} shapes are different'.format(obj), + a.shape, b.shape) try: if np.array_equal(a, b): return True except: pass + else: + if obj is None: + obj = 'Iterable' + na, nb = len(a), len(b) + + if na != nb: + from pandas.util.testing import raise_assert_detail + raise_assert_detail(obj, '{0} length are different'.format(obj), + na, nb) + + for i in xrange(len(a)): + try: + assert_almost_equal(a[i], b[i], check_less_precise) + except AssertionError: + is_unequal = True + diff += 1 - for i in xrange(na): - assert_almost_equal(a[i], b[i], check_less_precise) + if is_unequal: + from pandas.util.testing import raise_assert_detail + msg = '{0} values are different ({1} %)'.format(obj, np.round(diff * 100.0 / na, 5)) + raise_assert_detail(obj, msg, lobj, robj) return True + elif isiterable(b): - assert False, ( - "Second object is iterable, first isn't: %r != %r" % (a, b) - ) + from pandas.util.testing import raise_assert_detail + if obj is None: + obj = 'Iterable' + msg = "Second object is iterable, first isn't" + raise_assert_detail(obj, msg, a, b) if isnull(a): assert isnull(b), ( diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index d6e57e76d0ec9..3c988943301c0 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -3371,7 +3371,10 @@ def test_inplace_mutation_resets_values(self): # make sure label setting works too labels2 = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]] - exp_values = np.array([(long(1), 'a')] * 6, dtype=object) + exp_values = np.empty((6, ), dtype=object) + exp_values[:] = [(long(1), 'a')] * 6 + # must be 1d array of tuples + self.assertEqual(exp_values.shape, (6, )) new_values = mi2.set_labels(labels2).values # not inplace shouldn't change assert_almost_equal(mi2._tuples, vals2) @@ -4772,8 +4775,20 @@ def test_repr_roundtrip(self): mi = MultiIndex.from_product([list('ab'),range(3)],names=['first','second']) str(mi) - tm.assert_index_equal(eval(repr(mi)),mi,exact=True) - + + if compat.PY3: + tm.assert_index_equal(eval(repr(mi)), mi, exact=True) + else: + result = 
eval(repr(mi))
+            # string coerces to unicode
+            tm.assert_index_equal(result, mi, exact=False)
+            self.assertEqual(mi.get_level_values('first').inferred_type, 'string')
+            self.assertEqual(result.get_level_values('first').inferred_type, 'unicode')
+
+            mi_u = MultiIndex.from_product([list(u'ab'),range(3)],names=['first','second'])
+            result = eval(repr(mi_u))
+            tm.assert_index_equal(result, mi_u, exact=True)
+
         # formatting
         if compat.PY3:
             str(mi)
@@ -4783,7 +4798,19 @@ def test_repr_roundtrip(self):
         # long format
         mi = MultiIndex.from_product([list('abcdefg'),range(10)],names=['first','second'])
         result = str(mi)
-        tm.assert_index_equal(eval(repr(mi)),mi,exact=True)
+
+        if compat.PY3:
+            tm.assert_index_equal(eval(repr(mi)), mi, exact=True)
+        else:
+            result = eval(repr(mi))
+            # string coerces to unicode
+            tm.assert_index_equal(result, mi, exact=False)
+            self.assertEqual(mi.get_level_values('first').inferred_type, 'string')
+            self.assertEqual(result.get_level_values('first').inferred_type, 'unicode')
+
+            mi_u = MultiIndex.from_product([list(u'abcdefg'),range(10)],names=['first','second'])
+            result = eval(repr(mi_u))
+            tm.assert_index_equal(result, mi_u, exact=True)

     def test_str(self):
         # tested elsewhere
diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py
index 668579911d6d5..f4fbc19535107 100644
--- a/pandas/tests/test_testing.py
+++ b/pandas/tests/test_testing.py
@@ -10,7 +10,8 @@
 import pandas.util.testing as tm
 from pandas.util.testing import (
     assert_almost_equal, assertRaisesRegexp, raise_with_traceback,
-    assert_series_equal, assert_frame_equal, RNGContext
+    assert_index_equal, assert_series_equal, assert_frame_equal,
+    assert_numpy_array_equal, assert_isinstance, RNGContext
 )

 # let's get meta.
@@ -132,6 +133,275 @@ def test_raise_with_traceback(self):
         raise_with_traceback(e, traceback)


+class TestAssertNumpyArrayEqual(tm.TestCase):
+
+    def test_numpy_array_equal_message(self):
+
+        expected = """numpy array are different
+
+numpy array shapes are different
+\\[left\\]: \\(2,\\)
+\\[right\\]: \\(3,\\)"""
+        with assertRaisesRegexp(AssertionError, expected):
+            assert_numpy_array_equal(np.array([1, 2]), np.array([3, 4, 5]))
+
+        with assertRaisesRegexp(AssertionError, expected):
+            assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5]))
+
+        # scalar comparison
+        expected = """: 1 != 2"""
+        with assertRaisesRegexp(AssertionError, expected):
+            assert_numpy_array_equal(1, 2)
+        expected = """expected 2\\.00000 but got 1\\.00000, with decimal 5"""
+        with assertRaisesRegexp(AssertionError, expected):
+            assert_almost_equal(1, 2)
+
+        # array / scalar array comparison
+        expected = """(numpy array|Iterable) are different
+
+First object is iterable, second isn't
+\\[left\\]: \\[1\\]
+\\[right\\]: 1"""
+        with assertRaisesRegexp(AssertionError, expected):
+            assert_numpy_array_equal(np.array([1]), 1)
+        with assertRaisesRegexp(AssertionError, expected):
+            assert_almost_equal(np.array([1]), 1)
+
+        # scalar / array comparison
+        expected = """(numpy array|Iterable) are different
+
+Second object is iterable, first isn't
+\\[left\\]: 1
+\\[right\\]: \\[1\\]"""
+        with assertRaisesRegexp(AssertionError, expected):
+            assert_numpy_array_equal(1, np.array([1]))
+        with assertRaisesRegexp(AssertionError, expected):
+            assert_almost_equal(1, np.array([1]))
+
+        expected = """numpy array are different
+
+numpy array values are different \\(66\\.66667 %\\)
+\\[left\\]: \\[nan, 2\\.0, 3\\.0\\]
+\\[right\\]: \\[1\\.0, nan, 3\\.0\\]"""
+        with assertRaisesRegexp(AssertionError, expected):
+
assert_numpy_array_equal(np.array([np.nan, 2, 3]), np.array([1, np.nan, 3])) + with assertRaisesRegexp(AssertionError, expected): + assert_almost_equal(np.array([np.nan, 2, 3]), np.array([1, np.nan, 3])) + + expected = """numpy array are different + +numpy array values are different \\(50\\.0 %\\) +\\[left\\]: \\[1, 2\\] +\\[right\\]: \\[1, 3\\]""" + with assertRaisesRegexp(AssertionError, expected): + assert_numpy_array_equal(np.array([1, 2]), np.array([1, 3])) + with assertRaisesRegexp(AssertionError, expected): + assert_almost_equal(np.array([1, 2]), np.array([1, 3])) + + + expected = """numpy array are different + +numpy array values are different \\(50\\.0 %\\) +\\[left\\]: \\[1\\.1, 2\\.000001\\] +\\[right\\]: \\[1\\.1, 2.0\\]""" + with assertRaisesRegexp(AssertionError, expected): + assert_numpy_array_equal(np.array([1.1, 2.000001]), np.array([1.1, 2.0])) + + # must pass + assert_almost_equal(np.array([1.1, 2.000001]), np.array([1.1, 2.0])) + + expected = """numpy array are different + +numpy array values are different \\(16\\.66667 %\\) +\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\], \\[5, 6\\]\\] +\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]""" + with assertRaisesRegexp(AssertionError, expected): + assert_numpy_array_equal(np.array([[1, 2], [3, 4], [5, 6]]), + np.array([[1, 3], [3, 4], [5, 6]])) + with assertRaisesRegexp(AssertionError, expected): + assert_almost_equal(np.array([[1, 2], [3, 4], [5, 6]]), + np.array([[1, 3], [3, 4], [5, 6]])) + + expected = """numpy array are different + +numpy array values are different \\(25\\.0 %\\) +\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\]\\] +\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\]\\]""" + with assertRaisesRegexp(AssertionError, expected): + assert_numpy_array_equal(np.array([[1, 2], [3, 4]]), + np.array([[1, 3], [3, 4]])) + with assertRaisesRegexp(AssertionError, expected): + assert_almost_equal(np.array([[1, 2], [3, 4]]), + np.array([[1, 3], [3, 4]])) + + # allow to overwrite message + expected = """Index are different + +Index shapes are different +\\[left\\]: \\(2,\\) +\\[right\\]: \\(3,\\)""" + with assertRaisesRegexp(AssertionError, expected): + assert_numpy_array_equal(np.array([1, 2]), np.array([3, 4, 5]), + obj='Index') + with assertRaisesRegexp(AssertionError, expected): + assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5]), + obj='Index') + + def test_assert_almost_equal_iterable_message(self): + + expected = """Iterable are different + +Iterable length are different +\\[left\\]: 2 +\\[right\\]: 3""" + with assertRaisesRegexp(AssertionError, expected): + assert_almost_equal([1, 2], [3, 4, 5]) + + expected = """Iterable are different + +Iterable values are different \\(50\\.0 %\\) +\\[left\\]: \\[1, 2\\] +\\[right\\]: \\[1, 3\\]""" + with assertRaisesRegexp(AssertionError, expected): + assert_almost_equal([1, 2], [1, 3]) + + +class TestAssertIndexEqual(unittest.TestCase): + _multiprocess_can_split_ = True + + def test_index_equal_message(self): + + expected = """Index are different + +Index levels are different +\\[left\\]: 1, Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) +\\[right\\]: 2, MultiIndex\\(levels=\\[\\[u?'A', u?'B'\\], \\[1, 2, 3, 4\\]\\], + labels=\\[\\[0, 0, 1, 1\\], \\[0, 1, 2, 3\\]\\]\\)""" + idx1 = pd.Index([1, 2, 3]) + idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 3), ('B', 4)]) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2, exact=False) + + + expected = """MultiIndex level 
\\[1\\] are different + +MultiIndex level \\[1\\] values are different \\(25\\.0 %\\) +\\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" + idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2), ('B', 3), ('B', 4)]) + idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 3), ('B', 4)]) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2, check_exact=False) + + expected = """Index are different + +Index length are different +\\[left\\]: 3, Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) +\\[right\\]: 4, Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" + idx1 = pd.Index([1, 2, 3]) + idx2 = pd.Index([1, 2, 3, 4]) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2, check_exact=False) + + expected = """Index are different + +Index classes are different +\\[left\\]: Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) +\\[right\\]: Float64Index\\(\\[1\\.0, 2\\.0, 3\\.0\\], dtype='float64'\\)""" + idx1 = pd.Index([1, 2, 3]) + idx2 = pd.Index([1, 2, 3.0]) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2, exact=True) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2, exact=True, check_exact=False) + + expected = """Index are different + +Index values are different \\(33\\.33333 %\\) +\\[left\\]: Float64Index\\(\\[1.0, 2.0, 3.0], dtype='float64'\\) +\\[right\\]: Float64Index\\(\\[1.0, 2.0, 3.0000000001\\], dtype='float64'\\)""" + idx1 = pd.Index([1, 2, 3.]) + idx2 = pd.Index([1, 2, 3.0000000001]) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2) + + # must success + assert_index_equal(idx1, idx2, check_exact=False) + + expected = """Index are different + +Index values are different \\(33\\.33333 %\\) +\\[left\\]: Float64Index\\(\\[1.0, 2.0, 3.0], dtype='float64'\\) +\\[right\\]: Float64Index\\(\\[1.0, 2.0, 3.0001\\], dtype='float64'\\)""" + idx1 = pd.Index([1, 2, 3.]) + idx2 = pd.Index([1, 2, 3.0001]) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2, check_exact=False) + # must success + assert_index_equal(idx1, idx2, check_exact=False, check_less_precise=True) + + expected = """Index are different + +Index values are different \\(33\\.33333 %\\) +\\[left\\]: Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[1, 2, 4\\], dtype='int64'\\)""" + idx1 = pd.Index([1, 2, 3]) + idx2 = pd.Index([1, 2, 4]) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2, check_less_precise=True) + + expected = """MultiIndex level \\[1\\] are different + +MultiIndex level \\[1\\] values are different \\(25\\.0 %\\) +\\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" + idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2), ('B', 3), ('B', 4)]) + idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 3), ('B', 4)]) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, 
idx2, check_exact=False) + + def test_index_equal_metadata_message(self): + + expected = """Index are different + +Attribute "names" are different +\\[left\\]: \\[None\\] +\\[right\\]: \\[u?'x'\\]""" + idx1 = pd.Index([1, 2, 3]) + idx2 = pd.Index([1, 2, 3], name='x') + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2) + + # same name, should pass + assert_index_equal(pd.Index([1, 2, 3], name=np.nan), + pd.Index([1, 2, 3], name=np.nan)) + assert_index_equal(pd.Index([1, 2, 3], name=pd.NaT), + pd.Index([1, 2, 3], name=pd.NaT)) + + + expected = """Index are different + +Attribute "names" are different +\\[left\\]: \\[nan\\] +\\[right\\]: \\[NaT\\]""" + idx1 = pd.Index([1, 2, 3], name=np.nan) + idx2 = pd.Index([1, 2, 3], name=pd.NaT) + with assertRaisesRegexp(AssertionError, expected): + assert_index_equal(idx1, idx2) + + class TestAssertSeriesEqual(tm.TestCase): _multiprocess_can_split_ = True @@ -191,6 +461,28 @@ def test_multiindex_dtype(self): {'a':[1.0,2.0],'b':[2.1,1.5],'c':['l1','l2']}, index=['a','b']) self._assert_not_equal(df1.c, df2.c, check_index_type=True) + def test_series_equal_message(self): + + expected = """Series are different + +Series length are different +\\[left\\]: 3, Int64Index\\(\\[0, 1, 2\\], dtype='int64'\\) +\\[right\\]: 4, Int64Index\\(\\[0, 1, 2, 3\\], dtype='int64'\\)""" + with assertRaisesRegexp(AssertionError, expected): + assert_series_equal(pd.Series([1, 2, 3]), pd.Series([1, 2, 3, 4])) + + + expected = """Series are different + +Series values are different \\(33\\.33333 %\\) +\\[left\\]: \\[1, 2, 3\\] +\\[right\\]: \\[1, 2, 4\\]""" + with assertRaisesRegexp(AssertionError, expected): + assert_series_equal(pd.Series([1, 2, 3]), pd.Series([1, 2, 4])) + with assertRaisesRegexp(AssertionError, expected): + assert_series_equal(pd.Series([1, 2, 3]), pd.Series([1, 2, 4]), + check_less_precise=True) + class TestAssertFrameEqual(tm.TestCase): _multiprocess_can_split_ = True @@ -224,6 +516,65 @@ def test_empty_dtypes(self): self._assert_equal(df1, df2, check_dtype=False) self._assert_not_equal(df1, df2, check_dtype=True) + def test_frame_equal_message(self): + + expected = """DataFrame are different + +DataFrame shape \\(number of rows\\) are different +\\[left\\]: 3, Int64Index\\(\\[0, 1, 2\\], dtype='int64'\\) +\\[right\\]: 4, Int64Index\\(\\[0, 1, 2, 3\\], dtype='int64'\\)""" + with assertRaisesRegexp(AssertionError, expected): + assert_frame_equal(pd.DataFrame({'A':[1, 2, 3]}), + pd.DataFrame({'A':[1, 2, 3, 4]})) + + + expected = """DataFrame are different + +DataFrame shape \\(number of columns\\) are different +\\[left\\]: 2, Index\\(\\[u?'A', u?'B'\\], dtype='object'\\) +\\[right\\]: 1, Index\\(\\[u?'A'\\], dtype='object'\\)""" + with assertRaisesRegexp(AssertionError, expected): + assert_frame_equal(pd.DataFrame({'A':[1, 2, 3], 'B':[4, 5, 6]}), + pd.DataFrame({'A':[1, 2, 3]})) + + + expected = """DataFrame\\.index are different + +DataFrame\\.index values are different \\(33\\.33333 %\\) +\\[left\\]: Index\\(\\[u?'a', u?'b', u?'c'\\], dtype='object'\\) +\\[right\\]: Index\\(\\[u?'a', u?'b', u?'d'\\], dtype='object'\\)""" + with assertRaisesRegexp(AssertionError, expected): + assert_frame_equal(pd.DataFrame({'A':[1, 2, 3], 'B':[4, 5, 6]}, + index=['a', 'b', 'c']), + pd.DataFrame({'A':[1, 2, 3], 'B':[4, 5, 6]}, + index=['a', 'b', 'd'])) + + expected = """DataFrame\\.columns are different + +DataFrame\\.columns values are different \\(50\\.0 %\\) +\\[left\\]: Index\\(\\[u?'A', u?'B'\\], dtype='object'\\) +\\[right\\]: 
Index\\(\\[u?'A', u?'b'\\], dtype='object'\\)""" + with assertRaisesRegexp(AssertionError, expected): + assert_frame_equal(pd.DataFrame({'A':[1, 2, 3], 'B':[4, 5, 6]}, + index=['a', 'b', 'c']), + pd.DataFrame({'A':[1, 2, 3], 'b':[4, 5, 6]}, + index=['a', 'b', 'c'])) + + + expected = """DataFrame\\.iloc\\[:, 1\\] are different + +DataFrame\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) +\\[left\\]: \\[4, 5, 6\\] +\\[right\\]: \\[4, 5, 7\\]""" + with assertRaisesRegexp(AssertionError, expected): + assert_frame_equal(pd.DataFrame({'A':[1, 2, 3], 'B':[4, 5, 6]}), + pd.DataFrame({'A':[1, 2, 3], 'B':[4, 5, 7]})) + + with assertRaisesRegexp(AssertionError, expected): + assert_frame_equal(pd.DataFrame({'A':[1, 2, 3], 'B':[4, 5, 6]}), + pd.DataFrame({'A':[1, 2, 3], 'B':[4, 5, 7]}), + by_blocks=True) + class TestRNGContext(unittest.TestCase): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 979ac007c7500..4b7c8d4540e0f 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -23,8 +23,9 @@ import numpy as np import pandas as pd -from pandas.core.common import (is_sequence, array_equivalent, is_list_like, is_number, - is_datetimelike_v_numeric, is_datetimelike_v_object) +from pandas.core.common import (is_sequence, array_equivalent, is_list_like, + is_datetimelike_v_numeric, is_datetimelike_v_object, + is_number, pprint_thing, take_1d) import pandas.compat as compat from pandas.compat import( filter, map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter, @@ -536,23 +537,128 @@ def assert_equal(a, b, msg=""): assert a == b, "%s: %r != %r" % (msg.format(a,b), a, b) -def assert_index_equal(left, right, exact=False, check_names=True): +def assert_index_equal(left, right, exact=False, check_names=True, + check_less_precise=False, check_exact=True, obj='Index'): + """Check that left and right Index are equal. + + Parameters + ---------- + left : Index + right : Index + exact : bool, default False + Whether to check the Index class, dtype and inferred_type are identical. + check_names : bool, default True + Whether to check the names attribute. + check_less_precise : bool, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + check_exact : bool, default True + Whether to compare number exactly. 
+ obj : str, default 'Index' + Specify object name being compared, internally used to show appropriate + assertion message + """ + + def _check_types(l, r, obj='Index'): + if exact: + if type(l) != type(r): + msg = '{0} classes are different'.format(obj) + raise_assert_detail(obj, msg, l, r) + assert_attr_equal('dtype', l, r, obj=obj) + assert_attr_equal('inferred_type', l, r, obj=obj) + + def _get_ilevel_values(index, level): + # accept level number only + unique = index.levels[level] + labels = index.labels[level] + filled = take_1d(unique.values, labels, fill_value=unique._na_value) + values = unique._simple_new(filled, index.names[level], + freq=getattr(unique, 'freq', None), + tz=getattr(unique, 'tz', None)) + return values + + # instance validation assertIsInstance(left, Index, '[index] ') assertIsInstance(right, Index, '[index] ') - if not left.equals(right) or (exact and type(left) != type(right)): - raise AssertionError("[index] left [{0} {1}], right [{2} {3}]".format(left.dtype, - left, - right, - right.dtype)) + + # class / dtype comparison + _check_types(left, right) + + # level comparison + if left.nlevels != right.nlevels: + raise_assert_detail(obj, '{0} levels are different'.format(obj), + '{0}, {1}'.format(left.nlevels, left), + '{0}, {1}'.format(right.nlevels, right)) + + # length comparison + if len(left) != len(right): + raise_assert_detail(obj, '{0} length are different'.format(obj), + '{0}, {1}'.format(len(left), left), + '{0}, {1}'.format(len(right), right)) + + # MultiIndex special comparison for little-friendly error messages + if left.nlevels > 1: + for level in range(left.nlevels): + # cannot use get_level_values here because it can change dtype + llevel = _get_ilevel_values(left, level) + rlevel = _get_ilevel_values(right, level) + + lobj = 'MultiIndex level [{0}]'.format(level) + assert_index_equal(llevel, rlevel, + exact=exact, check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, obj=lobj) + # get_level_values may change dtype + _check_types(left.levels[level], right.levels[level], obj=obj) + + if check_exact: + if not left.equals(right): + diff = np.sum((left.values != right.values).astype(int)) * 100.0 / len(left) + msg = '{0} values are different ({1} %)'.format(obj, np.round(diff, 5)) + raise_assert_detail(obj, msg, left, right) + else: + assert_almost_equal(left.values, right.values, + check_less_precise=check_less_precise, + obj=obj, lobj=left, robj=right) + + # metadata comparison if check_names: - assert_attr_equal('names', left, right) + assert_attr_equal('names', left, right, obj=obj) + +def assert_attr_equal(attr, left, right, obj='Attributes'): + """checks attributes are equal. Both objects must have attribute. + + Parameters + ---------- + attr : str + Attribute name being compared. + left : object + right : object + obj : str, default 'Attributes' + Specify object name being compared, internally used to show appropriate + assertion message + """ -def assert_attr_equal(attr, left, right): - """checks attributes are equal. 
Both objects must have attribute.""" left_attr = getattr(left, attr) right_attr = getattr(right, attr) - assert_equal(left_attr,right_attr,"attr is not equal [{0}]" .format(attr)) + + if left_attr is right_attr: + return True + elif (is_number(left_attr) and np.isnan(left_attr) and + is_number(right_attr) and np.isnan(right_attr)): + # np.nan + return True + + result = left_attr == right_attr + if not isinstance(result, bool): + result = result.all() + + if result: + return True + else: + raise_assert_detail(obj, 'Attribute "{0}" are different'.format(attr), + left_attr, right_attr) def isiterable(obj): @@ -607,6 +713,7 @@ def assertIsInstance(obj, cls, msg=''): def assert_isinstance(obj, class_type_or_tuple, msg=''): return deprecate('assert_isinstance', assertIsInstance)(obj, class_type_or_tuple, msg=msg) + def assertNotIsInstance(obj, cls, msg=''): """Test that obj is not an instance of cls (which can be a class or a tuple of classes, @@ -630,8 +737,23 @@ def assert_categorical_equal(res, exp): raise AssertionError("ordered not the same") -def assert_numpy_array_equal(np_array, assert_equal, - strict_nan=False, err_msg=None): +def raise_assert_detail(obj, message, left, right): + if isinstance(left, np.ndarray): + left = pprint_thing(left) + if isinstance(right, np.ndarray): + right = pprint_thing(right) + + msg = """{0} are different + +{1} +[left]: {2} +[right]: {3}""".format(obj, message, left, right) + raise AssertionError(msg) + + +def assert_numpy_array_equal(left, right, + strict_nan=False, err_msg=None, + obj='numpy array'): """Checks that 'np_array' is equivalent to 'assert_equal'. This is similar to ``numpy.testing.assert_array_equal``, but can @@ -639,10 +761,42 @@ def assert_numpy_array_equal(np_array, assert_equal, equivalent if the arrays have equal non-NaN elements, and `np.nan` in corresponding locations. """ - if array_equivalent(np_array, assert_equal, strict_nan=strict_nan): + + # compare shape and values + if array_equivalent(left, right, strict_nan=strict_nan): return + if err_msg is None: - err_msg = '{0} is not equivalent to {1}.'.format(np_array, assert_equal) + # show detailed error + + if np.isscalar(left) and np.isscalar(right): + # show scalar comparison error + assert_equal(left, right) + elif is_list_like(left) and is_list_like(right): + # some test cases pass list + left = np.asarray(left) + right = np.array(right) + + if left.shape != right.shape: + raise_assert_detail(obj, '{0} shapes are different'.format(obj), + left.shape, right.shape) + + diff = 0 + for l, r in zip(left, right): + # count up differences + if not array_equivalent(l, r, strict_nan=strict_nan): + diff += 1 + + diff = diff * 100.0 / left.size + msg = '{0} values are different ({1} %)'.format(obj, np.round(diff, 5)) + raise_assert_detail(obj, msg, left, right) + elif is_list_like(left): + msg = "First object is iterable, second isn't" + raise_assert_detail(obj, msg, left, right) + else: + msg = "Second object is iterable, first isn't" + raise_assert_detail(obj, msg, left, right) + raise AssertionError(err_msg) @@ -651,17 +805,62 @@ def assert_series_equal(left, right, check_dtype=True, check_index_type=False, check_series_type=False, check_less_precise=False, - check_exact=False, check_names=True, - check_datetimelike_compat=False): + check_exact=False, + check_datetimelike_compat=False, + obj='Series'): + + """Check that left and right Series are equal. 
+
+    Parameters
+    ----------
+    left : Series
+    right : Series
+    check_dtype : bool, default True
+        Whether to check the Series dtype is identical.
+    check_index_type : bool, default False
+        Whether to check the Index class, dtype and inferred_type are identical.
+    check_series_type : bool, default False
+        Whether to check the Series class is identical.
+    check_less_precise : bool, default False
+        Specify comparison precision. Only used when check_exact is False.
+        5 digits (False) or 3 digits (True) after decimal points are compared.
+    check_exact : bool, default False
+        Whether to compare number exactly.
+    check_names : bool, default True
+        Whether to check the Series and Index names attribute.
+    check_datetimelike_compat : bool, default False
+        Compare datetime-like which is comparable ignoring dtype.
+    obj : str, default 'Series'
+        Specify object name being compared, internally used to show appropriate
+        assertion message
+    """
+
+    # instance validation
+    assertIsInstance(left, Series, '[Series] ')
+    assertIsInstance(right, Series, '[Series] ')
+
     if check_series_type:
         assertIsInstance(left, type(right))
+
+    # length comparison
+    if len(left) != len(right):
+        raise_assert_detail(obj, 'Series length are different',
+                            '{0}, {1}'.format(len(left), left.index),
+                            '{0}, {1}'.format(len(right), right.index))
+
+    # index comparison
+    assert_index_equal(left.index, right.index, exact=check_index_type,
+                       check_names=check_names,
+                       check_less_precise=check_less_precise, check_exact=check_exact,
+                       obj='{0}.index'.format(obj))
+
     if check_dtype:
         assert_attr_equal('dtype', left, right)
+
     if check_exact:
-        if not np.array_equal(left.values, right.values):
-            raise AssertionError('{0} is not equal to {1}.'.format(left.values,
-                                                                   right.values))
+        assert_numpy_array_equal(left.get_values(), right.get_values(),
+                                 obj='{0}'.format(obj))
     elif check_datetimelike_compat:
         # we want to check only if we have compat dtypes
         # e.g. integer and M|m are NOT compat, but we can simply check the values in that case
@@ -675,27 +874,12 @@ def assert_series_equal(left, right, check_dtype=True,
         else:
             assert_numpy_array_equal(left.values, right.values)
     else:
-        assert_almost_equal(left.values, right.values, check_less_precise)
-    if check_less_precise:
-        assert_almost_equal(
-            left.index.values, right.index.values, check_less_precise)
-    else:
-        assert_index_equal(left.index, right.index, check_names=check_names)
-    if check_index_type:
-        for level in range(left.index.nlevels):
-            lindex = left.index.get_level_values(level)
-            rindex = right.index.get_level_values(level)
-            assertIsInstance(lindex, type(rindex))
-            assert_attr_equal('dtype', lindex, rindex)
-            assert_attr_equal('inferred_type', lindex, rindex)
+        assert_almost_equal(left.get_values(), right.get_values(),
+                            check_less_precise, obj='{0}'.format(obj))
+
+    # metadata comparison
     if check_names:
-        if is_number(left.name) and np.isnan(left.name):
-            # Series.name can be np.nan in some test cases
-            assert is_number(right.name) and np.isnan(right.name)
-        elif left.name is pd.NaT:
-            assert right.name is pd.NaT
-        else:
-            assert_attr_equal('name', left, right)
+        assert_attr_equal('name', left, right, obj=obj)


 # This could be refactored to use the NDFrame.equals method
@@ -707,19 +891,69 @@ def assert_frame_equal(left, right, check_dtype=True,
                        check_names=True,
                        by_blocks=False,
                        check_exact=False,
-                       check_datetimelike_compat=False):
+                       check_datetimelike_compat=False,
+                       obj='DataFrame'):
+
+    """Check that left and right DataFrame are equal.
+
+    Parameters
+    ----------
+    left : DataFrame
+    right : DataFrame
+    check_dtype : bool, default True
+        Whether to check the DataFrame dtype is identical.
+    check_index_type : bool, default False
+        Whether to check the Index class, dtype and inferred_type are identical.
+    check_column_type : bool, default False
+        Whether to check the columns class, dtype and inferred_type are identical.
+    check_frame_type : bool, default False
+        Whether to check the DataFrame class is identical.
+    check_less_precise : bool, default False
+        Specify comparison precision. Only used when check_exact is False.
+        5 digits (False) or 3 digits (True) after decimal points are compared.
+    check_names : bool, default True
+        Whether to check the Index names attribute.
+    by_blocks : bool, default False
+        Specify how to compare internal data. If False, compare by columns.
+        If True, compare by blocks.
+    check_exact : bool, default False
+        Whether to compare number exactly.
+    check_datetimelike_compat : bool, default False
+        Compare datetime-like which is comparable ignoring dtype.
+    obj : str, default 'DataFrame'
+        Specify object name being compared, internally used to show appropriate
+        assertion message
+    """
+
+    # instance validation
+    assertIsInstance(left, DataFrame, '[DataFrame] ')
+    assertIsInstance(right, DataFrame, '[DataFrame] ')
+
     if check_frame_type:
         assertIsInstance(left, type(right))

-    assertIsInstance(left, DataFrame)
-    assertIsInstance(right, DataFrame)
-
-    if check_less_precise:
-        if not by_blocks:
-            assert_almost_equal(left.columns, right.columns)
-        assert_almost_equal(left.index, right.index)
-    else:
-        if not by_blocks:
-            assert_index_equal(left.columns, right.columns, check_names=check_names)
+    # shape comparison (row)
+    if left.shape[0] != right.shape[0]:
+        raise_assert_detail(obj, 'DataFrame shape (number of rows) are different',
+                            '{0}, {1}'.format(left.shape[0], left.index),
+                            '{0}, {1}'.format(right.shape[0], right.index))
+
+    # shape comparison (columns)
+    if left.shape[1] != right.shape[1]:
+        raise_assert_detail(obj, 'DataFrame shape (number of columns) are different',
+                            '{0}, {1}'.format(left.shape[1], left.columns),
+                            '{0}, {1}'.format(right.shape[1], right.columns))
+
+    # index comparison
+    assert_index_equal(left.index, right.index, exact=check_index_type,
+                       check_names=check_names,
+                       check_less_precise=check_less_precise, check_exact=check_exact,
+                       obj='{0}.index'.format(obj))
+
+    # column comparison
+    assert_index_equal(left.columns, right.columns, exact=check_column_type,
+                       check_names=check_names,
+                       check_less_precise=check_less_precise, check_exact=check_exact,
+                       obj='{0}.columns'.format(obj))

     # compare by blocks
     if by_blocks:
@@ -728,7 +962,8 @@ def assert_frame_equal(left, right, check_dtype=True,
         for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))):
             assert dtype in lblocks
             assert dtype in rblocks
-            assert_frame_equal(lblocks[dtype],rblocks[dtype], check_dtype=check_dtype)
+            assert_frame_equal(lblocks[dtype], rblocks[dtype],
+                               check_dtype=check_dtype, obj='DataFrame.blocks')

     # compare by columns
     else:
@@ -742,22 +977,8 @@ def assert_frame_equal(left, right, check_dtype=True,
                                check_less_precise=check_less_precise,
                                check_exact=check_exact, check_names=check_names,
-                               check_datetimelike_compat=check_datetimelike_compat)
-
-    if check_index_type:
-        for level in range(left.index.nlevels):
-            lindex = left.index.get_level_values(level)
-            rindex = right.index.get_level_values(level)
-            assertIsInstance(lindex, type(rindex))
-            assert_attr_equal('dtype', lindex, rindex)
- if check_column_type:
- assertIsInstance(left.columns, type(right.columns))
- assert_attr_equal('dtype', left.columns, right.columns)
- assert_attr_equal('inferred_type', left.columns, right.columns)
- if check_names:
- assert_attr_equal('names', left.index, right.index)
- assert_attr_equal('names', left.columns, right.columns)
+ check_datetimelike_compat=check_datetimelike_compat,
+ obj='DataFrame.iloc[:, {0}]'.format(i))

 def assert_panelnd_equal(left, right,

From 2f2c5744d895433b2096d884228236b01113123a Mon Sep 17 00:00:00 2001
From: ganego
Date: Mon, 10 Aug 2015 14:49:43 +0200
Subject: [PATCH 06/12] Update install.rst

- Added hint regarding pip install on low-memory machines.
- Added hint about the Python 3 version of pandas in distribution repos.
---
 doc/source/install.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/source/install.rst b/doc/source/install.rst
index aaa39dd383e2e..42cfd95becabb 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -153,7 +153,8 @@ and can take a few minutes to complete.

 Installing using your Linux distribution's package manager.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
+The commands in this table will install pandas for Python 2 from your distribution.
+To install pandas for Python 3, you may need to use the package ``python3-pandas``.

 .. csv-table::
 :header: "Distribution", "Status", "Download / Repository Link", "Install method"

From 17917cbf964f7b62918ffb1b1eb4b6e095b61958 Mon Sep 17 00:00:00 2001
From: Gianluca Rossi
Date: Tue, 11 Aug 2015 21:56:14 +0100
Subject: [PATCH 07/12] BUG: Allow 'read_sql_table' to read from views. Solves #10750.
---
 doc/source/whatsnew/v0.17.0.txt | 13 +------------
 pandas/io/tests/test_sql.py | 3 +++
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 70d616ca72c1b..142bb6b4e8f9e 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -142,15 +142,8 @@ Other enhancements

- ``pd.merge`` will now allow duplicate column names if they are not merged upon (:issue:`10639`).
- ``pd.pivot`` will now allow passing index as ``None`` (:issue:`3962`).

-- ``drop_duplicates`` and ``duplicated`` now accept ``keep`` keyword to target first, last, and all duplicates. ``take_last`` keyword is deprecated, see :ref:`deprecations ` (:issue:`6511`, :issue:`8505`)
-
-.. ipython :: python
-
- s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])
- s.drop_duplicates()
- s.drop_duplicates(keep='last')
- s.drop_duplicates(keep=False)

+- ``read_sql_table`` will now allow reading from views (:issue:`10750`).

 .. _whatsnew_0170.api:

@@ -529,7 +522,6 @@ Deprecations
 ===================== =================================

- ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
-- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was removed in favor of ``keep``. (:issue:`6511`, :issue:`8505`)

 .. _whatsnew_0170.prior_deprecations:

@@ -616,9 +608,6 @@ Bug Fixes

 - Bug in ``read_stata`` when reading a file with a different order set in ``columns`` (:issue:`10757`)

-- Bug in ``Categorical`` may not representing properly when category contains ``tz`` or ``Period`` (:issue:`10713`)
-- Bug in ``Categorical.__iter__`` may not returning correct ``datetime`` and ``Period`` (:issue:`10713`)
-
 - Reading "famafrench" data via ``DataReader`` results in HTTP 404 error because of the website url is changed (:issue:`10591`).

 - Bug in ``read_msgpack`` where DataFrame to decode has duplicate column names (:issue:`9618`)

diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py
index 434f8c4b71e85..5ac7f84c6da1f 100644
--- a/pandas/io/tests/test_sql.py
+++ b/pandas/io/tests/test_sql.py
@@ -166,6 +166,7 @@
 'sqlite': """
 CREATE VIEW iris_view AS
 SELECT * FROM iris;
+<<<<<<< HEAD
 """,
 'mysql': """
 CREATE VIEW iris_view AS
 SELECT * FROM iris;
 """,
 'postgresql': """
 CREATE VIEW iris_view AS
 SELECT * FROM iris;
+=======
+>>>>>>> BUG: Add ability to 'read_sql_table' to read views and implement unit test to check behaviour. Closes #10750.
 """
 }
 }

From 5939fa409e4462e6fe9649ec1bf0d29759a3e1cd Mon Sep 17 00:00:00 2001
From: Gianluca Rossi
Date: Tue, 11 Aug 2015 22:27:25 +0100
Subject: [PATCH 08/12] BUG: Allow 'read_sql_table' to read from views. Solves #10750.
---
 doc/source/whatsnew/v0.17.0.txt | 6 ++----
 pandas/io/tests/test_sql.py | 6 ++++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 843cc32a8ab9b..e50deb689214a 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -143,6 +143,7 @@ Other enhancements

- ``pd.merge`` will now allow duplicate column names if they are not merged upon (:issue:`10639`).
- ``pd.pivot`` will now allow passing index as ``None`` (:issue:`3962`).
+
- ``drop_duplicates`` and ``duplicated`` now accept ``keep`` keyword to target first, last, and all duplicates. ``take_last`` keyword is deprecated, see :ref:`deprecations ` (:issue:`6511`, :issue:`8505`)

 .. ipython :: python

 s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])
 s.drop_duplicates()
 s.drop_duplicates(keep='last')
 s.drop_duplicates(keep=False)

+- ``read_sql_table`` will now allow reading from views (:issue:`10750`).

- ``concat`` will now inherit the existing series names (even when some are missing), if new ones are not provided through the ``keys`` argument (:issue:`10698`).

@@ -554,7 +556,6 @@ Deprecations
 ===================== =================================

- ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
-- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was removed in favor of ``keep``. (:issue:`6511`, :issue:`8505`)

 .. _whatsnew_0170.prior_deprecations:

@@ -641,9 +642,6 @@ Bug Fixes

 - Bug in ``read_stata`` when reading a file with a different order set in ``columns`` (:issue:`10757`)

-- Bug in ``Categorical`` may not representing properly when category contains ``tz`` or ``Period`` (:issue:`10713`)
-- Bug in ``Categorical.__iter__`` may not returning correct ``datetime`` and ``Period`` (:issue:`10713`)
-
 - Reading "famafrench" data via ``DataReader`` results in HTTP 404 error because of the website url is changed (:issue:`10591`).
- Bug in ``read_msgpack`` where DataFrame to decode has duplicate column names (:issue:`9618`) diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 859c6d3250121..402a7af9b6c62 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -161,6 +161,12 @@ SELECT * FROM iris WHERE "Name"=%(name)s AND "SepalLength"=%(length)s """ + }, + 'create_view': { + 'sqlite': """ + CREATE VIEW iris_view AS + SELECT * FROM iris; + """ } } From 36acb79400ab755b271bc01d630eb2d38f49ea66 Mon Sep 17 00:00:00 2001 From: Gianluca Rossi Date: Tue, 11 Aug 2015 22:46:30 +0100 Subject: [PATCH 09/12] # This is a combination of 5 commits. # The first commit's message is: Merge # This is the 2nd commit message: BUG: Categorical doesn't show tzinfo properly # This is the 3rd commit message: ENH: duplicated and drop_duplicates now accept take=all kw # This is the 4th commit message: TST: make assertion messages more understandable # This is the 5th commit message: Update install.rst - Added hint regarding pip install on low memory machines. - Added hint to python 3 version of pandas from distributon repos. --- doc/source/whatsnew/v0.17.0.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 2344f81f291b4..4ac17367c7c2d 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -143,6 +143,15 @@ Other enhancements - ``pd.merge`` will now allow duplicate column names if they are not merged upon (:issue:`10639`). - ``pd.pivot`` will now allow passing index as ``None`` (:issue:`3962`). +- ``drop_duplicates`` and ``duplicated`` now accept ``keep`` keyword to target first, last, and all duplicates. ``take_last`` keyword is deprecated, see :ref:`deprecations ` (:issue:`6511`, :issue:`8505`) + +.. ipython :: python + + s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D']) + s.drop_duplicates() + s.drop_duplicates(keep='last') + s.drop_duplicates(keep=False) + - ``drop_duplicates`` and ``duplicated`` now accept ``keep`` keyword to target first, last, and all duplicates. ``take_last`` keyword is deprecated, see :ref:`deprecations ` (:issue:`6511`, :issue:`8505`) @@ -558,6 +567,7 @@ Deprecations ===================== ================================= - ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`). +- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was removed in favor of ``keep``. (:issue:`6511`, :issue:`8505`) .. _whatsnew_0170.prior_deprecations: @@ -644,6 +654,9 @@ Bug Fixes - Bug in ``read_stata`` when reading a file with a different order set in ``columns`` (:issue:`10757`) +- Bug in ``Categorical`` may not representing properly when category contains ``tz`` or ``Period`` (:issue:`10713`) +- Bug in ``Categorical.__iter__`` may not returning correct ``datetime`` and ``Period`` (:issue:`10713`) + - Reading "famafrench" data via ``DataReader`` results in HTTP 404 error because of the website url is changed (:issue:`10591`). 
- Bug in ``read_msgpack`` where DataFrame to decode has duplicate column names (:issue:`9618`) From 1af51797b4c5b81db57d6f7d1878b4c55dd4bacc Mon Sep 17 00:00:00 2001 From: Gianluca Rossi Date: Tue, 11 Aug 2015 22:49:01 +0100 Subject: [PATCH 10/12] Add whatsnew note --- doc/source/whatsnew/v0.17.0.txt | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 4ac17367c7c2d..29895867740ae 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -143,15 +143,8 @@ Other enhancements - ``pd.merge`` will now allow duplicate column names if they are not merged upon (:issue:`10639`). - ``pd.pivot`` will now allow passing index as ``None`` (:issue:`3962`). -- ``drop_duplicates`` and ``duplicated`` now accept ``keep`` keyword to target first, last, and all duplicates. ``take_last`` keyword is deprecated, see :ref:`deprecations ` (:issue:`6511`, :issue:`8505`) - -.. ipython :: python - - s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D']) - s.drop_duplicates() - s.drop_duplicates(keep='last') - s.drop_duplicates(keep=False) +- ``read_sql_table`` will now allow reading from views (:issue:`10750`). - ``drop_duplicates`` and ``duplicated`` now accept ``keep`` keyword to target first, last, and all duplicates. ``take_last`` keyword is deprecated, see :ref:`deprecations ` (:issue:`6511`, :issue:`8505`) @@ -567,7 +560,6 @@ Deprecations ===================== ================================= - ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`). -- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was removed in favor of ``keep``. (:issue:`6511`, :issue:`8505`) .. _whatsnew_0170.prior_deprecations: @@ -654,9 +646,6 @@ Bug Fixes - Bug in ``read_stata`` when reading a file with a different order set in ``columns`` (:issue:`10757`) -- Bug in ``Categorical`` may not representing properly when category contains ``tz`` or ``Period`` (:issue:`10713`) -- Bug in ``Categorical.__iter__`` may not returning correct ``datetime`` and ``Period`` (:issue:`10713`) - - Reading "famafrench" data via ``DataReader`` results in HTTP 404 error because of the website url is changed (:issue:`10591`). 
- Bug in ``read_msgpack`` where DataFrame to decode has duplicate column names (:issue:`9618`)

From 942cec2ab2623403b98c5731aa44ceca58acdea9 Mon Sep 17 00:00:00 2001
From: Gianluca Rossi
Date: Sat, 8 Aug 2015 22:14:34 +0100
Subject: [PATCH 11/12] Add ability to 'read_sql_table' to read views and implement unit test to check behaviour

---
 pandas/io/sql.py | 2 +-
 pandas/io/tests/test_sql.py | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 8eefe4ba98876..b587ec128c016 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -337,7 +337,7 @@ def read_sql_table(table_name, con, schema=None, index_col=None,
 from sqlalchemy.schema import MetaData
 meta = MetaData(con, schema=schema)
 try:
- meta.reflect(only=[table_name])
+ meta.reflect(only=[table_name], views=True)
 except sqlalchemy.exc.InvalidRequestError:
 raise ValueError("Table %s not found" % table_name)

diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py
index 859c6d3250121..434f8c4b71e85 100644
--- a/pandas/io/tests/test_sql.py
+++ b/pandas/io/tests/test_sql.py
@@ -161,6 +161,20 @@
 SELECT * FROM iris
 WHERE "Name"=%(name)s AND "SepalLength"=%(length)s
 """
+ },
+ 'create_view': {
+ 'sqlite': """
+ CREATE VIEW iris_view AS
+ SELECT * FROM iris;
+ """,
+ 'mysql': """
+ CREATE VIEW iris_view AS
+ SELECT * FROM iris;
+ """,
+ 'postgresql': """
+ CREATE VIEW iris_view AS
+ SELECT * FROM iris;
+ """
 }
 }

@@ -244,6 +258,10 @@ def _load_iris_data(self):
 for row in r:
 self._get_exec().execute(ins, row)

+ def _load_iris_view(self):
+ self.drop_table('iris_view')
+ self._get_exec().execute(SQL_STRINGS['create_view'][self.flavor])
+
 def _check_iris_loaded_frame(self, iris_frame):
 pytype = iris_frame.dtypes[0].type
 row = iris_frame.iloc[0]

@@ -482,6 +500,7 @@ class _TestSQLApi(PandasSQLTest):
 def setUp(self):
 self.conn = self.connect()
 self._load_iris_data()
+ self._load_iris_view()
 self._load_test1_data()
 self._load_test2_data()
 self._load_test3_data()

@@ -492,6 +511,11 @@ def test_read_sql_iris(self):
 "SELECT * FROM iris", self.conn)
 self._check_iris_loaded_frame(iris_frame)

+ def test_read_sql_view(self):
+ iris_frame = sql.read_sql_query(
+ "SELECT * FROM iris_view", self.conn)
+ self._check_iris_loaded_frame(iris_frame)
+
 def test_legacy_read_frame(self):
 with tm.assert_produces_warning(FutureWarning):
 iris_frame = sql.read_frame(

From 3d342785b470d8d00ca23b4d171787179bca146c Mon Sep 17 00:00:00 2001
From: Gianluca Rossi
Date: Wed, 12 Aug 2015 00:33:50 +0100
Subject: [PATCH 12/12] Add whatsnew note and remove redundant tests

---
 doc/source/whatsnew/v0.17.0.txt | 3 +++
 pandas/io/tests/test_sql.py | 8 --------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 70d616ca72c1b..0b65d4651d133 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -137,6 +137,7 @@ Other enhancements

 - ``.as_blocks`` will now take a ``copy`` optional argument to return a copy of the data, default is to copy (no change in behavior from prior versions), (:issue:`9607`)
 - ``regex`` argument to ``DataFrame.filter`` now handles numeric column names instead of raising ``ValueError`` (:issue:`10384`).
+
 - ``pd.read_stata`` will now read Stata 118 type files. (:issue:`9882`)

 - ``pd.merge`` will now allow duplicate column names if they are not merged upon (:issue:`10639`).
@@ -152,6 +153,8 @@ Other enhancements s.drop_duplicates(keep=False) +- ``read_sql_table`` will now allow reading from views (:issue:`10750`). + .. _whatsnew_0170.api: .. _whatsnew_0170.api_breaking: diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 434f8c4b71e85..c78d193124b76 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -164,14 +164,6 @@ }, 'create_view': { 'sqlite': """ - CREATE VIEW iris_view AS - SELECT * FROM iris; - """, - 'mysql': """ - CREATE VIEW iris_view AS - SELECT * FROM iris; - """, - 'postgresql': """ CREATE VIEW iris_view AS SELECT * FROM iris; """
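
Taken together, this series changes ``read_sql_table`` so that SQL views reflect and read like tables. A minimal sketch of the resulting behaviour, assuming an in-memory SQLite database and the SQLAlchemy 1.x-era ``engine.execute`` call (both illustrative assumptions, not part of the patches):

    import pandas as pd
    from sqlalchemy import create_engine

    # Throwaway in-memory database holding a table and a view over it.
    engine = create_engine('sqlite://')
    pd.DataFrame({'SepalLength': [5.1, 4.9],
                  'Name': ['Iris-setosa', 'Iris-setosa']}).to_sql(
        'iris', engine, index=False)
    engine.execute('CREATE VIEW iris_view AS SELECT * FROM iris')

    # Before the patch, meta.reflect(only=['iris_view']) raised
    # InvalidRequestError, surfaced as "ValueError: Table iris_view not found".
    # With views=True passed to meta.reflect, the view is read like a table.
    frame = pd.read_sql_table('iris_view', engine)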