diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index bbc006b41a433..0603bd4deeafa 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -240,6 +240,7 @@ Bug Fixes - Bug in ``MultiIndex`` where inserting new keys would fail (:issue:`9250`). - Bug in ``groupby`` when key space exceeds ``int64`` bounds (:issue:`9096`). - Bug in ``unstack`` with ``TimedeltaIndex`` or ``DatetimeIndex`` and nulls (:issue:`9491`). +- Bug in ``rank`` where comparing floats with tolerance will cause inconsistent behaviour (:issue:`8365`). - Fixed character encoding bug in ``read_stata`` and ``StataReader`` when loading data from a URL (:issue:`9231`). diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 316a282b71609..5f68c1ee26e87 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -7,7 +7,6 @@ cimport cython import_array() cdef float64_t FP_ERR = 1e-13 -cdef float64_t REL_TOL = 1e-07 cimport util @@ -136,18 +135,6 @@ cdef _take_2d_object(ndarray[object, ndim=2] values, return result -cdef inline bint float64_are_diff(float64_t left, float64_t right): - cdef double abs_diff, allowed - if right == MAXfloat64 or right == -MAXfloat64: - if left == right: - return False - else: - return True - else: - abs_diff = fabs(left - right) - allowed = REL_TOL * fabs(right) - return abs_diff > allowed - def rank_1d_float64(object in_arr, ties_method='average', ascending=True, na_option='keep', pct=False): """ @@ -202,7 +189,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True, ranks[argsorted[i]] = nan continue count += 1.0 - if i == n - 1 or float64_are_diff(sorted_data[i + 1], val): + if i == n - 1 or sorted_data[i + 1] != val: if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = sum_ranks / dups @@ -361,7 +348,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', ranks[i, argsorted[i, j]] = nan continue count += 1.0 - if j == k - 1 or float64_are_diff(values[i, j + 1], val): + if j == k - 1 or values[i, j + 1] != val: if tiebreak == TIEBREAK_AVERAGE: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = sum_ranks / dups @@ -1087,7 +1074,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y, sum_wt = 1. sum_wt2 = 1. old_wt = 1. - + for i from 1 <= i < N: cur_x = input_x[i] cur_y = input_y[i] @@ -1117,7 +1104,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y, elif is_observation: mean_x = cur_x mean_y = cur_y - + if nobs >= minp: if not bias: numerator = sum_wt * sum_wt diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 1a2fc5a8fc13c..3b7b13fd8ed4f 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -4707,7 +4707,7 @@ def test_rank(self): assert_series_equal(iranks, exp) iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20+1e-30, 1e-1]) - exp = Series([2, 1, 3.5, 5, 3.5, 6]) + exp = Series([2, 1, 3, 5, 4, 6.0]) iranks = iseries.rank() assert_series_equal(iranks, exp) diff --git a/pandas/tests/test_stats.py b/pandas/tests/test_stats.py index eaaf89a52c2dc..9acd7c2233b7b 100644 --- a/pandas/tests/test_stats.py +++ b/pandas/tests/test_stats.py @@ -44,6 +44,43 @@ def _check(s, expected, method='average'): series = s if dtype is None else s.astype(dtype) _check(series, results[method], method=method) + def test_rank_methods_series(self): + tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata') + from scipy.stats import rankdata + + xs = np.random.randn(9) + xs = np.concatenate([xs[i:] for i in range(0, 9, 2)]) # add duplicates + np.random.shuffle(xs) + + index = [chr(ord('a') + i) for i in range(len(xs))] + + for vals in [xs, xs + 1e6, xs * 1e-6]: + ts = Series(vals, index=index) + + for m in ['average', 'min', 'max', 'first', 'dense']: + result = ts.rank(m) + sprank = rankdata(vals, m if m != 'first' else 'ordinal') + tm.assert_series_equal(result, Series(sprank, index=index)) + + def test_rank_methods_frame(self): + tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata') + from scipy.stats import rankdata + + xs = np.random.randint(0, 21, (100, 26)) + xs = (xs - 10.0) / 10.0 + cols = [chr(ord('z') - i) for i in range(xs.shape[1])] + + for vals in [xs, xs + 1e6, xs * 1e-6]: + df = DataFrame(vals, columns=cols) + + for ax in [0, 1]: + for m in ['average', 'min', 'max', 'first', 'dense']: + result = df.rank(axis=ax, method=m) + sprank = np.apply_along_axis(rankdata, ax, vals, + m if m != 'first' else 'ordinal') + expected = DataFrame(sprank, columns=cols) + tm.assert_frame_equal(result, expected) + def test_rank_dense_method(self): dtypes = ['O', 'f8', 'i8'] in_out = [([1], [1]),