Skip to content

Commit 739c25d

Browse files
committed
Merge pull request #9597 from jreback/cat_merge
BUG: Regression in merging Categorical and object dtypes (GH9426)
2 parents 1fab6fc + e7c562e commit 739c25d

File tree

6 files changed: 69 additions and 33 deletions.

doc/source/whatsnew/v0.16.0.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,7 @@ Bug Fixes
521521

522522

523523
- ``SparseSeries`` and ``SparsePanel`` now accept zero-argument constructors (same as their non-sparse counterparts) (:issue:`9272`).
524-
524+
- Regression in merging Categoricals and object dtypes (:issue:`9426`)
525525
- Bug in ``read_csv`` causing buffer overflows with certain malformed input files (:issue:`9205`)
526526
- Bug in groupby MultiIndex with missing pair (:issue:`9049`, :issue:`9344`)
527527
- Fixed bug in ``Series.groupby`` where grouping on ``MultiIndex`` levels would ignore the sort argument (:issue:`9444`)

pandas/core/common.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1146,7 +1146,9 @@ def _maybe_promote(dtype, fill_value=np.nan):
11461146
dtype = np.object_
11471147

11481148
# in case we have a string that looked like a number
1149-
if issubclass(np.dtype(dtype).type, compat.string_types):
1149+
if is_categorical_dtype(dtype):
1150+
dtype = dtype
1151+
elif issubclass(np.dtype(dtype).type, compat.string_types):
11501152
dtype = np.object_
11511153

11521154
return dtype, fill_value

pandas/core/internals.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4327,8 +4327,9 @@ def dtype(self):
43274327
if not self.needs_filling:
43284328
return self.block.dtype
43294329
else:
4330-
return np.dtype(com._maybe_promote(self.block.dtype,
4331-
self.block.fill_value)[0])
4330+
return com._get_dtype(com._maybe_promote(self.block.dtype,
4331+
self.block.fill_value)[0])
4332+
43324333
return self._dtype
43334334

43344335
@cache_readonly

pandas/io/tests/test_data.py

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ def test_get_multi2(self):
112112

113113
# sanity checking
114114

115-
assert np.issubdtype(result.dtype, np.floating)
115+
self.assertTrue(np.issubdtype(result.dtype, np.floating))
116116
result = pan.Open.ix['Jan-15-12':'Jan-20-12']
117117
self.assertEqual((4, 3), result.shape)
118118
assert_n_failed_equals_n_null_columns(w, result)
@@ -121,11 +121,11 @@ def test_get_multi2(self):
121121
def test_dtypes(self):
122122
#GH3995, #GH8980
123123
data = web.get_data_google('F', start='JAN-01-10', end='JAN-27-13')
124-
assert np.issubdtype(data.Open.dtype, np.number)
125-
assert np.issubdtype(data.Close.dtype, np.number)
126-
assert np.issubdtype(data.Low.dtype, np.number)
127-
assert np.issubdtype(data.High.dtype, np.number)
128-
assert np.issubdtype(data.Volume.dtype, np.number)
124+
self.assertTrue(np.issubdtype(data.Open.dtype, np.number))
125+
self.assertTrue(np.issubdtype(data.Close.dtype, np.number))
126+
self.assertTrue(np.issubdtype(data.Low.dtype, np.number))
127+
self.assertTrue(np.issubdtype(data.High.dtype, np.number))
128+
self.assertTrue(np.issubdtype(data.Volume.dtype, np.number))
129129

130130
@network
131131
def test_unicode_date(self):
@@ -183,15 +183,15 @@ def test_get_components_dow_jones(self):
183183
raise nose.SkipTest('unreliable test, receive partial components back for dow_jones')
184184

185185
df = web.get_components_yahoo('^DJI') #Dow Jones
186-
assert isinstance(df, pd.DataFrame)
186+
self.assertIsInstance(df, pd.DataFrame)
187187
self.assertEqual(len(df), 30)
188188

189189
@network
190190
def test_get_components_dax(self):
191191
raise nose.SkipTest('unreliable test, receive partial components back for dax')
192192

193193
df = web.get_components_yahoo('^GDAXI') #DAX
194-
assert isinstance(df, pd.DataFrame)
194+
self.assertIsInstance(df, pd.DataFrame)
195195
self.assertEqual(len(df), 30)
196196
self.assertEqual(df[df.name.str.contains('adidas', case=False)].index,
197197
'ADS.DE')
@@ -202,13 +202,13 @@ def test_get_components_nasdaq_100(self):
202202
raise nose.SkipTest('unreliable test, receive partial components back for nasdaq_100')
203203

204204
df = web.get_components_yahoo('^NDX') #NASDAQ-100
205-
assert isinstance(df, pd.DataFrame)
205+
self.assertIsInstance(df, pd.DataFrame)
206206

207207
if len(df) > 1:
208208
# Usual culprits, should be around for a while
209-
assert 'AAPL' in df.index
210-
assert 'GOOG' in df.index
211-
assert 'AMZN' in df.index
209+
self.assertTrue('AAPL' in df.index)
210+
self.assertTrue('GOOG' in df.index)
211+
self.assertTrue('AMZN' in df.index)
212212
else:
213213
expected = DataFrame({'exchange': 'N/A', 'name': '@^NDX'},
214214
index=['@^NDX'])
@@ -256,7 +256,7 @@ def test_get_data_multiple_symbols_two_dates(self):
256256
self.assertEqual(len(result), 3)
257257

258258
# sanity checking
259-
assert np.issubdtype(result.dtype, np.floating)
259+
self.assertTrue(np.issubdtype(result.dtype, np.floating))
260260

261261
expected = np.array([[18.99, 28.4, 25.18],
262262
[18.58, 28.31, 25.13],
@@ -276,7 +276,7 @@ def test_get_date_ret_index(self):
276276
self.assertEqual(result, 1.0)
277277

278278
# sanity checking
279-
assert np.issubdtype(pan.values.dtype, np.floating)
279+
self.assertTrue(np.issubdtype(pan.values.dtype, np.floating))
280280

281281

282282
class TestYahooOptions(tm.TestCase):
@@ -383,26 +383,26 @@ def test_get_underlying_price(self):
383383
quote_price = options_object._underlying_price_from_root(root)
384384
except RemoteDataError as e:
385385
raise nose.SkipTest(e)
386-
self.assert_(isinstance(quote_price, float))
386+
self.assertIsInstance(quote_price, float)
387387

388388
def test_sample_page_price_quote_time1(self):
389389
#Tests the weekend quote time format
390390
price, quote_time = self.aapl._underlying_price_and_time_from_url(self.html1)
391-
self.assert_(isinstance(price, (int, float, complex)))
392-
self.assert_(isinstance(quote_time, (datetime, Timestamp)))
391+
self.assertIsInstance(price, (int, float, complex))
392+
self.assertIsInstance(quote_time, (datetime, Timestamp))
393393

394394
def test_chop(self):
395395
#regression test for #7625
396396
self.aapl.chop_data(self.data1, above_below=2, underlying_price=np.nan)
397397
chopped = self.aapl.chop_data(self.data1, above_below=2, underlying_price=100)
398-
self.assert_(isinstance(chopped, DataFrame))
398+
self.assertIsInstance(chopped, DataFrame)
399399
self.assertTrue(len(chopped) > 1)
400400

401401
def test_chop_out_of_strike_range(self):
402402
#regression test for #7625
403403
self.aapl.chop_data(self.data1, above_below=2, underlying_price=np.nan)
404404
chopped = self.aapl.chop_data(self.data1, above_below=2, underlying_price=100000)
405-
self.assert_(isinstance(chopped, DataFrame))
405+
self.assertIsInstance(chopped, DataFrame)
406406
self.assertTrue(len(chopped) > 1)
407407

408408

@@ -411,8 +411,8 @@ def test_sample_page_price_quote_time2(self):
411411
#Tests the EDT page format
412412
#regression test for #8741
413413
price, quote_time = self.aapl._underlying_price_and_time_from_url(self.html2)
414-
self.assert_(isinstance(price, (int, float, complex)))
415-
self.assert_(isinstance(quote_time, (datetime, Timestamp)))
414+
self.assertIsInstance(price, (int, float, complex))
415+
self.assertIsInstance(quote_time, (datetime, Timestamp))
416416

417417
@network
418418
def test_sample_page_chg_float(self):
@@ -452,26 +452,26 @@ def test_is_s3_url(self):
452452
@network
453453
def test_read_yahoo(self):
454454
gs = DataReader("GS", "yahoo")
455-
assert isinstance(gs, DataFrame)
455+
self.assertIsInstance(gs, DataFrame)
456456

457457
@network
458458
def test_read_google(self):
459459
gs = DataReader("GS", "google")
460-
assert isinstance(gs, DataFrame)
460+
self.assertIsInstance(gs, DataFrame)
461461

462462
@network
463463
def test_read_fred(self):
464464
vix = DataReader("VIXCLS", "fred")
465-
assert isinstance(vix, DataFrame)
465+
self.assertIsInstance(vix, DataFrame)
466466

467467
@network
468468
def test_read_famafrench(self):
469469
for name in ("F-F_Research_Data_Factors",
470470
"F-F_Research_Data_Factors_weekly", "6_Portfolios_2x3",
471471
"F-F_ST_Reversal_Factor", "F-F_Momentum_Factor"):
472472
ff = DataReader(name, "famafrench")
473-
assert ff
474-
assert isinstance(ff, dict)
473+
self.assertTrue(ff is not None)
474+
self.assertIsInstance(ff, dict)
475475

476476

477477
class TestFred(tm.TestCase):
@@ -498,7 +498,7 @@ def test_fred_nan(self):
498498
start = datetime(2010, 1, 1)
499499
end = datetime(2013, 1, 27)
500500
df = web.DataReader("DFII5", "fred", start, end)
501-
assert pd.isnull(df.ix['2010-01-01'][0])
501+
self.assertTrue(pd.isnull(df.ix['2010-01-01'][0]))
502502

503503
@network
504504
def test_fred_parts(self):
@@ -510,7 +510,7 @@ def test_fred_parts(self):
510510
self.assertEqual(df.ix['2010-05-01'][0], 217.23)
511511

512512
t = df.CPIAUCSL.values
513-
assert np.issubdtype(t.dtype, np.floating)
513+
self.assertTrue(np.issubdtype(t.dtype, np.floating))
514514
self.assertEqual(t.shape, (37,))
515515

516516
@network

pandas/tests/test_categorical.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2423,6 +2423,39 @@ def f():
24232423
df.append(df_wrong_categories)
24242424
self.assertRaises(ValueError, f)
24252425

2426+
2427+
def test_merge(self):
2428+
# GH 9426
2429+
2430+
right = DataFrame({'c': {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e'},
2431+
'd': {0: 'null', 1: 'null', 2: 'null', 3: 'null', 4: 'null'}})
2432+
left = DataFrame({'a': {0: 'f', 1: 'f', 2: 'f', 3: 'f', 4: 'f'},
2433+
'b': {0: 'g', 1: 'g', 2: 'g', 3: 'g', 4: 'g'}})
2434+
df = pd.merge(left, right, how='left', left_on='b', right_on='c')
2435+
2436+
# object-object
2437+
expected = df.copy()
2438+
2439+
# object-cat
2440+
cright = right.copy()
2441+
cright['d'] = cright['d'].astype('category')
2442+
result = pd.merge(left, cright, how='left', left_on='b', right_on='c')
2443+
tm.assert_frame_equal(result, expected)
2444+
2445+
# cat-object
2446+
cleft = left.copy()
2447+
cleft['b'] = cleft['b'].astype('category')
2448+
result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
2449+
tm.assert_frame_equal(result, expected)
2450+
2451+
# cat-cat
2452+
cright = right.copy()
2453+
cright['d'] = cright['d'].astype('category')
2454+
cleft = left.copy()
2455+
cleft['b'] = cleft['b'].astype('category')
2456+
result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
2457+
tm.assert_frame_equal(result, expected)
2458+
24262459
def test_na_actions(self):
24272460

24282461
cat = pd.Categorical([1,2,3,np.nan], categories=[1,2,3])

pandas/tseries/tests/test_tslib.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ def test_repr(self):
167167

168168
# dateutil zone change (only matters for repr)
169169
import dateutil
170-
if dateutil.__version__ >= LooseVersion('2.3'):
170+
if dateutil.__version__ >= LooseVersion('2.3') and dateutil.__version__ <= LooseVersion('2.4'):
171171
timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']
172172
else:
173173
timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/America/Los_Angeles']

0 commit comments

Comments
 (0)