From 99b85c8ae72f6234c11cf00fc182b2e244942e7a Mon Sep 17 00:00:00 2001 From: PKEuS Date: Tue, 11 Nov 2014 10:57:33 +0100 Subject: [PATCH 1/2] StataReader: Support sorting categoricals --- doc/source/whatsnew/v0.15.2.txt | 1 + pandas/io/stata.py | 21 +++++++++++++++------ pandas/io/tests/data/stata10_117.dta | Bin 0 -> 1268 bytes pandas/io/tests/test_stata.py | 9 +++++++++ 4 files changed, 25 insertions(+), 6 deletions(-) create mode 100644 pandas/io/tests/data/stata10_117.dta diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index b1fa6dbed442d..7041352971407 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -27,6 +27,7 @@ API changes Enhancements ~~~~~~~~~~~~ +- StataReader: Properly support sorting categorical variables read from stata files. .. _whatsnew_0152.performance: diff --git a/pandas/io/stata.py b/pandas/io/stata.py index c2542594861c4..58bf344998834 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1139,12 +1139,21 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, )[0] for i in cols: col = data.columns[i] - labeled_data = np.copy(data[col]) - labeled_data = labeled_data.astype(object) - for k, v in compat.iteritems( - self.value_label_dict[self.lbllist[i]]): - labeled_data[(data[col] == k).values] = v - data[col] = Categorical.from_array(labeled_data) + codes = np.copy(data[col]) + labeldict = self.value_label_dict[self.lbllist[i]] + for j in range(len(codes)): + if np.isnan(codes[j]): + codes[j] = -1 + else: + codes[j] = codes[j]-1 + codes = codes.astype(int) + categories = [] + for j in range(max(labeldict.keys())): + try: + categories.append(labeldict[j+1]) + except: + categories.append(j+1) + data[col] = Categorical.from_codes(codes, categories, ordered=True) if not preserve_dtypes: retyped_data = [] diff --git a/pandas/io/tests/data/stata10_117.dta b/pandas/io/tests/data/stata10_117.dta new file mode 100644 index 0000000000000000000000000000000000000000..79dfffd94483f30bcf6f155711caa392d849f970 GIT binary patch literal 1268 zcmbtUPiqrF6d$94coTZ@A_FC$V4ZA9(a2+55bL25gn-v&lDsy7Nj9CG26Jk`leb>I z_3p)Q(6isdqaQ#Jyu|lrXPdRgia0R){>*RQ|5+%!@g^I3qhW+*h)#n;i>5%GB#%K= z0`wOibH{*Jy`i^YNZa$aWt)wlJ zCr?`)P^(i=RTL95(_}5I50P$ME16EDr;&b)R5r@<_mOt5m-1nxpCUbv^y`iC{9B}; z*elG98=Jnx%r9nG=%IiYt^@?*r7e)Ld=2_*SGz8#>oIQ4Q%%UPbg z2XcqmL~I@k9)~ZV&$Hz*F=y`@ET8_n!GsO&{Lx@-CSuGm7#U{<9vyXXJSa4%-!wyd zZ`9nAlxH-h!Anz`#*=3^qbQD%xbWfAvy_4-2&ixo3hyl2jkH!s(%Qb&J-S`*0Nbsv zTTx*BX5{(VZqU9<`yOqs7l+5sgr5duUsMGbZ_q8|L7orAOaAPdu@kSU1N_=4fM24h b2IX&2uvhlZQE)W8{x6F9jFkneesn(pwkR(F literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 2cb7809166be5..10df5cc91558e 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -13,6 +13,7 @@ import pandas as pd from pandas.compat import iterkeys +from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame, Series from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, @@ -81,6 +82,8 @@ def setUp(self): self.dta18_115 = os.path.join(self.dirpath, 'stata9_115.dta') self.dta18_117 = os.path.join(self.dirpath, 'stata9_117.dta') + self.dta19_117 = os.path.join(self.dirpath, 'stata10_117.dta') + def read_dta(self, file): # Legacy default reader configuration @@ -744,6 +747,12 @@ def test_drop_column(self): columns = ['byte_', 'int_', 'long_', 'not_found'] read_stata(self.dta15_117, convert_dates=True, columns=columns) + def test_categorical_sorting(self): + dataset = read_stata(self.dta19_117) + dataset = dataset.sort("srh") + expected = Categorical.from_codes(codes=[-1, -1, 0, 1, 1, 1, 2, 2, 3, 4], categories=["Poor", "Fair", "Good", "Very good", "Excellent"]) + tm.assert_equal(True, (np.asarray(expected)==np.asarray(dataset["srh"])).all()) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From c410441149f48f7e11dbbe197d8a824da1ac9f5e Mon Sep 17 00:00:00 2001 From: PKEuS Date: Fri, 14 Nov 2014 14:40:48 +0100 Subject: [PATCH 2/2] Vectorized loop --- pandas/io/stata.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 58bf344998834..36ac705e4e3ed 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1139,15 +1139,11 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, )[0] for i in cols: col = data.columns[i] - codes = np.copy(data[col]) - labeldict = self.value_label_dict[self.lbllist[i]] - for j in range(len(codes)): - if np.isnan(codes[j]): - codes[j] = -1 - else: - codes[j] = codes[j]-1 + codes = np.nan_to_num(data[col]) codes = codes.astype(int) + codes = codes-1 categories = [] + labeldict = self.value_label_dict[self.lbllist[i]] for j in range(max(labeldict.keys())): try: categories.append(labeldict[j+1])