From 0880d0291c2ad9da69e705c643d457c1359de3ae Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Tue, 9 Aug 2016 14:21:36 +0100 Subject: [PATCH] ENH: Improve error message for repeated Stata categories Improve the error message to be more explicit when attempting to read Stata files containing repeated categories. closes #13923 --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/io/stata.py | 10 +++++++++- pandas/io/tests/data/stata15.dta | Bin 0 -> 3183 bytes pandas/io/tests/test_stata.py | 7 +++++++ 4 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 pandas/io/tests/data/stata15.dta diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 30a0d918b46ec..e4ecd090f882b 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -397,6 +397,7 @@ Other enhancements - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) - ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`) - ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc``, rather than raising a ``ValueError`` (:issue:`12259`) +- ``read_stata()`` and ``StataReader`` raise with a more explicit error message when reading Stata files with repeated value labels when ``convert_categoricals=True`` (:issue:`13923`) - ``DataFrame.style`` will now render sparsified MultiIndexes (:issue:`11655`) - ``DataFrame.style`` will now show column level names (e.g. ``DataFrame.columns.names``) (:issue:`13775`) - ``DataFrame`` has gained support to re-order the columns based on the values diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 59bc24acac6f8..a67fb6651edd0 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1649,7 +1649,15 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist, categories.append(value_label_dict[label][category]) else: categories.append(category) # Partially labeled - cat_data.categories = categories + try: + cat_data.categories = categories + except ValueError: + vc = Series(categories).value_counts() + repeats = list(vc.index[vc > 1]) + repeats = '\n' + '-' * 80 + '\n'.join(repeats) + msg = 'Value labels for column {0} are not unique. The ' \ + 'repeated labels are:\n{1}'.format(col, repeats) + raise ValueError(msg) # TODO: is the next line needed above in the data(...) method? cat_data = Series(cat_data, index=data.index) cat_converted_data.append((col, cat_data)) diff --git a/pandas/io/tests/data/stata15.dta b/pandas/io/tests/data/stata15.dta new file mode 100644 index 0000000000000000000000000000000000000000..d13e2fa337db39f73c2fc2a252126a1a73396180 GIT binary patch literal 3183 zcmc(iO^g&p6vy8JA;cJh9~VWS6GS1-FzgzUk4~#>R0w1f116e1tkpd=GsVxfv2M#2b3h;8*+tF2X?%qM{<|`mgTl*{^ujNq+shzFxh0 z?^RWgA15qfL)C-@euJ|rH-T@s!7bi8?~%0b^{we$>-9dcp|97kOt$)!!ju+k zLqYyK?&KXk`HS!5c_+Vj^1(TI`$Z@F=jG*^lW#luo|DnddAoMfuh?1rVP*siB_B#U zZ5{{LC;ZBcD%(XdEGghgLy0I>xuXB!nc0@x;>X$~Qp8E-IKR>@`ZaA@EQx~^>sGI+ z6Rqg&%jNn^Ott#bLwV^^{|0YIqub(F3KdwoO!-h2*P(pgM|W%<*wVj!pxgVuez887 zY1CvCGgFI5iKJ8VVy<|dNoVGduPn0p<2|ePP8=Y*_v!Bz+VbtiDprpldlU7iPEX)R zbfo=!*5BnE(b&1;wtdm@V^7%e{ZHlNd)&CO6M6l^u6^LO3z`0Rm)@G`DaO-`{^Z7e zboHC8|N6onCyV(Ddy07q`)?}xJ3ar_znK5ynfAMJgzxp=W0VXwhIEphC~G!l5o<$c;O* z^kfzwa`ko}>~kl~0=(Y_7K7#BKF|j?f=yr>coGbP5ZE1O&yob}1p0v!60B@9J~OIg45tDa1Go53+BLHungP-N?;?{ z0zyy+2J8l7;3e=fI14U=&2zB^upb-+C%{|aL+~Y-55Ft~OThiW2lkX61RSJR!uA@# zL6!MKK<)%hVE1ebSRZN72D^X(F|dA1!7vyByFmwxf<3_cbRXCcuvcjejDv&V5O@)s z1D}9T!Drxe@CEo5TmTorC2$2?1wVnG!7tz%u)hBd{0^>zKfn#}C-@8e4JH7=-#uUs zm<#3sn@IMSyAv!1cY!5fIk+3F0A;WmRKQwr7+isGLnf3?n~+t3ZK;|Zuac?6%2tRf z$T-I3Mj@|?s7aAlg4I(&k>Of$!nkH58;Cclt%j=5l2I*%9zi2o>KaL=Lr#*1I;9qq z!ggyh6|uU?UaF{Jwq_@*Ysl1y1f!XvHqGPPGdkA zR?W;wbQfzGj0%ntw=y(rbd(JXDfud@vk=YsR8#as92%J{^e|b+ zNrxYJ9_IL9BQdP8;z(n?SVJp>adiY3&r~SR%2+FFnody+1=1s(UaDAiUarnY zdrCk_2`o5&LUSy*G0c-(BM%ad*iSX9