From 0880d0291c2ad9da69e705c643d457c1359de3ae Mon Sep 17 00:00:00 2001
From: Kevin Sheppard <kevin.k.sheppard@gmail.com>
Date: Tue, 9 Aug 2016 14:21:36 +0100
Subject: [PATCH] ENH: Improve error message for repeated Stata categories

Improve the error message to be more explicit when attempting to read Stata
files containing repeated categories.

closes #13923
---
 doc/source/whatsnew/v0.19.0.txt  |   1 +
 pandas/io/stata.py               |  10 +++++++++-
 pandas/io/tests/data/stata15.dta | Bin 0 -> 3183 bytes
 pandas/io/tests/test_stata.py    |   7 +++++++
 4 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 pandas/io/tests/data/stata15.dta

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index 30a0d918b46ec..e4ecd090f882b 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -397,6 +397,7 @@ Other enhancements
 - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`)
 - ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`)
 - ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc``, rather than raising a ``ValueError`` (:issue:`12259`)
+- ``read_stata()`` and ``StataReader`` now raise a more explicit error message listing the repeated value labels when reading Stata files with non-unique value labels and ``convert_categoricals=True`` (:issue:`13923`)
 - ``DataFrame.style`` will now render sparsified MultiIndexes (:issue:`11655`)
 - ``DataFrame.style`` will now show column level names (e.g. ``DataFrame.columns.names``) (:issue:`13775`)
 - ``DataFrame`` has gained support to re-order the columns based on the values
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 59bc24acac6f8..a67fb6651edd0 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1649,7 +1649,15 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist,
                         categories.append(value_label_dict[label][category])
                     else:
                         categories.append(category)  # Partially labeled
-                cat_data.categories = categories
+                try:
+                    cat_data.categories = categories
+                except ValueError:
+                    # Categories must be unique: list the repeated labels
+                    vc = Series(categories).value_counts()
+                    repeats = '-' * 80 + '\n' + '\n'.join(vc.index[vc > 1])
+                    msg = 'Value labels for column {0} are not unique. The ' \
+                          'repeated labels are:\n{1}'.format(col, repeats)
+                    raise ValueError(msg)
                 # TODO: is the next line needed above in the data(...) method?
                 cat_data = Series(cat_data, index=data.index)
                 cat_converted_data.append((col, cat_data))
diff --git a/pandas/io/tests/data/stata15.dta b/pandas/io/tests/data/stata15.dta
new file mode 100644
index 0000000000000000000000000000000000000000..d13e2fa337db39f73c2fc2a252126a1a73396180
GIT binary patch
literal 3183
zcmc(iO^g&p6vy8JA;cJh9~VWS6GS1-FzgzUk4~#>R0w1f116e1tkpd=GsV<Y4b|Pd
z%<{Dh8e@zEOf(({B;Je>xfv2M#2b3h;8*+tF2X?%qM{<|`mgTl*{^ujNq+shzFxh0
z?^RWgA15qfL)C-@euJ|rH-T@s<SgbvZ|^$4QWX5KlW=Wphi%&*^DC1@e{0Y~er0Rm
z4+Kjo|M``Hz?UrKG9a|&Z+;?L9P_o>!7bi8?~%0b^{we$>-9dcp|97kOt$)!!ju+k
zLqYyK?&KXk`HS!5c_+Vj^1(TI`$Z@F=jG*^lW#luo|DnddAoMfuh?1rVP*siB_B#U
zZ5{{LC;ZBcD%(XdEGghgLy0I>xuXB!nc0@x;>X$~Qp8E-IKR>@`ZaA@EQx~^>sGI+
z6Rqg&%jNn^Ott#bLwV^^{|0YIqub(F3KdwoO!-h2*P(pgM|W%<*wVj!pxgVuez887
zY1CvCGgFI5iKJ8VVy<|dNoVGduPn0p<2|ePP8=Y*_v!Bz+VbtiDprpldlU7iPEX)R
zbfo=!*5BnE(b&1;wtdm@V^7%e{ZHlNd)&CO6M6l^u6^LO3z`0Rm)@G`DaO-`{^Z7e
zboHC8|N6onCyV(Ddy07q`)?}xJ3ar_znK5ynfAMJg<fpt?k;dH^`izeEK0aR(qR__
zeq~lo6^BfqX%`)&I*rm^GSr}P+D#+0hxSoS38mDgLnNq49pbc~T6BOUjgg|ARKv8H
z3O`Pav@<w4CE}=+p$x9gCz_gyUBg{~j?7%3bI>zxp=W0VXwhIEphC~G!l5o<$c;O*
z^kfzwa`ko}>~kl~0=(Y_7K7#BKF|j?f=yr>coGbP5ZE1O&yob}1<!%Q;8pNCcmtdS
z?|?JleeeM|555MM!4Ke9unZAj4>p0v!60B@9J~OIg45tDa1Go53+BLHungP-N?;?{
z0zyy+2J8l7;3e=fI14U=&2zB^upb-+C%{|aL+~Y-55Ft~OThiW2lkX61RSJR!uA@#
zL6!MKK<)%hVE1ebSRZN72D^X(F|dA1!7vyByFmwxf<3_cbRXCcuvcjejDv&V5O@)s
z1D}9T!Drxe@CEo5TmTorC2$2?1wVnG!7tz%u)hBd{0^>zKfn#}C-@8e4JH7=-#uUs
zm<#3sn@IMSyAv!1cY!5fIk+3F0A;WmRKQwr7+isGLnf3?n~+t3ZK;|Zuac?6%2tRf
z$T-I3Mj@|?s7aAlg4I(&k>Of$!nkH58;Cclt%j=5l2I*%9zi2o>KaL=Lr#*1I;9qq
z!ggyh6|uU?UaF{Jwq_@*Ysl1y1f!XvHq<afZJtVk9js|5jk)25VjbZX_E>GPPGdkA
zR?W;wbQ<YY^3*sRU?zIlzB*}e<E<1bk|}1cdx|Bgfmx%x>fzGj0%n<b6&9PS#B@BN
zJbdHe%u*h!84)pxaFZLY72)uzTBp>tw=y(rbd(JXDfud@vk=YsR8#as92%J{^e|b+
zNr<IuqoW8*&t=JT4J9v0m52o^c~XQO2wv~Bli7L|?s2SY(!fd6;g)GlA&1K-OifcL
zR<5yFV>xYJ9_IL9BQdP8;z(n?SVJp>adiY3&r~SR%2+FFnody+1=1s(UaDAiUarnY
zdrCk_2`o5&LUSy*G0c-(BM%ad*iSX9<Ym1}US<<!hD{|eGar7jz2Ot<K3vb4qH?Cl
z4YSp`<wdYiwm~Ujb%+-PVuxoQk#ZhcRW`hxhEhZ&Z<GncF=1*_RTU-AjM(Ht2zI3`
z(U@0yxl?9`V+5bBQ=*Zlftvx-XKCd|A*sV|RTeAHCYQ~!EU4X5*U-GxgE3yjjDZiZ
zQIIAH(${{W^LVyFMc!%SP3D7}2K$mJB&(C~X2<5_tlVYDLuw<@Qu`%T!+C)XII9<S
koNjYsDQ;Z^f4Uw4+fvn3dPLbTeO$rQKE(3Ny7-Ur4^y`;UH||9

literal 0
HcmV?d00001

diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index 2e3182b69acaf..7752fff5247c0 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -80,6 +80,7 @@ def setUp(self):
         self.dta21_117 = os.path.join(self.dirpath, 'stata12_117.dta')
 
         self.dta22_118 = os.path.join(self.dirpath, 'stata14_118.dta')
+        self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
 
     def read_dta(self, file):
         # Legacy default reader configuration
@@ -1212,6 +1213,12 @@ def test_unsupported_datetype(self):
             with tm.ensure_clean() as path:
                 original.to_stata(path)
 
+    def test_repeated_column_labels(self):
+        # GH 13923: the error must name the repeated label ('wolof').
+        # Match on the message; code after the raising call never runs.
+        with tm.assertRaisesRegexp(ValueError, 'wolof'):
+            read_stata(self.dta23, convert_categoricals=True)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)