From e0bb6f17950c42e379b59982f27d13fb3e07360f Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Fri, 13 Jun 2014 14:51:45 -0400 Subject: [PATCH 1/2] FIX: Enable fixed width strings to be read from Stata 13 (117) files Fixes a bug which prevented files containing fixed width string data from being read. Stata 13 files also allow variable length strings, which are not supported in the current version, and an explicit exception regarding this type is now given. Added tests which cover these cases, and Stata 13 format files. fixes #7360 --- doc/source/v0.14.1.txt | 2 ++ pandas/io/stata.py | 6 ++++-- pandas/io/tests/data/stata5_117.dta | Bin 0 -> 5366 bytes pandas/io/tests/data/stata6_117.dta | Bin 0 -> 3490 bytes pandas/io/tests/test_stata.py | 7 +++++++ 5 files changed, 13 insertions(+), 2 deletions(-) create mode 100644 pandas/io/tests/data/stata5_117.dta create mode 100644 pandas/io/tests/data/stata6_117.dta diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt index be0b3bc543c39..24e7c81cd5f7e 100644 --- a/doc/source/v0.14.1.txt +++ b/doc/source/v0.14.1.txt @@ -220,6 +220,8 @@ Bug Fixes - Bug where ``nanops._has_infs`` doesn't work with many dtypes (:issue:`7357`) - Bug in ``StataReader.data`` where reading a 0-observation dta failed (:issue:`7369`) +- Bug in when reading Stata 13 (117) files containing fixed width strings (:issue:`7360`) + diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7bb466794c44d..fd83ba6a9365a 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -512,8 +512,10 @@ def _read_header(self): try: i = 0 for typ in typlist: - if typ <= 2045 or typ == 32768: - self.typlist[i] = None + if typ <= 2045: + self.typlist[i] = typ + elif typ == 32768: + raise ValueError("Long strings are not supported") else: self.typlist[i] = self.TYPE_MAP_XML[typ] i += 1 diff --git a/pandas/io/tests/data/stata5_117.dta b/pandas/io/tests/data/stata5_117.dta new file mode 100644 index 
0000000000000000000000000000000000000000..afbd3b0e0afe31bef669b8e9398c623beff57d74 GIT binary patch literal 5366 zcmeHL%WoS+7@s9b^`SyS$^p1^qtL2IDBg7fA=PMUQ{t4SgqE~C%E64iWACQBUUzrg zCLpoyt&tE9i3=}@1Fr*r00#YQ=e2>$%Zx^`0GwDT;(W)%b50K7{G5a5 zoxKQgb>6`NDbv8=x>7LnL(01f8_xaqxOu!;Hw%_MVcOOqi+9qg#d$r7kTe_~O_ujS z4jW3D0a?CB*()F)z88QXZtYRZNKVObhLyYu@;8t-K;pdU6<;8+Ps#x;vE>F$oWHZb zvd!#fc7y-@i3~oe9)M_2CwzjAV5NjC8~WcW{5s?HI{<>HUfVH3HHZk0v3+r)rWpGU z0Kx~!-0cAgi+wSQZ3_UlpQfZpQG$~wW`UnDzM2PMopfNFZ&%`K6o-T*&i<(}tEK^W zBKx8NUyj{Px-c=3+zC4xvwtetT@N^~SA?f`(4-}~lS^t*b<{zZBF}rd=^P)DvBW_? z9F%zvhZbt0nQ5V0Kb<)V)>{k*8=eKxV{yu15#O+R{k`gX-mj1g6tSrlfcHi|gGSgQ#S7hdYIy$Rt=c=?X zqwnR-ZGerS-745a-_<1k4_y!8b$x4d8a)3o9zb3B^}LLi`zRU`E-m%-bg3&|J1{83 zFp!#a>q+I>5HclYglM(q`HSa*upTvf+vz#=6BlK^PNE>x`WCnll59<)$hPuImB$pIGz z-y=W2TO5I{@QA*c!T!wXae)6ANc|>h{4*^*n-A8d)R*#GLs#UzwP*buDUW_IdQkul zFM0Ak^7ij<$oD~GShf4O{E4)6+%FatzP0b1jH~e`^mF;m6^WOXUH>#5-&FDSJPv-Y z@WMWlW+@+3ihcwx9^zc^v*Q5786+;w@;w6Q1 qZcj~G!s;qIx4K+X?T0-whV$~X1oLw;5MM1IyFSK+x5w=K)Oj2C0b$Aj literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata6_117.dta b/pandas/io/tests/data/stata6_117.dta new file mode 100644 index 0000000000000000000000000000000000000000..bf4f3838e8be7d375d83dffe1ab3c06493f791f2 GIT binary patch literal 3490 zcmeHK&1)1f6wg@2UK9lHM*4A*&Td} z(2L;BKcUi-pn_*F9t!pkC@NY#bj;`OvSs^mleU53_g?bLOY-val0l3UPKrJ!7zLF0 zG{q>T5haY`eEu48t%AHdr%5Uaw`Zr2+fvaq?ghx5#%Kng0YHc|7)8XR5f-u*5D#N2 zhKj5BLVg6^s#V}{Ejs}2d~Fc9a#XZP(vWx&E%JJmGW<9DlWk|evtQ!-2^_Jg4ub4ZA!318 zdZh?*c{BcQ2rFE)??H%?N_kHUb5TNsjqb%yYI5#-5Qvy0_q!7qPeVD2uEh|#FH^Zl zy9BW$oOk-Bl|Gcm7zXeARot7T1!|x$DdpUkyA!rfI`V_z znPeq0jV|2Ld=OjyiNO;en&3D@*tFeSCNfbeASo$vnu`1!mKa9vt{Pvw51JQob5Jc1 z0X9lInzWi_nH0|dxUV98V|!Ia{+qrVTF0N2{tSFlH@9PSVESzsz2{?D@xR-4Gnj2} zY)_h+edc3W@4hW4d%drc9$974UQa7+_L_!`KpY3?H8;j})z~ ziYrOAv!7klQSyZ+_DL218tcXv>tO5U6!A*O&KD(rb6Lbiqi-wQ8R2iyr6+3{xZ@Y@ z1tEj_!2|=spt!hU^N_JCkR^}@gu^g}g2)33W3m*+wHQXUQsx1y(lib^r#|%Ra8^Fo zhv0v1)L(HLn_0FjeDPHK;_EVa;1@((`0Zz8A!WR__-LvA*qjy>OGhhTP~@r 
Date: Fri, 13 Jun 2014 18:28:36 -0400 Subject: [PATCH 2/2] FIX: Stata writer no longer ignores encoding when writing strings The encoding was not used when writing strings. Fixes #7286 --- doc/source/v0.14.1.txt | 2 +- pandas/io/stata.py | 10 ++++++++-- pandas/io/tests/test_stata.py | 5 +++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt index 24e7c81cd5f7e..c3e5e4989139f 100644 --- a/doc/source/v0.14.1.txt +++ b/doc/source/v0.14.1.txt @@ -221,7 +221,7 @@ Bug Fixes (:issue:`7357`) - Bug in ``StataReader.data`` where reading a 0-observation dta failed (:issue:`7369`) - Bug in when reading Stata 13 (117) files containing fixed width strings (:issue:`7360`) - +- Bug when writing Stata files where the encoding was ignored (:issue:`7286`) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index fd83ba6a9365a..ed6b540b890a2 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1328,7 +1328,10 @@ def _write_data_nodates(self): var = _pad_bytes('', typ) if len(var) < typ: var = _pad_bytes(var, typ) - self._write(var) + if compat.PY3: + self._write(var) + else: + self._write(var.encode(self._encoding)) else: try: self._file.write(struct.pack(byteorder + TYPE_MAP[typ], @@ -1358,7 +1361,10 @@ def _write_data_dates(self): if typ <= 244: # we've got a string if len(var) < typ: var = _pad_bytes(var, typ) - self._write(var) + if compat.PY3: + self._write(var) + else: + self._write(var.encode(self._encoding)) else: self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var)) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index c1bdf25bea227..b045867b06263 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -283,6 +283,11 @@ def test_encoding(self): self.assertEqual(result, expected) self.assertIsInstance(result, unicode) + with tm.ensure_clean() as path: + encoded.to_stata(path,encoding='latin-1', write_index=False) + reread_encoded = 
read_stata(path, encoding='latin-1') + tm.assert_frame_equal(encoded, reread_encoded) + def test_read_write_dta11(self): original = DataFrame([(1, 2, 3, 4)], columns=['good', compat.u('b\u00E4d'), '8number', 'astringwithmorethan32characters______'])