From ed775bc5dd6771c492bf0b3c918a768f60835cf9 Mon Sep 17 00:00:00 2001 From: Ka Wo Chen Date: Tue, 7 Jul 2015 23:59:15 -0400 Subject: [PATCH] BUG: GH9618 in read_msgpack where DataFrame has duplicate column names --- doc/source/whatsnew/v0.17.0.txt | 4 +- pandas/io/packers.py | 9 +- .../0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack | Bin 0 -> 4684 bytes .../0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack | Bin 0 -> 4684 bytes .../0.16.2/0.16.2_x86_64_linux_2.7.10.pickle | Bin 0 -> 14893 bytes .../0.16.2/0.16.2_x86_64_linux_3.4.3.pickle | Bin 0 -> 14116 bytes pandas/io/tests/generate_legacy_pickles.py | 167 -------------- .../io/tests/generate_legacy_storage_files.py | 205 ++++++++++++++++++ pandas/io/tests/test_packers.py | 75 ++++++- pandas/io/tests/test_pickle.py | 8 +- setup.py | 1 + 11 files changed, 295 insertions(+), 174 deletions(-) create mode 100644 pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack create mode 100644 pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack create mode 100644 pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle delete mode 100644 pandas/io/tests/generate_legacy_pickles.py create mode 100644 pandas/io/tests/generate_legacy_storage_files.py diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 5bd0f46dd0b18..ddd97c8d1b199 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -384,4 +384,6 @@ Bug Fixes - Bug in operator equal on Index not being consistent with Series (:issue:`9947`) -- Reading "famafrench" data via ``DataReader`` results in HTTP 404 error because of the website url is changed (:issue:`10591`). \ No newline at end of file +- Reading "famafrench" data via ``DataReader`` results in HTTP 404 error because of the website url is changed (:issue:`10591`). + +- Bug in `read_msgpack` where DataFrame to decode has duplicate column names (:issue:`9618`) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index f5e000449f232..847a7c4f90216 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -357,6 +357,7 @@ def encode(obj): 'klass': obj.__class__.__name__, 'axes': data.axes, 'blocks': [{'items': data.items.take(b.mgr_locs), + 'locs': b.mgr_locs.as_array, 'values': convert(b.values), 'shape': b.values.shape, 'dtype': b.dtype.num, @@ -485,9 +486,15 @@ def decode(obj): def create_block(b): values = unconvert(b['values'], dtype_for(b['dtype']), b['compress']).reshape(b['shape']) + + # locs handles duplicate column names, and should be used instead of items; see GH 9618 + if 'locs' in b: + placement = b['locs'] + else: + placement = axes[0].get_indexer(b['items']) return make_block(values=values, klass=getattr(internals, b['klass']), - placement=axes[0].get_indexer(b['items'])) + placement=placement) blocks = [create_block(b) for b in obj['blocks']] return globals()[obj['klass']](BlockManager(blocks, axes)) diff --git a/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack b/pandas/io/tests/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack new file mode 100644 index 0000000000000000000000000000000000000000..6bf1b9b9afaaafbd8feabf50954cabe53309b6e5 GIT binary patch literal 4684 zcmds4OHUI~6dp=>DJGQAKj20eY9gWw7g`=6A%=+Yu{8s|m61-T&J2oZLWl^u5M`9a z7#Eaw8WnZp!lf7!7h-g!7#A8h#^^%h3U}T!=iHV~J9IE;h+W+8&g0(s?shFqE5W80i;bCttg`vEk}$_{icD(ql=EMfoL>ck6d53MeD8nd76sicT32O*z3F`81fdULk#k6!d2PB4l#6M z$Z_5DLhi$35Poo&yAW80DXU5Qv>YE~uR~+EUcQ=?q^uT;8O)qd$24srLJJ`}*6SH- zC_Ia-n>)0ah{Hq&lcf$5%9|3+mTFD4rP@;+sa7`J&#<6Y2cePqZS4wS)~3 zI~LdEb!+xj9%pG}R#k-6DLCEswkj&Eil7y!v;usrqAchhjGIc%@^dO&Vh-A%T!xj_ z5-e?=(f+bKW=(_muYWaPohd{H@zR37a{A;~;JM33GderbJYL%4H?4H9tb_k>!kq|{ zVR2^|+-iXm;T@PbcTNos;T@RpJAOmOjw!lfa^>vP5>Y#*^Y|G9Z5zw>#cGYBF0_orzRmo)))bPYz0H#!0WUw>yd%u<-I2%V0k?cU zun-R6*13PA%cLvpk6im|$xUZ?HQxzCGYq?6Xn|oj?lgC>_ay6EE}|hHB6>sg?4{|f zkMfq36D&UZCsoew;@wHBGy06u>w&63t_7J9VTdTooj@r<&k4m}5rO$KhwpinMX`@q uCs3>x*hJqE`*Q|eJ>7SvwG?V&`^ z@syhEVSHNYq_q_JvTC0t8F_#CXChVFCepj@(%S{Gax{zSe7-Sq_%Suj>`(+(z0>AK zx*byDB$9^uzvTpT@ofj?WG7-H;g5d&JlFRtz56gH%gb{z-VQNa$gGU%+5)vn*3NgS zX5L6KJh4t2aMCBQ^yTFh$2w**HmeiI_x2<$twOI%=BZ^BeUqj*1@cZeHs0EttqA`+ggU-173cz zdPB2|Zx22^2e{_ro~h;-@@g*Uba zp+YR@GO@4gIP3OHX(qRup1}IAxs848A|&ZNI16@(0wQdN_yvFZX}aOnDn)7TGY$d9 zmxT*}4{qJ~4j9_7zV)?>`yV~N3OL$%_#NQa51-e)adA8EKMQyz-1O|tDs&^rzzcE( zvAhptAOWbCV^-f}Az`wUxwc}OnVJ#5C>e&yBt!@s2&06Jgi8pQ5-uZLPPl>)0}urR zbr)ZS6iR79B71QY6P<#N*aB`2{F{^sp97+oDCIgCp~h(-j1o2yE+Hg#`0ko)nUhvi zEIIId4>LlZ$?0$nRI4~HI)6D|Tj%$sqZCX6>2V|~JLwBT>4_w+r0dcnNeueDT&Kym z9%i{_Pc`qXp*W3%A|Wg&rxR+$^&_Un_|TZey)L7Oc)I?l>R3q6;ujqZPw{{VOi2kq zBq&!J?5!aAt||@ear@w^Tjt zh=J~9Z1Cpu7sHAIVw`t>u%@VP6)e}-%ndB)q*-$|TqX8pLpoM()AEE+Dy*nXspDQQ z<^+0%f|*5BF0?jQ&ViaF~TSji%$1rc0VRs`ETj^n&Zl>JhP%_#Q_=Iz@BgpR)7{Jh%IAIG@cZqndiCDxs+a0jb#r8?=f)CNMQca1 zJ=;a6EMb*5r*pPj-qe-uXijHqa#>rn+SwXO5>Iv5vr-d4kZWzX>&O*%D~gjws46|T zSHddI&2_69tnwCNyKZ%qv9qNmYv;1@3F$d;H@?+TZg020FJ9vHFtwlH;bj{j340eAZ40yDgn-ov8@i{@qsjj9JZD zcYtTbI_hKjM!Dj&yGBz-zP)QsO;e|^YufW|xz@B0m1Uu^p(;4azuhZCM$KJb}02jsBEj&!^2uHBGdIMrRpDr@b?x$7GB%M}Og ztL?g8k@32Y+~|>de0|T|AmOax40H-Q*ih-f!LnP;vNuZ9mo2+&z0eZ*iz!f2s$rRi z2dk{5 zv(p_rsaDIebB&nPj%u|#MOVA?s;qXGwCC<>RSvBgHnL`zTkE;wkVMn7?W{ZA8EREF zQtw%Jx1>A4b9YZzRqfM73b?F0u_={m%(o)bbfi*otgZlI^$BN)Lrd-jOHNj6YlyCG?^Rjb6ew++ zx}T=zw{LWQ)^Y^uI4Ozd1K-p3gYNftpz{L~&Uk0C11%ruxd$bjiB3V02Rn7n)PMr5 zvgUl3dx+Fx(sK_rT1-KUhZWLAAT8=7frm9lr{AQfKN@b^oUmf!tk`%fwwo23fHu5N zCnOt+YzZlHx@JU3w3v`^dQ6pm5GtL1y{p!3t%6k=_N$e;ZiTKf*-Uh@GXj%s>v^*6 z;O(GRJE_&fsMRjdb;DW}(CYM{Rx4WCI@38fEA^W5+`Q52Oz3r%vi|JojOMJ0_2)vV zhli9pPm6N`X-jFSCygzgFE!?=`5h6R-~3gX-;vPcQPktn)Z;Ab@fgoNHmt`5(Br(I z9!uL>-Gx$#$9eAYMv2`};t55ab24ic?V%@n?nw!&vMt?cw`JXvlkO=AXP`--(xDsE zSvx*abk4Oq8YYZWHO#4g4Re}aF8A+Yx@q$C zL9Qjfz#06zl-fGsKLn3?IOH*p=*O~>R5=OZqZ`JTd-Bc|7ruDhqe?lC1(fr+=RUEF za-LkCa-RBcDCcRo#WQq^XXzIA&@Gb9>3+_op3`+M{o(BYe*WQhmb(>q0LQ@Sm_TcGW}1;=~abKgO(O*M5k zBelI7vh{oVxk!R#r6ov_R#i&o6BPCDcU!$vMlv`*z}%sa52JJcqn5$S;v;0u#pM{yW& z|I{_^cQEt!YK1>USGbfbTwNyk**%&0w}w_+3+%1oY8v*@u&;)`8~q4V{8ZR=_4?Iy z!8Eyiz3u+X-*$iI{Hqr@*i81&;xy=~Z0T342nK36Si^*dTi50$&0nf$j3rWPDsg!d z)CI%8>sJps?teHF>=G;hM(P6KmmaH9dH$wCu=v0=fM8zJf_~|?s(4&yhRflaM8q(* za3TCvl%gdKD#}P7&eEb>%lLm{R#AcZq&-o|N!NO!mz?r==X*qms6xjR)e@yn9@vVo z);NjloS#*wSn-zDwziZ!)QR30@-5`7ZS`tY5h7N@fK{GCEQmgA>Wcx<7oB{gbBBYV z5v!x)36??$=UZ!lju>8p>97&~F{UC%3?NxjtU(O7PccyPaR)W7VXj8Z4cz#Awzb2~ zW_c4Q27w&UTob?(Ye`<*!&NF;F&GH!yts4Yt(#bz-5z#cu@0G4sKUX_#h6%EqF3r} z4B`Fa22YRsu763%1~(;7BWxe{%?E_{y|Qq~_kg{d*PTbWbgj3i{{T4Y>h?Pb7mY2g zSPJ;bvx}w?j!bT|gmB~jpP&3AVAm4wJ;Db^4A|x;z`l=PF^4d@-;PiH)Zc`+SPzp; zC&l_)=Rn7gVX=W;+wT^}P9#>vhWd#JS4J6wy+|x>6s8Kr;>H+LvA79Z$~iVAy)3UX z+VGnJrbH-BQ3k~W%hl45%)js=Jq zBb%Z7T(JXjuwVHC2W5yD%Wm+F!>>Zbju=u7w3C#>>|z{fXJ91HWzoQp+aD;w5 zw88GKpAeIwU(VVlvIS2JQ!wo1_B#o9gnVeed;IS6toS{lL>tRm$)_AauH zDVnxYu45>r6@DvT$8!8+pYSXwH8RhsVG3mx?BmnaJmKH_l0u$Qge9Bd6tQ1H08_;N z=psX;@fi6kMjQZy;y$nl_d#LqgC+O)0{0<4cd`igp<(WnBB`HO&Ie;Ps~J%S6E_} zYy@tx#B5@av#S<3IRBz9iN#7w%gUtV`S*lkPJ{~AS^S6FLfmMHxg51BT3Q9(ydI7Y z>tMWxZ5H#;;Z2wDNTNLJ2-yt#<$U4_Ti212iEjK&SaF>GQ@8|e z>TDAy0As3fc|j}5@e17E{!au2w*Qj=GJ5kqnPQUS6aqElQ)M$eN01A{EF2*AO$i1-5-#!a3#r+4rm0XgR1~N zakaz-v%xhSugg#^s&H(y`q2LPydovvn22jJl+jkcj%{t_>(PdlZy-?9zEL*AI=YFt zf?wXut}4+A#n~-DDofv5M8a)ag1+93TH4%A`WC+(^sx9HQdIERf2S6No8pCZn!(Ev zMOV2C^q#m|$_cy5Jz5f`@#EvKfBDBg7RPp-RoKQJ@K4G)4$Aa@;nLZGSoeGix>QHc)i>$UR2}Mwc0K40@--mOF$ZLdzqX` z@d|-j#jCOz*1>DU6};_rcB9_*29VAa=<`iBf)4VQ;jXWc4&s+=z=wW4^^Gr@@o1m# z^2FO1RV#Q0t!S9POI%?E@39+Q!TUfW)IR{AEj}a-Y(d@77Jmfv#77dVLS1d_A9K7e z!&T}~cd|y+e*)5|`p@J{icbjCL_d|yu&Vz;Tw$W0u^XM}=Ri7hq3SQ#2&(!^!%bDM ze098ZzOuuLs!sLVRQ#2Xl>zfNJ}+{G_jfY~wE=y_Zgd4-0|{0C1AwajhBQz$bwgEu z3+Rb|N^DTo-*LPy!&MqpFEpzD9!R6=AIO;$O9|9Of0WIzs(&J`pz43I8=dISKtcja zpMOgUD1H7T8^O}&7h)g?U(O(J6?;D_coP~VDJ>~AMNf=FvK*21gsgH(iBmJjn}lUl z%ukX6Gv}8{2?tp(T-oqLB_)HODn-r2&z2|y$(EUuUoYW(qE(S|gpF^7$Q!A0_(7AY za`;gb9FX5M@f7TT3nuy!1F_JC^uR%tkkbbOp)@zB2=|&{?zJR$U`H4X1akV?0CetkNDk+w zH`2M+W#1F)Np!GgSf31a8EOzYeFJ~oqHv-al|HkA@icJXADmInP zunsmOu3-Po*;U?;zlF_n3*aFH<>Hn_ge0_(-}ud#*b4kG*VcSU(E8hGxn?vPK+pJc>Ei^2=M^X4~YvtCmEvF})X$a>P$#*qouF9FVW4DMRhXRk3 zHw=JYF`Q)Z3R;O?F#^yNBPCWjg*q0F;&@$#3Na~fD`WK0KpLZOPtK$mL!jomgKUP4 zK9)EbUEl2O$gY_yzg{ZXeJ2blyYE~?&MsQcZ?*fb;D_C7rRbpH$7#X8GJSAk3OUgD zpqygqLm9a6F5dJba&`;K!KVXALtifb@w9<>LojyuDIXGcKe|cHAi%fwy>lMnmY>!? zJBV)%g6oOxdS#%qR=nqiD)7@dKSn4KM* z(8tJ7+iXy1JI|5Lu$|`;SBTof(G5lIJRp>pctyC62y@Su+<~Y)(&s*^2=~!p?qei( zAZm~Gxfc}SUZ}bCP1$km%1>egj(I$gh}>=fM(zouLFCf+8M!C2?}?KnIvBYplb|j` z2SenY;*XoiJyngft=u3>*A>;$U!HhF`$03G9I8 zE(BhM^hHI4T&#us*6nz)Fa44t(wAuIE4mZB6m;fOmDEV2 zjJ(QdNoDw}*)8G-*8q<)@>&3f&~+q32+_+JLe~R&;s%KgmXSAdye>mi=rS@RHze)1 zHvw<__GU6C#VrJCy0^+^*l%wm4t}e{{dRV(in^TLK2F?$PKJL9Thg8AIa4ycB!Y|F zMe2}`+)WJgp6?_6Em4X586PpQMC;bTD&05Gd@5zGHkPRn?+HH<`;7lYv#%Hk%Fxf? z4eV)LmTC9$IZBWB$!1uO_Y=p3E3-dl)PWdZ`{(HP=oH7PKwvL5Sb$0;0hus;!U$EC#GC$yWU52{TIU=90`5&828NUDk literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle b/pandas/io/tests/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle new file mode 100644 index 0000000000000000000000000000000000000000..6d5451f96e20d6e8aea11a1777edb024a7a561a5 GIT binary patch literal 14116 zcmeHNdyE}b8NcuDqua-_D^-;63SFVBZ7H-umECS>sn;&WQvO?S=I)*CxzqdD-no}P zXsIN2amfJ*9f=|S(I6pF`Gbf`LNtUJpBPOvicgGxXne%*5?lz0{=V~=nLGC}_inc< zsI%GmX6~GG=A7^Qea|zWN__6UTO;B)>5TALG4Gsoc%ot+-0bkRvqmXr*rU_siaDAu z<;id}Vh z$~LFVrJQ5(RKdK{EQqHpg|U+UVG`8SVFNsoGpdHehfB5M>@hi+BE8e7RE%Q|Uzw@U zcy_f?o31)%9K4bmI(!8tFPfZj(vj27L^SK%>wLl)Nk`L((_XV=u6k_Nq>iy$x1nV^ zDqF>Q^n^1xIr*`F;Si_vO;sm5PfLksjfz9(<2cWh%Qzpd9d%4Rug&2dSj`| zHq6$si)aGu1N!wb+*~OKDwj-jt{y3Sory0-R`X>YU*i`;*09lGU{|Gl+$>G*9QQ5I z8@~tRU(RFsP5$Mu_4SL1&|)TfB9Uk`7K_In++wC<^)scBc2ldU{G(Q;Sfb5)?Agpr zg@gpiBxed`1H!#PP1c?@Dz-T)pYzrG#qCZAa+m0F#`)k$=OJ_);TL3b=8OS>W@hq* zLgr4RP&1v~rxqkC`HGBP%R)jBv>a2tlMlH44gs5(LAFqyJ_PV)MC)Ecje31Dqaftx zWWH22D3t{f+ui4iDLVvLSEJh}t_6ZqfhiINYgH&$)4 zk~eLwMT0_%zz(yTFPa+l>%RgXiH5;GO^u*2UI7_mGvW1CehqJKxt&X-Kv z-U+Sn@wc~Bt`v)$maU9LM~~>zoa;OxS9TJlQ7limv2&6W0LkLx<=6>k_EFTn=OgVA|?Zq6p5r-PFc62MI}_LHq$0}V%mV2|L9y!ve)~^B)vq@%j)FU zd^sc`;w|-T&;35tqRlc=av2+GFz*Ooh973DVlsL%=t- z?{K)=4wY^@*lq_C4lK2Cbkmm6O~NImgewto3EAJC;=xJ=9s)+TCtJ%S^ne-TNxP7r zrnrGZ1{pN)l<;1*!yd1pqqR92iCEX+ga$F$#M*{i_PxO4SUT>1{JX&8csk*K{94WA z3F9cjDweMyqRQ5?HlnWCjC3+$2{CdB+WSbWGO8Z)mD2X@7cR-?D%cbslua|_4D@hp zi1%izw(89c3gF0@Ic%Ec_~8D0>0rTZ;X&Ckb{9CoPENAB**)6lv(xizE*)j}vim$w z!|re6WL5)5h-m7aOoK(no0_?pqf6~#B9Tki{^kBR;c{jlJ^5Rr&t5xt^;@8;zVrAo zqEmO=`2Dv$S7bl~8VqW$ROwqhb*8Ium6q+-jY&G9=v6_Bw7SvSYtuvp;( zE#!U80$9XN2dq74h)W6Oh8)I zJRnoLlJ6jT0psO?cZq}KO$RwwI)w8AO^Q3|mg$nPL5J|wpuOTQ5+P4UN53QjWpp{9 z?pG;F$P+=@f0t&YtjEB~wxY3~KnnqR$%_9R=evXa*D+BI`A6xRAa6Z_D>sa-WotZ% zTnu#ruz;s#;U>yCfR_&kk{0!Z_wN%;{FuI*NB24{kd0VQv2R~OAvLLJO3@XH_9)t` zXrH3}imp_2Kv5TvqTFEC8@W?FUY}IKfgv$fSrnXD)J>NLGd0^pMWSF-^LMIzN;Mo-9Cx(9b!l1{CIFe+@F9g*-u_*OMu|OV*yNk9b?o@y*U28yv zshB@%<}$e&sseRIDwXf|B8#psG_Vbb} zQ%1N8l-X*qQWQ-m=S2SE4k!n16(ua3q5&H{zZj?0+=6q}_w9yy8xo$Mz^wju_RR-} z5b@lw`GMaPo&WY_qr;%Ty#47DMA!c*{rs?qgc@fR6RVSzums9zn^0~5LFtjmwUTU+ zyFDw2n25pM)}Ytdnk?P3z?UHL|XpER}mnG*a3dfi&x8 z2+G<;r`>eAg-&~LdeI`US**Xk20h-`BA`J}G`6VE|E;f53#CE#(Y4SZ z7F~qyc!mT1%%?Z3hoaQJN!Cv;ae%Ed*=oOY#82%uF4$GElOt>A;sp% zT7DUdz+R{7lC7}g`a;oG_u7c+=fQh2rlI0#&WUj0)alrBKH+@D6%Oy4HmVtH+)tMa zi%~T<4Ds)YgqT1qN3nB_=Avc~u|w~7@(Wd7BtF@|G|hwMN`BfXV0(DNcjwz4?#YNx zHOw4#hf|qpW15Nm{6s02FJiCOMk$7VFXyP>-}FlccOGbXmuzczVQ=){sE+|}$eZP3bF zIvT@z1?v^r7de;LaWikdfiry10*I8i^(HPGUqTiSvW~|hBG|I1INV;wo{dDfSSRH! zf}yTA0zI_~4Tgf*FIE%?hTOe9ncLn-EmSac30(`Xi{cdY&Mb&cgF(owzA7c3@#H?y zM(JM1*{^HK@Of0#ERH+YalF{o80%XQmi0B9TE@MUoVVEn4?5P!m_%VufO=7meBP$k zL~m?m{c`RjG;P&iV@wBg zI`Zx|vKH*Ho&rBzMG1Ejb-w=@5{!zkpKyg_9U+ExFDK6uE@OBd8uxV=RWt$zF}eoM z%guok6m(Fqdi#3uQM!EZ=$ND%72TxhW<{@6bc>?bDSEx4TNT}==!X@(LD3r(-LB|I zmNh$g)d#|#xge~$Uto|E-vxuo2mNuM+DhiB{S9AuV8=MHv+Uc?!aiP!ueZv(fyJ_RMmaZwxsb!`0 z>&Fi+N*V#xHtMI6o!rN0++J%cQtE(0O^4fGvS>#GEq)fEoJ#sRb(@bi%c-^d`_-A9v}7b_>jkA)WgFVTwmqhin6n+a(jzo zTU!*mb-oRgCl0Vl9BwM+cdu+)vr&AB9l{22z=(?j%Mgb<+EUcP@sWN^gvY*5YXX_# z;`OO+!{C37mE8gM86RLD@&L=@s4zI8p!KlX^b1)Zs1%0W%31?Np~y`#ew1{UA;s^+ zh;+4!REo!WXJDwBn1hm9Cf8HZurNeNRTJ6MQTqYi{= '0.14.1': - # Pre-0.14.1 versions generated non-unpicklable mixed-type frames and - # panels if their columns/items were non-unique. - mixed_dup_df = DataFrame(data) - mixed_dup_df.columns = list("ABCDA") - - mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int'])) - mixed_dup_panel.items = ['ItemA', 'ItemA'] - - frame['mixed_dup'] = mixed_dup_df - panel['mixed_dup'] = mixed_dup_panel - - return dict( series = series, - frame = frame, - panel = panel, - index = index, - mi = mi, - sp_series = dict(float = _create_sp_series(), - ts = _create_sp_tsseries()), - sp_frame = dict(float = _create_sp_frame()) - ) - -def write_legacy_pickles(): - - # force our cwd to be the first searched - import sys - sys.path.insert(0,'.') - - import os, os.path - import numpy as np - import pandas - import pandas.util.testing as tm - import platform as pl - - # make sure we are < 0.13 compat (in py3) - try: - from pandas.compat import zip, cPickle as pickle - except: - import pickle - - version = pandas.__version__ - if len(sys.argv) != 2: - exit("Specify output directory: generate_legacy_pickles.py ") - - output_dir = str(sys.argv[1]) - - print("This script generates a pickle file for the current arch, system, and python version") - print(" pandas version: {0}".format(version)) - print(" output dir : {0}".format(output_dir)) - - # construct a reasonable platform name - f = '_'.join([ str(version), str(pl.machine()), str(pl.system().lower()), str(pl.python_version()) ]) - pth = '{0}.pickle'.format(f) - - fh = open(os.path.join(output_dir,pth),'wb') - pickle.dump(create_data(),fh,pickle.HIGHEST_PROTOCOL) - fh.close() - - print("created pickle file: %s" % pth) - -if __name__ == '__main__': - write_legacy_pickles() diff --git a/pandas/io/tests/generate_legacy_storage_files.py b/pandas/io/tests/generate_legacy_storage_files.py new file mode 100644 index 0000000000000..e7cc89fcc0b61 --- /dev/null +++ b/pandas/io/tests/generate_legacy_storage_files.py @@ -0,0 +1,205 @@ +""" self-contained to write legacy storage (pickle/msgpack) files """ +from __future__ import print_function +from distutils.version import LooseVersion +from pandas import (Series, TimeSeries, DataFrame, Panel, + SparseSeries, SparseTimeSeries, SparseDataFrame, SparsePanel, + Index, MultiIndex, PeriodIndex, bdate_range, to_msgpack, + date_range, period_range, bdate_range, Timestamp, Categorical) +import os +import sys +import numpy as np +import pandas +import pandas.util.testing as tm +import platform as pl + + +def _create_sp_series(): + nan = np.nan + + # nan-based + arr = np.arange(15, dtype=np.float64) + arr[7:12] = nan + arr[-1:] = nan + + bseries = SparseSeries(arr, kind='block') + bseries.name = 'bseries' + return bseries + + +def _create_sp_tsseries(): + nan = np.nan + + # nan-based + arr = np.arange(15, dtype=np.float64) + arr[7:12] = nan + arr[-1:] = nan + + date_index = bdate_range('1/1/2011', periods=len(arr)) + bseries = SparseTimeSeries(arr, index=date_index, kind='block') + bseries.name = 'btsseries' + return bseries + + +def _create_sp_frame(): + nan = np.nan + + data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + 'C': np.arange(10).astype(np.int64), + 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} + + dates = bdate_range('1/1/2011', periods=10) + return SparseDataFrame(data, index=dates) + + +def create_data(): + """ create the pickle/msgpack data """ + + data = { + 'A': [0., 1., 2., 3., np.nan], + 'B': [0, 1, 0, 1, 0], + 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], + 'D': date_range('1/1/2009', periods=5), + 'E': [0., 1, Timestamp('20100101'), 'foo', 2.] + } + + index = dict(int=Index(np.arange(10)), + date=date_range('20130101', periods=10), + period=period_range('2013-01-01', freq='M', periods=10)) + + mi = dict(reg2=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])), + names=['first', 'second'])) + series = dict(float=Series(data['A']), + int=Series(data['B']), + mixed=Series(data['E']), + ts=TimeSeries(np.arange(10).astype(np.int64), index=date_range('20130101',periods=10)), + mi=Series(np.arange(5).astype(np.float64), + index=MultiIndex.from_tuples(tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), + names=['one', 'two'])), + dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']), + cat=Series(Categorical(['foo', 'bar', 'baz']))) + + mixed_dup_df = DataFrame(data) + mixed_dup_df.columns = list("ABCDA") + frame = dict(float=DataFrame(dict(A=series['float'], B=series['float'] + 1)), + int=DataFrame(dict(A=series['int'], B=series['int'] + 1)), + mixed=DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C', 'D']])), + mi=DataFrame(dict(A=np.arange(5).astype(np.float64), B=np.arange(5).astype(np.int64)), + index=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'baz'], + ['one', 'two', 'one', 'two', 'three']])), + names=['first', 'second'])), + dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), + columns=['A', 'B', 'A']), + cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))), + cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']), + B=np.arange(3).astype(np.int64))), + mixed_dup=mixed_dup_df) + + mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int'])) + mixed_dup_panel.items = ['ItemA', 'ItemA'] + panel = dict(float=Panel(dict(ItemA=frame['float'], ItemB=frame['float'] + 1)), + dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64), + items=['A', 'B', 'A']), + mixed_dup=mixed_dup_panel) + + return dict(series=series, + frame=frame, + panel=panel, + index=index, + mi=mi, + sp_series=dict(float=_create_sp_series(), + ts=_create_sp_tsseries()), + sp_frame=dict(float=_create_sp_frame())) + + +def create_pickle_data(): + data = create_data() + + # Pre-0.14.1 versions generated non-unpicklable mixed-type frames and + # panels if their columns/items were non-unique. + if LooseVersion(pandas.__version__) < '0.14.1': + del data['frame']['mixed_dup'] + del data['panel']['mixed_dup'] + return data + + +def create_msgpack_data(): + data = create_data() + if LooseVersion(pandas.__version__) < '0.17.0': + del data['frame']['mixed_dup'] + del data['panel']['mixed_dup'] + del data['frame']['dup'] + del data['panel']['dup'] + # Not supported + del data['sp_series'] + del data['sp_frame'] + del data['series']['cat'] + del data['frame']['cat_onecol'] + del data['frame']['cat_and_float'] + return data + + +def platform_name(): + return '_'.join([str(pandas.__version__), str(pl.machine()), str(pl.system().lower()), str(pl.python_version())]) + + +def write_legacy_pickles(output_dir): + + # make sure we are < 0.13 compat (in py3) + try: + from pandas.compat import zip, cPickle as pickle + except: + import pickle + + version = pandas.__version__ + + print("This script generates a storage file for the current arch, system, and python version") + print(" pandas version: {0}".format(version)) + print(" output dir : {0}".format(output_dir)) + print(" storage format: pickle") + + pth = '{0}.pickle'.format(platform_name()) + + fh = open(os.path.join(output_dir, pth), 'wb') + pickle.dump(create_pickle_data(), fh, pickle.HIGHEST_PROTOCOL) + fh.close() + + print("created pickle file: %s" % pth) + + +def write_legacy_msgpack(output_dir): + + version = pandas.__version__ + + print("This script generates a storage file for the current arch, system, and python version") + print(" pandas version: {0}".format(version)) + print(" output dir : {0}".format(output_dir)) + print(" storage format: msgpack") + + pth = '{0}.msgpack'.format(platform_name()) + to_msgpack(os.path.join(output_dir, pth), create_msgpack_data()) + + print("created msgpack file: %s" % pth) + + +def write_legacy_file(): + # force our cwd to be the first searched + sys.path.insert(0, '.') + + if len(sys.argv) != 3: + exit("Specify output directory and storage type: generate_legacy_storage_files.py ") + + output_dir = str(sys.argv[1]) + storage_type = str(sys.argv[2]) + + if storage_type == 'pickle': + write_legacy_pickles(output_dir=output_dir) + elif storage_type == 'msgpack': + write_legacy_msgpack(output_dir=output_dir) + else: + exit("storage_type must be one of {'pickle', 'msgpack'}") + + +if __name__ == '__main__': + write_legacy_file() diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 9f1fd41e90413..33b7cc79083db 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -1,5 +1,6 @@ import nose +import os import datetime import numpy as np import sys @@ -11,7 +12,7 @@ date_range, period_range, Index, SparseSeries, SparseDataFrame, SparsePanel) import pandas.util.testing as tm -from pandas.util.testing import ensure_clean +from pandas.util.testing import ensure_clean, assert_index_equal from pandas.tests.test_series import assert_series_equal from pandas.tests.test_frame import assert_frame_equal from pandas.tests.test_panel import assert_panel_equal @@ -39,6 +40,8 @@ def check_arbitrary(a, b): assert_frame_equal(a, b) elif isinstance(a, Series): assert_series_equal(a, b) + elif isinstance(a, Index): + assert_index_equal(a, b) else: assert(a == b) @@ -396,6 +399,24 @@ def tests_datetimeindex_freq_issue(self): result = self.encode_decode(df) assert_frame_equal(result, df) + def test_dataframe_duplicate_column_names(self): + + # GH 9618 + expected_1 = DataFrame(columns=['a', 'a']) + expected_2 = DataFrame(columns=[1]*100) + expected_2.loc[0] = np.random.randn(100) + expected_3 = DataFrame(columns=[1, 1]) + expected_3.loc[0] = ['abc', np.nan] + + result_1 = self.encode_decode(expected_1) + result_2 = self.encode_decode(expected_2) + result_3 = self.encode_decode(expected_3) + + assert_frame_equal(result_1, expected_1) + assert_frame_equal(result_2, expected_2) + assert_frame_equal(result_3, expected_3) + + class TestSparse(TestPackers): def _check_roundtrip(self, obj, comparator, **kwargs): @@ -496,6 +517,58 @@ def test_compression_blosc(self): assert_frame_equal(self.frame[k], i_rec[k]) +class TestMsgpack(): + """ + How to add msgpack tests: + + 1. Install pandas version intended to output the msgpack. + + 2. Execute "generate_legacy_storage_files.py" to create the msgpack. + $ python generate_legacy_storage_files.py msgpack + + 3. Move the created pickle to "data/legacy_msgpack/" directory. + + NOTE: TestMsgpack can't be a subclass of tm.Testcase to use test generator. + http://stackoverflow.com/questions/6689537/nose-test-generators-inside-class + """ + def setUp(self): + from pandas.io.tests.generate_legacy_storage_files import create_msgpack_data + self.data = create_msgpack_data() + self.path = u('__%s__.msgpack' % tm.rands(10)) + + def compare(self, vf): + data = read_msgpack(vf) + for typ, dv in data.items(): + for dt, result in dv.items(): + try: + expected = self.data[typ][dt] + except KeyError: + continue + check_arbitrary(result, expected) + + return data + + def read_msgpacks(self, version): + + pth = tm.get_data_path('legacy_msgpack/{0}'.format(str(version))) + n = 0 + for f in os.listdir(pth): + vf = os.path.join(pth, f) + self.compare(vf) + n += 1 + assert n > 0, 'Msgpack files are not tested' + + def test_msgpack(self): + msgpack_path = tm.get_data_path('legacy_msgpack') + n = 0 + for v in os.listdir(msgpack_path): + pth = os.path.join(msgpack_path, v) + if os.path.isdir(pth): + yield self.read_msgpacks, v + n += 1 + assert n > 0, 'Msgpack files are not tested' + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index d1396463f3b23..e691fac215002 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -24,8 +24,8 @@ class TestPickle(): 1. Install pandas version intended to output the pickle. - 2. Execute "generate_legacy_pkcles.py" to create the pickle. - $ python generate_legacy_pickles.py + 2. Execute "generate_legacy_storage_files.py" to create the pickle. + $ python generate_legacy_storage_files.py pickle 3. Move the created pickle to "data/legacy_pickle/" directory. @@ -35,8 +35,8 @@ class TestPickle(): _multiprocess_can_split_ = True def setUp(self): - from pandas.io.tests.generate_legacy_pickles import create_data - self.data = create_data() + from pandas.io.tests.generate_legacy_storage_files import create_pickle_data + self.data = create_pickle_data() self.path = u('__%s__.pickle' % tm.rands(10)) def compare_element(self, typ, result, expected): diff --git a/setup.py b/setup.py index f20b0ac0a5fb5..30c5d1052d9b3 100755 --- a/setup.py +++ b/setup.py @@ -537,6 +537,7 @@ def pxd(name): ], package_data={'pandas.io': ['tests/data/legacy_hdf/*.h5', 'tests/data/legacy_pickle/*/*.pickle', + 'tests/data/legacy_msgpack/*/*.msgpack', 'tests/data/*.csv*', 'tests/data/*.dta', 'tests/data/*.txt',