@@ -287,7 +287,10 @@ def test_read_write_dta5(self):
287
287
with tm .ensure_clean () as path :
288
288
original .to_stata (path , convert_dates = None )
289
289
written_and_read_again = self .read_dta (path )
290
- tm .assert_frame_equal (written_and_read_again .set_index ("index" ), original )
290
+
291
+ expected = original .copy ()
292
+ expected .index = expected .index .astype (np .int32 )
293
+ tm .assert_frame_equal (written_and_read_again .set_index ("index" ), expected )
291
294
292
295
def test_write_dta6 (self , datapath ):
293
296
original = self .read_csv (datapath ("io" , "data" , "stata" , "stata3.csv" ))
@@ -380,7 +383,10 @@ def test_read_write_dta11(self):
380
383
original .to_stata (path , convert_dates = None )
381
384
382
385
written_and_read_again = self .read_dta (path )
383
- tm .assert_frame_equal (written_and_read_again .set_index ("index" ), formatted )
386
+
387
+ expected = formatted .copy ()
388
+ expected .index = expected .index .astype (np .int32 )
389
+ tm .assert_frame_equal (written_and_read_again .set_index ("index" ), expected )
384
390
385
391
@pytest .mark .parametrize ("version" , [114 , 117 , 118 , 119 , None ])
386
392
def test_read_write_dta12 (self , version ):
@@ -417,7 +423,10 @@ def test_read_write_dta12(self, version):
417
423
assert len (w ) == 1
418
424
419
425
written_and_read_again = self .read_dta (path )
420
- tm .assert_frame_equal (written_and_read_again .set_index ("index" ), formatted )
426
+
427
+ expected = formatted .copy ()
428
+ expected .index = expected .index .astype (np .int32 )
429
+ tm .assert_frame_equal (written_and_read_again .set_index ("index" ), expected )
421
430
422
431
def test_read_write_dta13 (self ):
423
432
s1 = Series (2 ** 9 , dtype = np .int16 )
@@ -432,7 +441,10 @@ def test_read_write_dta13(self):
432
441
with tm .ensure_clean () as path :
433
442
original .to_stata (path )
434
443
written_and_read_again = self .read_dta (path )
435
- tm .assert_frame_equal (written_and_read_again .set_index ("index" ), formatted )
444
+
445
+ expected = formatted .copy ()
446
+ expected .index = expected .index .astype (np .int32 )
447
+ tm .assert_frame_equal (written_and_read_again .set_index ("index" ), expected )
436
448
437
449
@pytest .mark .parametrize ("version" , [114 , 117 , 118 , 119 , None ])
438
450
@pytest .mark .parametrize (
@@ -455,7 +467,10 @@ def test_read_write_reread_dta14(self, file, parsed_114, version, datapath):
455
467
with tm .ensure_clean () as path :
456
468
parsed_114 .to_stata (path , convert_dates = {"date_td" : "td" }, version = version )
457
469
written_and_read_again = self .read_dta (path )
458
- tm .assert_frame_equal (written_and_read_again .set_index ("index" ), parsed_114 )
470
+
471
+ expected = parsed_114 .copy ()
472
+ expected .index = expected .index .astype (np .int32 )
473
+ tm .assert_frame_equal (written_and_read_again .set_index ("index" ), expected )
459
474
460
475
@pytest .mark .parametrize (
461
476
"file" , ["stata6_113" , "stata6_114" , "stata6_115" , "stata6_117" ]
@@ -510,11 +525,15 @@ def test_numeric_column_names(self):
510
525
original .to_stata (path )
511
526
512
527
written_and_read_again = self .read_dta (path )
513
- written_and_read_again = written_and_read_again .set_index ("index" )
514
- columns = list (written_and_read_again .columns )
515
- convert_col_name = lambda x : int (x [1 ])
516
- written_and_read_again .columns = map (convert_col_name , columns )
517
- tm .assert_frame_equal (original , written_and_read_again )
528
+
529
+ written_and_read_again = written_and_read_again .set_index ("index" )
530
+ columns = list (written_and_read_again .columns )
531
+ convert_col_name = lambda x : int (x [1 ])
532
+ written_and_read_again .columns = map (convert_col_name , columns )
533
+
534
+ expected = original .copy ()
535
+ expected .index = expected .index .astype (np .int32 )
536
+ tm .assert_frame_equal (expected , written_and_read_again )
518
537
519
538
@pytest .mark .parametrize ("version" , [114 , 117 , 118 , 119 , None ])
520
539
def test_nan_to_missing_value (self , version ):
@@ -524,11 +543,15 @@ def test_nan_to_missing_value(self, version):
524
543
s2 [1 ::2 ] = np .nan
525
544
original = DataFrame ({"s1" : s1 , "s2" : s2 })
526
545
original .index .name = "index"
546
+
527
547
with tm .ensure_clean () as path :
528
548
original .to_stata (path , version = version )
529
549
written_and_read_again = self .read_dta (path )
530
- written_and_read_again = written_and_read_again .set_index ("index" )
531
- tm .assert_frame_equal (written_and_read_again , original )
550
+
551
+ written_and_read_again = written_and_read_again .set_index ("index" )
552
+ expected = original .copy ()
553
+ expected .index = expected .index .astype (np .int32 )
554
+ tm .assert_frame_equal (written_and_read_again , expected )
532
555
533
556
def test_no_index (self ):
534
557
columns = ["x" , "y" ]
@@ -548,7 +571,10 @@ def test_string_no_dates(self):
548
571
with tm .ensure_clean () as path :
549
572
original .to_stata (path )
550
573
written_and_read_again = self .read_dta (path )
551
- tm .assert_frame_equal (written_and_read_again .set_index ("index" ), original )
574
+
575
+ expected = original .copy ()
576
+ expected .index = expected .index .astype (np .int32 )
577
+ tm .assert_frame_equal (written_and_read_again .set_index ("index" ), expected )
552
578
553
579
def test_large_value_conversion (self ):
554
580
s0 = Series ([1 , 99 ], dtype = np .int8 )
@@ -562,11 +588,13 @@ def test_large_value_conversion(self):
562
588
original .to_stata (path )
563
589
564
590
written_and_read_again = self .read_dta (path )
565
- modified = original .copy ()
566
- modified ["s1" ] = Series (modified ["s1" ], dtype = np .int16 )
567
- modified ["s2" ] = Series (modified ["s2" ], dtype = np .int32 )
568
- modified ["s3" ] = Series (modified ["s3" ], dtype = np .float64 )
569
- tm .assert_frame_equal (written_and_read_again .set_index ("index" ), modified )
591
+
592
+ modified = original .copy ()
593
+ modified ["s1" ] = Series (modified ["s1" ], dtype = np .int16 )
594
+ modified ["s2" ] = Series (modified ["s2" ], dtype = np .int32 )
595
+ modified ["s3" ] = Series (modified ["s3" ], dtype = np .float64 )
596
+ modified .index = original .index .astype (np .int32 )
597
+ tm .assert_frame_equal (written_and_read_again .set_index ("index" ), modified )
570
598
571
599
def test_dates_invalid_column (self ):
572
600
original = DataFrame ([datetime (2006 , 11 , 19 , 23 , 13 , 20 )])
@@ -576,9 +604,11 @@ def test_dates_invalid_column(self):
576
604
original .to_stata (path , convert_dates = {0 : "tc" })
577
605
578
606
written_and_read_again = self .read_dta (path )
579
- modified = original .copy ()
580
- modified .columns = ["_0" ]
581
- tm .assert_frame_equal (written_and_read_again .set_index ("index" ), modified )
607
+
608
+ modified = original .copy ()
609
+ modified .columns = ["_0" ]
610
+ modified .index = original .index .astype (np .int32 )
611
+ tm .assert_frame_equal (written_and_read_again .set_index ("index" ), modified )
582
612
583
613
def test_105 (self , datapath ):
584
614
# Data obtained from:
@@ -619,21 +649,32 @@ def test_date_export_formats(self):
619
649
datetime (2006 , 1 , 1 ),
620
650
] # Year
621
651
622
- expected = DataFrame ([expected_values ], columns = columns )
623
- expected .index .name = "index"
652
+ expected = DataFrame (
653
+ [expected_values ],
654
+ index = pd .Index ([0 ], dtype = np .int32 , name = "index" ),
655
+ columns = columns ,
656
+ )
657
+
624
658
with tm .ensure_clean () as path :
625
659
original .to_stata (path , convert_dates = conversions )
626
660
written_and_read_again = self .read_dta (path )
627
- tm .assert_frame_equal (written_and_read_again .set_index ("index" ), expected )
661
+
662
+ tm .assert_frame_equal (written_and_read_again .set_index ("index" ), expected )
628
663
629
664
def test_write_missing_strings (self ):
630
665
original = DataFrame ([["1" ], [None ]], columns = ["foo" ])
631
- expected = DataFrame ([["1" ], ["" ]], columns = ["foo" ])
632
- expected .index .name = "index"
666
+
667
+ expected = DataFrame (
668
+ [["1" ], ["" ]],
669
+ index = pd .Index ([0 , 1 ], dtype = np .int32 , name = "index" ),
670
+ columns = ["foo" ],
671
+ )
672
+
633
673
with tm .ensure_clean () as path :
634
674
original .to_stata (path )
635
675
written_and_read_again = self .read_dta (path )
636
- tm .assert_frame_equal (written_and_read_again .set_index ("index" ), expected )
676
+
677
+ tm .assert_frame_equal (written_and_read_again .set_index ("index" ), expected )
637
678
638
679
@pytest .mark .parametrize ("version" , [114 , 117 , 118 , 119 , None ])
639
680
@pytest .mark .parametrize ("byteorder" , [">" , "<" ])
@@ -651,6 +692,7 @@ def test_bool_uint(self, byteorder, version):
651
692
)
652
693
original .index .name = "index"
653
694
expected = original .copy ()
695
+ expected .index = original .index .astype (np .int32 )
654
696
expected_types = (
655
697
np .int8 ,
656
698
np .int8 ,
@@ -666,8 +708,9 @@ def test_bool_uint(self, byteorder, version):
666
708
with tm .ensure_clean () as path :
667
709
original .to_stata (path , byteorder = byteorder , version = version )
668
710
written_and_read_again = self .read_dta (path )
669
- written_and_read_again = written_and_read_again .set_index ("index" )
670
- tm .assert_frame_equal (written_and_read_again , expected )
711
+
712
+ written_and_read_again = written_and_read_again .set_index ("index" )
713
+ tm .assert_frame_equal (written_and_read_again , expected )
671
714
672
715
def test_variable_labels (self , datapath ):
673
716
with StataReader (datapath ("io" , "data" , "stata" , "stata7_115.dta" )) as rdr :
@@ -818,11 +861,12 @@ def test_big_dates(self, datapath):
818
861
expected .index .name = "index"
819
862
expected .to_stata (path , convert_dates = date_conversion )
820
863
written_and_read_again = self .read_dta (path )
821
- tm .assert_frame_equal (
822
- written_and_read_again .set_index ("index" ),
823
- expected ,
824
- check_datetimelike_compat = True ,
825
- )
864
+
865
+ tm .assert_frame_equal (
866
+ written_and_read_again .set_index ("index" ),
867
+ expected .set_index (expected .index .astype (np .int32 )),
868
+ check_datetimelike_compat = True ,
869
+ )
826
870
827
871
def test_dtype_conversion (self , datapath ):
828
872
expected = self .read_csv (datapath ("io" , "data" , "stata" , "stata6.csv" ))
@@ -936,7 +980,7 @@ def test_categorical_writing(self, version):
936
980
original = pd .concat (
937
981
[original [col ].astype ("category" ) for col in original ], axis = 1
938
982
)
939
- expected .index . name = "index"
983
+ expected .index = expected . index . set_names ( "index" ). astype ( np . int32 )
940
984
941
985
expected ["incompletely_labeled" ] = expected ["incompletely_labeled" ].apply (str )
942
986
expected ["unlabeled" ] = expected ["unlabeled" ].apply (str )
@@ -955,8 +999,9 @@ def test_categorical_writing(self, version):
955
999
with tm .ensure_clean () as path :
956
1000
original .to_stata (path , version = version )
957
1001
written_and_read_again = self .read_dta (path )
958
- res = written_and_read_again .set_index ("index" )
959
- tm .assert_frame_equal (res , expected )
1002
+
1003
+ res = written_and_read_again .set_index ("index" )
1004
+ tm .assert_frame_equal (res , expected )
960
1005
961
1006
def test_categorical_warnings_and_errors (self ):
962
1007
# Warning for non-string labels
@@ -1000,15 +1045,17 @@ def test_categorical_with_stata_missing_values(self, version):
1000
1045
with tm .ensure_clean () as path :
1001
1046
original .to_stata (path , version = version )
1002
1047
written_and_read_again = self .read_dta (path )
1003
- res = written_and_read_again .set_index ("index" )
1004
1048
1005
- expected = original .copy ()
1006
- for col in expected :
1007
- cat = expected [col ]._values
1008
- new_cats = cat .remove_unused_categories ().categories
1009
- cat = cat .set_categories (new_cats , ordered = True )
1010
- expected [col ] = cat
1011
- tm .assert_frame_equal (res , expected )
1049
+ res = written_and_read_again .set_index ("index" )
1050
+
1051
+ expected = original .copy ()
1052
+ for col in expected :
1053
+ cat = expected [col ]._values
1054
+ new_cats = cat .remove_unused_categories ().categories
1055
+ cat = cat .set_categories (new_cats , ordered = True )
1056
+ expected [col ] = cat
1057
+ expected .index = expected .index .astype (np .int32 )
1058
+ tm .assert_frame_equal (res , expected )
1012
1059
1013
1060
@pytest .mark .parametrize ("file" , ["stata10_115" , "stata10_117" ])
1014
1061
def test_categorical_order (self , file , datapath ):
@@ -1456,8 +1503,11 @@ def test_out_of_range_float(self):
1456
1503
with tm .ensure_clean () as path :
1457
1504
original .to_stata (path )
1458
1505
reread = read_stata (path )
1459
- original ["ColumnTooBig" ] = original ["ColumnTooBig" ].astype (np .float64 )
1460
- tm .assert_frame_equal (original , reread .set_index ("index" ))
1506
+
1507
+ original ["ColumnTooBig" ] = original ["ColumnTooBig" ].astype (np .float64 )
1508
+ expected = original .copy ()
1509
+ expected .index = expected .index .astype (np .int32 )
1510
+ tm .assert_frame_equal (reread .set_index ("index" ), expected )
1461
1511
1462
1512
@pytest .mark .parametrize ("infval" , [np .inf , - np .inf ])
1463
1513
def test_inf (self , infval ):
@@ -1885,7 +1935,10 @@ def test_compression(compression, version, use_dict, infer):
1885
1935
elif compression is None :
1886
1936
fp = path
1887
1937
reread = read_stata (fp , index_col = "index" )
1888
- tm .assert_frame_equal (reread , df )
1938
+
1939
+ expected = df .copy ()
1940
+ expected .index = expected .index .astype (np .int32 )
1941
+ tm .assert_frame_equal (reread , expected )
1889
1942
1890
1943
1891
1944
@pytest .mark .parametrize ("method" , ["zip" , "infer" ])
@@ -1906,20 +1959,29 @@ def test_compression_dict(method, file_ext):
1906
1959
else :
1907
1960
fp = path
1908
1961
reread = read_stata (fp , index_col = "index" )
1909
- tm .assert_frame_equal (reread , df )
1962
+
1963
+ expected = df .copy ()
1964
+ expected .index = expected .index .astype (np .int32 )
1965
+ tm .assert_frame_equal (reread , expected )
1910
1966
1911
1967
1912
1968
@pytest .mark .parametrize ("version" , [114 , 117 , 118 , 119 , None ])
1913
1969
def test_chunked_categorical (version ):
1914
1970
df = DataFrame ({"cats" : Series (["a" , "b" , "a" , "b" , "c" ], dtype = "category" )})
1915
1971
df .index .name = "index"
1972
+
1973
+ expected = df .copy ()
1974
+ expected .index = expected .index .astype (np .int32 )
1975
+
1916
1976
with tm .ensure_clean () as path :
1917
1977
df .to_stata (path , version = version )
1918
1978
with StataReader (path , chunksize = 2 , order_categoricals = False ) as reader :
1919
1979
for i , block in enumerate (reader ):
1920
1980
block = block .set_index ("index" )
1921
1981
assert "cats" in block
1922
- tm .assert_series_equal (block .cats , df .cats .iloc [2 * i : 2 * (i + 1 )])
1982
+ tm .assert_series_equal (
1983
+ block .cats , expected .cats .iloc [2 * i : 2 * (i + 1 )]
1984
+ )
1923
1985
1924
1986
1925
1987
def test_chunked_categorical_partial (datapath ):
0 commit comments