From 8402674275dd4c17203567c4d9972170c87534a7 Mon Sep 17 00:00:00 2001 From: PKEuS Date: Mon, 17 Jun 2013 16:04:14 +0200 Subject: [PATCH] FIX: Bug in stata parser causing failure when reading dataset with string data --- pandas/io/stata.py | 13 ++++++------- pandas/io/tests/test_stata.py | 22 +++++++++++++++++++--- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ddc9db0b76539..632e97c24721f 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -407,7 +407,7 @@ def _null_terminate(self, s): def _next(self): typlist = self.typlist - if self._has_string_data: + if self.has_string_data: data = [None] * self.nvar for i in range(len(data)): if type(typlist[i]) is int: @@ -523,7 +523,8 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None): for i in cols_: if self.dtyplist[i] is not None: col = data.columns[i] - data[col] = Series(data[col], data[col].index, self.dtyplist[i]) + if data[col].dtype is not np.dtype(object): + data[col] = Series(data[col], data[col].index, self.dtyplist[i]) if convert_dates: cols = np.where(map(lambda x: x in _date_formats, self.fmtlist))[0] @@ -856,7 +857,7 @@ def _write_data_nodates(self): typ = ord(typlist[i]) if typ <= 244: # we've got a string if len(var) < typ: - var = _pad_bytes(self._decode_bytes(var), len(var) + 1) + var = _pad_bytes(var, typ) self._write(var) else: try: @@ -884,15 +885,13 @@ def _write_data_dates(self): if i in convert_dates: var = _datetime_to_stata_elapsed(var, self.fmtlist[i]) if typ <= 244: # we've got a string - if isnull(var): - var = "" # missing string if len(var) < typ: - var = _pad_bytes(var, len(var) + 1) + var = _pad_bytes(var, typ) self._write(var) else: if isnull(var): # this only matters for floats var = MISSING_VALUES[typ] - self._write(struct.pack(byteorder+TYPE_MAP[typ], var)) + self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var)) def _null_terminate(self, s, as_string=False): null_byte = '\x00' diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 4584976c41383..0e32fb91fc743 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -3,19 +3,19 @@ from datetime import datetime import os import unittest -import sys import warnings import nose import numpy as np -from pandas.core.frame import DataFrame +from pandas.core.frame import DataFrame, Series from pandas.io.parsers import read_csv from pandas.io.stata import read_stata, StataReader, StataWriter import pandas.util.testing as tm from pandas.util.testing import ensure_clean from pandas.util.misc import is_little_endian + class StataTests(unittest.TestCase): def setUp(self): @@ -35,6 +35,7 @@ def setUp(self): self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv') self.dta9 = os.path.join(self.dirpath, 'lbw.dta') self.csv9 = os.path.join(self.dirpath, 'lbw.csv') + self.dta10 = os.path.join(self.dirpath, 'stata10.dta') def read_dta(self, file): return read_stata(file, convert_dates=True) @@ -189,9 +190,24 @@ def test_read_dta9(self): decimal=3 ) + def test_read_dta10(self): + original = DataFrame( + data= + [ + ["string", "object", 1, 1.1, np.datetime64('2003-12-25')] + ], + columns=['string', 'object', 'integer', 'float', 'datetime']) + original["object"] = Series(original["object"], dtype=object) + original.index.name = 'index' + + with ensure_clean(self.dta10) as path: + original.to_stata(path, {'datetime': 'tc'}, False) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), original) + def test_stata_doc_examples(self): with ensure_clean(self.dta5) as path: - df = DataFrame(np.random.randn(10,2),columns=list('AB')) + df = DataFrame(np.random.randn(10, 2), columns=list('AB')) df.to_stata(path) if __name__ == '__main__':