From 709c034b817f3ec6b6326d9d08e4ed14378bca0f Mon Sep 17 00:00:00 2001
From: Kerby Shedden
Date: Fri, 13 Feb 2015 09:08:04 -0500
Subject: [PATCH] Read Stata file incrementally

Remove testing code

Use partition in null_terminate

Manage warnings better in test

Further warning management in testing; add skip_data argument

Major refactoring to address code review

Fix strl reading, templatize docstrings

Fix bug in attaching docstring

Add new test file

Add release note

Call read instead of data when calling pandas.read_stata

various small issues following code review

Improve performance of %td processing

Docs edit (minor)
---
 doc/source/io.rst                    |  33 +-
 doc/source/release.rst               |   2 +
 pandas/io/stata.py                   | 603 ++++++++++++++++++---------
 pandas/io/tests/data/stata12_117.dta | Bin 0 -> 1285 bytes
 pandas/io/tests/test_stata.py        | 151 ++++++-
 5 files changed, 570 insertions(+), 219 deletions(-)
 create mode 100644 pandas/io/tests/data/stata12_117.dta

diff --git a/doc/source/io.rst b/doc/source/io.rst
index e71b4134f5b9c..c4865fddb099b 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -3821,22 +3821,41 @@ outside of this range, the variable is cast to ``int16``.
 Reading from Stata format
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The top-level function ``read_stata`` will read a dta files
-and return a DataFrame. Alternatively, the class :class:`~pandas.io.stata.StataReader`
-can be used if more granular access is required. :class:`~pandas.io.stata.StataReader`
-reads the header of the dta file at initialization. The method
-:func:`~pandas.io.stata.StataReader.data` reads and converts observations to a DataFrame.
+The top-level function ``read_stata`` will read a dta file and return
+either a DataFrame or a :class:`~pandas.io.stata.StataReader` that can
+be used to read the file incrementally.
 
 .. ipython:: python
 
    pd.read_stata('stata.dta')
 
+.. versionadded:: 0.16.0
+
+Specifying a ``chunksize`` yields a
+:class:`~pandas.io.stata.StataReader` instance that can be used to
+read ``chunksize`` lines from the file at a time.  The ``StataReader``
+object can be used as an iterator::
+
+    reader = pd.read_stata('stata.dta', chunksize=1000)
+    for df in reader:
+        do_something(df)
+
+For more fine-grained control, use ``iterator=True`` and specify
+``chunksize`` with each call to
+:func:`~pandas.io.stata.StataReader.read`.
+
+.. ipython:: python
+
+   reader = pd.read_stata('stata.dta', iterator=True)
+   chunk1 = reader.read(10)
+   chunk2 = reader.read(20)
+
 Currently the ``index`` is retrieved as a column.
 
 The parameter ``convert_categoricals`` indicates whether value labels should be
 read and used to create a ``Categorical`` variable from them. Value labels can
-also be retrieved by the function ``variable_labels``, which requires data to be
-called before use (see ``pandas.io.stata.StataReader``).
+also be retrieved by the function ``value_labels``, which requires
+:func:`~pandas.io.stata.StataReader.read` to be called before use.
 
 The parameter ``convert_missing`` indicates whether missing value
 representations in Stata should be preserved. If ``False`` (the default),
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 164e381499490..0912a11e28801 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -55,6 +55,8 @@ performance improvements along with a large number of bug fixes.
 
 Highlights include:
 
+- Allow Stata files to be read incrementally, with support for long strings in Stata files (:issue:`9493`) :ref:`here`.
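
A quick usage sketch of the incremental API documented in the io.rst hunk above (editorial illustration; the file name ``large.dta`` and the column name ``x`` are hypothetical, and only the ``chunksize`` option added by this patch is assumed):

    import pandas as pd

    # Compute the mean of one column without ever holding the full
    # dataset in memory: stream it in 10,000-row chunks.
    total = 0.0
    count = 0
    for chunk in pd.read_stata('large.dta', chunksize=10000):
        total += chunk['x'].sum()
        count += len(chunk)
    mean_x = total / count
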
+
 
 See the :ref:`v0.16.0 Whatsnew <whatsnew_0160>` overview or the issue tracker on GitHub
 for an extensive list of all API changes, enhancements and bugs that have been fixed in
 0.16.0.

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 0d6e554b8b474..7dd32fd00a4d2 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -22,51 +22,144 @@
 from pandas import compat, to_timedelta, to_datetime, isnull, DatetimeIndex
 from pandas.compat import lrange, lmap, lzip, text_type, string_types, range, \
     zip, BytesIO
+from pandas.util.decorators import Appender
 import pandas.core.common as com
 from pandas.io.common import get_filepath_or_buffer
 from pandas.lib import max_len_string_array, infer_dtype
 from pandas.tslib import NaT, Timestamp
 
+_statafile_processing_params1 = """\
+convert_dates : boolean, defaults to True
+    Convert date variables to DataFrame time values
+convert_categoricals : boolean, defaults to True
+    Read value labels and convert columns to Categorical/Factor variables"""
+
+_encoding_params = """\
+encoding : string, None or encoding
+    Encoding used to parse the files. Note that Stata doesn't
+    support unicode. None defaults to iso-8859-1."""
+
+_statafile_processing_params2 = """\
+index : identifier of index column
+    identifier of column that should be used as index of the DataFrame
+convert_missing : boolean, defaults to False
+    Flag indicating whether to convert missing values to their Stata
+    representations. If False, missing values are replaced with nans.
+    If True, columns containing missing values are returned with
+    object data types and missing values are represented by
+    StataMissingValue objects.
+preserve_dtypes : boolean, defaults to True
+    Preserve Stata datatypes. If False, numeric data are upcast to pandas
+    default types for foreign data (float64 or int64)
+columns : list or None
+    Columns to retain. Columns will be returned in the given order. None
+    returns all columns
+order_categoricals : boolean, defaults to True
+    Flag indicating whether converted categorical data are ordered."""
+
+_chunksize_params = """\
+chunksize : int, default None
+    Return StataReader object for iterations, returns chunks with
+    given number of lines"""
+
+_iterator_params = """\
+iterator : boolean, default False
+    Return StataReader object"""
+
+_read_stata_doc = """Read Stata file into DataFrame
+
+Parameters
+----------
+filepath_or_buffer : string or file-like object
+    Path to .dta file or object implementing a binary read() function
+%s
+%s
+%s
+%s
+%s
+
+Returns
+-------
+DataFrame or StataReader
+
+Examples
+--------
+Read a Stata dta file:
+>>> df = pandas.read_stata('filename.dta')
+
+Read a Stata dta file in 10,000 line chunks:
+>>> itr = pandas.read_stata('filename.dta', chunksize=10000)
+>>> for chunk in itr:
+...     do_something(chunk)
+""" % (_statafile_processing_params1, _encoding_params,
+       _statafile_processing_params2, _chunksize_params,
+       _iterator_params)
+
+_data_method_doc = """Reads observations from Stata file, converting them into a DataFrame
+
+This is a legacy method. Use `read` in new code.
+
+Parameters
+----------
+%s
+%s
+
+Returns
+-------
+DataFrame
+""" % (_statafile_processing_params1, _statafile_processing_params2)
+
+
+_read_method_doc = """\
+Reads observations from Stata file, converting them into a DataFrame
+
+Parameters
+----------
+nrows : int
+    Number of lines to read from the data file; if None, read the whole file.
+%s
+%s
+
+Returns
+-------
+DataFrame
+""" % (_statafile_processing_params1, _statafile_processing_params2)
+
+
+_stata_reader_doc = """\
+Class for reading Stata dta files.
+
+Parameters
+----------
+path_or_buf : string or file-like object
+    Path to .dta file or object implementing a binary read() function
+%s
+%s
+%s
+%s
+""" % (_statafile_processing_params1, _statafile_processing_params2,
+       _encoding_params, _chunksize_params)
+
+
+@Appender(_read_stata_doc)
 def read_stata(filepath_or_buffer, convert_dates=True,
                convert_categoricals=True, encoding=None, index=None,
                convert_missing=False, preserve_dtypes=True, columns=None,
-               order_categoricals=True):
-    """
-    Read Stata file into DataFrame
+               order_categoricals=True, chunksize=None, iterator=False):
 
-    Parameters
-    ----------
-    filepath_or_buffer : string or file-like object
-        Path to .dta file or object implementing a binary read() functions
-    convert_dates : boolean, defaults to True
-        Convert date variables to DataFrame time values
-    convert_categoricals : boolean, defaults to True
-        Read value labels and convert columns to Categorical/Factor variables
-    encoding : string, None or encoding
-        Encoding used to parse the files. Note that Stata doesn't
-        support unicode. None defaults to iso-8859-1.
-    index : identifier of index column
-        identifier of column that should be used as index of the DataFrame
-    convert_missing : boolean, defaults to False
-        Flag indicating whether to convert missing values to their Stata
-        representations. If False, missing values are replaced with nans.
-        If True, columns containing missing values are returned with
-        object data types and missing values are represented by
-        StataMissingValue objects.
-    preserve_dtypes : boolean, defaults to True
-        Preserve Stata datatypes. If False, numeric data are upcast to pandas
-        default types for foreign data (float64 or int64)
-    columns : list or None
-        Columns to retain. Columns will be returned in the given order. None
-        returns all columns
-    order_categoricals : boolean, defaults to True
-        Flag indicating whether converted categorical data are ordered.
- """ - reader = StataReader(filepath_or_buffer, encoding) + reader = StataReader(filepath_or_buffer, + convert_dates=convert_dates, + convert_categoricals=convert_categoricals, + index=index, convert_missing=convert_missing, + preserve_dtypes=preserve_dtypes, + columns=columns, + order_categoricals=order_categoricals, + chunksize=chunksize, encoding=encoding) + + if iterator or chunksize: + return reader - return reader.data(convert_dates, convert_categoricals, index, - convert_missing, preserve_dtypes, columns, - order_categoricals) + return reader.read() _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] @@ -139,8 +232,10 @@ def convert_year_month_safe(year, month): if year.max() < MAX_YEAR and year.min() > MIN_YEAR: return to_datetime(100 * year + month, format='%Y%m') else: + index = getattr(year, 'index', None) return Series( - [datetime.datetime(y, m, 1) for y, m in zip(year, month)]) + [datetime.datetime(y, m, 1) for y, m in zip(year, month)], + index=index) def convert_year_days_safe(year, days): """ @@ -150,9 +245,10 @@ def convert_year_days_safe(year, days): if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR: return to_datetime(year, format='%Y') + to_timedelta(days, unit='d') else: + index = getattr(year, 'index', None) value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d)) for y, d in zip(year, days)] - return Series(value) + return Series(value, index=index) def convert_delta_safe(base, deltas, unit): """ @@ -160,18 +256,18 @@ def convert_delta_safe(base, deltas, unit): versions if the deltas satisfy restrictions required to be expressed as dates in pandas. """ + index = getattr(deltas, 'index', None) if unit == 'd': if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA: values = [base + relativedelta(days=int(d)) for d in deltas] - return Series(values) + return Series(values, index=index) elif unit == 'ms': if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA: values = [base + relativedelta(microseconds=(int(d) * 1000)) for d in deltas] - return Series(values) + return Series(values, index=index) else: raise ValueError('format not understood') - base = to_datetime(base) deltas = to_timedelta(deltas, unit=unit) return base + deltas @@ -226,6 +322,7 @@ def convert_delta_safe(base, deltas, unit): if has_bad_values: # Restore NaT for bad values conv_dates[bad_locs] = NaT + return conv_dates @@ -717,7 +814,7 @@ def __init__(self, encoding): self.DTYPE_MAP_XML = \ dict( [ - (32768, np.string_), + (32768, np.uint8), # Keys to GSO (65526, np.float64), (65527, np.float32), (65528, np.int32), @@ -729,6 +826,7 @@ def __init__(self, encoding): self.TYPE_MAP_XML = \ dict( [ + (32768, 'L'), (65526, 'd'), (65527, 'f'), (65528, 'l'), @@ -776,7 +874,8 @@ def __init__(self, encoding): 'h': 'i2', 'l': 'i4', 'f': 'f4', - 'd': 'f8' + 'd': 'f8', + 'L': 'u8' } # Reserved words cannot be used as variable names @@ -797,42 +896,39 @@ def _decode_bytes(self, str, errors=None): else: return str - class StataReader(StataParser): - """ - Class for working with a Stata dataset. There are two possibilities for - usage: - - * The from_dta() method on the DataFrame class. - This will return a DataFrame with the Stata dataset. Note that when - using the from_dta() method, you will not have access to - meta-information like variable labels or the data label. - - * Work with this object directly. Upon instantiation, the header of the - Stata data file is read, giving you access to attributes like - variable_labels(), data_label(), nobs(), ... 
A DataFrame with the data - is returned by the read() method; this will also fill up the - value_labels. Note that calling the value_labels() method will result in - an error if the read() method has not been called yet. This is because - the value labels are stored at the end of a Stata dataset, after the - data. - - Parameters - ---------- - path_or_buf : string or file-like object - Path to .dta file or object implementing a binary read() functions - encoding : string, None or encoding - Encoding used to parse the files. Note that Stata doesn't - support unicode. None defaults to iso-8859-1. - """ + __doc__ = _stata_reader_doc - def __init__(self, path_or_buf, encoding='iso-8859-1'): + def __init__(self, path_or_buf, convert_dates=True, + convert_categoricals=True, index=None, + convert_missing=False, preserve_dtypes=True, + columns=None, order_categoricals=True, + encoding='iso-8859-1', chunksize=None): super(StataReader, self).__init__(encoding) self.col_sizes = () + + # Arguments to the reader (can be temporarily overridden in + # calls to read). + self._convert_dates = convert_dates + self._convert_categoricals = convert_categoricals + self._index = index + self._convert_missing = convert_missing + self._preserve_dtypes = preserve_dtypes + self._columns = columns + self._order_categoricals = order_categoricals + self._encoding = encoding + self._chunksize = chunksize + + # State variables for the file self._has_string_data = False self._missing_values = False - self._data_read = False + self._can_read_value_labels = False + self._column_selector_set = False self._value_labels_read = False + self._data_read = False + self._dtype = None + self._lines_read = 0 + self._native_byteorder = _set_endianness(sys.byteorder) if isinstance(path_or_buf, str): path_or_buf, encoding = get_filepath_or_buffer( @@ -917,8 +1013,8 @@ def _read_header(self): for typ in typlist: if typ <= 2045: self.typlist[i] = typ - elif typ == 32768: - raise ValueError("Long strings are not supported") + #elif typ == 32768: + # raise ValueError("Long strings are not supported") else: self.typlist[i] = self.TYPE_MAP_XML[typ] i += 1 @@ -1060,9 +1156,13 @@ def _read_header(self): self.has_string_data = len([x for x in self.typlist if type(x) is int]) > 0 - """Calculate size of a data record.""" + # calculate size of a data record self.col_sizes = lmap(lambda x: self._calcsize(x), self.typlist) + # remove format details from %td + self.fmtlist = ["%td" if x.startswith("%td") else x for x in self.fmtlist] + + def _calcsize(self, fmt): return (type(fmt) is int and fmt or struct.calcsize(self.byteorder + fmt)) @@ -1070,11 +1170,7 @@ def _calcsize(self, fmt): def _null_terminate(self, s): if compat.PY3 or self._encoding is not None: # have bytes not strings, # so must decode - null_byte = b"\0" - try: - s = s[:s.index(null_byte)] - except: - pass + s = s.partition(b"\0")[0] return s.decode(self._encoding or self._default_encoding) else: null_byte = "\0" @@ -1084,30 +1180,30 @@ def _null_terminate(self, s): return s def _read_value_labels(self): + if self.format_version <= 108: + # Value labels are not supported in version 108 and earlier. + return + if self._value_labels_read: + # Don't read twice + return + if self.format_version >= 117: self.path_or_buf.seek(self.seek_value_labels) else: - if not self._data_read: - raise Exception("Data has not been read. 
Because of the "
-                                "layout of Stata files, this is necessary "
-                                "before reading value labels.")
-            if self._value_labels_read:
-                raise Exception("Value labels have already been read.")
+            offset = self.nobs * self._dtype.itemsize
+            self.path_or_buf.seek(self.data_location + offset)
+
+        self._value_labels_read = True
         self.value_label_dict = dict()
 
-        if self.format_version <= 108:
-            # Value labels are not supported in version 108 and earlier.
-            return
-
         while True:
             if self.format_version >= 117:
                 if self.path_or_buf.read(5) == b'</val':  # <lbl>
-                    break  # end o f variable lable table
+                    break  # end of variable label table
 
             slength = self.path_or_buf.read(4)
             if not slength:
-                break  # end of variable lable table (format < 117)
+                break  # end of variable label table (format < 117)
             labname = self._null_terminate(self.path_or_buf.read(33))
             self.path_or_buf.read(3)  # padding
 
@@ -1141,72 +1237,126 @@ def _read_strls(self):
             if self.path_or_buf.read(3) != b'GSO':
                 break
 
-            v_o = struct.unpack(self.byteorder + 'L',
-                                self.path_or_buf.read(8))[0]
-            typ = self.path_or_buf.read(1)
+            v_o = struct.unpack(self.byteorder + 'Q', self.path_or_buf.read(8))[0]
+            typ = struct.unpack('B', self.path_or_buf.read(1))[0]
             length = struct.unpack(self.byteorder + 'I',
                                    self.path_or_buf.read(4))[0]
 
-            self.GSO[v_o] = self.path_or_buf.read(length-1)
-            self.path_or_buf.read(1)  # zero-termination
+            va = self.path_or_buf.read(length)
+            if typ == 130:
+                va = va[0:-1].decode(self._encoding or self._default_encoding)
+            self.GSO[v_o] = va
+
+    # legacy
+    @Appender('DEPRECATED: ' + _data_method_doc)
+    def data(self, **kwargs):
+
+        import warnings
+        warnings.warn("'data' is deprecated, use 'read' instead")
+
+        if self._data_read:
+            raise Exception("Data has already been read.")
+        self._data_read = True
+
+        return self.read(None, **kwargs)
+
 
-    def data(self, convert_dates=True, convert_categoricals=True, index=None,
-             convert_missing=False, preserve_dtypes=True, columns=None,
-             order_categoricals=True):
+    def __iter__(self):
+        try:
+            if self._chunksize:
+                while True:
+                    yield self.read(self._chunksize)
+            else:
+                yield self.read()
+        except StopIteration:
+            pass
+
+
+    def get_chunk(self, size=None):
         """
-        Reads observations from Stata file, converting them into a dataframe
+        Reads lines from Stata file and returns them as a DataFrame
 
         Parameters
        ----------
-        convert_dates : boolean, defaults to True
-            Convert date variables to DataFrame time values
-        convert_categoricals : boolean, defaults to True
-            Read value labels and convert columns to Categorical/Factor
-            variables
-        index : identifier of index column
-            identifier of column that should be used as index of the DataFrame
-        convert_missing : boolean, defaults to False
-            Flag indicating whether to convert missing values to their Stata
-            representation. If False, missing values are replaced with
-            nans. If True, columns containing missing values are returned with
-            object data types and missing values are represented by
-            StataMissingValue objects.
-        preserve_dtypes : boolean, defaults to True
-            Preserve Stata datatypes. If False, numeric data are upcast to
-            pandas default types for foreign data (float64 or int64)
-        columns : list or None
-            Columns to retain. Columns will be returned in the given order.
-            None returns all columns
-        order_categoricals : boolean, defaults to True
-            Flag indicating whether converted categorical data are ordered.
+        size : int, defaults to None
+            Number of lines to read. If None, reads whole file.
Returns ------- - y : DataFrame instance + DataFrame """ - self._missing_values = convert_missing - if self._data_read: - raise Exception("Data has already been read.") - self._data_read = True - - if self.format_version >= 117: + if size is None: + size = self._chunksize + return self.read(nrows=size) + + + @Appender(_read_method_doc) + def read(self, nrows=None, convert_dates=None, + convert_categoricals=None, index=None, + convert_missing=None, preserve_dtypes=None, + columns=None, order_categoricals=None): + + # Handle empty file or chunk. If reading incrementally raise + # StopIteration. If reading the whole thing return an empty + # data frame. + if (self.nobs == 0) and (nrows is None): + self._can_read_value_labels = True + self._data_read = True + return DataFrame(columns=self.varlist) + + # Handle options + if convert_dates is None: + convert_dates = self._convert_dates + if convert_categoricals is None: + convert_categoricals = self._convert_categoricals + if convert_missing is None: + convert_missing = self._convert_missing + if preserve_dtypes is None: + preserve_dtypes = self._preserve_dtypes + if columns is None: + columns = self._columns + if order_categoricals is None: + order_categoricals = self._order_categoricals + + if nrows is None: + nrows = self.nobs + + if (self.format_version >= 117) and (self._dtype is None): + self._can_read_value_labels = True self._read_strls() + # Setup the dtype. + if self._dtype is None: + dtype = [] # Convert struct data types to numpy data type + for i, typ in enumerate(self.typlist): + if typ in self.NUMPY_TYPE_MAP: + dtype.append(('s' + str(i), self.byteorder + self.NUMPY_TYPE_MAP[typ])) + else: + dtype.append(('s' + str(i), 'S' + str(typ))) + dtype = np.dtype(dtype) + self._dtype = dtype + # Read data - count = self.nobs - dtype = [] # Convert struct data types to numpy data type - for i, typ in enumerate(self.typlist): - if typ in self.NUMPY_TYPE_MAP: - dtype.append(('s' + str(i), self.byteorder + self.NUMPY_TYPE_MAP[typ])) - else: - dtype.append(('s' + str(i), 'S' + str(typ))) - dtype = np.dtype(dtype) - read_len = count * dtype.itemsize - self.path_or_buf.seek(self.data_location) - data = np.frombuffer(self.path_or_buf.read(read_len),dtype=dtype,count=count) + dtype = self._dtype + max_read_len = (self.nobs - self._lines_read) * dtype.itemsize + read_len = nrows * dtype.itemsize + read_len = min(read_len, max_read_len) + if read_len <= 0: + # Iterator has finished, should never be here unless + # we are reading the file incrementally + self._read_value_labels() + raise StopIteration + offset = self._lines_read * dtype.itemsize + self.path_or_buf.seek(self.data_location + offset) + read_lines = min(nrows, self.nobs - self._lines_read) + data = np.frombuffer(self.path_or_buf.read(read_len), dtype=dtype, + count=read_lines) + self._lines_read += read_lines + if self._lines_read == self.nobs: + self._can_read_value_labels = True + self._data_read = True # if necessary, swap the byte order to native here if self.byteorder != self._native_byteorder: data = data.byteswap().newbyteorder() - self._data_read = True if convert_categoricals: self._read_value_labels() @@ -1217,39 +1367,22 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, data = DataFrame.from_records(data, index=index) data.columns = self.varlist - if columns is not None: - column_set = set(columns) - if len(column_set) != len(columns): - raise ValueError('columns contains duplicate entries') - unmatched = column_set.difference(data.columns) - if 
unmatched: - raise ValueError('The following columns were not found in the ' - 'Stata data set: ' + - ', '.join(list(unmatched))) - # Copy information for retained columns for later processing - dtyplist = [] - typlist = [] - fmtlist = [] - lbllist = [] - matched = set() - for i, col in enumerate(data.columns): - if col in column_set: - matched.update([col]) - dtyplist.append(self.dtyplist[i]) - typlist.append(self.typlist[i]) - fmtlist.append(self.fmtlist[i]) - lbllist.append(self.lbllist[i]) + # If index is not specified, use actual row number rather than + # restarting at 0 for each chunk. + if index is None: + ix = np.arange(self._lines_read - read_lines, self._lines_read) + data = data.set_index(ix) - data = data[columns] - self.dtyplist = dtyplist - self.typlist = typlist - self.fmtlist = fmtlist - self.lbllist = lbllist + if columns is not None: + data = self._do_select_columns(data, columns) + # Decode strings for col, typ in zip(data, self.typlist): if type(typ) is int: data[col] = data[col].apply(self._null_terminate, convert_dtype=True) + data = self._insert_strls(data) + cols_ = np.where(self.dtyplist)[0] # Convert columns (if needed) to match input type @@ -1269,7 +1402,39 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, data = DataFrame.from_items(data_formatted) del data_formatted + self._do_convert_missing(data, convert_missing) + + if convert_dates: + cols = np.where(lmap(lambda x: x in _date_formats, + self.fmtlist))[0] + for i in cols: + col = data.columns[i] + data[col] = _stata_elapsed_date_to_datetime_vec(data[col], self.fmtlist[i]) + + if convert_categoricals and self.value_label_dict: + data = self._do_convert_categoricals(data, self.value_label_dict, self.lbllist, + order_categoricals) + + if not preserve_dtypes: + retyped_data = [] + convert = False + for col in data: + dtype = data[col].dtype + if dtype in (np.float16, np.float32): + dtype = np.float64 + convert = True + elif dtype in (np.int8, np.int16, np.int32): + dtype = np.int64 + convert = True + retyped_data.append((col, data[col].astype(dtype))) + if convert: + data = DataFrame.from_items(retyped_data) + + return data + + def _do_convert_missing(self, data, convert_missing): # Check for missing values, and replace if found + for i, colname in enumerate(data): fmt = self.typlist[i] if fmt not in self.VALID_RANGE: @@ -1282,7 +1447,7 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, if not missing.any(): continue - if self._missing_values: # Replacement follows Stata notation + if convert_missing: # Replacement follows Stata notation missing_loc = np.argwhere(missing) umissing, umissing_loc = np.unique(series[missing], return_inverse=True) @@ -1301,48 +1466,72 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, data[colname] = replacement - if convert_dates: - cols = np.where(lmap(lambda x: x in _date_formats, - self.fmtlist))[0] - for i in cols: - col = data.columns[i] - data[col] = _stata_elapsed_date_to_datetime_vec(data[col], self.fmtlist[i]) + def _insert_strls(self, data): + if not hasattr(self, 'GSO') or len(self.GSO) == 0: + return data + for i, typ in enumerate(self.typlist): + if typ != 'L': + continue + data.iloc[:, i] = [self.GSO[k] for k in data.iloc[:, i]] + return data - if convert_categoricals and self.value_label_dict: - value_labels = list(compat.iterkeys(self.value_label_dict)) - cat_converted_data = [] - for col, label in zip(data, self.lbllist): - if label in value_labels: - # Explicit call with ordered=True 
- cat_data = Categorical(data[col], ordered=order_categoricals) - value_label_dict = self.value_label_dict[label] - categories = [] - for category in cat_data.categories: - if category in value_label_dict: - categories.append(value_label_dict[category]) - else: - categories.append(category) # Partially labeled - cat_data.categories = categories - cat_converted_data.append((col, cat_data)) - else: - cat_converted_data.append((col, data[col])) - data = DataFrame.from_items(cat_converted_data) + def _do_select_columns(self, data, columns): - if not preserve_dtypes: - retyped_data = [] - convert = False - for col in data: - dtype = data[col].dtype - if dtype in (np.float16, np.float32): - dtype = np.float64 - convert = True - elif dtype in (np.int8, np.int16, np.int32): - dtype = np.int64 - convert = True - retyped_data.append((col, data[col].astype(dtype))) - if convert: - data = DataFrame.from_items(retyped_data) + if not self._column_selector_set: + column_set = set(columns) + if len(column_set) != len(columns): + raise ValueError('columns contains duplicate entries') + unmatched = column_set.difference(data.columns) + if unmatched: + raise ValueError('The following columns were not found in the ' + 'Stata data set: ' + + ', '.join(list(unmatched))) + # Copy information for retained columns for later processing + dtyplist = [] + typlist = [] + fmtlist = [] + lbllist = [] + matched = set() + for i, col in enumerate(data.columns): + if col in column_set: + matched.update([col]) + dtyplist.append(self.dtyplist[i]) + typlist.append(self.typlist[i]) + fmtlist.append(self.fmtlist[i]) + lbllist.append(self.lbllist[i]) + self.dtyplist = dtyplist + self.typlist = typlist + self.fmtlist = fmtlist + self.lbllist = lbllist + self._column_selector_set = True + + return data[columns] + + + def _do_convert_categoricals(self, data, value_label_dict, lbllist, order_categoricals): + """ + Converts categorical columns to Categorical type. + """ + value_labels = list(compat.iterkeys(value_label_dict)) + cat_converted_data = [] + for col, label in zip(data, lbllist): + if label in value_labels: + # Explicit call with ordered=True + cat_data = Categorical(data[col], ordered=order_categoricals) + categories = [] + for category in cat_data.categories: + if category in value_label_dict[label]: + categories.append(value_label_dict[label][category]) + else: + categories.append(category) # Partially labeled + cat_data.categories = categories + # TODO: is the next line needed above in the data(...) method? 
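                # (Editorial note, not part of the patch) One plausible answer
                # to the TODO above: with incremental reads, data.index starts
                # at self._lines_read - read_lines rather than at 0, so the
                # bare Categorical must be re-wrapped in a Series carrying
                # data.index; DataFrame.from_items below would otherwise pair
                # the column with a fresh default integer index, misaligning
                # it against the chunk's offset row labels.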
+ cat_data = Series(cat_data, index=data.index) + cat_converted_data.append((col, cat_data)) + else: + cat_converted_data.append((col, data[col])) + data = DataFrame.from_items(cat_converted_data) return data def data_label(self): diff --git a/pandas/io/tests/data/stata12_117.dta b/pandas/io/tests/data/stata12_117.dta new file mode 100644 index 0000000000000000000000000000000000000000..7d1d6181f53bf2b443798b2842cdb95f258b56a1 GIT binary patch literal 1285 zcmd5+%}yIJ5Oxdnz>Nwa^%hm)S|>;dZ5g?!2M_|mfd`1Zn+aKILl)bm4Qdg}od@W1 zaN{XBaYDQX;zT|6EJ;{6RO+ECt-qOX{_NTDf_2*IcE@Q2J=7gE3JeygEvi;)4P>Q2 zI&?TNF>LSZBOsykK6;8Ct zby{HC4NJ7@{_&j<=OCZa2?HX|=9}>aiNwfpJA{-BWu&K2gve zn7(#aEhnFvhxI(KGs}s6-T0E_uLD3t_N*3(934vinPJhKX9jp_LR zhM&OmFFMV*U-1@hY-Fhx*riUy`*`(L2sCa;480&jFj!{_tF{jIiKO7sykC624t5WF pxnK(=8NK(*@psA+mMB&p6hERnOc{Opn0Ru^=m-;EpVaZ6?+@SE-rE2G literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index f896b98fddf5b..8b44be61d5f66 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -75,6 +75,8 @@ def setUp(self): self.dta20_115 = os.path.join(self.dirpath, 'stata11_115.dta') self.dta20_117 = os.path.join(self.dirpath, 'stata11_117.dta') + self.dta21_117 = os.path.join(self.dirpath, 'stata12_117.dta') + def read_dta(self, file): # Legacy default reader configuration return read_stata(file, convert_dates=True) @@ -90,11 +92,21 @@ def test_read_empty_dta(self): empty_ds2 = read_stata(path) tm.assert_frame_equal(empty_ds, empty_ds2) + def test_data_method(self): + # Minimal testing of legacy data method + reader_114 = StataReader(self.dta1_114) + with warnings.catch_warnings(record=True) as w: + parsed_114_data = reader_114.data() + + reader_114 = StataReader(self.dta1_114) + parsed_114_read = reader_114.read() + tm.assert_frame_equal(parsed_114_data, parsed_114_read) + def test_read_dta1(self): reader_114 = StataReader(self.dta1_114) - parsed_114 = reader_114.data() + parsed_114 = reader_114.read() reader_117 = StataReader(self.dta1_117) - parsed_117 = reader_117.data() + parsed_117 = reader_117.read() # Pandas uses np.nan as missing value. # Thus, all columns will be of type float, regardless of their name. expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], @@ -152,14 +164,18 @@ def test_read_dta2(self): expected['yearly_date'] = expected['yearly_date'].astype('O') with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") parsed_114 = self.read_dta(self.dta2_114) parsed_115 = self.read_dta(self.dta2_115) parsed_117 = self.read_dta(self.dta2_117) - # 113 is buggy due ot limits date format support in Stata + # 113 is buggy due to limits of date format support in Stata # parsed_113 = self.read_dta(self.dta2_113) - # should get a warning for that format. 
- tm.assert_equal(len(w), 1) + # Remove resource warnings + w = [x for x in w if x.category is UserWarning] + + # should get warning for each call to read_dta + tm.assert_equal(len(w), 3) # buggy test because of the NaT comparison on certain platforms # Format 113 test fails since it does not support tc and tC formats @@ -215,6 +231,19 @@ def test_read_dta4(self): tm.assert_frame_equal(parsed_115, expected) tm.assert_frame_equal(parsed_117, expected) + # File containing strls + def test_read_dta12(self): + parsed_117 = self.read_dta(self.dta21_117) + expected = DataFrame.from_records( + [ + [1, "abc", "abcdefghi"], + [3, "cba", "qwertywertyqwerty"], + [93, "", "strl"], + ], + columns=['x', 'y', 'z']) + + tm.assert_frame_equal(parsed_117, expected, check_dtype=False) + def test_read_write_dta5(self): original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], columns=['float_miss', 'double_miss', 'byte_miss', @@ -858,6 +887,118 @@ def test_categorical_ordering(self): tm.assert_equal(False, parsed_115_unordered[col].cat.ordered) tm.assert_equal(False, parsed_117_unordered[col].cat.ordered) + + def test_read_chunks_117(self): + files_117 = [self.dta1_117, self.dta2_117, self.dta3_117, + self.dta4_117, self.dta14_117, self.dta15_117, + self.dta16_117, self.dta17_117, self.dta18_117, + self.dta19_117, self.dta20_117] + + for fname in files_117: + for chunksize in 1,2: + for convert_categoricals in False, True: + for convert_dates in False, True: + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + parsed = read_stata(fname, convert_categoricals=convert_categoricals, + convert_dates=convert_dates) + itr = read_stata(fname, iterator=True) + + pos = 0 + for j in range(5): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + try: + chunk = itr.read(chunksize) + except StopIteration: + break + from_frame = parsed.iloc[pos:pos+chunksize, :] + try: + tm.assert_frame_equal(from_frame, chunk, check_dtype=False) + except AssertionError: + # datetime.datetime and pandas.tslib.Timestamp may hold + # equivalent values but fail assert_frame_equal + assert(all([x == y for x, y in zip(from_frame, chunk)])) + + pos += chunksize + + def test_iterator(self): + + fname = self.dta3_117 + + parsed = read_stata(fname) + + itr = read_stata(fname, iterator=True) + chunk = itr.read(5) + tm.assert_frame_equal(parsed.iloc[0:5, :], chunk) + + itr = read_stata(fname, chunksize=5) + chunk = list(itr) + tm.assert_frame_equal(parsed.iloc[0:5, :], chunk[0]) + + itr = read_stata(fname, iterator=True) + chunk = itr.get_chunk(5) + tm.assert_frame_equal(parsed.iloc[0:5, :], chunk) + + itr = read_stata(fname, chunksize=5) + chunk = itr.get_chunk() + tm.assert_frame_equal(parsed.iloc[0:5, :], chunk) + + + def test_read_chunks_115(self): + files_115 = [self.dta2_115, self.dta3_115, self.dta4_115, + self.dta14_115, self.dta15_115, self.dta16_115, + self.dta17_115, self.dta18_115, self.dta19_115, + self.dta20_115] + + for fname in files_115: + for chunksize in 1,2: + for convert_categoricals in False, True: + for convert_dates in False, True: + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + parsed = read_stata(fname, convert_categoricals=convert_categoricals, + convert_dates=convert_dates) + itr = read_stata(fname, iterator=True, + convert_categoricals=convert_categoricals) + + pos = 0 + for j in range(5): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + try: + chunk = itr.read(chunksize) + 
except StopIteration: + break + from_frame = parsed.iloc[pos:pos+chunksize, :] + try: + tm.assert_frame_equal(from_frame, chunk, check_dtype=False) + except AssertionError: + # datetime.datetime and pandas.tslib.Timestamp may hold + # equivalent values but fail assert_frame_equal + assert(all([x == y for x, y in zip(from_frame, chunk)])) + + pos += chunksize + + def test_read_chunks_columns(self): + fname = self.dta3_117 + columns = ['quarter', 'cpi', 'm1'] + chunksize = 2 + + parsed = read_stata(fname, columns=columns) + itr = read_stata(fname, iterator=True) + pos = 0 + for j in range(5): + chunk = itr.read(chunksize, columns=columns) + if chunk is None: + break + from_frame = parsed.iloc[pos:pos+chunksize, :] + tm.assert_frame_equal(from_frame, chunk, check_dtype=False) + pos += chunksize + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)
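
As an end-to-end illustration of the incremental paths these tests exercise, here is a minimal editorial sketch (the file name ``macrodata.dta`` stands in for the ``dta3`` test file, and ``process`` is a hypothetical per-chunk handler; only ``iterator=True`` and ``StataReader.read`` from this patch are assumed):

    import pandas as pd

    # Pull two rows at a time, keeping only two columns; read raises
    # StopIteration once the file is exhausted.
    itr = pd.read_stata('macrodata.dta', iterator=True)
    while True:
        try:
            chunk = itr.read(2, columns=['quarter', 'cpi'])
        except StopIteration:
            break
        process(chunk)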