From 709c034b817f3ec6b6326d9d08e4ed14378bca0f Mon Sep 17 00:00:00 2001
From: Kerby Shedden
Date: Fri, 13 Feb 2015 09:08:04 -0500
Subject: [PATCH] Read Stata file incrementally

Remove testing code

Use partition in null_terminate

Manage warnings better in test

Further warning management in testing; add skip_data argument

Major refactoring to address code review

Fix strl reading, templatize docstrings

Fix bug in attaching docstring

Add new test file

Add release note

Call read instead of data when calling pandas.read_stata

various small issues following code review

Improve performance of %td processing

Docs edit (minor)
---
 doc/source/io.rst                    |  33 +-
 doc/source/release.rst               |   2 +
 pandas/io/stata.py                   | 603 ++++++++++++++++++---------
 pandas/io/tests/data/stata12_117.dta | Bin 0 -> 1285 bytes
 pandas/io/tests/test_stata.py        | 151 ++++++-
 5 files changed, 570 insertions(+), 219 deletions(-)
 create mode 100644 pandas/io/tests/data/stata12_117.dta

diff --git a/doc/source/io.rst b/doc/source/io.rst
index e71b4134f5b9c..c4865fddb099b 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -3821,22 +3821,41 @@ outside of this range, the variable is cast to ``int16``.
 Reading from Stata format
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The top-level function ``read_stata`` will read a dta files
-and return a DataFrame. Alternatively, the class :class:`~pandas.io.stata.StataReader`
-can be used if more granular access is required. :class:`~pandas.io.stata.StataReader`
-reads the header of the dta file at initialization. The method
-:func:`~pandas.io.stata.StataReader.data` reads and converts observations to a DataFrame.
+The top-level function ``read_stata`` will read a dta file and return
+either a DataFrame or a :class:`~pandas.io.stata.StataReader` that can
+be used to read the file incrementally.
 
 .. ipython:: python
 
    pd.read_stata('stata.dta')
 
+.. versionadded:: 0.16.0
+
+Specifying a ``chunksize`` yields a
+:class:`~pandas.io.stata.StataReader` instance that can be used to
+read ``chunksize`` lines from the file at a time.  The ``StataReader``
+object can be used as an iterator::
+
+    reader = pd.read_stata('stata.dta', chunksize=1000)
+    for df in reader:
+        do_something(df)
+
+For more fine-grained control, use ``iterator=True`` and specify
+``chunksize`` with each call to
+:func:`~pandas.io.stata.StataReader.read`.
+
+.. ipython:: python
+
+   reader = pd.read_stata('stata.dta', iterator=True)
+   chunk1 = reader.read(10)
+   chunk2 = reader.read(20)
+
 Currently the ``index`` is retrieved as a column.
 
 The parameter ``convert_categoricals`` indicates whether value labels should be
 read and used to create a ``Categorical`` variable from them. Value labels can
-also be retrieved by the function ``variable_labels``, which requires data to be
-called before use (see ``pandas.io.stata.StataReader``).
+also be retrieved by the function ``value_labels``, which requires
+:func:`~pandas.io.stata.StataReader.read` to be called before use.
 
 The parameter ``convert_missing`` indicates whether missing value
 representations in Stata should be preserved. If ``False`` (the default),
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 164e381499490..0912a11e28801 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -55,6 +55,8 @@ performance improvements along with a large number of bug fixes.
 
 Highlights include:
 
+- Allow Stata files to be read incrementally, with support for long strings in Stata files (:issue:`9493`) :ref:`here`.
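
A quick usage sketch of the incremental API documented in the io.rst hunk above (editorial illustration; the file name ``large.dta`` and the column name ``x`` are hypothetical, and only the ``chunksize`` option added by this patch is assumed):

    import pandas as pd

    # Compute the mean of one column without ever holding the full
    # dataset in memory: stream it in 10,000-row chunks.
    total = 0.0
    count = 0
    for chunk in pd.read_stata('large.dta', chunksize=10000):
        total += chunk['x'].sum()
        count += len(chunk)
    mean_x = total / count
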
+
 
 See the :ref:`v0.16.0 Whatsnew <whatsnew_0160>` overview or the issue tracker on GitHub
 for an extensive list of all API changes, enhancements and bugs that have been fixed in
 0.16.0.

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 0d6e554b8b474..7dd32fd00a4d2 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -22,51 +22,144 @@
 from pandas import compat, to_timedelta, to_datetime, isnull, DatetimeIndex
 from pandas.compat import lrange, lmap, lzip, text_type, string_types, range, \
     zip, BytesIO
+from pandas.util.decorators import Appender
 import pandas.core.common as com
 from pandas.io.common import get_filepath_or_buffer
 from pandas.lib import max_len_string_array, infer_dtype
 from pandas.tslib import NaT, Timestamp
 
+_statafile_processing_params1 = """\
+convert_dates : boolean, defaults to True
+    Convert date variables to DataFrame time values
+convert_categoricals : boolean, defaults to True
+    Read value labels and convert columns to Categorical/Factor variables"""
+
+_encoding_params = """\
+encoding : string, None or encoding
+    Encoding used to parse the files. Note that Stata doesn't
+    support unicode. None defaults to iso-8859-1."""
+
+_statafile_processing_params2 = """\
+index : identifier of index column
+    identifier of column that should be used as index of the DataFrame
+convert_missing : boolean, defaults to False
+    Flag indicating whether to convert missing values to their Stata
+    representations. If False, missing values are replaced with nans.
+    If True, columns containing missing values are returned with
+    object data types and missing values are represented by
+    StataMissingValue objects.
+preserve_dtypes : boolean, defaults to True
+    Preserve Stata datatypes. If False, numeric data are upcast to pandas
+    default types for foreign data (float64 or int64)
+columns : list or None
+    Columns to retain. Columns will be returned in the given order. None
+    returns all columns
+order_categoricals : boolean, defaults to True
+    Flag indicating whether converted categorical data are ordered."""
+
+_chunksize_params = """\
+chunksize : int, default None
+    Return StataReader object for iterations, returns chunks with
+    given number of lines"""
+
+_iterator_params = """\
+iterator : boolean, default False
+    Return StataReader object"""
+
+_read_stata_doc = """Read Stata file into DataFrame
+
+Parameters
+----------
+filepath_or_buffer : string or file-like object
+    Path to .dta file or object implementing a binary read() function
+%s
+%s
+%s
+%s
+%s
+
+Returns
+-------
+DataFrame or StataReader
+
+Examples
+--------
+Read a Stata dta file:
+>>> df = pandas.read_stata('filename.dta')
+
+Read a Stata dta file in 10,000 line chunks:
+>>> itr = pandas.read_stata('filename.dta', chunksize=10000)
+>>> for chunk in itr:
+...     do_something(chunk)
+""" % (_statafile_processing_params1, _encoding_params,
+       _statafile_processing_params2, _chunksize_params,
+       _iterator_params)
+
+_data_method_doc = """Reads observations from Stata file, converting them into a DataFrame
+
+This is a legacy method. Use `read` in new code.
+
+Parameters
+----------
+%s
+%s
+
+Returns
+-------
+DataFrame
+""" % (_statafile_processing_params1, _statafile_processing_params2)
+
+
+_read_method_doc = """\
+Reads observations from Stata file, converting them into a DataFrame
+
+Parameters
+----------
+nrows : int
+    Number of lines to read from the data file; if None, read the whole file.
+%s
+%s
+
+Returns
+-------
+DataFrame
+""" % (_statafile_processing_params1, _statafile_processing_params2)
+
+
+_stata_reader_doc = """\
+Class for reading Stata dta files.
+
+Parameters
+----------
+path_or_buf : string or file-like object
+    Path to .dta file or object implementing a binary read() function
+%s
+%s
+%s
+%s
+""" % (_statafile_processing_params1, _statafile_processing_params2,
+       _encoding_params, _chunksize_params)
+
+
+@Appender(_read_stata_doc)
 def read_stata(filepath_or_buffer, convert_dates=True,
                convert_categoricals=True, encoding=None, index=None,
                convert_missing=False, preserve_dtypes=True, columns=None,
-               order_categoricals=True):
-    """
-    Read Stata file into DataFrame
+               order_categoricals=True, chunksize=None, iterator=False):
 
-    Parameters
-    ----------
-    filepath_or_buffer : string or file-like object
-        Path to .dta file or object implementing a binary read() functions
-    convert_dates : boolean, defaults to True
-        Convert date variables to DataFrame time values
-    convert_categoricals : boolean, defaults to True
-        Read value labels and convert columns to Categorical/Factor variables
-    encoding : string, None or encoding
-        Encoding used to parse the files. Note that Stata doesn't
-        support unicode. None defaults to iso-8859-1.
-    index : identifier of index column
-        identifier of column that should be used as index of the DataFrame
-    convert_missing : boolean, defaults to False
-        Flag indicating whether to convert missing values to their Stata
-        representations. If False, missing values are replaced with nans.
-        If True, columns containing missing values are returned with
-        object data types and missing values are represented by
-        StataMissingValue objects.
-    preserve_dtypes : boolean, defaults to True
-        Preserve Stata datatypes. If False, numeric data are upcast to pandas
-        default types for foreign data (float64 or int64)
-    columns : list or None
-        Columns to retain. Columns will be returned in the given order. None
-        returns all columns
-    order_categoricals : boolean, defaults to True
-        Flag indicating whether converted categorical data are ordered.
- """ - reader = StataReader(filepath_or_buffer, encoding) + reader = StataReader(filepath_or_buffer, + convert_dates=convert_dates, + convert_categoricals=convert_categoricals, + index=index, convert_missing=convert_missing, + preserve_dtypes=preserve_dtypes, + columns=columns, + order_categoricals=order_categoricals, + chunksize=chunksize, encoding=encoding) + + if iterator or chunksize: + return reader - return reader.data(convert_dates, convert_categoricals, index, - convert_missing, preserve_dtypes, columns, - order_categoricals) + return reader.read() _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] @@ -139,8 +232,10 @@ def convert_year_month_safe(year, month): if year.max() < MAX_YEAR and year.min() > MIN_YEAR: return to_datetime(100 * year + month, format='%Y%m') else: + index = getattr(year, 'index', None) return Series( - [datetime.datetime(y, m, 1) for y, m in zip(year, month)]) + [datetime.datetime(y, m, 1) for y, m in zip(year, month)], + index=index) def convert_year_days_safe(year, days): """ @@ -150,9 +245,10 @@ def convert_year_days_safe(year, days): if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR: return to_datetime(year, format='%Y') + to_timedelta(days, unit='d') else: + index = getattr(year, 'index', None) value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d)) for y, d in zip(year, days)] - return Series(value) + return Series(value, index=index) def convert_delta_safe(base, deltas, unit): """ @@ -160,18 +256,18 @@ def convert_delta_safe(base, deltas, unit): versions if the deltas satisfy restrictions required to be expressed as dates in pandas. """ + index = getattr(deltas, 'index', None) if unit == 'd': if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA: values = [base + relativedelta(days=int(d)) for d in deltas] - return Series(values) + return Series(values, index=index) elif unit == 'ms': if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA: values = [base + relativedelta(microseconds=(int(d) * 1000)) for d in deltas] - return Series(values) + return Series(values, index=index) else: raise ValueError('format not understood') - base = to_datetime(base) deltas = to_timedelta(deltas, unit=unit) return base + deltas @@ -226,6 +322,7 @@ def convert_delta_safe(base, deltas, unit): if has_bad_values: # Restore NaT for bad values conv_dates[bad_locs] = NaT + return conv_dates @@ -717,7 +814,7 @@ def __init__(self, encoding): self.DTYPE_MAP_XML = \ dict( [ - (32768, np.string_), + (32768, np.uint8), # Keys to GSO (65526, np.float64), (65527, np.float32), (65528, np.int32), @@ -729,6 +826,7 @@ def __init__(self, encoding): self.TYPE_MAP_XML = \ dict( [ + (32768, 'L'), (65526, 'd'), (65527, 'f'), (65528, 'l'), @@ -776,7 +874,8 @@ def __init__(self, encoding): 'h': 'i2', 'l': 'i4', 'f': 'f4', - 'd': 'f8' + 'd': 'f8', + 'L': 'u8' } # Reserved words cannot be used as variable names @@ -797,42 +896,39 @@ def _decode_bytes(self, str, errors=None): else: return str - class StataReader(StataParser): - """ - Class for working with a Stata dataset. There are two possibilities for - usage: - - * The from_dta() method on the DataFrame class. - This will return a DataFrame with the Stata dataset. Note that when - using the from_dta() method, you will not have access to - meta-information like variable labels or the data label. - - * Work with this object directly. Upon instantiation, the header of the - Stata data file is read, giving you access to attributes like - variable_labels(), data_label(), nobs(), ... 
A DataFrame with the data - is returned by the read() method; this will also fill up the - value_labels. Note that calling the value_labels() method will result in - an error if the read() method has not been called yet. This is because - the value labels are stored at the end of a Stata dataset, after the - data. - - Parameters - ---------- - path_or_buf : string or file-like object - Path to .dta file or object implementing a binary read() functions - encoding : string, None or encoding - Encoding used to parse the files. Note that Stata doesn't - support unicode. None defaults to iso-8859-1. - """ + __doc__ = _stata_reader_doc - def __init__(self, path_or_buf, encoding='iso-8859-1'): + def __init__(self, path_or_buf, convert_dates=True, + convert_categoricals=True, index=None, + convert_missing=False, preserve_dtypes=True, + columns=None, order_categoricals=True, + encoding='iso-8859-1', chunksize=None): super(StataReader, self).__init__(encoding) self.col_sizes = () + + # Arguments to the reader (can be temporarily overridden in + # calls to read). + self._convert_dates = convert_dates + self._convert_categoricals = convert_categoricals + self._index = index + self._convert_missing = convert_missing + self._preserve_dtypes = preserve_dtypes + self._columns = columns + self._order_categoricals = order_categoricals + self._encoding = encoding + self._chunksize = chunksize + + # State variables for the file self._has_string_data = False self._missing_values = False - self._data_read = False + self._can_read_value_labels = False + self._column_selector_set = False self._value_labels_read = False + self._data_read = False + self._dtype = None + self._lines_read = 0 + self._native_byteorder = _set_endianness(sys.byteorder) if isinstance(path_or_buf, str): path_or_buf, encoding = get_filepath_or_buffer( @@ -917,8 +1013,8 @@ def _read_header(self): for typ in typlist: if typ <= 2045: self.typlist[i] = typ - elif typ == 32768: - raise ValueError("Long strings are not supported") + #elif typ == 32768: + # raise ValueError("Long strings are not supported") else: self.typlist[i] = self.TYPE_MAP_XML[typ] i += 1 @@ -1060,9 +1156,13 @@ def _read_header(self): self.has_string_data = len([x for x in self.typlist if type(x) is int]) > 0 - """Calculate size of a data record.""" + # calculate size of a data record self.col_sizes = lmap(lambda x: self._calcsize(x), self.typlist) + # remove format details from %td + self.fmtlist = ["%td" if x.startswith("%td") else x for x in self.fmtlist] + + def _calcsize(self, fmt): return (type(fmt) is int and fmt or struct.calcsize(self.byteorder + fmt)) @@ -1070,11 +1170,7 @@ def _calcsize(self, fmt): def _null_terminate(self, s): if compat.PY3 or self._encoding is not None: # have bytes not strings, # so must decode - null_byte = b"\0" - try: - s = s[:s.index(null_byte)] - except: - pass + s = s.partition(b"\0")[0] return s.decode(self._encoding or self._default_encoding) else: null_byte = "\0" @@ -1084,30 +1180,30 @@ def _null_terminate(self, s): return s def _read_value_labels(self): + if self.format_version <= 108: + # Value labels are not supported in version 108 and earlier. + return + if self._value_labels_read: + # Don't read twice + return + if self.format_version >= 117: self.path_or_buf.seek(self.seek_value_labels) else: - if not self._data_read: - raise Exception("Data has not been read. 
Because of the "
-                                "layout of Stata files, this is necessary "
-                                "before reading value labels.")
-            if self._value_labels_read:
-                raise Exception("Value labels have already been read.")
+            offset = self.nobs * self._dtype.itemsize
+            self.path_or_buf.seek(self.data_location + offset)
+
+        self._value_labels_read = True
         self.value_label_dict = dict()
 
-        if self.format_version <= 108:
-            # Value labels are not supported in version 108 and earlier.
-            return
-
         while True:
             if self.format_version >= 117:
                 if self.path_or_buf.read(5) == b'</val':  # <lbl>
-                    break  # end o f variable lable table
+                    break  # end of variable label table
 
             slength = self.path_or_buf.read(4)
             if not slength:
-                break  # end of variable lable table (format < 117)
+                break  # end of variable label table (format < 117)
             labname = self._null_terminate(self.path_or_buf.read(33))
             self.path_or_buf.read(3)  # padding
 
@@ -1141,72 +1237,126 @@ def _read_strls(self):
             if self.path_or_buf.read(3) != b'GSO':
                 break
 
-            v_o = struct.unpack(self.byteorder + 'L',
-                                self.path_or_buf.read(8))[0]
-            typ = self.path_or_buf.read(1)
+            v_o = struct.unpack(self.byteorder + 'Q', self.path_or_buf.read(8))[0]
+            typ = struct.unpack('B', self.path_or_buf.read(1))[0]
             length = struct.unpack(self.byteorder + 'I',
                                    self.path_or_buf.read(4))[0]
 
-            self.GSO[v_o] = self.path_or_buf.read(length-1)
-            self.path_or_buf.read(1)  # zero-termination
+            va = self.path_or_buf.read(length)
+            if typ == 130:
+                va = va[0:-1].decode(self._encoding or self._default_encoding)
+            self.GSO[v_o] = va
+
+    # legacy
+    @Appender('DEPRECATED: ' + _data_method_doc)
+    def data(self, **kwargs):
+
+        import warnings
+        warnings.warn("'data' is deprecated, use 'read' instead")
+
+        if self._data_read:
+            raise Exception("Data has already been read.")
+        self._data_read = True
+
+        return self.read(None, **kwargs)
+
 
-    def data(self, convert_dates=True, convert_categoricals=True, index=None,
-             convert_missing=False, preserve_dtypes=True, columns=None,
-             order_categoricals=True):
+    def __iter__(self):
+        try:
+            if self._chunksize:
+                while True:
+                    yield self.read(self._chunksize)
+            else:
+                yield self.read()
+        except StopIteration:
+            pass
+
+
+    def get_chunk(self, size=None):
         """
-        Reads observations from Stata file, converting them into a dataframe
+        Reads lines from Stata file and returns them as a DataFrame
 
         Parameters
        ----------
-        convert_dates : boolean, defaults to True
-            Convert date variables to DataFrame time values
-        convert_categoricals : boolean, defaults to True
-            Read value labels and convert columns to Categorical/Factor
-            variables
-        index : identifier of index column
-            identifier of column that should be used as index of the DataFrame
-        convert_missing : boolean, defaults to False
-            Flag indicating whether to convert missing values to their Stata
-            representation. If False, missing values are replaced with
-            nans. If True, columns containing missing values are returned with
-            object data types and missing values are represented by
-            StataMissingValue objects.
-        preserve_dtypes : boolean, defaults to True
-            Preserve Stata datatypes. If False, numeric data are upcast to
-            pandas default types for foreign data (float64 or int64)
-        columns : list or None
-            Columns to retain. Columns will be returned in the given order.
-            None returns all columns
-        order_categoricals : boolean, defaults to True
-            Flag indicating whether converted categorical data are ordered.
+        size : int, defaults to None
+            Number of lines to read. If None, reads whole file.
Returns ------- - y : DataFrame instance + DataFrame """ - self._missing_values = convert_missing - if self._data_read: - raise Exception("Data has already been read.") - self._data_read = True - - if self.format_version >= 117: + if size is None: + size = self._chunksize + return self.read(nrows=size) + + + @Appender(_read_method_doc) + def read(self, nrows=None, convert_dates=None, + convert_categoricals=None, index=None, + convert_missing=None, preserve_dtypes=None, + columns=None, order_categoricals=None): + + # Handle empty file or chunk. If reading incrementally raise + # StopIteration. If reading the whole thing return an empty + # data frame. + if (self.nobs == 0) and (nrows is None): + self._can_read_value_labels = True + self._data_read = True + return DataFrame(columns=self.varlist) + + # Handle options + if convert_dates is None: + convert_dates = self._convert_dates + if convert_categoricals is None: + convert_categoricals = self._convert_categoricals + if convert_missing is None: + convert_missing = self._convert_missing + if preserve_dtypes is None: + preserve_dtypes = self._preserve_dtypes + if columns is None: + columns = self._columns + if order_categoricals is None: + order_categoricals = self._order_categoricals + + if nrows is None: + nrows = self.nobs + + if (self.format_version >= 117) and (self._dtype is None): + self._can_read_value_labels = True self._read_strls() + # Setup the dtype. + if self._dtype is None: + dtype = [] # Convert struct data types to numpy data type + for i, typ in enumerate(self.typlist): + if typ in self.NUMPY_TYPE_MAP: + dtype.append(('s' + str(i), self.byteorder + self.NUMPY_TYPE_MAP[typ])) + else: + dtype.append(('s' + str(i), 'S' + str(typ))) + dtype = np.dtype(dtype) + self._dtype = dtype + # Read data - count = self.nobs - dtype = [] # Convert struct data types to numpy data type - for i, typ in enumerate(self.typlist): - if typ in self.NUMPY_TYPE_MAP: - dtype.append(('s' + str(i), self.byteorder + self.NUMPY_TYPE_MAP[typ])) - else: - dtype.append(('s' + str(i), 'S' + str(typ))) - dtype = np.dtype(dtype) - read_len = count * dtype.itemsize - self.path_or_buf.seek(self.data_location) - data = np.frombuffer(self.path_or_buf.read(read_len),dtype=dtype,count=count) + dtype = self._dtype + max_read_len = (self.nobs - self._lines_read) * dtype.itemsize + read_len = nrows * dtype.itemsize + read_len = min(read_len, max_read_len) + if read_len <= 0: + # Iterator has finished, should never be here unless + # we are reading the file incrementally + self._read_value_labels() + raise StopIteration + offset = self._lines_read * dtype.itemsize + self.path_or_buf.seek(self.data_location + offset) + read_lines = min(nrows, self.nobs - self._lines_read) + data = np.frombuffer(self.path_or_buf.read(read_len), dtype=dtype, + count=read_lines) + self._lines_read += read_lines + if self._lines_read == self.nobs: + self._can_read_value_labels = True + self._data_read = True # if necessary, swap the byte order to native here if self.byteorder != self._native_byteorder: data = data.byteswap().newbyteorder() - self._data_read = True if convert_categoricals: self._read_value_labels() @@ -1217,39 +1367,22 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, data = DataFrame.from_records(data, index=index) data.columns = self.varlist - if columns is not None: - column_set = set(columns) - if len(column_set) != len(columns): - raise ValueError('columns contains duplicate entries') - unmatched = column_set.difference(data.columns) - if 
unmatched: - raise ValueError('The following columns were not found in the ' - 'Stata data set: ' + - ', '.join(list(unmatched))) - # Copy information for retained columns for later processing - dtyplist = [] - typlist = [] - fmtlist = [] - lbllist = [] - matched = set() - for i, col in enumerate(data.columns): - if col in column_set: - matched.update([col]) - dtyplist.append(self.dtyplist[i]) - typlist.append(self.typlist[i]) - fmtlist.append(self.fmtlist[i]) - lbllist.append(self.lbllist[i]) + # If index is not specified, use actual row number rather than + # restarting at 0 for each chunk. + if index is None: + ix = np.arange(self._lines_read - read_lines, self._lines_read) + data = data.set_index(ix) - data = data[columns] - self.dtyplist = dtyplist - self.typlist = typlist - self.fmtlist = fmtlist - self.lbllist = lbllist + if columns is not None: + data = self._do_select_columns(data, columns) + # Decode strings for col, typ in zip(data, self.typlist): if type(typ) is int: data[col] = data[col].apply(self._null_terminate, convert_dtype=True) + data = self._insert_strls(data) + cols_ = np.where(self.dtyplist)[0] # Convert columns (if needed) to match input type @@ -1269,7 +1402,39 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, data = DataFrame.from_items(data_formatted) del data_formatted + self._do_convert_missing(data, convert_missing) + + if convert_dates: + cols = np.where(lmap(lambda x: x in _date_formats, + self.fmtlist))[0] + for i in cols: + col = data.columns[i] + data[col] = _stata_elapsed_date_to_datetime_vec(data[col], self.fmtlist[i]) + + if convert_categoricals and self.value_label_dict: + data = self._do_convert_categoricals(data, self.value_label_dict, self.lbllist, + order_categoricals) + + if not preserve_dtypes: + retyped_data = [] + convert = False + for col in data: + dtype = data[col].dtype + if dtype in (np.float16, np.float32): + dtype = np.float64 + convert = True + elif dtype in (np.int8, np.int16, np.int32): + dtype = np.int64 + convert = True + retyped_data.append((col, data[col].astype(dtype))) + if convert: + data = DataFrame.from_items(retyped_data) + + return data + + def _do_convert_missing(self, data, convert_missing): # Check for missing values, and replace if found + for i, colname in enumerate(data): fmt = self.typlist[i] if fmt not in self.VALID_RANGE: @@ -1282,7 +1447,7 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, if not missing.any(): continue - if self._missing_values: # Replacement follows Stata notation + if convert_missing: # Replacement follows Stata notation missing_loc = np.argwhere(missing) umissing, umissing_loc = np.unique(series[missing], return_inverse=True) @@ -1301,48 +1466,72 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, data[colname] = replacement - if convert_dates: - cols = np.where(lmap(lambda x: x in _date_formats, - self.fmtlist))[0] - for i in cols: - col = data.columns[i] - data[col] = _stata_elapsed_date_to_datetime_vec(data[col], self.fmtlist[i]) + def _insert_strls(self, data): + if not hasattr(self, 'GSO') or len(self.GSO) == 0: + return data + for i, typ in enumerate(self.typlist): + if typ != 'L': + continue + data.iloc[:, i] = [self.GSO[k] for k in data.iloc[:, i]] + return data - if convert_categoricals and self.value_label_dict: - value_labels = list(compat.iterkeys(self.value_label_dict)) - cat_converted_data = [] - for col, label in zip(data, self.lbllist): - if label in value_labels: - # Explicit call with ordered=True 
- cat_data = Categorical(data[col], ordered=order_categoricals) - value_label_dict = self.value_label_dict[label] - categories = [] - for category in cat_data.categories: - if category in value_label_dict: - categories.append(value_label_dict[category]) - else: - categories.append(category) # Partially labeled - cat_data.categories = categories - cat_converted_data.append((col, cat_data)) - else: - cat_converted_data.append((col, data[col])) - data = DataFrame.from_items(cat_converted_data) + def _do_select_columns(self, data, columns): - if not preserve_dtypes: - retyped_data = [] - convert = False - for col in data: - dtype = data[col].dtype - if dtype in (np.float16, np.float32): - dtype = np.float64 - convert = True - elif dtype in (np.int8, np.int16, np.int32): - dtype = np.int64 - convert = True - retyped_data.append((col, data[col].astype(dtype))) - if convert: - data = DataFrame.from_items(retyped_data) + if not self._column_selector_set: + column_set = set(columns) + if len(column_set) != len(columns): + raise ValueError('columns contains duplicate entries') + unmatched = column_set.difference(data.columns) + if unmatched: + raise ValueError('The following columns were not found in the ' + 'Stata data set: ' + + ', '.join(list(unmatched))) + # Copy information for retained columns for later processing + dtyplist = [] + typlist = [] + fmtlist = [] + lbllist = [] + matched = set() + for i, col in enumerate(data.columns): + if col in column_set: + matched.update([col]) + dtyplist.append(self.dtyplist[i]) + typlist.append(self.typlist[i]) + fmtlist.append(self.fmtlist[i]) + lbllist.append(self.lbllist[i]) + self.dtyplist = dtyplist + self.typlist = typlist + self.fmtlist = fmtlist + self.lbllist = lbllist + self._column_selector_set = True + + return data[columns] + + + def _do_convert_categoricals(self, data, value_label_dict, lbllist, order_categoricals): + """ + Converts categorical columns to Categorical type. + """ + value_labels = list(compat.iterkeys(value_label_dict)) + cat_converted_data = [] + for col, label in zip(data, lbllist): + if label in value_labels: + # Explicit call with ordered=True + cat_data = Categorical(data[col], ordered=order_categoricals) + categories = [] + for category in cat_data.categories: + if category in value_label_dict[label]: + categories.append(value_label_dict[label][category]) + else: + categories.append(category) # Partially labeled + cat_data.categories = categories + # TODO: is the next line needed above in the data(...) method? 
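                # (Editorial note, not part of the patch) One plausible answer
                # to the TODO above: with incremental reads, data.index starts
                # at self._lines_read - read_lines rather than at 0, so the
                # bare Categorical must be re-wrapped in a Series carrying
                # data.index; DataFrame.from_items below would otherwise pair
                # the column with a fresh default integer index, misaligning
                # it against the chunk's offset row labels.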
+ cat_data = Series(cat_data, index=data.index) + cat_converted_data.append((col, cat_data)) + else: + cat_converted_data.append((col, data[col])) + data = DataFrame.from_items(cat_converted_data) return data def data_label(self): diff --git a/pandas/io/tests/data/stata12_117.dta b/pandas/io/tests/data/stata12_117.dta new file mode 100644 index 0000000000000000000000000000000000000000..7d1d6181f53bf2b443798b2842cdb95f258b56a1 GIT binary patch literal 1285 zcmd5+%}yIJ5Oxdnz>Nwa^%hm)S|>;dZ5g?!2M_|mfd`1Zn+aKILl)bm4Qdg}od@W1 zaN{XBaYDQX;zT|6EJ;{6RO+ECt-qOX{_NTDf_2*IcE@Q2J=7gE3JeygEvi;)4P>Q2 zI&?TNF>LSZBOsykK6;8Ct zby{HC4NJ7@{_&j<=OCZa2?HX|=9}>aiNwfpJA{-BWu&K2gve zn7(#aEhnFvhxI(KGs}s6-T0E_uLD3t_N*3(934vinPJhKX9jp_LR zhM&OmFFMV*U-1@hY-Fhx*riUy`*`(L2sCa;480&jFj!{_tF{jIiKO7sykC624t5WF pxnK(=8NK(*@psA+mMB&p6hERnOc{Opn0Ru^=m-;EpVaZ6?+@SE-rE2G literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index f896b98fddf5b..8b44be61d5f66 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -75,6 +75,8 @@ def setUp(self): self.dta20_115 = os.path.join(self.dirpath, 'stata11_115.dta') self.dta20_117 = os.path.join(self.dirpath, 'stata11_117.dta') + self.dta21_117 = os.path.join(self.dirpath, 'stata12_117.dta') + def read_dta(self, file): # Legacy default reader configuration return read_stata(file, convert_dates=True) @@ -90,11 +92,21 @@ def test_read_empty_dta(self): empty_ds2 = read_stata(path) tm.assert_frame_equal(empty_ds, empty_ds2) + def test_data_method(self): + # Minimal testing of legacy data method + reader_114 = StataReader(self.dta1_114) + with warnings.catch_warnings(record=True) as w: + parsed_114_data = reader_114.data() + + reader_114 = StataReader(self.dta1_114) + parsed_114_read = reader_114.read() + tm.assert_frame_equal(parsed_114_data, parsed_114_read) + def test_read_dta1(self): reader_114 = StataReader(self.dta1_114) - parsed_114 = reader_114.data() + parsed_114 = reader_114.read() reader_117 = StataReader(self.dta1_117) - parsed_117 = reader_117.data() + parsed_117 = reader_117.read() # Pandas uses np.nan as missing value. # Thus, all columns will be of type float, regardless of their name. expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], @@ -152,14 +164,18 @@ def test_read_dta2(self): expected['yearly_date'] = expected['yearly_date'].astype('O') with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") parsed_114 = self.read_dta(self.dta2_114) parsed_115 = self.read_dta(self.dta2_115) parsed_117 = self.read_dta(self.dta2_117) - # 113 is buggy due ot limits date format support in Stata + # 113 is buggy due to limits of date format support in Stata # parsed_113 = self.read_dta(self.dta2_113) - # should get a warning for that format. 
- tm.assert_equal(len(w), 1) + # Remove resource warnings + w = [x for x in w if x.category is UserWarning] + + # should get warning for each call to read_dta + tm.assert_equal(len(w), 3) # buggy test because of the NaT comparison on certain platforms # Format 113 test fails since it does not support tc and tC formats @@ -215,6 +231,19 @@ def test_read_dta4(self): tm.assert_frame_equal(parsed_115, expected) tm.assert_frame_equal(parsed_117, expected) + # File containing strls + def test_read_dta12(self): + parsed_117 = self.read_dta(self.dta21_117) + expected = DataFrame.from_records( + [ + [1, "abc", "abcdefghi"], + [3, "cba", "qwertywertyqwerty"], + [93, "", "strl"], + ], + columns=['x', 'y', 'z']) + + tm.assert_frame_equal(parsed_117, expected, check_dtype=False) + def test_read_write_dta5(self): original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], columns=['float_miss', 'double_miss', 'byte_miss', @@ -858,6 +887,118 @@ def test_categorical_ordering(self): tm.assert_equal(False, parsed_115_unordered[col].cat.ordered) tm.assert_equal(False, parsed_117_unordered[col].cat.ordered) + + def test_read_chunks_117(self): + files_117 = [self.dta1_117, self.dta2_117, self.dta3_117, + self.dta4_117, self.dta14_117, self.dta15_117, + self.dta16_117, self.dta17_117, self.dta18_117, + self.dta19_117, self.dta20_117] + + for fname in files_117: + for chunksize in 1,2: + for convert_categoricals in False, True: + for convert_dates in False, True: + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + parsed = read_stata(fname, convert_categoricals=convert_categoricals, + convert_dates=convert_dates) + itr = read_stata(fname, iterator=True) + + pos = 0 + for j in range(5): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + try: + chunk = itr.read(chunksize) + except StopIteration: + break + from_frame = parsed.iloc[pos:pos+chunksize, :] + try: + tm.assert_frame_equal(from_frame, chunk, check_dtype=False) + except AssertionError: + # datetime.datetime and pandas.tslib.Timestamp may hold + # equivalent values but fail assert_frame_equal + assert(all([x == y for x, y in zip(from_frame, chunk)])) + + pos += chunksize + + def test_iterator(self): + + fname = self.dta3_117 + + parsed = read_stata(fname) + + itr = read_stata(fname, iterator=True) + chunk = itr.read(5) + tm.assert_frame_equal(parsed.iloc[0:5, :], chunk) + + itr = read_stata(fname, chunksize=5) + chunk = list(itr) + tm.assert_frame_equal(parsed.iloc[0:5, :], chunk[0]) + + itr = read_stata(fname, iterator=True) + chunk = itr.get_chunk(5) + tm.assert_frame_equal(parsed.iloc[0:5, :], chunk) + + itr = read_stata(fname, chunksize=5) + chunk = itr.get_chunk() + tm.assert_frame_equal(parsed.iloc[0:5, :], chunk) + + + def test_read_chunks_115(self): + files_115 = [self.dta2_115, self.dta3_115, self.dta4_115, + self.dta14_115, self.dta15_115, self.dta16_115, + self.dta17_115, self.dta18_115, self.dta19_115, + self.dta20_115] + + for fname in files_115: + for chunksize in 1,2: + for convert_categoricals in False, True: + for convert_dates in False, True: + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + parsed = read_stata(fname, convert_categoricals=convert_categoricals, + convert_dates=convert_dates) + itr = read_stata(fname, iterator=True, + convert_categoricals=convert_categoricals) + + pos = 0 + for j in range(5): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + try: + chunk = itr.read(chunksize) + 
except StopIteration: + break + from_frame = parsed.iloc[pos:pos+chunksize, :] + try: + tm.assert_frame_equal(from_frame, chunk, check_dtype=False) + except AssertionError: + # datetime.datetime and pandas.tslib.Timestamp may hold + # equivalent values but fail assert_frame_equal + assert(all([x == y for x, y in zip(from_frame, chunk)])) + + pos += chunksize + + def test_read_chunks_columns(self): + fname = self.dta3_117 + columns = ['quarter', 'cpi', 'm1'] + chunksize = 2 + + parsed = read_stata(fname, columns=columns) + itr = read_stata(fname, iterator=True) + pos = 0 + for j in range(5): + chunk = itr.read(chunksize, columns=columns) + if chunk is None: + break + from_frame = parsed.iloc[pos:pos+chunksize, :] + tm.assert_frame_equal(from_frame, chunk, check_dtype=False) + pos += chunksize + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)
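
As an end-to-end illustration of the incremental paths these tests exercise, here is a minimal editorial sketch (the file name ``macrodata.dta`` stands in for the ``dta3`` test file, and ``process`` is a hypothetical per-chunk handler; only ``iterator=True`` and ``StataReader.read`` from this patch are assumed):

    import pandas as pd

    # Pull two rows at a time, keeping only two columns; read raises
    # StopIteration once the file is exhausted.
    itr = pd.read_stata('macrodata.dta', iterator=True)
    while True:
        try:
            chunk = itr.read(2, columns=['quarter', 'cpi'])
        except StopIteration:
            break
        process(chunk)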