Merge pull request #4437 from cpcloud/fix-astype-calls

cpcloud · cpcloud · commit aca1a4219a48 · 2013-08-02T20:29:56.000-07:00
BUG: fix string truncation for astype(str)
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -117,6 +117,8 @@ pandas 0.13
     set _ref_locs (:issue:`4403`)
   - Fixed an issue where hist subplots were being overwritten when they were
     called using the top level matplotlib API (:issue:`4408`)
+  - Fixed a bug where calling ``Series.astype(str)`` would truncate the string
+    (:issue:`4405`, :issue:`4437`)
 
 pandas 0.12
 ===========
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -3,7 +3,6 @@
 """
 
 import re
-from datetime import datetime
 import codecs
 import csv
 
@@ -1628,7 +1627,7 @@ def _is_sequence(x):
 _ensure_object = algos.ensure_object
 
 
-def _astype_nansafe(arr, dtype, copy = True):
+def _astype_nansafe(arr, dtype, copy=True):
     """ return a view if copy is False """
     if not isinstance(dtype, np.dtype):
         dtype = np.dtype(dtype)
@@ -1659,6 +1658,8 @@ def _astype_nansafe(arr, dtype, copy = True):
     elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer):
         # work around NumPy brokenness, #1987
         return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
+    elif issubclass(dtype.type, compat.string_types):
+        return lib.astype_str(arr.ravel()).reshape(arr.shape)
 
     if copy:
         return arr.astype(dtype)
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -5,7 +5,6 @@
 # pylint: disable=E1101,E1103
 # pylint: disable=W0703,W0622,W0613,W0201
 
-from pandas import compat
 import operator
 from distutils.version import LooseVersion
 import types
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -722,6 +722,16 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
 
     return result
 
+cpdef ndarray[object] astype_str(ndarray arr):  # element-wise str() of arr, returned as an object ndarray
+    cdef:
+        Py_ssize_t i, n = arr.size
+        ndarray[object] result = np.empty(n, dtype=object)  # object dtype: each slot holds a Python str object
+
+    for i in range(n):
+        util.set_value_at(result, i, str(arr[i]))  # NOTE(review): on Python 2, str() of a non-ASCII unicode element would raise -- confirm
+
+    return result
+
 def clean_index_list(list obj):
     '''
     Utility used in pandas.core.index._ensure_index
@@ -838,7 +848,7 @@ def write_csv_rows(list data, list data_index, int nlevels, list cols, object wr
 def create_hdf_rows_2d(ndarray indexer0,
                        object dtype,
                        ndarray[np.uint8_t, ndim=1] mask,
-                       ndarray[np.uint8_t, ndim=1] searchable,	 
+                       ndarray[np.uint8_t, ndim=1] searchable,
                        list values):
     """ return a list of objects ready to be converted to rec-array format """
 
@@ -857,7 +867,7 @@ def create_hdf_rows_2d(ndarray indexer0,
     for i in range(n_indexer0):
 
         if not mask[i]:
-         
+
             tup = PyTuple_New(tup_size)
 
             v  = indexer0[i]
@@ -869,7 +879,7 @@ def create_hdf_rows_2d(ndarray indexer0,
                 v = values[b][i]
                 if searchable[b]:
                     v = v[0]
-        
+
                 PyTuple_SET_ITEM(tup, b+1, v)
                 Py_INCREF(v)
 
@@ -882,8 +892,8 @@ def create_hdf_rows_2d(ndarray indexer0,
 @cython.wraparound(False)
 def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1,
                        object dtype,
-                       ndarray[np.uint8_t, ndim=2] mask, 
-                       ndarray[np.uint8_t, ndim=1] searchable,	 
+                       ndarray[np.uint8_t, ndim=2] mask,
+                       ndarray[np.uint8_t, ndim=1] searchable,
                        list values):
     """ return a list of objects ready to be converted to rec-array format """
 
@@ -932,8 +942,8 @@ def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1,
 @cython.wraparound(False)
 def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2,
                        object dtype,
-                       ndarray[np.uint8_t, ndim=3] mask, 
-                       ndarray[np.uint8_t, ndim=1] searchable,	 
+                       ndarray[np.uint8_t, ndim=3] mask,
+                       ndarray[np.uint8_t, ndim=1] searchable,
                        list values):
     """ return a list of objects ready to be converted to rec-array format """
 
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -4,6 +4,7 @@
 import os
 import operator
 import unittest
+import string
 
 import nose
 
@@ -2029,6 +2030,7 @@ def test_timedelta64_functions(self):
         expected = Series([timedelta(1)],dtype='timedelta64[ns]')
         assert_series_equal(result,expected)
 
+
     def test_sub_of_datetime_from_TimeSeries(self):
         from pandas.core import common as com
         from datetime import datetime
@@ -3354,6 +3356,19 @@ def test_astype_datetimes(self):
         s = s.astype('O')
         self.assert_(s.dtype == np.object_)
 
+    def test_astype_str(self):
+        # GH4405: astype to a string type must equal element-wise text_type conversion (no truncation)
+        digits = string.digits
+        s1 = Series([digits * 10, tm.rands(63), tm.rands(64),
+                    tm.rands(1000)])
+        s2 = Series([digits * 10, tm.rands(63), tm.rands(64), nan, 1.0])  # mixed input: strings plus nan and a float
+        types = (compat.text_type,) + (np.str_, np.unicode_)
+        for typ in types:
+            for s in (s1, s2):
+                res = s.astype(typ)
+                expec = s.map(compat.text_type)
+                assert_series_equal(res, expec)
+
     def test_map(self):
         index, data = tm.getMixedTypeDict()