diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index fdd31cd4d8e36..84472802d73be 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -171,7 +171,7 @@ Performance Improvements - Improved csv write performance with mixed dtypes, including datetimes by up to 5x (:issue:`9940`) - Improved csv write performance generally by 2x (:issue:`9940`) - +- Improved the performance of ``pd.lib.max_len_string_array`` by 5-7x (:issue:`10024`) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 3972bad7b2d83..eecc225d06beb 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1626,7 +1626,7 @@ def _dtype_to_stata_type(dtype, column): elif dtype.type == np.object_: # try to coerce it to the biggest string # not memory efficient, what else could we # do? - itemsize = max_len_string_array(column.values) + itemsize = max_len_string_array(com._ensure_object(column.values)) return chr(max(itemsize, 1)) elif dtype == np.float64: return chr(255) @@ -1664,7 +1664,7 @@ def _dtype_to_default_stata_fmt(dtype, column): if not (inferred_dtype in ('string', 'unicode') or len(column) == 0): raise ValueError('Writing general object arrays is not supported') - itemsize = max_len_string_array(column.values) + itemsize = max_len_string_array(com._ensure_object(column.values)) if itemsize > 244: raise ValueError(excessive_string_length_error % column.name) return "%" + str(max(itemsize, 1)) + "s" diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py index 9b27d612cdeee..63ed26ea7d931 100644 --- a/pandas/io/tests/test_data.py +++ b/pandas/io/tests/test_data.py @@ -105,6 +105,7 @@ def test_get_multi_all_invalid(self): sl = ['INVALID', 'INVALID2', 'INVALID3'] self.assertRaises(RemoteDataError, web.get_data_google, sl, '2012') + @network def test_get_multi2(self): with warnings.catch_warnings(record=True) as w: for locale in self.locales: diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 
de966d6e03ee2..cc4c43494176e 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1,6 +1,7 @@ cimport numpy as np cimport cython import numpy as np +import sys from numpy cimport * @@ -10,6 +11,7 @@ cdef extern from "numpy/arrayobject.h": cdef enum NPY_TYPES: NPY_intp "NPY_INTP" + from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, PyDict_Contains, PyDict_Keys, Py_INCREF, PyTuple_SET_ITEM, @@ -18,7 +20,14 @@ from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, PyBytes_Check, PyTuple_SetItem, PyTuple_New, - PyObject_SetAttrString) + PyObject_SetAttrString, + PyBytes_GET_SIZE, + PyUnicode_GET_SIZE) + +try: + from cpython cimport PyString_GET_SIZE +except ImportError: + from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE cdef extern from "Python.h": Py_ssize_t PY_SSIZE_T_MAX @@ -32,7 +41,6 @@ cdef extern from "Python.h": Py_ssize_t *slicelength) except -1 - cimport cpython isnan = np.isnan @@ -896,23 +904,32 @@ def clean_index_list(list obj): return maybe_convert_objects(converted), 0 + +ctypedef fused pandas_string: + str + unicode + bytes + + @cython.boundscheck(False) @cython.wraparound(False) -def max_len_string_array(ndarray arr): +cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr): """ return the maximum size of elements in a 1-dim string array """ cdef: - int i, m, l - int length = arr.shape[0] - object v + Py_ssize_t i, m = 0, l = 0, length = arr.shape[0] + pandas_string v - m = 0 - for i from 0 <= i < length: + for i in range(length): v = arr[i] - if PyString_Check(v) or PyBytes_Check(v) or PyUnicode_Check(v): - l = len(v) - - if l > m: - m = l + if PyString_Check(v): + l = PyString_GET_SIZE(v) + elif PyBytes_Check(v): + l = PyBytes_GET_SIZE(v) + elif PyUnicode_Check(v): + l = PyUnicode_GET_SIZE(v) + + if l > m: + m = l return m diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index bb860269c5144..4c134b25636a7 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -8,18 +8,29 @@ 
import pandas.util.testing as tm from pandas.compat import u + class TestMisc(tm.TestCase): def test_max_len_string_array(self): - arr = np.array(['foo','b',np.nan],dtype='object') - self.assertTrue(max_len_string_array(arr),3) + arr = a = np.array(['foo', 'b', np.nan], dtype='object') + self.assertEqual(max_len_string_array(arr), 3) # unicode - arr = arr.astype('U') - self.assertTrue(max_len_string_array(arr),3) + arr = a.astype('U').astype(object) + self.assertEqual(max_len_string_array(arr), 3) + + # bytes for python3 + arr = a.astype('S').astype(object) + self.assertEqual(max_len_string_array(arr), 3) + + # raises + tm.assertRaises(TypeError, + lambda: max_len_string_array(arr.astype('U'))) + class TestIsscalar(tm.TestCase): + def test_isscalar_builtin_scalars(self): self.assertTrue(isscalar(None)) self.assertTrue(isscalar(True))