Merge pull request #6467 from cpcloud/str-pat-n-fix-6466

cpcloud · cpcloud · commit 27a7f94f67e8 · 2014-02-24T18:35:20.000-05:00
BUG: split should respect maxsplit when no pat is given
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -163,11 +163,11 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
     na : default NaN, fill value for missing values.
     regex : bool, default True
         If True use re.search, otherwise use Python in operator
-        
+
     Returns
     -------
     Series of boolean values
-        
+
     See Also
     --------
     match : analogous, but stricter, relying on re.match instead of re.search
@@ -345,7 +345,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False):
 
     See Also
     --------
-    contains : analogous, but less strict, relying on re.search instead of 
+    contains : analogous, but less strict, relying on re.search instead of
         re.match
     extract : now preferred to the deprecated usage of match (as_indexer=False)
 
@@ -413,23 +413,23 @@ def str_extract(arr, pat, flags=0):
     dtype: object
 
     A pattern with more than one group will return a DataFrame.
-    
+
     >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)')
          0    1
     0    a    1
     1    b    2
     2  NaN  NaN
 
     A pattern may contain optional groups.
-    
+
     >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])?(\d)')
          0  1
     0    a  1
     1    b  2
     2  NaN  3
 
     Named groups will become column names in the result.
-    
+
     >>> Series(['a1', 'b2', 'c3']).str.extract('(?P<letter>[ab])(?P<digit>\d)')
       letter digit
     0      a     1
@@ -451,14 +451,14 @@ def f(x):
         else:
             return empty_row
     if regex.groups == 1:
-        result = Series([f(val)[0] for val in arr], 
+        result = Series([f(val)[0] for val in arr],
                         name=regex.groupindex.get(1),
                         index=arr.index)
     else:
         names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
         columns = [names.get(1 + i, i) for i in range(regex.groups)]
-        result = DataFrame([f(val) for val in arr], 
-                           columns=columns, 
+        result = DataFrame([f(val) for val in arr],
+                           columns=columns,
                            index=arr.index)
     return result
 
@@ -617,7 +617,7 @@ def str_split(arr, pat=None, n=None):
     if pat is None:
         if n is None or n == 0:
             n = -1
-        f = lambda x: x.split()
+        f = lambda x: x.split(pat, n)
     else:
         if len(pat) == 1:
             if n is None or n == 0:
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -565,7 +565,6 @@ def check_index(index):
                        tm.makeDateIndex, tm.makePeriodIndex ]:
             check_index(index())
 
-
     def test_get_dummies(self):
         s = Series(['a|b', 'a|c', np.nan])
         result = s.str.get_dummies('|')
@@ -796,6 +795,12 @@ def test_split_maxsplit(self):
         result = s.str.split('asdf', n=-1)
         tm.assert_series_equal(result, xp)
 
+    def test_split_no_pat_with_nonzero_n(self):
+        s = Series(['split once', 'split once too!'])
+        result = s.str.split(n=1)
+        expected = Series({0: ['split', 'once'], 1: ['split', 'once too!']})
+        tm.assert_series_equal(expected, result)
+
     def test_pipe_failures(self):
         # #2119
         s = Series(['A|B|C'])
@@ -1092,6 +1097,7 @@ def test_encode_decode_errors(self):
 
         tm.assert_series_equal(result, exp)
 
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)