From 78bb4671ed5eba36188b48792496d900440a68cd Mon Sep 17 00:00:00 2001
From: Dan Birken <birken@gmail.com>
Date: Thu, 23 Jan 2014 18:07:24 -0800
Subject: [PATCH 1/2] PERF: Speed up pd.to_datetime() by optionally inferring
 dt format #5490

Given an array of strings that represent datetimes,
infer_datetime_format=True will attempt to guess the format of the
datetimes, and if it can infer the format, it will use a faster
function to convert/import the datetimes.  In cases where this
speed-up can be used, the function should be about 10x faster.
---
 pandas/tseries/tests/test_timeseries.py | 183 +++++++++++++++++++++++-
 pandas/tseries/tools.py                 | 178 +++++++++++++++++++++--
 2 files changed, 347 insertions(+), 14 deletions(-)

diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
index bae93602cb840..8cce0162e0854 100644
--- a/pandas/tseries/tests/test_timeseries.py
+++ b/pandas/tseries/tests/test_timeseries.py
@@ -18,6 +18,7 @@
 from pandas.core.daterange import DateRange
 import pandas.core.datetools as datetools
 import pandas.tseries.offsets as offsets
+import pandas.tseries.tools as tools
 import pandas.tseries.frequencies as fmod
 import pandas as pd
 
@@ -49,6 +50,11 @@ def _skip_if_no_pytz():
     except ImportError:
         raise nose.SkipTest("pytz not installed")
 
+def _skip_if_has_locale():
+    import locale
+    lang, _ = locale.getlocale()
+    if lang is not None:
+        raise nose.SkipTest("Specific locale is set {0}".format(lang))
 
 class TestTimeSeriesDuplicates(tm.TestCase):
     _multiprocess_can_split_ = True
@@ -909,12 +915,8 @@ def test_to_datetime_on_datetime64_series(self):
         self.assertEquals(result[0], s[0])
 
     def test_to_datetime_with_apply(self):
-
         # this is only locale tested with US/None locales
-        import locale
-        (lang,encoding) = locale.getlocale()
-        if lang is not None:
-            raise nose.SkipTest("format codes cannot work with a locale of {0}".format(lang))
+        _skip_if_has_locale()
 
         # GH 5195
         # with a format and coerce a single item to_datetime fails
@@ -3124,6 +3126,177 @@ def test_date_range_fy5252(self):
         self.assertEqual(dr[1], Timestamp('2014-01-30'))
 
 
+class TestToDatetimeInferFormat(tm.TestCase):
+    def test_to_datetime_infer_datetime_format_consistent_format(self):
+        time_series = pd.Series(
+            pd.date_range('20000101', periods=50, freq='H')
+        )
+
+        test_formats = [
+            '%m-%d-%Y',
+            '%m/%d/%Y %H:%M:%S.%f',
+            '%Y-%m-%dT%H:%M:%S.%f',
+        ]
+
+        for test_format in test_formats:
+            s_as_dt_strings = time_series.apply(
+                lambda x: x.strftime(test_format)
+            )
+
+            with_format = pd.to_datetime(s_as_dt_strings, format=test_format)
+            no_infer = pd.to_datetime(
+                s_as_dt_strings, infer_datetime_format=False
+            )
+            yes_infer = pd.to_datetime(
+                s_as_dt_strings, infer_datetime_format=True
+            )
+
+            # Whether the format is explicitly passed, it is inferred, or
+            # it is not inferred, the results should all be the same
+            self.assert_(np.array_equal(with_format, no_infer))
+            self.assert_(np.array_equal(no_infer, yes_infer))
+
+    def test_to_datetime_infer_datetime_format_inconsistent_format(self):
+        test_series = pd.Series(
+            np.array([
+                '01/01/2011 00:00:00',
+                '01-02-2011 00:00:00',
+                '2011-01-03T00:00:00',
+        ]))
+
+        # When the format is inconsistent, infer_datetime_format should just
+        # fall back to the default parsing
+        self.assert_(np.array_equal(
+            pd.to_datetime(test_series, infer_datetime_format=False),
+            pd.to_datetime(test_series, infer_datetime_format=True)
+        ))
+
+        test_series = pd.Series(
+            np.array([
+                'Jan/01/2011',
+                'Feb/01/2011',
+                'Mar/01/2011',
+        ]))
+
+        self.assert_(np.array_equal(
+            pd.to_datetime(test_series, infer_datetime_format=False),
+            pd.to_datetime(test_series, infer_datetime_format=True)
+        ))
+
+    def test_to_datetime_infer_datetime_format_series_with_nans(self):
+        test_series = pd.Series(
+            np.array([
+                '01/01/2011 00:00:00',
+                np.nan,
+                '01/03/2011 00:00:00',
+                np.nan,
+        ]))
+
+        self.assert_(np.array_equal(
+            pd.to_datetime(test_series, infer_datetime_format=False),
+            pd.to_datetime(test_series, infer_datetime_format=True)
+        ))
+
+    def test_to_datetime_infer_datetime_format_series_starting_with_nans(self):
+        test_series = pd.Series(
+            np.array([
+                np.nan,
+                np.nan,
+                '01/01/2011 00:00:00',
+                '01/02/2011 00:00:00',
+                '01/03/2011 00:00:00',
+        ]))
+
+        self.assert_(np.array_equal(
+            pd.to_datetime(test_series, infer_datetime_format=False),
+            pd.to_datetime(test_series, infer_datetime_format=True)
+        ))
+
+
+class TestGuessDatetimeFormat(tm.TestCase):
+    def test_guess_datetime_format_with_parseable_formats(self):
+        dt_string_to_format = (
+            ('20111230', '%Y%m%d'),
+            ('2011-12-30', '%Y-%m-%d'),
+            ('30-12-2011', '%d-%m-%Y'),
+            ('2011-12-30 00:00:00', '%Y-%m-%d %H:%M:%S'),
+            ('2011-12-30T00:00:00', '%Y-%m-%dT%H:%M:%S'),
+            ('2011-12-30 00:00:00.000000', '%Y-%m-%d %H:%M:%S.%f'),
+        )
+
+        for dt_string, dt_format in dt_string_to_format:
+            self.assertEquals(
+                tools._guess_datetime_format(dt_string),
+                dt_format
+            )
+
+    def test_guess_datetime_format_with_dayfirst(self):
+        ambiguous_string = '01/01/2011'
+        self.assertEquals(
+            tools._guess_datetime_format(ambiguous_string, dayfirst=True),
+            '%d/%m/%Y'
+        )
+        self.assertEquals(
+            tools._guess_datetime_format(ambiguous_string, dayfirst=False),
+            '%m/%d/%Y'
+        )
+
+    def test_guess_datetime_format_with_locale_specific_formats(self):
+        # The month names will vary depending on the locale, in which
+        # case these won't be parsed properly (dateutil can't parse them)
+        _skip_if_has_locale()
+
+        dt_string_to_format = (
+            ('30/Dec/2011', '%d/%b/%Y'),
+            ('30/December/2011', '%d/%B/%Y'),
+            ('30/Dec/2011 00:00:00', '%d/%b/%Y %H:%M:%S'),
+        )
+
+        for dt_string, dt_format in dt_string_to_format:
+            self.assertEquals(
+                tools._guess_datetime_format(dt_string),
+                dt_format
+            )
+
+    def test_guess_datetime_format_invalid_inputs(self):
+        # A datetime string must include a year, month and a day for it
+        # to be guessable, in addition to being a string that looks like
+        # a datetime
+        invalid_dts = [
+            '2013',
+            '01/2013',
+            '12:00:00',
+            '1/1/1/1',
+            'this_is_not_a_datetime',
+            '51a',
+            9,
+            datetime(2011, 1, 1),
+        ]
+
+        for invalid_dt in invalid_dts:
+            self.assertTrue(tools._guess_datetime_format(invalid_dt) is None)
+
+    def test_guess_datetime_format_for_array(self):
+        expected_format = '%Y-%m-%d %H:%M:%S.%f'
+        dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format)
+
+        test_arrays = [
+            np.array([dt_string, dt_string, dt_string], dtype='O'),
+            np.array([np.nan, np.nan, dt_string], dtype='O'),
+            np.array([dt_string, 'random_string'], dtype='O'),
+        ]
+
+        for test_array in test_arrays:
+            self.assertEqual(
+                tools._guess_datetime_format_for_array(test_array),
+                expected_format
+            )
+
+        format_for_string_of_nans = tools._guess_datetime_format_for_array(
+            np.array([np.nan, np.nan, np.nan], dtype='O')
+        )
+        self.assertTrue(format_for_string_of_nans is None)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)
diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py
index 2d4f27cb12ece..6761b5cbb04b0 100644
--- a/pandas/tseries/tools.py
+++ b/pandas/tseries/tools.py
@@ -24,6 +24,21 @@
     print('Please install python-dateutil via easy_install or some method!')
     raise  # otherwise a 2nd import won't show the message
 
+_DATEUTIL_LEXER_SPLIT = None
+try:
+    # Since these are private methods from dateutil, they are imported
+    # defensively here: if this interface changes, pandas will just fall
+    # back to not using the functionality
+    from dateutil.parser import _timelex
+
+    if hasattr(_timelex, 'split'):
+        def _lexer_split_from_str(dt_str):
+            # The StringIO(str(_)) is for dateutil 2.2 compatibility
+            return _timelex.split(StringIO(str(dt_str)))
+
+        _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str
+except (ImportError, AttributeError):
+    pass
 
 def _infer_tzinfo(start, end):
     def _infer(a, b):
@@ -50,9 +65,126 @@ def _maybe_get_tz(tz):
         tz = pytz.FixedOffset(tz / 60)
     return tz
 
+def _guess_datetime_format(dt_str, dayfirst=False,
+                           dt_str_parse=compat.parse_date,
+                           dt_str_split=_DATEUTIL_LEXER_SPLIT):
+    """
+    Guess the datetime format of a given datetime string.
+
+    Parameters
+    ----------
+    dt_str : string, datetime string to guess the format of
+    dayfirst : boolean, default False
+        If True parses dates with the day first, eg 20/01/2005
+        Warning: dayfirst=True is not strict, but will prefer to parse
+        with day first (this is a known bug).
+    dt_str_parse : function, defaults to `compat.parse_date` (dateutil)
+        This function should take in a datetime string and return
+        a `datetime.datetime` guess that the datetime string represents
+    dt_str_split : function, defaults to `_DATEUTIL_LEXER_SPLIT` (dateutil)
+        This function should take in a datetime string and return
+        a list of strings, the guess of the various specific parts
+        e.g. '2011/12/30' -> ['2011', '/', '12', '/', '30']
+
+    Returns
+    -------
+    ret : datetime format string (for `strftime` or `strptime`), or None
+    """
+    if dt_str_parse is None or dt_str_split is None:
+        return None
+
+    if not isinstance(dt_str, compat.string_types):
+        return None
+
+    day_attribute_and_format = (('day',), '%d')
+
+    datetime_attrs_to_format = [
+        (('year', 'month', 'day'), '%Y%m%d'),
+        (('year',), '%Y'),
+        (('month',), '%B'),
+        (('month',), '%b'),
+        (('month',), '%m'),
+        day_attribute_and_format,
+        (('hour',), '%H'),
+        (('minute',), '%M'),
+        (('second',), '%S'),
+        (('microsecond',), '%f'),
+        (('second', 'microsecond'), '%S.%f'),
+    ]
+
+    if dayfirst:
+        datetime_attrs_to_format.remove(day_attribute_and_format)
+        datetime_attrs_to_format.insert(0, day_attribute_and_format)
+
+    try:
+        parsed_datetime = dt_str_parse(dt_str, dayfirst=dayfirst)
+    except:
+        # In case the datetime can't be parsed, its format cannot be guessed
+        return None
+
+    if parsed_datetime is None:
+        return None
+
+    try:
+        tokens = dt_str_split(dt_str)
+    except:
+        # In case the datetime string can't be split, its format cannot
+        # be guessed
+        return None
+
+    format_guess = [None] * len(tokens)
+    found_attrs = set()
+
+    for attrs, attr_format in datetime_attrs_to_format:
+        # If a given attribute has been placed in the format string, skip
+        # over other formats for that same underlying attribute (e.g. month
+        # can be represented in multiple different ways)
+        if set(attrs) & found_attrs:
+            continue
+
+        if all(getattr(parsed_datetime, attr) is not None for attr in attrs):
+            for i, token_format in enumerate(format_guess):
+                if (token_format is None and
+                        tokens[i] == parsed_datetime.strftime(attr_format)):
+                    format_guess[i] = attr_format
+                    found_attrs.update(attrs)
+                    break
+
+    # Only consider it a valid guess if we have a year, month and day
+    if len(set(['year', 'month', 'day']) & found_attrs) != 3:
+        return None
+
+    output_format = []
+    for i, guess in enumerate(format_guess):
+        if guess is not None:
+            # Either fill in the format placeholder (like %Y)
+            output_format.append(guess)
+        else:
+            # Or just the token separator (e.g. the dashes in "01-01-2013")
+            try:
+                # If the token is numeric, then we likely didn't parse it
+                # properly, so our guess is wrong
+                float(tokens[i])
+                return None
+            except ValueError:
+                pass
+
+            output_format.append(tokens[i])
+
+    guessed_format = ''.join(output_format)
+
+    if parsed_datetime.strftime(guessed_format) == dt_str:
+        return guessed_format
+
+def _guess_datetime_format_for_array(arr, **kwargs):
+    # Try to guess the format based on the first non-NaN element
+    non_nan_elements = com.notnull(arr).nonzero()[0]
+    if len(non_nan_elements):
+        return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
 
 def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
-                format=None, coerce=False, unit='ns'):
+                format=None, coerce=False, unit='ns',
+                infer_datetime_format=False):
     """
     Convert argument to datetime
 
@@ -75,6 +207,9 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
     coerce : force errors to NaT (False by default)
     unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
         (e.g. a unix timestamp), which is an integer/float number
+    infer_datetime_format: boolean, default False
+        If no `format` is given, try to infer the format based on the first
+        datetime string. Provides a large speed-up in many cases.
 
     Returns
     -------
@@ -98,7 +233,7 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
     from pandas.core.series import Series
     from pandas.tseries.index import DatetimeIndex
 
-    def _convert_listlike(arg, box):
+    def _convert_listlike(arg, box, format):
 
         if isinstance(arg, (list,tuple)):
             arg = np.array(arg, dtype='O')
@@ -113,10 +248,26 @@ def _convert_listlike(arg, box):
             return arg
 
         arg = com._ensure_object(arg)
-        try:
+
+        if infer_datetime_format and format is None:
+            format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
+
             if format is not None:
-                result = None
+                # There is a special fast-path for iso8601 formatted
+                # datetime strings, so in those cases don't use the inferred
+                # format because using it would make parsing slower in this
+                # special case
+                format_is_iso8601 = (
+                    '%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or
+                    '%Y-%m-%d %H:%M:%S.%f'.startswith(format)
+                )
+                if format_is_iso8601:
+                    format = None
 
+        try:
+            result = None
+
+            if format is not None:
                 # shortcut formatting here
                 if format == '%Y%m%d':
                     try:
@@ -127,15 +278,24 @@ def _convert_listlike(arg, box):
                 # fallback
                 if result is None:
                     try:
-                        result = tslib.array_strptime(arg, format, coerce=coerce)
+                        result = tslib.array_strptime(
+                            arg, format, coerce=coerce
+                        )
                     except (tslib.OutOfBoundsDatetime):
                         if errors == 'raise':
                             raise
                         result = arg
-            else:
+                    except ValueError:
+                        # Only raise this error if the user provided the
+                        # datetime format, and not when it was inferred
+                        if not infer_datetime_format:
+                            raise
+
+            if result is None and (format is None or infer_datetime_format):
                 result = tslib.array_to_datetime(arg, raise_=errors == 'raise',
                                                  utc=utc, dayfirst=dayfirst,
                                                  coerce=coerce, unit=unit)
+
             if com.is_datetime64_dtype(result) and box:
                 result = DatetimeIndex(result, tz='utc' if utc else None)
             return result
@@ -152,12 +312,12 @@ def _convert_listlike(arg, box):
     elif isinstance(arg, Timestamp):
         return arg
     elif isinstance(arg, Series):
-        values = _convert_listlike(arg.values, box=False)
+        values = _convert_listlike(arg.values, False, format)
         return Series(values, index=arg.index, name=arg.name)
     elif com.is_list_like(arg):
-        return _convert_listlike(arg, box=box)
+        return _convert_listlike(arg, box, format)
 
-    return _convert_listlike(np.array([ arg ]), box=box)[0]
+    return _convert_listlike(np.array([ arg ]), box, format)[0]
 
 class DateParseError(ValueError):
     pass

From 879f270c120a2c0f63de449ab6fd1bcff2628175 Mon Sep 17 00:00:00 2001
From: Dan Birken <birken@gmail.com>
Date: Thu, 23 Jan 2014 18:07:31 -0800
Subject: [PATCH 2/2] PERF: Add infer_datetime_format to read_csv() #5490

This allows read_csv() to attempt to infer the datetime format for any
columns where parse_dates is enabled.  In cases where the datetime
format can be inferred, this should speed up processing datetimes
by ~10x.

Additionally add documentation and benchmarks for read_csv().
---
 doc/source/io.rst      | 34 ++++++++++++++++++++++++++++++++++
 doc/source/v0.13.1.txt | 14 ++++++++++++++
 pandas/core/frame.py   | 10 ++++++++--
 pandas/core/series.py  |  9 +++++++--
 pandas/io/parsers.py   | 35 ++++++++++++++++++++++++++++-------
 vb_suite/io_bench.py   | 33 +++++++++++++++++++++++++++++++++
 6 files changed, 124 insertions(+), 11 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index fdad28688b7c6..e0ed2b930f449 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -500,6 +500,40 @@ a single date rather than the entire array.
 
 .. _io.dayfirst:
 
+
+Inferring Datetime Format
+~~~~~~~~~~~~~~~~~~~~~~~~~
+If you have `parse_dates` enabled for some or all of your columns, and your
+datetime strings are all formatted the same way, you may get a large speed
+up by setting `infer_datetime_format=True`.  If set, pandas will attempt
+to guess the format of your datetime strings, and then use a faster means
+of parsing the strings.  5-10x parsing speed-ups have been observed.  Pandas
+will fall back to the usual parsing if either the format cannot be guessed
+or the format that was guessed cannot properly parse the entire column
+of strings.  So in general, `infer_datetime_format` should not have any
+negative consequences if enabled.
+
+Here are some examples of datetime strings that can be guessed (all
+representing December 30th, 2011 at 00:00:00):
+
+- "20111230"
+- "2011/12/30"
+- "20111230 00:00:00"
+- "12/30/2011 00:00:00"
+- "30/Dec/2011 00:00:00"
+- "30/December/2011 00:00:00"
+
+`infer_datetime_format` is sensitive to `dayfirst`.  With `dayfirst=True`, it
+will guess "01/12/2011" to be December 1st.  With `dayfirst=False` (default)
+it will guess "01/12/2011" to be January 12th.
+
+.. ipython:: python
+
+   # Try to infer the format for the index column
+   df = pd.read_csv('foo.csv', index_col=0, parse_dates=True,
+                    infer_datetime_format=True)
+
+
 International Date Formats
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 While US date formats tend to be MM/DD/YYYY, many international formats use
diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt
index ee9e16d5f02f6..c877724613ef4 100644
--- a/doc/source/v0.13.1.txt
+++ b/doc/source/v0.13.1.txt
@@ -107,6 +107,20 @@ Enhancements
      result
      result.loc[:,:,'ItemA']
 
+- Added optional `infer_datetime_format` to `read_csv`, `Series.from_csv` and
+  `DataFrame.from_csv` (:issue:`5490`)
+ 
+  If `parse_dates` is enabled and this flag is set, pandas will attempt to
+  infer the format of the datetime strings in the columns, and if it can
+  be inferred, switch to a faster method of parsing them.  In some cases
+  this can increase the parsing speed by ~5-10x.
+
+  .. ipython:: python
+
+     # Try to infer the format for the index column
+     df = pd.read_csv('foo.csv', index_col=0, parse_dates=True,
+                      infer_datetime_format=True)
+
 Experimental
 ~~~~~~~~~~~~
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f4bdde332ac81..5a0d975a473e1 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -947,7 +947,8 @@ def _from_arrays(cls, arrays, columns, index, dtype=None):
 
     @classmethod
     def from_csv(cls, path, header=0, sep=',', index_col=0,
-                 parse_dates=True, encoding=None, tupleize_cols=False):
+                 parse_dates=True, encoding=None, tupleize_cols=False,
+                 infer_datetime_format=False):
         """
         Read delimited file into DataFrame
 
@@ -966,6 +967,10 @@ def from_csv(cls, path, header=0, sep=',', index_col=0,
         tupleize_cols : boolean, default False
             write multi_index columns as a list of tuples (if True)
             or new (expanded format) if False)
+        infer_datetime_format: boolean, default False
+            If True and `parse_dates` is True for a column, try to infer the
+            datetime format based on the first datetime string. If the format
+            can be inferred, there often will be a large parsing speed-up.
 
         Notes
         -----
@@ -980,7 +985,8 @@ def from_csv(cls, path, header=0, sep=',', index_col=0,
         from pandas.io.parsers import read_table
         return read_table(path, header=header, sep=sep,
                           parse_dates=parse_dates, index_col=index_col,
-                          encoding=encoding, tupleize_cols=tupleize_cols)
+                          encoding=encoding, tupleize_cols=tupleize_cols,
+                          infer_datetime_format=infer_datetime_format)
 
     def to_sparse(self, fill_value=None, kind='block'):
         """
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 555208a7849dc..a3bf9be71af3c 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -2178,7 +2178,7 @@ def between(self, left, right, inclusive=True):
 
     @classmethod
     def from_csv(cls, path, sep=',', parse_dates=True, header=None,
-                 index_col=0, encoding=None):
+                 index_col=0, encoding=None, infer_datetime_format=False):
         """
         Read delimited file into Series
 
@@ -2197,6 +2197,10 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None,
         encoding : string, optional
             a string representing the encoding to use if the contents are
             non-ascii, for python versions prior to 3
+        infer_datetime_format: boolean, default False
+            If True and `parse_dates` is True for a column, try to infer the
+            datetime format based on the first datetime string. If the format
+            can be inferred, there often will be a large parsing speed-up.
 
         Returns
         -------
@@ -2205,7 +2209,8 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None,
         from pandas.core.frame import DataFrame
         df = DataFrame.from_csv(path, header=header, index_col=index_col,
                                 sep=sep, parse_dates=parse_dates,
-                                encoding=encoding)
+                                encoding=encoding,
+                                infer_datetime_format=infer_datetime_format)
         result = df.icol(0)
         result.index.name = result.name = None
         return result
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 689d4aab48758..6b0d56b5c383e 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -16,6 +16,7 @@
 from pandas.core.config import get_option
 from pandas.io.date_converters import generic_parser
 from pandas.io.common import get_filepath_or_buffer
+from pandas.tseries import tools
 
 from pandas.util.decorators import Appender
 
@@ -143,6 +144,9 @@
 warn_bad_lines: boolean, default True
     If error_bad_lines is False, and warn_bad_lines is True, a warning for each
     "bad line" will be output. (Only valid with C parser).
+infer_datetime_format : boolean, default False
+    If True and parse_dates is enabled for a column, attempt to infer
+    the datetime format to speed up the processing
 
 Returns
 -------
@@ -262,6 +266,7 @@ def _read(filepath_or_buffer, kwds):
     'compression': None,
     'mangle_dupe_cols': True,
     'tupleize_cols': False,
+    'infer_datetime_format': False,
 }
 
 
@@ -349,7 +354,8 @@ def parser_f(filepath_or_buffer,
                  encoding=None,
                  squeeze=False,
                  mangle_dupe_cols=True,
-                 tupleize_cols=False):
+                 tupleize_cols=False,
+                 infer_datetime_format=False):
 
         # Alias sep -> delimiter.
         if delimiter is None:
@@ -408,7 +414,8 @@ def parser_f(filepath_or_buffer,
                     low_memory=low_memory,
                     buffer_lines=buffer_lines,
                     mangle_dupe_cols=mangle_dupe_cols,
-                    tupleize_cols=tupleize_cols)
+                    tupleize_cols=tupleize_cols,
+                    infer_datetime_format=infer_datetime_format)
 
         return _read(filepath_or_buffer, kwds)
 
@@ -665,9 +672,13 @@ def __init__(self, kwds):
         self.true_values = kwds.get('true_values')
         self.false_values = kwds.get('false_values')
         self.tupleize_cols = kwds.get('tupleize_cols', False)
+        self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
 
-        self._date_conv = _make_date_converter(date_parser=self.date_parser,
-                                               dayfirst=self.dayfirst)
+        self._date_conv = _make_date_converter(
+            date_parser=self.date_parser,
+            dayfirst=self.dayfirst,
+            infer_datetime_format=self.infer_datetime_format
+        )
 
         # validate header options for mi
         self.header = kwds.get('header')
@@ -1178,6 +1189,10 @@ def TextParser(*args, **kwds):
         Encoding to use for UTF when reading/writing (ex. 'utf-8')
     squeeze : boolean, default False
         returns Series if only one column
+    infer_datetime_format: boolean, default False
+        If True and `parse_dates` is True for a column, try to infer the
+        datetime format based on the first datetime string. If the format
+        can be inferred, there often will be a large parsing speed-up.
     """
     kwds['engine'] = 'python'
     return TextFileReader(*args, **kwds)
@@ -1870,13 +1885,19 @@ def _get_lines(self, rows=None):
         return self._check_thousands(lines)
 
 
-def _make_date_converter(date_parser=None, dayfirst=False):
+def _make_date_converter(date_parser=None, dayfirst=False,
+                         infer_datetime_format=False):
     def converter(*date_cols):
         if date_parser is None:
             strs = _concat_date_cols(date_cols)
             try:
-                return tslib.array_to_datetime(com._ensure_object(strs),
-                                               utc=None, dayfirst=dayfirst)
+                return tools.to_datetime(
+                    com._ensure_object(strs),
+                    utc=None,
+                    box=False,
+                    dayfirst=dayfirst,
+                    infer_datetime_format=infer_datetime_format
+                )
             except:
                 return lib.try_parse_dates(strs, dayfirst=dayfirst)
         else:
diff --git a/vb_suite/io_bench.py b/vb_suite/io_bench.py
index 4fc14459e83f5..b70a060233dae 100644
--- a/vb_suite/io_bench.py
+++ b/vb_suite/io_bench.py
@@ -98,3 +98,36 @@ def create_cols(name):
 
 frame_to_csv_date_formatting = Benchmark(stmt, setup,
                                      start_date=datetime(2013, 9, 1))
+
+#----------------------------------------------------------------------
+# infer datetime format
+
+setup = common_setup + """
+rng = date_range('1/1/2000', periods=1000)
+data = '\\n'.join(rng.map(lambda x: x.strftime("%Y-%m-%d %H:%M:%S")))
+"""
+
+stmt = ("read_csv(StringIO(data), header=None, names=['foo'], "
+        "         parse_dates=['foo'], infer_datetime_format=True)")
+
+read_csv_infer_datetime_format_iso8601 = Benchmark(stmt, setup)
+
+setup = common_setup + """
+rng = date_range('1/1/2000', periods=1000)
+data = '\\n'.join(rng.map(lambda x: x.strftime("%Y%m%d")))
+"""
+
+stmt = ("read_csv(StringIO(data), header=None, names=['foo'], "
+        "         parse_dates=['foo'], infer_datetime_format=True)")
+
+read_csv_infer_datetime_format_ymd = Benchmark(stmt, setup)
+
+setup = common_setup + """
+rng = date_range('1/1/2000', periods=1000)
+data = '\\n'.join(rng.map(lambda x: x.strftime("%m/%d/%Y %H:%M:%S.%f")))
+"""
+
+stmt = ("read_csv(StringIO(data), header=None, names=['foo'], "
+        "         parse_dates=['foo'], infer_datetime_format=True)")
+
+read_csv_infer_datetime_format_custom = Benchmark(stmt, setup)