From 78bb4671ed5eba36188b48792496d900440a68cd Mon Sep 17 00:00:00 2001 From: Dan Birken Date: Thu, 23 Jan 2014 18:07:24 -0800 Subject: [PATCH 1/2] PERF: Speed up pd.to_datetime() by optionally inferring dt format #5490 Given an array of strings that represent datetimes, infer_format=True will attempt to guess the format of the datetimes, and if it can infer the format, it will use a faster function to convert/import the datetimes. In cases where this speed-up can be used, the function should be about 10x faster. --- pandas/tseries/tests/test_timeseries.py | 183 +++++++++++++++++++++++- pandas/tseries/tools.py | 178 +++++++++++++++++++++-- 2 files changed, 347 insertions(+), 14 deletions(-) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index bae93602cb840..8cce0162e0854 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -18,6 +18,7 @@ from pandas.core.daterange import DateRange import pandas.core.datetools as datetools import pandas.tseries.offsets as offsets +import pandas.tseries.tools as tools import pandas.tseries.frequencies as fmod import pandas as pd @@ -49,6 +50,11 @@ def _skip_if_no_pytz(): except ImportError: raise nose.SkipTest("pytz not installed") +def _skip_if_has_locale(): + import locale + lang, _ = locale.getlocale() + if lang is not None: + raise nose.SkipTest("Specific locale is set {0}".format(lang)) class TestTimeSeriesDuplicates(tm.TestCase): _multiprocess_can_split_ = True @@ -909,12 +915,8 @@ def test_to_datetime_on_datetime64_series(self): self.assertEquals(result[0], s[0]) def test_to_datetime_with_apply(self): - # this is only locale tested with US/None locales - import locale - (lang,encoding) = locale.getlocale() - if lang is not None: - raise nose.SkipTest("format codes cannot work with a locale of {0}".format(lang)) + _skip_if_has_locale() # GH 5195 # with a format and coerce a single item to_datetime fails @@ -3124,6 +3126,177 @@ def 
test_date_range_fy5252(self): self.assertEqual(dr[1], Timestamp('2014-01-30')) +class TestToDatetimeInferFormat(tm.TestCase): + def test_to_datetime_infer_datetime_format_consistent_format(self): + time_series = pd.Series( + pd.date_range('20000101', periods=50, freq='H') + ) + + test_formats = [ + '%m-%d-%Y', + '%m/%d/%Y %H:%M:%S.%f', + '%Y-%m-%dT%H:%M:%S.%f', + ] + + for test_format in test_formats: + s_as_dt_strings = time_series.apply( + lambda x: x.strftime(test_format) + ) + + with_format = pd.to_datetime(s_as_dt_strings, format=test_format) + no_infer = pd.to_datetime( + s_as_dt_strings, infer_datetime_format=False + ) + yes_infer = pd.to_datetime( + s_as_dt_strings, infer_datetime_format=True + ) + + # Whether the format is explicitly passed, it is inferred, or + # it is not inferred, the results should all be the same + self.assert_(np.array_equal(with_format, no_infer)) + self.assert_(np.array_equal(no_infer, yes_infer)) + + def test_to_datetime_infer_datetime_format_inconsistent_format(self): + test_series = pd.Series( + np.array([ + '01/01/2011 00:00:00', + '01-02-2011 00:00:00', + '2011-01-03T00:00:00', + ])) + + # When the format is inconsistent, infer_datetime_format should just + # fallback to the default parsing + self.assert_(np.array_equal( + pd.to_datetime(test_series, infer_datetime_format=False), + pd.to_datetime(test_series, infer_datetime_format=True) + )) + + test_series = pd.Series( + np.array([ + 'Jan/01/2011', + 'Feb/01/2011', + 'Mar/01/2011', + ])) + + self.assert_(np.array_equal( + pd.to_datetime(test_series, infer_datetime_format=False), + pd.to_datetime(test_series, infer_datetime_format=True) + )) + + def test_to_datetime_infer_datetime_format_series_with_nans(self): + test_series = pd.Series( + np.array([ + '01/01/2011 00:00:00', + np.nan, + '01/03/2011 00:00:00', + np.nan, + ])) + + self.assert_(np.array_equal( + pd.to_datetime(test_series, infer_datetime_format=False), + pd.to_datetime(test_series, infer_datetime_format=True) + 
)) + + def test_to_datetime_infer_datetime_format_series_starting_with_nans(self): + test_series = pd.Series( + np.array([ + np.nan, + np.nan, + '01/01/2011 00:00:00', + '01/02/2011 00:00:00', + '01/03/2011 00:00:00', + ])) + + self.assert_(np.array_equal( + pd.to_datetime(test_series, infer_datetime_format=False), + pd.to_datetime(test_series, infer_datetime_format=True) + )) + + +class TestGuessDatetimeFormat(tm.TestCase): + def test_guess_datetime_format_with_parseable_formats(self): + dt_string_to_format = ( + ('20111230', '%Y%m%d'), + ('2011-12-30', '%Y-%m-%d'), + ('30-12-2011', '%d-%m-%Y'), + ('2011-12-30 00:00:00', '%Y-%m-%d %H:%M:%S'), + ('2011-12-30T00:00:00', '%Y-%m-%dT%H:%M:%S'), + ('2011-12-30 00:00:00.000000', '%Y-%m-%d %H:%M:%S.%f'), + ) + + for dt_string, dt_format in dt_string_to_format: + self.assertEquals( + tools._guess_datetime_format(dt_string), + dt_format + ) + + def test_guess_datetime_format_with_dayfirst(self): + ambiguous_string = '01/01/2011' + self.assertEquals( + tools._guess_datetime_format(ambiguous_string, dayfirst=True), + '%d/%m/%Y' + ) + self.assertEquals( + tools._guess_datetime_format(ambiguous_string, dayfirst=False), + '%m/%d/%Y' + ) + + def test_guess_datetime_format_with_locale_specific_formats(self): + # The month names will vary depending on the locale, in which + # case these wont be parsed properly (dateutil can't parse them) + _skip_if_has_locale() + + dt_string_to_format = ( + ('30/Dec/2011', '%d/%b/%Y'), + ('30/December/2011', '%d/%B/%Y'), + ('30/Dec/2011 00:00:00', '%d/%b/%Y %H:%M:%S'), + ) + + for dt_string, dt_format in dt_string_to_format: + self.assertEquals( + tools._guess_datetime_format(dt_string), + dt_format + ) + + def test_guess_datetime_format_invalid_inputs(self): + # A datetime string must include a year, month and a day for it + # to be guessable, in addition to being a string that looks like + # a datetime + invalid_dts = [ + '2013', + '01/2013', + '12:00:00', + '1/1/1/1', + 'this_is_not_a_datetime', 
+ '51a', + 9, + datetime(2011, 1, 1), + ] + + for invalid_dt in invalid_dts: + self.assertTrue(tools._guess_datetime_format(invalid_dt) is None) + + def test_guess_datetime_format_for_array(self): + expected_format = '%Y-%m-%d %H:%M:%S.%f' + dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format) + + test_arrays = [ + np.array([dt_string, dt_string, dt_string], dtype='O'), + np.array([np.nan, np.nan, dt_string], dtype='O'), + np.array([dt_string, 'random_string'], dtype='O'), + ] + + for test_array in test_arrays: + self.assertEqual( + tools._guess_datetime_format_for_array(test_array), + expected_format + ) + + format_for_string_of_nans = tools._guess_datetime_format_for_array( + np.array([np.nan, np.nan, np.nan], dtype='O') + ) + self.assertTrue(format_for_string_of_nans is None) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 2d4f27cb12ece..6761b5cbb04b0 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -24,6 +24,21 @@ print('Please install python-dateutil via easy_install or some method!') raise # otherwise a 2nd import won't show the message +_DATEUTIL_LEXER_SPLIT = None +try: + # Since these are private methods from dateutil, it is safely imported + # here so in case this interface changes, pandas will just fallback + # to not using the functionality + from dateutil.parser import _timelex + + if hasattr(_timelex, 'split'): + def _lexer_split_from_str(dt_str): + # The StringIO(str(_)) is for dateutil 2.2 compatibility + return _timelex.split(StringIO(str(dt_str))) + + _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str +except (ImportError, AttributeError): + pass def _infer_tzinfo(start, end): def _infer(a, b): @@ -50,9 +65,126 @@ def _maybe_get_tz(tz): tz = pytz.FixedOffset(tz / 60) return tz +def _guess_datetime_format(dt_str, dayfirst=False, + dt_str_parse=compat.parse_date, + 
dt_str_split=_DATEUTIL_LEXER_SPLIT): + """ + Guess the datetime format of a given datetime string. + + Parameters + ---------- + dt_str : string, datetime string to guess the format of + dayfirst : boolean, default False + If True parses dates with the day first, e.g. 20/01/2005 + Warning: dayfirst=True is not strict, but will prefer to parse + with day first (this is a known bug). + dt_str_parse : function, defaults to `compat.parse_date` (dateutil) + This function should take in a datetime string and return + a `datetime.datetime` guess that the datetime string represents + dt_str_split : function, defaults to `_DATEUTIL_LEXER_SPLIT` (dateutil) + This function should take in a datetime string and return + a list of strings, the guess of the various specific parts + e.g. '2011/12/30' -> ['2011', '/', '12', '/', '30'] + + Returns + ------- + ret : datetime format string (for `strftime` or `strptime`) + """ + if dt_str_parse is None or dt_str_split is None: + return None + + if not isinstance(dt_str, compat.string_types): + return None + + day_attribute_and_format = (('day',), '%d') + + datetime_attrs_to_format = [ + (('year', 'month', 'day'), '%Y%m%d'), + (('year',), '%Y'), + (('month',), '%B'), + (('month',), '%b'), + (('month',), '%m'), + day_attribute_and_format, + (('hour',), '%H'), + (('minute',), '%M'), + (('second',), '%S'), + (('microsecond',), '%f'), + (('second', 'microsecond'), '%S.%f'), + ] + + if dayfirst: + datetime_attrs_to_format.remove(day_attribute_and_format) + datetime_attrs_to_format.insert(0, day_attribute_and_format) + + try: + parsed_datetime = dt_str_parse(dt_str, dayfirst=dayfirst) + except: + # In case the datetime can't be parsed, its format cannot be guessed + return None + + if parsed_datetime is None: + return None + + try: + tokens = dt_str_split(dt_str) + except: + # In case the datetime string can't be split, its format cannot + # be guessed + return None + + format_guess = [None] * len(tokens) + found_attrs = set() + + for attrs,
attr_format in datetime_attrs_to_format: + # If a given attribute has been placed in the format string, skip + # over other formats for that same underlying attribute (IE, month + # can be represented in multiple different ways) + if set(attrs) & found_attrs: + continue + + if all(getattr(parsed_datetime, attr) is not None for attr in attrs): + for i, token_format in enumerate(format_guess): + if (token_format is None and + tokens[i] == parsed_datetime.strftime(attr_format)): + format_guess[i] = attr_format + found_attrs.update(attrs) + break + + # Only consider it a valid guess if we have a year, month and day + if len(set(['year', 'month', 'day']) & found_attrs) != 3: + return None + + output_format = [] + for i, guess in enumerate(format_guess): + if guess is not None: + # Either fill in the format placeholder (like %Y) + output_format.append(guess) + else: + # Or just the token separate (IE, the dashes in "01-01-2013") + try: + # If the token is numeric, then we likely didn't parse it + # properly, so our guess is wrong + float(tokens[i]) + return None + except ValueError: + pass + + output_format.append(tokens[i]) + + guessed_format = ''.join(output_format) + + if parsed_datetime.strftime(guessed_format) == dt_str: + return guessed_format + +def _guess_datetime_format_for_array(arr, **kwargs): + # Try to guess the format based on the first non-NaN element + non_nan_elements = com.notnull(arr).nonzero()[0] + if len(non_nan_elements): + return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, - format=None, coerce=False, unit='ns'): + format=None, coerce=False, unit='ns', + infer_datetime_format=False): """ Convert argument to datetime @@ -75,6 +207,9 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, coerce : force errors to NaT (False by default) unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch (e.g. 
a unix timestamp), which is an integer/float number + infer_datetime_format: boolean, default False + If no `format` is given, try to infer the format based on the first + datetime string. Provides a large speed-up in many cases. Returns ------- @@ -98,7 +233,7 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, from pandas.core.series import Series from pandas.tseries.index import DatetimeIndex - def _convert_listlike(arg, box): + def _convert_listlike(arg, box, format): if isinstance(arg, (list,tuple)): arg = np.array(arg, dtype='O') @@ -113,10 +248,26 @@ def _convert_listlike(arg, box): return arg arg = com._ensure_object(arg) - try: + + if infer_datetime_format and format is None: + format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) + if format is not None: - result = None + # There is a special fast-path for iso8601 formatted + # datetime strings, so in those cases don't use the inferred + # format because this path makes process slower in this + # special case + format_is_iso8601 = ( + '%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or + '%Y-%m-%d %H:%M:%S.%f'.startswith(format) + ) + if format_is_iso8601: + format = None + try: + result = None + + if format is not None: # shortcut formatting here if format == '%Y%m%d': try: @@ -127,15 +278,24 @@ def _convert_listlike(arg, box): # fallback if result is None: try: - result = tslib.array_strptime(arg, format, coerce=coerce) + result = tslib.array_strptime( + arg, format, coerce=coerce + ) except (tslib.OutOfBoundsDatetime): if errors == 'raise': raise result = arg - else: + except ValueError: + # Only raise this error if the user provided the + # datetime format, and not when it was inferred + if not infer_datetime_format: + raise + + if result is None and (format is None or infer_datetime_format): result = tslib.array_to_datetime(arg, raise_=errors == 'raise', utc=utc, dayfirst=dayfirst, coerce=coerce, unit=unit) + if com.is_datetime64_dtype(result) and box: result = 
DatetimeIndex(result, tz='utc' if utc else None) return result @@ -152,12 +312,12 @@ def _convert_listlike(arg, box): elif isinstance(arg, Timestamp): return arg elif isinstance(arg, Series): - values = _convert_listlike(arg.values, box=False) + values = _convert_listlike(arg.values, False, format) return Series(values, index=arg.index, name=arg.name) elif com.is_list_like(arg): - return _convert_listlike(arg, box=box) + return _convert_listlike(arg, box, format) - return _convert_listlike(np.array([ arg ]), box=box)[0] + return _convert_listlike(np.array([ arg ]), box, format)[0] class DateParseError(ValueError): pass From 879f270c120a2c0f63de449ab6fd1bcff2628175 Mon Sep 17 00:00:00 2001 From: Dan Birken Date: Thu, 23 Jan 2014 18:07:31 -0800 Subject: [PATCH 2/2] PERF: Add infer_datetime_format to read_csv() #5490 This allows read_csv() to attempt to infer the datetime format for any columns where parse_dates is enabled. In cases where the datetime format can be inferred, this should speed up processing datetimes by ~10x. Additionally add documentation and benchmarks for read_csv(). --- doc/source/io.rst | 34 ++++++++++++++++++++++++++++++++++ doc/source/v0.13.1.txt | 14 ++++++++++++++ pandas/core/frame.py | 10 ++++++++-- pandas/core/series.py | 9 +++++++-- pandas/io/parsers.py | 35 ++++++++++++++++++++++++++++------- vb_suite/io_bench.py | 33 +++++++++++++++++++++++++++++++++ 6 files changed, 124 insertions(+), 11 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index fdad28688b7c6..e0ed2b930f449 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -500,6 +500,40 @@ a single date rather than the entire array. .. _io.dayfirst: + +Inferring Datetime Format +~~~~~~~~~~~~~~~~~~~~~~~~~ +If you have `parse_dates` enabled for some or all of your columns, and your +datetime strings are all formatted the same way, you may get a large speed +up by setting `infer_datetime_format=True`. 
If set, pandas will attempt +to guess the format of your datetime strings, and then use a faster means +of parsing the strings. Parsing speed-ups of 5-10x have been observed. Pandas +will fall back to the usual parsing if either the format cannot be guessed +or the format that was guessed cannot properly parse the entire column +of strings. So in general, `infer_datetime_format` should not have any +negative consequences if enabled. + +Here are some examples of datetime strings that can be guessed (all +representing December 30th, 2011 at 00:00:00) + +"20111230" +"2011/12/30" +"20111230 00:00:00" +"12/30/2011 00:00:00" +"30/Dec/2011 00:00:00" +"30/December/2011 00:00:00" + +`infer_datetime_format` is sensitive to `dayfirst`. With `dayfirst=True`, it +will guess "01/12/2011" to be December 1st. With `dayfirst=False` (default) +it will guess "01/12/2011" to be January 12th. + +.. ipython:: python + + # Try to infer the format for the index column + df = pd.read_csv('foo.csv', index_col=0, parse_dates=True, + infer_datetime_format=True) + + International Date Formats ~~~~~~~~~~~~~~~~~~~~~~~~~~ While US date formats tend to be MM/DD/YYYY, many international formats use diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt index ee9e16d5f02f6..c877724613ef4 100644 --- a/doc/source/v0.13.1.txt +++ b/doc/source/v0.13.1.txt @@ -107,6 +107,20 @@ Enhancements result result.loc[:,:,'ItemA'] +- Added optional `infer_datetime_format` to `read_csv`, `Series.from_csv` and + `DataFrame.from_csv` (:issue:`5490`) + + If `parse_dates` is enabled and this flag is set, pandas will attempt to + infer the format of the datetime strings in the columns, and if it can + be inferred, switch to a faster method of parsing them. In some cases + this can increase the parsing speed by ~5-10x. + + ..
ipython:: python + + # Try to infer the format for the index column + df = pd.read_csv('foo.csv', index_col=0, parse_dates=True, + infer_datetime_format=True) + Experimental ~~~~~~~~~~~~ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f4bdde332ac81..5a0d975a473e1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -947,7 +947,8 @@ def _from_arrays(cls, arrays, columns, index, dtype=None): @classmethod def from_csv(cls, path, header=0, sep=',', index_col=0, - parse_dates=True, encoding=None, tupleize_cols=False): + parse_dates=True, encoding=None, tupleize_cols=False, + infer_datetime_format=False): """ Read delimited file into DataFrame @@ -966,6 +967,10 @@ def from_csv(cls, path, header=0, sep=',', index_col=0, tupleize_cols : boolean, default False write multi_index columns as a list of tuples (if True) or new (expanded format) if False) + infer_datetime_format: boolean, default False + If True and `parse_dates` is True for a column, try to infer the + datetime format based on the first datetime string. If the format + can be inferred, there often will be a large parsing speed-up. 
Notes ----- @@ -980,7 +985,8 @@ def from_csv(cls, path, header=0, sep=',', index_col=0, from pandas.io.parsers import read_table return read_table(path, header=header, sep=sep, parse_dates=parse_dates, index_col=index_col, - encoding=encoding, tupleize_cols=tupleize_cols) + encoding=encoding, tupleize_cols=tupleize_cols, + infer_datetime_format=infer_datetime_format) def to_sparse(self, fill_value=None, kind='block'): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 555208a7849dc..a3bf9be71af3c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2178,7 +2178,7 @@ def between(self, left, right, inclusive=True): @classmethod def from_csv(cls, path, sep=',', parse_dates=True, header=None, - index_col=0, encoding=None): + index_col=0, encoding=None, infer_datetime_format=False): """ Read delimited file into Series @@ -2197,6 +2197,10 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None, encoding : string, optional a string representing the encoding to use if the contents are non-ascii, for python versions prior to 3 + infer_datetime_format: boolean, default False + If True and `parse_dates` is True for a column, try to infer the + datetime format based on the first datetime string. If the format + can be inferred, there often will be a large parsing speed-up. 
Returns ------- @@ -2205,7 +2209,8 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None, from pandas.core.frame import DataFrame df = DataFrame.from_csv(path, header=header, index_col=index_col, sep=sep, parse_dates=parse_dates, - encoding=encoding) + encoding=encoding, + infer_datetime_format=infer_datetime_format) result = df.icol(0) result.index.name = result.name = None return result diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 689d4aab48758..6b0d56b5c383e 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -16,6 +16,7 @@ from pandas.core.config import get_option from pandas.io.date_converters import generic_parser from pandas.io.common import get_filepath_or_buffer +from pandas.tseries import tools from pandas.util.decorators import Appender @@ -143,6 +144,9 @@ warn_bad_lines: boolean, default True If error_bad_lines is False, and warn_bad_lines is True, a warning for each "bad line" will be output. (Only valid with C parser). +infer_datetime_format : boolean, default False + If True and parse_dates is enabled for a column, attempt to infer + the datetime format to speed up the processing Returns ------- @@ -262,6 +266,7 @@ def _read(filepath_or_buffer, kwds): 'compression': None, 'mangle_dupe_cols': True, 'tupleize_cols': False, + 'infer_datetime_format': False, } @@ -349,7 +354,8 @@ def parser_f(filepath_or_buffer, encoding=None, squeeze=False, mangle_dupe_cols=True, - tupleize_cols=False): + tupleize_cols=False, + infer_datetime_format=False): # Alias sep -> delimiter. 
if delimiter is None: @@ -408,7 +414,8 @@ def parser_f(filepath_or_buffer, low_memory=low_memory, buffer_lines=buffer_lines, mangle_dupe_cols=mangle_dupe_cols, - tupleize_cols=tupleize_cols) + tupleize_cols=tupleize_cols, + infer_datetime_format=infer_datetime_format) return _read(filepath_or_buffer, kwds) @@ -665,9 +672,13 @@ def __init__(self, kwds): self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') self.tupleize_cols = kwds.get('tupleize_cols', False) + self.infer_datetime_format = kwds.pop('infer_datetime_format', False) - self._date_conv = _make_date_converter(date_parser=self.date_parser, - dayfirst=self.dayfirst) + self._date_conv = _make_date_converter( + date_parser=self.date_parser, + dayfirst=self.dayfirst, + infer_datetime_format=self.infer_datetime_format + ) # validate header options for mi self.header = kwds.get('header') @@ -1178,6 +1189,10 @@ def TextParser(*args, **kwds): Encoding to use for UTF when reading/writing (ex. 'utf-8') squeeze : boolean, default False returns Series if only one column + infer_datetime_format: boolean, default False + If True and `parse_dates` is True for a column, try to infer the + datetime format based on the first datetime string. If the format + can be inferred, there often will be a large parsing speed-up. 
""" kwds['engine'] = 'python' return TextFileReader(*args, **kwds) @@ -1870,13 +1885,19 @@ def _get_lines(self, rows=None): return self._check_thousands(lines) -def _make_date_converter(date_parser=None, dayfirst=False): +def _make_date_converter(date_parser=None, dayfirst=False, + infer_datetime_format=False): def converter(*date_cols): if date_parser is None: strs = _concat_date_cols(date_cols) try: - return tslib.array_to_datetime(com._ensure_object(strs), - utc=None, dayfirst=dayfirst) + return tools.to_datetime( + com._ensure_object(strs), + utc=None, + box=False, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format + ) except: return lib.try_parse_dates(strs, dayfirst=dayfirst) else: diff --git a/vb_suite/io_bench.py b/vb_suite/io_bench.py index 4fc14459e83f5..b70a060233dae 100644 --- a/vb_suite/io_bench.py +++ b/vb_suite/io_bench.py @@ -98,3 +98,36 @@ def create_cols(name): frame_to_csv_date_formatting = Benchmark(stmt, setup, start_date=datetime(2013, 9, 1)) + +#---------------------------------------------------------------------- +# infer datetime format + +setup = common_setup + """ +rng = date_range('1/1/2000', periods=1000) +data = '\\n'.join(rng.map(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))) +""" + +stmt = ("read_csv(StringIO(data), header=None, names=['foo'], " + " parse_dates=['foo'], infer_datetime_format=True)") + +read_csv_infer_datetime_format_iso8601 = Benchmark(stmt, setup) + +setup = common_setup + """ +rng = date_range('1/1/2000', periods=1000) +data = '\\n'.join(rng.map(lambda x: x.strftime("%Y%m%d"))) +""" + +stmt = ("read_csv(StringIO(data), header=None, names=['foo'], " + " parse_dates=['foo'], infer_datetime_format=True)") + +read_csv_infer_datetime_format_ymd = Benchmark(stmt, setup) + +setup = common_setup + """ +rng = date_range('1/1/2000', periods=1000) +data = '\\n'.join(rng.map(lambda x: x.strftime("%m/%d/%Y %H:%M:%S.%f"))) +""" + +stmt = ("read_csv(StringIO(data), header=None, names=['foo'], " + " 
parse_dates=['foo'], infer_datetime_format=True)") + +read_csv_infer_datetime_format_custom = Benchmark(stmt, setup)