diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 377a9dea126e6..fc560782816ff 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -77,6 +77,7 @@ Enhancements Performance ~~~~~~~~~~~ +- Reduce memory usage when skiprows is an integer in read_csv (:issue:`8681`) .. _whatsnew_0152.experimental: @@ -99,6 +100,7 @@ Bug Fixes - ``sql_schema`` now generates dialect appropriate ``CREATE TABLE`` statements (:issue:`8697`) - ``slice`` string method now takes step into account (:issue:`8754`) - Bug in ``BlockManager`` where setting values with different type would break block integrity (:issue:`8850`) +- Bug in ``DatetimeIndex`` when using ``time`` object as key (:issue:`8667`) - Fix negative step support for label-based slices (:issue:`8753`) Old behavior: @@ -144,7 +146,7 @@ Bug Fixes - +- BUG: Option context applies on __enter__ (:issue:`8514`) @@ -153,8 +155,9 @@ Bug Fixes - Bug in `pd.infer_freq`/`DataFrame.inferred_freq` that prevented proper sub-daily frequency inference when the index contained DST days (:issue:`8772`). - Bug where index name was still used when plotting a series with ``use_index=False`` (:issue:`8558`). - - Bugs when trying to stack multiple columns, when some (or all) of the level names are numbers (:issue:`8584`). - Bug in ``MultiIndex`` where ``__contains__`` returns wrong result if index is not lexically sorted or unique (:issue:`7724`) +- BUG CSV: fix problem with trailing whitespace in skipped rows, (:issue:`8679`), (:issue:`8661`) +- Regression in ``Timestamp`` does not parse 'Z' zone designator for UTC (:issue:`8771`) diff --git a/pandas/core/common.py b/pandas/core/common.py index 759f5f1dfaf7a..6aff67412d677 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -404,9 +404,13 @@ def array_equivalent(left, right, strict_nan=False): Examples -------- - >>> array_equivalent(np.array([1, 2, nan]), np.array([1, 2, nan])) + >>> array_equivalent( + ... 
np.array([1, 2, np.nan]), + ... np.array([1, 2, np.nan])) True - >>> array_equivalent(np.array([1, nan, 2]), np.array([1, 2, nan])) + >>> array_equivalent( + ... np.array([1, np.nan, 2]), + ... np.array([1, 2, np.nan])) False """ @@ -2171,8 +2175,8 @@ def iterpairs(seq): Examples -------- - >>> iterpairs([1, 2, 3, 4]) - [(1, 2), (2, 3), (3, 4) + >>> list(iterpairs([1, 2, 3, 4])) + [(1, 2), (2, 3), (3, 4)] """ # input may not be sliceable seq_it = iter(seq) diff --git a/pandas/core/config.py b/pandas/core/config.py index 60dc1d7d0341e..b59e07251ce3e 100644 --- a/pandas/core/config.py +++ b/pandas/core/config.py @@ -51,6 +51,7 @@ import re from collections import namedtuple +from contextlib import contextmanager import warnings from pandas.compat import map, lmap, u import pandas.compat as compat @@ -384,19 +385,18 @@ def __init__(self, *args): 'option_context(pat, val, [(pat, val), ...)).' ) - ops = list(zip(args[::2], args[1::2])) + self.ops = list(zip(args[::2], args[1::2])) + + def __enter__(self): undo = [] - for pat, val in ops: + for pat, val in self.ops: undo.append((pat, _get_option(pat, silent=True))) self.undo = undo - for pat, val in ops: + for pat, val in self.ops: _set_option(pat, val, silent=True) - def __enter__(self): - pass - def __exit__(self, *args): if self.undo: for pat, val in self.undo: @@ -681,8 +681,6 @@ def pp(name, ks): # # helpers -from contextlib import contextmanager - @contextmanager def config_prefix(prefix): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 237012a71aeb4..a464b687209cb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3279,7 +3279,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None, Parameters ---------- other : DataFrame, or object coercible into a DataFrame - join : {'left', 'right', 'outer', 'inner'}, default 'left' + join : {'left'}, default 'left' overwrite : boolean, default True If True then overwrite values for common keys in the calling frame filter_func : 
callable(1d-array) -> 1d-array, default None diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7201428e6b935..7b9ce2b86f730 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2872,11 +2872,13 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, GroupBy object """ from pandas.core.groupby import groupby + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) - return groupby(self, by, axis=axis, level=level, as_index=as_index, - sort=sort, group_keys=group_keys, squeeze=squeeze) + return groupby(self, by=by, axis=axis, level=level, as_index=as_index, + sort=sort, group_keys=group_keys, squeeze=squeeze) def asfreq(self, freq, method=None, how=None, normalize=False): """ diff --git a/pandas/index.pyx b/pandas/index.pyx index 73d886f10b241..9be7e7404f3fe 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -545,8 +545,14 @@ cdef class DatetimeEngine(Int64Engine): val = _to_i8(val) return self._get_loc_duplicates(val) values = self._get_index_values() - conv = _to_i8(val) - loc = values.searchsorted(conv, side='left') + + try: + conv = _to_i8(val) + loc = values.searchsorted(conv, side='left') + except TypeError: + self._date_check_type(val) + raise KeyError(val) + if loc == len(values) or util.get_value_at(values, loc) != conv: raise KeyError(val) return loc diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 228dad984bb3c..59647b4c781e5 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -3048,6 +3048,29 @@ def test_comment_skiprows(self): df = self.read_csv(StringIO(data), comment='#', skiprows=4) tm.assert_almost_equal(df.values, expected) + def test_trailing_spaces(self): + data = """skip +random line with trailing spaces +skip +1,2,3 +1,2.,4. 
+random line with trailing tabs\t\t\t + +5.,NaN,10.0 +""" + expected = pd.DataFrame([[1., 2., 4.], + [5., np.nan, 10.]]) + # this should ignore six lines including lines with trailing + # whitespace and blank lines. issues 8661, 8679 + df = self.read_csv(StringIO(data.replace(',', ' ')), + header=None, delim_whitespace=True, + skiprows=[0,1,2,3,5,6], skip_blank_lines=True) + tm.assert_frame_equal(df, expected) + df = self.read_table(StringIO(data.replace(',', ' ')), + header=None, delim_whitespace=True, + skiprows=[0,1,2,3,5,6], skip_blank_lines=True) + tm.assert_frame_equal(df, expected) + def test_comment_header(self): data = """# empty # second empty line diff --git a/pandas/parser.pyx b/pandas/parser.pyx index afaa5219ab0cd..0409ee56f22bb 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -86,6 +86,7 @@ cdef extern from "parser/tokenizer.h": EAT_COMMENT EAT_LINE_COMMENT WHITESPACE_LINE + SKIP_LINE FINISHED enum: ERROR_OVERFLOW @@ -158,6 +159,7 @@ cdef extern from "parser/tokenizer.h": int header_end # header row end void *skipset + int64_t skip_first_N_rows int skip_footer double (*converter)(const char *, char **, char, char, char, int) @@ -181,6 +183,8 @@ cdef extern from "parser/tokenizer.h": void parser_free(parser_t *self) nogil int parser_add_skiprow(parser_t *self, int64_t row) + int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) + void parser_set_default_options(parser_t *self) int parser_consume_rows(parser_t *self, size_t nrows) @@ -524,10 +528,10 @@ cdef class TextReader: cdef _make_skiprow_set(self): if isinstance(self.skiprows, (int, np.integer)): - self.skiprows = range(self.skiprows) - - for i in self.skiprows: - parser_add_skiprow(self.parser, i) + parser_set_skipfirstnrows(self.parser, self.skiprows) + else: + for i in self.skiprows: + parser_add_skiprow(self.parser, i) cdef _setup_parser_source(self, source): cdef: diff --git a/pandas/src/datetime/np_datetime_strings.c b/pandas/src/datetime/np_datetime_strings.c index 
3f09de851e231..44363fd930510 100644 --- a/pandas/src/datetime/np_datetime_strings.c +++ b/pandas/src/datetime/np_datetime_strings.c @@ -363,7 +363,8 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, * to be cast to the 'unit' parameter. * * 'out' gets filled with the parsed date-time. - * 'out_local' gets whether returned value contains timezone. 0 for UTC, 1 for local time. + * 'out_local' gets set to 1 if the parsed time contains timezone, + * to 0 otherwise. * 'out_tzoffset' gets set to timezone offset by minutes * if the parsed time was in local time, * to 0 otherwise. The values 'now' and 'today' don't get counted @@ -785,11 +786,15 @@ parse_iso_8601_datetime(char *str, int len, /* UTC specifier */ if (*substr == 'Z') { - /* "Z" means not local */ + /* "Z" should be equivalent to tz offset "+00:00" */ if (out_local != NULL) { - *out_local = 0; + *out_local = 1; } + if (out_tzoffset != NULL) { + *out_tzoffset = 0; + } + if (sublen == 1) { goto finish; } diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 9a7303b6874db..fc96cc5429775 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -156,6 +156,7 @@ void parser_set_default_options(parser_t *self) { self->thousands = '\0'; self->skipset = NULL; + self-> skip_first_N_rows = -1; self->skip_footer = 0; } @@ -444,21 +445,17 @@ static int end_line(parser_t *self) { } } - if (self->skipset != NULL) { - k = kh_get_int64((kh_int64_t*) self->skipset, self->file_lines); - - if (k != ((kh_int64_t*)self->skipset)->n_buckets) { - TRACE(("Skipping row %d\n", self->file_lines)); - // increment file line count - self->file_lines++; - - // skip the tokens from this bad line - self->line_start[self->lines] += fields; + if (self->state == SKIP_LINE) { + TRACE(("Skipping row %d\n", self->file_lines)); + // increment file line count + self->file_lines++; + + // skip the tokens from this bad line + self->line_start[self->lines] += fields; - // reset 
field count - self->line_fields[self->lines] = 0; - return 0; - } + // reset field count + self->line_fields[self->lines] = 0; + return 0; } /* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */ @@ -556,6 +553,15 @@ int parser_add_skiprow(parser_t *self, int64_t row) { return 0; } +int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) { + // self->file_lines is zero based so subtract 1 from nrows + if (nrows > 0) { + self->skip_first_N_rows = nrows - 1; + } + + return 0; +} + static int parser_buffer_bytes(parser_t *self, size_t nbytes) { int status; size_t bytes_read; @@ -656,6 +662,15 @@ typedef int (*parser_op)(parser_t *self, size_t line_limit); TRACE(("datapos: %d, datalen: %d\n", self->datapos, self->datalen)); +int skip_this_line(parser_t *self, int64_t rownum) { + if (self->skipset != NULL) { + return ( kh_get_int64((kh_int64_t*) self->skipset, self->file_lines) != + ((kh_int64_t*)self->skipset)->n_buckets ); + } + else { + return ( rownum <= self->skip_first_N_rows ); + } +} int tokenize_delimited(parser_t *self, size_t line_limit) { @@ -688,10 +703,25 @@ int tokenize_delimited(parser_t *self, size_t line_limit) switch(self->state) { + case SKIP_LINE: +// TRACE(("tokenize_delimited SKIP_LINE %c, state %d\n", c, self->state)); + if (c == '\n') { + END_LINE(); + } + break; + case START_RECORD: // start of record - - if (c == '\n') { + if (skip_this_line(self, self->file_lines)) { + if (c == '\n') { + END_LINE() + } + else { + self->state = SKIP_LINE; + } + break; + } + else if (c == '\n') { // \n\r possible? 
if (self->skip_empty_lines) { @@ -1006,9 +1036,26 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) self->state)); switch(self->state) { + + case SKIP_LINE: +// TRACE(("tokenize_delim_customterm SKIP_LINE %c, state %d\n", c, self->state)); + if (c == self->lineterminator) { + END_LINE(); + } + break; + case START_RECORD: // start of record - if (c == self->lineterminator) { + if (skip_this_line(self, self->file_lines)) { + if (c == self->lineterminator) { + END_LINE() + } + else { + self->state = SKIP_LINE; + } + break; + } + else if (c == self->lineterminator) { // \n\r possible? if (self->skip_empty_lines) { @@ -1252,6 +1299,14 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) self->state)); switch(self->state) { + + case SKIP_LINE: +// TRACE(("tokenize_whitespace SKIP_LINE %c, state %d\n", c, self->state)); + if (c == '\n') { + END_LINE(); + } + break; + case WHITESPACE_LINE: if (c == '\n') { self->file_lines++; @@ -1283,9 +1338,17 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) case START_RECORD: // start of record - if (c == '\n') { - // \n\r possible? + if (skip_this_line(self, self->file_lines)) { + if (c == '\n') { + END_LINE() + } + else { + self->state = SKIP_LINE; + } + break; + } else if (c == '\n') { if (self->skip_empty_lines) + // \n\r possible? 
{ self->file_lines++; } diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 0947315fbe6b7..07f4153038dd8 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -127,6 +127,7 @@ typedef enum { EAT_COMMENT, EAT_LINE_COMMENT, WHITESPACE_LINE, + SKIP_LINE, FINISHED } ParserState; @@ -203,6 +204,7 @@ typedef struct parser_t { int header_end; // header row end void *skipset; + int64_t skip_first_N_rows; int skip_footer; double (*converter)(const char *, char **, char, char, char, int); @@ -240,6 +242,8 @@ int parser_trim_buffers(parser_t *self); int parser_add_skiprow(parser_t *self, int64_t row); +int parser_set_skipfirstnrows(parser_t *self, int64_t nrows); + void parser_free(parser_t *self); void parser_set_default_options(parser_t *self); diff --git a/pandas/tests/test_config.py b/pandas/tests/test_config.py index dc5e9a67bdb65..3a8fdd877f5a0 100644 --- a/pandas/tests/test_config.py +++ b/pandas/tests/test_config.py @@ -425,3 +425,24 @@ def f3(key): options.c = 1 self.assertEqual(len(holder), 1) + def test_option_context_scope(self): + # Ensure that creating a context does not affect the existing + # environment as it is supposed to be used with the `with` statement. + # See https://github.com/pydata/pandas/issues/8514 + + original_value = 60 + context_value = 10 + option_name = 'a' + + self.cf.register_option(option_name, original_value) + + # Ensure creating contexts didn't affect the current context. + ctx = self.cf.option_context(option_name, context_value) + self.assertEqual(self.cf.get_option(option_name), original_value) + + # Ensure the correct value is available inside the context. 
+ with ctx: + self.assertEqual(self.cf.get_option(option_name), context_value) + + # Ensure the current context is reset + self.assertEqual(self.cf.get_option(option_name), original_value) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index ef3fc03fc8d22..cd768423e492a 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1961,6 +1961,9 @@ def test_groupby_level(self): # raise exception for non-MultiIndex self.assertRaises(ValueError, self.df.groupby, level=1) + + + def test_groupby_level_index_names(self): ## GH4014 this used to raise ValueError since 'exp'>1 (in py2) df = DataFrame({'exp' : ['A']*3 + ['B']*3, 'var1' : lrange(6),}).set_index('exp') @@ -1999,6 +2002,17 @@ def test_groupby_level_apply(self): result = frame['A'].groupby(level=0).count() self.assertEqual(result.index.name, 'first') + def test_groupby_args(self): + #PR8618 and issue 8015 + frame = self.mframe + def j(): + frame.groupby() + self.assertRaisesRegexp(TypeError, "You have to supply one of 'by' and 'level'", j) + + def k(): + frame.groupby(by=None, level=None) + self.assertRaisesRegexp(TypeError, "You have to supply one of 'by' and 'level'", k) + def test_groupby_level_mapper(self): frame = self.mframe deleveled = frame.reset_index() @@ -3689,8 +3703,9 @@ def test_cumcount(self): assert_series_equal(expected, sg.cumcount()) def test_cumcount_empty(self): - ge = DataFrame().groupby() - se = Series().groupby() + dfe = DataFrame() + ge = dfe.groupby(dfe.index) + se = Series().groupby(by=1, level=2) e = Series(dtype='int64') # edge case, as this is usually considered float diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index b7a18da3924c8..3c57dd764e3aa 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -1886,6 +1886,27 @@ def test_reindex_preserves_tz_if_target_is_empty_list_or_array(self): self.assertEqual(str(index.reindex([])[0].tz), 'US/Eastern') 
self.assertEqual(str(index.reindex(np.array([]))[0].tz), 'US/Eastern') + def test_time_loc(self): # GH8667 + from datetime import time + from pandas.index import _SIZE_CUTOFF + + ns = _SIZE_CUTOFF + np.array([-100, 100],dtype=np.int64) + key = time(15, 11, 30) + start = key.hour * 3600 + key.minute * 60 + key.second + step = 24 * 3600 + + for n in ns: + idx = pd.date_range('2014-11-26', periods=n, freq='S') + ts = pd.Series(np.random.randn(n), index=idx) + i = np.arange(start, n, step) + + tm.assert_array_equal(ts.index.get_loc(key), i) + tm.assert_series_equal(ts[key], ts.iloc[i]) + + left, right = ts.copy(), ts.copy() + left[key] *= -10 + right.iloc[i] *= -10 + tm.assert_series_equal(left, right) class TestPeriodIndex(Base, tm.TestCase): _holder = PeriodIndex diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 202e30cc2eb5e..e7c001ac57c0a 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1210,6 +1210,10 @@ def get_value(self, series, key): return self.get_value_maybe_box(series, key) + if isinstance(key, time): + locs = self.indexer_at_time(key) + return series.take(locs) + try: return _maybe_box(self, Index.get_value(self, series, key), series, key) except KeyError: @@ -1219,10 +1223,6 @@ def get_value(self, series, key): except (TypeError, ValueError, KeyError): pass - if isinstance(key, time): - locs = self.indexer_at_time(key) - return series.take(locs) - try: return self.get_value_maybe_box(series, key) except (TypeError, ValueError, KeyError): @@ -1250,6 +1250,9 @@ def get_loc(self, key): stamp = Timestamp(key, tz=self.tz) return self._engine.get_loc(stamp) + if isinstance(key, time): + return self.indexer_at_time(key) + try: return Index.get_loc(self, key) except (KeyError, ValueError): @@ -1258,9 +1261,6 @@ def get_loc(self, key): except (TypeError, KeyError, ValueError): pass - if isinstance(key, time): - return self.indexer_at_time(key) - try: stamp = Timestamp(key, tz=self.tz) return self._engine.get_loc(stamp) 
diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 9adcbb4ea4a41..6c358bd99e620 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -6,7 +6,7 @@ import datetime from pandas.core.api import Timestamp, Series -from pandas.tslib import period_asfreq, period_ordinal +from pandas.tslib import period_asfreq, period_ordinal, get_timezone from pandas.tseries.index import date_range from pandas.tseries.frequencies import get_freq import pandas.tseries.offsets as offsets @@ -298,6 +298,9 @@ def test_barely_oob_dts(self): # One us more than the maximum is an error self.assertRaises(ValueError, Timestamp, max_ts_us + one_us) + def test_utc_z_designator(self): + self.assertEqual(get_timezone(Timestamp('2014-11-02 01:00Z').tzinfo), 'UTC') + class TestDatetimeParsingWrappers(tm.TestCase): def test_does_not_convert_mixed_integer(self): diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 45bea00ac104f..f29ab14ed8745 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -177,7 +177,7 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, format=None, coerce=False, unit='ns', infer_datetime_format=False): """ - Convert argument to datetime + Convert argument to datetime. Parameters ---------- @@ -198,13 +198,16 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, coerce : force errors to NaT (False by default) unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch (e.g. a unix timestamp), which is an integer/float number - infer_datetime_format: boolean, default False + infer_datetime_format : boolean, default False If no `format` is given, try to infer the format based on the first datetime string. Provides a large speed-up in many cases. Returns ------- - ret : datetime if parsing succeeded + ret : datetime if parsing succeeded. 
Return type depends on input: + - list-like: DatetimeIndex + - Series: Series of datetime64 dtype + - scalar: Timestamp Examples -------- diff --git a/vb_suite/io_bench.py b/vb_suite/io_bench.py index 0b9f68f0e6ed5..a70c543ca59eb 100644 --- a/vb_suite/io_bench.py +++ b/vb_suite/io_bench.py @@ -21,6 +21,22 @@ read_csv_standard = Benchmark("read_csv('__test__.csv')", setup1, start_date=datetime(2011, 9, 15)) +#---------------------------------- +# skiprows + +setup1 = common_setup + """ +index = tm.makeStringIndex(20000) +df = DataFrame({'float1' : randn(20000), + 'float2' : randn(20000), + 'string1' : ['foo'] * 20000, + 'bool1' : [True] * 20000, + 'int1' : np.random.randint(0, 200000, size=20000)}, + index=index) +df.to_csv('__test__.csv') +""" + +read_csv_skiprows = Benchmark("read_csv('__test__.csv', skiprows=10000)", setup1, + start_date=datetime(2011, 9, 15)) #---------------------------------------------------------------------- # write_csv