Skip to content

Iterating through TableIterator with where clause can incorrectly ignore data #8014

Closed
@bboerner

Description

@bboerner

Expected behaviour: For an appendable table stored using HDFStore, the summed length of the DataFrames returned by an iterator with a where clause should equal the length of the DataFrame returned using the same where clause but with iterator=False, e.g. TableIterator.get_values().

The attached code generates appendable tables of size 100064, 200064, ..., 400064. It uses a where clause whose range is a superset of all possible index values, and retrieves DataFrames with iterator=False, with and without the where clause, and with iterator=True, also with and without the where clause. In all cases except iterator=True with the where clause, the length of the returned DataFrames is correct.

For the failure cases, closer inspection in IPython shows that it is the last 64 rows that are not being returned.

Note: in create_file() the appending of DataFrames with lengths of 58689 and 41375 was chosen specifically to reproduce the problem. I originally encountered the problem with a dataset with length 174000064 and the last append was size 41375. I attempted to reproduce the problem by creating various length tables in chunks of 100000 with a final append of 64 and wasn't able to do so.

Creating the table with a last chunk of 41375 rows and a total length exceeding 300000 does reproduce the problem in my tests.

Output:

iteration: 0 PASSED
expected: 100064, df len: 100064, it (no where clause) len: 100064, it len: 100064
iteration: 1 PASSED
expected: 200064, df len: 200064, it (no where clause) len: 200064, it len: 200064
iteration: 2 FAILED
expected: 300064, df len: 300064, it (no where clause) len: 300064, it len: 300000
iteration: 3 FAILED
expected: 400064, df len: 400064, it (no where clause) len: 400064, it len: 400000

pd.show_versions()

INSTALLED VERSIONS

commit: None
python: 2.7.6.final.0
python-bits: 64
OS: Linux
OS-release: 3.13.0-32-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8

pandas: 0.13.1
Cython: 0.20.1
numpy: 1.8.1
scipy: 0.14.0
statsmodels: None
IPython: 1.2.1
sphinx: None
patsy: None
scikits.timeseries: None
dateutil: 1.5
pytz: 2012c
bottleneck: 0.8.0
tables: 3.1.1
numexpr: 2.4
matplotlib: 1.3.1
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
sqlalchemy: None
lxml: 3.3.3
bs4: 4.3.2
html5lib: 0.999
bq: None
apiclient: None

import os
from dateutil.relativedelta import relativedelta

import numpy as np
randn = np.random.randn
from pandas import DataFrame, HDFStore, date_range

def create_df(beg_dt, periods=1e5):
    """Build a random quote DataFrame indexed by one-second timestamps.

    Parameters
    ----------
    beg_dt : first timestamp of the index (anything ``date_range`` accepts
        as a start value).
    periods : number of rows; coerced to ``int`` so that the float default
        (``1e5``) and float arguments from callers are accepted.

    Returns
    -------
    DataFrame with ``periods`` rows and the four columns ``bid_price``,
    ``bid_vol``, ``ask_price`` and ``ask_vol`` filled with standard-normal
    random values.
    """
    # numpy rejects float dimensions, so normalise the row count once.
    n_rows = int(periods)

    # Lowercase 's' is the non-deprecated spelling of the one-second alias.
    dr = date_range(beg_dt, periods=n_rows, freq='s')
    df = DataFrame(
        index=dr,
        data=np.random.randn(n_rows, 4),
        columns=['bid_price', 'bid_vol', 'ask_price', 'ask_vol'],
    )
    return df

def create_file(iterations=1):
    """Fill the module-level ``store`` with the appendable table ``df``.

    Appends ``iterations`` frames of 100000 rows each, followed by two
    frames of 58689 and 41375 rows (100064 rows together).  Each frame
    starts one second after the last timestamp of the previous one, the
    first starting at 2014-08-12 13:30:00.

    The target store is read from the global name ``store``; it is not a
    parameter of this function.

    Returns the last DataFrame that was appended.
    """
    beg_dt = '2014-08-12 13:30:00.000000'

    # The 58689 / 41375 tail was chosen specifically because it reproduces
    # the problem; a plain final append of 64 rows did not.
    chunk_sizes = [100000] * iterations + [58689, 41375]

    df = None
    for periods in chunk_sizes:
        df = create_df(beg_dt, periods)
        store_append(store, df, key="df")
        # Next frame continues one second after this frame's last row.
        beg_dt = df.index[-1] + relativedelta(seconds=1)

    return df

def store_open(fname):
    """Open an HDFStore on the file ``fname`` and return it."""
    hdf_store = HDFStore(fname)
    return hdf_store

def store_get(store, key="df", where=None, start=None, stop=None, iterator=False, chunksize=None):
    """Run ``store.select`` for ``key`` and return its result.

    ``where``, ``start``, ``stop``, ``iterator`` and ``chunksize`` are
    forwarded unchanged as keyword arguments.  If the select raises
    ``KeyError`` or ``TypeError`` the error is swallowed and ``None`` is
    returned instead.
    """
    select_kwargs = dict(
        where=where,
        start=start,
        stop=stop,
        iterator=iterator,
        chunksize=chunksize,
    )
    try:
        return store.select(key, **select_kwargs)
    except (KeyError, TypeError):
        return None

def store_append(store, df, key="df", where=""):
    """Append ``df`` to ``store`` under ``key`` using the 'table' format.

    ``where`` is accepted but not used by this function.
    """
    append_options = {'format': 'table'}
    store.append(key, df, **append_options)

# Reproduction driver: for n = 0..3 build a fresh table of
# n * 100000 + 100064 rows, then compare the row counts obtained with and
# without a where clause, and with and without an iterator.
path = '.'
fname = os.path.join(path, 'delme_test.h5')

store = None
try:
    for n in range(4):
        # Start every iteration from an empty file: close whatever store the
        # previous iteration left open, then delete the file (best effort).
        if store is not None:
            try:
                store.close()
            except Exception:
                pass
        try:
            os.unlink(fname)
        except OSError:
            # Nothing to delete, e.g. on the very first run.
            pass
        store = store_open(fname)

        # create_file() writes through the module-level ``store``.
        create_file(n)
        store.close()

        # Re-open so that all reads below go against the file on disk.
        store = store_open(fname)

        # Baseline: full table, no where clause, no iterator.
        expected_ln = len(store_get(store, 'df', where=None, iterator=False))

        # The date range is a superset of every timestamp in the table, so
        # the where clause must not filter out any row.
        beg_dt = '2014-08-12 13:30:00.000000'
        end_dt = '2032-12-31 13:30:00.000000'
        where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt)

        # where clause, iterator=False
        ln_df = len(store_get(store, 'df', where=where, iterator=False))

        # no where clause, iterator=True
        it = store_get(store, 'df', where=None, iterator=True)
        ln_it_no_where_clause = sum(len(chunk) for chunk in it if not chunk.empty)

        # where clause, iterator=True
        it = store_get(store, 'df', where=where, iterator=True)
        ln_it = sum(len(chunk) for chunk in it if not chunk.empty)

        # Every retrieval path has to report the same number of rows.
        if expected_ln == ln_df == ln_it_no_where_clause == ln_it:
            print("iteration: %d PASSED" % n)
        else:
            print("iteration: %d FAILED" % n)
        print("expected: %d, df len: %d, it (no where clause) len: %d, it len: %d" %
              (expected_ln, ln_df, ln_it_no_where_clause, ln_it))
finally:
    # Release the file handle even if an iteration raised.
    if store is not None:
        store.close()

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions