Skip to content

CI: Fastparquet upgrade broke CI #42588

Closed
@lithomas1

Description

@lithomas1

Fastparquet was updated to 0.7, which is causing CI failures.
See the logs below from the database build, for example.

___________________________ test_cross_engine_fp_pa ____________________________
[gw1] linux -- Python 3.8.10 /usr/share/miniconda/envs/pandas-dev/bin/python

request = <FixtureRequest for <Function test_cross_engine_fp_pa>>
df_cross_compat =    a  b    d      e          f
0  a  1  4.0   True 2013-01-01
1  b  2  5.0  False 2013-01-02
2  c  3  6.0   True 2013-01-03
pa = 'pyarrow', fp = 'fastparquet'

    def test_cross_engine_fp_pa(request, df_cross_compat, pa, fp):
        # cross-compat with differing reading/writing engines
        df = df_cross_compat
        with tm.ensure_clean() as path:
            df.to_parquet(path, engine=fp, compression=None)
    
            with catch_warnings(record=True):
                result = read_parquet(path, engine=pa)
>               tm.assert_frame_equal(result, df)
E               AssertionError: Attributes of DataFrame.iloc[:, 4] (column name="f") are different
E               
E               Attribute "dtype" are different
E               [left]:  datetime64[ns, UTC]
E               [right]: datetime64[ns]

pandas/tests/io/test_parquet.py:337: AssertionError
______________________ TestParquetFastParquet.test_basic _______________________
[gw1] linux -- Python 3.8.10 /usr/share/miniconda/envs/pandas-dev/bin/python

self = <pandas.tests.io.test_parquet.TestParquetFastParquet object at 0x7fd39d1cc070>
fp = 'fastparquet'
df_full =   string string_with_nan  ...               datetime_tz timedelta
0      a               a  ... 2013-01-01 00:00:00-05...01-02 00:00:00-05:00    2 days
2      c               c  ... 2013-01-03 00:00:00-05:00    3 days

[3 rows x 14 columns]

    def test_basic(self, fp, df_full):
        df = df_full
    
        dti = pd.date_range("20130101", periods=3, tz="US/Eastern")
        dti = dti._with_freq(None)  # freq doesn't round-trip
        df["datetime_tz"] = dti
        df["timedelta"] = pd.timedelta_range("1 day", periods=3)
>       check_round_trip(df, fp)

pandas/tests/io/test_parquet.py:915: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
pandas/tests/io/test_parquet.py:220: in check_round_trip
    compare(repeat)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

repeat = 2

    def compare(repeat):
        for _ in range(repeat):
            df.to_parquet(path, **write_kwargs)
            with catch_warnings(record=True):
                actual = read_parquet(path, **read_kwargs)
    
>           tm.assert_frame_equal(
                expected,
                actual,
                check_names=check_names,
                check_like=check_like,
                check_dtype=check_dtype,
            )
E           AssertionError: Attributes of DataFrame.iloc[:, 6] (column name="uint") are different
E           
E           Attribute "dtype" are different
E           [left]:  uint8
E           [right]: UInt8

pandas/tests/io/test_parquet.py:210: AssertionError
__________________ TestParquetFastParquet.test_bool_with_none __________________
[gw1] linux -- Python 3.8.10 /usr/share/miniconda/envs/pandas-dev/bin/python

self = <pandas.tests.io.test_parquet.TestParquetFastParquet object at 0x7fd39fd34af0>
fp = 'fastparquet'

    def test_bool_with_none(self, fp):
        df = pd.DataFrame({"a": [True, None, False]})
        expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
>       check_round_trip(df, fp, expected=expected)

pandas/tests/io/test_parquet.py:928: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
pandas/tests/io/test_parquet.py:220: in check_round_trip
    compare(repeat)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

repeat = 2

    def compare(repeat):
        for _ in range(repeat):
            df.to_parquet(path, **write_kwargs)
            with catch_warnings(record=True):
                actual = read_parquet(path, **read_kwargs)
    
>           tm.assert_frame_equal(
                expected,
                actual,
                check_names=check_names,
                check_like=check_like,
                check_dtype=check_dtype,
            )
E           AssertionError: Attributes of DataFrame.iloc[:, 0] (column name="a") are different
E           
E           Attribute "dtype" are different
E           [left]:  float16
E           [right]: boolean

Metadata

Metadata

Assignees

No one assigned

    Labels

    CI (Continuous Integration), IO Parquet (parquet, feather)

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions