|
1 | 1 | import pytest
|
2 |
| -import numpy as np |
| 2 | +import math |
3 | 3 |
|
4 | 4 | @pytest.mark.parametrize("test_data",
|
5 | 5 | [
|
6 |
| - ({'a': [np.array([1, 2, 3]), np.array([4, 5, 6])], |
7 |
| - 'b': [np.array([1.5, 2.0, 3.2]), np.array([4.1, 5.7, 6.9])]}, |
8 |
| - np.object_, None), |
9 |
| - ({'a': [1.5, 2.5, 3.5], 'b': [9.2, 10.5, 11.8]}, np.float64, None), |
10 |
| - ({'A': [1, 2, 3, 4], 'B': [1, 2, 3, 4]}, np.int64, np.float64) |
| 6 | + {'a': ["foo", "bar"], |
| 7 | + 'b': ["baz", "qux"]}, |
| 8 | + {'a': [1.5, 2.5, 3.5], 'b': [9.2, 10.5, 11.8]}, |
| 9 | + {'A': [1, 2, 3, 4], 'B': [1, 2, 3, 4]} |
11 | 10 | ],
|
12 |
| - ids=["array_data", "float_data", "int_data"]) |
13 |
| -def test_only_one_data(test_data, create_df_from_dict): |
14 |
| - data, dtype, new_dtype = test_data |
15 |
| - columns = list(data.keys()) |
16 |
| - df = create_df_from_dict(data) |
17 |
| - df2 = df.__dataframe__() |
18 |
| - new_dtype = dtype if new_dtype is None else new_dtype |
19 |
| - assert df.columns.values.tolist() == columns |
20 |
| - val = len(df[columns[0]])-1 |
21 |
| - column_size = df.size |
22 |
| - for column in columns: |
23 |
| - assert df[column].tolist() == df[column].tolist() |
24 |
| - assert df[column].dtype.type is dtype |
25 |
| - assert df2.get_column_by_name(column).null_count == 0 |
26 |
| - assert df2.get_column_by_name(column).size == column_size |
27 |
| - assert df2.get_column_by_name(column).offset == 0 |
28 |
| - assert not df2["x"].is_masked |
29 |
| - n = np.random.randint(0, val) |
30 |
| - (df[column])[n] = None |
31 |
| - assert df[column].dtype.type is new_dtype |
32 |
| - assert df2.get_column_by_name(column).null_count == 1 |
33 |
| - |
34 |
| - |
35 |
| -def test_float_int(create_df_from_dict): |
36 |
| - df = create_df_from_dict({'a': [1, 2, 3], 'b': [3, 4, 5], |
37 |
| - 'c': [1.5, 2.5, 3.5], 'd': [9, 10, 11]}) |
38 |
| - df2 = df.__dataframe__() |
39 |
| - columns = ['a', 'b', 'c', 'd'] |
40 |
| - assert df.columns.values.tolist() == columns |
41 |
| - for column in columns: |
42 |
| - assert df[column].tolist() == df[column].tolist() |
43 |
| - if column is 'c': |
44 |
| - assert df[column].dtype.type is np.float64 |
45 |
| - else: |
46 |
| - assert df[column].dtype.type is np.int64 |
47 |
| - |
48 |
| - assert df2.get_column_by_name(column).null_count == 0 |
49 |
| - assert df2.get_column_by_name(column).size == 3 |
50 |
| - assert df2.get_column_by_name(column).offset == 0 |
51 |
| - |
52 |
| - n = np.random.randint(0, 2) |
53 |
| - (df[column])[n] = None |
54 |
| - assert df[column].dtype.type is np.float64 |
55 |
| - assert df2.get_column_by_name(column).null_count == 1 |
56 |
| - |
57 |
| - |
58 |
| -def test_mixed_intfloatbool(create_df_from_dict): |
59 |
| - df = create_df_from_dict({"x": np.array([True, True, False]), |
60 |
| - "y": np.array([1, 2, 0]), |
61 |
| - "z": np.array([9.2, 10.5, 11.8])}) |
62 |
| - df2 = df.__dataframe__() |
63 |
| - columns = ['x', 'y', 'z'] |
64 |
| - assert df.columns.values.tolist() == columns |
65 |
| - for column in columns: |
66 |
| - assert df[column].tolist() == df[column].tolist() |
67 |
| - assert df2.get_column_by_name(column).null_count == 0 |
68 |
| - assert df2.get_column_by_name(column).size == 3 |
69 |
| - assert df2.get_column_by_name(column).offset == 0 |
| 11 | + ids=["str_data", "float_data", "int_data"]) |
| 12 | +def test_only_one_dtype(test_data, df_from_dict): |
| 13 | + columns = list(test_data.keys()) |
| 14 | + df = df_from_dict(test_data) |
| 15 | + dfX = df.__dataframe__() |
70 | 16 |
|
71 |
| - assert df["x"].dtype.type is np.bool_ |
72 |
| - assert df["y"].dtype.type is np.int32 |
73 |
| - assert df["z"].dtype.type is np.float64 |
74 |
| - |
75 |
| - assert df2.get_column_by_name("x")._allow_copy == True |
76 |
| - |
77 |
| - for column in columns: |
78 |
| - n = np.random.randint(0, 2) |
79 |
| - (df[column])[n] = None |
80 |
| - if column is "x": |
81 |
| - assert df[column].dtype.type is np.object_ |
82 |
| - else: |
83 |
| - assert df[column].dtype.type is np.float64 |
84 |
| - assert df2.get_column_by_name(column).null_count == 1 |
85 |
| - |
86 |
| - |
87 |
| -def test_string_dtype(create_df_from_dict): |
88 |
| - df = create_df_from_dict({"A": ["a", "b", "cdef", "", "g"]}) |
89 |
| - df2 = df.__dataframe__() |
90 |
| - columns = ['A'] |
91 |
| - assert df.columns.values.tolist() == columns |
| 17 | + column_size = len(test_data[columns[0]]) |
92 | 18 | for column in columns:
|
93 |
| - assert df[column].tolist() == df[column].tolist() |
94 |
| - assert df[column].dtype.type is np.object_ |
95 |
| - assert df2.get_column_by_name(column).null_count == 0 |
96 |
| - |
97 |
| - |
98 |
| -def test_categorical(create_df_from_dict): |
99 |
| - df = create_df_from_dict({"year": [2012, 2013, 2015, 2019], "weekday": [0, 1, 4, 6]}) |
100 |
| - df = df.categorize("year", min_value=2012, max_value=2019) |
101 |
| - df = df.categorize("weekday", labels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]) |
102 |
| - # Some detailed testing for correctness of dtype and null handling: |
103 |
| - col = df.__dataframe__().get_column_by_name("year") |
104 |
| - assert col.describe_categorical == (False, True, {0: 2012, 1: 2013, 2: 2014, 3: 2015, 4: 2016, 5: 2017, 6: 2018, 7: 2019}) |
105 |
| - assert col.describe_null == (0, None) |
106 |
| - col2 = df.__dataframe__().get_column_by_name("weekday") |
107 |
| - assert col2.describe_categorical == (False, True, {0: "Mon", 1: "Tue", 2: "Wed", 3: "Thu", 4: "Fri", 5: "Sat", 6: "Sun"}) |
108 |
| - assert col2.describe_null == (0, None) |
109 |
| - |
110 |
| - |
111 |
| -def test_dataframe(create_df_from_dict): |
112 |
| - df = create_df_from_dict({"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]}) |
113 |
| - df2 = df.__dataframe__() |
114 |
| - assert df2._allow_copy == True |
115 |
| - assert df2.num_columns() == 3 |
116 |
| - assert df2.num_rows() == 3 |
117 |
| - assert df2.num_chunks() == 1 |
118 |
| - assert df2.column_names() == ["x", "y", "z"] |
119 |
| - assert df2.select_columns((0, 2))._df[:, 0].tolist() == df2.select_columns_by_name(("x", "z"))._df[:, 0].tolist() |
120 |
| - assert df2.select_columns((0, 2))._df[:, 1].tolist() == df2.select_columns_by_name(("x", "z"))._df[:, 1].tolist() |
121 |
| - |
122 |
| - |
123 |
| -def test_chunks(create_df_from_dict): |
124 |
| - df = create_df_from_dict({"x": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}) |
125 |
| - df2 = df.__dataframe__() |
126 |
| - chunk_iter = iter(df2.get_chunks(3)) |
127 |
| - chunk = next(chunk_iter) |
128 |
| - assert chunk.num_rows() == 4 |
129 |
| - chunk = next(chunk_iter) |
130 |
| - assert chunk.num_rows() == 4 |
131 |
| - chunk = next(chunk_iter) |
132 |
| - assert chunk.num_rows() == 2 |
133 |
| - with pytest.raises(StopIteration): |
134 |
| - chunk = next(chunk_iter) |
135 |
| - |
136 |
| - |
137 |
| -def test_get_chunks(create_df_from_dict): |
138 |
| - df = create_df_from_dict({"x": [1]}) |
139 |
| - df2 = df.__dataframe__() |
140 |
| - assert df2.get_chunks() == 1 |
| 19 | + assert dfX.get_column_by_name(column).null_count == 0 |
| 20 | + assert dfX.get_column_by_name(column).size == column_size |
| 21 | + assert dfX.get_column_by_name(column).offset == 0 |
| 22 | + |
| 23 | + |
| 24 | +def test_float_int(df_from_dict): |
| 25 | + df = df_from_dict({'a': [1, 2, 3], 'b': [3, 4, 5], |
| 26 | + 'c': [1.5, 2.5, 3.5], 'd': [9, 10, 11], |
| 27 | + 'e': [True, False, True], |
| 28 | + 'f': ["a", "", "c"]}) |
| 29 | + dfX = df.__dataframe__() |
| 30 | + columns = {'a': 0, 'b': 0, 'c': 2, 'd': 0, 'e': 20, 'f': 21} |
| 31 | + |
| 32 | + for column, kind in columns.items(): |
| 33 | + colX = dfX.get_column_by_name(column) |
| 34 | + assert colX.null_count == 0 |
| 35 | + assert colX.size == 3 |
| 36 | + assert colX.offset == 0 |
| 37 | + |
| 38 | + assert colX.dtype[0] == kind |
| 39 | + |
| 40 | + assert dfX.get_column_by_name("c").dtype[1] == 64 |
| 41 | + |
| 42 | + |
| 43 | +def test_na_float(df_from_dict): |
| 44 | + df = df_from_dict({'a': [1.0, math.nan, 2.0]}) |
| 45 | + dfX = df.__dataframe__() |
| 46 | + colX = dfX.get_column_by_name('a') |
| 47 | + assert colX.null_count == 1 |
| 48 | + |
| 49 | +def test_noncategorical(df_from_dict): |
| 50 | + df = df_from_dict({'a': [1, 2, 3]}) |
| 51 | + dfX = df.__dataframe__() |
| 52 | + colX = dfX.get_column_by_name('a') |
| 53 | + with pytest.raises(TypeError): |
| 54 | + colX.describe_categorical |
| 55 | + |
| 56 | +def test_categorical(df_from_dict): |
| 57 | + df = df_from_dict({"weekday": ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]}, is_categorical=True) |
| 58 | + |
| 59 | + colX = df.__dataframe__().get_column_by_name("weekday") |
| 60 | + is_ordered, is_dictionary, _ = colX.describe_categorical |
| 61 | + assert isinstance(is_ordered, bool) |
| 62 | + assert isinstance(is_dictionary, bool) |
| 63 | + |
| 64 | + |
| 65 | +def test_dataframe(df_from_dict): |
| 66 | + df = df_from_dict({"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]}) |
| 67 | + dfX = df.__dataframe__() |
| 68 | + |
| 69 | + assert dfX.num_columns() == 3 |
| 70 | + assert dfX.num_rows() == 3 |
| 71 | + assert dfX.num_chunks() == 1 |
| 72 | + assert dfX.column_names() == ["x", "y", "z"] |
| 73 | + assert dfX.select_columns((0, 2)).column_names() == dfX.select_columns_by_name(("x", "z")).column_names() |
| 74 | + |
| 75 | +@pytest.mark.parametrize(["size", "n_chunks"], |
| 76 | + [(10, 3), (12, 3), (12, 5)] |
| 77 | +) |
| 78 | +def test_chunks(size, n_chunks, df_from_dict): |
| 79 | + df = df_from_dict({"x": list(range(size))}) |
| 80 | + dfX = df.__dataframe__() |
| 81 | + chunks = list(dfX.get_chunks(n_chunks)) |
| 82 | + assert len(chunks) == n_chunks |
| 83 | + assert sum(chunk.num_rows() for chunk in chunks) == size |
| 84 | + |
| 85 | + |
| 86 | +def test_get_chunks(df_from_dict): |
| 87 | + df = df_from_dict({"x": [1]}) |
| 88 | + dfX = df.__dataframe__() |
| 89 | + assert len(list(dfX.get_chunks())) == 1 |
0 commit comments