From 7e461a18d9f6928132afec6f48ce968b3e989ba6 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Mon, 3 Dec 2018 17:43:52 +0100 Subject: [PATCH 01/30] remove \n from docstring --- pandas/core/arrays/datetimes.py | 26 +++++++++++++------------- pandas/core/arrays/timedeltas.py | 16 ++++++++-------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index cfe3afcf3730a..b3df505d56d78 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -82,7 +82,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -1072,19 +1072,19 @@ def date(self): return tslib.ints_to_pydatetime(timestamps, box="date") - year = _field_accessor('year', 'Y', "\n The year of the datetime\n") + year = _field_accessor('year', 'Y', "The year of the datetime") month = _field_accessor('month', 'M', - "\n The month as January=1, December=12 \n") - day = _field_accessor('day', 'D', "\nThe days of the datetime\n") - hour = _field_accessor('hour', 'h', "\nThe hours of the datetime\n") - minute = _field_accessor('minute', 'm', "\nThe minutes of the datetime\n") - second = _field_accessor('second', 's', "\nThe seconds of the datetime\n") + "The month as January=1, December=12") + day = _field_accessor('day', 'D', "The days of the datetime") + hour = _field_accessor('hour', 'h', "The hours of the datetime") + minute = _field_accessor('minute', 'm', "The minutes of the datetime") + second = _field_accessor('second', 's', "The seconds of the datetime") microsecond = _field_accessor('microsecond', 'us', - "\nThe microseconds of the datetime\n") + "The microseconds of the datetime") nanosecond = _field_accessor('nanosecond', 'ns', - "\nThe nanoseconds of the datetime\n") + "The nanoseconds of the datetime") weekofyear = _field_accessor('weekofyear', 'woy', - "\nThe week ordinal of the year\n") + "The week ordinal of the year") 
week = weekofyear _dayofweek_doc = """ The day of the week with Monday=0, Sunday=6. @@ -1129,12 +1129,12 @@ def date(self): "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0") dayofyear = _field_accessor('dayofyear', 'doy', - "\nThe ordinal day of the year\n") - quarter = _field_accessor('quarter', 'q', "\nThe quarter of the date\n") + "The ordinal day of the year") + quarter = _field_accessor('quarter', 'q', "The quarter of the date") days_in_month = _field_accessor( 'days_in_month', 'dim', - "\nThe number of days in the month\n") + "The number of days in the month") daysinmonth = days_in_month _is_month_doc = """ Indicates whether the date is the {first_or_last} day of the month. diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 830283d31a929..4afc9f5483c2a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -59,7 +59,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -684,16 +684,16 @@ def to_pytimedelta(self): return tslibs.ints_to_pytimedelta(self.asi8) days = _field_accessor("days", "days", - "\nNumber of days for each element.\n") + "Number of days for each element.") seconds = _field_accessor("seconds", "seconds", - "\nNumber of seconds (>= 0 and less than 1 day) " - "for each element.\n") + "Number of seconds (>= 0 and less than 1 day) " + "for each element.") microseconds = _field_accessor("microseconds", "microseconds", - "\nNumber of microseconds (>= 0 and less " - "than 1 second) for each element.\n") + "Number of microseconds (>= 0 and less " + "than 1 second) for each element.") nanoseconds = _field_accessor("nanoseconds", "nanoseconds", - "\nNumber of nanoseconds (>= 0 and less " - "than 1 microsecond) for each element.\n") + "Number of nanoseconds (>= 0 and less " + "than 1 microsecond) for each element.") @property def components(self): From 
dea38f24c0067ae3fe9484b837c9649714213bba Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 14 Jan 2020 21:26:31 +0100 Subject: [PATCH 02/30] fix issue 17038 --- pandas/core/reshape/pivot.py | 4 +++- pandas/tests/reshape/test_pivot.py | 20 ++++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index b443ba142369c..9743d90f4dd04 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -117,7 +117,9 @@ def pivot_table( agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged - if table.index.nlevels > 1: + + # GH 17038, this check should only happen if index is specified + if table.index.nlevels > 1 and index: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 743fc50c87e96..46a05123c9fdd 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -896,12 +896,6 @@ def _check_output( totals = table.loc[("All", ""), value_col] assert totals == self.data[value_col].mean() - # no rows - rtable = self.data.pivot_table( - columns=["AA", "BB"], margins=True, aggfunc=np.mean - ) - assert isinstance(rtable, Series) - table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") for item in ["DD", "EE", "FF"]: totals = table.loc[("All", ""), item] @@ -972,6 +966,20 @@ def test_pivot_integer_columns(self): tm.assert_frame_equal(table, table2, check_names=False) + @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) + def test_pivot_table_multiindex_only(self, cols): + # GH 17038 + df2 = DataFrame({cols[0]: [1, 2, 3], cols[1]: [1, 2, 3], "v": [4, 5, 6]}) + + result = df2.pivot_table(values="v", columns=cols) + expected = DataFrame( + [[4, 5, 6]], + columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), + 
index=Index(["v"]), + ) + + tm.assert_frame_equal(result, expected) + def test_pivot_no_level_overlap(self): # GH #1181 From cd9e7ac3f31ffaf95cd628863df911dea9fa1248 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 14 Jan 2020 21:29:43 +0100 Subject: [PATCH 03/30] revert change --- pandas/core/reshape/pivot.py | 3 +-- pandas/tests/reshape/test_pivot.py | 20 ++++++-------------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 9743d90f4dd04..a7cdbb0da7a4e 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -118,8 +118,7 @@ def pivot_table( table = agged - # GH 17038, this check should only happen if index is specified - if table.index.nlevels > 1 and index: + if table.index.nlevels > 1: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 46a05123c9fdd..743fc50c87e96 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -896,6 +896,12 @@ def _check_output( totals = table.loc[("All", ""), value_col] assert totals == self.data[value_col].mean() + # no rows + rtable = self.data.pivot_table( + columns=["AA", "BB"], margins=True, aggfunc=np.mean + ) + assert isinstance(rtable, Series) + table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") for item in ["DD", "EE", "FF"]: totals = table.loc[("All", ""), item] @@ -966,20 +972,6 @@ def test_pivot_integer_columns(self): tm.assert_frame_equal(table, table2, check_names=False) - @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) - def test_pivot_table_multiindex_only(self, cols): - # GH 17038 - df2 = DataFrame({cols[0]: [1, 2, 3], cols[1]: [1, 2, 3], "v": [4, 5, 6]}) - - result = df2.pivot_table(values="v", columns=cols) - expected = DataFrame( - [[4, 5, 6]], - 
columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), - index=Index(["v"]), - ) - - tm.assert_frame_equal(result, expected) - def test_pivot_no_level_overlap(self): # GH #1181 From e5e912be0f596943067a7df812442764d311a086 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 14 Jan 2020 21:30:16 +0100 Subject: [PATCH 04/30] revert change --- pandas/core/reshape/pivot.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index a7cdbb0da7a4e..b443ba142369c 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -117,7 +117,6 @@ def pivot_table( agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged - if table.index.nlevels > 1: # Related GH #17123 # If index_names are integers, determine whether the integers refer From c8ee822415ea765b417eff2b468bcd663986c888 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 23 Feb 2020 16:33:45 +0100 Subject: [PATCH 05/30] fix 32173 --- pandas/core/internals/construction.py | 20 ++++++++++++++++++- pandas/tests/frame/test_constructors.py | 26 +++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 57ed2555761be..7176b7f363387 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -574,12 +574,30 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None): if columns is None: columns = ibase.default_index(len(content)) else: - if len(columns) != len(content): # pragma: no cover + is_mi_list = isinstance(columns, list) and all( + isinstance(col, list) for col in columns + ) + + if not is_mi_list and len(columns) != len(content): # pragma: no cover # caller's responsibility to check for this... 
raise AssertionError( f"{len(columns)} columns passed, passed data had " f"{len(content)} columns" ) + elif is_mi_list: + + # check if nested list column, length of each sub-list should be equal + if len(set([len(col) for col in columns])) > 1: + raise ValueError( + "Length of columns passed for MultiIndex columns is different" + ) + + # if columns is not empty and then length of sub-list is not equal to content + elif columns and len(columns[0]) != len(content): + raise ValueError( + f"{len(columns[0])} columns passed, passed data had " + f"{len(content)} columns" + ) # provide soft conversion of object dtypes def convert(arr): diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 8c9b7cd060059..75c18ffbe127e 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1062,6 +1062,32 @@ def test_constructor_list_of_lists(self): result = DataFrame(data) tm.assert_frame_equal(result, expected) + def test_constructor_list_like_data_nested_list_column(self): + # GH 32173 + arrays = [list("abcd"), list("cdef")] + result = pd.DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays) + + mi = MultiIndex.from_arrays(arrays) + expected = pd.DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=mi) + + tm.assert_frame_equal(result, expected) + + def test_constructor_wrong_length_nested_list_column(self): + # GH 32173 + arrays = [list("abc"), list("cde")] + + msg = "3 columns passed, passed data had 4" + with pytest.raises(ValueError, match=msg): + DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays) + + def test_constructor_inequal_length_nested_list_column(self): + # GH 32173 + arrays = [list("abcd"), list("cde")] + + msg = "Length of columns passed for MultiIndex columns is different" + with pytest.raises(ValueError, match=msg): + DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays) + def test_constructor_sequence_like(self): # GH 3783 # collections.Squence like From 
07ffde2c05be247dcf6a03565612a92e3ecd4f5d Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 23 Feb 2020 16:36:40 +0100 Subject: [PATCH 06/30] linting --- pandas/core/internals/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 7176b7f363387..fcd34a6d2a5f4 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -592,7 +592,7 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None): "Length of columns passed for MultiIndex columns is different" ) - # if columns is not empty and then length of sub-list is not equal to content + # if columns is not empty and then length of sublist is not equal to content elif columns and len(columns[0]) != len(content): raise ValueError( f"{len(columns[0])} columns passed, passed data had " From b3f3da06c6e919a93e3a981bcf1d7583726eda7a Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 23 Feb 2020 17:09:04 +0100 Subject: [PATCH 07/30] linting --- pandas/core/internals/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index fcd34a6d2a5f4..3cd709237b341 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -587,7 +587,7 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None): elif is_mi_list: # check if nested list column, length of each sub-list should be equal - if len(set([len(col) for col in columns])) > 1: + if len(set(len(col) for col in columns)) > 1: raise ValueError( "Length of columns passed for MultiIndex columns is different" ) From 2f2054cbd9192fa5a5cd19c4d64440205331c1c8 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 23 Feb 2020 17:14:35 +0100 Subject: [PATCH 08/30] add whatsnew --- doc/source/whatsnew/v1.1.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7449c62a5ad31..15ce7d7607109 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -249,6 +249,8 @@ Other instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`) - Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`) - Bug in :meth:`AbstractHolidayCalendar.holidays` when no rules were defined (:issue:`31415`) +- Bug in :class:`DataFrame` when initiating a frame with lists and assign ``columns`` with nested list for ``MultiIndex`` (:issue:`32173`) +- - .. --------------------------------------------------------------------------- From 91763899b515713dcb1a5aaa8a5c0c5cefc6aee0 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 23 Feb 2020 18:25:39 +0100 Subject: [PATCH 09/30] fix linting --- pandas/core/internals/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 3cd709237b341..d75033de50d50 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -587,7 +587,7 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None): elif is_mi_list: # check if nested list column, length of each sub-list should be equal - if len(set(len(col) for col in columns)) > 1: + if len({len(col) for col in columns}) > 1: raise ValueError( "Length of columns passed for MultiIndex columns is different" ) From a5e0d104c58a97c9afb40a6971da5ef4158e4691 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 4 Mar 2020 22:14:09 +0100 Subject: [PATCH 10/30] separate out column validation --- pandas/core/internals/construction.py | 59 +++++++++++++++++---------- 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 
d75033de50d50..ff0685f2246a5 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -570,34 +570,49 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): ) -def _convert_object_array(content, columns, coerce_float=False, dtype=None): - if columns is None: - columns = ibase.default_index(len(content)) - else: - is_mi_list = isinstance(columns, list) and all( - isinstance(col, list) for col in columns +def _validate_columns(content, columns): + """Validate if columns are valid in length. + + Raises: + 1. When content is not composed of list of lists, and if length of columns + is not equal to length of content. + 2. When content is list of lists, but length of each sub-list is not equal + 3. When content is list of lists, but length of sub-list is not equal to + length of content + """ + + # Add mask for data which is composed of list of lists + is_mi_list = isinstance(columns, list) and all( + isinstance(col, list) for col in columns + ) + + if not is_mi_list and len(columns) != len(content): # pragma: no cover + # caller's responsibility to check for this... + raise AssertionError( + f"{len(columns)} columns passed, passed data had " + f"{len(content)} columns" ) + elif is_mi_list: + + # check if nested list column, length of each sub-list should be equal + if len({len(col) for col in columns}) > 1: + raise ValueError( + "Length of columns passed for MultiIndex columns is different" + ) - if not is_mi_list and len(columns) != len(content): # pragma: no cover - # caller's responsibility to check for this... 
- raise AssertionError( - f"{len(columns)} columns passed, passed data had " + # if columns is not empty and then length of sublist is not equal to content + elif columns and len(columns[0]) != len(content): + raise ValueError( + f"{len(columns[0])} columns passed, passed data had " f"{len(content)} columns" ) - elif is_mi_list: - # check if nested list column, length of each sub-list should be equal - if len({len(col) for col in columns}) > 1: - raise ValueError( - "Length of columns passed for MultiIndex columns is different" - ) - # if columns is not empty and then length of sublist is not equal to content - elif columns and len(columns[0]) != len(content): - raise ValueError( - f"{len(columns[0])} columns passed, passed data had " - f"{len(content)} columns" - ) +def _convert_object_array(content, columns, coerce_float=False, dtype=None): + if columns is None: + columns = ibase.default_index(len(content)) + else: + _validate_columns(content, columns) # provide soft conversion of object dtypes def convert(arr): From 6073ed7b9b2edd32bc28c94b8f93fbdb2b9bb3e5 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 4 Mar 2020 22:29:57 +0100 Subject: [PATCH 11/30] code change based on JR review --- pandas/core/internals/construction.py | 71 ++++++++++++++------------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index ff0685f2246a5..551332073df4a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -526,9 +526,9 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): if values.dtype == np.object_: content = list(values.T) - return _convert_object_array( - content, columns, dtype=dtype, coerce_float=coerce_float - ) + columns = _validate_or_indexify_columns(content, columns) + content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) + return content, columns else: return values.T, columns 
@@ -565,13 +565,14 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): data = [(type(d) is dict) and d or dict(d) for d in data] content = list(lib.dicts_to_array(data, list(columns)).T) - return _convert_object_array( - content, columns, dtype=dtype, coerce_float=coerce_float - ) + columns = _validate_or_indexify_columns(content, columns) + content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) + return content, columns -def _validate_columns(content, columns): - """Validate if columns are valid in length. +def _validate_or_indexify_columns(content, columns): + """If columns is None, make numbers as column names; If not None, validate if + columns are valid in length. Raises: 1. When content is not composed of list of lists, and if length of columns @@ -580,39 +581,39 @@ def _validate_columns(content, columns): 3. When content is list of lists, but length of sub-list is not equal to length of content """ + if columns is None: + columns = ibase.default_index(len(content)) + else: - # Add mask for data which is composed of list of lists - is_mi_list = isinstance(columns, list) and all( - isinstance(col, list) for col in columns - ) - - if not is_mi_list and len(columns) != len(content): # pragma: no cover - # caller's responsibility to check for this... 
- raise AssertionError( - f"{len(columns)} columns passed, passed data had " - f"{len(content)} columns" + # Add mask for data which is composed of list of lists + is_mi_list = isinstance(columns, list) and all( + isinstance(col, list) for col in columns ) - elif is_mi_list: - - # check if nested list column, length of each sub-list should be equal - if len({len(col) for col in columns}) > 1: - raise ValueError( - "Length of columns passed for MultiIndex columns is different" - ) - # if columns is not empty and then length of sublist is not equal to content - elif columns and len(columns[0]) != len(content): - raise ValueError( - f"{len(columns[0])} columns passed, passed data had " + if not is_mi_list and len(columns) != len(content): # pragma: no cover + # caller's responsibility to check for this... + raise AssertionError( + f"{len(columns)} columns passed, passed data had " f"{len(content)} columns" ) + elif is_mi_list: + # check if nested list column, length of each sub-list should be equal + if len({len(col) for col in columns}) > 1: + raise ValueError( + "Length of columns passed for MultiIndex columns is different" + ) -def _convert_object_array(content, columns, coerce_float=False, dtype=None): - if columns is None: - columns = ibase.default_index(len(content)) - else: - _validate_columns(content, columns) + # if columns is not empty and length of sublist is not equal to content + elif columns and len(columns[0]) != len(content): + raise ValueError( + f"{len(columns[0])} columns passed, passed data had " + f"{len(content)} columns" + ) + return columns + + +def _convert_object_array(content, coerce_float=False, dtype=None): # provide soft conversion of object dtypes def convert(arr): @@ -623,7 +624,7 @@ def convert(arr): arrays = [convert(arr) for arr in content] - return arrays, columns + return arrays # --------------------------------------------------------------------- From e8f6d6738ccef2f4c5cc2fd2a54544a11c0706e5 Mon Sep 17 00:00:00 2001 From: Kaiqi 
Date: Wed, 4 Mar 2020 23:01:00 +0100 Subject: [PATCH 12/30] fixup --- pandas/core/internals/construction.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 551332073df4a..efb02ff74ac18 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -457,7 +457,8 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): return [[]] * len(columns), columns return [], [] # columns if columns is not None else [] if isinstance(data[0], (list, tuple)): - return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) + columns = _validate_or_indexify_columns(data, columns) + return _list_to_arrays(data, coerce_float=coerce_float, dtype=dtype), columns elif isinstance(data[0], abc.Mapping): return _list_of_dict_to_arrays( data, columns, coerce_float=coerce_float, dtype=dtype @@ -481,10 +482,11 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): else: # last ditch effort data = [tuple(x) for x in data] - return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) + columns = _validate_or_indexify_columns(data, columns) + return _list_to_arrays(data, coerce_float=coerce_float, dtype=dtype), columns -def _list_to_arrays(data, columns, coerce_float=False, dtype=None): +def _list_to_arrays(data, coerce_float=False, dtype=None): if len(data) > 0 and isinstance(data[0], tuple): content = list(lib.to_object_array_tuples(data).T) else: @@ -493,7 +495,7 @@ def _list_to_arrays(data, columns, coerce_float=False, dtype=None): # gh-26429 do not raise user-facing AssertionError try: result = _convert_object_array( - content, columns, dtype=dtype, coerce_float=coerce_float + content, dtype=dtype, coerce_float=coerce_float ) except AssertionError as e: raise ValueError(e) from e From 559b5d6f94cf34ca2987b76aea29b1319b83f52b Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 4 Mar 2020 23:41:08 
+0100 Subject: [PATCH 13/30] fixup --- pandas/core/internals/construction.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index efb02ff74ac18..273ee19b58020 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -457,8 +457,7 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): return [[]] * len(columns), columns return [], [] # columns if columns is not None else [] if isinstance(data[0], (list, tuple)): - columns = _validate_or_indexify_columns(data, columns) - return _list_to_arrays(data, coerce_float=coerce_float, dtype=dtype), columns + return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) elif isinstance(data[0], abc.Mapping): return _list_of_dict_to_arrays( data, columns, coerce_float=coerce_float, dtype=dtype @@ -482,11 +481,10 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): else: # last ditch effort data = [tuple(x) for x in data] - columns = _validate_or_indexify_columns(data, columns) - return _list_to_arrays(data, coerce_float=coerce_float, dtype=dtype), columns + return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) -def _list_to_arrays(data, coerce_float=False, dtype=None): +def _list_to_arrays(data, columns, coerce_float=False, dtype=None): if len(data) > 0 and isinstance(data[0], tuple): content = list(lib.to_object_array_tuples(data).T) else: @@ -494,12 +492,11 @@ def _list_to_arrays(data, coerce_float=False, dtype=None): content = list(lib.to_object_array(data).T) # gh-26429 do not raise user-facing AssertionError try: - result = _convert_object_array( - content, dtype=dtype, coerce_float=coerce_float - ) + columns = _validate_or_indexify_columns(content, columns) + result = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) except AssertionError as e: raise ValueError(e) from e - return result + 
return result, columns def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): From 2428edb4fe7a463369e6ca63cb7a91f890eb3868 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 15 Mar 2020 10:18:26 +0100 Subject: [PATCH 14/30] add docs and annotation --- doc/source/whatsnew/v1.1.0.rst | 1 - pandas/core/internals/construction.py | 22 +++++++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 86e58d65b8efd..19268caad9122 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -395,7 +395,6 @@ Other - Bug in :class:`DataFrame` when initiating a frame with lists and assign ``columns`` with nested list for ``MultiIndex`` (:issue:`32173`) - Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`) - .. --------------------------------------------------------------------------- .. _whatsnew_110.contributors: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 55ce4b1338ee7..7094afd21cde5 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -3,6 +3,7 @@ constructors before passing them to a BlockManager. """ from collections import abc +from typing import Optional, Union, Iterable import numpy as np import numpy.ma as ma @@ -596,15 +597,26 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): return content, columns -def _validate_or_indexify_columns(content, columns): +def _validate_or_indexify_columns(content: list, columns: Union[Iterable, None]) -> Iterable: """If columns is None, make numbers as column names; If not None, validate if columns are valid in length. - Raises: - 1. 
When content is not composed of list of lists, and if length of columns + Parameters + ---------- + content: list of processed data records + columns: Iterable or None + + Returns + ------- + columns: If columns is Iterable, return as is; If columns is None, assign + positional column index value as columns. + + Raises + ------ + 1. When content is not composed of list of lists, and if length of columns is not equal to length of content. - 2. When content is list of lists, but length of each sub-list is not equal - 3. When content is list of lists, but length of sub-list is not equal to + 2. When content is list of lists, but length of each sub-list is not equal + 3. When content is list of lists, but length of sub-list is not equal to length of content """ if columns is None: From fe18e50952bcc249d954adda0dfa4778699aac9a Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 15 Mar 2020 10:20:11 +0100 Subject: [PATCH 15/30] black --- pandas/core/internals/construction.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 7094afd21cde5..688675a4b625b 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -3,7 +3,7 @@ constructors before passing them to a BlockManager. """ from collections import abc -from typing import Optional, Union, Iterable +from typing import Iterable, Union import numpy as np import numpy.ma as ma @@ -597,7 +597,9 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): return content, columns -def _validate_or_indexify_columns(content: list, columns: Union[Iterable, None]) -> Iterable: +def _validate_or_indexify_columns( + content: list, columns: Union[Iterable, None] +) -> Iterable: """If columns is None, make numbers as column names; If not None, validate if columns are valid in length. 
From a5d159bef11fb1da596897323f441f76062a5653 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 15 Mar 2020 10:40:59 +0100 Subject: [PATCH 16/30] Add more docs --- pandas/core/internals/construction.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 688675a4b625b..5b8ebd3341936 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -3,7 +3,7 @@ constructors before passing them to a BlockManager. """ from collections import abc -from typing import Iterable, Union +from typing import Iterable, Optional, Union import numpy as np import numpy.ma as ma @@ -598,7 +598,7 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): def _validate_or_indexify_columns( - content: list, columns: Union[Iterable, None] + content: list, columns: Union[list, None] ) -> Iterable: """If columns is None, make numbers as column names; If not None, validate if columns are valid in length. @@ -653,8 +653,21 @@ def _validate_or_indexify_columns( return columns -def _convert_object_array(content, coerce_float=False, dtype=None): +def _convert_object_array( + content: list, coerce_float: bool = False, dtype: Optional[np.dtype] = None +) -> list: + """Internal function ot convert object array. 
+ Parameters + ---------- + content: list of processed data records + coerce_float: bool, to coerce floats or not, default is False + dtype: np.dtype, default is None + + Returns + ------- + arrays: list of converted arrays + """ # provide soft conversion of object dtypes def convert(arr): if dtype != object and dtype != np.object: From 7516964c8b2993e08332f23d849eea92c37aa356 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 15 Mar 2020 11:00:44 +0100 Subject: [PATCH 17/30] add annotation --- pandas/core/internals/construction.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 5b8ebd3341936..6ffa12e30c2cb 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -3,7 +3,7 @@ constructors before passing them to a BlockManager. """ from collections import abc -from typing import Iterable, Optional, Union +from typing import Iterable, Optional, Tuple, Union import numpy as np import numpy.ma as ma @@ -512,7 +512,9 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) -def _list_to_arrays(data, columns, coerce_float=False, dtype=None): +def _list_to_arrays( + data, columns, coerce_float=False, dtype=None +) -> Tuple[list, Iterable]: if len(data) > 0 and isinstance(data[0], tuple): content = list(lib.to_object_array_tuples(data).T) else: @@ -527,7 +529,9 @@ def _list_to_arrays(data, columns, coerce_float=False, dtype=None): return result, columns -def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): +def _list_of_series_to_arrays( + data, columns, coerce_float=False, dtype=None +) -> Tuple[list, Iterable]: if columns is None: # We know pass_data is non-empty because data[0] is a Series pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] @@ -560,7 +564,9 @@ def 
_list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): return values.T, columns -def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): +def _list_of_dict_to_arrays( + data, columns, coerce_float=False, dtype=None +) -> Tuple[list, Iterable]: """ Convert list of dicts to numpy arrays From 86bd69910fff6ba8622bdd32c8f9086a92e265ea Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 15 Mar 2020 11:51:14 +0100 Subject: [PATCH 18/30] add for dict --- pandas/core/internals/construction.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 6ffa12e30c2cb..601d3ec00386e 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -3,12 +3,13 @@ constructors before passing them to a BlockManager. """ from collections import abc -from typing import Iterable, Optional, Tuple, Union +from typing import Dict, Iterable, Optional, Tuple, Union import numpy as np import numpy.ma as ma from pandas._libs import lib +from pandas._typing import Dtype, Scalar, T from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -537,7 +538,7 @@ def _list_of_series_to_arrays( pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] columns = get_objs_combined_axis(pass_data, sort=False) - indexer_cache = {} + indexer_cache: Dict[int, Scalar] = {} aligned_values = [] for s in data: @@ -660,7 +661,7 @@ def _validate_or_indexify_columns( def _convert_object_array( - content: list, coerce_float: bool = False, dtype: Optional[np.dtype] = None + content: list, coerce_float: bool = False, dtype: Optional[Dtype] = None ) -> list: """Internal function ot convert object array. 
From 30a70a7f486068c0997e769d69cbe1a95bdab120 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 15 Mar 2020 12:12:13 +0100 Subject: [PATCH 19/30] remove unused import --- pandas/core/internals/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 601d3ec00386e..e6ea61f95b507 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -9,7 +9,7 @@ import numpy.ma as ma from pandas._libs import lib -from pandas._typing import Dtype, Scalar, T +from pandas._typing import Dtype, Scalar from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, From ed6dc4a00927a9358f6d292f3906ba8b7270e90d Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 2 Apr 2020 20:48:57 +0200 Subject: [PATCH 20/30] improve annotation --- pandas/core/internals/construction.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index ec4eac742fb52..0c2626a0516a3 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -3,13 +3,13 @@ constructors before passing them to a BlockManager. 
""" from collections import abc -from typing import Dict, Iterable, Optional, Tuple, Union +from typing import Dict, Iterable, List, Optional, Tuple, Union import numpy as np import numpy.ma as ma from pandas._libs import lib -from pandas._typing import Dtype, Scalar +from pandas._typing import Dtype, Label, Scalar from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -535,7 +535,7 @@ def _list_to_arrays( def _list_of_series_to_arrays( data, columns, coerce_float=False, dtype=None -) -> Tuple[list, Iterable]: +) -> Tuple[List[Label], Iterable[Label]]: if columns is None: # We know pass_data is non-empty because data[0] is a Series pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] @@ -570,7 +570,7 @@ def _list_of_series_to_arrays( def _list_of_dict_to_arrays( data, columns, coerce_float=False, dtype=None -) -> Tuple[list, Iterable]: +) -> Tuple[List[Label], Iterable[Label]]: """ Convert list of dicts to numpy arrays @@ -608,8 +608,8 @@ def _list_of_dict_to_arrays( def _validate_or_indexify_columns( - content: list, columns: Union[list, None] -) -> Iterable: + content: List, columns: Union[Iterable[Label], None] +) -> Iterable[Label]: """If columns is None, make numbers as column names; If not None, validate if columns are valid in length. @@ -664,8 +664,8 @@ def _validate_or_indexify_columns( def _convert_object_array( - content: list, coerce_float: bool = False, dtype: Optional[Dtype] = None -) -> list: + content: List[Label], coerce_float: bool = False, dtype: Optional[Dtype] = None +) -> List[Label]: """Internal function ot convert object array. 
Parameters From f058d2cadb6bb4c4970b90e8012a305aea8fb7a9 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 2 Apr 2020 21:24:01 +0200 Subject: [PATCH 21/30] fix annotation --- pandas/core/internals/construction.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 0c2626a0516a3..46daccb635f3c 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -3,13 +3,13 @@ constructors before passing them to a BlockManager. """ from collections import abc -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import numpy as np import numpy.ma as ma from pandas._libs import lib -from pandas._typing import Dtype, Label, Scalar +from pandas._typing import Axes, Dtype, Label, Scalar from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -518,7 +518,7 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): def _list_to_arrays( data, columns, coerce_float=False, dtype=None -) -> Tuple[list, Iterable]: +) -> Tuple[List[Any], Axes[Label]]: if len(data) > 0 and isinstance(data[0], tuple): content = list(lib.to_object_array_tuples(data).T) else: @@ -535,7 +535,7 @@ def _list_to_arrays( def _list_of_series_to_arrays( data, columns, coerce_float=False, dtype=None -) -> Tuple[List[Label], Iterable[Label]]: +) -> Tuple[List[Any], Axes[Label]]: if columns is None: # We know pass_data is non-empty because data[0] is a Series pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] @@ -570,7 +570,7 @@ def _list_of_series_to_arrays( def _list_of_dict_to_arrays( data, columns, coerce_float=False, dtype=None -) -> Tuple[List[Label], Iterable[Label]]: +) -> Tuple[List[Any], Axes[Label]]: """ Convert list of dicts to numpy arrays @@ -608,8 +608,8 @@ def _list_of_dict_to_arrays( def _validate_or_indexify_columns( - 
content: List, columns: Union[Iterable[Label], None] -) -> Iterable[Label]: + content: List, columns: Union[Axes[Label], None] +) -> Axes[Label]: """If columns is None, make numbers as column names; If not None, validate if columns are valid in length. @@ -664,8 +664,8 @@ def _validate_or_indexify_columns( def _convert_object_array( - content: List[Label], coerce_float: bool = False, dtype: Optional[Dtype] = None -) -> List[Label]: + content: List[Any], coerce_float: bool = False, dtype: Optional[Dtype] = None +) -> List[Any]: """Internal function ot convert object array. Parameters From 2852579ccb9f31074ff2f1eea065e5a2d213ca5c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 2 Apr 2020 21:51:16 +0200 Subject: [PATCH 22/30] fix annotation --- pandas/core/internals/construction.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 46daccb635f3c..b10222fcc3ab0 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -3,7 +3,7 @@ constructors before passing them to a BlockManager. 
""" from collections import abc -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np import numpy.ma as ma @@ -535,7 +535,7 @@ def _list_to_arrays( def _list_of_series_to_arrays( data, columns, coerce_float=False, dtype=None -) -> Tuple[List[Any], Axes[Label]]: +) -> Tuple[List[Any], Axes[Union[str, int]]]: if columns is None: # We know pass_data is non-empty because data[0] is a Series pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] @@ -570,7 +570,7 @@ def _list_of_series_to_arrays( def _list_of_dict_to_arrays( data, columns, coerce_float=False, dtype=None -) -> Tuple[List[Any], Axes[Label]]: +) -> Tuple[List[Any], Axes[Union[str, int]]]: """ Convert list of dicts to numpy arrays @@ -608,8 +608,8 @@ def _list_of_dict_to_arrays( def _validate_or_indexify_columns( - content: List, columns: Union[Axes[Label], None] -) -> Axes[Label]: + content: List, columns: Union[Axes[Union[str, int]], None] +) -> Axes[Union[str, int]]: """If columns is None, make numbers as column names; If not None, validate if columns are valid in length. 
From 493ac33d906eb374ee29dd3095f9cc45a6441f14 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 2 Apr 2020 22:31:06 +0200 Subject: [PATCH 23/30] fixup --- pandas/core/internals/construction.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index b10222fcc3ab0..16a9cdde6591a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -9,7 +9,7 @@ import numpy.ma as ma from pandas._libs import lib -from pandas._typing import Axes, Dtype, Label, Scalar +from pandas._typing import Dtype, Scalar from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -518,7 +518,7 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): def _list_to_arrays( data, columns, coerce_float=False, dtype=None -) -> Tuple[List[Any], Axes[Label]]: +) -> Tuple[List[Any], Union[Index, List]]: if len(data) > 0 and isinstance(data[0], tuple): content = list(lib.to_object_array_tuples(data).T) else: @@ -535,7 +535,7 @@ def _list_to_arrays( def _list_of_series_to_arrays( data, columns, coerce_float=False, dtype=None -) -> Tuple[List[Any], Axes[Union[str, int]]]: +) -> Tuple[List[Any], Union[Index, List]]: if columns is None: # We know pass_data is non-empty because data[0] is a Series pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] @@ -570,7 +570,7 @@ def _list_of_series_to_arrays( def _list_of_dict_to_arrays( data, columns, coerce_float=False, dtype=None -) -> Tuple[List[Any], Axes[Union[str, int]]]: +) -> Tuple[List[Any], Union[Index, List]]: """ Convert list of dicts to numpy arrays @@ -608,8 +608,8 @@ def _list_of_dict_to_arrays( def _validate_or_indexify_columns( - content: List, columns: Union[Axes[Union[str, int]], None] -) -> Axes[Union[str, int]]: + content: List, columns: Union[Index, List, None] +) -> Union[Index, List]: """If columns is None, make numbers as column names; If not None, 
validate if columns are valid in length. From 3ecd6b818a972c59bc03fc4f5713c6b5725f739b Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 3 Apr 2020 17:50:38 +0200 Subject: [PATCH 24/30] more details --- pandas/core/internals/construction.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 16a9cdde6591a..73b306b934401 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -9,7 +9,7 @@ import numpy.ma as ma from pandas._libs import lib -from pandas._typing import Dtype, Scalar +from pandas._typing import Dtype, Scalar, Axis from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -535,7 +535,7 @@ def _list_to_arrays( def _list_of_series_to_arrays( data, columns, coerce_float=False, dtype=None -) -> Tuple[List[Any], Union[Index, List]]: +) -> Tuple[List[Any], Union[Index, List[Axis]]]: if columns is None: # We know pass_data is non-empty because data[0] is a Series pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] @@ -570,7 +570,7 @@ def _list_of_series_to_arrays( def _list_of_dict_to_arrays( data, columns, coerce_float=False, dtype=None -) -> Tuple[List[Any], Union[Index, List]]: +) -> Tuple[List[Any], Union[Index, List[Axis]]]: """ Convert list of dicts to numpy arrays @@ -608,8 +608,8 @@ def _list_of_dict_to_arrays( def _validate_or_indexify_columns( - content: List, columns: Union[Index, List, None] -) -> Union[Index, List]: + content: List, columns: Union[Index, List[Axis], None] +) -> Union[Index, List[Axis]]: """If columns is None, make numbers as column names; If not None, validate if columns are valid in length. 
From 851a3e14405208e8c525eeb096b2f1d16412364c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 3 Apr 2020 17:50:55 +0200 Subject: [PATCH 25/30] isort --- pandas/core/internals/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 73b306b934401..3345bd9cd209f 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -9,7 +9,7 @@ import numpy.ma as ma from pandas._libs import lib -from pandas._typing import Dtype, Scalar, Axis +from pandas._typing import Axis, Dtype, Scalar from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, From 9860985fe8c912c6a2a1c65e10c0e3cd2eafcc01 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 3 Apr 2020 17:53:14 +0200 Subject: [PATCH 26/30] better annotation --- pandas/core/internals/construction.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 3345bd9cd209f..46edb81655cee 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -518,7 +518,7 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): def _list_to_arrays( data, columns, coerce_float=False, dtype=None -) -> Tuple[List[Any], Union[Index, List]]: +) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: if len(data) > 0 and isinstance(data[0], tuple): content = list(lib.to_object_array_tuples(data).T) else: @@ -535,7 +535,7 @@ def _list_to_arrays( def _list_of_series_to_arrays( data, columns, coerce_float=False, dtype=None -) -> Tuple[List[Any], Union[Index, List[Axis]]]: +) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: if columns is None: # We know pass_data is non-empty because data[0] is a Series pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] @@ -570,7 +570,7 @@ def _list_of_series_to_arrays( def _list_of_dict_to_arrays( 
data, columns, coerce_float=False, dtype=None -) -> Tuple[List[Any], Union[Index, List[Axis]]]: +) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: """ Convert list of dicts to numpy arrays @@ -664,8 +664,8 @@ def _validate_or_indexify_columns( def _convert_object_array( - content: List[Any], coerce_float: bool = False, dtype: Optional[Dtype] = None -) -> List[Any]: + content: List[Scalar], coerce_float: bool = False, dtype: Optional[Dtype] = None +) -> List[Scalar]: """Internal function ot convert object array. Parameters From ffc6561b4b2e273a4629fb7e13c23bda8f812fff Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 3 Apr 2020 18:09:48 +0200 Subject: [PATCH 27/30] removed unused import --- pandas/core/internals/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 46edb81655cee..58962c87350e2 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -3,7 +3,7 @@ constructors before passing them to a BlockManager. 
""" from collections import abc -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np import numpy.ma as ma From a028c33fa3292046a986e8a4205e498010dbf0ad Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 3 Apr 2020 18:13:36 +0200 Subject: [PATCH 28/30] linting --- pandas/core/internals/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 58962c87350e2..2c27f8c87087f 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -608,7 +608,7 @@ def _list_of_dict_to_arrays( def _validate_or_indexify_columns( - content: List, columns: Union[Index, List[Axis], None] + content: List, columns: Union[Index, List, None] ) -> Union[Index, List[Axis]]: """If columns is None, make numbers as column names; If not None, validate if columns are valid in length. From 5af0f8e352d55ff416a8d75a73d04f9eb84379eb Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 3 Apr 2020 22:11:36 +0200 Subject: [PATCH 29/30] code change on JB review --- pandas/core/internals/construction.py | 38 ++++++++++++++++--------- pandas/tests/frame/test_constructors.py | 2 +- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2c27f8c87087f..26d0cd6f60a5e 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -517,7 +517,10 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): def _list_to_arrays( - data, columns, coerce_float=False, dtype=None + data: List[Scalar], + columns: Union[Index, List], + coerce_float: bool = False, + dtype: Optional[Dtype] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: if len(data) > 0 and isinstance(data[0], tuple): content = list(lib.to_object_array_tuples(data).T) @@ -534,7 +537,10 @@ def 
_list_to_arrays( def _list_of_series_to_arrays( - data, columns, coerce_float=False, dtype=None + data: List[Scalar], + columns: Union[Index, List], + coerce_float: bool = False, + dtype: Optional[Dtype] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: if columns is None: # We know pass_data is non-empty because data[0] is a Series @@ -569,7 +575,10 @@ def _list_of_series_to_arrays( def _list_of_dict_to_arrays( - data, columns, coerce_float=False, dtype=None + data: List[Scalar], + columns: Union[Index, List], + coerce_float: bool = False, + dtype: Optional[Dtype] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: """ Convert list of dicts to numpy arrays @@ -610,12 +619,13 @@ def _list_of_dict_to_arrays( def _validate_or_indexify_columns( content: List, columns: Union[Index, List, None] ) -> Union[Index, List[Axis]]: - """If columns is None, make numbers as column names; If not None, validate if - columns are valid in length. + """ + If columns is None, make numbers as column names; Otherwise, validate that + columns have valid length. Parameters ---------- - content: list of processed data records + content: list of data columns: Iterable or None Returns @@ -625,11 +635,12 @@ def _validate_or_indexify_columns( Raises ------ - 1. When content is not composed of list of lists, and if length of columns - is not equal to length of content. - 2. When content is list of lists, but length of each sub-list is not equal - 3. When content is list of lists, but length of sub-list is not equal to - length of content + 1. AssertionError when content is not composed of list of lists, and if + length of columns is not equal to length of content. + 2. ValueError when content is list of lists, but length of each sub-list + is not equal + 3. 
ValueError when content is list of lists, but length of sub-list is + not equal to length of content """ if columns is None: columns = ibase.default_index(len(content)) @@ -666,7 +677,8 @@ def _validate_or_indexify_columns( def _convert_object_array( content: List[Scalar], coerce_float: bool = False, dtype: Optional[Dtype] = None ) -> List[Scalar]: - """Internal function ot convert object array. + """ + Internal function to convert object array. Parameters ---------- @@ -676,7 +688,7 @@ def _convert_object_array( Returns ------- - arrays: list of converted arrays + arrays: cast content if not object dtype, otherwise return as is in list. """ # provide soft conversion of object dtypes def convert(arr): if dtype != object and dtype != np.object: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 55803d48d7e8f..da98da33f3235 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1081,7 +1081,7 @@ def test_constructor_wrong_length_nested_list_column(self): with pytest.raises(ValueError, match=msg): DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays) - def test_constructor_inequal_length_nested_list_column(self): + def test_constructor_unequal_length_nested_list_column(self): # GH 32173 arrays = [list("abcd"), list("cde")] From 9eda16a5a23a0398ba0743f9fc9793d146525329 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 3 Apr 2020 22:46:32 +0200 Subject: [PATCH 30/30] fixup --- pandas/core/internals/construction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 26d0cd6f60a5e..e14c041b6747a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -537,7 +537,7 @@ def _list_to_arrays( def _list_of_series_to_arrays( - data: List[Scalar], + data: List, columns: Union[Index, List], coerce_float: bool = False, dtype: Optional[Dtype] = None, @@ -575,7 +575,7 @@ def
_list_of_series_to_arrays( def _list_of_dict_to_arrays( - data: List[Scalar], + data: List, columns: Union[Index, List], coerce_float: bool = False, dtype: Optional[Dtype] = None,