diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index b9cc1dad53674..014bd22aa2dab 100755
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -1016,6 +1016,7 @@ Other
 - Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`)
 - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`)
 - Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`)
+- Backtick quoting in :meth:`DataFrame.query` and :meth:`DataFrame.eval` can now also be used with otherwise invalid identifiers, such as names that start with a digit, names that are Python keywords, and names containing single-character operators (:issue:`27017`)
 - Bug in ``pd.core.util.hashing.hash_pandas_object`` where arrays containing tuples were incorrectly treated as non-hashable (:issue:`28969`)
 - Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`)
 - Fix :class:`AbstractHolidayCalendar` to return correct results for
diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py
index 994f470942cd1..19a8898a2987c 100644
--- a/pandas/core/computation/common.py
+++ b/pandas/core/computation/common.py
@@ -4,9 +4,6 @@
 
 from pandas._config import get_option
 
-# A token value Python's tokenizer probably will never use.
-_BACKTICK_QUOTED_STRING = 100
-
 
 def _ensure_decoded(s):
     """
@@ -29,16 +26,5 @@ def result_type_many(*arrays_and_dtypes):
         return reduce(np.result_type, arrays_and_dtypes)
 
 
-def _remove_spaces_column_name(name):
-    """
-    Check if name contains any spaces, if it contains any spaces
-    the spaces will be removed and an underscore suffix is added.
-    """
-    if not isinstance(name, str) or " " not in name:
-        return name
-
-    return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING"
-
-
 class NameResolutionError(NameError):
     pass
diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py
index 7599a82ddffed..5c320042721dc 100644
--- a/pandas/core/computation/eval.py
+++ b/pandas/core/computation/eval.py
@@ -12,7 +12,8 @@
 from pandas.util._validators import validate_bool_kwarg
 
 from pandas.core.computation.engines import _engines
-from pandas.core.computation.expr import Expr, _parsers, tokenize_string
+from pandas.core.computation.expr import Expr, _parsers
+from pandas.core.computation.parsing import tokenize_string
 from pandas.core.computation.scope import ensure_scope
 from pandas.io.formats.printing import pprint_thing
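For context, a minimal sketch of what the expanded backtick quoting enables (the frame and its values are illustrative, not part of the patch; the behavior matches the new tests added below):

    import pandas as pd

    df = pd.DataFrame({"1e1": [10.0, 20.0], "def": [1, 2], "B (x)": [5, 0]})

    # Names that start with a digit, are Python keywords, or contain
    # operators previously could not be referenced at all; with this
    # change they can be backtick quoted in query/eval expressions.
    df.query("`1e1` + `def` > `B (x)`")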
diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py
index 9b422b28c3c27..1350587b5ca90 100644
--- a/pandas/core/computation/expr.py
+++ b/pandas/core/computation/expr.py
@@ -3,19 +3,13 @@
 
 import ast
 from functools import partial, reduce
-from io import StringIO
-import itertools as it
-import operator
+from keyword import iskeyword
 import tokenize
 from typing import Optional, Type
 
 import numpy as np
 
 import pandas.core.common as com
-from pandas.core.computation.common import (
-    _BACKTICK_QUOTED_STRING,
-    _remove_spaces_column_name,
-)
 from pandas.core.computation.ops import (
     _LOCAL_TAG,
     BinOp,
@@ -34,38 +28,12 @@
     _unary_ops_syms,
     is_term,
 )
+from pandas.core.computation.parsing import clean_backtick_quoted_toks, tokenize_string
 from pandas.core.computation.scope import Scope
 
 import pandas.io.formats.printing as printing
 
 
-def tokenize_string(source: str):
-    """
-    Tokenize a Python source code string.
-
-    Parameters
-    ----------
-    source : str
-        A Python source code string
-    """
-    line_reader = StringIO(source).readline
-    token_generator = tokenize.generate_tokens(line_reader)
-
-    # Loop over all tokens till a backtick (`) is found.
-    # Then, take all tokens till the next backtick to form a backtick quoted
-    # string.
-    for toknum, tokval, _, _, _ in token_generator:
-        if tokval == "`":
-            tokval = " ".join(
-                it.takewhile(
-                    lambda tokval: tokval != "`",
-                    map(operator.itemgetter(1), token_generator),
-                )
-            )
-            toknum = _BACKTICK_QUOTED_STRING
-        yield toknum, tokval
-
-
 def _rewrite_assign(tok):
     """Rewrite the assignment operator for PyTables expressions that use
     ``=`` as a substitute for ``==``.
@@ -133,31 +101,6 @@ def _replace_locals(tok):
     return toknum, tokval
 
 
-def _clean_spaces_backtick_quoted_names(tok):
-    """Clean up a column name if surrounded by backticks.
-
-    Backtick quoted string are indicated by a certain tokval value. If a string
-    is a backtick quoted token it will processed by
-    :func:`_remove_spaces_column_name` so that the parser can find this
-    string when the query is executed.
-    See also :meth:`NDFrame._get_space_character_free_column_resolver`.
-
-    Parameters
-    ----------
-    tok : tuple of int, str
-        ints correspond to the all caps constants in the tokenize module
-
-    Returns
-    -------
-    t : tuple of int, str
-        Either the input or token or the replacement values
-    """
-    toknum, tokval = tok
-    if toknum == _BACKTICK_QUOTED_STRING:
-        return tokenize.NAME, _remove_spaces_column_name(tokval)
-    return toknum, tokval
-
-
 def _compose2(f, g):
     """Compose 2 callables"""
     return lambda *args, **kwargs: f(g(*args, **kwargs))
@@ -172,10 +115,7 @@ def _compose(*funcs):
 def _preparse(
     source: str,
     f=_compose(
-        _replace_locals,
-        _replace_booleans,
-        _rewrite_assign,
-        _clean_spaces_backtick_quoted_names,
+        _replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks
     ),
 ):
     """Compose a collection of tokenization functions
@@ -426,8 +366,6 @@ def visit(self, node, **kwargs):
         try:
             node = ast.fix_missing_locations(ast.parse(clean))
         except SyntaxError as e:
-            from keyword import iskeyword
-
             if any(iskeyword(x) for x in clean.split()):
                 e.msg = "Python keyword not valid identifier in numexpr query"
             raise e
@@ -781,9 +719,7 @@ def __init__(
         parser,
         preparser=partial(
             _preparse,
-            f=_compose(
-                _replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names
-            ),
+            f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks),
         ),
     ):
         super().__init__(env, engine, parser, preparser)
diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py
new file mode 100644
index 0000000000000..ce213c8532834
--- /dev/null
+++ b/pandas/core/computation/parsing.py
@@ -0,0 +1,190 @@
+""":func:`~pandas.eval` source string parsing functions
+"""
+
+from io import StringIO
+from keyword import iskeyword
+import token
+import tokenize
+from typing import Iterator, Tuple
+
+# A token value Python's tokenizer probably will never use.
+BACKTICK_QUOTED_STRING = 100
+
+
+def create_valid_python_identifier(name: str) -> str:
+    """
+    Create a valid Python identifier from any string.
+
+    Check if name contains any special characters. If it does, the special
+    characters are replaced by a descriptive string and a prefix is added.
+
+    Raises
+    ------
+    SyntaxError
+        If the returned name is not a valid Python identifier.
+        This can happen if there is a hashtag in the name, as the tokenizer
+        will then terminate and not find the backtick.
+        It also happens for characters that fall outside the range (U+0001..U+007F).
+    """
+    if name.isidentifier() and not iskeyword(name):
+        return name
+
+    # Create a dict with the special characters and their replacement strings.
+    # EXACT_TOKEN_TYPES contains these special characters;
+    # token.tok_name contains a readable description of the replacement string.
+    special_characters_replacements = {
+        char: f"_{token.tok_name[tokval]}_"
+        # The ignore here is because of a bug in mypy that is resolved in 0.740
+        for char, tokval in tokenize.EXACT_TOKEN_TYPES.items()  # type: ignore
+    }
+    special_characters_replacements.update(
+        {
+            " ": "_",
+            "?": "_QUESTIONMARK_",
+            "!": "_EXCLAMATIONMARK_",
+            "$": "_DOLLARSIGN_",
+            "€": "_EUROSIGN_",
+            # Including quotes works, but there are exceptions.
+            "'": "_SINGLEQUOTE_",
+            '"': "_DOUBLEQUOTE_",
+            # Currently not possible. Terminates parser and won't find backtick.
+            # "#": "_HASH_",
+        }
+    )
+
+    name = "".join(special_characters_replacements.get(char, char) for char in name)
+    name = "BACKTICK_QUOTED_STRING_" + name
+
+    if not name.isidentifier():
+        raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")
+
+    return name
+
+
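This replacement table drives the whole feature. A short sketch of the expected mapping (return values follow from the code above, assuming CPython's ``tokenize.EXACT_TOKEN_TYPES``):

    from pandas.core.computation.parsing import create_valid_python_identifier

    create_valid_python_identifier("C_C")      # already valid: 'C_C'
    create_valid_python_identifier("B B")      # 'BACKTICK_QUOTED_STRING_B_B'
    create_valid_python_identifier("E.E")      # 'BACKTICK_QUOTED_STRING_E_DOT_E'
    create_valid_python_identifier("foo#bar")  # raises SyntaxError: '#' has no replacement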
+def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]:
+    """
+    Clean up a column name if surrounded by backticks.
+
+    Backtick quoted strings are indicated by a certain tokval value. If a string
+    is a backtick quoted token it will be processed by
+    :func:`create_valid_python_identifier` so that the parser can find this
+    string when the query is executed.
+    In this case the token gets the NAME toknum.
+
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    tok : Tuple[int, str]
+        Either the input token or the replacement values
+    """
+    toknum, tokval = tok
+    if toknum == BACKTICK_QUOTED_STRING:
+        return tokenize.NAME, create_valid_python_identifier(tokval)
+    return toknum, tokval
+
+
+def clean_column_name(name: str) -> str:
+    """
+    Function to emulate the cleaning of a backtick quoted name.
+
+    The purpose of this function is to see what happens to the name of an
+    identifier if it goes through the process of being parsed as Python code
+    inside a backtick quoted string and then being cleaned
+    (stripped of any special characters).
+
+    Parameters
+    ----------
+    name : str
+        Name to be cleaned.
+
+    Returns
+    -------
+    name : str
+        Returns the name after tokenizing and cleaning.
+
+    Notes
+    -----
+    For some cases, a name cannot be converted to a valid Python identifier.
+    In that case :func:`tokenize_string` raises a SyntaxError and we just
+    return the name unmodified.
+
+    If such a name was used in the query string itself (which makes the query
+    call impossible), an error will be raised by
+    :func:`tokenize_backtick_quoted_string` instead, which is not caught and
+    propagates to the user level.
+    """
+    try:
+        tokenized = tokenize_string(f"`{name}`")
+        tokval = next(tokenized)[1]
+        return create_valid_python_identifier(tokval)
+    except SyntaxError:
+        return name
+
+
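A small sketch of the round trip that ``clean_column_name`` emulates (outputs follow from the functions above):

    from pandas.core.computation.parsing import clean_column_name

    clean_column_name("D_D D")    # 'BACKTICK_QUOTED_STRING_D_D_D'
    clean_column_name("def")      # 'BACKTICK_QUOTED_STRING_def'
    clean_column_name("foo#bar")  # tokenizing fails on '#', name returned unmodified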
+def tokenize_backtick_quoted_string(
+    token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
+) -> Tuple[int, str]:
+    """
+    Creates a token from a backtick quoted string.
+
+    Moves the token_generator forward to just after the next backtick.
+
+    Parameters
+    ----------
+    token_generator : Iterator[tokenize.TokenInfo]
+        The generator that yields the tokens of the source string (Tuple[int, str]).
+        The generator is at the first token after the backtick (`).
+
+    source : str
+        The Python source code string.
+
+    string_start : int
+        The start of the backtick quoted string inside the source string.
+
+    Returns
+    -------
+    tok : Tuple[int, str]
+        The token that represents the backtick quoted string.
+        The integer is equal to BACKTICK_QUOTED_STRING (100).
+    """
+    for _, tokval, start, _, _ in token_generator:
+        if tokval == "`":
+            string_end = start[1]
+            break
+
+    return BACKTICK_QUOTED_STRING, source[string_start:string_end]
+
+
+def tokenize_string(source: str) -> Iterator[Tuple[int, str]]:
+    """
+    Tokenize a Python source code string.
+
+    Parameters
+    ----------
+    source : str
+        The Python source code string.
+
+    Returns
+    -------
+    tok_generator : Iterator[Tuple[int, str]]
+        An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
+    """
+    line_reader = StringIO(source).readline
+    token_generator = tokenize.generate_tokens(line_reader)
+
+    # Loop over all tokens till a backtick (`) is found.
+    # Then, take all tokens till the next backtick to form a backtick quoted string.
+    for toknum, tokval, start, _, _ in token_generator:
+        if tokval == "`":
+            try:
+                yield tokenize_backtick_quoted_string(
+                    token_generator, source, string_start=start[1] + 1
+                )
+            except Exception:
+                raise SyntaxError(f"Failed to parse backticks in '{source}'.")
+        else:
+            yield toknum, tokval
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 65b315167bd58..97b218878f4cc 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3066,18 +3066,27 @@ def query(self, expr, inplace=False, **kwargs):
         Parameters
         ----------
         expr : str
-            The query string to evaluate. You can refer to variables
+            The query string to evaluate.
+
+            You can refer to variables
             in the environment by prefixing them with an '@' character like
             ``@a + b``.
 
-            .. versionadded:: 0.25.0
-
-            You can refer to column names that contain spaces by surrounding
-            them in backticks.
+            You can refer to column names that contain spaces or operators by
+            surrounding them in backticks. This way you can also escape
+            names that start with a digit, or those that are a Python keyword.
+            Basically, anything that is not a valid Python identifier can be
+            escaped this way. See the Notes section below for more details.
 
             For example, if one of your columns is called ``a a`` and you want
             to sum it with ``b``, your query should be ```a a` + b``.
 
+            .. versionadded:: 0.25.0
+                Backtick quoting introduced.
+
+            .. versionadded:: 1.0.0
+                Backtick quoting extended to cover more than just spaces.
+
         inplace : bool
             Whether the query should modify the data in place or return
             a modified copy.
@@ -3132,6 +3141,32 @@ def query(self, expr, inplace=False, **kwargs):
         For further details and examples see the ``query`` documentation in
         :ref:`indexing <indexing.query>`.
 
+        *Backtick quoted variables*
+
+        Backtick quoted variables are parsed as literal Python code and
+        are converted internally to a valid Python identifier.
+        This can lead to the following problems.
+
+        During parsing a number of disallowed characters inside the backtick
+        quoted string are replaced by strings that are allowed as a Python
+        identifier. These characters include all operators in Python, the
+        space character, the question mark, the exclamation mark, the dollar
+        sign, and the euro sign.
+        For other characters that fall outside the ASCII range (U+0001..U+007F)
+        and those that are not further specified in PEP 3131,
+        the query parser will raise an error.
+        Whitespace other than the space character is also disallowed, as are
+        the hashtag (it is used for comments) and the backtick itself
+        (the backtick cannot be escaped).
+
+        In a special case, quotes that make a pair around a backtick can
+        confuse the parser.
+        For example, ```it's` > `that's``` will raise an error,
+        as it forms a quoted string (``'s > `that'``) with a backtick inside.
+
+        See also the Python documentation about lexical analysis
+        (https://docs.python.org/3/reference/lexical_analysis.html)
+        in combination with the source code in :mod:`pandas.core.computation.parsing`.
+
         Examples
         --------
         >>> df = pd.DataFrame({'A': range(1, 6),
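The failure modes described in those notes can be sketched as follows (the frame and its column names are illustrative; both failures are exercised by the new tests below):

    import pandas as pd

    df = pd.DataFrame({"it's": [1, 2], "that's": [3, 4], "foo#bar": [5, 6]})

    # The two single quotes pair up across the backticks and swallow one of them:
    df.query("`it's` > `that's`")  # raises SyntaxError

    # '#' starts a comment, so the closing backtick is never tokenized:
    df.query("`foo#bar` > 4")      # raises SyntaxError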
@@ -3281,11 +3316,12 @@ def eval(self, expr, inplace=False, **kwargs):
         kwargs["level"] = kwargs.pop("level", 0) + 1
         if resolvers is None:
             index_resolvers = self._get_index_resolvers()
-            column_resolvers = self._get_space_character_free_column_resolvers()
+            column_resolvers = self._get_cleaned_column_resolvers()
             resolvers = column_resolvers, index_resolvers
         if "target" not in kwargs:
             kwargs["target"] = self
         kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers)
+
         return _eval(expr, inplace=inplace, **kwargs)
 
     def select_dtypes(self, include=None, exclude=None):
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index f5b0ce1ae77fb..21a22322daece 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -424,7 +424,7 @@ def _get_block_manager_axis(cls, axis):
             return m - axis
         return axis
 
-    def _get_axis_resolvers(self, axis):
+    def _get_axis_resolvers(self, axis: str) -> Dict[str, ABCSeries]:
         # index or columns
         axis_index = getattr(self, axis)
         d = dict()
@@ -454,22 +454,29 @@ def _get_axis_resolvers(self, axis):
             d[axis] = dindex
         return d
 
-    def _get_index_resolvers(self):
-        d = {}
+    def _get_index_resolvers(self) -> Dict[str, ABCSeries]:
+        from pandas.core.computation.parsing import clean_column_name
+
+        d: Dict[str, ABCSeries] = {}
         for axis_name in self._AXIS_ORDERS:
             d.update(self._get_axis_resolvers(axis_name))
-        return d
 
-    def _get_space_character_free_column_resolvers(self):
-        """Return the space character free column resolvers of a dataframe.
+        return {
+            clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)
+        }
 
-        Column names with spaces are 'cleaned up' so that they can be referred
-        to by backtick quoting.
+    def _get_cleaned_column_resolvers(self) -> Dict[str, ABCSeries]:
+        """
+        Return the special character free column resolvers of a dataframe.
+
+        Column names with special characters are 'cleaned up' so that they can
+        be referred to by backtick quoting.
         Used in :meth:`DataFrame.eval`.
         """
-        from pandas.core.computation.common import _remove_spaces_column_name
+        from pandas.core.computation.parsing import clean_column_name
+
+        if isinstance(self, ABCSeries):
+            return {clean_column_name(self.name): self}
 
-        return {_remove_spaces_column_name(k): v for k, v in self.items()}
+        return {
+            clean_column_name(k): v for k, v in self.items() if not isinstance(k, int)
+        }
 
     @property
     def _info_axis(self):
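To see how the cleaned resolvers line up with the preparsed expression, a sketch using the private helper above (shown only to illustrate the mapping, not as public API):

    import pandas as pd

    df = pd.DataFrame({"B B": [1, 2], "C_C": [3, 4]})

    # eval/query resolve the rewritten identifiers against this mapping:
    resolvers = df._get_cleaned_column_resolvers()
    sorted(resolvers)  # ['BACKTICK_QUOTED_STRING_B_B', 'C_C']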
""" - from pandas.core.computation.common import _remove_spaces_column_name + from pandas.core.computation.parsing import clean_column_name + + if isinstance(self, ABCSeries): + return {clean_column_name(self.name): self} - return {_remove_spaces_column_name(k): v for k, v in self.items()} + return {clean_column_name(k): v for k, v in self.items() if k is not int} @property def _info_axis(self): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 9cd26160ec877..578487ea3f54c 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1048,13 +1048,34 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op): class TestDataFrameQueryBacktickQuoting: @pytest.fixture(scope="class") def df(self): + """ + Yields a dataframe with strings that may or may not need escaping + by backticks. The last two columns cannot be escaped by backticks + and should raise a ValueError. + """ yield DataFrame( { "A": [1, 2, 3], "B B": [3, 2, 1], "C C": [4, 5, 6], + "C C": [7, 4, 3], "C_C": [8, 9, 10], "D_D D": [11, 1, 101], + "E.E": [6, 3, 5], + "F-F": [8, 1, 10], + "1e1": [2, 4, 8], + "def": [10, 11, 2], + "A (x)": [4, 1, 3], + "B(x)": [1, 1, 5], + "B (x)": [2, 7, 4], + " &^ :!€$?(} > <++*'' ": [2, 5, 6], + "": [10, 11, 1], + " A": [4, 7, 9], + " ": [1, 2, 1], + "it's": [6, 3, 1], + "that's": [9, 1, 8], + "☺": [8, 7, 6], + "foo#bar": [2, 4, 5], } ) @@ -1093,7 +1114,64 @@ def test_mixed_underscores_and_spaces(self, df): expect = df["A"] + df["D_D D"] tm.assert_series_equal(res, expect) - def backtick_quote_name_with_no_spaces(self, df): + def test_backtick_quote_name_with_no_spaces(self, df): res = df.eval("A + `C_C`") expect = df["A"] + df["C_C"] tm.assert_series_equal(res, expect) + + def test_special_characters(self, df): + res = df.eval("`E.E` + `F-F` - A") + expect = df["E.E"] + df["F-F"] - df["A"] + tm.assert_series_equal(res, expect) + + def test_start_with_digit(self, df): + res = df.eval("A + `1e1`") + expect = df["A"] + df["1e1"] + tm.assert_series_equal(res, expect) + + def test_keyword(self, df): + res = df.eval("A + `def`") + expect = df["A"] + df["def"] + tm.assert_series_equal(res, expect) + + def test_unneeded_quoting(self, df): + res = df.query("`A` > 2") + expect = df[df["A"] > 2] + tm.assert_frame_equal(res, expect) + + def test_parenthesis(self, df): + res = df.query("`A (x)` > 2") + expect = df[df["A (x)"] > 2] + tm.assert_frame_equal(res, expect) + + def test_empty_string(self, df): + res = df.query("`` > 5") + expect = df[df[""] > 5] + tm.assert_frame_equal(res, expect) + + def test_multiple_spaces(self, df): + res = df.query("`C C` > 5") + expect = df[df["C C"] > 5] + tm.assert_frame_equal(res, expect) + + def test_start_with_spaces(self, df): + res = df.eval("` A` + ` `") + expect = df[" A"] + df[" "] + tm.assert_series_equal(res, expect) + + def test_lots_of_operators_string(self, df): + res = df.query("` &^ :!€$?(} > <++*'' ` > 4") + expect = df[df[" &^ :!€$?(} > <++*'' "] > 4] + tm.assert_frame_equal(res, expect) + + def test_failing_quote(self, df): + with pytest.raises(SyntaxError): + df.query("`it's` > `that's`") + + def test_failing_character_outside_range(self, df): + with pytest.raises(SyntaxError): + df.query("`☺` > 4") + + def test_failing_hashtag(self, df): + with pytest.raises(SyntaxError): + df.query("`foo#bar` > 4")