Skip to content

Commit 5875421

Browse files
move to parsing.py, split better, add tests
1 parent 921f4b9 commit 5875421

File tree

3 files changed

+95
-30
lines changed

3 files changed

+95
-30
lines changed

pandas/core/computation/parsing.py

Lines changed: 78 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@
44

55
from __future__ import annotations
66

7-
from io import StringIO
7+
from io import (
8+
BytesIO,
9+
StringIO,
10+
)
811
from keyword import iskeyword
912
import token
1013
import tokenize
@@ -58,7 +61,7 @@ def create_valid_python_identifier(name: str) -> str:
5861
"'": "_SINGLEQUOTE_",
5962
'"': "_DOUBLEQUOTE_",
6063
# Supported because tokenize_string escapes backtick-quoted names before tokenizing.
61-
# "#": "_HASH_",
64+
"#": "_HASH_",
6265
}
6366
)
6467

@@ -168,6 +171,69 @@ def tokenize_backtick_quoted_string(
168171
return BACKTICK_QUOTED_STRING, source[string_start:string_end]
169172

170173

174+
def split_by_backtick(s: str) -> list[tuple[bool, str]]:
    """
    Split a string into backtick-quoted and plain substrings.

    Backticks that appear inside a single- or double-quoted string literal
    are not treated as delimiters. Concatenating the returned substrings in
    order reproduces ``s`` exactly.

    Parameters
    ----------
    s : str
        The expression to split.

    Returns
    -------
    list[tuple[bool, str]]
        ``(is_backtick_quoted, substring)`` pairs in source order. When the
        flag is True the substring includes its surrounding backticks.
        An empty input yields an empty list.

    Notes
    -----
    A quote with no matching closing quote on its line, and a backtick with
    no matching closing backtick, are both kept as plain text. Triple-quoted
    strings get no special treatment.
    """
    substrings: list[tuple[bool, str]] = []
    length = len(s)
    # Start of the plain text that has been scanned but not yet emitted.
    plain_start = 0
    i = 0
    while i < length:
        char = s[i]
        if char == "`":
            closing = s.find("`", i + 1)
            if closing == -1:
                # Unmatched backtick: the remainder is plain text.
                break
            if plain_start < i:
                substrings.append((False, s[plain_start:i]))
            substrings.append((True, s[i : closing + 1]))
            i = plain_start = closing + 1
        elif char in ("'", '"'):
            closing = _find_closing_quote(s, i)
            # Skip the whole literal so backticks inside it are ignored; an
            # unmatched quote is stepped over like any ordinary character.
            i = i + 1 if closing == -1 else closing + 1
        else:
            i += 1

    if plain_start < length:
        substrings.append((False, s[plain_start:]))
    return substrings


def _find_closing_quote(s: str, start: int) -> int:
    """
    Return the index of the quote that closes the one at ``s[start]``.

    Backslash-escaped characters are skipped, and the search stops at an
    unescaped newline. Returns -1 when no closing quote is found.
    """
    quote = s[start]
    length = len(s)
    j = start + 1
    while j < length:
        char = s[j]
        if char == "\\":
            # Skip the backslash together with the character it escapes.
            j += 2
        elif char == quote:
            return j
        elif char == "\n":
            return -1
        else:
            j += 1
    return -1
235+
236+
171237
def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
172238
"""
173239
Tokenize a Python source code string.
@@ -182,6 +248,16 @@ def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
182248
tok_generator : Iterator[Tuple[int, str]]
183249
An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
184250
"""
251+
# GH 59285
252+
source = "".join(
253+
(
254+
f"`{create_valid_python_identifier(substring[1:-1])}`"
255+
if is_backticked
256+
else substring
257+
)
258+
for is_backticked, substring in split_by_backtick(source)
259+
)
260+
185261
line_reader = StringIO(source).readline
186262
token_generator = tokenize.generate_tokens(line_reader)
187263

pandas/core/frame.py

Lines changed: 1 addition & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
cast,
3535
overload,
3636
)
37-
import urllib.parse
3837
import warnings
3938

4039
import numpy as np
@@ -4612,33 +4611,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
46124611
kwargs["level"] = kwargs.pop("level", 0) + 1
46134612
kwargs["target"] = None
46144613

4615-
# GH 59285
4616-
if any(("#" in col) or ("'" in col) or ('"' in col) for col in self.columns):
4617-
# Create a copy of `self` with column names escaped
4618-
escaped_self = self.copy()
4619-
escaped_self.columns = map(urllib.parse.quote, escaped_self.columns)
4620-
4621-
# In expr, escape column names between backticks
4622-
column_name_to_escaped = {
4623-
col: urllib.parse.quote(col) for col in self.columns
4624-
}
4625-
# A `token` with an odd-number index is a column name
4626-
escaped_expr = "`".join(
4627-
(column_name_to_escaped.get(token, token) if (i % 2) else token)
4628-
for i, token in enumerate(expr.split("`"))
4629-
)
4630-
4631-
# eval
4632-
escaped_res = escaped_self.eval(escaped_expr, **kwargs)
4633-
4634-
# If `res` is a Series or DataFrame, unescape names
4635-
res = escaped_res.copy()
4636-
if isinstance(res, Series) and res.name:
4637-
res.name = urllib.parse.unquote(res.name)
4638-
elif isinstance(res, DataFrame):
4639-
res.columns = map(urllib.parse.unquote, res.columns)
4640-
else:
4641-
res = self.eval(expr, **kwargs)
4614+
res = self.eval(expr, **kwargs)
46424615

46434616
try:
46444617
result = self.loc[res]

pandas/tests/computation/test_eval.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1994,6 +1994,22 @@ def test_query_on_expr_with_comment():
19941994
tm.assert_frame_equal(result, expected)
19951995

19961996

1997+
def test_query_on_expr_with_backticks():
    """Compare a string literal holding a backtick with a backtick-quoted column."""
    # GH 59285
    values = ("`", "`````", "``````````")
    frame = DataFrame(values, columns=["#backticks"])

    queried = frame.query("'`' < `#backticks`")

    # Same comparison written with plain boolean indexing.
    mask = "`" < frame["#backticks"]
    tm.assert_frame_equal(queried, frame[mask])
2003+
2004+
2005+
def test_query_on_expr_with_backticked_string_same_as_column_name():
    """Compare a string literal spelled like the quoted column name with the column."""
    # GH 59285
    values = ("`", "`````", "``````````")
    frame = DataFrame(values, columns=["#backticks"])

    queried = frame.query("'`#backticks`' < `#backticks`")

    # Same comparison written with plain boolean indexing.
    mask = "`#backticks`" < frame["#backticks"]
    tm.assert_frame_equal(queried, frame[mask])
2011+
2012+
19972013
@pytest.mark.parametrize(
19982014
"col1,col2,expr",
19992015
[

0 commit comments

Comments
 (0)