Skip to content

Commit 212943c

Browse files
committed
[GR-56700] - Expose Java Tokenizer in _tokenize module
PullRequest: graalpython/3432
2 parents bf3d23d + 4fc712c commit 212943c

File tree

8 files changed

+345
-2
lines changed

8 files changed

+345
-2
lines changed

graalpython/com.oracle.graal.python.test/src/tests/unittest_tags/test_tokenize.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,18 @@
1+
*graalpython.lib-python.3.test.test_tokenize.CTokenizeTest.test_additive
2+
*graalpython.lib-python.3.test.test_tokenize.CTokenizeTest.test_async
3+
*graalpython.lib-python.3.test.test_tokenize.CTokenizeTest.test_comparison
4+
*graalpython.lib-python.3.test.test_tokenize.CTokenizeTest.test_continuation_lines_indentation
5+
*graalpython.lib-python.3.test.test_tokenize.CTokenizeTest.test_float
6+
*graalpython.lib-python.3.test.test_tokenize.CTokenizeTest.test_function
7+
*graalpython.lib-python.3.test.test_tokenize.CTokenizeTest.test_int
8+
*graalpython.lib-python.3.test.test_tokenize.CTokenizeTest.test_invalid_syntax
9+
*graalpython.lib-python.3.test.test_tokenize.CTokenizeTest.test_max_indent
10+
*graalpython.lib-python.3.test.test_tokenize.CTokenizeTest.test_method
11+
*graalpython.lib-python.3.test.test_tokenize.CTokenizeTest.test_multiplicative
12+
*graalpython.lib-python.3.test.test_tokenize.CTokenizeTest.test_selector
13+
*graalpython.lib-python.3.test.test_tokenize.CTokenizeTest.test_string
14+
*graalpython.lib-python.3.test.test_tokenize.CTokenizeTest.test_tabs
15+
*graalpython.lib-python.3.test.test_tokenize.CTokenizeTest.test_unary
116
*graalpython.lib-python.3.test.test_tokenize.CTokenizerBufferTests.test_newline_at_the_end_of_buffer
217
*graalpython.lib-python.3.test.test_tokenize.GenerateTokensTest.test_additive
318
*graalpython.lib-python.3.test.test_tokenize.GenerateTokensTest.test_async

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/Python3Core.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@
112112
import com.oracle.graal.python.builtins.modules.SysModuleBuiltins;
113113
import com.oracle.graal.python.builtins.modules.ThreadModuleBuiltins;
114114
import com.oracle.graal.python.builtins.modules.TimeModuleBuiltins;
115+
import com.oracle.graal.python.builtins.modules.TokenizeModuleBuiltins;
115116
import com.oracle.graal.python.builtins.modules.TracemallocModuleBuiltins;
116117
import com.oracle.graal.python.builtins.modules.UnicodeDataModuleBuiltins;
117118
import com.oracle.graal.python.builtins.modules.WarningsModuleBuiltins;
@@ -348,6 +349,7 @@
348349
import com.oracle.graal.python.builtins.objects.thread.RLockBuiltins;
349350
import com.oracle.graal.python.builtins.objects.thread.ThreadBuiltins;
350351
import com.oracle.graal.python.builtins.objects.thread.ThreadLocalBuiltins;
352+
import com.oracle.graal.python.builtins.objects.tokenize.TokenizerIterBuiltins;
351353
import com.oracle.graal.python.builtins.objects.traceback.TracebackBuiltins;
352354
import com.oracle.graal.python.builtins.objects.tuple.TupleBuiltins;
353355
import com.oracle.graal.python.builtins.objects.tuple.TupleGetterBuiltins;
@@ -779,7 +781,11 @@ private static PythonBuiltins[] initializeBuiltins(boolean nativeAccessAllowed,
779781
new AsyncioModuleBuiltins(),
780782
new AsyncGeneratorBuiltins(),
781783
new AsyncGenSendBuiltins(),
782-
new AsyncGenThrowBuiltins()));
784+
new AsyncGenThrowBuiltins(),
785+
786+
// _tokenize
787+
new TokenizeModuleBuiltins(),
788+
new TokenizerIterBuiltins()));
783789
if (hasProfilerTool) {
784790
builtins.add(new LsprofModuleBuiltins());
785791
builtins.add(LsprofModuleBuiltins.newProfilerBuiltins());

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/PythonBuiltinClassType.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,8 @@ public enum PythonBuiltinClassType implements TruffleObject {
544544

545545
Capsule("capsule"),
546546

547+
PTokenizerIter("TokenizerIter", "_tokenize"),
548+
547549
// A marker for @Builtin that is not a class. Must always come last.
548550
nil("nil");
549551

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
/*
2+
* Copyright (c) 2024, 2024, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* The Universal Permissive License (UPL), Version 1.0
6+
*
7+
* Subject to the condition set forth below, permission is hereby granted to any
8+
* person obtaining a copy of this software, associated documentation and/or
9+
* data (collectively the "Software"), free of charge and under any and all
10+
* copyright rights in the Software, and any and all patent rights owned or
11+
* freely licensable by each licensor hereunder covering either (i) the
12+
* unmodified Software as contributed to or provided by such licensor, or (ii)
13+
* the Larger Works (as defined below), to deal in both
14+
*
15+
* (a) the Software, and
16+
*
17+
* (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
18+
* one is included with the Software each a "Larger Work" to which the Software
19+
* is contributed by such licensors),
20+
*
21+
* without restriction, including without limitation the rights to copy, create
22+
* derivative works of, display, perform, and distribute the Software and make,
23+
* use, sell, offer for sale, import, export, have made, and have sold the
24+
* Software and the Larger Work(s), and to sublicense the foregoing rights on
25+
* either these or other terms.
26+
*
27+
* This license is subject to the following condition:
28+
*
29+
* The above copyright notice and either this complete permission notice or at a
30+
* minimum a reference to the UPL must be included in all copies or substantial
31+
* portions of the Software.
32+
*
33+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39+
* SOFTWARE.
40+
*/
41+
package com.oracle.graal.python.builtins.modules;
42+
43+
import java.util.List;
44+
45+
import com.oracle.graal.python.annotations.ArgumentClinic;
46+
import com.oracle.graal.python.annotations.ArgumentClinic.ClinicConversion;
47+
import com.oracle.graal.python.builtins.Builtin;
48+
import com.oracle.graal.python.builtins.CoreFunctions;
49+
import com.oracle.graal.python.builtins.PythonBuiltinClassType;
50+
import com.oracle.graal.python.builtins.PythonBuiltins;
51+
import com.oracle.graal.python.builtins.modules.TokenizeModuleBuiltinsClinicProviders.TokenizerIterNodeClinicProviderGen;
52+
import com.oracle.graal.python.builtins.objects.tokenize.PTokenizerIter;
53+
import com.oracle.graal.python.nodes.function.PythonBuiltinBaseNode;
54+
import com.oracle.graal.python.nodes.function.builtins.PythonBinaryClinicBuiltinNode;
55+
import com.oracle.graal.python.nodes.function.builtins.clinic.ArgumentClinicProvider;
56+
import com.oracle.graal.python.runtime.object.PythonObjectFactory;
57+
import com.oracle.truffle.api.dsl.Cached;
58+
import com.oracle.truffle.api.dsl.GenerateNodeFactory;
59+
import com.oracle.truffle.api.dsl.NodeFactory;
60+
import com.oracle.truffle.api.dsl.Specialization;
61+
import com.oracle.truffle.api.strings.TruffleString;
62+
63+
@CoreFunctions(defineModule = "_tokenize", isEager = true)
64+
public final class TokenizeModuleBuiltins extends PythonBuiltins {
65+
66+
@Override
67+
protected List<? extends NodeFactory<? extends PythonBuiltinBaseNode>> getNodeFactories() {
68+
return TokenizeModuleBuiltinsFactory.getFactories();
69+
}
70+
71+
@Builtin(name = "TokenizerIter", minNumOfPositionalArgs = 2, parameterNames = {"$cls", "source"}, constructsClass = PythonBuiltinClassType.PTokenizerIter)
72+
@ArgumentClinic(name = "source", conversion = ClinicConversion.TString)
73+
@GenerateNodeFactory
74+
abstract static class TokenizerIterNode extends PythonBinaryClinicBuiltinNode {
75+
76+
@Override
77+
protected ArgumentClinicProvider getArgumentClinic() {
78+
return TokenizerIterNodeClinicProviderGen.INSTANCE;
79+
}
80+
81+
@Specialization
82+
static PTokenizerIter tokenizerIter(Object cls, TruffleString source,
83+
@Cached TruffleString.ToJavaStringNode toJavaStringNode,
84+
@Cached PythonObjectFactory factory) {
85+
return factory.createTokenizerIter(cls, toJavaStringNode.execute(source));
86+
}
87+
}
88+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/*
2+
* Copyright (c) 2024, 2024, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* The Universal Permissive License (UPL), Version 1.0
6+
*
7+
* Subject to the condition set forth below, permission is hereby granted to any
8+
* person obtaining a copy of this software, associated documentation and/or
9+
* data (collectively the "Software"), free of charge and under any and all
10+
* copyright rights in the Software, and any and all patent rights owned or
11+
* freely licensable by each licensor hereunder covering either (i) the
12+
* unmodified Software as contributed to or provided by such licensor, or (ii)
13+
* the Larger Works (as defined below), to deal in both
14+
*
15+
* (a) the Software, and
16+
*
17+
* (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
18+
* one is included with the Software each a "Larger Work" to which the Software
19+
* is contributed by such licensors),
20+
*
21+
* without restriction, including without limitation the rights to copy, create
22+
* derivative works of, display, perform, and distribute the Software and make,
23+
* use, sell, offer for sale, import, export, have made, and have sold the
24+
* Software and the Larger Work(s), and to sublicense the foregoing rights on
25+
* either these or other terms.
26+
*
27+
* This license is subject to the following condition:
28+
*
29+
* The above copyright notice and either this complete permission notice or at a
30+
* minimum a reference to the UPL must be included in all copies or substantial
31+
* portions of the Software.
32+
*
33+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39+
* SOFTWARE.
40+
*/
41+
package com.oracle.graal.python.builtins.objects.tokenize;
42+
43+
import java.util.EnumSet;
44+
45+
import com.oracle.graal.python.PythonLanguage;
46+
import com.oracle.graal.python.builtins.objects.object.PythonBuiltinObject;
47+
import com.oracle.graal.python.compiler.RaisePythonExceptionErrorCallback;
48+
import com.oracle.graal.python.pegparser.ErrorCallback.ErrorType;
49+
import com.oracle.graal.python.pegparser.tokenizer.Token;
50+
import com.oracle.graal.python.pegparser.tokenizer.Token.Kind;
51+
import com.oracle.graal.python.pegparser.tokenizer.Tokenizer;
52+
import com.oracle.graal.python.pegparser.tokenizer.Tokenizer.Flag;
53+
import com.oracle.graal.python.pegparser.tokenizer.Tokenizer.StatusCode;
54+
import com.oracle.graal.python.runtime.PythonOptions;
55+
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
56+
import com.oracle.truffle.api.object.Shape;
57+
import com.oracle.truffle.api.source.Source;
58+
59+
public final class PTokenizerIter extends PythonBuiltinObject {
60+
61+
private final Source source;
62+
private final RaisePythonExceptionErrorCallback errorCallback;
63+
private final Tokenizer tokenizer;
64+
65+
@TruffleBoundary
66+
public PTokenizerIter(Object cls, Shape instanceShape, String sourceString) {
67+
super(cls, instanceShape);
68+
source = Source.newBuilder(PythonLanguage.ID, sourceString, "<string>").build();
69+
errorCallback = new RaisePythonExceptionErrorCallback(source, PythonOptions.isPExceptionWithJavaStacktrace(PythonLanguage.get(null)));
70+
tokenizer = Tokenizer.fromString(errorCallback, sourceString, EnumSet.of(Flag.EXEC_INPUT), null);
71+
}
72+
73+
@TruffleBoundary
74+
Token getNextToken() {
75+
Token token = tokenizer.next();
76+
errorCallback.triggerAndClearDeprecationWarnings();
77+
if (token.type == Kind.ERRORTOKEN && tokenizer.getDone() == StatusCode.SYNTAX_ERROR) {
78+
throw errorCallback.raiseSyntaxError(ErrorType.Syntax, token.sourceRange, (String) token.extraData);
79+
}
80+
return token;
81+
}
82+
83+
@TruffleBoundary
84+
String getTokenString(Token token) {
85+
return token.type == Kind.NEWLINE ? "" : tokenizer.getTokenString(token);
86+
}
87+
88+
@TruffleBoundary
89+
String getLine(Token token) {
90+
return source.getCharacters(token.sourceRange.startLine) + "\n";
91+
}
92+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
/*
2+
* Copyright (c) 2024, 2024, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* The Universal Permissive License (UPL), Version 1.0
6+
*
7+
* Subject to the condition set forth below, permission is hereby granted to any
8+
* person obtaining a copy of this software, associated documentation and/or
9+
* data (collectively the "Software"), free of charge and under any and all
10+
* copyright rights in the Software, and any and all patent rights owned or
11+
* freely licensable by each licensor hereunder covering either (i) the
12+
* unmodified Software as contributed to or provided by such licensor, or (ii)
13+
* the Larger Works (as defined below), to deal in both
14+
*
15+
* (a) the Software, and
16+
*
17+
* (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
18+
* one is included with the Software each a "Larger Work" to which the Software
19+
* is contributed by such licensors),
20+
*
21+
* without restriction, including without limitation the rights to copy, create
22+
* derivative works of, display, perform, and distribute the Software and make,
23+
* use, sell, offer for sale, import, export, have made, and have sold the
24+
* Software and the Larger Work(s), and to sublicense the foregoing rights on
25+
* either these or other terms.
26+
*
27+
* This license is subject to the following condition:
28+
*
29+
* The above copyright notice and either this complete permission notice or at a
30+
* minimum a reference to the UPL must be included in all copies or substantial
31+
* portions of the Software.
32+
*
33+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39+
* SOFTWARE.
40+
*/
41+
package com.oracle.graal.python.builtins.objects.tokenize;
42+
43+
import static com.oracle.graal.python.nodes.SpecialMethodNames.J___ITER__;
44+
import static com.oracle.graal.python.nodes.SpecialMethodNames.J___NEXT__;
45+
import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING;
46+
import static com.oracle.graal.python.util.PythonUtils.tsLiteral;
47+
48+
import java.util.List;
49+
50+
import com.oracle.graal.python.builtins.Builtin;
51+
import com.oracle.graal.python.builtins.CoreFunctions;
52+
import com.oracle.graal.python.builtins.PythonBuiltinClassType;
53+
import com.oracle.graal.python.builtins.PythonBuiltins;
54+
import com.oracle.graal.python.builtins.objects.tuple.PTuple;
55+
import com.oracle.graal.python.nodes.PRaiseNode;
56+
import com.oracle.graal.python.nodes.function.PythonBuiltinBaseNode;
57+
import com.oracle.graal.python.nodes.function.builtins.PythonUnaryBuiltinNode;
58+
import com.oracle.graal.python.pegparser.tokenizer.Token;
59+
import com.oracle.graal.python.pegparser.tokenizer.Token.Kind;
60+
import com.oracle.graal.python.runtime.object.PythonObjectFactory;
61+
import com.oracle.truffle.api.dsl.Bind;
62+
import com.oracle.truffle.api.dsl.Cached;
63+
import com.oracle.truffle.api.dsl.GenerateNodeFactory;
64+
import com.oracle.truffle.api.dsl.NodeFactory;
65+
import com.oracle.truffle.api.dsl.Specialization;
66+
import com.oracle.truffle.api.nodes.Node;
67+
import com.oracle.truffle.api.strings.TruffleString;
68+
69+
@CoreFunctions(extendClasses = PythonBuiltinClassType.PTokenizerIter)
70+
public final class TokenizerIterBuiltins extends PythonBuiltins {
71+
72+
@Override
73+
protected List<? extends NodeFactory<? extends PythonBuiltinBaseNode>> getNodeFactories() {
74+
return TokenizerIterBuiltinsFactory.getFactories();
75+
}
76+
77+
@Builtin(name = J___ITER__, minNumOfPositionalArgs = 1)
78+
@GenerateNodeFactory
79+
abstract static class IterNode extends PythonUnaryBuiltinNode {
80+
@Specialization
81+
static PTokenizerIter iter(PTokenizerIter self) {
82+
return self;
83+
}
84+
}
85+
86+
@Builtin(name = J___NEXT__, minNumOfPositionalArgs = 1)
87+
@GenerateNodeFactory
88+
abstract static class NextNode extends PythonUnaryBuiltinNode {
89+
private static final TruffleString T_EOF = tsLiteral("EOF");
90+
91+
@Specialization
92+
static PTuple next(PTokenizerIter self,
93+
@Bind("this") Node inliningTarget,
94+
@Cached TruffleString.FromJavaStringNode fromJavaStringNode,
95+
@Cached PythonObjectFactory factory,
96+
@Cached PRaiseNode.Lazy raiseNode) {
97+
Token token = self.getNextToken();
98+
if (token.type == Kind.ERRORTOKEN || token.type == Kind.ENDMARKER) {
99+
throw raiseNode.get(inliningTarget).raiseStopIteration(T_EOF);
100+
}
101+
int startColumn;
102+
int endColumn;
103+
if (token.type == Kind.INDENT || token.type == Kind.DEDENT) {
104+
startColumn = -1;
105+
endColumn = -1;
106+
} else {
107+
startColumn = token.sourceRange.startColumn;
108+
endColumn = token.sourceRange.endColumn;
109+
}
110+
if (token.type == Kind.NEWLINE) {
111+
endColumn--;
112+
}
113+
return factory.createTuple(new Object[]{
114+
fromJavaStringNode.execute(self.getTokenString(token), TS_ENCODING),
115+
token.type,
116+
token.sourceRange.startLine,
117+
token.sourceRange.endLine,
118+
startColumn,
119+
endColumn,
120+
fromJavaStringNode.execute(self.getLine(token), TS_ENCODING)
121+
});
122+
}
123+
}
124+
}

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/compiler/RaisePythonExceptionErrorCallback.java

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2022, 2023, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2022, 2024, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* The Universal Permissive License (UPL), Version 1.0
@@ -101,6 +101,10 @@ public void reportIncompleteSource(int line) {
101101

102102
/**
 * Error-callback entry point: reports the error by throwing whatever
 * {@code raiseSyntaxError(errorType, sourceRange, message)} produces, so this method never
 * returns normally.
 *
 * @param errorType kind of error being reported
 * @param sourceRange location the error refers to
 * @param message error message text
 */
@Override
103103
public void onError(ErrorType errorType, SourceRange sourceRange, String message) {
104+
throw raiseSyntaxError(errorType, sourceRange, message);
105+
}
106+
107+
/**
 * Raises the error for the given range and message, using this callback's {@code source} and
 * {@code withJavaStackTrace} settings. The message is converted to a TruffleString and passed to
 * the five-argument {@code raiseSyntaxError} overload.
 *
 * The declared {@code PException} return type lets callers write
 * {@code throw raiseSyntaxError(...)}; the body itself consists of a single {@code throw}, so
 * no value is ever returned.
 *
 * @param errorType kind of error being reported
 * @param sourceRange location the error refers to
 * @param message error message text
 */
public PException raiseSyntaxError(ErrorType errorType, SourceRange sourceRange, String message) {
104108
throw raiseSyntaxError(errorType, sourceRange, toTruffleStringUncached(message), source, withJavaStackTrace);
105109
}
106110

@@ -215,6 +219,13 @@ public void triggerDeprecationWarnings() {
215219
}
216220
}
217221

222+
public void triggerAndClearDeprecationWarnings() {
223+
if (deprecationWarnings != null) {
224+
triggerDeprecationWarningsBoundary();
225+
deprecationWarnings.clear();
226+
}
227+
}
228+
218229
@TruffleBoundary
219230
private void triggerDeprecationWarningsBoundary() {
220231
PythonModule warnings = PythonContext.get(null).lookupBuiltinModule(T__WARNINGS);

0 commit comments

Comments
 (0)