Skip to content

Commit 44b247e

Browse files
authored
Merge pull request #310 from certik/tokenizer
Add initial tokenizer for Python
2 parents ea28c90 + 896164e commit 44b247e

28 files changed

+2250
-165
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ src/lpython/tests/print_integer
2323
src/lpython/tests/cmp32
2424
src/lpython/tests/x
2525
src/lpython/tests/ref_pickle.txt.new
26+
src/lpython/parser/tokenizer.cpp
2627

2728
## CMake
2829
CMakeCache.txt

build0.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,6 @@ python grammar/asdl_py.py
1212
python grammar/asdl_cpp.py grammar/Python.asdl src/lpython/python_ast.h
1313
# Generate a Fortran ASR from ASR.asdl (C++)
1414
python grammar/asdl_cpp.py grammar/ASR.asdl src/libasr/asr.h
15+
16+
# Generate the tokenizer
17+
(cd src/lpython/parser && re2c -W -b tokenizer.re -o tokenizer.cpp)

ci/build.xsh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ python grammar/asdl_cpp.py grammar/Python.asdl src/lpython/python_ast.h
3333
# Generate a Python AST from Python.asdl (Python)
3434
python grammar/asdl_py.py
3535

36+
# Generate the tokenizer
37+
pushd src/lpython/parser && re2c -W -b tokenizer.re -o tokenizer.cpp && popd
38+
3639
$lpython_version=$(cat version).strip()
3740
$dest="lpython-" + $lpython_version
3841
bash ci/create_source_tarball0.sh

run_tests.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,10 @@ def main():
7878
run_test("cpp", "lpython --no-color --show-cpp {infile}",
7979
filename, update_reference, extra_args)
8080

81+
if tokens:
82+
run_test("tokens", "lpython --no-color --show-tokens {infile} -o {outfile}",
83+
filename, update_reference, extra_args)
84+
8185
print()
8286

8387
if list_tests:

src/bin/lpython.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include <libasr/string_utils.h>
3232
#include <lpython/utils.h>
3333
#include <lpython/python_serialization.h>
34+
#include <lpython/parser/tokenizer.h>
3435

3536
#include <cpp-terminal/terminal.h>
3637
#include <cpp-terminal/prompt0.h>
@@ -92,6 +93,38 @@ std::string get_kokkos_dir()
9293

9394
#endif
9495

96+
int emit_tokens(const std::string &infile, bool line_numbers, const CompilerOptions &compiler_options)
97+
{
98+
std::string input = read_file(infile);
99+
// Src -> Tokens
100+
Allocator al(64*1024*1024);
101+
std::vector<int> toks;
102+
std::vector<LFortran::YYSTYPE> stypes;
103+
std::vector<LFortran::Location> locations;
104+
LFortran::diag::Diagnostics diagnostics;
105+
auto res = LFortran::tokens(al, input, diagnostics, &stypes, &locations);
106+
LFortran::LocationManager lm;
107+
lm.in_filename = infile;
108+
lm.init_simple(input);
109+
std::cerr << diagnostics.render(input, lm, compiler_options);
110+
if (res.ok) {
111+
toks = res.result;
112+
} else {
113+
LFORTRAN_ASSERT(diagnostics.has_error())
114+
return 1;
115+
}
116+
117+
for (size_t i=0; i < toks.size(); i++) {
118+
std::cout << LFortran::pickle_token(toks[i], stypes[i]);
119+
if (line_numbers) {
120+
std::cout << " " << locations[i].first << ":" << locations[i].last;
121+
}
122+
std::cout << std::endl;
123+
}
124+
return 0;
125+
}
126+
127+
95128
int emit_ast(const std::string &infile,
96129
const std::string &runtime_library_dir,
97130
CompilerOptions &compiler_options)
@@ -470,6 +503,7 @@ int main(int argc, char *argv[])
470503
std::string arg_o;
471504
std::vector<std::string> arg_files;
472505
bool arg_version = false;
506+
bool show_tokens = false;
473507
bool show_ast = false;
474508
bool show_asr = false;
475509
bool show_cpp = false;
@@ -520,6 +554,7 @@ int main(int argc, char *argv[])
520554

521555
// LPython specific options
522556
app.add_flag("--cpp", compiler_options.c_preprocessor, "Enable C preprocessing");
557+
app.add_flag("--show-tokens", show_tokens, "Show tokens for the given python file and exit");
523558
app.add_flag("--show-ast", show_ast, "Show AST for the given python file and exit");
524559
app.add_flag("--show-asr", show_asr, "Show ASR for the given python file and exit");
525560
app.add_flag("--show-llvm", show_llvm, "Show LLVM IR for the given file and exit");
@@ -661,6 +696,8 @@ int main(int argc, char *argv[])
661696
outfile = basename + ".s";
662697
} else if (arg_c) {
663698
outfile = basename + ".o";
699+
} else if (show_tokens) {
700+
outfile = basename + ".tok";
664701
} else if (show_ast) {
665702
outfile = basename + ".ast";
666703
} else if (show_asr) {
@@ -699,6 +736,9 @@ int main(int argc, char *argv[])
699736
}
700737
show_asr = true;
701738
}
739+
if (show_tokens) {
740+
return emit_tokens(arg_file, true, compiler_options);
741+
}
702742
if (show_ast) {
703743
return emit_ast(arg_file, runtime_library_dir, compiler_options);
704744
}

src/libasr/bigint.h

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
#ifndef LFORTRAN_BIGINT_H
2+
#define LFORTRAN_BIGINT_H
3+
4+
#include <cstdint>
5+
6+
#include <libasr/containers.h>
7+
8+
namespace LFortran {
9+
10+
namespace BigInt {
11+
12+
/*
13+
* Arbitrary size integer implementation.
14+
*
15+
* We use tagged signed 64bit integers with no padding bits and using 2's
16+
* complement for negative values (int64_t) as the underlying data structure.
17+
* Little-endian is assumed.
18+
*
19+
* Bits (from the left):
20+
* 1 ..... sign: 0 positive, 1 negative
21+
* 2 ..... tag: bits 1-2 equal to 01: pointer; otherwise integer
22+
* 3-64 .. if the tag is - integer: rest of the signed integer bits in 2's
23+
* complement
24+
* - pointer: 64 bit pointer shifted by 2
25+
* to the right (>> 2)
26+
*
27+
* The pointer must be aligned to 4 bytes (bits 63-64 must be 00).
28+
* Small signed integers are represented directly as integers in int64_t, large
29+
* integers are allocated on heap and a pointer to it is used as "tag pointer"
30+
* in int64_t.
31+
*
32+
* To check if the integer has a pointer tag, we check that the first two bits
33+
* (1-2) are equal to 01:
34+
*/
35+
36+
// Reports whether the tagged value "i" carries the pointer tag (true) or is
// an inline integer (false). The pointer tag is the two most significant
// bits being exactly 01.
inline static bool is_int_ptr(int64_t i) {
    const uint64_t top_two_bits = static_cast<uint64_t>(i) >> 62;
    return top_two_bits == 1;
}
40+
41+
/*
42+
* A pointer is converted to integer by shifting by 2 to the right and adding
43+
* 01 to the first two bits to tag it as a pointer:
44+
*/
45+
46+
// Packs the pointer "p" into a tagged int64_t. The address is shifted right
// by two bits (so "p" must be aligned to 4 bytes, otherwise the low bits are
// lost) and the pointer tag 01 is written into the two top bits.
inline static int64_t ptr_to_int(void *p) {
    const uint64_t pointer_tag = 1ULL << 62;
    const uint64_t shifted_address = reinterpret_cast<uint64_t>(p) >> 2;
    return static_cast<int64_t>(shifted_address | pointer_tag);
}
50+
51+
/* An integer with the pointer tag is converted to a pointer by shifting by 2
52+
* to the left, which erases the tag and puts 00 to bits 63-64:
53+
*/
54+
55+
// Unpacks a pointer-tagged int64_t back into the pointer it encodes. The
// left shift by two drops the tag bits and leaves the two lowest address
// bits zero, so the result is aligned to 4 bytes.
inline static void* int_to_ptr(int64_t i) {
    const uint64_t address = static_cast<uint64_t>(i) << 2;
    return reinterpret_cast<void *>(address);
}
59+
60+
/* Largest value stored inline: 2^62-1. Anything larger would have the two
 * top bits equal to 01, which is the pointer tag.
 */
const int64_t MAX_SMALL_INT = (INT64_C(1) << 62) - 1;

/* Smallest value stored inline: -2^63, i.e. the smallest int64_t.
 */
const int64_t MIN_SMALL_INT = INT64_MIN;

// Returns true if "i" lies in the closed range [MIN_SMALL_INT, MAX_SMALL_INT]
inline static bool is_small_int(int64_t i) {
    return !(i < MIN_SMALL_INT || MAX_SMALL_INT < i);
}
72+
73+
/* Arbitrary integer implementation
74+
* For now large integers are implemented as strings with decimal digits. The
75+
* only supported operation on this is converting to and from a string. Later
76+
* we will replace with an actual large integer implementation and add other
77+
* operations.
78+
*/
79+
80+
// Converts a string to a large int (allocated on heap, returns a pointer)
81+
inline static int64_t string_to_largeint(Allocator &al, const Str &s) {
82+
char *cs = s.c_str(al);
83+
return ptr_to_int(cs);
84+
}
85+
86+
// Converts a large int to a string
87+
inline static char* largeint_to_string(int64_t i) {
88+
LFORTRAN_ASSERT(is_int_ptr(i));
89+
void *p = int_to_ptr(i);
90+
char *cs = (char*)p;
91+
return cs;
92+
}
93+
94+
inline static std::string int_to_str(int64_t i) {
95+
if (is_int_ptr(i)) {
96+
return std::string(largeint_to_string(i));
97+
} else {
98+
return std::to_string(i);
99+
}
100+
}
101+
102+
// Returns true if the decimal digit string "str_repr" denotes a value that
// fits into int64_t, i.e. is at most 9223372036854775807 (2^63-1).
//
// The check is purely textual: a string with fewer digits is accepted, one
// with more digits is rejected, and for equal length the digits are compared
// lexicographically (which matches numeric order for digit strings of the
// same length).
// NOTE(review): assumes an unsigned run of ASCII digits with no sign and no
// leading zeros -- confirm against the callers.
inline static bool is_int64(const std::string &str_repr) {
    static constexpr char int64_max_digits[] = "9223372036854775807";
    constexpr std::string::size_type max_len = sizeof(int64_max_digits) - 1;

    if (str_repr.size() != max_len) {
        return str_repr.size() < max_len;
    }
    // Compare all digits, including the last one: stopping one digit early
    // would accept "9223372036854775808" and "9223372036854775809", which do
    // not fit into int64_t.
    return str_repr.compare(int64_max_digits) <= 0;
}
117+
118+
/* BigInt is a thin wrapper over the functionality exposed in the functions
119+
* above. The idea is that one can use the int64_t type directly and just use
120+
* the function above to handle the large integer aspects, and if it is a small
121+
* integer, one can use it directly as int64 integer.
122+
*
123+
* Alternatively, one can use the BigInt class below that exposes the
124+
* functionality via methods.
125+
*/
126+
127+
struct BigInt {
128+
int64_t n;
129+
130+
BigInt() = default;
131+
BigInt(const BigInt &) = default;
132+
BigInt& operator=(const BigInt &) = default;
133+
134+
void from_smallint(int64_t i) {
135+
LFORTRAN_ASSERT(is_small_int(i));
136+
n = i;
137+
}
138+
139+
void from_largeint(Allocator &al, const Str &s) {
140+
n = string_to_largeint(al, s);
141+
}
142+
143+
bool is_large() const {
144+
return is_int_ptr(n);
145+
}
146+
147+
int64_t as_smallint() const {
148+
LFORTRAN_ASSERT(!is_large());
149+
return n;
150+
}
151+
152+
std::string str() const {
153+
return int_to_str(n);
154+
}
155+
156+
};
157+
158+
static_assert(std::is_standard_layout<BigInt>::value);
159+
static_assert(std::is_trivial<BigInt>::value);
160+
static_assert(sizeof(BigInt) == sizeof(int64_t));
161+
static_assert(sizeof(BigInt) == 8);
162+
163+
164+
} // BigInt
165+
166+
} // LFortran
167+
168+
#endif // LFORTRAN_BIGINT_H

src/lpython/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
set(SRC
2+
parser/tokenizer.cpp
23
semantics/python_ast_to_asr.cpp
34

45
python_evaluator.cpp

0 commit comments

Comments
 (0)