Skip to content

Add initial tokenizer for Python #310

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Apr 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ src/lpython/tests/print_integer
src/lpython/tests/cmp32
src/lpython/tests/x
src/lpython/tests/ref_pickle.txt.new
src/lpython/parser/tokenizer.cpp

## CMake
CMakeCache.txt
Expand Down
3 changes: 3 additions & 0 deletions build0.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ python grammar/asdl_py.py
python grammar/asdl_cpp.py grammar/Python.asdl src/lpython/python_ast.h
# Generate a Fortran ASR from ASR.asdl (C++)
python grammar/asdl_cpp.py grammar/ASR.asdl src/libasr/asr.h

# Generate the tokenizer
(cd src/lpython/parser && re2c -W -b tokenizer.re -o tokenizer.cpp)
3 changes: 3 additions & 0 deletions ci/build.xsh
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ python grammar/asdl_cpp.py grammar/Python.asdl src/lpython/python_ast.h
# Generate a Python AST from Python.asdl (Python)
python grammar/asdl_py.py

# Generate the tokenizer
pushd src/lpython/parser && re2c -W -b tokenizer.re -o tokenizer.cpp && popd

$lpython_version=$(cat version).strip()
$dest="lpython-" + $lpython_version
bash ci/create_source_tarball0.sh
Expand Down
4 changes: 4 additions & 0 deletions run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ def main():
run_test("cpp", "lpython --no-color --show-cpp {infile}",
filename, update_reference, extra_args)

if tokens:
run_test("tokens", "lpython --no-color --show-tokens {infile} -o {outfile}",
filename, update_reference, extra_args)

print()

if list_tests:
Expand Down
40 changes: 40 additions & 0 deletions src/bin/lpython.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <libasr/string_utils.h>
#include <lpython/utils.h>
#include <lpython/python_serialization.h>
#include <lpython/parser/tokenizer.h>

#include <cpp-terminal/terminal.h>
#include <cpp-terminal/prompt0.h>
Expand Down Expand Up @@ -92,6 +93,38 @@ std::string get_kokkos_dir()

#endif

// Tokenizes the file `infile` and prints one pickled token per line to
// standard output.
//
// Diagnostics collected while tokenizing are rendered to standard error
// before the result is inspected, so they are shown on success and on
// failure alike.
//
// When `line_numbers` is true, each token is followed by a space and the
// `first` and `last` fields of its location, separated by a colon.
//
// Returns 0 on success and 1 when tokenization did not succeed.
int emit_tokens(const std::string &infile, bool line_numbers, const CompilerOptions &compiler_options)
{
    std::string input = read_file(infile);

    // Src -> Tokens
    Allocator al(64*1024*1024);
    std::vector<LFortran::YYSTYPE> stypes;
    std::vector<LFortran::Location> locations;
    LFortran::diag::Diagnostics diagnostics;
    auto res = LFortran::tokens(al, input, diagnostics, &stypes, &locations);

    // Render whatever was reported, regardless of the outcome.
    LFortran::LocationManager lm;
    lm.in_filename = infile;
    lm.init_simple(input);
    std::cerr << diagnostics.render(input, lm, compiler_options);

    if (!res.ok) {
        // A failed tokenization is expected to have reported an error.
        LFORTRAN_ASSERT(diagnostics.has_error());
        return 1;
    }

    std::vector<int> toks = res.result;
    const size_t n_tokens = toks.size();
    for (size_t idx = 0; idx != n_tokens; ++idx) {
        // stypes and locations are indexed in step with toks.
        std::cout << LFortran::pickle_token(toks[idx], stypes[idx]);
        if (line_numbers) {
            std::cout << " " << locations[idx].first << ":" << locations[idx].last;
        }
        std::cout << std::endl;
    }
    return 0;
}


int emit_ast(const std::string &infile,
const std::string &runtime_library_dir,
CompilerOptions &compiler_options)
Expand Down Expand Up @@ -470,6 +503,7 @@ int main(int argc, char *argv[])
std::string arg_o;
std::vector<std::string> arg_files;
bool arg_version = false;
bool show_tokens = false;
bool show_ast = false;
bool show_asr = false;
bool show_cpp = false;
Expand Down Expand Up @@ -520,6 +554,7 @@ int main(int argc, char *argv[])

// LPython specific options
app.add_flag("--cpp", compiler_options.c_preprocessor, "Enable C preprocessing");
app.add_flag("--show-tokens", show_tokens, "Show tokens for the given python file and exit");
app.add_flag("--show-ast", show_ast, "Show AST for the given python file and exit");
app.add_flag("--show-asr", show_asr, "Show ASR for the given python file and exit");
app.add_flag("--show-llvm", show_llvm, "Show LLVM IR for the given file and exit");
Expand Down Expand Up @@ -661,6 +696,8 @@ int main(int argc, char *argv[])
outfile = basename + ".s";
} else if (arg_c) {
outfile = basename + ".o";
} else if (show_tokens) {
outfile = basename + ".tok";
} else if (show_ast) {
outfile = basename + ".ast";
} else if (show_asr) {
Expand Down Expand Up @@ -699,6 +736,9 @@ int main(int argc, char *argv[])
}
show_asr = true;
}
if (show_tokens) {
return emit_tokens(arg_file, true, compiler_options);
}
if (show_ast) {
return emit_ast(arg_file, runtime_library_dir, compiler_options);
}
Expand Down
168 changes: 168 additions & 0 deletions src/libasr/bigint.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#ifndef LFORTRAN_BIGINT_H
#define LFORTRAN_BIGINT_H

#include <cstdint>

#include <libasr/containers.h>

namespace LFortran {

namespace BigInt {

/*
* Arbitrary size integer implementation.
*
* We use tagged signed 64bit integers with no padding bits and using 2's
* complement for negative values (int64_t) as the underlying data structure.
* Little-endian is assumed.
*
* Bits (from the left):
* 1 ..... sign: 0 positive, 1 negative
* 2 ..... tag: bits 1-2 equal to 01: pointer; otherwise integer
* 3-64 .. if the tag is - integer: rest of the signed integer bits in 2's
* complement
* - pointer: 64 bit pointer shifted by 2
* to the right (>> 2)
*
* The pointer must be aligned to 4 bytes (bits 63-64 must be 00).
* Small signed integers are represented directly as integers in int64_t; large
* integers are allocated on the heap and a pointer to them is stored as a
* "tagged pointer" in int64_t.
*
* To check if the integer has a pointer tag, we check that the first two bits
* (1-2) are equal to 01:
*/

// Returns true if "i" carries the pointer tag (its two most significant bits
// are 01) and false if "i" is a plain integer.
inline static bool is_int_ptr(int64_t i) {
    const uint64_t tag_mask = 3ULL << 62;     // the two most significant bits
    const uint64_t pointer_tag = 1ULL << 62;  // bit pattern 01 in those bits
    return (static_cast<uint64_t>(i) & tag_mask) == pointer_tag;
}

/*
* A pointer is converted to integer by shifting by 2 to the right and adding
* 01 to the first two bits to tag it as a pointer:
*/

// Converts a pointer "p" (must be aligned to 4 bytes) to a tagged int64_t.
// The address is shifted right by two bits, which drops the two low bits,
// and the pointer tag 01 is written into the two most significant bits.
inline static int64_t ptr_to_int(void *p) {
    const uint64_t address = reinterpret_cast<uint64_t>(p);
    const uint64_t pointer_tag = 1ULL << 62;
    const uint64_t tagged = (address >> 2) | pointer_tag;
    return static_cast<int64_t>(tagged);
}

/* An integer with the pointer tag is converted to a pointer by shifting by 2
* to the left, which erases the tag and puts 00 to bits 63-64:
*/

// Converts a tagged int64_t back to a pointer (aligned to 4 bytes).
// Shifting left by two bits pushes the tag out of the value and leaves the
// two lowest bits of the result set to zero.
inline static void* int_to_ptr(int64_t i) {
    const uint64_t address = static_cast<uint64_t>(i) << 2;
    return reinterpret_cast<void *>(address);
}

/* Largest value stored directly as a small int: 2^62-1. Anything larger has
 * the bit pattern 01 in its two most significant bits.
 */
const int64_t MAX_SMALL_INT = static_cast<int64_t>((1ULL << 62) - 1);

/* Smallest value stored directly as a small int: -2^63, i.e. the minimum of
 * int64_t.
 */
const int64_t MIN_SMALL_INT = INT64_MIN;

// Returns true if "i" lies within [MIN_SMALL_INT, MAX_SMALL_INT]
inline static bool is_small_int(int64_t i) {
    if (i < MIN_SMALL_INT) {
        return false;
    }
    return i <= MAX_SMALL_INT;
}

/* Arbitrary integer implementation
* For now, large integers are implemented as strings of decimal digits. The
* only supported operation on them is converting to and from a string. Later
* we will replace this with an actual large integer implementation and add
* other operations.
*/

// Converts the string "s" to a large int: obtains a C string for "s" through
// the allocator "al" and returns its address as a tagged int64_t.
// NOTE(review): ptr_to_int requires a 4-byte aligned pointer; this relies on
// Str::c_str(al) returning suitably aligned storage -- confirm.
inline static int64_t string_to_largeint(Allocator &al, const Str &s) {
    return ptr_to_int(s.c_str(al));
}

// Converts a large int to a string: "i" must carry the pointer tag (this is
// asserted); the tag is stripped and the resulting pointer is returned as a
// char*.
inline static char* largeint_to_string(int64_t i) {
    LFORTRAN_ASSERT(is_int_ptr(i));
    return static_cast<char *>(int_to_ptr(i));
}

// Returns the decimal text of "i": a plain integer is formatted with
// std::to_string, while a value carrying the pointer tag yields a copy of the
// string it points to.
inline static std::string int_to_str(int64_t i) {
    if (!is_int_ptr(i)) {
        return std::to_string(i);
    }
    const char *digits = largeint_to_string(i);
    return std::string(digits);
}

// Returns true if the decimal string "str_repr" is not greater than the
// largest int64_t value, 9223372036854775807.
//
// The check works on the text only:
//   - a string shorter than 19 characters is accepted (this includes the
//     empty string),
//   - a string longer than 19 characters is rejected,
//   - a string of exactly 19 characters is compared character by character
//     against "9223372036854775807"; for equal-length digit strings this
//     ordering matches the numeric ordering.
//
// NOTE(review): the input is assumed to consist of decimal digits only, with
// no sign and no leading zeros; other input is not validated.
inline static bool is_int64(const std::string &str_repr) {
    static constexpr char int64_max[] = "9223372036854775807";
    constexpr size_t int64_max_len = sizeof(int64_max) - 1;

    if (str_repr.size() != int64_max_len) {
        return str_repr.size() < int64_max_len;
    }

    // Compare every digit, including the last one, so that values such as
    // 9223372036854775808 are correctly rejected.
    return str_repr.compare(int64_max) <= 0;
}

/* BigInt is a thin wrapper over the functionality exposed in the functions
* above. The idea is that one can use the int64_t type directly and just use
* the functions above to handle the large integer aspects, and if it is a small
* integer, one can use it directly as an int64 integer.
*
* Alternatively, one can use the BigInt class below that exposes the
* functionality via methods.
*/

// Method-based view of a tagged int64_t: "n" is either a small integer held
// directly or a tagged pointer to the decimal string of a large integer.
struct BigInt {
    // The tagged value itself; it is the only data member.
    int64_t n;

    // All special members are defaulted so that the type stays trivial (see
    // the static_asserts after the struct).
    BigInt() = default;
    BigInt(const BigInt &) = default;
    BigInt& operator=(const BigInt &) = default;

    // Returns true if "n" carries the pointer tag.
    bool is_large() const {
        return is_int_ptr(this->n);
    }

    // Returns the decimal text of the stored value.
    std::string str() const {
        return int_to_str(this->n);
    }

    // Returns the stored value as a plain int64_t; it must not be large
    // (this is asserted).
    int64_t as_smallint() const {
        LFORTRAN_ASSERT(!is_large());
        return this->n;
    }

    // Stores the small integer "i" directly; "i" must satisfy is_small_int
    // (this is asserted).
    void from_smallint(int64_t i) {
        LFORTRAN_ASSERT(is_small_int(i));
        this->n = i;
    }

    // Stores the large integer whose decimal digits are given by "s", using
    // string_to_largeint with the allocator "al".
    void from_largeint(Allocator &al, const Str &s) {
        this->n = string_to_largeint(al, s);
    }

};

// BigInt must remain a trivial, standard-layout type of exactly the size of
// an int64_t.
static_assert(std::is_standard_layout<BigInt>::value);
static_assert(std::is_trivial<BigInt>::value);
static_assert(sizeof(BigInt) == sizeof(int64_t));
static_assert(sizeof(BigInt) == 8);


} // BigInt

} // LFortran

#endif // LFORTRAN_BIGINT_H
1 change: 1 addition & 0 deletions src/lpython/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
set(SRC
parser/tokenizer.cpp
semantics/python_ast_to_asr.cpp

python_evaluator.cpp
Expand Down
Loading