lcompilers · certik · Apr 2, 2022 · Mar 31, 2022 · Apr 1, 2022 · Apr 1, 2022
diff --git a/.gitignore b/.gitignore
@@ -23,6 +23,7 @@ src/lpython/tests/print_integer
 src/lpython/tests/cmp32
 src/lpython/tests/x
 src/lpython/tests/ref_pickle.txt.new
+src/lpython/parser/tokenizer.cpp
 
 ## CMake
 CMakeCache.txt

diff --git a/build0.sh b/build0.sh
@@ -12,3 +12,6 @@ python grammar/asdl_py.py
 python grammar/asdl_cpp.py grammar/Python.asdl src/lpython/python_ast.h
 # Generate a Fortran ASR from ASR.asdl (C++)
 python grammar/asdl_cpp.py grammar/ASR.asdl src/libasr/asr.h
+
+# Generate the tokenizer
+(cd src/lpython/parser && re2c -W -b tokenizer.re -o tokenizer.cpp)
diff --git a/ci/build.xsh b/ci/build.xsh
@@ -33,6 +33,9 @@ python grammar/asdl_cpp.py grammar/Python.asdl src/lpython/python_ast.h
 # Generate a Python AST from Python.asdl (Python)
 python grammar/asdl_py.py
 
+# Generate the tokenizer
+pushd src/lpython/parser && re2c -W -b tokenizer.re -o tokenizer.cpp && popd
+
 $lpython_version=$(cat version).strip()
 $dest="lpython-" + $lpython_version
 bash ci/create_source_tarball0.sh

diff --git a/run_tests.py b/run_tests.py
@@ -78,6 +78,10 @@ def main():
             run_test("cpp", "lpython --no-color --show-cpp {infile}",
                     filename, update_reference, extra_args)
 
+        if tokens:
+            run_test("tokens", "lpython --no-color --show-tokens {infile} -o {outfile}",
+                    filename, update_reference, extra_args)
+
         print()
 
     if list_tests:

diff --git a/src/bin/lpython.cpp b/src/bin/lpython.cpp
@@ -31,6 +31,7 @@
 #include <libasr/string_utils.h>
 #include <lpython/utils.h>
 #include <lpython/python_serialization.h>
+#include <lpython/parser/tokenizer.h>
 
 #include <cpp-terminal/terminal.h>
 #include <cpp-terminal/prompt0.h>
@@ -92,6 +93,38 @@ std::string get_kokkos_dir()
 
 #endif
 
+// Tokenizes the file `infile` and prints one pickled token per line to stdout.
+//
+// infile ............ path of the source file to tokenize
+// line_numbers ...... when true, " <first>:<last>" taken from the token's
+//                     location is appended to every printed token
+// compiler_options .. forwarded to the diagnostics renderer
+//
+// Diagnostics are always rendered to stderr. Returns 0 on success and 1 when
+// the tokenizer reports a failure.
+int emit_tokens(const std::string &infile, bool line_numbers, const CompilerOptions &compiler_options)
+{
+    std::string input = read_file(infile);
+
+    // Src -> Tokens
+    Allocator al(64*1024*1024);
+    std::vector<LFortran::YYSTYPE> stypes;
+    std::vector<LFortran::Location> locations;
+    LFortran::diag::Diagnostics diagnostics;
+    auto res = LFortran::tokens(al, input, diagnostics, &stypes, &locations);
+
+    // Render whatever the tokenizer reported, whether or not it succeeded
+    LFortran::LocationManager lm;
+    lm.in_filename = infile;
+    lm.init_simple(input);
+    std::cerr << diagnostics.render(input, lm, compiler_options);
+
+    if (!res.ok) {
+        LFORTRAN_ASSERT(diagnostics.has_error())
+        return 1;
+    }
+
+    std::vector<int> toks = res.result;
+    for (size_t idx = 0; idx != toks.size(); ++idx) {
+        std::cout << LFortran::pickle_token(toks[idx], stypes[idx]);
+        if (line_numbers) {
+            const auto &loc = locations[idx];
+            std::cout << " " << loc.first << ":" << loc.last;
+        }
+        std::cout << std::endl;
+    }
+    return 0;
+}
+
+
 int emit_ast(const std::string &infile,
     const std::string &runtime_library_dir,
     CompilerOptions &compiler_options)
@@ -470,6 +503,7 @@ int main(int argc, char *argv[])
         std::string arg_o;
         std::vector<std::string> arg_files;
         bool arg_version = false;
+        bool show_tokens = false;
         bool show_ast = false;
         bool show_asr = false;
         bool show_cpp = false;
@@ -520,6 +554,7 @@ int main(int argc, char *argv[])
 
         // LPython specific options
         app.add_flag("--cpp", compiler_options.c_preprocessor, "Enable C preprocessing");
+        app.add_flag("--show-tokens", show_tokens, "Show tokens for the given python file and exit");
         app.add_flag("--show-ast", show_ast, "Show AST for the given python file and exit");
         app.add_flag("--show-asr", show_asr, "Show ASR for the given python file and exit");
         app.add_flag("--show-llvm", show_llvm, "Show LLVM IR for the given file and exit");
@@ -661,6 +696,8 @@ int main(int argc, char *argv[])
             outfile = basename + ".s";
         } else if (arg_c) {
             outfile = basename + ".o";
+        } else if (show_tokens) {
+            outfile = basename + ".tok";
         } else if (show_ast) {
             outfile = basename + ".ast";
         } else if (show_asr) {
@@ -699,6 +736,9 @@ int main(int argc, char *argv[])
             }
             show_asr = true;
         }
+        if (show_tokens) {
+            return emit_tokens(arg_file, true, compiler_options);
+        }
         if (show_ast) {
             return emit_ast(arg_file, runtime_library_dir, compiler_options);
         }

diff --git a/src/libasr/bigint.h b/src/libasr/bigint.h
@@ -0,0 +1,168 @@
+#ifndef LFORTRAN_BIGINT_H
+#define LFORTRAN_BIGINT_H
+
+#include <cstdint>
+
+#include <libasr/containers.h>
+
+namespace LFortran {
+
+namespace BigInt {
+
+/*
+ * Arbitrary size integer implementation.
+ *
+ * We use tagged signed 64bit integers with no padding bits and using 2's
+ * complement for negative values (int64_t) as the underlying data structure.
+ * Little-endian is assumed.
+ *
+ * Bits (from the left):
+ * 1 ..... sign: 0 positive, 1 negative
+ * 2 ..... tag: bits 1-2 equal to 01: pointer; otherwise integer
+ * 3-64 .. if the tag is - integer: rest of the signed integer bits in 2's
+ *                                  complement
+ *                       - pointer: 64 bit pointer shifted by 2
+ *                                  to the right (>> 2)
+ *
+ * The pointer must be aligned to 4 bytes (bits 63-64 must be 00).
+ * Small signed integers are represented directly as integers in int64_t, large
+ * integers are allocated on heap and a pointer to it is used as "tag pointer"
+ * in int64_t.
+ *
+ * To check if the integer has a pointer tag, we check that the first two bits
+ * (1-2) are equal to 01:
+ */
+
+// Tells a tagged pointer apart from an integer: yields true when the two most
+// significant bits of "i" are 01 (the pointer tag) and false otherwise
+inline static bool is_int_ptr(int64_t i) {
+    const uint64_t top_two_bits = static_cast<uint64_t>(i) >> 62;
+    return top_two_bits == 1;
+}
+
+/*
+ * A pointer is converted to integer by shifting by 2 to the right and adding
+ * 01 to the first two bits to tag it as a pointer:
+ */
+
+// Packs the pointer "p" (must be aligned to 4 bytes) into a tagged int64_t:
+// the address is shifted right by 2 and the top two bits are set to 01
+inline static int64_t ptr_to_int(void *p) {
+    const uint64_t pointer_tag = 1ULL << 62;
+    const uint64_t shifted_address = reinterpret_cast<uint64_t>(p) >> 2;
+    return static_cast<int64_t>(shifted_address | pointer_tag);
+}
+
+/* An integer with the pointer tag is converted to a pointer by shifting by 2
+ * to the left, which erases the tag and puts 00 to bits 63-64:
+ */
+
+// Unpacks a tagged int64_t into the pointer it encodes (aligned to 4 bytes);
+// the left shift by 2 drops the tag bits and restores the original address
+inline static void* int_to_ptr(int64_t i) {
+    const uint64_t address = static_cast<uint64_t>(i) << 2;
+    return reinterpret_cast<void *>(address);
+}
+
+/* Largest value stored inline: 2^62-1 (anything larger would have the top
+ * two bits equal to 01, which is the pointer tag)
+ */
+const int64_t MAX_SMALL_INT = static_cast<int64_t>((UINT64_C(1) << 62) - 1);
+
+/* Smallest value stored inline: -2^63 (every negative int64_t has the sign
+ * bit set, so it can never carry the 01 pointer tag)
+ */
+const int64_t MIN_SMALL_INT = INT64_MIN;
+
+// Returns true if "i" lies in the inline range [MIN_SMALL_INT, MAX_SMALL_INT]
+inline static bool is_small_int(int64_t i) {
+    if (i < MIN_SMALL_INT) {
+        return false;
+    }
+    return i <= MAX_SMALL_INT;
+}
+
+/* Arbitrary integer implementation
+ * For now large integers are implemented as strings with decimal digits. The
+ * only supported operation on this is converting to and from a string. Later
+ * we will replace with an actual large integer implementation and add other
+ * operations.
+ */
+
+// Builds a large int from the decimal string "s": a C string copy is obtained
+// from s.c_str(al) and the resulting pointer is returned in tagged form
+inline static int64_t string_to_largeint(Allocator &al, const Str &s) {
+    return ptr_to_int(s.c_str(al));
+}
+
+// Returns the character string that the large int "i" points to; "i" must
+// carry the pointer tag (checked with an assertion)
+inline static char* largeint_to_string(int64_t i) {
+    LFORTRAN_ASSERT(is_int_ptr(i));
+    return static_cast<char *>(int_to_ptr(i));
+}
+
+// Formats the tagged integer "i" as a string: a large int yields the string
+// it points to, a small int is formatted with std::to_string
+inline static std::string int_to_str(int64_t i) {
+    return is_int_ptr(i) ? std::string(largeint_to_string(i))
+                         : std::to_string(i);
+}
+
+// Returns true if the non-negative decimal number spelled by "str_repr" fits
+// into int64_t, i.e. it is at most 9223372036854775807 (2^63-1).
+//
+// "str_repr" is compared purely as text against the digits of 2^63-1, so it
+// is expected to contain decimal digits only, with no sign and no leading
+// zeros. A shorter string always fits and a longer one never does.
+inline static bool is_int64(const std::string &str_repr) {
+    const char max_int64[] = "9223372036854775807";
+    const size_t max_digits = sizeof(max_int64) - 1;
+
+    if( str_repr.size() != max_digits ) {
+        return str_repr.size() < max_digits;
+    }
+
+    // Same number of digits: lexicographic order equals numeric order. All
+    // digits take part in the comparison, including the last one, so that
+    // 9223372036854775808 and 9223372036854775809 are rejected.
+    return str_repr.compare(max_int64) <= 0;
+}
+
+/* BigInt is a thin wrapper over the functionality exposed in the functions
+ * above.  The idea is that one can use the int64_t type directly and just use
+ * the functions above to handle the large integer aspects, and if it is a
+ * small integer, one can use it directly as an int64 integer.
+ *
+ * Alternatively, one can use the BigInt class below that exposes the
+ * functionality via methods.
+ */
+
+// Method-based view of a tagged int64_t (see the free functions above)
+struct BigInt {
+    // The tagged value: either a small integer or a tagged pointer
+    int64_t n;
+
+    // All special members are defaulted; the static_asserts below require the
+    // type to be trivial and standard layout
+    BigInt() = default;
+    BigInt(const BigInt &) = default;
+    BigInt& operator=(const BigInt &) = default;
+
+    // Stores the small integer "i" directly; "i" must be a small int
+    void from_smallint(int64_t i) {
+        LFORTRAN_ASSERT(is_small_int(i));
+        this->n = i;
+    }
+
+    // Stores the large integer given by the string "s"
+    void from_largeint(Allocator &al, const Str &s) {
+        this->n = string_to_largeint(al, s);
+    }
+
+    // Returns true if the stored value carries the pointer tag
+    bool is_large() const {
+        return is_int_ptr(this->n);
+    }
+
+    // Returns the stored value as a plain int64_t; must not be a large int
+    int64_t as_smallint() const {
+        LFORTRAN_ASSERT(!is_int_ptr(this->n));
+        return this->n;
+    }
+
+    // Returns the string representation of the stored value
+    std::string str() const {
+        return int_to_str(this->n);
+    }
+
+};
+
+// BigInt must stay trivial, standard layout and exactly as big as int64_t
+static_assert(std::is_standard_layout<BigInt>::value);
+static_assert(std::is_trivial<BigInt>::value);
+static_assert(sizeof(BigInt) == sizeof(int64_t));
+static_assert(sizeof(BigInt) == 8);
+
+
+} // BigInt
+
+} // LFortran
+
+#endif // LFORTRAN_BIGINT_H
diff --git a/src/lpython/CMakeLists.txt b/src/lpython/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(SRC
+    parser/tokenizer.cpp
     semantics/python_ast_to_asr.cpp
 
     python_evaluator.cpp