From d670834c8338845aa68c4fbcb74fff4d66846ec5 Mon Sep 17 00:00:00 2001
From: Cristiano Calcagno <cristianoc@users.noreply.github.com>
Date: Tue, 14 Mar 2023 15:21:18 +0100
Subject: [PATCH] Make intenral encoding of locations aware of unicode

When some unicode characters are present on a line, the existing encoding of positions, based on number of bytes since line start, is incorrect.
This can be seen in e.g. error messages picked up in the editor (or on the command-line).

This PR takes unicode into account. Even thought the ocaml locations are byte-based, one can trick the system by encoding as pos_cnum:
  (number of bytes from file start to line start) + (number of utf16 code units since line start)
Since the compiler's printer performs a subtraction, the utf16 character position is shown.
Notice that editors, vscode in particular, show you something in "Col", but its internal commands expect correct utf16 character which is different.
---
 CHANGELOG.md                                  |  1 +
 .../expected/unicode_location.res.expected    | 12 ++++++
 .../fixtures/unicode_location.res             |  2 +
 res_syntax/src/res_core.ml                    |  2 +
 res_syntax/src/res_parser.ml                  |  2 +
 res_syntax/src/res_scanner.ml                 | 40 +++++++++++++------
 res_syntax/src/res_scanner.mli                |  4 +-
 7 files changed, 49 insertions(+), 14 deletions(-)
 create mode 100644 jscomp/build_tests/super_errors/expected/unicode_location.res.expected
 create mode 100644 jscomp/build_tests/super_errors/fixtures/unicode_location.res

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2616f95612..1154b6b97e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -52,6 +52,7 @@ These are only breaking changes for unformatted code.
 - Parser: fix location of variable when function definition `{v => ...}` is enclosed in braces https://github.com/rescript-lang/rescript-compiler/pull/5949
 - Fix issue with error messages for uncurried functions where expected and given type were swapped https://github.com/rescript-lang/rescript-compiler/pull/5973
 - Fix issue with integer overflow check https://github.com/rescript-lang/rescript-compiler/pull/6028
+- Make internal encoding of locations aware of unicode https://github.com/rescript-lang/rescript-compiler/pull/6073
 
 #### :nail_care: Polish
 
diff --git a/jscomp/build_tests/super_errors/expected/unicode_location.res.expected b/jscomp/build_tests/super_errors/expected/unicode_location.res.expected
new file mode 100644
index 0000000000..e21c17cd39
--- /dev/null
+++ b/jscomp/build_tests/super_errors/expected/unicode_location.res.expected
@@ -0,0 +1,12 @@
+
+  [1;31mWe've found a bug for you![0m
+  [36m/.../fixtures/unicode_location.res[0m:[2m1:43[0m
+
+  [1;31m1[0m [2m│[0m let q = "💩💩💩💩💩💩💩💩�[1;31m�[0m��💩" ++ ("a" ++ 3 ++ "b")
+  2 [2m│[0m //                                         ^ character position 33 + 10 
+    [2m│[0m (unicode symbols of length 2)
+
+  This has type: [1;31mint[0m
+  Somewhere wanted: [1;33mstring[0m
+  
+  You can convert [1;33mint[0m to [1;33mstring[0m with [1;33mBelt.Int.toString[0m.
\ No newline at end of file
diff --git a/jscomp/build_tests/super_errors/fixtures/unicode_location.res b/jscomp/build_tests/super_errors/fixtures/unicode_location.res
new file mode 100644
index 0000000000..e64643ca36
--- /dev/null
+++ b/jscomp/build_tests/super_errors/fixtures/unicode_location.res
@@ -0,0 +1,2 @@
+let q = "💩💩💩💩💩💩💩💩💩💩" ++ ("a" ++ 3 ++ "b")
+//                                         ^ character position 33 + 10 (unicode symbols of length 2)
\ No newline at end of file
diff --git a/res_syntax/src/res_core.ml b/res_syntax/src/res_core.ml
index 6f52e1d606..1e728ccaab 100644
--- a/res_syntax/src/res_core.ml
+++ b/res_syntax/src/res_core.ml
@@ -2469,6 +2469,7 @@ and parseAttributesAndBinding (p : Parser.t) =
   let err = p.scanner.err in
   let ch = p.scanner.ch in
   let offset = p.scanner.offset in
+  let offset16 = p.scanner.offset16 in
   let lineOffset = p.scanner.lineOffset in
   let lnum = p.scanner.lnum in
   let mode = p.scanner.mode in
@@ -2490,6 +2491,7 @@ and parseAttributesAndBinding (p : Parser.t) =
       p.scanner.err <- err;
       p.scanner.ch <- ch;
       p.scanner.offset <- offset;
+      p.scanner.offset16 <- offset16;
       p.scanner.lineOffset <- lineOffset;
       p.scanner.lnum <- lnum;
       p.scanner.mode <- mode;
diff --git a/res_syntax/src/res_parser.ml b/res_syntax/src/res_parser.ml
index 1d10263984..387172a36e 100644
--- a/res_syntax/src/res_parser.ml
+++ b/res_syntax/src/res_parser.ml
@@ -159,6 +159,7 @@ let lookahead p callback =
   let err = p.scanner.err in
   let ch = p.scanner.ch in
   let offset = p.scanner.offset in
+  let offset16 = p.scanner.offset16 in
   let lineOffset = p.scanner.lineOffset in
   let lnum = p.scanner.lnum in
   let mode = p.scanner.mode in
@@ -177,6 +178,7 @@ let lookahead p callback =
   p.scanner.err <- err;
   p.scanner.ch <- ch;
   p.scanner.offset <- offset;
+  p.scanner.offset16 <- offset16;
   p.scanner.lineOffset <- lineOffset;
   p.scanner.lnum <- lnum;
   p.scanner.mode <- mode;
diff --git a/res_syntax/src/res_scanner.ml b/res_syntax/src/res_scanner.ml
index 04a949121c..371711796e 100644
--- a/res_syntax/src/res_scanner.ml
+++ b/res_syntax/src/res_scanner.ml
@@ -19,7 +19,9 @@ type t = {
     Diagnostics.category ->
     unit;
   mutable ch: charEncoding; (* current character *)
-  mutable offset: int; (* character offset *)
+  mutable offset: int; (* current byte offset *)
+  mutable offset16: int;
+      (* current number of utf16 code units since line start *)
   mutable lineOffset: int; (* current line offset *)
   mutable lnum: int; (* current line number *)
   mutable mode: mode list;
@@ -51,12 +53,11 @@ let position scanner =
       (* line number *)
       pos_lnum = scanner.lnum;
       (* offset of the beginning of the line (number
-         of characters between the beginning of the scanner and the beginning
+         of bytes between the beginning of the scanner and the beginning
          of the line) *)
       pos_bol = scanner.lineOffset;
-      (* [pos_cnum] is the offset of the position (number of
-         characters between the beginning of the scanner and the position). *)
-      pos_cnum = scanner.offset;
+      (* [pos_cnum - pos_bol]  is the number of utf16 code units since line start *)
+      pos_cnum = scanner.lineOffset + scanner.offset16;
     }
 
 (* Small debugging util
@@ -95,19 +96,29 @@ let _printDebug ~startPos ~endPos scanner token =
 
 let next scanner =
   let nextOffset = scanner.offset + 1 in
-  (match scanner.ch with
-  | '\n' ->
-    scanner.lineOffset <- nextOffset;
-    scanner.lnum <- scanner.lnum + 1
+  let utf16len =
+    match Ext_utf8.classify scanner.ch with
+    | Single _ | Invalid -> 1
+    | Leading (n, _) -> ( (((n + 1) / 2) [@doesNotRaise]))
+    | Cont _ -> 0
+  in
+  let newline =
+    scanner.ch = '\n'
     (* What about CRLF (\r + \n) on windows?
-     * \r\n will always be terminated by a \n
-     * -> we can just bump the line count on \n *)
-  | _ -> ());
+       \r\n will always be terminated by a \n
+       -> we can just bump the line count on \n *)
+  in
+  if newline then (
+    scanner.lineOffset <- nextOffset;
+    scanner.offset16 <- 0;
+    scanner.lnum <- scanner.lnum + 1)
+  else scanner.offset16 <- scanner.offset16 + utf16len;
   if nextOffset < String.length scanner.src then (
     scanner.offset <- nextOffset;
-    scanner.ch <- String.unsafe_get scanner.src scanner.offset)
+    scanner.ch <- String.unsafe_get scanner.src nextOffset)
   else (
     scanner.offset <- String.length scanner.src;
+    scanner.offset16 <- scanner.offset - scanner.lineOffset;
     scanner.ch <- hackyEOFChar)
 
 let next2 scanner =
@@ -141,6 +152,7 @@ let make ~filename src =
     err = (fun ~startPos:_ ~endPos:_ _ -> ());
     ch = (if src = "" then hackyEOFChar else String.unsafe_get src 0);
     offset = 0;
+    offset16 = 0;
     lineOffset = 0;
     lnum = 1;
     mode = [];
@@ -847,6 +859,7 @@ let rec scan scanner =
       | ch, _ ->
         next scanner;
         let offset = scanner.offset in
+        let offset16 = scanner.offset16 in
         let codepoint, length =
           Res_utf8.decodeCodePoint scanner.offset scanner.src
             (String.length scanner.src)
@@ -863,6 +876,7 @@ let rec scan scanner =
         else (
           scanner.ch <- ch;
           scanner.offset <- offset;
+          scanner.offset16 <- offset16;
           SingleQuote))
     | '!' -> (
       match (peek scanner, peek2 scanner) with
diff --git a/res_syntax/src/res_scanner.mli b/res_syntax/src/res_scanner.mli
index 0b53a1c71f..cc002699fd 100644
--- a/res_syntax/src/res_scanner.mli
+++ b/res_syntax/src/res_scanner.mli
@@ -11,7 +11,9 @@ type t = {
     Res_diagnostics.category ->
     unit;
   mutable ch: charEncoding; (* current character *)
-  mutable offset: int; (* character offset *)
+  mutable offset: int; (* current byte offset *)
+  mutable offset16: int;
+      (* current number of utf16 code units since line start *)
   mutable lineOffset: int; (* current line offset *)
   mutable lnum: int; (* current line number *)
   mutable mode: mode list;