diff --git a/CHANGELOG.md b/CHANGELOG.md index 2616f95612..1154b6b97e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,6 +52,7 @@ These are only breaking changes for unformatted code. - Parser: fix location of variable when function definition `{v => ...}` is enclosed in braces https://github.com/rescript-lang/rescript-compiler/pull/5949 - Fix issue with error messages for uncurried functions where expected and given type were swapped https://github.com/rescript-lang/rescript-compiler/pull/5973 - Fix issue with integer overflow check https://github.com/rescript-lang/rescript-compiler/pull/6028 +- Make internal encoding of locations aware of unicode https://github.com/rescript-lang/rescript-compiler/pull/6073 #### :nail_care: Polish diff --git a/jscomp/build_tests/super_errors/expected/unicode_location.res.expected b/jscomp/build_tests/super_errors/expected/unicode_location.res.expected new file mode 100644 index 0000000000..e21c17cd39 --- /dev/null +++ b/jscomp/build_tests/super_errors/expected/unicode_location.res.expected @@ -0,0 +1,12 @@ + + We've found a bug for you! + /.../fixtures/unicode_location.res:1:43 + + 1 │ let q = "💩💩💩💩💩💩💩💩����💩" ++ ("a" ++ 3 ++ "b") + 2 │ // ^ character position 33 + 10 + │ (unicode symbols of length 2) + + This has type: int + Somewhere wanted: string + + You can convert int to string with Belt.Int.toString. \ No newline at end of file diff --git a/jscomp/build_tests/super_errors/fixtures/unicode_location.res b/jscomp/build_tests/super_errors/fixtures/unicode_location.res new file mode 100644 index 0000000000..e64643ca36 --- /dev/null +++ b/jscomp/build_tests/super_errors/fixtures/unicode_location.res @@ -0,0 +1,2 @@ +let q = "💩💩💩💩💩💩💩💩💩💩" ++ ("a" ++ 3 ++ "b") +// ^ character position 33 + 10 (unicode symbols of length 2) \ No newline at end of file diff --git a/res_syntax/src/res_core.ml b/res_syntax/src/res_core.ml index 6f52e1d606..1e728ccaab 100644 --- a/res_syntax/src/res_core.ml +++ b/res_syntax/src/res_core.ml @@ -2469,6 +2469,7 @@ and parseAttributesAndBinding (p : Parser.t) = let err = p.scanner.err in let ch = p.scanner.ch in let offset = p.scanner.offset in + let offset16 = p.scanner.offset16 in let lineOffset = p.scanner.lineOffset in let lnum = p.scanner.lnum in let mode = p.scanner.mode in @@ -2490,6 +2491,7 @@ and parseAttributesAndBinding (p : Parser.t) = p.scanner.err <- err; p.scanner.ch <- ch; p.scanner.offset <- offset; + p.scanner.offset16 <- offset16; p.scanner.lineOffset <- lineOffset; p.scanner.lnum <- lnum; p.scanner.mode <- mode; diff --git a/res_syntax/src/res_parser.ml b/res_syntax/src/res_parser.ml index 1d10263984..387172a36e 100644 --- a/res_syntax/src/res_parser.ml +++ b/res_syntax/src/res_parser.ml @@ -159,6 +159,7 @@ let lookahead p callback = let err = p.scanner.err in let ch = p.scanner.ch in let offset = p.scanner.offset in + let offset16 = p.scanner.offset16 in let lineOffset = p.scanner.lineOffset in let lnum = p.scanner.lnum in let mode = p.scanner.mode in @@ -177,6 +178,7 @@ let lookahead p callback = p.scanner.err <- err; p.scanner.ch <- ch; p.scanner.offset <- offset; + p.scanner.offset16 <- offset16; p.scanner.lineOffset <- lineOffset; p.scanner.lnum <- lnum; p.scanner.mode <- mode; diff --git a/res_syntax/src/res_scanner.ml b/res_syntax/src/res_scanner.ml index 04a949121c..371711796e 100644 --- a/res_syntax/src/res_scanner.ml +++ b/res_syntax/src/res_scanner.ml @@ -19,7 +19,9 @@ type t = { Diagnostics.category -> unit; mutable ch: charEncoding; (* current character *) - mutable offset: int; (* character offset *) + mutable offset: int; (* current byte offset *) + mutable offset16: int; + (* current number of utf16 code units since line start *) mutable lineOffset: int; (* current line offset *) mutable lnum: int; (* current line number *) mutable mode: mode list; @@ -51,12 +53,11 @@ let position scanner = (* line number *) pos_lnum = scanner.lnum; (* offset of the beginning of the line (number - of characters between the beginning of the scanner and the beginning + of bytes between the beginning of the scanner and the beginning of the line) *) pos_bol = scanner.lineOffset; - (* [pos_cnum] is the offset of the position (number of - characters between the beginning of the scanner and the position). *) - pos_cnum = scanner.offset; + (* [pos_cnum - pos_bol] is the number of utf16 code units since line start *) + pos_cnum = scanner.lineOffset + scanner.offset16; } (* Small debugging util @@ -95,19 +96,29 @@ let _printDebug ~startPos ~endPos scanner token = let next scanner = let nextOffset = scanner.offset + 1 in - (match scanner.ch with - | '\n' -> - scanner.lineOffset <- nextOffset; - scanner.lnum <- scanner.lnum + 1 + let utf16len = + match Ext_utf8.classify scanner.ch with + | Single _ | Invalid -> 1 + | Leading (n, _) -> ( (((n + 1) / 2) [@doesNotRaise])) + | Cont _ -> 0 + in + let newline = + scanner.ch = '\n' (* What about CRLF (\r + \n) on windows? - * \r\n will always be terminated by a \n - * -> we can just bump the line count on \n *) - | _ -> ()); + \r\n will always be terminated by a \n + -> we can just bump the line count on \n *) + in + if newline then ( + scanner.lineOffset <- nextOffset; + scanner.offset16 <- 0; + scanner.lnum <- scanner.lnum + 1) + else scanner.offset16 <- scanner.offset16 + utf16len; if nextOffset < String.length scanner.src then ( scanner.offset <- nextOffset; - scanner.ch <- String.unsafe_get scanner.src scanner.offset) + scanner.ch <- String.unsafe_get scanner.src nextOffset) else ( scanner.offset <- String.length scanner.src; + scanner.offset16 <- scanner.offset - scanner.lineOffset; scanner.ch <- hackyEOFChar) let next2 scanner = @@ -141,6 +152,7 @@ let make ~filename src = err = (fun ~startPos:_ ~endPos:_ _ -> ()); ch = (if src = "" then hackyEOFChar else String.unsafe_get src 0); offset = 0; + offset16 = 0; lineOffset = 0; lnum = 1; mode = []; @@ -847,6 +859,7 @@ let rec scan scanner = | ch, _ -> next scanner; let offset = scanner.offset in + let offset16 = scanner.offset16 in let codepoint, length = Res_utf8.decodeCodePoint scanner.offset scanner.src (String.length scanner.src) @@ -863,6 +876,7 @@ let rec scan scanner = else ( scanner.ch <- ch; scanner.offset <- offset; + scanner.offset16 <- offset16; SingleQuote)) | '!' -> ( match (peek scanner, peek2 scanner) with diff --git a/res_syntax/src/res_scanner.mli b/res_syntax/src/res_scanner.mli index 0b53a1c71f..cc002699fd 100644 --- a/res_syntax/src/res_scanner.mli +++ b/res_syntax/src/res_scanner.mli @@ -11,7 +11,9 @@ type t = { Res_diagnostics.category -> unit; mutable ch: charEncoding; (* current character *) - mutable offset: int; (* character offset *) + mutable offset: int; (* current byte offset *) + mutable offset16: int; + (* current number of utf16 code units since line start *) mutable lineOffset: int; (* current line offset *) mutable lnum: int; (* current line number *) mutable mode: mode list;