From b562eaead25b3e0726410cc00ade2ab5a97c10aa Mon Sep 17 00:00:00 2001 From: Gusted Date: Sat, 26 Feb 2022 01:48:19 +0100 Subject: [PATCH 1/6] Don't treat BOM escape sequence as hidden character. - BOM sequence is a common non-harmful escape sequence, it shouldn't be shown as hidden character. - Follows GitHub's behavior. - Resolves #18837 --- modules/charset/escape.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/charset/escape.go b/modules/charset/escape.go index d2e8fb0d87067..078b11852c354 100644 --- a/modules/charset/escape.go +++ b/modules/charset/escape.go @@ -144,7 +144,8 @@ readingloop: return } writePos = i + size - case unicode.Is(unicode.C, r): + // 65279 == BOM rune. + case unicode.Is(unicode.C, r) && r != rune(65279): escaped.Escaped = true escaped.HasControls = true if writePos < i { From dc2b17435e771eb27ee28ca23f381e8e5c97617e Mon Sep 17 00:00:00 2001 From: Gusted Date: Sat, 26 Feb 2022 10:46:25 +0100 Subject: [PATCH 2/6] Add test --- modules/charset/escape_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modules/charset/escape_test.go b/modules/charset/escape_test.go index 1804381413b38..b89a2472b9a94 100644 --- a/modules/charset/escape_test.go +++ b/modules/charset/escape_test.go @@ -129,6 +129,12 @@ then resh (ר), and finally heh (ה) (which should appear leftmost).`, "\n" + `if access_level != "user` + "\u202e" + ` ` + "\u2066" + `// Check if admin` + "\u2069" + ` ` + "\u2066" + `" {` + "\n", status: EscapeStatus{Escaped: true, HasBIDI: true, BadBIDI: true, HasLTRScript: true, HasRTLScript: true}, }, + { + name: "BOM encoding UTF-8", + text: string([]byte{'\xef', '\xbb', '\xbf'}), + result: string([]byte{'\xef', '\xbb', '\xbf'}), + status: EscapeStatus{}, + }, } func TestEscapeControlString(t *testing.T) { From 81d2ced22abac3d23d8d42db1a8d208bc5a4f7ee Mon Sep 17 00:00:00 2001 From: Gusted Date: Sat, 26 Feb 2022 15:08:44 +0100 Subject: [PATCH 3/6] Add UTF-16 test case --- modules/charset/escape.go | 4
++-- modules/charset/escape_test.go | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/modules/charset/escape.go b/modules/charset/escape.go index 078b11852c354..01064be51b0ab 100644 --- a/modules/charset/escape.go +++ b/modules/charset/escape.go @@ -69,7 +69,6 @@ func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus lineHasBIDI := false lineHasRTLScript := false lineHasLTRScript := false - readingloop: for err == nil { n, err = text.Read(buf[readStart:]) @@ -79,6 +78,7 @@ readingloop: for i < len(bs) { r, size := utf8.DecodeRune(bs[i:]) + // Now handle the codepoints switch { case r == utf8.RuneError: @@ -145,7 +145,7 @@ readingloop: } writePos = i + size // 65279 == BOM rune. - case unicode.Is(unicode.C, r) && r != rune(65279): + case r != rune(65279) && unicode.Is(unicode.C, r): escaped.Escaped = true escaped.HasControls = true if writePos < i { diff --git a/modules/charset/escape_test.go b/modules/charset/escape_test.go index b89a2472b9a94..9d793a4986fef 100644 --- a/modules/charset/escape_test.go +++ b/modules/charset/escape_test.go @@ -135,6 +135,14 @@ then resh (ר), and finally heh (ה) (which should appear leftmost).`, result: string([]byte{'\xef', '\xbb', '\xbf'}), status: EscapeStatus{}, }, + { + name: "BOM encoding UTF-16", + text: string([]byte{239, 187, 191, 228, 189, 160, 229, 165, 189, 239, 188, 140, 228, 184, 150, 231, 149, 140, 10, 104, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 10}), + result: string([]byte{239, 187, 191, 228, 189, 160, 229, 165, 189, 239, 188, 140, 228, 184, 150, 231, 149, 140, 10, 104, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 10}), + status: EscapeStatus{ + HasLTRScript: true, + }, + }, } func TestEscapeControlString(t *testing.T) { From cec21e6dbb7e41b71155490f6a3a8cc2d51ae1b2 Mon Sep 17 00:00:00 2001 From: Gusted Date: Sat, 26 Feb 2022 15:10:41 +0100 Subject: [PATCH 4/6] Fix newlines --- modules/charset/escape.go | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/modules/charset/escape.go b/modules/charset/escape.go index 01064be51b0ab..54b152e554925 100644 --- a/modules/charset/escape.go +++ b/modules/charset/escape.go @@ -69,6 +69,7 @@ func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus lineHasBIDI := false lineHasRTLScript := false lineHasLTRScript := false + readingloop: for err == nil { n, err = text.Read(buf[readStart:]) @@ -78,7 +79,6 @@ readingloop: for i < len(bs) { r, size := utf8.DecodeRune(bs[i:]) - // Now handle the codepoints switch { case r == utf8.RuneError: From b5065b70318f9263a4df19a3b2a5b685fffd1b46 Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Sat, 26 Feb 2022 22:54:57 +0800 Subject: [PATCH 5/6] refactor --- modules/charset/escape.go | 8 ++++++-- modules/charset/escape_test.go | 34 ++++++++++++++++++---------------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/modules/charset/escape.go b/modules/charset/escape.go index 54b152e554925..20a4bb2a104d2 100644 --- a/modules/charset/escape.go +++ b/modules/charset/escape.go @@ -63,6 +63,7 @@ func EscapeControlBytes(text []byte) (EscapeStatus, []byte) { func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) { buf := make([]byte, 4096) readStart := 0 + runeCount := 0 var n int var writePos int @@ -79,6 +80,8 @@ readingloop: for i < len(bs) { r, size := utf8.DecodeRune(bs[i:]) + runeCount++ + // Now handle the codepoints switch { case r == utf8.RuneError: @@ -113,6 +116,8 @@ readingloop: lineHasRTLScript = false lineHasLTRScript = false + case runeCount == 1 && r == 0xFEFF: // UTF BOM + // the first BOM is safe case r == '\r' || r == '\t' || r == ' ': // These are acceptable control characters and space characters case unicode.IsSpace(r): @@ -144,8 +149,7 @@ readingloop: return } writePos = i + size - // 65279 == BOM rune. 
- case r != rune(65279) && unicode.Is(unicode.C, r): + case unicode.Is(unicode.C, r): escaped.Escaped = true escaped.HasControls = true if writePos < i { diff --git a/modules/charset/escape_test.go b/modules/charset/escape_test.go index 9d793a4986fef..735a2dba709f6 100644 --- a/modules/charset/escape_test.go +++ b/modules/charset/escape_test.go @@ -130,18 +130,12 @@ then resh (ר), and finally heh (ה) (which should appear leftmost).`, status: EscapeStatus{Escaped: true, HasBIDI: true, BadBIDI: true, HasLTRScript: true, HasRTLScript: true}, }, { - name: "BOM encoding UTF-8", - text: string([]byte{'\xef', '\xbb', '\xbf'}), - result: string([]byte{'\xef', '\xbb', '\xbf'}), - status: EscapeStatus{}, - }, - { - name: "BOM encoding UTF-16", - text: string([]byte{239, 187, 191, 228, 189, 160, 229, 165, 189, 239, 188, 140, 228, 184, 150, 231, 149, 140, 10, 104, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 10}), - result: string([]byte{239, 187, 191, 228, 189, 160, 229, 165, 189, 239, 188, 140, 228, 184, 150, 231, 149, 140, 10, 104, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 10}), - status: EscapeStatus{ - HasLTRScript: true, - }, + // UTF-8/16/32 all use the same codepoint for BOM + // Gitea could read UTF-16/32 content and convert into UTF-8 internally then render it, so we only process UTF-8 internally + name: "UTF BOM", + text: "\xef\xbb\xbftest", + result: "\xef\xbb\xbftest", + status: EscapeStatus{HasLTRScript: true}, }, } @@ -177,10 +171,18 @@ func TestEscapeControlReader(t *testing.T) { // lets add some control characters to the tests tests := make([]escapeControlTest, 0, len(escapeControlTests)*3) copy(tests, escapeControlTests) + + // if there is a BOM, we should keep the BOM + addPrefix := func(prefix string, s string) string { + if strings.HasPrefix(s, "\xef\xbb\xbf") { + return s[:3] + prefix + s[3:] + } + return prefix + s + } for _, test := range escapeControlTests { test.name += " (+Control)" - test.text = "\u001E" + test.text - test.result 
= `` + "\u001e" + `` + test.result + test.text = addPrefix("\u001E", test.text) + test.result = addPrefix(``+"\u001e"+``, test.result) test.status.Escaped = true test.status.HasControls = true tests = append(tests, test) @@ -188,8 +190,8 @@ func TestEscapeControlReader(t *testing.T) { for _, test := range escapeControlTests { test.name += " (+Mark)" - test.text = "\u0300" + test.text - test.result = `` + "\u0300" + `` + test.result + test.text = addPrefix("\u0300", test.text) + test.result = addPrefix(``+"\u0300"+``, test.result) test.status.Escaped = true test.status.HasMarks = true tests = append(tests, test) From 91a5c86c14f397c20e5d6ef8090338974ff74978 Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Sat, 26 Feb 2022 23:28:19 +0800 Subject: [PATCH 6/6] fix fmt --- modules/charset/escape_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/charset/escape_test.go b/modules/charset/escape_test.go index 735a2dba709f6..01ccca77249b7 100644 --- a/modules/charset/escape_test.go +++ b/modules/charset/escape_test.go @@ -173,7 +173,7 @@ func TestEscapeControlReader(t *testing.T) { copy(tests, escapeControlTests) // if there is a BOM, we should keep the BOM - addPrefix := func(prefix string, s string) string { + addPrefix := func(prefix, s string) string { if strings.HasPrefix(s, "\xef\xbb\xbf") { return s[:3] + prefix + s[3:] }