Skip to content

Commit ffb32dc

Browse files
committed
feat: add HTML5 variations of the sanitizers
which use Loofah.html5_fragment. Note that we repeat the sanitizer tests for both variations using a module that's mixed into two test classes.
1 parent f03c34c commit ffb32dc

File tree

2 files changed

+173
-23
lines changed

2 files changed

+173
-23
lines changed

lib/rails/html/sanitizer.rb

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,14 @@ def safe_list_sanitizer
1919
def white_list_sanitizer # :nodoc:
2020
safe_list_sanitizer
2121
end
22+
23+
def html5_support?
24+
unless @html5_support_set
25+
@html5_support = Loofah.respond_to?(:html5_support?) && Loofah.html5_support?
26+
@html5_support_set = true
27+
end
28+
@html5_support
29+
end
2230
end
2331

2432
def sanitize(html, options = {})
@@ -52,6 +60,12 @@ def parse_fragment(html)
5260
Loofah.html4_fragment(html)
5361
end
5462
end
63+
64+
module HTML5
65+
def parse_fragment(html)
66+
Loofah.html5_fragment(html)
67+
end
68+
end if Rails::HTML::Sanitizer.html5_support?
5569
end
5670

5771
module Scrubber
@@ -286,6 +300,96 @@ class SafeListSanitizer < Rails::HTML::Sanitizer
286300
end
287301
end
288302

303+
module HTML5
304+
# == Rails::HTML5::FullSanitizer
305+
#
306+
# Removes all tags from HTML5 but strips out scripts, forms and comments.
307+
#
308+
# full_sanitizer = Rails::HTML5::FullSanitizer.new
309+
# full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
310+
# # => "Bold no more! See more here..."
311+
#
312+
class FullSanitizer < Rails::HTML::Sanitizer
313+
include HTML::Concern::ComposedSanitize
314+
include HTML::Concern::Parser::HTML5
315+
include HTML::Concern::Scrubber::Full
316+
include HTML::Concern::Serializer::UTF8Encode
317+
end
318+
319+
# == Rails::HTML5::LinkSanitizer
320+
#
321+
# Removes +a+ tags and +href+ attributes from HTML5, leaving only the link text.
322+
#
323+
# link_sanitizer = Rails::HTML5::LinkSanitizer.new
324+
# link_sanitizer.sanitize('<a href="example.com">Only the link text will be kept.</a>')
325+
# # => "Only the link text will be kept."
326+
#
327+
class LinkSanitizer < Rails::HTML::Sanitizer
328+
include HTML::Concern::ComposedSanitize
329+
include HTML::Concern::Parser::HTML5
330+
include HTML::Concern::Scrubber::Link
331+
include HTML::Concern::Serializer::SimpleString
332+
end
333+
334+
# == Rails::HTML5::SafeListSanitizer
335+
#
336+
# Sanitizes HTML5 and CSS from an extensive safe list.
337+
#
338+
# === Whitespace
339+
#
340+
# We can't make any guarantees about whitespace being kept or stripped. Loofah uses Nokogiri,
341+
# which wraps either a C or Java parser for the respective Ruby implementation. Those two
342+
# parsers determine how whitespace is ultimately handled.
343+
#
344+
# When the stripped markup is rendered, the user's browser won't take whitespace into account
345+
# anyway. It might be better to suggest your users wrap their whitespace-sensitive content in
346+
# pre tags or that you do so automatically.
347+
#
348+
# === Options
349+
#
350+
# Sanitizes both html and css via the safe lists found in
351+
# Rails::HTML::Concern::Scrubber::SafeList
352+
#
353+
# SafeListSanitizer also accepts options to configure the safe list used when sanitizing html.
354+
# There are class-level options:
355+
#
356+
# Rails::HTML5::SafeListSanitizer.allowed_tags = %w(table tr td)
357+
# Rails::HTML5::SafeListSanitizer.allowed_attributes = %w(id class style)
358+
#
359+
# Tags and attributes can also be passed to +sanitize+. Passed options take precedence over the
360+
# class level options.
361+
#
362+
# === Examples
363+
#
364+
# safe_list_sanitizer = Rails::HTML5::SafeListSanitizer.new
365+
#
366+
# # default: sanitize via an extensive safe list of allowed elements
367+
# safe_list_sanitizer.sanitize(@article.body)
368+
#
369+
# # sanitize via the supplied tags and attributes
370+
# safe_list_sanitizer.sanitize(
371+
# @article.body,
372+
# tags: %w(table tr td),
373+
# attributes: %w(id class style),
374+
# )
375+
#
376+
# # sanitize via a custom Loofah scrubber
377+
# safe_list_sanitizer.sanitize(@article.body, scrubber: ArticleScrubber.new)
378+
#
379+
# # prune nodes from the tree instead of stripping tags and leaving inner content
380+
# safe_list_sanitizer = Rails::HTML5::SafeListSanitizer.new(prune: true)
381+
#
382+
# # the sanitizer can also sanitize CSS
383+
# safe_list_sanitizer.sanitize_css('background-color: #000;')
384+
#
385+
class SafeListSanitizer < Rails::HTML::Sanitizer
386+
include HTML::Concern::ComposedSanitize
387+
include HTML::Concern::Parser::HTML5
388+
include HTML::Concern::Scrubber::SafeList
389+
include HTML::Concern::Serializer::UTF8Encode
390+
end
391+
end if Rails::HTML::Sanitizer.html5_support?
392+
289393
module HTML
290394
FullSanitizer = HTML4::FullSanitizer # :nodoc:
291395
LinkSanitizer = HTML4::LinkSanitizer # :nodoc:

test/sanitizer_test.rb

Lines changed: 69 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,22 @@
2020
# In many other cases, it's because the parser used by Nokogiri on JRuby (xerces+nekohtml) parses
2121
# slightly differently than libxml2 in edge cases.
2222
#
23-
module TestRailsSanitizers
24-
class XpathRemovalTestSanitizer < Rails::Html::Sanitizer
25-
def sanitize(html, options = {})
26-
fragment = Loofah.fragment(html)
27-
remove_xpaths(fragment, options[:xpaths]).to_s
28-
end
23+
module SanitizerTests
24+
def self.loofah_html5_support?
25+
Loofah.respond_to?(:html5_support?) && Loofah.html5_support?
2926
end
3027

3128
class BaseSanitizerTest < Minitest::Test
29+
class XpathRemovalTestSanitizer < Rails::HTML::Sanitizer
30+
def sanitize(html, options = {})
31+
fragment = Loofah.fragment(html)
32+
remove_xpaths(fragment, options[:xpaths]).to_s
33+
end
34+
end
35+
3236
def test_sanitizer_sanitize_raises_not_implemented_error
3337
assert_raises NotImplementedError do
34-
Rails::Html::Sanitizer.new.sanitize("asdf")
38+
Rails::HTML::Sanitizer.new.sanitize("asdf")
3539
end
3640
end
3741

@@ -65,7 +69,15 @@ def xpath_sanitize(input, options = {})
6569
end
6670
end
6771

68-
class FullSanitizerTest < Minitest::Test
72+
module ModuleUnderTest
73+
def module_under_test
74+
self.class.instance_variable_get(:@module_under_test)
75+
end
76+
end
77+
78+
module FullSanitizerTest
79+
include ModuleUnderTest
80+
6981
def test_strip_tags_with_quote
7082
input = '<" <img src="trollface.gif" onload="alert(1)"> hi'
7183
result = full_sanitize(input)
@@ -164,11 +176,23 @@ def test_full_sanitize_respect_html_escaping_of_the_given_string
164176

165177
protected
166178
def full_sanitize(input, options = {})
167-
Rails::Html::FullSanitizer.new.sanitize(input, options)
179+
module_under_test::FullSanitizer.new.sanitize(input, options)
168180
end
169181
end
170182

171-
class LinkSanitizerTest < Minitest::Test
183+
class HTML4FullSanitizerTest < Minitest::Test
184+
@module_under_test = Rails::HTML4
185+
include FullSanitizerTest
186+
end
187+
188+
class HTML5FullSanitizerTest < Minitest::Test
189+
@module_under_test = Rails::HTML5
190+
include FullSanitizerTest
191+
end if loofah_html5_support?
192+
193+
module LinkSanitizerTest
194+
include ModuleUnderTest
195+
172196
def test_strip_links_with_tags_in_tags
173197
expected = "&lt;a href='hello'&gt;all <b>day</b> long&lt;/a&gt;"
174198
input = "<<a>a href='hello'>all <b>day</b> long<</A>/a>"
@@ -201,11 +225,23 @@ def test_strip_links_with_linkception
201225

202226
protected
203227
def link_sanitize(input, options = {})
204-
Rails::Html::LinkSanitizer.new.sanitize(input, options)
228+
module_under_test::LinkSanitizer.new.sanitize(input, options)
205229
end
206230
end
207231

208-
class SafeListSanitizerTest < Minitest::Test
232+
class HTML4LinkSanitizerTest < Minitest::Test
233+
@module_under_test = Rails::HTML4
234+
include LinkSanitizerTest
235+
end
236+
237+
class HTML5LinkSanitizerTest < Minitest::Test
238+
@module_under_test = Rails::HTML5
239+
include LinkSanitizerTest
240+
end if loofah_html5_support?
241+
242+
module SafeListSanitizerTest
243+
include ModuleUnderTest
244+
209245
def test_sanitize_nested_script
210246
assert_equal '&lt;script&gt;alert("XSS");&lt;/script&gt;', safe_list_sanitize('<script><script></script>alert("XSS");<script><</script>/</script><script>script></script>', tags: %w(em))
211247
end
@@ -369,7 +405,7 @@ def test_custom_attributes_overrides_allowed_attributes
369405
end
370406

371407
def test_should_allow_prune
372-
sanitizer = Rails::Html::SafeListSanitizer.new(prune: true)
408+
sanitizer = module_under_test::SafeListSanitizer.new(prune: true)
373409
text = "<u>leave me <b>now</b></u>"
374410
assert_equal "<u>leave me </u>", sanitizer.sanitize(text, tags: %w(u))
375411
end
@@ -919,31 +955,31 @@ def test_should_sanitize_across_newlines
919955

920956
protected
921957
def safe_list_sanitize(input, options = {})
922-
Rails::Html::SafeListSanitizer.new.sanitize(input, options)
958+
module_under_test::SafeListSanitizer.new.sanitize(input, options)
923959
end
924960

925961
def assert_sanitized(input, expected = nil)
926962
assert_equal((expected || input), safe_list_sanitize(input))
927963
end
928964

929965
def scope_allowed_tags(tags)
930-
old_tags = Rails::Html::SafeListSanitizer.allowed_tags
931-
Rails::Html::SafeListSanitizer.allowed_tags = tags
932-
yield Rails::Html::SafeListSanitizer.new
966+
old_tags = module_under_test::SafeListSanitizer.allowed_tags
967+
module_under_test::SafeListSanitizer.allowed_tags = tags
968+
yield module_under_test::SafeListSanitizer.new
933969
ensure
934-
Rails::Html::SafeListSanitizer.allowed_tags = old_tags
970+
module_under_test::SafeListSanitizer.allowed_tags = old_tags
935971
end
936972

937973
def scope_allowed_attributes(attributes)
938-
old_attributes = Rails::Html::SafeListSanitizer.allowed_attributes
939-
Rails::Html::SafeListSanitizer.allowed_attributes = attributes
940-
yield Rails::Html::SafeListSanitizer.new
974+
old_attributes = module_under_test::SafeListSanitizer.allowed_attributes
975+
module_under_test::SafeListSanitizer.allowed_attributes = attributes
976+
yield module_under_test::SafeListSanitizer.new
941977
ensure
942-
Rails::Html::SafeListSanitizer.allowed_attributes = old_attributes
978+
module_under_test::SafeListSanitizer.allowed_attributes = old_attributes
943979
end
944980

945981
def sanitize_css(input)
946-
Rails::HTML4::SafeListSanitizer.new.sanitize_css(input)
982+
module_under_test::SafeListSanitizer.new.sanitize_css(input)
947983
end
948984

949985
# note that this is used for testing CSS hex encoding: \\[0-9a-f]{1,6}
@@ -957,4 +993,14 @@ def convert_to_css_hex(string, escape_parens = false)
957993
end.join
958994
end
959995
end
996+
997+
class HTML4SafeListSanitizerTest < Minitest::Test
998+
@module_under_test = Rails::HTML4
999+
include SafeListSanitizerTest
1000+
end
1001+
1002+
class HTML5SafeListSanitizerTest < Minitest::Test
1003+
@module_under_test = Rails::HTML5
1004+
include SafeListSanitizerTest
1005+
end if loofah_html5_support?
9601006
end

0 commit comments

Comments
 (0)