File 0003_CVE-2022-23517_CVE-2022-23518_CVE-2022-23519_CVE-2022-23520.patch of Package rubygem-rails-html-sanitizer.30440
diff --color -rubN rails-html-sanitizer-1.0.3-ori/lib/crass/parser.rb rails-html-sanitizer-1.0.3/lib/crass/parser.rb
--- rails-html-sanitizer-1.0.3-ori/lib/crass/parser.rb 1970-01-01 01:00:00.000000000 +0100
+++ rails-html-sanitizer-1.0.3/lib/crass/parser.rb 2015-04-18 06:16:46.000000000 +0200
@@ -0,0 +1,648 @@
+# encoding: utf-8
+require_relative 'token-scanner'
+require_relative 'tokenizer'
+
+module Crass
+
+ # Parses a CSS string or list of tokens.
+ #
+ # 5. http://dev.w3.org/csswg/css-syntax/#parsing
+ class Parser
+ BLOCK_END_TOKENS = {
+ :'{' => :'}',
+ :'[' => :']',
+ :'(' => :')'
+ }
+
+ # -- Class Methods ---------------------------------------------------------
+
+ # Parses CSS properties (such as the contents of an HTML element's `style`
+ # attribute) and returns a parse tree.
+ #
+ # See {Tokenizer#initialize} for _options_.
+ #
+ # 5.3.6. http://dev.w3.org/csswg/css-syntax/#parse-a-list-of-declarations
+ def self.parse_properties(input, options = {})
+ Parser.new(input, options).parse_properties # tokenizes non-Enumerable input, then delegates to the instance method
+ end
+
+ # Parses CSS rules (such as the content of a `@media` block) and returns a
+ # parse tree. The only difference from {parse_stylesheet} is that CDO/CDC
+ # nodes (`<!--` and `-->`) aren't ignored.
+ #
+ # See {Tokenizer#initialize} for _options_.
+ #
+ # 5.3.3. http://dev.w3.org/csswg/css-syntax/#parse-a-list-of-rules
+ def self.parse_rules(input, options = {})
+   css_parser = Parser.new(input, options)
+   raw_rules = css_parser.consume_rules
+
+   # Qualified rules are promoted to :style_rule nodes; every other node
+   # returned by #consume_rules is passed through unchanged.
+   raw_rules.map do |parsed|
+     next parsed unless parsed[:node] == :qualified_rule
+
+     css_parser.create_style_rule(parsed)
+   end
+ end
+
+ # Parses a CSS stylesheet and returns a parse tree.
+ #
+ # See {Tokenizer#initialize} for _options_.
+ #
+ # 5.3.2. http://dev.w3.org/csswg/css-syntax/#parse-a-stylesheet
+ def self.parse_stylesheet(input, options = {})
+   css_parser = Parser.new(input, options)
+   raw_rules = css_parser.consume_rules(:top_level => true)
+
+   # Qualified rules are promoted to :style_rule nodes; every other node
+   # returned by #consume_rules is passed through unchanged.
+   raw_rules.map do |parsed|
+     next parsed unless parsed[:node] == :qualified_rule
+
+     css_parser.create_style_rule(parsed)
+   end
+ end
+
+ # Converts a node or array of nodes into a CSS string based on their
+ # original tokenized input.
+ #
+ # Options:
+ #
+ # * **:exclude_comments** - When `true`, comments will be excluded.
+ #
+ def self.stringify(nodes, options = {})
+   node_list = nodes.is_a?(Array) ? nodes : [nodes]
+
+   # The accumulator string doubles as the return value.
+   node_list.each_with_object('') do |node, css|
+     next if node.nil?
+
+     case node[:node]
+     when :at_rule
+       css << '@' << node[:name]
+       css << self.stringify(node[:prelude], options)
+
+       # An at-rule either carries a block or ends with a semicolon.
+       if node[:block]
+         css << '{' << self.stringify(node[:block], options) << '}'
+       else
+         css << ';'
+       end
+
+     when :comment
+       css << node[:raw] unless options[:exclude_comments]
+
+     when :simple_block
+       css << node[:start]
+       css << self.stringify(node[:value], options)
+       css << node[:end]
+
+     when :style_rule
+       css << self.stringify(node[:selector][:tokens], options)
+       css << '{' << self.stringify(node[:children], options) << '}'
+
+     else
+       # Nodes carrying :raw serialize from it; otherwise fall back to the
+       # :tokens they were built from (nodes with neither add nothing).
+       if node.key?(:raw)
+         css << node[:raw]
+       elsif node.key?(:tokens)
+         css << self.stringify(node[:tokens], options)
+       end
+     end
+   end
+ end
+
+ # -- Instance Methods ------------------------------------------------------
+
+ # {TokenScanner} wrapping the tokens generated from this parser's input.
+ attr_reader :tokens
+
+ # Initializes a parser based on the given _input_, which may be a CSS string
+ # or an array of tokens.
+ #
+ # See {Tokenizer#initialize} for _options_.
+ def initialize(input, options = {})
+   # Anything that is not already Enumerable is tokenized as CSS text.
+   token_list =
+     input.kind_of?(Enumerable) ? input : Tokenizer.tokenize(input, options)
+
+   @tokens = TokenScanner.new(token_list)
+ end
+
+ # Consumes an at-rule and returns it.
+ #
+ # 5.4.2. http://dev.w3.org/csswg/css-syntax-3/#consume-at-rule
+ def consume_at_rule(input = @tokens)
+   rule = {}
+
+   rule[:tokens] = input.collect do
+     rule[:name] = input.consume[:value] # value of the first token (the at-keyword)
+     rule[:prelude] = []
+
+     while token = input.consume
+       node = token[:node]
+
+       if node == :comment # Non-standard: comments are dropped from the prelude.
+         next
+
+       elsif node == :semicolon
+         break
+
+       elsif node == :'{'
+         # Note: The spec says the block should _be_ the consumed simple
+         # block, but Simon Sapin's CSS parsing tests and tinycss2 expect
+         # only the _value_ of the consumed simple block here, so the
+         # tinycss2 behavior is followed. (Plain `==` is used to match the
+         # other node-type comparisons in this class.)
+         rule[:block] = consume_simple_block(input)[:value]
+         break
+
+       elsif node == :simple_block && token[:start] == '{'
+         # Note: The spec says the block should _be_ the simple block, but
+         # Simon Sapin's CSS parsing tests and tinycss2 expect only the
+         # _value_ of the simple block here, so the tinycss2 behavior is
+         # followed for already-parsed blocks as well.
+         rule[:block] = token[:value]
+         break
+
+       else
+         input.reconsume
+         rule[:prelude] << consume_component_value(input)
+       end
+     end
+   end
+
+   create_node(:at_rule, rule)
+ end
+
+ # Consumes a component value and returns it, or `nil` if there are no more
+ # tokens.
+ #
+ # 5.4.6. http://dev.w3.org/csswg/css-syntax-3/#consume-a-component-value
+ def consume_component_value(input = @tokens)
+   # nil signals that the input is exhausted.
+   token = input.consume
+   return nil unless token
+
+   kind = token[:node]
+
+   # An opening bracket starts a simple block that is consumed as a whole.
+   return consume_simple_block(input) if BLOCK_END_TOKENS.key?(kind)
+
+   # A :function without :name is still a bare function token whose
+   # arguments have to be consumed. One that already has :name was parsed
+   # earlier and is handed back as-is so it isn't parsed twice (this
+   # shortcut isn't part of the spec).
+   if kind == :function && !token.key?(:name)
+     return consume_function(input)
+   end
+
+   # Every other token is already a complete component value.
+   token
+ end
+
+ # Consumes a declaration and returns it, or an `:error` node on parse error.
+ #
+ # 5.4.5. http://dev.w3.org/csswg/css-syntax-3/#consume-a-declaration
+ def consume_declaration(input = @tokens)
+ declaration = {}
+ value = []
+
+ declaration[:tokens] = input.collect do
+ declaration[:name] = input.consume[:value] # first token supplies the property name
+
+ next_token = input.peek
+
+ while next_token && next_token[:node] == :whitespace # skip whitespace before the colon
+ input.consume
+ next_token = input.peek
+ end
+
+ unless next_token && next_token[:node] == :colon
+ # Parse error.
+ #
+ # Note: The spec explicitly says to return nothing here, but Simon
+ # Sapin's CSS parsing tests expect an error node.
+ return create_node(:error, :value => 'invalid')
+ end
+
+ input.consume # the colon itself
+
+ until input.peek.nil? # everything that remains belongs to the value
+ value << consume_component_value(input)
+ end
+ end
+
+ # Look for !important among the last two significant value tokens.
+ important_tokens = value.reject {|token|
+ node = token[:node]
+ node == :whitespace || node == :comment || node == :semicolon
+ }.last(2)
+
+ if important_tokens.size == 2 &&
+ important_tokens[0][:node] == :delim &&
+ important_tokens[0][:value] == '!' &&
+ important_tokens[1][:node] == :ident &&
+ important_tokens[1][:value].downcase == 'important'
+
+ declaration[:important] = true
+ excl_index = value.index(important_tokens[0])
+
+ # Technically the spec doesn't require us to trim trailing tokens after
+ # the !important, but Simon Sapin's CSS parsing tests expect it and
+ # tinycss2 does it, so we'll go along with the cool kids.
+ value.slice!(excl_index, value.size - excl_index) # drops the "!" and everything after it
+ else
+ declaration[:important] = false
+ end
+
+ declaration[:value] = value
+ create_node(:declaration, declaration)
+ end
+
+ # Consumes a list of declarations and returns them.
+ #
+ # By default, the returned list may include `:comment`, `:semicolon`, and
+ # `:whitespace` nodes, which is non-standard.
+ #
+ # Options:
+ #
+ # * **:strict** - Set to `true` to exclude non-standard `:comment`,
+ # `:semicolon`, and `:whitespace` nodes.
+ #
+ # 5.4.4. http://dev.w3.org/csswg/css-syntax/#consume-a-list-of-declarations
+ def consume_declarations(input = @tokens, options = {})
+ declarations = []
+
+ while token = input.consume
+ case token[:node]
+
+ # Non-standard: Preserve comments, semicolons, and whitespace.
+ when :comment, :semicolon, :whitespace
+ declarations << token unless options[:strict]
+
+ when :at_keyword
+ # When parsing a style rule, this is a parse error. Otherwise it's
+ # not.
+ input.reconsume # hand the at-keyword back so #consume_at_rule sees it first
+ declarations << consume_at_rule(input)
+
+ when :ident
+ decl_tokens = [token]
+
+ while next_token = input.peek # gather everything up to, but not including, the next semicolon
+ break if next_token[:node] == :semicolon
+ decl_tokens << consume_component_value(input)
+ end
+
+ if decl = consume_declaration(TokenScanner.new(decl_tokens)) # parsed from its own scanner over just these tokens
+ declarations << decl
+ end
+
+ else
+ # Parse error (invalid property name, etc.).
+ #
+ # Note: The spec doesn't say we should append anything to the list of
+ # declarations here, but Simon Sapin's CSS parsing tests expect an
+ # error node.
+ declarations << create_node(:error, :value => 'invalid')
+ input.reconsume
+
+ while next_token = input.peek # discard component values until the next semicolon
+ break if next_token[:node] == :semicolon
+ consume_component_value(input)
+ end
+ end
+ end
+
+ declarations
+ end
+
+ # Consumes a function and returns it.
+ #
+ # 5.4.8. http://dev.w3.org/csswg/css-syntax-3/#consume-a-function
+ def consume_function(input = @tokens)
+   opener = input.current
+   fn_name = opener[:value]
+   arguments = []
+
+   # Collect everything consumed up to and including the closing paren (or
+   # up to the end of the input if the function is never closed).
+   consumed = input.collect do
+     while (token = input.consume)
+       kind = token[:node]
+       break if kind == :')'
+       next if kind == :comment # Non-standard: comments are left out.
+
+       # Anything else is an argument: hand it back and consume it as a
+       # complete component value.
+       input.reconsume
+       arguments << consume_component_value(input)
+     end
+   end
+
+   # :tokens is non-standard and exists for serialization only.
+   create_node(:function,
+     :name => fn_name,
+     :value => arguments,
+     :tokens => [opener].concat(consumed))
+ end
+
+ # Consumes a qualified rule and returns it, or an `:error` node if a parse
+ # error occurs (the input ends before a `{` block is found).
+ #
+ # 5.4.3. http://dev.w3.org/csswg/css-syntax-3/#consume-a-qualified-rule
+ def consume_qualified_rule(input = @tokens)
+ rule = {:prelude => []}
+
+ rule[:tokens] = input.collect do
+ while true
+ unless token = input.consume
+ # Parse error.
+ #
+ # Note: The spec explicitly says to return nothing here, but Simon
+ # Sapin's CSS parsing tests expect an error node.
+ return create_node(:error, :value => 'invalid')
+ end
+
+ if token[:node] == :'{'
+ # Note: The spec says the block should _be_ the consumed simple
+ # block, but Simon Sapin's CSS parsing tests and tinycss2 expect
+ # only the _value_ of the consumed simple block here. I assume I'm
+ # interpreting the spec too literally, so I'm going with the
+ # tinycss2 behavior.
+ rule[:block] = consume_simple_block(input)[:value]
+ break
+ elsif token[:node] == :simple_block && token[:start] == '{'
+ # Note: The spec says the block should _be_ the simple block, but
+ # Simon Sapin's CSS parsing tests and tinycss2 expect only the
+ # _value_ of the simple block here. I assume I'm interpreting the
+ # spec too literally, so I'm going with the tinycss2 behavior.
+ rule[:block] = token[:value]
+ break
+ else
+ input.reconsume # anything before the block belongs to the prelude
+ rule[:prelude] << consume_component_value(input)
+ end
+ end
+ end
+
+ create_node(:qualified_rule, rule)
+ end
+
+ # Consumes a list of rules and returns them.
+ #
+ # 5.4.1. http://dev.w3.org/csswg/css-syntax/#consume-a-list-of-rules
+ def consume_rules(flags = {})
+   collected = []
+
+   while (token = @tokens.consume)
+     kind = token[:node]
+
+     # Non-standard: the spec discards comments and whitespace, but they are
+     # kept here so the input can be serialized faithfully.
+     if kind == :comment || kind == :whitespace
+       collected << token
+       next
+     end
+
+     # CDO/CDC tokens are dropped when parsing at the top level.
+     next if (kind == :cdc || kind == :cdo) && flags[:top_level]
+
+     # Everything else starts a rule: hand the token back to the scanner and
+     # let the matching consumer read the whole rule.
+     @tokens.reconsume
+
+     parsed =
+       if kind == :at_keyword
+         consume_at_rule
+       else
+         consume_qualified_rule
+       end
+     collected << parsed if parsed
+   end
+
+   collected
+ end
+
+ # Consumes and returns a simple block associated with the current input
+ # token.
+ #
+ # 5.4.7. http://dev.w3.org/csswg/css-syntax/#consume-a-simple-block
+ def consume_simple_block(input = @tokens)
+   opener = input.current
+   open_kind = opener[:node]
+   close_kind = BLOCK_END_TOKENS[open_kind]
+   contents = []
+
+   # Consume values until the matching closer (or the end of input) is hit.
+   consumed = input.collect do
+     while (token = input.consume)
+       break if token[:node] == close_kind
+
+       input.reconsume
+       contents << consume_component_value(input)
+     end
+   end
+
+   create_node(:simple_block,
+     :start => open_kind.to_s,
+     :end => close_kind.to_s,
+     :value => contents,
+     :tokens => [opener].concat(consumed)) # Non-standard; for serialization.
+ end
+
+ # Creates and returns a new parse node with the given _properties_.
+ def create_node(type, properties = {})
+ {:node => type}.merge!(properties) # merged into a fresh hash; keys in properties win on a clash, including :node
+ end
+
+ # Parses the given _input_ tokens into a selector node and returns it.
+ #
+ # Doesn't bother splitting the selector list into individual selectors or
+ # validating them. Feel free to do that yourself! It'll be fun!
+ def create_selector(input)
+   selector_text = parse_value(input)
+
+   create_node(:selector, :value => selector_text, :tokens => input)
+ end
+
+ # Creates a `:style_rule` node from the given qualified _rule_, and returns
+ # it.
+ def create_style_rule(rule)
+   selector = create_selector(rule[:prelude])
+   children = parse_properties(rule[:block])
+   create_node(:style_rule, :selector => selector, :children => children)
+ end
+
+ # Parses a single component value and returns it.
+ #
+ # 5.3.7. http://dev.w3.org/csswg/css-syntax-3/#parse-a-component-value
+ def parse_component_value(input = @tokens)
+   scanner = input.is_a?(TokenScanner) ? input : TokenScanner.new(input)
+
+   # Leading whitespace is not part of the value.
+   scanner.consume while scanner.peek && scanner.peek[:node] == :whitespace
+
+   # Nothing but whitespace (or nothing at all) was supplied, so there is
+   # no component value to return.
+   return create_node(:error, :value => 'empty') if scanner.peek.nil?
+
+   component = consume_component_value(scanner)
+
+   # Trailing whitespace is tolerated as well.
+   scanner.consume while scanner.peek && scanner.peek[:node] == :whitespace
+
+   # Exactly one component value is allowed, so anything still left in the
+   # input at this point makes the whole parse an error.
+   if scanner.peek.nil?
+     component
+   else
+     create_node(:error, :value => 'extra-input')
+   end
+ end
+
+ # Parses a list of component values and returns an array of parsed tokens.
+ #
+ # 5.3.8. http://dev.w3.org/csswg/css-syntax/#parse-a-list-of-component-values
+ def parse_component_values(input = @tokens)
+   scanner = input.is_a?(TokenScanner) ? input : TokenScanner.new(input)
+   values = []
+
+   # #consume_component_value returns nil once the input is exhausted.
+   while (value = consume_component_value(scanner))
+     values << value
+   end
+   values
+ end
+
+ # Parses a single declaration and returns it.
+ #
+ # 5.3.5. http://dev.w3.org/csswg/css-syntax/#parse-a-declaration
+ def parse_declaration(input = @tokens)
+   scanner = input.is_a?(TokenScanner) ? input : TokenScanner.new(input)
+
+   # Leading whitespace is skipped.
+   scanner.consume while scanner.peek && scanner.peek[:node] == :whitespace
+
+   upcoming = scanner.peek
+
+   # Syntax error: there is nothing to parse.
+   return create_node(:error, :value => 'empty') if upcoming.nil?
+
+   # Syntax error: a declaration has to begin with an identifier.
+   unless upcoming[:node] == :ident
+     return create_node(:error, :value => 'invalid')
+   end
+
+   # Fall back to a generic syntax error should #consume_declaration ever
+   # come back empty-handed.
+   parsed = consume_declaration(scanner)
+
+   parsed || create_node(:error, :value => 'invalid')
+ end
+
+ # Parses a list of declarations and returns them.
+ #
+ # See {#consume_declarations} for _options_.
+ #
+ # 5.3.6. http://dev.w3.org/csswg/css-syntax/#parse-a-list-of-declarations
+ def parse_declarations(input = @tokens, options = {})
+   scanner = input.is_a?(TokenScanner) ? input : TokenScanner.new(input)
+   consume_declarations(scanner, options)
+ end
+
+ # Parses a list of declarations and returns an array of `:property` nodes
+ # (and any non-declaration nodes that were in the input). This is useful for
+ # parsing the contents of an HTML element's `style` attribute.
+ def parse_properties(input = @tokens)
+   parse_declarations(input).map do |decl|
+     # Anything that is not a declaration (whitespace, comments, errors,
+     # at-rules, ...) is passed through untouched.
+     next decl unless decl[:node] == :declaration
+
+     # The property's children are the declaration's value tokens, minus a
+     # trailing semicolon if there is one.
+     children = decl[:value].dup
+     trailing = children.last
+     children.pop if trailing && trailing[:node] == :semicolon
+
+     # :value is computed from the full token list; #parse_value skips
+     # semicolons and comments on its own.
+     create_node(:property,
+       :name => decl[:name],
+       :value => parse_value(decl[:value]),
+       :children => children,
+       :important => decl[:important],
+       :tokens => decl[:tokens])
+   end
+ end
+
+ # Parses a single rule and returns it.
+ #
+ # 5.3.4. http://dev.w3.org/csswg/css-syntax-3/#parse-a-rule
+ def parse_rule(input = @tokens)
+   scanner = input.is_a?(TokenScanner) ? input : TokenScanner.new(input)
+
+   # Leading whitespace is skipped.
+   scanner.consume while scanner.peek && scanner.peek[:node] == :whitespace
+
+   # Syntax error: there is nothing to parse.
+   return create_node(:error, :value => 'empty') if scanner.peek.nil?
+
+   # An at-keyword opens an at-rule; anything else is a qualified rule.
+   rule =
+     if scanner.peek[:node] == :at_keyword
+       consume_at_rule(scanner)
+     else
+       consume_qualified_rule(scanner)
+     end
+
+   # Trailing whitespace is skipped too.
+   scanner.consume while scanner.peek && scanner.peek[:node] == :whitespace
+
+   # Syntax error: only a single rule may be present in the input.
+   if scanner.peek.nil?
+     rule
+   else
+     create_node(:error, :value => 'extra-input')
+   end
+ end
+
+ # Returns the unescaped value of a selector name or property declaration.
+ def parse_value(nodes)
+   node_list = nodes.is_a?(Array) ? nodes : [nodes]
+
+   text = node_list.each_with_object('') do |node, buffer|
+     kind = node[:node]
+
+     # Comments and semicolons contribute nothing to the value.
+     next if kind == :comment || kind == :semicolon
+
+     if kind == :at_keyword || kind == :ident
+       buffer << node[:value]
+
+     elsif kind == :function && node[:value].is_a?(String)
+       # A function whose :value is still a string contributes that string
+       # followed by the opening paren.
+       buffer << node[:value] << '('
+
+     elsif kind == :function
+       # Otherwise the function is rebuilt from its constituent tokens.
+       buffer << parse_value(node[:tokens])
+
+     elsif node.key?(:raw)
+       buffer << node[:raw]
+
+     elsif node.key?(:tokens)
+       buffer << parse_value(node[:tokens])
+     end
+   end
+
+   text.strip
+ end
+ end
+
+end
diff --color -rubN rails-html-sanitizer-1.0.3-ori/lib/crass/scanner.rb rails-html-sanitizer-1.0.3/lib/crass/scanner.rb
--- rails-html-sanitizer-1.0.3-ori/lib/crass/scanner.rb 1970-01-01 01:00:00.000000000 +0100
+++ rails-html-sanitizer-1.0.3/lib/crass/scanner.rb 2015-04-18 06:16:46.000000000 +0200
@@ -0,0 +1,125 @@
+# encoding: utf-8
+require 'strscan'
+
+module Crass
+
+ # Similar to a StringScanner, but with extra functionality needed to tokenize
+ # CSS while preserving the original text.
+ class Scanner
+ # Current character, or `nil` if the scanner hasn't yet consumed a
+ # character, or is at the end of the string.
+ attr_reader :current
+
+ # Current marker position. Use {#marked} to get the substring between
+ # {#marker} and {#pos}.
+ attr_accessor :marker
+
+ # Position of the next character that will be consumed. This is a character
+ # position, not a byte position, so it accounts for multi-byte characters.
+ attr_accessor :pos
+
+ # String being scanned.
+ attr_reader :string
+
+ # Creates a Scanner instance for the given _input_ string or IO instance.
+ def initialize(input)
+ @string = input.is_a?(IO) ? input.read : input.to_s # IO is read in full; anything else is coerced via #to_s
+ @scanner = StringScanner.new(@string)
+
+ reset # start at position 0 with no current character
+ end
+
+ # Consumes the next character and returns it, advancing the pointer, or
+ # an empty string if the end of the string has been reached.
+ def consume
+   # At the end of the input nothing is consumed: an empty string is
+   # returned and neither the position nor #current is changed.
+   return '' unless @pos < @len
+
+   @pos += 1
+   @current = @scanner.getch
+ end
+
+ # Consumes the rest of the string and returns it, advancing the pointer to
+ # the end of the string. Returns an empty string if the end of the string
+ # has already been reached.
+ def consume_rest
+   remainder = @scanner.rest
+
+   # Mark the input as exhausted by moving the character position to the end.
+   @pos = @len
+   @current = remainder[-1]
+
+   remainder
+ end
+
+ # Returns `true` if the end of the string has been reached, `false`
+ # otherwise.
+ def eos?
+ @pos == @len # both are character counts (@len comes from String#size)
+ end
+
+ # Sets the marker to the position of the next character that will be
+ # consumed.
+ def mark
+ @marker = @pos # snapshot of the current character position; #marked slices from here
+ end
+
+ # Returns the substring between {#marker} and {#pos}, without altering the
+ # pointer.
+ def marked
+   # String#[] can return nil (for instance when the marker lies past the
+   # end of the string); callers always get a string back.
+   span = @pos - @marker
+
+   @string[@marker, span] || ''
+ end
+
+ # Returns up to _length_ characters starting at the current position, but
+ # doesn't consume them. The number of characters returned may be less than
+ # _length_ if the end of the string is reached.
+ def peek(length = 1)
+ @string[pos, length] # sliced straight from the source string by character position; nothing is consumed
+ end
+
+ # Moves the pointer back one character without changing the value of
+ # {#current}. The next call to {#consume} will re-consume the current
+ # character.
+ def reconsume
+ @scanner.unscan # puts the StringScanner back to where it was before its last match
+ @pos -= 1 if @pos > 0 # never step back past the start of the string
+ end
+
+ # Resets the pointer to the beginning of the string.
+ def reset
+   @scanner.reset # rewind the wrapped StringScanner too, so it stays in step with @pos
+   @current = nil
+   @len = @string.size
+   @marker = @pos = 0 # marker and position both return to the first character
+ end
+
+ # Tries to match _pattern_ at the current position. If it matches, the
+ # matched substring will be returned and the pointer will be advanced.
+ # Otherwise, `nil` will be returned.
+ def scan(pattern)
+   matched = @scanner.scan(pattern)
+   return matched unless matched
+
+   # Keep the character position and #current in step with the match.
+   @pos += matched.size
+   @current = matched[-1]
+   matched
+ end
+
+ # Scans the string until the _pattern_ is matched. Returns the substring up
+ # to and including the end of the match, and advances the pointer. If there
+ # is no match, `nil` is returned and the pointer is not advanced.
+ def scan_until(pattern)
+   matched = @scanner.scan_until(pattern)
+   return matched unless matched
+
+   # Keep the character position and #current in step with the match.
+   @pos += matched.size
+   @current = matched[-1]
+   matched
+ end
+ end
+
+end
diff --color -rubN rails-html-sanitizer-1.0.3-ori/lib/crass/tokenizer.rb rails-html-sanitizer-1.0.3/lib/crass/tokenizer.rb
--- rails-html-sanitizer-1.0.3-ori/lib/crass/tokenizer.rb 1970-01-01 01:00:00.000000000 +0100
+++ rails-html-sanitizer-1.0.3/lib/crass/tokenizer.rb 2015-04-18 06:16:46.000000000 +0200
@@ -0,0 +1,689 @@
+# encoding: utf-8
+require_relative 'scanner'
+
+module Crass
+
+ # Tokenizes a CSS string.
+ #
+ # 4. http://dev.w3.org/csswg/css-syntax/#tokenization
+ class Tokenizer
+ RE_COMMENT_CLOSE = /\*\//
+ RE_DIGIT = /[0-9]+/
+ RE_ESCAPE = /\\[^\n]/
+ RE_HEX = /[0-9A-Fa-f]{1,6}/
+ RE_NAME = /[0-9A-Za-z_\u0080-\u{10ffff}-]+/
+ RE_NAME_START = /[A-Za-z_\u0080-\u{10ffff}]+/
+ RE_NON_PRINTABLE = /[\u0000-\u0008\u000b\u000e-\u001f\u007f]+/
+ RE_NUMBER_DECIMAL = /\.[0-9]+/
+ RE_NUMBER_EXPONENT = /[Ee][+-]?[0-9]+/
+ RE_NUMBER_SIGN = /[+-]/
+
+ RE_NUMBER_STR = /\A
+ (?<sign> [+-]?)
+ (?<integer> [0-9]*)
+ (?:\.
+ (?<fractional> [0-9]*)
+ )?
+ (?:[Ee]
+ (?<exponent_sign> [+-]?)
+ (?<exponent> [0-9]*)
+ )?
+ \z/x
+
+ RE_QUOTED_URL_START = /\A[\n\u0009\u0020]?["']/
+ RE_UNICODE_RANGE_START = /\+(?:[0-9A-Fa-f]|\?)/
+ RE_UNICODE_RANGE_END = /-[0-9A-Fa-f]/
+ RE_WHITESPACE = /[\n\u0009\u0020]+/
+ RE_WHITESPACE_ANCHORED = /\A[\n\u0009\u0020]+\z/
+
+ # -- Class Methods ---------------------------------------------------------
+
+ # Tokenizes the given _input_ as a CSS string and returns an array of
+ # tokens.
+ #
+ # See {#initialize} for _options_.
+ def self.tokenize(input, options = {})
+ Tokenizer.new(input, options).tokenize # one-shot: builds a tokenizer and drains it into an array
+ end
+
+ # -- Instance Methods ------------------------------------------------------
+
+ # Initializes a new Tokenizer.
+ #
+ # Options:
+ #
+ # * **:preserve_comments** - If `true`, comments will be preserved as
+ # `:comment` tokens.
+ #
+ # * **:preserve_hacks** - If `true`, certain non-standard browser hacks
+ # such as the IE "*" hack will be preserved even though they violate
+ # CSS 3 syntax rules.
+ #
+ def initialize(input, options = {})
+ @s = Scanner.new(preprocess(input)) # the scanner only ever sees preprocessed text
+ @options = options # read later via :preserve_comments and :preserve_hacks
+ end
+
+ # Consumes a token and returns the token that was consumed.
+ #
+ # 4.3.1. http://dev.w3.org/csswg/css-syntax/#consume-a-token
+ def consume
+ return nil if @s.eos?
+
+ @s.mark # every token's :pos and :raw are measured from this marker
+
+ # Consume comments.
+ if comment_token = consume_comments
+ if @options[:preserve_comments]
+ return comment_token
+ else
+ return consume # comment dropped: carry on with whatever follows it
+ end
+ end
+
+ # Consume whitespace.
+ return create_token(:whitespace) if @s.scan(RE_WHITESPACE)
+
+ char = @s.consume
+
+ case char.to_sym # dispatch on the single character just consumed
+ when :'"'
+ consume_string
+
+ when :'#'
+ if @s.peek =~ RE_NAME || valid_escape?(@s.peek(2))
+ create_token(:hash,
+ :type => start_identifier?(@s.peek(3)) ? :id : :unrestricted,
+ :value => consume_name)
+ else
+ create_token(:delim, :value => char)
+ end
+
+ when :'$'
+ if @s.peek == '='
+ @s.consume
+ create_token(:suffix_match)
+ else
+ create_token(:delim, :value => char)
+ end
+
+ when :"'"
+ consume_string
+
+ when :'('
+ create_token(:'(')
+
+ when :')'
+ create_token(:')')
+
+ when :*
+ if @s.peek == '='
+ @s.consume
+ create_token(:substring_match)
+
+ # Non-standard: Preserve the IE * hack.
+ elsif @options[:preserve_hacks] && @s.peek =~ RE_NAME_START
+ @s.reconsume
+ consume_ident
+
+ else
+ create_token(:delim, :value => char)
+ end
+
+ when :+
+ if start_number? # checks the current character plus the next two
+ @s.reconsume
+ consume_numeric
+ else
+ create_token(:delim, :value => char)
+ end
+
+ when :','
+ create_token(:comma)
+
+ when :-
+ nextTwoChars = @s.peek(2)
+ nextThreeChars = char + nextTwoChars
+
+ if start_number?(nextThreeChars)
+ @s.reconsume
+ consume_numeric
+ elsif nextTwoChars == '->'
+ @s.consume
+ @s.consume
+ create_token(:cdc)
+ elsif start_identifier?(nextThreeChars)
+ @s.reconsume
+ consume_ident
+ else
+ create_token(:delim, :value => char)
+ end
+
+ when :'.'
+ if start_number?
+ @s.reconsume
+ consume_numeric
+ else
+ create_token(:delim, :value => char)
+ end
+
+ when :':'
+ create_token(:colon)
+
+ when :';'
+ create_token(:semicolon)
+
+ when :<
+ if @s.peek(3) == '!--'
+ @s.consume
+ @s.consume
+ @s.consume
+
+ create_token(:cdo)
+ else
+ create_token(:delim, :value => char)
+ end
+
+ when :'@'
+ if start_identifier?(@s.peek(3))
+ create_token(:at_keyword, :value => consume_name)
+ else
+ create_token(:delim, :value => char)
+ end
+
+ when :'['
+ create_token(:'[')
+
+ when :'\\'
+ if valid_escape?
+ @s.reconsume
+ consume_ident
+ else
+ # Parse error.
+ create_token(:delim,
+ :error => true,
+ :value => char)
+ end
+
+ when :']'
+ create_token(:']')
+
+ when :'^'
+ if @s.peek == '='
+ @s.consume
+ create_token(:prefix_match)
+ else
+ create_token(:delim, :value => char)
+ end
+
+ when :'{'
+ create_token(:'{')
+
+ when :'}'
+ create_token(:'}')
+
+ when :U, :u
+ if @s.peek(2) =~ RE_UNICODE_RANGE_START
+ @s.consume # the "+"; #consume_unicode_range expects "u+" to be gone
+ consume_unicode_range
+ else
+ @s.reconsume
+ consume_ident
+ end
+
+ when :|
+ case @s.peek
+ when '='
+ @s.consume
+ create_token(:dash_match)
+
+ when '|'
+ @s.consume
+ create_token(:column)
+
+ else
+ create_token(:delim, :value => char)
+ end
+
+ when :~
+ if @s.peek == '='
+ @s.consume
+ create_token(:include_match)
+ else
+ create_token(:delim, :value => char)
+ end
+
+ else
+ case char # no dedicated branch above: classify by character class
+ when RE_DIGIT
+ @s.reconsume
+ consume_numeric
+
+ when RE_NAME_START
+ @s.reconsume
+ consume_ident
+
+ else
+ create_token(:delim, :value => char)
+ end
+ end
+ end
+
+ # Consumes the remnants of a bad URL and returns the consumed text.
+ #
+ # 4.3.15. http://dev.w3.org/csswg/css-syntax/#consume-the-remnants-of-a-bad-url
+ def consume_bad_url
+ text = ''
+
+ until @s.eos?
+ if valid_escape? # current character plus the next one form an escape
+ text << consume_escaped
+ elsif valid_escape?(@s.peek(2)) # the escape starts at the next character
+ @s.consume # step over the backslash before reading the escaped code point
+ text << consume_escaped
+ else
+ char = @s.consume
+
+ if char == ')' # the closing paren ends the bad URL and is not part of the text
+ break
+ else
+ text << char
+ end
+ end
+ end
+
+ text
+ end
+
+ # Consumes comments and returns them, or `nil` if no comments were consumed.
+ #
+ # 4.3.2. http://dev.w3.org/csswg/css-syntax/#consume-comments
+ def consume_comments
+   return nil unless @s.peek(2) == '/*'
+
+   @s.consume
+   @s.consume
+
+   body = @s.scan_until(RE_COMMENT_CLOSE)
+
+   if body
+     body.slice!(-2, 2) # drop the closing "*/" from the comment text
+   else
+     # Parse error: the comment is never closed, so it runs to the end.
+     body = @s.consume_rest
+   end
+
+   create_token(:comment, :value => body)
+ end
+
+ # Consumes an escaped code point and returns its unescaped value.
+ #
+ # This method assumes that the `\` has already been consumed, and that the
+ # next character in the input has already been verified not to be a newline
+ # or EOF.
+ #
+ # 4.3.8. http://dev.w3.org/csswg/css-syntax/#consume-an-escaped-code-point
+ def consume_escaped
+   return "\ufffd" if @s.eos?
+
+   hex_digits = @s.scan(RE_HEX)
+
+   # Not a hex escape: the next character simply stands for itself.
+   return @s.consume unless hex_digits
+
+   # One whitespace character right after the hex digits is swallowed.
+   @s.consume if @s.peek =~ RE_WHITESPACE
+
+   codepoint = hex_digits.hex
+
+   unusable = codepoint == 0 ||
+              codepoint.between?(0xD800, 0xDFFF) ||
+              codepoint > 0x10FFFF
+
+   # Zero, the D800-DFFF range and values above 10FFFF become U+FFFD.
+   unusable ? "\ufffd" : codepoint.chr(Encoding::UTF_8)
+ end
+
+ # Consumes an ident-like token and returns it.
+ #
+ # 4.3.4. http://dev.w3.org/csswg/css-syntax/#consume-an-ident-like-token
+ def consume_ident
+   name = consume_name
+
+   # Without a following "(" this is a plain identifier.
+   return create_token(:ident, :value => name) unless @s.peek == '('
+
+   @s.consume
+
+   # Any function other than url() is emitted as a function token.
+   return create_token(:function, :value => name) unless name.downcase == 'url'
+
+   # While the next two characters are both whitespace, drop one of them.
+   @s.consume while @s.peek(2) =~ RE_WHITESPACE_ANCHORED
+
+   # A quoted URL stays a function token; an unquoted one is a URL token.
+   if @s.peek(2) =~ RE_QUOTED_URL_START
+     create_token(:function, :value => name)
+   else
+     consume_url
+   end
+ end
+
+ # Consumes a name and returns it.
+ #
+ # 4.3.12. http://dev.w3.org/csswg/css-syntax/#consume-a-name
+ def consume_name
+ result = ''
+
+ until @s.eos?
+ if match = @s.scan(RE_NAME) # fast path: take a whole run of name characters at once
+ result << match
+ next
+ end
+
+ char = @s.consume # not a name character; it may begin an escape
+
+ if valid_escape? # checks the character just consumed plus the next one
+ result << consume_escaped
+
+ # Non-standard: IE * hack
+ elsif char == '*' && @options[:preserve_hacks]
+ result << @s.consume
+
+ else
+ @s.reconsume # the character is not part of the name: put it back and stop
+ return result
+ end
+ end
+
+ result
+ end
+
+ # Consumes a number and returns a 3-element array containing the number's
+ # original representation, its numeric value, and its type (either
+ # `:integer` or `:number`).
+ #
+ # 4.3.13. http://dev.w3.org/csswg/css-syntax/#consume-a-number
+ def consume_number
+   # Optional sign first, then the integer digits (which may be absent).
+   repr = ''
+   repr << @s.consume if @s.peek =~ RE_NUMBER_SIGN
+   repr << (@s.scan(RE_DIGIT) || '')
+
+   # The literal is an :integer unless a fractional part or an exponent
+   # follows, either of which turns it into a :number.
+   type = :integer
+
+   [RE_NUMBER_DECIMAL, RE_NUMBER_EXPONENT].each do |pattern|
+     next unless (part = @s.scan(pattern))
+
+     repr << part
+     type = :number
+   end
+
+   [repr, convert_string_to_number(repr), type]
+ end
+
+ # Consumes a numeric token and returns it.
+ #
+ # 4.3.3. http://dev.w3.org/csswg/css-syntax/#consume-a-numeric-token
+ def consume_numeric
+   repr, value, type = consume_number
+
+   # A number directly followed by an identifier is a dimension; the
+   # identifier becomes its unit.
+   if start_identifier?(@s.peek(3))
+     unit = consume_name
+
+     return create_token(:dimension,
+       :repr => repr,
+       :type => type,
+       :unit => unit,
+       :value => value)
+   end
+
+   # A trailing "%" makes it a percentage; otherwise it is a plain number.
+   kind = :number
+
+   if @s.peek == '%'
+     @s.consume
+     kind = :percentage
+   end
+
+   create_token(kind, :repr => repr, :type => type, :value => value)
+ end
+
+ # Consumes a string token that ends at the given character, and returns the
+ # token.
+ #
+ # 4.3.5. http://dev.w3.org/csswg/css-syntax/#consume-a-string-token
+ def consume_string(ending = nil)
+ ending = @s.current if ending.nil? # default: the quote character that was just consumed
+ value = ''
+
+ until @s.eos?
+ case char = @s.consume
+ when ending
+ break
+
+ when "\n"
+ # Parse error.
+ @s.reconsume # the newline is put back and is not part of the bad string
+ return create_token(:bad_string,
+ :error => true,
+ :value => value)
+
+ when '\\'
+ case @s.peek
+ when ''
+ # End of the input, so do nothing.
+ next
+
+ when "\n"
+ @s.consume # backslash-newline: the newline is consumed and adds nothing to the value
+
+ else
+ value << consume_escaped
+ end
+
+ else
+ value << char
+ end
+ end
+
+ create_token(:string, :value => value)
+ end
+
+ # Consumes a Unicode range token and returns it. Assumes the initial "u+" or
+ # "U+" has already been consumed.
+ #
+ # 4.3.7. http://dev.w3.org/csswg/css-syntax/#consume-a-unicode-range-token
+ def consume_unicode_range
+   digits = @s.scan(RE_HEX) || ''
+
+   # "?" wildcards may fill the value up to six characters in total.
+   digits << @s.consume while digits.length < 6 && @s.peek == '?'
+
+   if digits.include?('?')
+     # With wildcards the range runs from every "?" read as 0 to every "?"
+     # read as F.
+     first = digits.gsub('?', '0').hex
+     last = digits.gsub('?', 'F').hex
+
+   else
+     first = digits.hex
+
+     # An explicit end is introduced by "-" followed by a hex digit;
+     # without one the range covers a single code point.
+     if @s.peek(2) =~ RE_UNICODE_RANGE_END
+       @s.consume
+       last = (@s.scan(RE_HEX) || '').hex
+     else
+       last = first
+     end
+   end
+
+   create_token(:unicode_range, :start => first, :end => last)
+ end
+
+ # Consumes a URL token and returns it. Assumes the original "url(" has
+ # already been consumed.
+ #
+ # 4.3.6. http://dev.w3.org/csswg/css-syntax/#consume-a-url-token
+ def consume_url
+ value = ''
+
+ @s.scan(RE_WHITESPACE) # leading whitespace is not part of the URL
+
+ until @s.eos?
+ case char = @s.consume
+ when ')'
+ break
+
+ when RE_WHITESPACE
+ @s.scan(RE_WHITESPACE)
+
+ if @s.eos? || @s.peek == ')' # whitespace is only allowed directly before the end
+ @s.consume
+ break
+ else
+ return create_token(:bad_url, :value => value + consume_bad_url)
+ end
+
+ when '"', "'", '(', RE_NON_PRINTABLE
+ # Parse error.
+ return create_token(:bad_url,
+ :error => true,
+ :value => value + consume_bad_url)
+
+ when '\\'
+ if valid_escape? # checks the backslash just consumed plus the next character
+ value << consume_escaped
+ else
+ # Parse error.
+ return create_token(:bad_url,
+ :error => true,
+ :value => value + consume_bad_url
+ )
+ end
+
+ else
+ value << char
+ end
+ end
+
+ create_token(:url, :value => value)
+ end
+
+ # Converts a valid CSS number string into a number and returns the number.
+ #
+ # 4.3.14. http://dev.w3.org/csswg/css-syntax/#convert-a-string-to-a-number
+ def convert_string_to_number(str)
+   parts = RE_NUMBER_STR.match(str)
+   fraction_digits = parts[:fractional]
+
+   sign = parts[:sign] == '-' ? -1 : 1                          # s
+   integer = parts[:integer].to_i                               # i
+   fraction = fraction_digits.to_i                              # f
+   fraction_len = fraction_digits ? fraction_digits.length : 0  # d
+   exponent_sign = parts[:exponent_sign] == '-' ? -1 : 1        # t
+   exponent = parts[:exponent].to_i                             # e
+
+   # s * (i + f * 10^-d) * 10^(t * e), exactly as the spec defines it.
+   sign * (integer + fraction * 10**-fraction_len) * 10**(exponent_sign * exponent)
+ end
+
+ # Creates and returns a new token with the given _properties_.
+ def create_token(type, properties = {})
+   # Position and raw text come from the scanner's marker, which #consume
+   # sets before each token is read.
+   token = { :node => type, :pos => @s.marker, :raw => @s.marked }
+
+   token.merge!(properties)
+ end
+
+ # Preprocesses _input_ to prepare it for the tokenizer.
+ #
+ # 3.3. http://dev.w3.org/csswg/css-syntax/#input-preprocessing
+ def preprocess(input)
+   input = input.to_s.encode('UTF-8',
+     :invalid => :replace,
+     :undef => :replace)
+   input = input.scrub unless input.valid_encoding? # encode leaves UTF-8-tagged input untouched, so invalid bytes are replaced here
+   input.gsub!(/(?:\r\n|[\r\f])/, "\n") # CRLF, CR and FF all become LF
+   input.gsub!("\u0000", "\ufffd") # NUL becomes U+FFFD
+   input
+ end
+
+ # Returns `true` if the given three-character _text_ would start an
+ # identifier. If _text_ is `nil`, the current and next two characters in the
+ # input stream will be checked, but will not be consumed.
+ #
+ # 4.3.10. http://dev.w3.org/csswg/css-syntax/#would-start-an-identifier
+ def start_identifier?(text = nil)
+   text = @s.current + @s.peek(2) if text.nil?
+   first = text[0]
+   second = text[1]
+
+   case first
+   when '\\'
+     valid_escape?(text[0, 2])
+   when RE_NAME_START
+     true
+   when '-'
+     # After a hyphen: another hyphen, a name-start character or an escape.
+     !!(second == '-' || second =~ RE_NAME_START || valid_escape?(text[1, 2]))
+
+   else
+     false
+   end
+ end
+
+ # Returns `true` if the given three-character _text_ would start a number.
+ # If _text_ is `nil`, the current and next two characters in the input
+ # stream will be checked, but will not be consumed.
+ #
+ # 4.3.11. http://dev.w3.org/csswg/css-syntax/#starts-with-a-number
+ def start_number?(text = nil)
+   text = @s.current + @s.peek(2) if text.nil?
+   first, second, third = text[0], text[1], text[2]
+
+   case first
+   when RE_DIGIT
+     true
+   when '.'
+     !!(second =~ RE_DIGIT)
+   when '+', '-'
+     # A sign needs a digit next, or a "." that is itself followed by one.
+     !!(second =~ RE_DIGIT || (second == '.' && third =~ RE_DIGIT))
+
+   else
+     false
+   end
+ end
+
+ # Tokenizes the input stream and returns an array of tokens.
+ def tokenize
+   @s.reset
+   collected = []
+
+   # #consume returns nil once the scanner has reached the end of the input,
+   # which ends the loop.
+   while (token = consume)
+     collected << token
+   end
+   collected
+ end
+
+ # Returns `true` if the given two-character _text_ is the beginning of a
+ # valid escape sequence. If _text_ is `nil`, the current and next character
+ # in the input stream will be checked, but will not be consumed.
+ #
+ # 4.3.9. http://dev.w3.org/csswg/css-syntax/#starts-with-a-valid-escape
+ def valid_escape?(text = nil)
+ text = @s.current + @s.peek if text.nil? # default: the character just consumed plus the next one
+ !!(text[0] == '\\' && text[1] != "\n") # a backslash with nothing after it (text[1] is nil) still counts as valid
+ end
+ end
+
+end
diff --color -rubN rails-html-sanitizer-1.0.3-ori/lib/crass/token-scanner.rb rails-html-sanitizer-1.0.3/lib/crass/token-scanner.rb
--- rails-html-sanitizer-1.0.3-ori/lib/crass/token-scanner.rb 1970-01-01 01:00:00.000000000 +0100
+++ rails-html-sanitizer-1.0.3/lib/crass/token-scanner.rb 2015-04-18 06:16:46.000000000 +0200
@@ -0,0 +1,50 @@
+# encoding: utf-8
+
+module Crass
+
+ # Cursor over a list of tokens: the token-level counterpart of {Scanner}.
+ class TokenScanner
+ attr_reader :current, :pos, :tokens
+
+ def initialize(tokens)
+ @tokens = tokens.to_a
+ reset
+ end
+
+ # Runs the given block and returns the tokens consumed while it executed
+ # (an empty array if nothing was consumed).
+ def collect
+ first = @pos
+ yield
+ @tokens.slice(first...@pos) || []
+ end
+
+ # Returns the next token and advances past it, remembering it as the current
+ # token. Returns `nil`, without advancing, once the list is exhausted.
+ def consume
+ token = @tokens[@pos]
+ @pos += 1 if token
+ @current = token
+ end
+
+ # Returns the token that {#consume} would return next, leaving the pointer
+ # untouched; `nil` when there are no more tokens.
+ def peek
+ @tokens.at(@pos)
+ end
+
+ # Steps the pointer back one position, unless it is already at the start.
+ #
+ # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#reconsume-the-current-input-token
+ def reconsume
+ @pos -= 1 unless @pos <= 0
+ end
+
+ # Moves the pointer back to the first token and clears the current token.
+ def reset
+ @current = nil
+ @pos = 0
+ end
+ end
+
+end
diff --color -rubN rails-html-sanitizer-1.0.3-ori/lib/crass/version.rb rails-html-sanitizer-1.0.3/lib/crass/version.rb
--- rails-html-sanitizer-1.0.3-ori/lib/crass/version.rb 1970-01-01 01:00:00.000000000 +0100
+++ rails-html-sanitizer-1.0.3/lib/crass/version.rb 2015-04-18 06:16:46.000000000 +0200
@@ -0,0 +1,5 @@
+# encoding: utf-8
+
+module Crass
+ VERSION = '1.0.2'.freeze # frozen so the version string cannot be mutated in place
+end
diff --color -rubN rails-html-sanitizer-1.0.3-ori/lib/crass.rb rails-html-sanitizer-1.0.3/lib/crass.rb
--- rails-html-sanitizer-1.0.3-ori/lib/crass.rb 1970-01-01 01:00:00.000000000 +0100
+++ rails-html-sanitizer-1.0.3/lib/crass.rb 2015-04-18 06:16:46.000000000 +0200
@@ -0,0 +1,22 @@
+# encoding: utf-8
+require_relative 'crass/parser'
+
+# Entry points for Crass, a CSS parser based on the CSS Syntax Module Level 3 spec.
+module Crass
+ class << self
+ # Parses _input_ as a complete CSS stylesheet and returns the parse tree.
+ #
+ # See {Tokenizer#initialize} for _options_.
+ def parse(input, options = {})
+ Parser.parse_stylesheet(input, options)
+ end
+
+ # Parses _input_ as a list of CSS properties (for example the value of an
+ # HTML element's `style` attribute) and returns the parse tree.
+ #
+ # See {Tokenizer#initialize} for _options_.
+ def parse_properties(input, options = {})
+ Parser.parse_properties(input, options)
+ end
+ end
+end
diff --color -rubN rails-html-sanitizer-1.0.3-ori/lib/loofah/html5/scrub.rb rails-html-sanitizer-1.0.3/lib/loofah/html5/scrub.rb
--- rails-html-sanitizer-1.0.3-ori/lib/loofah/html5/scrub.rb 1970-01-01 01:00:00.000000000 +0100
+++ rails-html-sanitizer-1.0.3/lib/loofah/html5/scrub.rb 2023-08-29 12:50:26.371689045 +0200
@@ -0,0 +1,26 @@
+require "crass"
+
+module Loofah
+ module HTML5
+ module Scrub
+ # Called as Loofah::HTML5::Scrub.<method>, so expose instance methods on the module too.
+ extend self
+
+ # Reduces +attr_node+'s value to its hash, ident and string components plus any
+ # `url()` whose target starts with "#", joined by spaces. No-op for a nil value.
+ def scrub_attribute_that_allows_local_ref(attr_node)
+ return unless attr_node.value
+ nodes = Crass::Parser.new(attr_node.value).parse_component_values
+ values = nodes.map do |node|
+ case node[:node]
+ when :url
+ node[:raw] if node[:value].start_with?("#")
+ when :hash, :ident, :string
+ node[:raw]
+ end
+ end.compact
+ attr_node.value = values.join(" ")
+ end
+ end
+ end
+end
diff --color -rubN rails-html-sanitizer-1.0.3-ori/lib/rails/html/scrubbers.rb rails-html-sanitizer-1.0.3/lib/rails/html/scrubbers.rb
--- rails-html-sanitizer-1.0.3-ori/lib/rails/html/scrubbers.rb 2023-08-28 14:25:04.825458612 +0200
+++ rails-html-sanitizer-1.0.3/lib/rails/html/scrubbers.rb 2023-08-28 14:41:54.377457394 +0200
@@ -60,9 +60,9 @@
end
def scrub(node)
- if node.cdata?
- text = node.document.create_text_node node.text
- node.replace text
+ if Loofah::HTML5::Scrub.cdata_needs_escaping?(node)
+ replacement = Loofah::HTML5::Scrub.cdata_escape(node)
+ node.replace(replacement)
return CONTINUE
end
return CONTINUE if skip_node?(node)
@@ -138,14 +138,10 @@
end
if Loofah::HTML5::WhiteList::ATTR_VAL_IS_URI.include?(attr_name)
- # this block lifted nearly verbatim from HTML5 sanitization
- val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(Loofah::HTML5::Scrub::CONTROL_CHARACTERS,'').downcase
- if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! Loofah::HTML5::WhiteList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(Loofah::HTML5::WhiteList::PROTOCOL_SEPARATOR)[0])
- attr_node.remove
- end
+ return if Loofah::HTML5::Scrub.scrub_uri_attribute(attr_node)
end
if Loofah::HTML5::WhiteList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
- attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
+ Loofah::HTML5::Scrub.scrub_attribute_that_allows_local_ref(attr_node)
end
if Loofah::HTML5::WhiteList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
attr_node.remove
diff --color -rubN rails-html-sanitizer-1.0.3-ori/test/sanitizer_test.rb rails-html-sanitizer-1.0.3/test/sanitizer_test.rb
--- rails-html-sanitizer-1.0.3-ori/test/sanitizer_test.rb 2023-08-28 14:25:04.825458612 +0200
+++ rails-html-sanitizer-1.0.3/test/sanitizer_test.rb 2023-08-28 16:39:45.544550156 +0200
@@ -54,6 +54,7 @@
def test_strip_tags_with_quote
input = '<" <img src="trollface.gif" onload="alert(1)"> hi'
- assert_equal ' hi', full_sanitize(input)
+ expected = libxml_2_9_14_recovery_lt? ? %{<" hi} : %{ hi} # result depends on how libxml2 recovers from the stray "<"
+ assert_equal(expected, full_sanitize(input))
end
@@ -79,11 +80,16 @@
end
def test_strip_cdata
- assert_equal "This has a ]]> here.", full_sanitize("This has a <![CDATA[<section>]]> here.")
+ # The stripped text differs on libxml2 2.9.14; see libxml_2_9_14_recovery_lt_bang?.
+ expected = libxml_2_9_14_recovery_lt_bang? ? "This has a <![CDATA[]]> here." : "This has a ]]> here."
+ assert_equal(expected, full_sanitize("This has a <![CDATA[<section>]]> here."))
end
def test_strip_unclosed_cdata
- assert_equal "This has an unclosed ]] here...", full_sanitize("This has an unclosed <![CDATA[<section>]] here...")
+ # The stripped text differs on libxml2 2.9.14; see libxml_2_9_14_recovery_lt_bang?.
+ unclosed = "This has an unclosed <![CDATA[<section>]] here..."
+ expected = libxml_2_9_14_recovery_lt_bang? ? "This has an unclosed <![CDATA[]] here..." : "This has an unclosed ]] here..."
+ assert_equal(expected, full_sanitize(unclosed))
end
def test_strip_blank_string
@@ -434,11 +440,15 @@
end
def test_should_sanitize_cdata_section
- assert_sanitized "<![CDATA[<span>section</span>]]>", "section]]>"
+ # The sanitized output differs on libxml2 2.9.14; see libxml_2_9_14_recovery_lt_bang?.
+ expected = libxml_2_9_14_recovery_lt_bang? ? "<![CDATA[<span>section</span>]]>" : "section]]>"
+ assert_sanitized("<![CDATA[<span>section</span>]]>", expected)
end
def test_should_sanitize_unterminated_cdata_section
- assert_sanitized "<![CDATA[<span>neverending...", "neverending..."
+ # The sanitized output differs on libxml2 2.9.14; see libxml_2_9_14_recovery_lt_bang?.
+ expected = libxml_2_9_14_recovery_lt_bang? ? "<![CDATA[<span>neverending...</span>" : "neverending..."
+ assert_sanitized("<![CDATA[<span>neverending...", expected)
end
def test_should_not_mangle_urls_with_ampersand
@@ -482,7 +492,7 @@
assert_equal %(<a data-foo="foo">foo</a>), white_list_sanitize(text, attributes: ['data-foo'])
end
-protected
+ protected
def xpath_sanitize(input, options = {})
XpathRemovalTestSanitizer.new.sanitize(input, options)
@@ -527,4 +537,119 @@
ensure
Rails::Html::WhiteListSanitizer.allowed_attributes = old_attributes
end
+ # The helpers above are declared under `protected`; Minitest only discovers
+ # public test_* methods, so switch back to public for the tests added below.
+ public
+
+ def test_mediatype_text_html_disallowed
+ # Both the lower- and upper-case spelling of the scheme must be stripped.
+ expected = %q(<img>)
+ %w(data DATA).each do |scheme|
+ input = %(<img src="#{scheme}:text/html;base64,PHNjcmlwdD5hbGVydCgnWFNTJyk8L3NjcmlwdD4=">)
+ assert_equal(expected, safe_list_sanitize(input))
+ end
+ end
+
+ def test_mediatype_image_svg_xml_disallowed
+ # An SVG data URI must be removed from src whichever case the scheme uses.
+ expected = %q(<img>)
+
+ %w(data DATA).each do |scheme|
+ input = %(<img src="#{scheme}:image/svg+xml;base64,PHNjcmlwdD5hbGVydCgnWFNTJyk8L3NjcmlwdD4=">)
+ actual = safe_list_sanitize(input)
+
+ assert_equal(expected, actual)
+ end
+ end
+
+ def test_mediatype_other_disallowed
+ # A data URI with some other media type (here `foo`) must be removed from href.
+ expected = %q(<a>foo</a>)
+
+ %w(data DATA).each do |scheme|
+ input = %(<a href="#{scheme}:foo;base64,PHNjcmlwdD5hbGVydCgnWFNTJyk8L3NjcmlwdD4=">foo</a>)
+ actual = safe_list_sanitize(input)
+
+ assert_equal(expected, actual)
+ end
+ end
+
+ def test_scrubbing_svg_attr_values_that_allow_ref
+ # The remote url() reference is dropped; the ident and hash colours stay.
+ input = '<div fill="yellow url(http://bad.com/) #fff">hey</div>'
+ expected = '<div fill="yellow #fff">hey</div>'
+
+ actual = scope_allowed_attributes(%w(fill)) { safe_list_sanitize(input) }
+
+ assert_equal(expected, actual)
+ end
+
+ def test_style_with_css_payload
+ # CSS text inside an allowed <style> element.
+ input = '<style>div > span { background: "red"; }</style>'
+ expected = '<style>div > span { background: "red"; }</style>'
+
+ assert_equal(expected, safe_list_sanitize(input, tags: ["style"]))
+ end
+
+ def test_combination_of_select_and_style_with_css_payload
+ # CSS text in a <style> element that is nested inside <select>.
+ input = '<select><style>div > span { background: "red"; }</style></select>'
+ expected = '<select><style>div > span { background: "red"; }</style></select>'
+
+ assert_equal(expected, safe_list_sanitize(input, tags: ["select", "style"]))
+ end
+
+ def test_combination_of_select_and_style_with_script_payload
+ # A <script> element nested in <style>, itself nested in <select>.
+ input = '<select><style><script>alert(1)</script></style></select>'
+ expected = '<select><style><script>alert(1)</script></style></select>'
+
+ assert_equal(expected, safe_list_sanitize(input, tags: ["select", "style"]))
+ end
+
+ def test_combination_of_svg_and_style_with_script_payload
+ # A <script> element nested in <style>, itself nested in <svg>.
+ input = '<svg><style><script>alert(1)</script></style></svg>'
+ expected = '<svg><style><script>alert(1)</script></style></svg>'
+
+ assert_equal(expected, safe_list_sanitize(input, tags: ["svg", "style"]))
+ end
+
+ def test_combination_of_math_and_style_with_img_payload
+ # Checked twice: with <img> left out of the allowed tags and with it included.
+ tag_lists = [
+ ["math", "style"],
+ ["math", "style", "img"]
+ ]
+ tag_lists.each do |tags|
+ input = '<math><style><img src=x onerror=alert(1)></style></math>'
+ expected = '<math><style><img src=x onerror=alert(1)></style></math>'
+ actual = safe_list_sanitize(input, tags: tags)
+ assert_equal(expected, actual)
+ end
+ end
+
+ def test_combination_of_svg_and_style_with_img_payload
+ # Checked twice: with <img> left out of the allowed tags and with it included.
+ tag_lists = [
+ ["svg", "style"],
+ ["svg", "style", "img"]
+ ]
+ tag_lists.each do |tags|
+ input = '<svg><style><img src=x onerror=alert(1)></style></svg>'
+ expected = '<svg><style><img src=x onerror=alert(1)></style></svg>'
+ actual = safe_list_sanitize(input, tags: tags)
+ assert_equal(expected, actual)
+ end
+ end
+ def libxml_2_9_14_recovery_lt?
+ # True when Nokogiri.uses_libxml? accepts an optional argument (arity -1) and reports libxml2 ">= 2.9.14"; recovery changed in 2.9.14, see https://github.com/sparklemotion/nokogiri/releases/tag/v1.13.5
+ Nokogiri.method(:uses_libxml?).arity == -1 && Nokogiri.uses_libxml?(">= 2.9.14")
+ end
+ def libxml_2_9_14_recovery_lt_bang?
+ # True only when Nokogiri.uses_libxml? accepts an optional argument (arity -1) and reports libxml2 "= 2.9.14": recovery changed in 2.9.14, see https://github.com/sparklemotion/nokogiri/releases/tag/v1.13.5
+ # and was then reverted in 2.10.0, see https://gitlab.gnome.org/GNOME/libxml2/-/issues/380
+ Nokogiri.method(:uses_libxml?).arity == -1 && Nokogiri.uses_libxml?("= 2.9.14")
+ end
end