# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""

from __future__ import unicode_literals

import re
import string

import six

from bleach._vendor.html5lib import (
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import constants
from bleach._vendor.html5lib.constants import (
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import _ReparseException as ReparseException
from bleach._vendor.html5lib.filters.base import Filter
from bleach._vendor.html5lib.filters.sanitizer import allowed_protocols
from bleach._vendor.html5lib.filters.sanitizer import Filter as SanitizerFilter
from bleach._vendor.html5lib._inputstream import HTMLInputStream
from bleach._vendor.html5lib.serializer import HTMLSerializer
from bleach._vendor.html5lib._tokenizer import HTMLTokenizer
from bleach._vendor.html5lib._trie import Trie


#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes['StartTag'],
    constants.tokenTypes['EndTag'],
    constants.tokenTypes['EmptyTag'],
}
CHARACTERS_TYPE = constants.tokenTypes['Characters']
PARSEERROR_TYPE = constants.tokenTypes['ParseError']


#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = [
    'a',
    'abbr',
    'address',
    'area',
    'article',
    'aside',
    'audio',
    'b',
    'base',
    'bdi',
    'bdo',
    'blockquote',
    'body',
    'br',
    'button',
    'canvas',
    'caption',
    'cite',
    'code',
    'col',
    'colgroup',
    'data',
    'datalist',
    'dd',
    'del',
    'details',
    'dfn',
    'dialog',
    'div',
    'dl',
    'dt',
    'em',
    'embed',
    'fieldset',
    'figcaption',
    'figure',
    'footer',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'head',
    'header',
    'hgroup',
    'hr',
    'html',
    'i',
    'iframe',
    'img',
    'input',
    'ins',
    'kbd',
    'keygen',
    'label',
    'legend',
    'li',
    'link',
    'map',
    'mark',
    'menu',
    'meta',
    'meter',
    'nav',
    'noscript',
    'object',
    'ol',
    'optgroup',
    'option',
    'output',
    'p',
    'param',
    'picture',
    'pre',
    'progress',
    'q',
    'rp',
    'rt',
    'ruby',
    's',
    'samp',
    'script',
    'section',
    'select',
    'slot',
    'small',
    'source',
    'span',
    'strong',
    'style',
    'sub',
    'summary',
    'sup',
    'table',
    'tbody',
    'td',
    'template',
    'textarea',
    'tfoot',
    'th',
    'thead',
    'time',
    'title',
    'tr',
    'track',
    'u',
    'ul',
    'var',
    'video',
    'wbr',
]


class InputStreamWithMemory(object):
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """
    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return six.text_type('').join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ['<']
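
# Illustrative sketch of the buffer behavior: when the tokenizer hits a
# '<' it calls start_tag(), so after it then reads 'e', 'm', '>',
# get_tag() returns the exact original text '<em>' -- unnormalized, which
# is what emitCurrentToken() below uses when escaping disallowed tags.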


class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""
    def __init__(self, consume_entities=False, **kwargs):
        super(BleachHTMLTokenizer, self).__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

    def __iter__(self):
        last_error_token = None

        for token in super(BleachHTMLTokenizer, self).__iter__():
            if last_error_token is not None:
                if ((last_error_token['data'] == 'invalid-character-in-attribute-name' and
                     token['type'] in TAG_TOKEN_TYPES and
                     token.get('data'))):
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token['data'] = [
                        item for item in token['data']
                        if ('"' not in item[0] and
                            "'" not in item[0] and
                            '<' not in item[0])
                    ]
                    last_error_token = None
                    yield token

                elif ((last_error_token['data'] == 'expected-closing-tag-but-got-char' and
                       self.parser.tags is not None and
                       token['data'].lower().strip() not in self.parser.tags)):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token['data'] = self.stream.get_tag()
                    token['type'] = CHARACTERS_TYPE

                    last_error_token = None
                    yield token

                elif token['type'] == PARSEERROR_TYPE:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token['type'] == PARSEERROR_TYPE:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super(BleachHTMLTokenizer, self).consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken['data'][-1][1] += '&'

        else:
            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": '&'})
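
    # Note (illustrative): with consume_entities=False, tokenizing the text
    # 'AT&amp;T' leaves the five characters '&amp;' in the token data rather
    # than converting them to '&'; BleachHTMLSerializer.escape_base_amp later
    # undoes html5lib's re-escaping so the output isn't double-escaped.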

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super(BleachHTMLTokenizer, self).tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if ((self.parser.tags is not None and
             token['type'] in TAG_TOKEN_TYPES and
             token['name'].lower() not in self.parser.tags)):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                # If we're stripping the token, we just throw in an empty
                # string token.
                new_data = ''

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {
                'type': CHARACTERS_TYPE,
                'data': new_data
            }

            self.currentToken = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        super(BleachHTMLTokenizer, self).emitCurrentToken()
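
    # For example (illustrative): with parser.tags == ['b'] and
    # parser.strip == False, an EndTag token for 'span' is replaced by a
    # Characters token whose data is the original '</span>' text pulled
    # from the stream history, which then gets escaped on serialization.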


class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""
    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = [tag.lower() for tag in tags] if tags is not None else None
        self.strip = strip
        self.consume_entities = consume_entities
        super(BleachHTMLParser, self).__init__(**kwargs)

    def _parse(self, stream, innerHTML=False, container='div', scripting=True, **kwargs):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream,
            consume_entities=self.consume_entities,
            parser=self,
            **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()


def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == '#':
        if value[1] in ('x', 'X'):
            return six.unichr(int(value[2:], 16))
        return six.unichr(int(value[1:], 10))

    return ENTITIES.get(value, None)
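
# For example (illustrative only): convert_entity('#x26') and
# convert_entity('#38') both return '&', convert_entity('amp') returns '&',
# and an unrecognized name like convert_entity('nosuchentity') returns None.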


def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if '&' not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith('&'):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    # Skip past the entity: its length plus one for the "&"
                    # at the beginning and one for the ";" at the end
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return ''.join(new_text)
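
# For example (illustrative only):
#   convert_entities('AT&amp;T')  ->  'AT&T'
#   convert_entities('&xx;')      ->  '&xx;'  (no matching entity; left alone)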


def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with
    a ";". This ignores ambiguous character entities that have no ";" at the
    end.

    :arg stream: the character stream

    :returns: ``None`` or the entity string without "&" or ";"

    """
    # Nix the & at the beginning
    if stream[0] != '&':
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ''
    end_characters = '<&=;' + string.whitespace

    # Handle number entities
    if stream and stream[0] == '#':
        possible_entity = '#'
        stream.pop(0)

        if stream and stream[0] in ('x', 'X'):
            allowed = '0123456789abcdefABCDEF'
            possible_entity += stream.pop(0)
        else:
            allowed = '0123456789'

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ';':
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            break
        possible_entity += c

    if possible_entity and stream and stream[0] == ';':
        return possible_entity

    return None
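
# For example (illustrative only): match_entity('&amp; more') returns 'amp',
# match_entity('&#x26;') returns '#x26', and match_entity('&amp more')
# returns None because there is no terminating ";".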


AMP_SPLIT_RE = re.compile('(&)')


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield '&' + part
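
# For example (illustrative only), next_possible_entity('this &amp; &that')
# yields 'this ', '&amp; ', and '&that'.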


class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expects the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace('&amp;', '&')

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith('&'):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield '&' + entity + ';'

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2:]
                    if part:
                        yield part
                    continue

            yield part.replace('&', '&amp;')
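
    # For example (illustrative only), given a serializer instance:
    #   ''.join(serializer.escape_base_amp('x=1&y=2&amp;z'))
    #   ->  'x=1&amp;y=2&amp;z'
    # Bare ampersands get escaped; the existing entity is left as-is.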

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
            if in_tag:
                if stoken == '>':
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        for part in self.escape_base_amp(stoken):
                            yield part

                        after_equals = False
                        continue

                elif stoken == '=':
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith('<'):
                    in_tag = True
                yield stoken
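
# A minimal end-to-end sketch (illustrative only; bleach.clean() is the
# supported entry point and layers the sanitizer filter on top of this):
#
#   parser = BleachHTMLParser(
#       tags=['a'], strip=False, consume_entities=False,
#       namespaceHTMLElements=False,
#   )
#   dom = parser.parseFragment('<a href="?x=1&y=2&amp;z=3">link</a>')
#   serializer = BleachHTMLSerializer(quote_attr_values='always')
#   serializer.render(getTreeWalker('etree')(dom))
#
# The href value comes out as "?x=1&amp;y=2&amp;z=3": bare ampersands are
# escaped, but the pre-existing "&amp;" is not double-escaped.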