hub/venv/lib/python3.7/site-packages/bleach/html5lib_shim.py

# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""

from __future__ import unicode_literals

import re
import string

import six

from bleach._vendor.html5lib import (
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import constants
from bleach._vendor.html5lib.constants import (
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import _ReparseException as ReparseException
from bleach._vendor.html5lib.filters.base import Filter
from bleach._vendor.html5lib.filters.sanitizer import allowed_protocols
from bleach._vendor.html5lib.filters.sanitizer import Filter as SanitizerFilter
from bleach._vendor.html5lib._inputstream import HTMLInputStream
from bleach._vendor.html5lib.serializer import HTMLSerializer
from bleach._vendor.html5lib._tokenizer import HTMLTokenizer
from bleach._vendor.html5lib._trie import Trie


#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes['StartTag'],
    constants.tokenTypes['EndTag'],
    constants.tokenTypes['EmptyTag']
}
CHARACTERS_TYPE = constants.tokenTypes['Characters']
PARSEERROR_TYPE = constants.tokenTypes['ParseError']


#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = [
    'a',
    'abbr',
    'address',
    'area',
    'article',
    'aside',
    'audio',
    'b',
    'base',
    'bdi',
    'bdo',
    'blockquote',
    'body',
    'br',
    'button',
    'canvas',
    'caption',
    'cite',
    'code',
    'col',
    'colgroup',
    'data',
    'datalist',
    'dd',
    'del',
    'details',
    'dfn',
    'dialog',
    'div',
    'dl',
    'dt',
    'em',
    'embed',
    'fieldset',
    'figcaption',
    'figure',
    'footer',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'head',
    'header',
    'hgroup',
    'hr',
    'html',
    'i',
    'iframe',
    'img',
    'input',
    'ins',
    'kbd',
    'keygen',
    'label',
    'legend',
    'li',
    'link',
    'map',
    'mark',
    'menu',
    'meta',
    'meter',
    'nav',
    'noscript',
    'object',
    'ol',
    'optgroup',
    'option',
    'output',
    'p',
    'param',
    'picture',
    'pre',
    'progress',
    'q',
    'rp',
    'rt',
    'ruby',
    's',
    'samp',
    'script',
    'section',
    'select',
    'slot',
    'small',
    'source',
    'span',
    'strong',
    'style',
    'sub',
    'summary',
    'sup',
    'table',
    'tbody',
    'td',
    'template',
    'textarea',
    'tfoot',
    'th',
    'thead',
    'time',
    'title',
    'tr',
    'track',
    'u',
    'ul',
    'var',
    'video',
    'wbr',
]


class InputStreamWithMemory(object):
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """
    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return six.text_type('').join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ['<']


class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""
    def __init__(self, consume_entities=False, **kwargs):
        super(BleachHTMLTokenizer, self).__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

    def __iter__(self):
        last_error_token = None

        for token in super(BleachHTMLTokenizer, self).__iter__():
            if last_error_token is not None:
                if ((last_error_token['data'] == 'invalid-character-in-attribute-name' and
                     token['type'] in TAG_TOKEN_TYPES and
                     token.get('data'))):
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token['data'] = [
                        item for item in token['data']
                        if ('"' not in item[0] and
                            "'" not in item[0] and
                            '<' not in item[0])
                    ]
                    last_error_token = None
                    yield token

                elif ((last_error_token['data'] == 'expected-closing-tag-but-got-char' and
                       self.parser.tags is not None and
                       token['data'].lower().strip() not in self.parser.tags)):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token['data'] = self.stream.get_tag()
                    token['type'] = CHARACTERS_TYPE

                    last_error_token = None
                    yield token

                elif token['type'] == PARSEERROR_TYPE:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token['type'] == PARSEERROR_TYPE:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super(BleachHTMLTokenizer, self).consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken['data'][-1][1] += '&'

        else:
            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": '&'})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super(BleachHTMLTokenizer, self).tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if ((self.parser.tags is not None and
             token['type'] in TAG_TOKEN_TYPES and
             token['name'].lower() not in self.parser.tags)):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                # If we're stripping the token, we just throw in an empty
                # string token.
                new_data = ''

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {
                'type': CHARACTERS_TYPE,
                'data': new_data
            }

            self.currentToken = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        super(BleachHTMLTokenizer, self).emitCurrentToken()


class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""
    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = [tag.lower() for tag in tags] if tags is not None else None
        self.strip = strip
        self.consume_entities = consume_entities
        super(BleachHTMLParser, self).__init__(**kwargs)

    def _parse(self, stream, innerHTML=False, container='div', scripting=True, **kwargs):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream,
            consume_entities=self.consume_entities,
            parser=self,
            **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()


def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == '#':
        if value[1] in ('x', 'X'):
            return six.unichr(int(value[2:], 16))
        return six.unichr(int(value[1:], 10))

    return ENTITIES.get(value, None)


def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if '&' not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith('&'):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2:]
                    if part:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return ''.join(new_text)


def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with
    a ";". This ignoresambiguous character entities that have no ";" at the
    end.

    :arg stream: the character stream

    :returns: ``None`` or the entity string without "&" or ";"

    """
    # Nix the & at the beginning
    if stream[0] != '&':
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ''
    end_characters = '<&=;' + string.whitespace

    # Handle number entities
    if stream and stream[0] == '#':
        possible_entity = '#'
        stream.pop(0)

        if stream and stream[0] in ('x', 'X'):
            allowed = '0123456789abcdefABCDEF'
            possible_entity += stream.pop(0)
        else:
            allowed = '0123456789'

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ';':
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            break
        possible_entity += c

    if possible_entity and stream and stream[0] == ';':
        return possible_entity

    return None


AMP_SPLIT_RE = re.compile('(&)')


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield '&' + part


class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace('&amp;', '&')

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith('&'):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield '&' + entity + ';'

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2:]
                    if part:
                        yield part
                    continue

            yield part.replace('&', '&amp;')

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and conver & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
            if in_tag:
                if stoken == '>':
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        for part in self.escape_base_amp(stoken):
                            yield part

                        after_equals = False
                        continue

                elif stoken == '=':
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith('<'):
                    in_tag = True
                yield stoken
Add virtual environment to the git repository, this isn't 100% right but it's practical at this development point 2020-06-16 10:34:17 -04:00			`# flake8: noqa`
			`"""`
			`Shim module between Bleach and html5lib. This makes it easier to upgrade the`
			`html5lib library without having to change a lot of code.`
			`"""`

			`from __future__ import unicode_literals`

			`import re`
			`import string`

			`import six`

			`from bleach._vendor.html5lib import (`
			`HTMLParser,`
			`getTreeWalker,`
			`)`
			`from bleach._vendor.html5lib import constants`
			`from bleach._vendor.html5lib.constants import (`
			`namespaces,`
			`prefixes,`
			`)`
			`from bleach._vendor.html5lib.constants import _ReparseException as ReparseException`
			`from bleach._vendor.html5lib.filters.base import Filter`
			`from bleach._vendor.html5lib.filters.sanitizer import allowed_protocols`
			`from bleach._vendor.html5lib.filters.sanitizer import Filter as SanitizerFilter`
			`from bleach._vendor.html5lib._inputstream import HTMLInputStream`
			`from bleach._vendor.html5lib.serializer import HTMLSerializer`
			`from bleach._vendor.html5lib._tokenizer import HTMLTokenizer`
			`from bleach._vendor.html5lib._trie import Trie`


			`#: Map of entity name to expanded entity`
			`ENTITIES = constants.entities`

			`#: Trie of html entity string -> character representation`
			`ENTITIES_TRIE = Trie(ENTITIES)`

			`#: Token type constants--these never change`
			`TAG_TOKEN_TYPES = {`
			`constants.tokenTypes['StartTag'],`
			`constants.tokenTypes['EndTag'],`
			`constants.tokenTypes['EmptyTag']`
			`}`
			`CHARACTERS_TYPE = constants.tokenTypes['Characters']`
			`PARSEERROR_TYPE = constants.tokenTypes['ParseError']`


			`#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17`
			`#: https://html.spec.whatwg.org/multipage/indices.html#elements-3`
			`HTML_TAGS = [`
			`'a',`
			`'abbr',`
			`'address',`
			`'area',`
			`'article',`
			`'aside',`
			`'audio',`
			`'b',`
			`'base',`
			`'bdi',`
			`'bdo',`
			`'blockquote',`
			`'body',`
			`'br',`
			`'button',`
			`'canvas',`
			`'caption',`
			`'cite',`
			`'code',`
			`'col',`
			`'colgroup',`
			`'data',`
			`'datalist',`
			`'dd',`
			`'del',`
			`'details',`
			`'dfn',`
			`'dialog',`
			`'div',`
			`'dl',`
			`'dt',`
			`'em',`
			`'embed',`
			`'fieldset',`
			`'figcaption',`
			`'figure',`
			`'footer',`
			`'form',`
			`'h1',`
			`'h2',`
			`'h3',`
			`'h4',`
			`'h5',`
			`'h6',`
			`'head',`
			`'header',`
			`'hgroup',`
			`'hr',`
			`'html',`
			`'i',`
			`'iframe',`
			`'img',`
			`'input',`
			`'ins',`
			`'kbd',`
			`'keygen',`
			`'label',`
			`'legend',`
			`'li',`
			`'link',`
			`'map',`
			`'mark',`
			`'menu',`
			`'meta',`
			`'meter',`
			`'nav',`
			`'noscript',`
			`'object',`
			`'ol',`
			`'optgroup',`
			`'option',`
			`'output',`
			`'p',`
			`'param',`
			`'picture',`
			`'pre',`
			`'progress',`
			`'q',`
			`'rp',`
			`'rt',`
			`'ruby',`
			`'s',`
			`'samp',`
			`'script',`
			`'section',`
			`'select',`
			`'slot',`
			`'small',`
			`'source',`
			`'span',`
			`'strong',`
			`'style',`
			`'sub',`
			`'summary',`
			`'sup',`
			`'table',`
			`'tbody',`
			`'td',`
			`'template',`
			`'textarea',`
			`'tfoot',`
			`'th',`
			`'thead',`
			`'time',`
			`'title',`
			`'tr',`
			`'track',`
			`'u',`
			`'ul',`
			`'var',`
			`'video',`
			`'wbr',`
			`]`


			`class InputStreamWithMemory(object):`
			`"""Wraps an HTMLInputStream to remember characters since last <`

			`This wraps existing HTMLInputStream classes to keep track of the stream`
			`since the last < which marked an open tag state.`

			`"""`
			`def __init__(self, inner_stream):`
			`self._inner_stream = inner_stream`
			`self.reset = self._inner_stream.reset`
			`self.position = self._inner_stream.position`
			`self._buffer = []`

			`@property`
			`def errors(self):`
			`return self._inner_stream.errors`

			`@property`
			`def charEncoding(self):`
			`return self._inner_stream.charEncoding`

			`@property`
			`def changeEncoding(self):`
			`return self._inner_stream.changeEncoding`

			`def char(self):`
			`c = self._inner_stream.char()`
			`# char() can return None if EOF, so ignore that`
			`if c:`
			`self._buffer.append(c)`
			`return c`

			`def charsUntil(self, characters, opposite=False):`
			`chars = self._inner_stream.charsUntil(characters, opposite=opposite)`
			`self._buffer.extend(list(chars))`
			`return chars`

			`def unget(self, char):`
			`if self._buffer:`
			`self._buffer.pop(-1)`
			`return self._inner_stream.unget(char)`

			`def get_tag(self):`
			`"""Returns the stream history since last '<'`

			`Since the buffer starts at the last '<' as as seen by tagOpenState(),`
			`we know that everything from that point to when this method is called`
			`is the "tag" that is being tokenized.`

			`"""`
			`return six.text_type('').join(self._buffer)`

			`def start_tag(self):`
			`"""Resets stream history to just '<'`

			`This gets called by tagOpenState() which marks a '<' that denotes an`
			`open tag. Any time we see that, we reset the buffer.`

			`"""`
			`self._buffer = ['<']`


			`class BleachHTMLTokenizer(HTMLTokenizer):`
			`"""Tokenizer that doesn't consume character entities"""`
			`def __init__(self, consume_entities=False, **kwargs):`
			`super(BleachHTMLTokenizer, self).__init__(**kwargs)`

			`self.consume_entities = consume_entities`

			`# Wrap the stream with one that remembers the history`
			`self.stream = InputStreamWithMemory(self.stream)`

			`def __iter__(self):`
			`last_error_token = None`

			`for token in super(BleachHTMLTokenizer, self).__iter__():`
			`if last_error_token is not None:`
			`if ((last_error_token['data'] == 'invalid-character-in-attribute-name' and`
			`token['type'] in TAG_TOKEN_TYPES and`
			`token.get('data'))):`
			`# Remove attribute names that have ', " or < in them`
			`# because those characters are invalid for attribute names.`
			`token['data'] = [`
			`item for item in token['data']`
			`if ('"' not in item[0] and`
			`"'" not in item[0] and`
			`'<' not in item[0])`
			`]`
			`last_error_token = None`
			`yield token`

			`elif ((last_error_token['data'] == 'expected-closing-tag-but-got-char' and`
			`self.parser.tags is not None and`
			`token['data'].lower().strip() not in self.parser.tags)):`
			`# We've got either a malformed tag or a pseudo-tag or`
			`# something that html5lib wants to turn into a malformed`
			`# comment which Bleach clean() will drop so we interfere`
			`# with the token stream to handle it more correctly.`
			`#`
			`# If this is an allowed tag, it's malformed and we just let`
			`# the html5lib parser deal with it--we don't enter into this`
			`# block.`
			`#`
			`# If this is not an allowed tag, then we convert it to`
			`# characters and it'll get escaped in the sanitizer.`
			`token['data'] = self.stream.get_tag()`
			`token['type'] = CHARACTERS_TYPE`

			`last_error_token = None`
			`yield token`

			`elif token['type'] == PARSEERROR_TYPE:`
			`# If the token is a parse error, then let the last_error_token`
			`# go, and make token the new last_error_token`
			`yield last_error_token`
			`last_error_token = token`

			`else:`
			`yield last_error_token`
			`yield token`
			`last_error_token = None`

			`continue`

			`# If the token is a ParseError, we hold on to it so we can get the`
			`# next token and potentially fix it.`
			`if token['type'] == PARSEERROR_TYPE:`
			`last_error_token = token`
			`continue`

			`yield token`

			`if last_error_token:`
			`yield last_error_token`

			`def consumeEntity(self, allowedChar=None, fromAttribute=False):`
			`# If this tokenizer is set to consume entities, then we can let the`
			`# superclass do its thing.`
			`if self.consume_entities:`
			`return super(BleachHTMLTokenizer, self).consumeEntity(allowedChar, fromAttribute)`

			`# If this tokenizer is set to not consume entities, then we don't want`
			`# to consume and convert them, so this overrides the html5lib tokenizer's`
			`# consumeEntity so that it's now a no-op.`
			`#`
			`# However, when that gets called, it's consumed an &, so we put that back in`
			`# the stream.`
			`if fromAttribute:`
			`self.currentToken['data'][-1][1] += '&'`

			`else:`
			`self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": '&'})`

			`def tagOpenState(self):`
			`# This state marks a < that is either a StartTag, EndTag, EmptyTag,`
			`# or ParseError. In all cases, we want to drop any stream history`
			`# we've collected so far and we do that by calling start_tag() on`
			`# the input stream wrapper.`
			`self.stream.start_tag()`
			`return super(BleachHTMLTokenizer, self).tagOpenState()`

			`def emitCurrentToken(self):`
			`token = self.currentToken`

			`if ((self.parser.tags is not None and`
			`token['type'] in TAG_TOKEN_TYPES and`
			`token['name'].lower() not in self.parser.tags)):`
			`# If this is a start/end/empty tag for a tag that's not in our`
			`# allowed list, then it gets stripped or escaped. In both of these`
			`# cases it gets converted to a Characters token.`
			`if self.parser.strip:`
			`# If we're stripping the token, we just throw in an empty`
			`# string token.`
			`new_data = ''`

			`else:`
			`# If we're escaping the token, we want to escape the exact`
			`# original string. Since tokenizing also normalizes data`
			`# and this is a tag-like thing, we've lost some information.`
			`# So we go back through the stream to get the original`
			`# string and use that.`
			`new_data = self.stream.get_tag()`

			`new_token = {`
			`'type': CHARACTERS_TYPE,`
			`'data': new_data`
			`}`

			`self.currentToken = new_token`
			`self.tokenQueue.append(new_token)`
			`self.state = self.dataState`
			`return`

			`super(BleachHTMLTokenizer, self).emitCurrentToken()`


			`class BleachHTMLParser(HTMLParser):`
			`"""Parser that uses BleachHTMLTokenizer"""`
			`def __init__(self, tags, strip, consume_entities, **kwargs):`
			`"""`
			`:arg tags: list of allowed tags--everything else is either stripped or`
			`escaped; if None, then this doesn't look at tags at all`
			`:arg strip: whether to strip disallowed tags (True) or escape them (False);`
			`if tags=None, then this doesn't have any effect`
			`:arg consume_entities: whether to consume entities (default behavior) or`
			`leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)`

			`"""`
			`self.tags = [tag.lower() for tag in tags] if tags is not None else None`
			`self.strip = strip`
			`self.consume_entities = consume_entities`
			`super(BleachHTMLParser, self).__init__(**kwargs)`

			`def _parse(self, stream, innerHTML=False, container='div', scripting=True, **kwargs):`
			`# set scripting=True to parse <noscript> as though JS is enabled to`
			`# match the expected context in browsers`
			`#`
			`# https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element`
			`#`
			`# Override HTMLParser so we can swap out the tokenizer for our own.`
			`self.innerHTMLMode = innerHTML`
			`self.container = container`
			`self.scripting = scripting`
			`self.tokenizer = BleachHTMLTokenizer(`
			`stream=stream,`
			`consume_entities=self.consume_entities,`
			`parser=self,`
			`**kwargs`
			`)`
			`self.reset()`

			`try:`
			`self.mainLoop()`
			`except ReparseException:`
			`self.reset()`
			`self.mainLoop()`


			`def convert_entity(value):`
			`"""Convert an entity (minus the & and ; part) into what it represents`

			`This handles numeric, hex, and text entities.`

			:arg value: the string (minus the ``&`` and ``;`` part) to convert

			`:returns: unicode character or None if it's an ambiguous ampersand that`
			`doesn't match a character entity`

			`"""`
			`if value[0] == '#':`
			`if value[1] in ('x', 'X'):`
			`return six.unichr(int(value[2:], 16))`
			`return six.unichr(int(value[1:], 10))`

			`return ENTITIES.get(value, None)`


			`def convert_entities(text):`
			`"""Converts all found entities in the text`

			`:arg text: the text to convert entities in`

			`:returns: unicode text with converted entities`

			`"""`
			`if '&' not in text:`
			`return text`

			`new_text = []`
			`for part in next_possible_entity(text):`
			`if not part:`
			`continue`

			`if part.startswith('&'):`
			`entity = match_entity(part)`
			`if entity is not None:`
			`converted = convert_entity(entity)`

			`# If it's not an ambiguous ampersand, then replace with the`
			`# unicode character. Otherwise, we leave the entity in.`
			`if converted is not None:`
			`new_text.append(converted)`
			`remainder = part[len(entity) + 2:]`
			`if part:`
			`new_text.append(remainder)`
			`continue`

			`new_text.append(part)`

			`return ''.join(new_text)`


			`def match_entity(stream):`
			`"""Returns first entity in stream or None if no entity exists`

			`Note: For Bleach purposes, entities must start with a "&" and end with`
			`a ";". This ignoresambiguous character entities that have no ";" at the`
			`end.`

			`:arg stream: the character stream`

			:returns: ``None`` or the entity string without "&" or ";"

			`"""`
			`# Nix the & at the beginning`
			`if stream[0] != '&':`
			`raise ValueError('Stream should begin with "&"')`

			`stream = stream[1:]`

			`stream = list(stream)`
			`possible_entity = ''`
			`end_characters = '<&=;' + string.whitespace`

			`# Handle number entities`
			`if stream and stream[0] == '#':`
			`possible_entity = '#'`
			`stream.pop(0)`

			`if stream and stream[0] in ('x', 'X'):`
			`allowed = '0123456789abcdefABCDEF'`
			`possible_entity += stream.pop(0)`
			`else:`
			`allowed = '0123456789'`

			`# FIXME(willkg): Do we want to make sure these are valid number`
			`# entities? This doesn't do that currently.`
			`while stream and stream[0] not in end_characters:`
			`c = stream.pop(0)`
			`if c not in allowed:`
			`break`
			`possible_entity += c`

			`if possible_entity and stream and stream[0] == ';':`
			`return possible_entity`
			`return None`

			`# Handle character entities`
			`while stream and stream[0] not in end_characters:`
			`c = stream.pop(0)`
			`if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):`
			`break`
			`possible_entity += c`

			`if possible_entity and stream and stream[0] == ';':`
			`return possible_entity`

			`return None`


			`AMP_SPLIT_RE = re.compile('(&)')`


			`def next_possible_entity(text):`
			`"""Takes a text and generates a list of possible entities`

			`:arg text: the text to look at`

			`:returns: generator where each part (except the first) starts with an`
			`"&"`

			`"""`
			`for i, part in enumerate(AMP_SPLIT_RE.split(text)):`
			`if i == 0:`
			`yield part`
			`elif i % 2 == 0:`
			`yield '&' + part`


			`class BleachHTMLSerializer(HTMLSerializer):`
			`"""HTMLSerializer that undoes & -> & in attributes and sets`
			`escape_rcdata to True`
			`"""`

			`# per the HTMLSerializer.__init__ docstring:`
			`#`
			`# Whether to escape characters that need to be`
			`# escaped within normal elements within rcdata elements such as`
			`# style.`
			`#`
			`escape_rcdata = True`

			`def escape_base_amp(self, stoken):`
			`"""Escapes just bare & in HTML attribute values"""`
			`# First, undo escaping of &. We need to do this because html5lib's`
			`# HTMLSerializer expected the tokenizer to consume all the character`
			`# entities and convert them to their respective characters, but the`
			`# BleachHTMLTokenizer doesn't do that. For example, this fixes`
			`# &entity; back to &entity; .`
			`stoken = stoken.replace('&', '&')`

			`# However, we do want all bare & that are not marking character`
			`# entities to be changed to &, so let's do that carefully here.`
			`for part in next_possible_entity(stoken):`
			`if not part:`
			`continue`

			`if part.startswith('&'):`
			`entity = match_entity(part)`
			`# Only leave entities in that are not ambiguous. If they're`
			`# ambiguous, then we escape the ampersand.`
			`if entity is not None and convert_entity(entity) is not None:`
			`yield '&' + entity + ';'`

			`# Length of the entity plus 2--one for & at the beginning`
			`# and one for ; at the end`
			`part = part[len(entity) + 2:]`
			`if part:`
			`yield part`
			`continue`

			`yield part.replace('&', '&')`

			`def serialize(self, treewalker, encoding=None):`
			`"""Wrap HTMLSerializer.serialize and conver & to & in attribute values`

			`Note that this converts & to & in attribute values where the & isn't`
			`already part of an unambiguous character entity.`

			`"""`
			`in_tag = False`
			`after_equals = False`

			`for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):`
			`if in_tag:`
			`if stoken == '>':`
			`in_tag = False`

			`elif after_equals:`
			`if stoken != '"':`
			`for part in self.escape_base_amp(stoken):`
			`yield part`

			`after_equals = False`
			`continue`

			`elif stoken == '=':`
			`after_equals = True`

			`yield stoken`
			`else:`
			`if stoken.startswith('<'):`
			`in_tag = True`
			`yield stoken`